1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
92 /* This file should be included last. */
93 #include "target-def.h"
95 static rtx legitimize_dllimport_symbol (rtx, bool);
96 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
97 static rtx legitimize_pe_coff_symbol (rtx, bool);
98 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
99 static bool ix86_save_reg (unsigned int, bool, bool);
100 static bool ix86_function_naked (const_tree);
102 #ifndef CHECK_STACK_LIMIT
103 #define CHECK_STACK_LIMIT (-1)
104 #endif
106 /* Return index of given mode in mult and division cost tables. */
107 #define MODE_INDEX(mode) \
108 ((mode) == QImode ? 0 \
109 : (mode) == HImode ? 1 \
110 : (mode) == SImode ? 2 \
111 : (mode) == DImode ? 3 \
112 : 4)
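/* Editorial sketch (not part of the original file): MODE_INDEX picks the
   per-mode slot out of the 5-element arrays in struct processor_costs.
   Roughly how the rtx-cost code in this file consumes it, assuming the
   mult_init, mult_bit and divide fields declared in i386.h:

       int mul = cost->mult_init[MODE_INDEX (mode)]   // start-up cost
                 + nbits * cost->mult_bit;            // per set bit
       int div = cost->divide[MODE_INDEX (mode)];

   QImode maps to index 0, HImode to 1, SImode to 2, DImode to 3, and any
   other mode to the trailing "other" slot (index 4).  */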
114 /* Processor costs (relative to an add) */
115 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
116 #define COSTS_N_BYTES(N) ((N) * 2)
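/* Editorial worked example: with COSTS_N_INSNS (N) == (N) * 4, one add is
   COSTS_N_INSNS (1) == 4 cost units and, by the 2-bytes-per-add assumption
   above, also COSTS_N_BYTES (2) == 4 units.  A 3-byte instruction is then
   COSTS_N_BYTES (3) == 6 units, i.e. 1.5 "adds", which keeps the size
   tables below comparable with the COSTS_N_INSNS-based speed tables.  */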
118 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
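/* Editorial note on reading the stringop tables below (format per struct
   stringop_algs in i386.h): each CPU provides a two-element array indexed
   by TARGET_64BIT, so element [0] describes 32-bit code and [1] 64-bit
   code.  Within an element, the leading algorithm is used for blocks of
   unknown (variable) size, and each {max, alg, noalign} triple means "use
   ALG for blocks of at most MAX bytes"; a MAX of -1 means no upper bound,
   and a true NOALIGN skips the alignment prologue.  DUMMY_STRINGOP_ALGS
   fills the 64-bit slot for tunings that are only used in 32-bit code.  */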
120 static stringop_algs ix86_size_memcpy[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
123 static stringop_algs ix86_size_memset[2] = {
124 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
127 const
128 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
129 COSTS_N_BYTES (2), /* cost of an add instruction */
130 COSTS_N_BYTES (3), /* cost of a lea instruction */
131 COSTS_N_BYTES (2), /* variable shift costs */
132 COSTS_N_BYTES (3), /* constant shift costs */
133 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
134 COSTS_N_BYTES (3), /* HI */
135 COSTS_N_BYTES (3), /* SI */
136 COSTS_N_BYTES (3), /* DI */
137 COSTS_N_BYTES (5)}, /* other */
138 0, /* cost of multiply per each bit set */
139 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
140 COSTS_N_BYTES (3), /* HI */
141 COSTS_N_BYTES (3), /* SI */
142 COSTS_N_BYTES (3), /* DI */
143 COSTS_N_BYTES (5)}, /* other */
144 COSTS_N_BYTES (3), /* cost of movsx */
145 COSTS_N_BYTES (3), /* cost of movzx */
146 0, /* "large" insn */
147 2, /* MOVE_RATIO */
148 2, /* cost for loading QImode using movzbl */
149 {2, 2, 2}, /* cost of loading integer registers
150 in QImode, HImode and SImode.
151 Relative to reg-reg move (2). */
152 {2, 2, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {2, 2, 2}, /* cost of loading fp registers
155 in SFmode, DFmode and XFmode */
156 {2, 2, 2}, /* cost of storing fp registers
157 in SFmode, DFmode and XFmode */
158 3, /* cost of moving MMX register */
159 {3, 3}, /* cost of loading MMX registers
160 in SImode and DImode */
161 {3, 3}, /* cost of storing MMX registers
162 in SImode and DImode */
163 3, /* cost of moving SSE register */
164 {3, 3, 3}, /* cost of loading SSE registers
165 in SImode, DImode and TImode */
166 {3, 3, 3}, /* cost of storing SSE registers
167 in SImode, DImode and TImode */
168 3, /* MMX or SSE register to integer */
169 0, /* size of l1 cache */
170 0, /* size of l2 cache */
171 0, /* size of prefetch block */
172 0, /* number of parallel prefetches */
173 2, /* Branch cost */
174 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
175 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
176 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
177 COSTS_N_BYTES (2), /* cost of FABS instruction. */
178 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
179 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
180 ix86_size_memcpy,
181 ix86_size_memset,
182 1, /* scalar_stmt_cost. */
183 1, /* scalar load_cost. */
184 1, /* scalar_store_cost. */
185 1, /* vec_stmt_cost. */
186 1, /* vec_to_scalar_cost. */
187 1, /* scalar_to_vec_cost. */
188 1, /* vec_align_load_cost. */
189 1, /* vec_unalign_load_cost. */
190 1, /* vec_store_cost. */
191 1, /* cond_taken_branch_cost. */
192 1, /* cond_not_taken_branch_cost. */
193 };
195 /* Processor costs (relative to an add) */
196 static stringop_algs i386_memcpy[2] = {
197 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
198 DUMMY_STRINGOP_ALGS};
199 static stringop_algs i386_memset[2] = {
200 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
201 DUMMY_STRINGOP_ALGS};
203 static const
204 struct processor_costs i386_cost = { /* 386 specific costs */
205 COSTS_N_INSNS (1), /* cost of an add instruction */
206 COSTS_N_INSNS (1), /* cost of a lea instruction */
207 COSTS_N_INSNS (3), /* variable shift costs */
208 COSTS_N_INSNS (2), /* constant shift costs */
209 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
210 COSTS_N_INSNS (6), /* HI */
211 COSTS_N_INSNS (6), /* SI */
212 COSTS_N_INSNS (6), /* DI */
213 COSTS_N_INSNS (6)}, /* other */
214 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
215 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
216 COSTS_N_INSNS (23), /* HI */
217 COSTS_N_INSNS (23), /* SI */
218 COSTS_N_INSNS (23), /* DI */
219 COSTS_N_INSNS (23)}, /* other */
220 COSTS_N_INSNS (3), /* cost of movsx */
221 COSTS_N_INSNS (2), /* cost of movzx */
222 15, /* "large" insn */
223 3, /* MOVE_RATIO */
224 4, /* cost for loading QImode using movzbl */
225 {2, 4, 2}, /* cost of loading integer registers
226 in QImode, HImode and SImode.
227 Relative to reg-reg move (2). */
228 {2, 4, 2}, /* cost of storing integer registers */
229 2, /* cost of reg,reg fld/fst */
230 {8, 8, 8}, /* cost of loading fp registers
231 in SFmode, DFmode and XFmode */
232 {8, 8, 8}, /* cost of storing fp registers
233 in SFmode, DFmode and XFmode */
234 2, /* cost of moving MMX register */
235 {4, 8}, /* cost of loading MMX registers
236 in SImode and DImode */
237 {4, 8}, /* cost of storing MMX registers
238 in SImode and DImode */
239 2, /* cost of moving SSE register */
240 {4, 8, 16}, /* cost of loading SSE registers
241 in SImode, DImode and TImode */
242 {4, 8, 16}, /* cost of storing SSE registers
243 in SImode, DImode and TImode */
244 3, /* MMX or SSE register to integer */
245 0, /* size of l1 cache */
246 0, /* size of l2 cache */
247 0, /* size of prefetch block */
248 0, /* number of parallel prefetches */
249 1, /* Branch cost */
250 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
251 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
252 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
253 COSTS_N_INSNS (22), /* cost of FABS instruction. */
254 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
255 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
256 i386_memcpy,
257 i386_memset,
258 1, /* scalar_stmt_cost. */
259 1, /* scalar load_cost. */
260 1, /* scalar_store_cost. */
261 1, /* vec_stmt_cost. */
262 1, /* vec_to_scalar_cost. */
263 1, /* scalar_to_vec_cost. */
264 1, /* vec_align_load_cost. */
265 2, /* vec_unalign_load_cost. */
266 1, /* vec_store_cost. */
267 3, /* cond_taken_branch_cost. */
268 1, /* cond_not_taken_branch_cost. */
269 };
271 static stringop_algs i486_memcpy[2] = {
272 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
273 DUMMY_STRINGOP_ALGS};
274 static stringop_algs i486_memset[2] = {
275 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
276 DUMMY_STRINGOP_ALGS};
278 static const
279 struct processor_costs i486_cost = { /* 486 specific costs */
280 COSTS_N_INSNS (1), /* cost of an add instruction */
281 COSTS_N_INSNS (1), /* cost of a lea instruction */
282 COSTS_N_INSNS (3), /* variable shift costs */
283 COSTS_N_INSNS (2), /* constant shift costs */
284 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
285 COSTS_N_INSNS (12), /* HI */
286 COSTS_N_INSNS (12), /* SI */
287 COSTS_N_INSNS (12), /* DI */
288 COSTS_N_INSNS (12)}, /* other */
289 1, /* cost of multiply per each bit set */
290 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
291 COSTS_N_INSNS (40), /* HI */
292 COSTS_N_INSNS (40), /* SI */
293 COSTS_N_INSNS (40), /* DI */
294 COSTS_N_INSNS (40)}, /* other */
295 COSTS_N_INSNS (3), /* cost of movsx */
296 COSTS_N_INSNS (2), /* cost of movzx */
297 15, /* "large" insn */
298 3, /* MOVE_RATIO */
299 4, /* cost for loading QImode using movzbl */
300 {2, 4, 2}, /* cost of loading integer registers
301 in QImode, HImode and SImode.
302 Relative to reg-reg move (2). */
303 {2, 4, 2}, /* cost of storing integer registers */
304 2, /* cost of reg,reg fld/fst */
305 {8, 8, 8}, /* cost of loading fp registers
306 in SFmode, DFmode and XFmode */
307 {8, 8, 8}, /* cost of storing fp registers
308 in SFmode, DFmode and XFmode */
309 2, /* cost of moving MMX register */
310 {4, 8}, /* cost of loading MMX registers
311 in SImode and DImode */
312 {4, 8}, /* cost of storing MMX registers
313 in SImode and DImode */
314 2, /* cost of moving SSE register */
315 {4, 8, 16}, /* cost of loading SSE registers
316 in SImode, DImode and TImode */
317 {4, 8, 16}, /* cost of storing SSE registers
318 in SImode, DImode and TImode */
319 3, /* MMX or SSE register to integer */
320 4, /* size of l1 cache. 486 has 8kB cache
321 shared for code and data, so 4kB is
322 not really precise. */
323 4, /* size of l2 cache */
324 0, /* size of prefetch block */
325 0, /* number of parallel prefetches */
326 1, /* Branch cost */
327 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
328 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
329 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
330 COSTS_N_INSNS (3), /* cost of FABS instruction. */
331 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
332 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
333 i486_memcpy,
334 i486_memset,
335 1, /* scalar_stmt_cost. */
336 1, /* scalar load_cost. */
337 1, /* scalar_store_cost. */
338 1, /* vec_stmt_cost. */
339 1, /* vec_to_scalar_cost. */
340 1, /* scalar_to_vec_cost. */
341 1, /* vec_align_load_cost. */
342 2, /* vec_unalign_load_cost. */
343 1, /* vec_store_cost. */
344 3, /* cond_taken_branch_cost. */
345 1, /* cond_not_taken_branch_cost. */
346 };
348 static stringop_algs pentium_memcpy[2] = {
349 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
350 DUMMY_STRINGOP_ALGS};
351 static stringop_algs pentium_memset[2] = {
352 {libcall, {{-1, rep_prefix_4_byte, false}}},
353 DUMMY_STRINGOP_ALGS};
355 static const
356 struct processor_costs pentium_cost = {
357 COSTS_N_INSNS (1), /* cost of an add instruction */
358 COSTS_N_INSNS (1), /* cost of a lea instruction */
359 COSTS_N_INSNS (4), /* variable shift costs */
360 COSTS_N_INSNS (1), /* constant shift costs */
361 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
362 COSTS_N_INSNS (11), /* HI */
363 COSTS_N_INSNS (11), /* SI */
364 COSTS_N_INSNS (11), /* DI */
365 COSTS_N_INSNS (11)}, /* other */
366 0, /* cost of multiply per each bit set */
367 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
368 COSTS_N_INSNS (25), /* HI */
369 COSTS_N_INSNS (25), /* SI */
370 COSTS_N_INSNS (25), /* DI */
371 COSTS_N_INSNS (25)}, /* other */
372 COSTS_N_INSNS (3), /* cost of movsx */
373 COSTS_N_INSNS (2), /* cost of movzx */
374 8, /* "large" insn */
375 6, /* MOVE_RATIO */
376 6, /* cost for loading QImode using movzbl */
377 {2, 4, 2}, /* cost of loading integer registers
378 in QImode, HImode and SImode.
379 Relative to reg-reg move (2). */
380 {2, 4, 2}, /* cost of storing integer registers */
381 2, /* cost of reg,reg fld/fst */
382 {2, 2, 6}, /* cost of loading fp registers
383 in SFmode, DFmode and XFmode */
384 {4, 4, 6}, /* cost of storing fp registers
385 in SFmode, DFmode and XFmode */
386 8, /* cost of moving MMX register */
387 {8, 8}, /* cost of loading MMX registers
388 in SImode and DImode */
389 {8, 8}, /* cost of storing MMX registers
390 in SImode and DImode */
391 2, /* cost of moving SSE register */
392 {4, 8, 16}, /* cost of loading SSE registers
393 in SImode, DImode and TImode */
394 {4, 8, 16}, /* cost of storing SSE registers
395 in SImode, DImode and TImode */
396 3, /* MMX or SSE register to integer */
397 8, /* size of l1 cache. */
398 8, /* size of l2 cache */
399 0, /* size of prefetch block */
400 0, /* number of parallel prefetches */
401 2, /* Branch cost */
402 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
403 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
404 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
405 COSTS_N_INSNS (1), /* cost of FABS instruction. */
406 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
407 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
408 pentium_memcpy,
409 pentium_memset,
410 1, /* scalar_stmt_cost. */
411 1, /* scalar load_cost. */
412 1, /* scalar_store_cost. */
413 1, /* vec_stmt_cost. */
414 1, /* vec_to_scalar_cost. */
415 1, /* scalar_to_vec_cost. */
416 1, /* vec_align_load_cost. */
417 2, /* vec_unalign_load_cost. */
418 1, /* vec_store_cost. */
419 3, /* cond_taken_branch_cost. */
420 1, /* cond_not_taken_branch_cost. */
421 };
423 static const
424 struct processor_costs lakemont_cost = {
425 COSTS_N_INSNS (1), /* cost of an add instruction */
426 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
427 COSTS_N_INSNS (1), /* variable shift costs */
428 COSTS_N_INSNS (1), /* constant shift costs */
429 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
430 COSTS_N_INSNS (11), /* HI */
431 COSTS_N_INSNS (11), /* SI */
432 COSTS_N_INSNS (11), /* DI */
433 COSTS_N_INSNS (11)}, /* other */
434 0, /* cost of multiply per each bit set */
435 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
436 COSTS_N_INSNS (25), /* HI */
437 COSTS_N_INSNS (25), /* SI */
438 COSTS_N_INSNS (25), /* DI */
439 COSTS_N_INSNS (25)}, /* other */
440 COSTS_N_INSNS (3), /* cost of movsx */
441 COSTS_N_INSNS (2), /* cost of movzx */
442 8, /* "large" insn */
443 17, /* MOVE_RATIO */
444 6, /* cost for loading QImode using movzbl */
445 {2, 4, 2}, /* cost of loading integer registers
446 in QImode, HImode and SImode.
447 Relative to reg-reg move (2). */
448 {2, 4, 2}, /* cost of storing integer registers */
449 2, /* cost of reg,reg fld/fst */
450 {2, 2, 6}, /* cost of loading fp registers
451 in SFmode, DFmode and XFmode */
452 {4, 4, 6}, /* cost of storing fp registers
453 in SFmode, DFmode and XFmode */
454 8, /* cost of moving MMX register */
455 {8, 8}, /* cost of loading MMX registers
456 in SImode and DImode */
457 {8, 8}, /* cost of storing MMX registers
458 in SImode and DImode */
459 2, /* cost of moving SSE register */
460 {4, 8, 16}, /* cost of loading SSE registers
461 in SImode, DImode and TImode */
462 {4, 8, 16}, /* cost of storing SSE registers
463 in SImode, DImode and TImode */
464 3, /* MMX or SSE register to integer */
465 8, /* size of l1 cache. */
466 8, /* size of l2 cache */
467 0, /* size of prefetch block */
468 0, /* number of parallel prefetches */
469 2, /* Branch cost */
470 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
471 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
472 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
473 COSTS_N_INSNS (1), /* cost of FABS instruction. */
474 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
475 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
476 pentium_memcpy,
477 pentium_memset,
478 1, /* scalar_stmt_cost. */
479 1, /* scalar load_cost. */
480 1, /* scalar_store_cost. */
481 1, /* vec_stmt_cost. */
482 1, /* vec_to_scalar_cost. */
483 1, /* scalar_to_vec_cost. */
484 1, /* vec_align_load_cost. */
485 2, /* vec_unalign_load_cost. */
486 1, /* vec_store_cost. */
487 3, /* cond_taken_branch_cost. */
488 1, /* cond_not_taken_branch_cost. */
489 };
491 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
492 (we ensure the alignment). For small blocks inline loop is still a
493 noticeable win, for bigger blocks either rep movsl or rep movsb is the
494 way to go.  Rep movsb apparently has a more expensive startup time in the
495 CPU, but after 4K the difference is down in the noise. */
496 static stringop_algs pentiumpro_memcpy[2] = {
497 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
498 {8192, rep_prefix_4_byte, false},
499 {-1, rep_prefix_1_byte, false}}},
500 DUMMY_STRINGOP_ALGS};
501 static stringop_algs pentiumpro_memset[2] = {
502 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
503 {8192, rep_prefix_4_byte, false},
504 {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS};
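/* Editorial worked reading of pentiumpro_memcpy above: for 32-bit code,
   copies of at most 128 bytes use an inline loop, up to 1024 bytes an
   unrolled loop, up to 8192 bytes rep movsl (rep_prefix_4_byte), and
   anything larger rep movsb (rep_prefix_1_byte); blocks of unknown size
   default to rep movsl.  The 64-bit slot is DUMMY_STRINGOP_ALGS because
   this tuning is never used for 64-bit code.  */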
506 static const
507 struct processor_costs pentiumpro_cost = {
508 COSTS_N_INSNS (1), /* cost of an add instruction */
509 COSTS_N_INSNS (1), /* cost of a lea instruction */
510 COSTS_N_INSNS (1), /* variable shift costs */
511 COSTS_N_INSNS (1), /* constant shift costs */
512 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
513 COSTS_N_INSNS (4), /* HI */
514 COSTS_N_INSNS (4), /* SI */
515 COSTS_N_INSNS (4), /* DI */
516 COSTS_N_INSNS (4)}, /* other */
517 0, /* cost of multiply per each bit set */
518 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
519 COSTS_N_INSNS (17), /* HI */
520 COSTS_N_INSNS (17), /* SI */
521 COSTS_N_INSNS (17), /* DI */
522 COSTS_N_INSNS (17)}, /* other */
523 COSTS_N_INSNS (1), /* cost of movsx */
524 COSTS_N_INSNS (1), /* cost of movzx */
525 8, /* "large" insn */
526 6, /* MOVE_RATIO */
527 2, /* cost for loading QImode using movzbl */
528 {4, 4, 4}, /* cost of loading integer registers
529 in QImode, HImode and SImode.
530 Relative to reg-reg move (2). */
531 {2, 2, 2}, /* cost of storing integer registers */
532 2, /* cost of reg,reg fld/fst */
533 {2, 2, 6}, /* cost of loading fp registers
534 in SFmode, DFmode and XFmode */
535 {4, 4, 6}, /* cost of storing fp registers
536 in SFmode, DFmode and XFmode */
537 2, /* cost of moving MMX register */
538 {2, 2}, /* cost of loading MMX registers
539 in SImode and DImode */
540 {2, 2}, /* cost of storing MMX registers
541 in SImode and DImode */
542 2, /* cost of moving SSE register */
543 {2, 2, 8}, /* cost of loading SSE registers
544 in SImode, DImode and TImode */
545 {2, 2, 8}, /* cost of storing SSE registers
546 in SImode, DImode and TImode */
547 3, /* MMX or SSE register to integer */
548 8, /* size of l1 cache. */
549 256, /* size of l2 cache */
550 32, /* size of prefetch block */
551 6, /* number of parallel prefetches */
552 2, /* Branch cost */
553 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
554 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
555 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
556 COSTS_N_INSNS (2), /* cost of FABS instruction. */
557 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
558 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
559 pentiumpro_memcpy,
560 pentiumpro_memset,
561 1, /* scalar_stmt_cost. */
562 1, /* scalar load_cost. */
563 1, /* scalar_store_cost. */
564 1, /* vec_stmt_cost. */
565 1, /* vec_to_scalar_cost. */
566 1, /* scalar_to_vec_cost. */
567 1, /* vec_align_load_cost. */
568 2, /* vec_unalign_load_cost. */
569 1, /* vec_store_cost. */
570 3, /* cond_taken_branch_cost. */
571 1, /* cond_not_taken_branch_cost. */
572 };
574 static stringop_algs geode_memcpy[2] = {
575 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS};
577 static stringop_algs geode_memset[2] = {
578 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
579 DUMMY_STRINGOP_ALGS};
580 static const
581 struct processor_costs geode_cost = {
582 COSTS_N_INSNS (1), /* cost of an add instruction */
583 COSTS_N_INSNS (1), /* cost of a lea instruction */
584 COSTS_N_INSNS (2), /* variable shift costs */
585 COSTS_N_INSNS (1), /* constant shift costs */
586 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
587 COSTS_N_INSNS (4), /* HI */
588 COSTS_N_INSNS (7), /* SI */
589 COSTS_N_INSNS (7), /* DI */
590 COSTS_N_INSNS (7)}, /* other */
591 0, /* cost of multiply per each bit set */
592 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
593 COSTS_N_INSNS (23), /* HI */
594 COSTS_N_INSNS (39), /* SI */
595 COSTS_N_INSNS (39), /* DI */
596 COSTS_N_INSNS (39)}, /* other */
597 COSTS_N_INSNS (1), /* cost of movsx */
598 COSTS_N_INSNS (1), /* cost of movzx */
599 8, /* "large" insn */
600 4, /* MOVE_RATIO */
601 1, /* cost for loading QImode using movzbl */
602 {1, 1, 1}, /* cost of loading integer registers
603 in QImode, HImode and SImode.
604 Relative to reg-reg move (2). */
605 {1, 1, 1}, /* cost of storing integer registers */
606 1, /* cost of reg,reg fld/fst */
607 {1, 1, 1}, /* cost of loading fp registers
608 in SFmode, DFmode and XFmode */
609 {4, 6, 6}, /* cost of storing fp registers
610 in SFmode, DFmode and XFmode */
612 2, /* cost of moving MMX register */
613 {2, 2}, /* cost of loading MMX registers
614 in SImode and DImode */
615 {2, 2}, /* cost of storing MMX registers
616 in SImode and DImode */
617 2, /* cost of moving SSE register */
618 {2, 2, 8}, /* cost of loading SSE registers
619 in SImode, DImode and TImode */
620 {2, 2, 8}, /* cost of storing SSE registers
621 in SImode, DImode and TImode */
622 3, /* MMX or SSE register to integer */
623 64, /* size of l1 cache. */
624 128, /* size of l2 cache. */
625 32, /* size of prefetch block */
626 1, /* number of parallel prefetches */
627 1, /* Branch cost */
628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
629 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
630 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
631 COSTS_N_INSNS (1), /* cost of FABS instruction. */
632 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
633 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
634 geode_memcpy,
635 geode_memset,
636 1, /* scalar_stmt_cost. */
637 1, /* scalar load_cost. */
638 1, /* scalar_store_cost. */
639 1, /* vec_stmt_cost. */
640 1, /* vec_to_scalar_cost. */
641 1, /* scalar_to_vec_cost. */
642 1, /* vec_align_load_cost. */
643 2, /* vec_unalign_load_cost. */
644 1, /* vec_store_cost. */
645 3, /* cond_taken_branch_cost. */
646 1, /* cond_not_taken_branch_cost. */
647 };
649 static stringop_algs k6_memcpy[2] = {
650 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static stringop_algs k6_memset[2] = {
653 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
654 DUMMY_STRINGOP_ALGS};
655 static const
656 struct processor_costs k6_cost = {
657 COSTS_N_INSNS (1), /* cost of an add instruction */
658 COSTS_N_INSNS (2), /* cost of a lea instruction */
659 COSTS_N_INSNS (1), /* variable shift costs */
660 COSTS_N_INSNS (1), /* constant shift costs */
661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
662 COSTS_N_INSNS (3), /* HI */
663 COSTS_N_INSNS (3), /* SI */
664 COSTS_N_INSNS (3), /* DI */
665 COSTS_N_INSNS (3)}, /* other */
666 0, /* cost of multiply per each bit set */
667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
668 COSTS_N_INSNS (18), /* HI */
669 COSTS_N_INSNS (18), /* SI */
670 COSTS_N_INSNS (18), /* DI */
671 COSTS_N_INSNS (18)}, /* other */
672 COSTS_N_INSNS (2), /* cost of movsx */
673 COSTS_N_INSNS (2), /* cost of movzx */
674 8, /* "large" insn */
675 4, /* MOVE_RATIO */
676 3, /* cost for loading QImode using movzbl */
677 {4, 5, 4}, /* cost of loading integer registers
678 in QImode, HImode and SImode.
679 Relative to reg-reg move (2). */
680 {2, 3, 2}, /* cost of storing integer registers */
681 4, /* cost of reg,reg fld/fst */
682 {6, 6, 6}, /* cost of loading fp registers
683 in SFmode, DFmode and XFmode */
684 {4, 4, 4}, /* cost of storing fp registers
685 in SFmode, DFmode and XFmode */
686 2, /* cost of moving MMX register */
687 {2, 2}, /* cost of loading MMX registers
688 in SImode and DImode */
689 {2, 2}, /* cost of storing MMX registers
690 in SImode and DImode */
691 2, /* cost of moving SSE register */
692 {2, 2, 8}, /* cost of loading SSE registers
693 in SImode, DImode and TImode */
694 {2, 2, 8}, /* cost of storing SSE registers
695 in SImode, DImode and TImode */
696 6, /* MMX or SSE register to integer */
697 32, /* size of l1 cache. */
698 32, /* size of l2 cache. Some models
699 have integrated l2 cache, but
700 optimizing for k6 is not important
701 enough to worry about that. */
702 32, /* size of prefetch block */
703 1, /* number of parallel prefetches */
704 1, /* Branch cost */
705 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
706 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
707 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
708 COSTS_N_INSNS (2), /* cost of FABS instruction. */
709 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
710 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
711 k6_memcpy,
712 k6_memset,
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
724 };
726 /* For some reason, Athlon deals better with REP prefix (relative to loops)
727 compared to K8. Alignment becomes important after 8 bytes for memcpy and
728 128 bytes for memset. */
729 static stringop_algs athlon_memcpy[2] = {
730 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 DUMMY_STRINGOP_ALGS};
732 static stringop_algs athlon_memset[2] = {
733 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
734 DUMMY_STRINGOP_ALGS};
735 static const
736 struct processor_costs athlon_cost = {
737 COSTS_N_INSNS (1), /* cost of an add instruction */
738 COSTS_N_INSNS (2), /* cost of a lea instruction */
739 COSTS_N_INSNS (1), /* variable shift costs */
740 COSTS_N_INSNS (1), /* constant shift costs */
741 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
742 COSTS_N_INSNS (5), /* HI */
743 COSTS_N_INSNS (5), /* SI */
744 COSTS_N_INSNS (5), /* DI */
745 COSTS_N_INSNS (5)}, /* other */
746 0, /* cost of multiply per each bit set */
747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
748 COSTS_N_INSNS (26), /* HI */
749 COSTS_N_INSNS (42), /* SI */
750 COSTS_N_INSNS (74), /* DI */
751 COSTS_N_INSNS (74)}, /* other */
752 COSTS_N_INSNS (1), /* cost of movsx */
753 COSTS_N_INSNS (1), /* cost of movzx */
754 8, /* "large" insn */
755 9, /* MOVE_RATIO */
756 4, /* cost for loading QImode using movzbl */
757 {3, 4, 3}, /* cost of loading integer registers
758 in QImode, HImode and SImode.
759 Relative to reg-reg move (2). */
760 {3, 4, 3}, /* cost of storing integer registers */
761 4, /* cost of reg,reg fld/fst */
762 {4, 4, 12}, /* cost of loading fp registers
763 in SFmode, DFmode and XFmode */
764 {6, 6, 8}, /* cost of storing fp registers
765 in SFmode, DFmode and XFmode */
766 2, /* cost of moving MMX register */
767 {4, 4}, /* cost of loading MMX registers
768 in SImode and DImode */
769 {4, 4}, /* cost of storing MMX registers
770 in SImode and DImode */
771 2, /* cost of moving SSE register */
772 {4, 4, 6}, /* cost of loading SSE registers
773 in SImode, DImode and TImode */
774 {4, 4, 5}, /* cost of storing SSE registers
775 in SImode, DImode and TImode */
776 5, /* MMX or SSE register to integer */
777 64, /* size of l1 cache. */
778 256, /* size of l2 cache. */
779 64, /* size of prefetch block */
780 6, /* number of parallel prefetches */
781 5, /* Branch cost */
782 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
783 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
784 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
785 COSTS_N_INSNS (2), /* cost of FABS instruction. */
786 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
787 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
788 athlon_memcpy,
789 athlon_memset,
790 1, /* scalar_stmt_cost. */
791 1, /* scalar load_cost. */
792 1, /* scalar_store_cost. */
793 1, /* vec_stmt_cost. */
794 1, /* vec_to_scalar_cost. */
795 1, /* scalar_to_vec_cost. */
796 1, /* vec_align_load_cost. */
797 2, /* vec_unalign_load_cost. */
798 1, /* vec_store_cost. */
799 3, /* cond_taken_branch_cost. */
800 1, /* cond_not_taken_branch_cost. */
801 };
803 /* K8 has optimized REP instruction for medium sized blocks, but for very
804 small blocks it is better to use a loop.  For large blocks, libcall can
805 do nontemporal accesses and beat inline considerably. */
806 static stringop_algs k8_memcpy[2] = {
807 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
808 {-1, rep_prefix_4_byte, false}}},
809 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
810 {-1, libcall, false}}}};
811 static stringop_algs k8_memset[2] = {
812 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
813 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
814 {libcall, {{48, unrolled_loop, false},
815 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
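/* Editorial worked reading of the K8 tables above: in 32-bit code, memcpy
   uses an inline loop up to 6 bytes, an unrolled loop up to 14 bytes and
   rep movsl beyond that; in 64-bit code it uses a loop up to 16 bytes,
   rep movsq up to 8192 bytes and a libcall (which can use nontemporal
   stores) for larger blocks.  The memset table follows the same pattern
   with slightly higher thresholds.  */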
816 static const
817 struct processor_costs k8_cost = {
818 COSTS_N_INSNS (1), /* cost of an add instruction */
819 COSTS_N_INSNS (2), /* cost of a lea instruction */
820 COSTS_N_INSNS (1), /* variable shift costs */
821 COSTS_N_INSNS (1), /* constant shift costs */
822 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
823 COSTS_N_INSNS (4), /* HI */
824 COSTS_N_INSNS (3), /* SI */
825 COSTS_N_INSNS (4), /* DI */
826 COSTS_N_INSNS (5)}, /* other */
827 0, /* cost of multiply per each bit set */
828 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
829 COSTS_N_INSNS (26), /* HI */
830 COSTS_N_INSNS (42), /* SI */
831 COSTS_N_INSNS (74), /* DI */
832 COSTS_N_INSNS (74)}, /* other */
833 COSTS_N_INSNS (1), /* cost of movsx */
834 COSTS_N_INSNS (1), /* cost of movzx */
835 8, /* "large" insn */
836 9, /* MOVE_RATIO */
837 4, /* cost for loading QImode using movzbl */
838 {3, 4, 3}, /* cost of loading integer registers
839 in QImode, HImode and SImode.
840 Relative to reg-reg move (2). */
841 {3, 4, 3}, /* cost of storing integer registers */
842 4, /* cost of reg,reg fld/fst */
843 {4, 4, 12}, /* cost of loading fp registers
844 in SFmode, DFmode and XFmode */
845 {6, 6, 8}, /* cost of storing fp registers
846 in SFmode, DFmode and XFmode */
847 2, /* cost of moving MMX register */
848 {3, 3}, /* cost of loading MMX registers
849 in SImode and DImode */
850 {4, 4}, /* cost of storing MMX registers
851 in SImode and DImode */
852 2, /* cost of moving SSE register */
853 {4, 3, 6}, /* cost of loading SSE registers
854 in SImode, DImode and TImode */
855 {4, 4, 5}, /* cost of storing SSE registers
856 in SImode, DImode and TImode */
857 5, /* MMX or SSE register to integer */
858 64, /* size of l1 cache. */
859 512, /* size of l2 cache. */
860 64, /* size of prefetch block */
861 /* New AMD processors never drop prefetches; if they cannot be performed
862 immediately, they are queued. We set number of simultaneous prefetches
863 to a large constant to reflect this (it probably is not a good idea not
864 to limit number of prefetches at all, as their execution also takes some
865 time). */
866 100, /* number of parallel prefetches */
867 3, /* Branch cost */
868 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
869 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
870 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
871 COSTS_N_INSNS (2), /* cost of FABS instruction. */
872 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
873 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
875 k8_memcpy,
876 k8_memset,
877 4, /* scalar_stmt_cost. */
878 2, /* scalar load_cost. */
879 2, /* scalar_store_cost. */
880 5, /* vec_stmt_cost. */
881 0, /* vec_to_scalar_cost. */
882 2, /* scalar_to_vec_cost. */
883 2, /* vec_align_load_cost. */
884 3, /* vec_unalign_load_cost. */
885 3, /* vec_store_cost. */
886 3, /* cond_taken_branch_cost. */
887 2, /* cond_not_taken_branch_cost. */
888 };
890 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
891 very small blocks it is better to use a loop.  For large blocks, libcall can
892 do nontemporal accesses and beat inline considerably. */
893 static stringop_algs amdfam10_memcpy[2] = {
894 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
895 {-1, rep_prefix_4_byte, false}}},
896 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
897 {-1, libcall, false}}}};
898 static stringop_algs amdfam10_memset[2] = {
899 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
900 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
901 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
902 {-1, libcall, false}}}};
903 struct processor_costs amdfam10_cost = {
904 COSTS_N_INSNS (1), /* cost of an add instruction */
905 COSTS_N_INSNS (2), /* cost of a lea instruction */
906 COSTS_N_INSNS (1), /* variable shift costs */
907 COSTS_N_INSNS (1), /* constant shift costs */
908 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
909 COSTS_N_INSNS (4), /* HI */
910 COSTS_N_INSNS (3), /* SI */
911 COSTS_N_INSNS (4), /* DI */
912 COSTS_N_INSNS (5)}, /* other */
913 0, /* cost of multiply per each bit set */
914 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
915 COSTS_N_INSNS (35), /* HI */
916 COSTS_N_INSNS (51), /* SI */
917 COSTS_N_INSNS (83), /* DI */
918 COSTS_N_INSNS (83)}, /* other */
919 COSTS_N_INSNS (1), /* cost of movsx */
920 COSTS_N_INSNS (1), /* cost of movzx */
921 8, /* "large" insn */
922 9, /* MOVE_RATIO */
923 4, /* cost for loading QImode using movzbl */
924 {3, 4, 3}, /* cost of loading integer registers
925 in QImode, HImode and SImode.
926 Relative to reg-reg move (2). */
927 {3, 4, 3}, /* cost of storing integer registers */
928 4, /* cost of reg,reg fld/fst */
929 {4, 4, 12}, /* cost of loading fp registers
930 in SFmode, DFmode and XFmode */
931 {6, 6, 8}, /* cost of storing fp registers
932 in SFmode, DFmode and XFmode */
933 2, /* cost of moving MMX register */
934 {3, 3}, /* cost of loading MMX registers
935 in SImode and DImode */
936 {4, 4}, /* cost of storing MMX registers
937 in SImode and DImode */
938 2, /* cost of moving SSE register */
939 {4, 4, 3}, /* cost of loading SSE registers
940 in SImode, DImode and TImode */
941 {4, 4, 5}, /* cost of storing SSE registers
942 in SImode, DImode and TImode */
943 3, /* MMX or SSE register to integer */
944 /* On K8:
945 MOVD reg64, xmmreg Double FSTORE 4
946 MOVD reg32, xmmreg Double FSTORE 4
947 On AMDFAM10:
948 MOVD reg64, xmmreg Double FADD 3
949 1/1 1/1
950 MOVD reg32, xmmreg Double FADD 3
951 1/1 1/1 */
952 64, /* size of l1 cache. */
953 512, /* size of l2 cache. */
954 64, /* size of prefetch block */
955 /* New AMD processors never drop prefetches; if they cannot be performed
956 immediately, they are queued. We set number of simultaneous prefetches
957 to a large constant to reflect this (it probably is not a good idea not
958 to limit number of prefetches at all, as their execution also takes some
959 time). */
960 100, /* number of parallel prefetches */
961 2, /* Branch cost */
962 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
963 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
964 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
965 COSTS_N_INSNS (2), /* cost of FABS instruction. */
966 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
967 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
969 amdfam10_memcpy,
970 amdfam10_memset,
971 4, /* scalar_stmt_cost. */
972 2, /* scalar load_cost. */
973 2, /* scalar_store_cost. */
974 6, /* vec_stmt_cost. */
975 0, /* vec_to_scalar_cost. */
976 2, /* scalar_to_vec_cost. */
977 2, /* vec_align_load_cost. */
978 2, /* vec_unalign_load_cost. */
979 2, /* vec_store_cost. */
980 2, /* cond_taken_branch_cost. */
981 1, /* cond_not_taken_branch_cost. */
982 };
984 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
985 very small blocks it is better to use a loop.  For large blocks, libcall
986 can do nontemporal accesses and beat inline considerably. */
987 static stringop_algs bdver1_memcpy[2] = {
988 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
989 {-1, rep_prefix_4_byte, false}}},
990 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
991 {-1, libcall, false}}}};
992 static stringop_algs bdver1_memset[2] = {
993 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
994 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
995 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
996 {-1, libcall, false}}}};
998 const struct processor_costs bdver1_cost = {
999 COSTS_N_INSNS (1), /* cost of an add instruction */
1000 COSTS_N_INSNS (1), /* cost of a lea instruction */
1001 COSTS_N_INSNS (1), /* variable shift costs */
1002 COSTS_N_INSNS (1), /* constant shift costs */
1003 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1004 COSTS_N_INSNS (4), /* HI */
1005 COSTS_N_INSNS (4), /* SI */
1006 COSTS_N_INSNS (6), /* DI */
1007 COSTS_N_INSNS (6)}, /* other */
1008 0, /* cost of multiply per each bit set */
1009 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1010 COSTS_N_INSNS (35), /* HI */
1011 COSTS_N_INSNS (51), /* SI */
1012 COSTS_N_INSNS (83), /* DI */
1013 COSTS_N_INSNS (83)}, /* other */
1014 COSTS_N_INSNS (1), /* cost of movsx */
1015 COSTS_N_INSNS (1), /* cost of movzx */
1016 8, /* "large" insn */
1017 9, /* MOVE_RATIO */
1018 4, /* cost for loading QImode using movzbl */
1019 {5, 5, 4}, /* cost of loading integer registers
1020 in QImode, HImode and SImode.
1021 Relative to reg-reg move (2). */
1022 {4, 4, 4}, /* cost of storing integer registers */
1023 2, /* cost of reg,reg fld/fst */
1024 {5, 5, 12}, /* cost of loading fp registers
1025 in SFmode, DFmode and XFmode */
1026 {4, 4, 8}, /* cost of storing fp registers
1027 in SFmode, DFmode and XFmode */
1028 2, /* cost of moving MMX register */
1029 {4, 4}, /* cost of loading MMX registers
1030 in SImode and DImode */
1031 {4, 4}, /* cost of storing MMX registers
1032 in SImode and DImode */
1033 2, /* cost of moving SSE register */
1034 {4, 4, 4}, /* cost of loading SSE registers
1035 in SImode, DImode and TImode */
1036 {4, 4, 4}, /* cost of storing SSE registers
1037 in SImode, DImode and TImode */
1038 2, /* MMX or SSE register to integer */
1039 /* On K8:
1040 MOVD reg64, xmmreg Double FSTORE 4
1041 MOVD reg32, xmmreg Double FSTORE 4
1042 On AMDFAM10:
1043 MOVD reg64, xmmreg Double FADD 3
1044 1/1 1/1
1045 MOVD reg32, xmmreg Double FADD 3
1046 1/1 1/1 */
1047 16, /* size of l1 cache. */
1048 2048, /* size of l2 cache. */
1049 64, /* size of prefetch block */
1050 /* New AMD processors never drop prefetches; if they cannot be performed
1051 immediately, they are queued. We set number of simultaneous prefetches
1052 to a large constant to reflect this (it probably is not a good idea not
1053 to limit number of prefetches at all, as their execution also takes some
1054 time). */
1055 100, /* number of parallel prefetches */
1056 2, /* Branch cost */
1057 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1058 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1059 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1060 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1061 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1062 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1064 bdver1_memcpy,
1065 bdver1_memset,
1066 6, /* scalar_stmt_cost. */
1067 4, /* scalar load_cost. */
1068 4, /* scalar_store_cost. */
1069 6, /* vec_stmt_cost. */
1070 0, /* vec_to_scalar_cost. */
1071 2, /* scalar_to_vec_cost. */
1072 4, /* vec_align_load_cost. */
1073 4, /* vec_unalign_load_cost. */
1074 4, /* vec_store_cost. */
1075 4, /* cond_taken_branch_cost. */
1076 2, /* cond_not_taken_branch_cost. */
1077 };
1079 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1080 very small blocks it is better to use a loop.  For large blocks, libcall
1081 can do nontemporal accesses and beat inline considerably. */
1083 static stringop_algs bdver2_memcpy[2] = {
1084 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1085 {-1, rep_prefix_4_byte, false}}},
1086 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1087 {-1, libcall, false}}}};
1088 static stringop_algs bdver2_memset[2] = {
1089 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1090 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1091 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1092 {-1, libcall, false}}}};
1094 const struct processor_costs bdver2_cost = {
1095 COSTS_N_INSNS (1), /* cost of an add instruction */
1096 COSTS_N_INSNS (1), /* cost of a lea instruction */
1097 COSTS_N_INSNS (1), /* variable shift costs */
1098 COSTS_N_INSNS (1), /* constant shift costs */
1099 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1100 COSTS_N_INSNS (4), /* HI */
1101 COSTS_N_INSNS (4), /* SI */
1102 COSTS_N_INSNS (6), /* DI */
1103 COSTS_N_INSNS (6)}, /* other */
1104 0, /* cost of multiply per each bit set */
1105 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1106 COSTS_N_INSNS (35), /* HI */
1107 COSTS_N_INSNS (51), /* SI */
1108 COSTS_N_INSNS (83), /* DI */
1109 COSTS_N_INSNS (83)}, /* other */
1110 COSTS_N_INSNS (1), /* cost of movsx */
1111 COSTS_N_INSNS (1), /* cost of movzx */
1112 8, /* "large" insn */
1113 9, /* MOVE_RATIO */
1114 4, /* cost for loading QImode using movzbl */
1115 {5, 5, 4}, /* cost of loading integer registers
1116 in QImode, HImode and SImode.
1117 Relative to reg-reg move (2). */
1118 {4, 4, 4}, /* cost of storing integer registers */
1119 2, /* cost of reg,reg fld/fst */
1120 {5, 5, 12}, /* cost of loading fp registers
1121 in SFmode, DFmode and XFmode */
1122 {4, 4, 8}, /* cost of storing fp registers
1123 in SFmode, DFmode and XFmode */
1124 2, /* cost of moving MMX register */
1125 {4, 4}, /* cost of loading MMX registers
1126 in SImode and DImode */
1127 {4, 4}, /* cost of storing MMX registers
1128 in SImode and DImode */
1129 2, /* cost of moving SSE register */
1130 {4, 4, 4}, /* cost of loading SSE registers
1131 in SImode, DImode and TImode */
1132 {4, 4, 4}, /* cost of storing SSE registers
1133 in SImode, DImode and TImode */
1134 2, /* MMX or SSE register to integer */
1135 /* On K8:
1136 MOVD reg64, xmmreg Double FSTORE 4
1137 MOVD reg32, xmmreg Double FSTORE 4
1138 On AMDFAM10:
1139 MOVD reg64, xmmreg Double FADD 3
1140 1/1 1/1
1141 MOVD reg32, xmmreg Double FADD 3
1142 1/1 1/1 */
1143 16, /* size of l1 cache. */
1144 2048, /* size of l2 cache. */
1145 64, /* size of prefetch block */
1146 /* New AMD processors never drop prefetches; if they cannot be performed
1147 immediately, they are queued. We set number of simultaneous prefetches
1148 to a large constant to reflect this (it probably is not a good idea not
1149 to limit number of prefetches at all, as their execution also takes some
1150 time). */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1160 bdver2_memcpy,
1161 bdver2_memset,
1162 6, /* scalar_stmt_cost. */
1163 4, /* scalar load_cost. */
1164 4, /* scalar_store_cost. */
1165 6, /* vec_stmt_cost. */
1166 0, /* vec_to_scalar_cost. */
1167 2, /* scalar_to_vec_cost. */
1168 4, /* vec_align_load_cost. */
1169 4, /* vec_unalign_load_cost. */
1170 4, /* vec_store_cost. */
1171 4, /* cond_taken_branch_cost. */
1172 2, /* cond_not_taken_branch_cost. */
1173 };
1176 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1177 very small blocks it is better to use a loop.  For large blocks, libcall
1178 can do nontemporal accesses and beat inline considerably. */
1179 static stringop_algs bdver3_memcpy[2] = {
1180 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1181 {-1, rep_prefix_4_byte, false}}},
1182 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1183 {-1, libcall, false}}}};
1184 static stringop_algs bdver3_memset[2] = {
1185 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1186 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1187 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1188 {-1, libcall, false}}}};
1189 struct processor_costs bdver3_cost = {
1190 COSTS_N_INSNS (1), /* cost of an add instruction */
1191 COSTS_N_INSNS (1), /* cost of a lea instruction */
1192 COSTS_N_INSNS (1), /* variable shift costs */
1193 COSTS_N_INSNS (1), /* constant shift costs */
1194 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1195 COSTS_N_INSNS (4), /* HI */
1196 COSTS_N_INSNS (4), /* SI */
1197 COSTS_N_INSNS (6), /* DI */
1198 COSTS_N_INSNS (6)}, /* other */
1199 0, /* cost of multiply per each bit set */
1200 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1201 COSTS_N_INSNS (35), /* HI */
1202 COSTS_N_INSNS (51), /* SI */
1203 COSTS_N_INSNS (83), /* DI */
1204 COSTS_N_INSNS (83)}, /* other */
1205 COSTS_N_INSNS (1), /* cost of movsx */
1206 COSTS_N_INSNS (1), /* cost of movzx */
1207 8, /* "large" insn */
1208 9, /* MOVE_RATIO */
1209 4, /* cost for loading QImode using movzbl */
1210 {5, 5, 4}, /* cost of loading integer registers
1211 in QImode, HImode and SImode.
1212 Relative to reg-reg move (2). */
1213 {4, 4, 4}, /* cost of storing integer registers */
1214 2, /* cost of reg,reg fld/fst */
1215 {5, 5, 12}, /* cost of loading fp registers
1216 in SFmode, DFmode and XFmode */
1217 {4, 4, 8}, /* cost of storing fp registers
1218 in SFmode, DFmode and XFmode */
1219 2, /* cost of moving MMX register */
1220 {4, 4}, /* cost of loading MMX registers
1221 in SImode and DImode */
1222 {4, 4}, /* cost of storing MMX registers
1223 in SImode and DImode */
1224 2, /* cost of moving SSE register */
1225 {4, 4, 4}, /* cost of loading SSE registers
1226 in SImode, DImode and TImode */
1227 {4, 4, 4}, /* cost of storing SSE registers
1228 in SImode, DImode and TImode */
1229 2, /* MMX or SSE register to integer */
1230 16, /* size of l1 cache. */
1231 2048, /* size of l2 cache. */
1232 64, /* size of prefetch block */
1233 /* New AMD processors never drop prefetches; if they cannot be performed
1234 immediately, they are queued. We set number of simultaneous prefetches
1235 to a large constant to reflect this (it probably is not a good idea not
1236 to limit number of prefetches at all, as their execution also takes some
1237 time). */
1238 100, /* number of parallel prefetches */
1239 2, /* Branch cost */
1240 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1241 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1242 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1243 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1244 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1245 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1247 bdver3_memcpy,
1248 bdver3_memset,
1249 6, /* scalar_stmt_cost. */
1250 4, /* scalar load_cost. */
1251 4, /* scalar_store_cost. */
1252 6, /* vec_stmt_cost. */
1253 0, /* vec_to_scalar_cost. */
1254 2, /* scalar_to_vec_cost. */
1255 4, /* vec_align_load_cost. */
1256 4, /* vec_unalign_load_cost. */
1257 4, /* vec_store_cost. */
1258 4, /* cond_taken_branch_cost. */
1259 2, /* cond_not_taken_branch_cost. */
1260 };
1262 /* BDVER4 has optimized REP instruction for medium sized blocks, but for
1263 very small blocks it is better to use a loop.  For large blocks, libcall
1264 can do nontemporal accesses and beat inline considerably. */
1265 static stringop_algs bdver4_memcpy[2] = {
1266 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1267 {-1, rep_prefix_4_byte, false}}},
1268 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1269 {-1, libcall, false}}}};
1270 static stringop_algs bdver4_memset[2] = {
1271 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1272 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1273 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1274 {-1, libcall, false}}}};
1275 struct processor_costs bdver4_cost = {
1276 COSTS_N_INSNS (1), /* cost of an add instruction */
1277 COSTS_N_INSNS (1), /* cost of a lea instruction */
1278 COSTS_N_INSNS (1), /* variable shift costs */
1279 COSTS_N_INSNS (1), /* constant shift costs */
1280 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1281 COSTS_N_INSNS (4), /* HI */
1282 COSTS_N_INSNS (4), /* SI */
1283 COSTS_N_INSNS (6), /* DI */
1284 COSTS_N_INSNS (6)}, /* other */
1285 0, /* cost of multiply per each bit set */
1286 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1287 COSTS_N_INSNS (35), /* HI */
1288 COSTS_N_INSNS (51), /* SI */
1289 COSTS_N_INSNS (83), /* DI */
1290 COSTS_N_INSNS (83)}, /* other */
1291 COSTS_N_INSNS (1), /* cost of movsx */
1292 COSTS_N_INSNS (1), /* cost of movzx */
1293 8, /* "large" insn */
1294 9, /* MOVE_RATIO */
1295 4, /* cost for loading QImode using movzbl */
1296 {5, 5, 4}, /* cost of loading integer registers
1297 in QImode, HImode and SImode.
1298 Relative to reg-reg move (2). */
1299 {4, 4, 4}, /* cost of storing integer registers */
1300 2, /* cost of reg,reg fld/fst */
1301 {5, 5, 12}, /* cost of loading fp registers
1302 in SFmode, DFmode and XFmode */
1303 {4, 4, 8}, /* cost of storing fp registers
1304 in SFmode, DFmode and XFmode */
1305 2, /* cost of moving MMX register */
1306 {4, 4}, /* cost of loading MMX registers
1307 in SImode and DImode */
1308 {4, 4}, /* cost of storing MMX registers
1309 in SImode and DImode */
1310 2, /* cost of moving SSE register */
1311 {4, 4, 4}, /* cost of loading SSE registers
1312 in SImode, DImode and TImode */
1313 {4, 4, 4}, /* cost of storing SSE registers
1314 in SImode, DImode and TImode */
1315 2, /* MMX or SSE register to integer */
1316 16, /* size of l1 cache. */
1317 2048, /* size of l2 cache. */
1318 64, /* size of prefetch block */
1319 /* New AMD processors never drop prefetches; if they cannot be performed
1320 immediately, they are queued. We set number of simultaneous prefetches
1321 to a large constant to reflect this (it probably is not a good idea not
1322 to limit number of prefetches at all, as their execution also takes some
1323 time). */
1324 100, /* number of parallel prefetches */
1325 2, /* Branch cost */
1326 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1327 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1328 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1329 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1330 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1331 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1333 bdver4_memcpy,
1334 bdver4_memset,
1335 6, /* scalar_stmt_cost. */
1336 4, /* scalar load_cost. */
1337 4, /* scalar_store_cost. */
1338 6, /* vec_stmt_cost. */
1339 0, /* vec_to_scalar_cost. */
1340 2, /* scalar_to_vec_cost. */
1341 4, /* vec_align_load_cost. */
1342 4, /* vec_unalign_load_cost. */
1343 4, /* vec_store_cost. */
1344 4, /* cond_taken_branch_cost. */
1345 2, /* cond_not_taken_branch_cost. */
1346 };
1349 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1350 very small blocks it is better to use a loop.  For large blocks, libcall
1351 can do nontemporal accesses and beat inline considerably. */
1352 static stringop_algs znver1_memcpy[2] = {
1353 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1354 {-1, rep_prefix_4_byte, false}}},
1355 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1356 {-1, libcall, false}}}};
1357 static stringop_algs znver1_memset[2] = {
1358 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1359 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1360 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1361 {-1, libcall, false}}}};
1362 struct processor_costs znver1_cost = {
1363 COSTS_N_INSNS (1), /* cost of an add instruction. */
1364 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1365 COSTS_N_INSNS (1), /* variable shift costs. */
1366 COSTS_N_INSNS (1), /* constant shift costs. */
1367 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1368 COSTS_N_INSNS (3), /* HI. */
1369 COSTS_N_INSNS (3), /* SI. */
1370 COSTS_N_INSNS (4), /* DI. */
1371 COSTS_N_INSNS (4)}, /* other. */
1372 0, /* cost of multiply per each bit
1373 set. */
1374 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1375 COSTS_N_INSNS (35), /* HI. */
1376 COSTS_N_INSNS (51), /* SI. */
1377 COSTS_N_INSNS (83), /* DI. */
1378 COSTS_N_INSNS (83)}, /* other. */
1379 COSTS_N_INSNS (1), /* cost of movsx. */
1380 COSTS_N_INSNS (1), /* cost of movzx. */
1381 8, /* "large" insn. */
1382 9, /* MOVE_RATIO. */
1383 4, /* cost for loading QImode using
1384 movzbl. */
1385 {5, 5, 4}, /* cost of loading integer registers
1386 in QImode, HImode and SImode.
1387 Relative to reg-reg move (2). */
1388 {4, 4, 4}, /* cost of storing integer
1389 registers. */
1390 2, /* cost of reg,reg fld/fst. */
1391 {5, 5, 12}, /* cost of loading fp registers
1392 in SFmode, DFmode and XFmode. */
1393 {4, 4, 8}, /* cost of storing fp registers
1394 in SFmode, DFmode and XFmode. */
1395 2, /* cost of moving MMX register. */
1396 {4, 4}, /* cost of loading MMX registers
1397 in SImode and DImode. */
1398 {4, 4}, /* cost of storing MMX registers
1399 in SImode and DImode. */
1400 2, /* cost of moving SSE register. */
1401 {4, 4, 4}, /* cost of loading SSE registers
1402 in SImode, DImode and TImode. */
1403 {4, 4, 4}, /* cost of storing SSE registers
1404 in SImode, DImode and TImode. */
1405 2, /* MMX or SSE register to integer. */
1406 32, /* size of l1 cache. */
1407 512, /* size of l2 cache. */
1408 64, /* size of prefetch block. */
1409 /* New AMD processors never drop prefetches; if they cannot be performed
1410 immediately, they are queued. We set number of simultaneous prefetches
1411 to a large constant to reflect this (it probably is not a good idea not
1412 to limit number of prefetches at all, as their execution also takes some
1413 time). */
1414 100, /* number of parallel prefetches. */
1415 2, /* Branch cost. */
1416 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1417 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1418 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1421 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1423 znver1_memcpy,
1424 znver1_memset,
1425 6, /* scalar_stmt_cost. */
1426 4, /* scalar load_cost. */
1427 4, /* scalar_store_cost. */
1428 6, /* vec_stmt_cost. */
1429 0, /* vec_to_scalar_cost. */
1430 2, /* scalar_to_vec_cost. */
1431 4, /* vec_align_load_cost. */
1432 4, /* vec_unalign_load_cost. */
1433 4, /* vec_store_cost. */
1434 4, /* cond_taken_branch_cost. */
1435 2, /* cond_not_taken_branch_cost. */
1438 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1439 very small blocks it is better to use a loop. For large blocks, a libcall
1440 can do nontemporal accesses and beat inline code considerably. */
1441 static stringop_algs btver1_memcpy[2] = {
1442 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1443 {-1, rep_prefix_4_byte, false}}},
1444 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1445 {-1, libcall, false}}}};
1446 static stringop_algs btver1_memset[2] = {
1447 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1448 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1449 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1450 {-1, libcall, false}}}};
1451 const struct processor_costs btver1_cost = {
1452 COSTS_N_INSNS (1), /* cost of an add instruction */
1453 COSTS_N_INSNS (2), /* cost of a lea instruction */
1454 COSTS_N_INSNS (1), /* variable shift costs */
1455 COSTS_N_INSNS (1), /* constant shift costs */
1456 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1457 COSTS_N_INSNS (4), /* HI */
1458 COSTS_N_INSNS (3), /* SI */
1459 COSTS_N_INSNS (4), /* DI */
1460 COSTS_N_INSNS (5)}, /* other */
1461 0, /* cost of multiply per each bit set */
1462 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1463 COSTS_N_INSNS (35), /* HI */
1464 COSTS_N_INSNS (51), /* SI */
1465 COSTS_N_INSNS (83), /* DI */
1466 COSTS_N_INSNS (83)}, /* other */
1467 COSTS_N_INSNS (1), /* cost of movsx */
1468 COSTS_N_INSNS (1), /* cost of movzx */
1469 8, /* "large" insn */
1470 9, /* MOVE_RATIO */
1471 4, /* cost for loading QImode using movzbl */
1472 {3, 4, 3}, /* cost of loading integer registers
1473 in QImode, HImode and SImode.
1474 Relative to reg-reg move (2). */
1475 {3, 4, 3}, /* cost of storing integer registers */
1476 4, /* cost of reg,reg fld/fst */
1477 {4, 4, 12}, /* cost of loading fp registers
1478 in SFmode, DFmode and XFmode */
1479 {6, 6, 8}, /* cost of storing fp registers
1480 in SFmode, DFmode and XFmode */
1481 2, /* cost of moving MMX register */
1482 {3, 3}, /* cost of loading MMX registers
1483 in SImode and DImode */
1484 {4, 4}, /* cost of storing MMX registers
1485 in SImode and DImode */
1486 2, /* cost of moving SSE register */
1487 {4, 4, 3}, /* cost of loading SSE registers
1488 in SImode, DImode and TImode */
1489 {4, 4, 5}, /* cost of storing SSE registers
1490 in SImode, DImode and TImode */
1491 3, /* MMX or SSE register to integer */
1492 /* On K8:
1493 MOVD reg64, xmmreg Double FSTORE 4
1494 MOVD reg32, xmmreg Double FSTORE 4
1495 On AMDFAM10:
1496 MOVD reg64, xmmreg Double FADD 3
1497 1/1 1/1
1498 MOVD reg32, xmmreg Double FADD 3
1499 1/1 1/1 */
1500 32, /* size of l1 cache. */
1501 512, /* size of l2 cache. */
1502 64, /* size of prefetch block */
1503 100, /* number of parallel prefetches */
1504 2, /* Branch cost */
1505 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1506 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1507 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1508 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1509 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1510 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1512 btver1_memcpy,
1513 btver1_memset,
1514 4, /* scalar_stmt_cost. */
1515 2, /* scalar load_cost. */
1516 2, /* scalar_store_cost. */
1517 6, /* vec_stmt_cost. */
1518 0, /* vec_to_scalar_cost. */
1519 2, /* scalar_to_vec_cost. */
1520 2, /* vec_align_load_cost. */
1521 2, /* vec_unalign_load_cost. */
1522 2, /* vec_store_cost. */
1523 2, /* cond_taken_branch_cost. */
1524 1, /* cond_not_taken_branch_cost. */
1527 static stringop_algs btver2_memcpy[2] = {
1528 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1529 {-1, rep_prefix_4_byte, false}}},
1530 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1531 {-1, libcall, false}}}};
1532 static stringop_algs btver2_memset[2] = {
1533 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1534 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1535 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1536 {-1, libcall, false}}}};
1537 const struct processor_costs btver2_cost = {
1538 COSTS_N_INSNS (1), /* cost of an add instruction */
1539 COSTS_N_INSNS (2), /* cost of a lea instruction */
1540 COSTS_N_INSNS (1), /* variable shift costs */
1541 COSTS_N_INSNS (1), /* constant shift costs */
1542 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1543 COSTS_N_INSNS (4), /* HI */
1544 COSTS_N_INSNS (3), /* SI */
1545 COSTS_N_INSNS (4), /* DI */
1546 COSTS_N_INSNS (5)}, /* other */
1547 0, /* cost of multiply per each bit set */
1548 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1549 COSTS_N_INSNS (35), /* HI */
1550 COSTS_N_INSNS (51), /* SI */
1551 COSTS_N_INSNS (83), /* DI */
1552 COSTS_N_INSNS (83)}, /* other */
1553 COSTS_N_INSNS (1), /* cost of movsx */
1554 COSTS_N_INSNS (1), /* cost of movzx */
1555 8, /* "large" insn */
1556 9, /* MOVE_RATIO */
1557 4, /* cost for loading QImode using movzbl */
1558 {3, 4, 3}, /* cost of loading integer registers
1559 in QImode, HImode and SImode.
1560 Relative to reg-reg move (2). */
1561 {3, 4, 3}, /* cost of storing integer registers */
1562 4, /* cost of reg,reg fld/fst */
1563 {4, 4, 12}, /* cost of loading fp registers
1564 in SFmode, DFmode and XFmode */
1565 {6, 6, 8}, /* cost of storing fp registers
1566 in SFmode, DFmode and XFmode */
1567 2, /* cost of moving MMX register */
1568 {3, 3}, /* cost of loading MMX registers
1569 in SImode and DImode */
1570 {4, 4}, /* cost of storing MMX registers
1571 in SImode and DImode */
1572 2, /* cost of moving SSE register */
1573 {4, 4, 3}, /* cost of loading SSE registers
1574 in SImode, DImode and TImode */
1575 {4, 4, 5}, /* cost of storing SSE registers
1576 in SImode, DImode and TImode */
1577 3, /* MMX or SSE register to integer */
1578 /* On K8:
1579 MOVD reg64, xmmreg Double FSTORE 4
1580 MOVD reg32, xmmreg Double FSTORE 4
1581 On AMDFAM10:
1582 MOVD reg64, xmmreg Double FADD 3
1583 1/1 1/1
1584 MOVD reg32, xmmreg Double FADD 3
1585 1/1 1/1 */
1586 32, /* size of l1 cache. */
1587 2048, /* size of l2 cache. */
1588 64, /* size of prefetch block */
1589 100, /* number of parallel prefetches */
1590 2, /* Branch cost */
1591 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1592 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1593 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1594 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1595 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1596 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1597 btver2_memcpy,
1598 btver2_memset,
1599 4, /* scalar_stmt_cost. */
1600 2, /* scalar load_cost. */
1601 2, /* scalar_store_cost. */
1602 6, /* vec_stmt_cost. */
1603 0, /* vec_to_scalar_cost. */
1604 2, /* scalar_to_vec_cost. */
1605 2, /* vec_align_load_cost. */
1606 2, /* vec_unalign_load_cost. */
1607 2, /* vec_store_cost. */
1608 2, /* cond_taken_branch_cost. */
1609 1, /* cond_not_taken_branch_cost. */
1612 static stringop_algs pentium4_memcpy[2] = {
1613 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1614 DUMMY_STRINGOP_ALGS};
1615 static stringop_algs pentium4_memset[2] = {
1616 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1617 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1618 DUMMY_STRINGOP_ALGS};
1620 static const
1621 struct processor_costs pentium4_cost = {
1622 COSTS_N_INSNS (1), /* cost of an add instruction */
1623 COSTS_N_INSNS (3), /* cost of a lea instruction */
1624 COSTS_N_INSNS (4), /* variable shift costs */
1625 COSTS_N_INSNS (4), /* constant shift costs */
1626 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1627 COSTS_N_INSNS (15), /* HI */
1628 COSTS_N_INSNS (15), /* SI */
1629 COSTS_N_INSNS (15), /* DI */
1630 COSTS_N_INSNS (15)}, /* other */
1631 0, /* cost of multiply per each bit set */
1632 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1633 COSTS_N_INSNS (56), /* HI */
1634 COSTS_N_INSNS (56), /* SI */
1635 COSTS_N_INSNS (56), /* DI */
1636 COSTS_N_INSNS (56)}, /* other */
1637 COSTS_N_INSNS (1), /* cost of movsx */
1638 COSTS_N_INSNS (1), /* cost of movzx */
1639 16, /* "large" insn */
1640 6, /* MOVE_RATIO */
1641 2, /* cost for loading QImode using movzbl */
1642 {4, 5, 4}, /* cost of loading integer registers
1643 in QImode, HImode and SImode.
1644 Relative to reg-reg move (2). */
1645 {2, 3, 2}, /* cost of storing integer registers */
1646 2, /* cost of reg,reg fld/fst */
1647 {2, 2, 6}, /* cost of loading fp registers
1648 in SFmode, DFmode and XFmode */
1649 {4, 4, 6}, /* cost of storing fp registers
1650 in SFmode, DFmode and XFmode */
1651 2, /* cost of moving MMX register */
1652 {2, 2}, /* cost of loading MMX registers
1653 in SImode and DImode */
1654 {2, 2}, /* cost of storing MMX registers
1655 in SImode and DImode */
1656 12, /* cost of moving SSE register */
1657 {12, 12, 12}, /* cost of loading SSE registers
1658 in SImode, DImode and TImode */
1659 {2, 2, 8}, /* cost of storing SSE registers
1660 in SImode, DImode and TImode */
1661 10, /* MMX or SSE register to integer */
1662 8, /* size of l1 cache. */
1663 256, /* size of l2 cache. */
1664 64, /* size of prefetch block */
1665 6, /* number of parallel prefetches */
1666 2, /* Branch cost */
1667 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1668 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1669 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1670 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1671 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1672 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1673 pentium4_memcpy,
1674 pentium4_memset,
1675 1, /* scalar_stmt_cost. */
1676 1, /* scalar load_cost. */
1677 1, /* scalar_store_cost. */
1678 1, /* vec_stmt_cost. */
1679 1, /* vec_to_scalar_cost. */
1680 1, /* scalar_to_vec_cost. */
1681 1, /* vec_align_load_cost. */
1682 2, /* vec_unalign_load_cost. */
1683 1, /* vec_store_cost. */
1684 3, /* cond_taken_branch_cost. */
1685 1, /* cond_not_taken_branch_cost. */
1688 static stringop_algs nocona_memcpy[2] = {
1689 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1690 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1691 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1693 static stringop_algs nocona_memset[2] = {
1694 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1695 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1696 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1697 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1699 static const
1700 struct processor_costs nocona_cost = {
1701 COSTS_N_INSNS (1), /* cost of an add instruction */
1702 COSTS_N_INSNS (1), /* cost of a lea instruction */
1703 COSTS_N_INSNS (1), /* variable shift costs */
1704 COSTS_N_INSNS (1), /* constant shift costs */
1705 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1706 COSTS_N_INSNS (10), /* HI */
1707 COSTS_N_INSNS (10), /* SI */
1708 COSTS_N_INSNS (10), /* DI */
1709 COSTS_N_INSNS (10)}, /* other */
1710 0, /* cost of multiply per each bit set */
1711 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1712 COSTS_N_INSNS (66), /* HI */
1713 COSTS_N_INSNS (66), /* SI */
1714 COSTS_N_INSNS (66), /* DI */
1715 COSTS_N_INSNS (66)}, /* other */
1716 COSTS_N_INSNS (1), /* cost of movsx */
1717 COSTS_N_INSNS (1), /* cost of movzx */
1718 16, /* "large" insn */
1719 17, /* MOVE_RATIO */
1720 4, /* cost for loading QImode using movzbl */
1721 {4, 4, 4}, /* cost of loading integer registers
1722 in QImode, HImode and SImode.
1723 Relative to reg-reg move (2). */
1724 {4, 4, 4}, /* cost of storing integer registers */
1725 3, /* cost of reg,reg fld/fst */
1726 {12, 12, 12}, /* cost of loading fp registers
1727 in SFmode, DFmode and XFmode */
1728 {4, 4, 4}, /* cost of storing fp registers
1729 in SFmode, DFmode and XFmode */
1730 6, /* cost of moving MMX register */
1731 {12, 12}, /* cost of loading MMX registers
1732 in SImode and DImode */
1733 {12, 12}, /* cost of storing MMX registers
1734 in SImode and DImode */
1735 6, /* cost of moving SSE register */
1736 {12, 12, 12}, /* cost of loading SSE registers
1737 in SImode, DImode and TImode */
1738 {12, 12, 12}, /* cost of storing SSE registers
1739 in SImode, DImode and TImode */
1740 8, /* MMX or SSE register to integer */
1741 8, /* size of l1 cache. */
1742 1024, /* size of l2 cache. */
1743 64, /* size of prefetch block */
1744 8, /* number of parallel prefetches */
1745 1, /* Branch cost */
1746 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1747 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1748 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1749 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1750 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1751 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1752 nocona_memcpy,
1753 nocona_memset,
1754 1, /* scalar_stmt_cost. */
1755 1, /* scalar load_cost. */
1756 1, /* scalar_store_cost. */
1757 1, /* vec_stmt_cost. */
1758 1, /* vec_to_scalar_cost. */
1759 1, /* scalar_to_vec_cost. */
1760 1, /* vec_align_load_cost. */
1761 2, /* vec_unalign_load_cost. */
1762 1, /* vec_store_cost. */
1763 3, /* cond_taken_branch_cost. */
1764 1, /* cond_not_taken_branch_cost. */
1767 static stringop_algs atom_memcpy[2] = {
1768 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1769 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1770 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1771 static stringop_algs atom_memset[2] = {
1772 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1773 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1774 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1775 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1776 static const
1777 struct processor_costs atom_cost = {
1778 COSTS_N_INSNS (1), /* cost of an add instruction */
1779 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1780 COSTS_N_INSNS (1), /* variable shift costs */
1781 COSTS_N_INSNS (1), /* constant shift costs */
1782 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1783 COSTS_N_INSNS (4), /* HI */
1784 COSTS_N_INSNS (3), /* SI */
1785 COSTS_N_INSNS (4), /* DI */
1786 COSTS_N_INSNS (2)}, /* other */
1787 0, /* cost of multiply per each bit set */
1788 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1789 COSTS_N_INSNS (26), /* HI */
1790 COSTS_N_INSNS (42), /* SI */
1791 COSTS_N_INSNS (74), /* DI */
1792 COSTS_N_INSNS (74)}, /* other */
1793 COSTS_N_INSNS (1), /* cost of movsx */
1794 COSTS_N_INSNS (1), /* cost of movzx */
1795 8, /* "large" insn */
1796 17, /* MOVE_RATIO */
1797 4, /* cost for loading QImode using movzbl */
1798 {4, 4, 4}, /* cost of loading integer registers
1799 in QImode, HImode and SImode.
1800 Relative to reg-reg move (2). */
1801 {4, 4, 4}, /* cost of storing integer registers */
1802 4, /* cost of reg,reg fld/fst */
1803 {12, 12, 12}, /* cost of loading fp registers
1804 in SFmode, DFmode and XFmode */
1805 {6, 6, 8}, /* cost of storing fp registers
1806 in SFmode, DFmode and XFmode */
1807 2, /* cost of moving MMX register */
1808 {8, 8}, /* cost of loading MMX registers
1809 in SImode and DImode */
1810 {8, 8}, /* cost of storing MMX registers
1811 in SImode and DImode */
1812 2, /* cost of moving SSE register */
1813 {8, 8, 8}, /* cost of loading SSE registers
1814 in SImode, DImode and TImode */
1815 {8, 8, 8}, /* cost of storing SSE registers
1816 in SImode, DImode and TImode */
1817 5, /* MMX or SSE register to integer */
1818 32, /* size of l1 cache. */
1819 256, /* size of l2 cache. */
1820 64, /* size of prefetch block */
1821 6, /* number of parallel prefetches */
1822 3, /* Branch cost */
1823 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1824 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1825 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1826 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1827 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1828 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1829 atom_memcpy,
1830 atom_memset,
1831 1, /* scalar_stmt_cost. */
1832 1, /* scalar load_cost. */
1833 1, /* scalar_store_cost. */
1834 1, /* vec_stmt_cost. */
1835 1, /* vec_to_scalar_cost. */
1836 1, /* scalar_to_vec_cost. */
1837 1, /* vec_align_load_cost. */
1838 2, /* vec_unalign_load_cost. */
1839 1, /* vec_store_cost. */
1840 3, /* cond_taken_branch_cost. */
1841 1, /* cond_not_taken_branch_cost. */
1844 static stringop_algs slm_memcpy[2] = {
1845 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1846 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1847 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1848 static stringop_algs slm_memset[2] = {
1849 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1850 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1851 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1852 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1853 static const
1854 struct processor_costs slm_cost = {
1855 COSTS_N_INSNS (1), /* cost of an add instruction */
1856 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1857 COSTS_N_INSNS (1), /* variable shift costs */
1858 COSTS_N_INSNS (1), /* constant shift costs */
1859 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1860 COSTS_N_INSNS (3), /* HI */
1861 COSTS_N_INSNS (3), /* SI */
1862 COSTS_N_INSNS (4), /* DI */
1863 COSTS_N_INSNS (2)}, /* other */
1864 0, /* cost of multiply per each bit set */
1865 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1866 COSTS_N_INSNS (26), /* HI */
1867 COSTS_N_INSNS (42), /* SI */
1868 COSTS_N_INSNS (74), /* DI */
1869 COSTS_N_INSNS (74)}, /* other */
1870 COSTS_N_INSNS (1), /* cost of movsx */
1871 COSTS_N_INSNS (1), /* cost of movzx */
1872 8, /* "large" insn */
1873 17, /* MOVE_RATIO */
1874 4, /* cost for loading QImode using movzbl */
1875 {4, 4, 4}, /* cost of loading integer registers
1876 in QImode, HImode and SImode.
1877 Relative to reg-reg move (2). */
1878 {4, 4, 4}, /* cost of storing integer registers */
1879 4, /* cost of reg,reg fld/fst */
1880 {12, 12, 12}, /* cost of loading fp registers
1881 in SFmode, DFmode and XFmode */
1882 {6, 6, 8}, /* cost of storing fp registers
1883 in SFmode, DFmode and XFmode */
1884 2, /* cost of moving MMX register */
1885 {8, 8}, /* cost of loading MMX registers
1886 in SImode and DImode */
1887 {8, 8}, /* cost of storing MMX registers
1888 in SImode and DImode */
1889 2, /* cost of moving SSE register */
1890 {8, 8, 8}, /* cost of loading SSE registers
1891 in SImode, DImode and TImode */
1892 {8, 8, 8}, /* cost of storing SSE registers
1893 in SImode, DImode and TImode */
1894 5, /* MMX or SSE register to integer */
1895 32, /* size of l1 cache. */
1896 256, /* size of l2 cache. */
1897 64, /* size of prefetch block */
1898 6, /* number of parallel prefetches */
1899 3, /* Branch cost */
1900 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1901 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1902 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1903 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1904 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1905 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1906 slm_memcpy,
1907 slm_memset,
1908 1, /* scalar_stmt_cost. */
1909 1, /* scalar load_cost. */
1910 1, /* scalar_store_cost. */
1911 1, /* vec_stmt_cost. */
1912 4, /* vec_to_scalar_cost. */
1913 1, /* scalar_to_vec_cost. */
1914 1, /* vec_align_load_cost. */
1915 2, /* vec_unalign_load_cost. */
1916 1, /* vec_store_cost. */
1917 3, /* cond_taken_branch_cost. */
1918 1, /* cond_not_taken_branch_cost. */
1921 static stringop_algs intel_memcpy[2] = {
1922 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1923 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1924 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1925 static stringop_algs intel_memset[2] = {
1926 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1927 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1928 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1929 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1930 static const
1931 struct processor_costs intel_cost = {
1932 COSTS_N_INSNS (1), /* cost of an add instruction */
1933 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1934 COSTS_N_INSNS (1), /* variable shift costs */
1935 COSTS_N_INSNS (1), /* constant shift costs */
1936 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1937 COSTS_N_INSNS (3), /* HI */
1938 COSTS_N_INSNS (3), /* SI */
1939 COSTS_N_INSNS (4), /* DI */
1940 COSTS_N_INSNS (2)}, /* other */
1941 0, /* cost of multiply per each bit set */
1942 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1943 COSTS_N_INSNS (26), /* HI */
1944 COSTS_N_INSNS (42), /* SI */
1945 COSTS_N_INSNS (74), /* DI */
1946 COSTS_N_INSNS (74)}, /* other */
1947 COSTS_N_INSNS (1), /* cost of movsx */
1948 COSTS_N_INSNS (1), /* cost of movzx */
1949 8, /* "large" insn */
1950 17, /* MOVE_RATIO */
1951 4, /* cost for loading QImode using movzbl */
1952 {4, 4, 4}, /* cost of loading integer registers
1953 in QImode, HImode and SImode.
1954 Relative to reg-reg move (2). */
1955 {4, 4, 4}, /* cost of storing integer registers */
1956 4, /* cost of reg,reg fld/fst */
1957 {12, 12, 12}, /* cost of loading fp registers
1958 in SFmode, DFmode and XFmode */
1959 {6, 6, 8}, /* cost of storing fp registers
1960 in SFmode, DFmode and XFmode */
1961 2, /* cost of moving MMX register */
1962 {8, 8}, /* cost of loading MMX registers
1963 in SImode and DImode */
1964 {8, 8}, /* cost of storing MMX registers
1965 in SImode and DImode */
1966 2, /* cost of moving SSE register */
1967 {8, 8, 8}, /* cost of loading SSE registers
1968 in SImode, DImode and TImode */
1969 {8, 8, 8}, /* cost of storing SSE registers
1970 in SImode, DImode and TImode */
1971 5, /* MMX or SSE register to integer */
1972 32, /* size of l1 cache. */
1973 256, /* size of l2 cache. */
1974 64, /* size of prefetch block */
1975 6, /* number of parallel prefetches */
1976 3, /* Branch cost */
1977 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1978 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1979 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1980 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1981 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1982 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1983 intel_memcpy,
1984 intel_memset,
1985 1, /* scalar_stmt_cost. */
1986 1, /* scalar load_cost. */
1987 1, /* scalar_store_cost. */
1988 1, /* vec_stmt_cost. */
1989 4, /* vec_to_scalar_cost. */
1990 1, /* scalar_to_vec_cost. */
1991 1, /* vec_align_load_cost. */
1992 2, /* vec_unalign_load_cost. */
1993 1, /* vec_store_cost. */
1994 3, /* cond_taken_branch_cost. */
1995 1, /* cond_not_taken_branch_cost. */
1998 /* Generic should produce code tuned for Core-i7 (and newer chips)
1999 and btver1 (and newer chips). */
2001 static stringop_algs generic_memcpy[2] = {
2002 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2003 {-1, libcall, false}}},
2004 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2005 {-1, libcall, false}}}};
2006 static stringop_algs generic_memset[2] = {
2007 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2008 {-1, libcall, false}}},
2009 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2010 {-1, libcall, false}}}};
2011 static const
2012 struct processor_costs generic_cost = {
2013 COSTS_N_INSNS (1), /* cost of an add instruction */
2014 /* On all chips taken into consideration, lea is 2 cycles or more. With
2015 this cost, however, our current implementation of synth_mult results in
2016 the use of unnecessary temporary registers, causing regressions on several
2017 SPECfp benchmarks. */
2018 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2019 COSTS_N_INSNS (1), /* variable shift costs */
2020 COSTS_N_INSNS (1), /* constant shift costs */
2021 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2022 COSTS_N_INSNS (4), /* HI */
2023 COSTS_N_INSNS (3), /* SI */
2024 COSTS_N_INSNS (4), /* DI */
2025 COSTS_N_INSNS (2)}, /* other */
2026 0, /* cost of multiply per each bit set */
2027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2028 COSTS_N_INSNS (26), /* HI */
2029 COSTS_N_INSNS (42), /* SI */
2030 COSTS_N_INSNS (74), /* DI */
2031 COSTS_N_INSNS (74)}, /* other */
2032 COSTS_N_INSNS (1), /* cost of movsx */
2033 COSTS_N_INSNS (1), /* cost of movzx */
2034 8, /* "large" insn */
2035 17, /* MOVE_RATIO */
2036 4, /* cost for loading QImode using movzbl */
2037 {4, 4, 4}, /* cost of loading integer registers
2038 in QImode, HImode and SImode.
2039 Relative to reg-reg move (2). */
2040 {4, 4, 4}, /* cost of storing integer registers */
2041 4, /* cost of reg,reg fld/fst */
2042 {12, 12, 12}, /* cost of loading fp registers
2043 in SFmode, DFmode and XFmode */
2044 {6, 6, 8}, /* cost of storing fp registers
2045 in SFmode, DFmode and XFmode */
2046 2, /* cost of moving MMX register */
2047 {8, 8}, /* cost of loading MMX registers
2048 in SImode and DImode */
2049 {8, 8}, /* cost of storing MMX registers
2050 in SImode and DImode */
2051 2, /* cost of moving SSE register */
2052 {8, 8, 8}, /* cost of loading SSE registers
2053 in SImode, DImode and TImode */
2054 {8, 8, 8}, /* cost of storing SSE registers
2055 in SImode, DImode and TImode */
2056 5, /* MMX or SSE register to integer */
2057 32, /* size of l1 cache. */
2058 512, /* size of l2 cache. */
2059 64, /* size of prefetch block */
2060 6, /* number of parallel prefetches */
2061 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2062 value is increased to the perhaps more appropriate value of 5. */
2063 3, /* Branch cost */
2064 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2065 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2066 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2067 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2068 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2069 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2070 generic_memcpy,
2071 generic_memset,
2072 1, /* scalar_stmt_cost. */
2073 1, /* scalar load_cost. */
2074 1, /* scalar_store_cost. */
2075 1, /* vec_stmt_cost. */
2076 1, /* vec_to_scalar_cost. */
2077 1, /* scalar_to_vec_cost. */
2078 1, /* vec_align_load_cost. */
2079 2, /* vec_unalign_load_cost. */
2080 1, /* vec_store_cost. */
2081 3, /* cond_taken_branch_cost. */
2082 1, /* cond_not_taken_branch_cost. */
2085 /* core_cost should produce code tuned for the Core family of CPUs. */
2086 static stringop_algs core_memcpy[2] = {
2087 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2088 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2089 {-1, libcall, false}}}};
2090 static stringop_algs core_memset[2] = {
2091 {libcall, {{6, loop_1_byte, true},
2092 {24, loop, true},
2093 {8192, rep_prefix_4_byte, true},
2094 {-1, libcall, false}}},
2095 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2096 {-1, libcall, false}}}};
2098 static const
2099 struct processor_costs core_cost = {
2100 COSTS_N_INSNS (1), /* cost of an add instruction */
2101 /* On all chips taken into consideration, lea is 2 cycles or more. With
2102 this cost, however, our current implementation of synth_mult results in
2103 the use of unnecessary temporary registers, causing regressions on several
2104 SPECfp benchmarks. */
2105 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2106 COSTS_N_INSNS (1), /* variable shift costs */
2107 COSTS_N_INSNS (1), /* constant shift costs */
2108 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2109 COSTS_N_INSNS (4), /* HI */
2110 COSTS_N_INSNS (3), /* SI */
2111 COSTS_N_INSNS (4), /* DI */
2112 COSTS_N_INSNS (2)}, /* other */
2113 0, /* cost of multiply per each bit set */
2114 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2115 COSTS_N_INSNS (26), /* HI */
2116 COSTS_N_INSNS (42), /* SI */
2117 COSTS_N_INSNS (74), /* DI */
2118 COSTS_N_INSNS (74)}, /* other */
2119 COSTS_N_INSNS (1), /* cost of movsx */
2120 COSTS_N_INSNS (1), /* cost of movzx */
2121 8, /* "large" insn */
2122 17, /* MOVE_RATIO */
2123 4, /* cost for loading QImode using movzbl */
2124 {4, 4, 4}, /* cost of loading integer registers
2125 in QImode, HImode and SImode.
2126 Relative to reg-reg move (2). */
2127 {4, 4, 4}, /* cost of storing integer registers */
2128 4, /* cost of reg,reg fld/fst */
2129 {12, 12, 12}, /* cost of loading fp registers
2130 in SFmode, DFmode and XFmode */
2131 {6, 6, 8}, /* cost of storing fp registers
2132 in SFmode, DFmode and XFmode */
2133 2, /* cost of moving MMX register */
2134 {8, 8}, /* cost of loading MMX registers
2135 in SImode and DImode */
2136 {8, 8}, /* cost of storing MMX registers
2137 in SImode and DImode */
2138 2, /* cost of moving SSE register */
2139 {8, 8, 8}, /* cost of loading SSE registers
2140 in SImode, DImode and TImode */
2141 {8, 8, 8}, /* cost of storing SSE registers
2142 in SImode, DImode and TImode */
2143 5, /* MMX or SSE register to integer */
2144 64, /* size of l1 cache. */
2145 512, /* size of l2 cache. */
2146 64, /* size of prefetch block */
2147 6, /* number of parallel prefetches */
2148 /* FIXME: perhaps a more appropriate value is 5. */
2149 3, /* Branch cost */
2150 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2151 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2152 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2153 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2154 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2155 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2156 core_memcpy,
2157 core_memset,
2158 1, /* scalar_stmt_cost. */
2159 1, /* scalar load_cost. */
2160 1, /* scalar_store_cost. */
2161 1, /* vec_stmt_cost. */
2162 1, /* vec_to_scalar_cost. */
2163 1, /* scalar_to_vec_cost. */
2164 1, /* vec_align_load_cost. */
2165 2, /* vec_unalign_load_cost. */
2166 1, /* vec_store_cost. */
2167 3, /* cond_taken_branch_cost. */
2168 1, /* cond_not_taken_branch_cost. */
2172 /* Set by -mtune. */
2173 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2175 /* Set by -mtune or -Os. */
2176 const struct processor_costs *ix86_cost = &pentium_cost;
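/* Illustrative note added here, not part of the original file: conceptually,
   option handling points these at the table selected by -mtune (see
   processor_target_table further below, whose "cost" field holds the
   per-processor table) and, when optimizing for size, at a size-oriented
   table instead, roughly:

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     ix86_cost = optimize_size ? size_oriented_costs : ix86_tune_cost;

   "size_oriented_costs" is a stand-in name, not the real symbol; the actual
   assignments live in the option-override code.  */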
2178 /* Processor feature/optimization bitmasks. */
2179 #define m_386 (1U<<PROCESSOR_I386)
2180 #define m_486 (1U<<PROCESSOR_I486)
2181 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2182 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2183 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2184 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2185 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2186 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2187 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2188 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2189 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2190 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2191 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2192 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2193 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2194 #define m_KNL (1U<<PROCESSOR_KNL)
2195 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2196 #define m_INTEL (1U<<PROCESSOR_INTEL)
2198 #define m_GEODE (1U<<PROCESSOR_GEODE)
2199 #define m_K6 (1U<<PROCESSOR_K6)
2200 #define m_K6_GEODE (m_K6 | m_GEODE)
2201 #define m_K8 (1U<<PROCESSOR_K8)
2202 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2203 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2204 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2205 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2206 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2207 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2208 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2209 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2210 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2211 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2212 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2213 #define m_BTVER (m_BTVER1 | m_BTVER2)
2214 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2215 | m_ZNVER1)
2217 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2219 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2220 #undef DEF_TUNE
2221 #define DEF_TUNE(tune, name, selector) name,
2222 #include "x86-tune.def"
2223 #undef DEF_TUNE
2226 /* Feature tests against the various tunings. */
2227 unsigned char ix86_tune_features[X86_TUNE_LAST];
2229 /* Feature tests against the various tunings used to create ix86_tune_features
2230 based on the processor mask. */
2231 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2232 #undef DEF_TUNE
2233 #define DEF_TUNE(tune, name, selector) selector,
2234 #include "x86-tune.def"
2235 #undef DEF_TUNE
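/* Illustrative sketch added here, not part of the original file: since each
   m_* mask above is (1U << PROCESSOR_*), ix86_tune_features can be derived
   from the selector masks by testing the bit of the active tuning, roughly:

     unsigned int tune_mask = 1U << ix86_tune;
     for (unsigned int i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & tune_mask);

   The local name tune_mask is invented here; the real derivation lives in
   the option-override code.  ix86_arch_features below is filled the same way
   from initial_ix86_arch_features and ix86_arch.  */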
2238 /* Feature tests against the various architecture variations. */
2239 unsigned char ix86_arch_features[X86_ARCH_LAST];
2241 /* Feature tests against the various architecture variations, used to create
2242 ix86_arch_features based on the processor mask. */
2243 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2244 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2245 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2247 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2248 ~m_386,
2250 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2251 ~(m_386 | m_486),
2253 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2254 ~m_386,
2256 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2257 ~m_386,
2260 /* If the average insn count for a single function invocation is
2261 lower than this constant, emit fast (but longer) prologue and
2262 epilogue code. */
2263 #define FAST_PROLOGUE_INSN_COUNT 20
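/* Illustrative sketch added here, not part of the original file: the
   constant above is meant to be compared against an estimate of the
   function's insn count, along the lines of

     if (insn_count < FAST_PROLOGUE_INSN_COUNT)
       prefer the fast (but longer) prologue/epilogue sequence;

   "insn_count" is a placeholder name; the actual check lives in the
   prologue/epilogue layout code.  */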
2265 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2266 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2267 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2268 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2270 /* Array of the smallest class containing reg number REGNO, indexed by
2271 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2273 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2275 /* ax, dx, cx, bx */
2276 AREG, DREG, CREG, BREG,
2277 /* si, di, bp, sp */
2278 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2279 /* FP registers */
2280 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2281 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2282 /* arg pointer */
2283 NON_Q_REGS,
2284 /* flags, fpsr, fpcr, frame */
2285 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2286 /* SSE registers */
2287 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2288 SSE_REGS, SSE_REGS,
2289 /* MMX registers */
2290 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2291 MMX_REGS, MMX_REGS,
2292 /* REX registers */
2293 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2294 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2295 /* SSE REX registers */
2296 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2297 SSE_REGS, SSE_REGS,
2298 /* AVX-512 SSE registers */
2299 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2300 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2301 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2302 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2303 /* Mask registers. */
2304 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2305 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2306 /* MPX bound registers */
2307 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2310 /* The "default" register map used in 32bit mode. */
2312 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2314 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2315 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2316 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2317 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2318 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2319 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2320 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2321 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2322 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2323 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2324 101, 102, 103, 104, /* bound registers */
2327 /* The "default" register map used in 64bit mode. */
2329 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2331 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2332 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2333 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2334 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2335 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2336 8,9,10,11,12,13,14,15, /* extended integer registers */
2337 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2338 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2339 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2340 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2341 126, 127, 128, 129, /* bound registers */
2344 /* Define the register numbers to be used in Dwarf debugging information.
2345 The SVR4 reference port C compiler uses the following register numbers
2346 in its Dwarf output code:
2347 0 for %eax (gcc regno = 0)
2348 1 for %ecx (gcc regno = 2)
2349 2 for %edx (gcc regno = 1)
2350 3 for %ebx (gcc regno = 3)
2351 4 for %esp (gcc regno = 7)
2352 5 for %ebp (gcc regno = 6)
2353 6 for %esi (gcc regno = 4)
2354 7 for %edi (gcc regno = 5)
2355 The following three DWARF register numbers are never generated by
2356 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2357 believes these numbers have these meanings.
2358 8 for %eip (no gcc equivalent)
2359 9 for %eflags (gcc regno = 17)
2360 10 for %trapno (no gcc equivalent)
2361 It is not at all clear how we should number the FP stack registers
2362 for the x86 architecture. If the version of SDB on x86/svr4 were
2363 a bit less brain dead with respect to floating-point then we would
2364 have a precedent to follow with respect to DWARF register numbers
2365 for x86 FP registers, but the SDB on x86/svr4 is so completely
2366 broken with respect to FP registers that it is hardly worth thinking
2367 of it as something to strive for compatibility with.
2368 The version of x86/svr4 SDB I have at the moment does (partially)
2369 seem to believe that DWARF register number 11 is associated with
2370 the x86 register %st(0), but that's about all. Higher DWARF
2371 register numbers don't seem to be associated with anything in
2372 particular, and even for DWARF regno 11, SDB only seems to under-
2373 stand that it should say that a variable lives in %st(0) (when
2374 asked via an `=' command) if we said it was in DWARF regno 11,
2375 but SDB still prints garbage when asked for the value of the
2376 variable in question (via a `/' command).
2377 (Also note that the labels SDB prints for various FP stack regs
2378 when doing an `x' command are all wrong.)
2379 Note that these problems generally don't affect the native SVR4
2380 C compiler because it doesn't allow the use of -O with -g and
2381 because when it is *not* optimizing, it allocates a memory
2382 location for each floating-point variable, and the memory
2383 location is what gets described in the DWARF AT_location
2384 attribute for the variable in question.
2385 Regardless of the severe mental illness of the x86/svr4 SDB, we
2386 do something sensible here and we use the following DWARF
2387 register numbers. Note that these are all stack-top-relative
2388 numbers.
2389 11 for %st(0) (gcc regno = 8)
2390 12 for %st(1) (gcc regno = 9)
2391 13 for %st(2) (gcc regno = 10)
2392 14 for %st(3) (gcc regno = 11)
2393 15 for %st(4) (gcc regno = 12)
2394 16 for %st(5) (gcc regno = 13)
2395 17 for %st(6) (gcc regno = 14)
2396 18 for %st(7) (gcc regno = 15)
2398 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2400 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2401 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2402 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2403 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2404 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2405 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2406 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2407 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2408 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2409 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2410 101, 102, 103, 104, /* bound registers */
2413 /* Define parameter passing and return registers. */
2415 static int const x86_64_int_parameter_registers[6] =
2417 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2420 static int const x86_64_ms_abi_int_parameter_registers[4] =
2422 CX_REG, DX_REG, R8_REG, R9_REG
2425 static int const x86_64_int_return_registers[4] =
2427 AX_REG, DX_REG, DI_REG, SI_REG
2430 /* Additional registers that are clobbered by SYSV calls. */
2432 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2433 static int const x86_64_ms_sysv_extra_clobbered_registers
2434 [NUM_X86_64_MS_CLOBBERED_REGS] =
2436 SI_REG, DI_REG,
2437 XMM6_REG, XMM7_REG,
2438 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2439 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2442 enum xlogue_stub {
2443 XLOGUE_STUB_SAVE,
2444 XLOGUE_STUB_RESTORE,
2445 XLOGUE_STUB_RESTORE_TAIL,
2446 XLOGUE_STUB_SAVE_HFP,
2447 XLOGUE_STUB_RESTORE_HFP,
2448 XLOGUE_STUB_RESTORE_HFP_TAIL,
2450 XLOGUE_STUB_COUNT
2453 enum xlogue_stub_sets {
2454 XLOGUE_SET_ALIGNED,
2455 XLOGUE_SET_ALIGNED_PLUS_8,
2456 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2457 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2459 XLOGUE_SET_COUNT
2462 /* Register save/restore layout used by out-of-line stubs. */
2463 class xlogue_layout {
2464 public:
2465 struct reginfo
2467 unsigned regno;
2468 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
2469 rsi) to where each register is stored. */
2472 unsigned get_nregs () const {return m_nregs;}
2473 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2475 const reginfo &get_reginfo (unsigned reg) const
2477 gcc_assert (reg < m_nregs);
2478 return m_regs[reg];
2481 static const char *get_stub_name (enum xlogue_stub stub,
2482 unsigned n_extra_args);
2484 /* Returns an rtx for the stub's symbol based upon
2485 1.) the specified stub (save, restore or restore_ret) and
2486 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
2487 3.) whether or not stack alignment is being performed. */
2488 static rtx get_stub_rtx (enum xlogue_stub stub);
2490 /* Returns the amount of stack space (including padding) that the stub
2491 needs to store registers based upon data in the machine_function. */
2492 HOST_WIDE_INT get_stack_space_used () const
2494 const struct machine_function *m = cfun->machine;
2495 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2497 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2498 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
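/* Illustrative worked example added here, not part of the original file:
   with no extra registers, last_reg is MIN_REGS - 1 == 11, which per
   REG_ORDER below is DI_REG, stored at 0xb0 from the incoming stack pointer
   in the aligned layout.  Its m_regs[].offset is therefore
   0xb0 - STUB_INDEX_OFFSET, so get_stack_space_used returns 0xb0 (176)
   bytes.  */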
2501 /* Returns the offset for the base pointer used by the stub. */
2502 HOST_WIDE_INT get_stub_ptr_offset () const
2504 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2507 static const struct xlogue_layout &get_instance ();
2508 static unsigned count_stub_managed_regs ();
2509 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2511 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2512 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2513 static const unsigned MAX_REGS = 18;
2514 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2515 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2516 static const unsigned STUB_NAME_MAX_LEN = 20;
2517 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2518 static const unsigned REG_ORDER[MAX_REGS];
2519 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2521 private:
2522 xlogue_layout ();
2523 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2524 xlogue_layout (const xlogue_layout &);
2526 /* True if hard frame pointer is used. */
2527 bool m_hfp;
2529 /* Max number of registers this layout manages. */
2530 unsigned m_nregs;
2532 /* Incoming offset from 16-byte alignment. */
2533 HOST_WIDE_INT m_stack_align_off_in;
2535 /* Register order and offsets. */
2536 struct reginfo m_regs[MAX_REGS];
2538 /* Lazy-inited cache of symbol names for stubs. */
2539 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
2540 [STUB_NAME_MAX_LEN];
2542 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2545 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2546 "savms64",
2547 "resms64",
2548 "resms64x",
2549 "savms64f",
2550 "resms64f",
2551 "resms64fx"
2554 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2555 /* The below offset values are where each register is stored for the layout
2556 relative to incoming stack pointer. The value of each m_regs[].offset will
2557 be relative to the incoming base pointer (rax or rsi) used by the stub.
2559 s_instances: 0 1 2 3
2560 Offset: realigned or aligned + 8
2561 Register aligned aligned + 8 aligned w/HFP w/HFP */
2562 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2563 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2564 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2565 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2566 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2567 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2568 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2569 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2570 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2571 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2572 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2573 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2574 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2575 BP_REG, /* 0xc0 0xc8 N/A N/A */
2576 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2577 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2578 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2579 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
2582 /* Instantiate static const values. */
2583 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2584 const unsigned xlogue_layout::MIN_REGS;
2585 const unsigned xlogue_layout::MAX_REGS;
2586 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2587 const unsigned xlogue_layout::VARIANT_COUNT;
2588 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2590 /* Initialize xlogue_layout::s_stub_names to zero. */
2591 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
2592 [STUB_NAME_MAX_LEN];
2594 /* Instantiates all xlogue_layout instances. */
2595 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2596 xlogue_layout (0, false),
2597 xlogue_layout (8, false),
2598 xlogue_layout (0, true),
2599 xlogue_layout (8, true)
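/* Descriptive note added here, not part of the original file: in order,
   these four instances correspond to XLOGUE_SET_ALIGNED (offset 0, no hard
   frame pointer), XLOGUE_SET_ALIGNED_PLUS_8 (offset 8, no HFP),
   XLOGUE_SET_HFP_ALIGNED_OR_REALIGN (offset 0, HFP) and
   XLOGUE_SET_HFP_ALIGNED_PLUS_8 (offset 8, HFP).  */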
2602 /* Return an appropriate const instance of xlogue_layout based upon values
2603 in cfun->machine and crtl. */
2604 const struct xlogue_layout &
2605 xlogue_layout::get_instance ()
2607 enum xlogue_stub_sets stub_set;
2608 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2610 if (stack_realign_fp)
2611 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2612 else if (frame_pointer_needed)
2613 stub_set = aligned_plus_8
2614 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2615 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2616 else
2617 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2619 return s_instances[stub_set];
2622 /* Determine how many clobbered registers can be saved by the stub.
2623 Returns the count of registers the stub will save and restore. */
2624 unsigned
2625 xlogue_layout::count_stub_managed_regs ()
2627 bool hfp = frame_pointer_needed || stack_realign_fp;
2628 unsigned i, count;
2629 unsigned regno;
2631 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2633 regno = REG_ORDER[i];
2634 if (regno == BP_REG && hfp)
2635 continue;
2636 if (!ix86_save_reg (regno, false, false))
2637 break;
2638 ++count;
2640 return count;
2643 /* Determine if register REGNO is a stub managed register given the
2644 total COUNT of stub managed registers. */
2645 bool
2646 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2648 bool hfp = frame_pointer_needed || stack_realign_fp;
2649 unsigned i;
2651 for (i = 0; i < count; ++i)
2653 gcc_assert (i < MAX_REGS);
2654 if (REG_ORDER[i] == BP_REG && hfp)
2655 ++count;
2656 else if (REG_ORDER[i] == regno)
2657 return true;
2659 return false;
2662 /* Constructor for xlogue_layout. */
2663 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2664 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2665 m_stack_align_off_in (stack_align_off_in)
2667 HOST_WIDE_INT offset = stack_align_off_in;
2668 unsigned i, j;
2670 for (i = j = 0; i < MAX_REGS; ++i)
2672 unsigned regno = REG_ORDER[i];
2674 if (regno == BP_REG && hfp)
2675 continue;
2676 if (SSE_REGNO_P (regno))
2678 offset += 16;
2679 /* Verify that SSE regs are always aligned. */
2680 gcc_assert (!((stack_align_off_in + offset) & 15));
2682 else
2683 offset += 8;
2685 m_regs[j].regno = regno;
2686 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2688 gcc_assert (j == m_nregs);
2691 const char *
2692 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2693 unsigned n_extra_regs)
2695 const int have_avx = TARGET_AVX;
2696 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
2698 /* Lazy init */
2699 if (!*name)
2701 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
2702 (have_avx ? "avx" : "sse"),
2703 STUB_BASE_NAMES[stub],
2704 MIN_REGS + n_extra_regs);
2705 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2708 return name;
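/* Illustrative examples added here, not part of the original file: with
   MIN_REGS == 12, XLOGUE_STUB_SAVE and no extra registers yields
   "__sse_savms64_12" (or "__avx_savms64_12" when AVX is enabled), while
   XLOGUE_STUB_RESTORE_TAIL with two extra registers yields
   "__avx_resms64x_14".  */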
2711 /* Return rtx of a symbol ref for the entry point (based upon
2712 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2714 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2716 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2717 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2718 gcc_assert (stub < XLOGUE_STUB_COUNT);
2719 gcc_assert (crtl->stack_realign_finalized);
2721 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
2724 /* Define the structure for the machine field in struct function. */
2726 struct GTY(()) stack_local_entry {
2727 unsigned short mode;
2728 unsigned short n;
2729 rtx rtl;
2730 struct stack_local_entry *next;
2733 /* Which cpu we are scheduling for. */
2734 enum attr_cpu ix86_schedule;
2736 /* Which cpu we are optimizing for. */
2737 enum processor_type ix86_tune;
2739 /* Which instruction set architecture to use. */
2740 enum processor_type ix86_arch;
2742 /* True if processor has SSE prefetch instruction. */
2743 unsigned char x86_prefetch_sse;
2745 /* -mstackrealign option */
2746 static const char ix86_force_align_arg_pointer_string[]
2747 = "force_align_arg_pointer";
2749 static rtx (*ix86_gen_leave) (void);
2750 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2751 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2752 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2753 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2754 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2755 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2756 static rtx (*ix86_gen_clzero) (rtx);
2757 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2758 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2759 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2760 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2761 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2762 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2764 /* Preferred alignment for stack boundary in bits. */
2765 unsigned int ix86_preferred_stack_boundary;
2767 /* Alignment for incoming stack boundary in bits specified at
2768 command line. */
2769 static unsigned int ix86_user_incoming_stack_boundary;
2771 /* Default alignment for incoming stack boundary in bits. */
2772 static unsigned int ix86_default_incoming_stack_boundary;
2774 /* Alignment for incoming stack boundary in bits. */
2775 unsigned int ix86_incoming_stack_boundary;
2777 /* Calling abi specific va_list type nodes. */
2778 static GTY(()) tree sysv_va_list_type_node;
2779 static GTY(()) tree ms_va_list_type_node;
2781 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2782 char internal_label_prefix[16];
2783 int internal_label_prefix_len;
2785 /* Fence to use after loop using movnt. */
2786 tree x86_mfence;
2788 /* Register class used for passing a given 64-bit part of the argument.
2789 These represent classes as documented by the psABI, with the exception
2790 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2791 just uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2793 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2794 whenever possible (the upper half then contains only padding). */
2795 enum x86_64_reg_class
2797 X86_64_NO_CLASS,
2798 X86_64_INTEGER_CLASS,
2799 X86_64_INTEGERSI_CLASS,
2800 X86_64_SSE_CLASS,
2801 X86_64_SSESF_CLASS,
2802 X86_64_SSEDF_CLASS,
2803 X86_64_SSEUP_CLASS,
2804 X86_64_X87_CLASS,
2805 X86_64_X87UP_CLASS,
2806 X86_64_COMPLEX_X87_CLASS,
2807 X86_64_MEMORY_CLASS
2810 #define MAX_CLASSES 8
2812 /* Table of constants used by fldpi, fldln2, etc.... */
2813 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2814 static bool ext_80387_constants_init;
2817 static struct machine_function * ix86_init_machine_status (void);
2818 static rtx ix86_function_value (const_tree, const_tree, bool);
2819 static bool ix86_function_value_regno_p (const unsigned int);
2820 static unsigned int ix86_function_arg_boundary (machine_mode,
2821 const_tree);
2822 static rtx ix86_static_chain (const_tree, bool);
2823 static int ix86_function_regparm (const_tree, const_tree);
2824 static void ix86_compute_frame_layout (void);
2825 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2826 rtx, rtx, int);
2827 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2828 static tree ix86_canonical_va_list_type (tree);
2829 static void predict_jump (int);
2830 static unsigned int split_stack_prologue_scratch_regno (void);
2831 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2833 enum ix86_function_specific_strings
2835 IX86_FUNCTION_SPECIFIC_ARCH,
2836 IX86_FUNCTION_SPECIFIC_TUNE,
2837 IX86_FUNCTION_SPECIFIC_MAX
2840 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2841 const char *, const char *, enum fpmath_unit,
2842 bool);
2843 static void ix86_function_specific_save (struct cl_target_option *,
2844 struct gcc_options *opts);
2845 static void ix86_function_specific_restore (struct gcc_options *opts,
2846 struct cl_target_option *);
2847 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2848 static void ix86_function_specific_print (FILE *, int,
2849 struct cl_target_option *);
2850 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2851 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2852 struct gcc_options *,
2853 struct gcc_options *,
2854 struct gcc_options *);
2855 static bool ix86_can_inline_p (tree, tree);
2856 static void ix86_set_current_function (tree);
2857 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2859 static enum calling_abi ix86_function_abi (const_tree);
2862 #ifndef SUBTARGET32_DEFAULT_CPU
2863 #define SUBTARGET32_DEFAULT_CPU "i386"
2864 #endif
2866 /* Whether -mtune= or -march= were specified */
2867 static int ix86_tune_defaulted;
2868 static int ix86_arch_specified;
2870 /* Vectorization library interface and handlers. */
2871 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2873 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2874 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2876 /* Processor target table, indexed by processor number */
2877 struct ptt
2879 const char *const name; /* processor name */
2880 const struct processor_costs *cost; /* Processor costs */
2881 const int align_loop; /* Default alignments. */
2882 const int align_loop_max_skip;
2883 const int align_jump;
2884 const int align_jump_max_skip;
2885 const int align_func;
2888 /* This table must be in sync with enum processor_type in i386.h. */
2889 static const struct ptt processor_target_table[PROCESSOR_max] =
2891 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2892 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2893 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2894 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2895 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2896 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2897 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2898 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2899 {"core2", &core_cost, 16, 10, 16, 10, 16},
2900 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2901 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2902 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2903 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2904 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2905 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2906 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2907 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2908 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2909 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2910 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2911 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2912 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2913 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2914 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2915 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2916 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2917 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2918 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2919 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
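/* Worker for the vzeroupper insertion pass: re-run the mode switching
   pass with only the AVX upper-state (AVX_U128) entity enabled, so that
   vzeroupper instructions are inserted where needed.  */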
2922 static unsigned int
2923 rest_of_handle_insert_vzeroupper (void)
2925 int i;
2927 /* vzeroupper instructions are inserted immediately after reload to
2928 account for possible spills from 256bit registers. The pass
2929 reuses mode switching infrastructure by re-running mode insertion
2930 pass, so disable entities that have already been processed. */
2931 for (i = 0; i < MAX_386_ENTITIES; i++)
2932 ix86_optimize_mode_switching[i] = 0;
2934 ix86_optimize_mode_switching[AVX_U128] = 1;
2936 /* Call optimize_mode_switching. */
2937 g->get_passes ()->execute_pass_mode_switching ();
2938 return 0;
2941 /* Return true if INSN uses or defines a hard register.
2942 Hard register uses in a memory address are ignored.
2943 Clobbers and flags definitions are ignored. */
2945 static bool
2946 has_non_address_hard_reg (rtx_insn *insn)
2948 df_ref ref;
2949 FOR_EACH_INSN_DEF (ref, insn)
2950 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2951 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2952 && DF_REF_REGNO (ref) != FLAGS_REG)
2953 return true;
2955 FOR_EACH_INSN_USE (ref, insn)
2956 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2957 return true;
2959 return false;
2962 /* Check whether comparison INSN may be transformed
2963 into a vector comparison.  Currently we transform
2964 only zero checks that look like:
2966 (set (reg:CCZ 17 flags)
2967 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2968 (subreg:SI (reg:DI x) 0))
2969 (const_int 0 [0]))) */
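/* Such a compare typically originates from testing a 64-bit value for
   equality with zero on a 32-bit target, e.g. C code along the lines of

       unsigned long long x;
       ...
       if (x == 0)
         ...

   which is expanded as an IOR of the two SImode halves of X compared
   against zero, as matched below.  */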
2971 static bool
2972 convertible_comparison_p (rtx_insn *insn)
2974 if (!TARGET_SSE4_1)
2975 return false;
2977 rtx def_set = single_set (insn);
2979 gcc_assert (def_set);
2981 rtx src = SET_SRC (def_set);
2982 rtx dst = SET_DEST (def_set);
2984 gcc_assert (GET_CODE (src) == COMPARE);
2986 if (GET_CODE (dst) != REG
2987 || REGNO (dst) != FLAGS_REG
2988 || GET_MODE (dst) != CCZmode)
2989 return false;
2991 rtx op1 = XEXP (src, 0);
2992 rtx op2 = XEXP (src, 1);
2994 if (op2 != CONST0_RTX (GET_MODE (op2)))
2995 return false;
2997 if (GET_CODE (op1) != IOR)
2998 return false;
3000 op2 = XEXP (op1, 1);
3001 op1 = XEXP (op1, 0);
3003 if (!SUBREG_P (op1)
3004 || !SUBREG_P (op2)
3005 || GET_MODE (op1) != SImode
3006 || GET_MODE (op2) != SImode
3007 || ((SUBREG_BYTE (op1) != 0
3008 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3009 && (SUBREG_BYTE (op2) != 0
3010 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3011 return false;
3013 op1 = SUBREG_REG (op1);
3014 op2 = SUBREG_REG (op2);
3016 if (op1 != op2
3017 || !REG_P (op1)
3018 || GET_MODE (op1) != DImode)
3019 return false;
3021 return true;
3024 /* The DImode version of scalar_to_vector_candidate_p. */
3026 static bool
3027 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3029 rtx def_set = single_set (insn);
3031 if (!def_set)
3032 return false;
3034 if (has_non_address_hard_reg (insn))
3035 return false;
3037 rtx src = SET_SRC (def_set);
3038 rtx dst = SET_DEST (def_set);
3040 if (GET_CODE (src) == COMPARE)
3041 return convertible_comparison_p (insn);
3043 /* We are interested in DImode promotion only. */
3044 if ((GET_MODE (src) != DImode
3045 && !CONST_INT_P (src))
3046 || GET_MODE (dst) != DImode)
3047 return false;
3049 if (!REG_P (dst) && !MEM_P (dst))
3050 return false;
3052 switch (GET_CODE (src))
3054 case ASHIFTRT:
3055 if (!TARGET_AVX512VL)
3056 return false;
3057 /* FALLTHRU */
3059 case ASHIFT:
3060 case LSHIFTRT:
3061 if (!REG_P (XEXP (src, 1))
3062 && (!SUBREG_P (XEXP (src, 1))
3063 || SUBREG_BYTE (XEXP (src, 1)) != 0
3064 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3065 && (!CONST_INT_P (XEXP (src, 1))
3066 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3067 return false;
3069 if (GET_MODE (XEXP (src, 1)) != QImode
3070 && !CONST_INT_P (XEXP (src, 1)))
3071 return false;
3072 break;
3074 case PLUS:
3075 case MINUS:
3076 case IOR:
3077 case XOR:
3078 case AND:
3079 if (!REG_P (XEXP (src, 1))
3080 && !MEM_P (XEXP (src, 1))
3081 && !CONST_INT_P (XEXP (src, 1)))
3082 return false;
3084 if (GET_MODE (XEXP (src, 1)) != DImode
3085 && !CONST_INT_P (XEXP (src, 1)))
3086 return false;
3087 break;
3089 case NEG:
3090 case NOT:
3091 break;
3093 case REG:
3094 return true;
3096 case MEM:
3097 case CONST_INT:
3098 return REG_P (dst);
3100 default:
3101 return false;
3104 if (!REG_P (XEXP (src, 0))
3105 && !MEM_P (XEXP (src, 0))
3106 && !CONST_INT_P (XEXP (src, 0))
3107 /* Check for andnot case. */
3108 && (GET_CODE (src) != AND
3109 || GET_CODE (XEXP (src, 0)) != NOT
3110 || !REG_P (XEXP (XEXP (src, 0), 0))))
3111 return false;
3113 if (GET_MODE (XEXP (src, 0)) != DImode
3114 && !CONST_INT_P (XEXP (src, 0)))
3115 return false;
3117 return true;
3120 /* The TImode version of scalar_to_vector_candidate_p. */
3122 static bool
3123 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3125 rtx def_set = single_set (insn);
3127 if (!def_set)
3128 return false;
3130 if (has_non_address_hard_reg (insn))
3131 return false;
3133 rtx src = SET_SRC (def_set);
3134 rtx dst = SET_DEST (def_set);
3136 /* Only TImode load and store are allowed. */
3137 if (GET_MODE (dst) != TImode)
3138 return false;
3140 if (MEM_P (dst))
3142 /* Check for a store.  The memory must be aligned, or an unaligned
3143 store must be optimal on this target.  Only stores from a register,
3144 a standard SSE constant, or a CONST_WIDE_INT generated by a piecewise store are supported.
3146 ??? Verify the performance impact before enabling CONST_INT for
3147 __int128 stores. */
3148 if (misaligned_operand (dst, TImode)
3149 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3150 return false;
3152 switch (GET_CODE (src))
3154 default:
3155 return false;
3157 case REG:
3158 case CONST_WIDE_INT:
3159 return true;
3161 case CONST_INT:
3162 return standard_sse_constant_p (src, TImode);
3165 else if (MEM_P (src))
3167 /* Check for a load.  The memory must be aligned, or an unaligned
3168 load must be optimal on this target. */
3169 return (REG_P (dst)
3170 && (!misaligned_operand (src, TImode)
3171 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3174 return false;
3177 /* Return true if INSN may be converted into a vector
3178 instruction. */
3180 static bool
3181 scalar_to_vector_candidate_p (rtx_insn *insn)
3183 if (TARGET_64BIT)
3184 return timode_scalar_to_vector_candidate_p (insn);
3185 else
3186 return dimode_scalar_to_vector_candidate_p (insn);
3189 /* The DImode version of remove_non_convertible_regs. */
3191 static void
3192 dimode_remove_non_convertible_regs (bitmap candidates)
3194 bitmap_iterator bi;
3195 unsigned id;
3196 bitmap regs = BITMAP_ALLOC (NULL);
3198 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3200 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3201 rtx reg = SET_DEST (def_set);
3203 if (!REG_P (reg)
3204 || bitmap_bit_p (regs, REGNO (reg))
3205 || HARD_REGISTER_P (reg))
3206 continue;
3208 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3209 def;
3210 def = DF_REF_NEXT_REG (def))
3212 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3214 if (dump_file)
3215 fprintf (dump_file,
3216 "r%d has non convertible definition in insn %d\n",
3217 REGNO (reg), DF_REF_INSN_UID (def));
3219 bitmap_set_bit (regs, REGNO (reg));
3220 break;
3225 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3227 for (df_ref def = DF_REG_DEF_CHAIN (id);
3228 def;
3229 def = DF_REF_NEXT_REG (def))
3230 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3232 if (dump_file)
3233 fprintf (dump_file, "Removing insn %d from candidates list\n",
3234 DF_REF_INSN_UID (def));
3236 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3240 BITMAP_FREE (regs);
3243 /* For a register REGNO, scan instructions for its defs and uses.
3244 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3246 static void
3247 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3248 unsigned int regno)
3250 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3251 def;
3252 def = DF_REF_NEXT_REG (def))
3254 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3256 if (dump_file)
3257 fprintf (dump_file,
3258 "r%d has non convertible def in insn %d\n",
3259 regno, DF_REF_INSN_UID (def));
3261 bitmap_set_bit (regs, regno);
3262 break;
3266 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3267 ref;
3268 ref = DF_REF_NEXT_REG (ref))
3270 /* Debug instructions are skipped. */
3271 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3272 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3274 if (dump_file)
3275 fprintf (dump_file,
3276 "r%d has non convertible use in insn %d\n",
3277 regno, DF_REF_INSN_UID (ref));
3279 bitmap_set_bit (regs, regno);
3280 break;
3285 /* The TImode version of remove_non_convertible_regs. */
3287 static void
3288 timode_remove_non_convertible_regs (bitmap candidates)
3290 bitmap_iterator bi;
3291 unsigned id;
3292 bitmap regs = BITMAP_ALLOC (NULL);
3294 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3296 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3297 rtx dest = SET_DEST (def_set);
3298 rtx src = SET_SRC (def_set);
3300 if ((!REG_P (dest)
3301 || bitmap_bit_p (regs, REGNO (dest))
3302 || HARD_REGISTER_P (dest))
3303 && (!REG_P (src)
3304 || bitmap_bit_p (regs, REGNO (src))
3305 || HARD_REGISTER_P (src)))
3306 continue;
3308 if (REG_P (dest))
3309 timode_check_non_convertible_regs (candidates, regs,
3310 REGNO (dest));
3312 if (REG_P (src))
3313 timode_check_non_convertible_regs (candidates, regs,
3314 REGNO (src));
3317 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3319 for (df_ref def = DF_REG_DEF_CHAIN (id);
3320 def;
3321 def = DF_REF_NEXT_REG (def))
3322 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3324 if (dump_file)
3325 fprintf (dump_file, "Removing insn %d from candidates list\n",
3326 DF_REF_INSN_UID (def));
3328 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3331 for (df_ref ref = DF_REG_USE_CHAIN (id);
3332 ref;
3333 ref = DF_REF_NEXT_REG (ref))
3334 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3336 if (dump_file)
3337 fprintf (dump_file, "Removing insn %d from candidates list\n",
3338 DF_REF_INSN_UID (ref));
3340 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3344 BITMAP_FREE (regs);
3347 /* For a given bitmap of insn UIDs, scan all instructions and
3348 remove an insn from CANDIDATES if it has both convertible
3349 and non-convertible definitions.
3351 All insns in a bitmap are conversion candidates according to
3352 scalar_to_vector_candidate_p. Currently it implies all insns
3353 are single_set. */
3355 static void
3356 remove_non_convertible_regs (bitmap candidates)
3358 if (TARGET_64BIT)
3359 timode_remove_non_convertible_regs (candidates);
3360 else
3361 dimode_remove_non_convertible_regs (candidates);
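/* Base class for a chain of scalar instructions found by the STV pass
   that will be converted to vector mode together.  A chain records the
   insns it contains, the registers they define, and the registers that
   are needed in both scalar and vector modes.  */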
3364 class scalar_chain
3366 public:
3367 scalar_chain ();
3368 virtual ~scalar_chain ();
3370 static unsigned max_id;
3372 /* ID of a chain. */
3373 unsigned int chain_id;
3374 /* A queue of instructions to be included into a chain. */
3375 bitmap queue;
3376 /* Instructions included into a chain. */
3377 bitmap insns;
3378 /* All registers defined by a chain. */
3379 bitmap defs;
3380 /* Registers used in both vector and scalar modes. */
3381 bitmap defs_conv;
3383 void build (bitmap candidates, unsigned insn_uid);
3384 virtual int compute_convert_gain () = 0;
3385 int convert ();
3387 protected:
3388 void add_to_queue (unsigned insn_uid);
3389 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3391 private:
3392 void add_insn (bitmap candidates, unsigned insn_uid);
3393 void analyze_register_chain (bitmap candidates, df_ref ref);
3394 virtual void mark_dual_mode_def (df_ref def) = 0;
3395 virtual void convert_insn (rtx_insn *insn) = 0;
3396 virtual void convert_registers () = 0;
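/* Chain variant used for !TARGET_64BIT: scalar DImode operations are
   rewritten as V2DImode SSE operations.  */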
3399 class dimode_scalar_chain : public scalar_chain
3401 public:
3402 int compute_convert_gain ();
3403 private:
3404 void mark_dual_mode_def (df_ref def);
3405 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3406 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3407 void convert_insn (rtx_insn *insn);
3408 void convert_op (rtx *op, rtx_insn *insn);
3409 void convert_reg (unsigned regno);
3410 void make_vector_copies (unsigned regno);
3411 void convert_registers ();
3412 int vector_const_cost (rtx exp);
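/* Chain variant used for TARGET_64BIT: scalar TImode loads and stores
   are rewritten as V1TImode vector moves.  */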
3415 class timode_scalar_chain : public scalar_chain
3417 public:
3418 /* Converting from TImode to V1TImode is always faster. */
3419 int compute_convert_gain () { return 1; }
3421 private:
3422 void mark_dual_mode_def (df_ref def);
3423 void fix_debug_reg_uses (rtx reg);
3424 void convert_insn (rtx_insn *insn);
3425 /* We don't convert registers to a different size. */
3426 void convert_registers () {}
3429 unsigned scalar_chain::max_id = 0;
3431 /* Initialize new chain. */
3433 scalar_chain::scalar_chain ()
3435 chain_id = ++max_id;
3437 if (dump_file)
3438 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3440 bitmap_obstack_initialize (NULL);
3441 insns = BITMAP_ALLOC (NULL);
3442 defs = BITMAP_ALLOC (NULL);
3443 defs_conv = BITMAP_ALLOC (NULL);
3444 queue = NULL;
3447 /* Free chain's data. */
3449 scalar_chain::~scalar_chain ()
3451 BITMAP_FREE (insns);
3452 BITMAP_FREE (defs);
3453 BITMAP_FREE (defs_conv);
3454 bitmap_obstack_release (NULL);
3457 /* Add an instruction into the chain's queue. */
3459 void
3460 scalar_chain::add_to_queue (unsigned insn_uid)
3462 if (bitmap_bit_p (insns, insn_uid)
3463 || bitmap_bit_p (queue, insn_uid))
3464 return;
3466 if (dump_file)
3467 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3468 insn_uid, chain_id);
3469 bitmap_set_bit (queue, insn_uid);
3472 /* For DImode conversion, mark register defined by DEF as requiring
3473 conversion. */
3475 void
3476 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3478 gcc_assert (DF_REF_REG_DEF_P (def));
3480 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3481 return;
3483 if (dump_file)
3484 fprintf (dump_file,
3485 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3486 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3488 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3491 /* For TImode conversion, it is unused. */
3493 void
3494 timode_scalar_chain::mark_dual_mode_def (df_ref)
3496 gcc_unreachable ();
3499 /* Check REF's chain to add new insns into a queue
3500 and find registers requiring conversion. */
3502 void
3503 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3505 df_link *chain;
3507 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3508 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3509 add_to_queue (DF_REF_INSN_UID (ref));
3511 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3513 unsigned uid = DF_REF_INSN_UID (chain->ref);
3515 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3516 continue;
3518 if (!DF_REF_REG_MEM_P (chain->ref))
3520 if (bitmap_bit_p (insns, uid))
3521 continue;
3523 if (bitmap_bit_p (candidates, uid))
3525 add_to_queue (uid);
3526 continue;
3530 if (DF_REF_REG_DEF_P (chain->ref))
3532 if (dump_file)
3533 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3534 DF_REF_REGNO (chain->ref), uid);
3535 mark_dual_mode_def (chain->ref);
3537 else
3539 if (dump_file)
3540 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3541 DF_REF_REGNO (chain->ref), uid);
3542 mark_dual_mode_def (ref);
3547 /* Add instruction into a chain. */
3549 void
3550 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3552 if (bitmap_bit_p (insns, insn_uid))
3553 return;
3555 if (dump_file)
3556 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3558 bitmap_set_bit (insns, insn_uid);
3560 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3561 rtx def_set = single_set (insn);
3562 if (def_set && REG_P (SET_DEST (def_set))
3563 && !HARD_REGISTER_P (SET_DEST (def_set)))
3564 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3566 df_ref ref;
3567 df_ref def;
3568 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3569 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3570 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3571 def;
3572 def = DF_REF_NEXT_REG (def))
3573 analyze_register_chain (candidates, def);
3574 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3575 if (!DF_REF_REG_MEM_P (ref))
3576 analyze_register_chain (candidates, ref);
3579 /* Build a new chain starting from insn INSN_UID, recursively
3580 adding all dependent uses and definitions. */
3582 void
3583 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3585 queue = BITMAP_ALLOC (NULL);
3586 bitmap_set_bit (queue, insn_uid);
3588 if (dump_file)
3589 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3591 while (!bitmap_empty_p (queue))
3593 insn_uid = bitmap_first_set_bit (queue);
3594 bitmap_clear_bit (queue, insn_uid);
3595 bitmap_clear_bit (candidates, insn_uid);
3596 add_insn (candidates, insn_uid);
3599 if (dump_file)
3601 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3602 fprintf (dump_file, " insns: ");
3603 dump_bitmap (dump_file, insns);
3604 if (!bitmap_empty_p (defs_conv))
3606 bitmap_iterator bi;
3607 unsigned id;
3608 const char *comma = "";
3609 fprintf (dump_file, " defs to convert: ");
3610 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3612 fprintf (dump_file, "%sr%d", comma, id);
3613 comma = ", ";
3615 fprintf (dump_file, "\n");
3619 BITMAP_FREE (queue);
3622 /* Return the cost of building a vector constant
3623 instead of using a scalar one. */
3626 dimode_scalar_chain::vector_const_cost (rtx exp)
3628 gcc_assert (CONST_INT_P (exp));
3630 if (standard_sse_constant_p (exp, V2DImode))
3631 return COSTS_N_INSNS (1);
3632 return ix86_cost->sse_load[1];
3635 /* Compute a gain for chain conversion. */
3638 dimode_scalar_chain::compute_convert_gain ()
3640 bitmap_iterator bi;
3641 unsigned insn_uid;
3642 int gain = 0;
3643 int cost = 0;
3645 if (dump_file)
3646 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3648 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3650 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3651 rtx def_set = single_set (insn);
3652 rtx src = SET_SRC (def_set);
3653 rtx dst = SET_DEST (def_set);
3655 if (REG_P (src) && REG_P (dst))
3656 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3657 else if (REG_P (src) && MEM_P (dst))
3658 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3659 else if (MEM_P (src) && REG_P (dst))
3660 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3661 else if (GET_CODE (src) == ASHIFT
3662 || GET_CODE (src) == ASHIFTRT
3663 || GET_CODE (src) == LSHIFTRT)
3665 if (CONST_INT_P (XEXP (src, 0)))
3666 gain -= vector_const_cost (XEXP (src, 0));
3667 if (CONST_INT_P (XEXP (src, 1)))
3669 gain += ix86_cost->shift_const;
3670 if (INTVAL (XEXP (src, 1)) >= 32)
3671 gain -= COSTS_N_INSNS (1);
3673 else
3674 /* Additional gain for omitting two CMOVs. */
3675 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3677 else if (GET_CODE (src) == PLUS
3678 || GET_CODE (src) == MINUS
3679 || GET_CODE (src) == IOR
3680 || GET_CODE (src) == XOR
3681 || GET_CODE (src) == AND)
3683 gain += ix86_cost->add;
3684 /* Additional gain for andnot for targets without BMI. */
3685 if (GET_CODE (XEXP (src, 0)) == NOT
3686 && !TARGET_BMI)
3687 gain += 2 * ix86_cost->add;
3689 if (CONST_INT_P (XEXP (src, 0)))
3690 gain -= vector_const_cost (XEXP (src, 0));
3691 if (CONST_INT_P (XEXP (src, 1)))
3692 gain -= vector_const_cost (XEXP (src, 1));
3694 else if (GET_CODE (src) == NEG
3695 || GET_CODE (src) == NOT)
3696 gain += ix86_cost->add - COSTS_N_INSNS (1);
3697 else if (GET_CODE (src) == COMPARE)
3699 /* Assume comparison cost is the same. */
3701 else if (CONST_INT_P (src))
3703 if (REG_P (dst))
3704 gain += COSTS_N_INSNS (2);
3705 else if (MEM_P (dst))
3706 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3707 gain -= vector_const_cost (src);
3709 else
3710 gcc_unreachable ();
3713 if (dump_file)
3714 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3716 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3717 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3719 if (dump_file)
3720 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3722 gain -= cost;
3724 if (dump_file)
3725 fprintf (dump_file, " Total gain: %d\n", gain);
3727 return gain;
3730 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3733 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3735 if (x == reg)
3736 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3738 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3739 int i, j;
3740 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3742 if (fmt[i] == 'e')
3743 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3744 else if (fmt[i] == 'E')
3745 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3746 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3747 reg, new_reg);
3750 return x;
3753 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3755 void
3756 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3757 rtx reg, rtx new_reg)
3759 replace_with_subreg (single_set (insn), reg, new_reg);
3762 /* Insert the generated conversion instruction sequence INSNS
3763 after instruction AFTER.  A new BB may be required if the
3764 instruction has an EH region attached. */
3766 void
3767 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3769 if (!control_flow_insn_p (after))
3771 emit_insn_after (insns, after);
3772 return;
3775 basic_block bb = BLOCK_FOR_INSN (after);
3776 edge e = find_fallthru_edge (bb->succs);
3777 gcc_assert (e);
3779 basic_block new_bb = split_edge (e);
3780 emit_insn_after (insns, BB_HEAD (new_bb));
3783 /* Make vector copies for all definitions of register REGNO
3784 and replace its uses in the chain. */
3786 void
3787 dimode_scalar_chain::make_vector_copies (unsigned regno)
3789 rtx reg = regno_reg_rtx[regno];
3790 rtx vreg = gen_reg_rtx (DImode);
3791 bool count_reg = false;
3792 df_ref ref;
3794 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3795 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3797 df_ref use;
3799 /* Detect the count register of a shift instruction. */
3800 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3801 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3803 rtx_insn *insn = DF_REF_INSN (use);
3804 rtx def_set = single_set (insn);
3806 gcc_assert (def_set);
3808 rtx src = SET_SRC (def_set);
3810 if ((GET_CODE (src) == ASHIFT
3811 || GET_CODE (src) == ASHIFTRT
3812 || GET_CODE (src) == LSHIFTRT)
3813 && !CONST_INT_P (XEXP (src, 1))
3814 && reg_or_subregno (XEXP (src, 1)) == regno)
3815 count_reg = true;
3818 start_sequence ();
3819 if (count_reg)
3821 rtx qreg = gen_lowpart (QImode, reg);
3822 rtx tmp = gen_reg_rtx (SImode);
3824 if (TARGET_ZERO_EXTEND_WITH_AND
3825 && optimize_function_for_speed_p (cfun))
3827 emit_move_insn (tmp, const0_rtx);
3828 emit_insn (gen_movstrictqi
3829 (gen_lowpart (QImode, tmp), qreg));
3831 else
3832 emit_insn (gen_rtx_SET
3833 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3835 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3837 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3838 emit_move_insn (slot, tmp);
3839 tmp = copy_rtx (slot);
3842 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3844 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3846 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3847 emit_move_insn (adjust_address (tmp, SImode, 0),
3848 gen_rtx_SUBREG (SImode, reg, 0));
3849 emit_move_insn (adjust_address (tmp, SImode, 4),
3850 gen_rtx_SUBREG (SImode, reg, 4));
3851 emit_move_insn (vreg, tmp);
3853 else if (TARGET_SSE4_1)
3855 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3856 CONST0_RTX (V4SImode),
3857 gen_rtx_SUBREG (SImode, reg, 0)));
3858 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3859 gen_rtx_SUBREG (V4SImode, vreg, 0),
3860 gen_rtx_SUBREG (SImode, reg, 4),
3861 GEN_INT (2)));
3863 else
3865 rtx tmp = gen_reg_rtx (DImode);
3866 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3867 CONST0_RTX (V4SImode),
3868 gen_rtx_SUBREG (SImode, reg, 0)));
3869 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3870 CONST0_RTX (V4SImode),
3871 gen_rtx_SUBREG (SImode, reg, 4)));
3872 emit_insn (gen_vec_interleave_lowv4si
3873 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3874 gen_rtx_SUBREG (V4SImode, vreg, 0),
3875 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3877 rtx_insn *seq = get_insns ();
3878 end_sequence ();
3879 rtx_insn *insn = DF_REF_INSN (ref);
3880 emit_conversion_insns (seq, insn);
3882 if (dump_file)
3883 fprintf (dump_file,
3884 " Copied r%d to a vector register r%d for insn %d\n",
3885 regno, REGNO (vreg), INSN_UID (insn));
3888 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3889 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3891 rtx_insn *insn = DF_REF_INSN (ref);
3892 if (count_reg)
3894 rtx def_set = single_set (insn);
3895 gcc_assert (def_set);
3897 rtx src = SET_SRC (def_set);
3899 if ((GET_CODE (src) == ASHIFT
3900 || GET_CODE (src) == ASHIFTRT
3901 || GET_CODE (src) == LSHIFTRT)
3902 && !CONST_INT_P (XEXP (src, 1))
3903 && reg_or_subregno (XEXP (src, 1)) == regno)
3904 XEXP (src, 1) = vreg;
3906 else
3907 replace_with_subreg_in_insn (insn, reg, vreg);
3909 if (dump_file)
3910 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3911 regno, REGNO (vreg), INSN_UID (insn));
3915 /* Convert all definitions of register REGNO
3916 and fix its uses.  Scalar copies may be created
3917 if the register is used in a non-convertible insn. */
3919 void
3920 dimode_scalar_chain::convert_reg (unsigned regno)
3922 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3923 rtx reg = regno_reg_rtx[regno];
3924 rtx scopy = NULL_RTX;
3925 df_ref ref;
3926 bitmap conv;
3928 conv = BITMAP_ALLOC (NULL);
3929 bitmap_copy (conv, insns);
3931 if (scalar_copy)
3932 scopy = gen_reg_rtx (DImode);
3934 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3936 rtx_insn *insn = DF_REF_INSN (ref);
3937 rtx def_set = single_set (insn);
3938 rtx src = SET_SRC (def_set);
3939 rtx reg = DF_REF_REG (ref);
3941 if (!MEM_P (src))
3943 replace_with_subreg_in_insn (insn, reg, reg);
3944 bitmap_clear_bit (conv, INSN_UID (insn));
3947 if (scalar_copy)
3949 start_sequence ();
3950 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3952 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3953 emit_move_insn (tmp, reg);
3954 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3955 adjust_address (tmp, SImode, 0));
3956 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3957 adjust_address (tmp, SImode, 4));
3959 else if (TARGET_SSE4_1)
3961 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3962 emit_insn
3963 (gen_rtx_SET
3964 (gen_rtx_SUBREG (SImode, scopy, 0),
3965 gen_rtx_VEC_SELECT (SImode,
3966 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3968 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3969 emit_insn
3970 (gen_rtx_SET
3971 (gen_rtx_SUBREG (SImode, scopy, 4),
3972 gen_rtx_VEC_SELECT (SImode,
3973 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3975 else
3977 rtx vcopy = gen_reg_rtx (V2DImode);
3978 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3979 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3980 gen_rtx_SUBREG (SImode, vcopy, 0));
3981 emit_move_insn (vcopy,
3982 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3983 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3984 gen_rtx_SUBREG (SImode, vcopy, 0));
3986 rtx_insn *seq = get_insns ();
3987 end_sequence ();
3988 emit_conversion_insns (seq, insn);
3990 if (dump_file)
3991 fprintf (dump_file,
3992 " Copied r%d to a scalar register r%d for insn %d\n",
3993 regno, REGNO (scopy), INSN_UID (insn));
3997 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3998 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
4000 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
4002 rtx_insn *insn = DF_REF_INSN (ref);
4004 rtx def_set = single_set (insn);
4005 gcc_assert (def_set);
4007 rtx src = SET_SRC (def_set);
4008 rtx dst = SET_DEST (def_set);
4010 if ((GET_CODE (src) == ASHIFT
4011 || GET_CODE (src) == ASHIFTRT
4012 || GET_CODE (src) == LSHIFTRT)
4013 && !CONST_INT_P (XEXP (src, 1))
4014 && reg_or_subregno (XEXP (src, 1)) == regno)
4016 rtx tmp2 = gen_reg_rtx (V2DImode);
4018 start_sequence ();
4020 if (TARGET_SSE4_1)
4021 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4022 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4023 else
4025 rtx vec_cst
4026 = gen_rtx_CONST_VECTOR (V2DImode,
4027 gen_rtvec (2, GEN_INT (0xff),
4028 const0_rtx));
4029 vec_cst
4030 = validize_mem (force_const_mem (V2DImode, vec_cst));
4032 emit_insn (gen_rtx_SET
4033 (tmp2,
4034 gen_rtx_AND (V2DImode,
4035 gen_rtx_SUBREG (V2DImode, reg, 0),
4036 vec_cst)));
4038 rtx_insn *seq = get_insns ();
4039 end_sequence ();
4041 emit_insn_before (seq, insn);
4043 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4045 else if (!MEM_P (dst) || !REG_P (src))
4046 replace_with_subreg_in_insn (insn, reg, reg);
4048 bitmap_clear_bit (conv, INSN_UID (insn));
4051 /* Skip debug insns and uninitialized uses. */
4052 else if (DF_REF_CHAIN (ref)
4053 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4055 gcc_assert (scopy);
4056 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4057 df_insn_rescan (DF_REF_INSN (ref));
4060 BITMAP_FREE (conv);
4063 /* Convert operand OP in INSN. We should handle
4064 memory operands and uninitialized registers.
4065 All other register uses are converted during
4066 registers conversion. */
4068 void
4069 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4071 *op = copy_rtx_if_shared (*op);
4073 if (GET_CODE (*op) == NOT)
4075 convert_op (&XEXP (*op, 0), insn);
4076 PUT_MODE (*op, V2DImode);
4078 else if (MEM_P (*op))
4080 rtx tmp = gen_reg_rtx (DImode);
4082 emit_insn_before (gen_move_insn (tmp, *op), insn);
4083 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4085 if (dump_file)
4086 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4087 INSN_UID (insn), REGNO (tmp));
4089 else if (REG_P (*op))
4091 /* We may not have converted the use of this register
4092 if it has no definition.  Otherwise it
4093 should have been converted in convert_reg. */
4094 df_ref ref;
4095 FOR_EACH_INSN_USE (ref, insn)
4096 if (DF_REF_REGNO (ref) == REGNO (*op))
4098 gcc_assert (!DF_REF_CHAIN (ref));
4099 break;
4101 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4103 else if (CONST_INT_P (*op))
4105 rtx vec_cst;
4106 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4108 /* Prefer all ones vector in case of -1. */
4109 if (constm1_operand (*op, GET_MODE (*op)))
4110 vec_cst = CONSTM1_RTX (V2DImode);
4111 else
4112 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4113 gen_rtvec (2, *op, const0_rtx));
4115 if (!standard_sse_constant_p (vec_cst, V2DImode))
4117 start_sequence ();
4118 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4119 rtx_insn *seq = get_insns ();
4120 end_sequence ();
4121 emit_insn_before (seq, insn);
4124 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4125 *op = tmp;
4127 else
4129 gcc_assert (SUBREG_P (*op));
4130 gcc_assert (GET_MODE (*op) == V2DImode);
4134 /* Convert INSN to vector mode. */
4136 void
4137 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4139 rtx def_set = single_set (insn);
4140 rtx src = SET_SRC (def_set);
4141 rtx dst = SET_DEST (def_set);
4142 rtx subreg;
4144 if (MEM_P (dst) && !REG_P (src))
4146 /* The vector operation cannot store directly to memory, so compute
4147 into a temporary register and store it with a separate move. */
4148 rtx tmp = gen_reg_rtx (DImode);
4149 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4150 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4153 switch (GET_CODE (src))
4155 case ASHIFT:
4156 case ASHIFTRT:
4157 case LSHIFTRT:
4158 convert_op (&XEXP (src, 0), insn);
4159 PUT_MODE (src, V2DImode);
4160 break;
4162 case PLUS:
4163 case MINUS:
4164 case IOR:
4165 case XOR:
4166 case AND:
4167 convert_op (&XEXP (src, 0), insn);
4168 convert_op (&XEXP (src, 1), insn);
4169 PUT_MODE (src, V2DImode);
4170 break;
4172 case NEG:
4173 src = XEXP (src, 0);
4174 convert_op (&src, insn);
4175 subreg = gen_reg_rtx (V2DImode);
4176 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4177 src = gen_rtx_MINUS (V2DImode, subreg, src);
4178 break;
4180 case NOT:
4181 src = XEXP (src, 0);
4182 convert_op (&src, insn);
4183 subreg = gen_reg_rtx (V2DImode);
4184 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4185 src = gen_rtx_XOR (V2DImode, src, subreg);
4186 break;
4188 case MEM:
4189 if (!REG_P (dst))
4190 convert_op (&src, insn);
4191 break;
4193 case REG:
4194 if (!MEM_P (dst))
4195 convert_op (&src, insn);
4196 break;
4198 case SUBREG:
4199 gcc_assert (GET_MODE (src) == V2DImode);
4200 break;
4202 case COMPARE:
4203 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4205 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4206 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4208 if (REG_P (src))
4209 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4210 else
4211 subreg = copy_rtx_if_shared (src);
4212 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4213 copy_rtx_if_shared (subreg),
4214 copy_rtx_if_shared (subreg)),
4215 insn);
4216 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4217 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4218 copy_rtx_if_shared (src)),
4219 UNSPEC_PTEST);
4220 break;
4222 case CONST_INT:
4223 convert_op (&src, insn);
4224 break;
4226 default:
4227 gcc_unreachable ();
4230 SET_SRC (def_set) = src;
4231 SET_DEST (def_set) = dst;
4233 /* Drop possible dead definitions. */
4234 PATTERN (insn) = def_set;
4236 INSN_CODE (insn) = -1;
4237 recog_memoized (insn);
4238 df_insn_rescan (insn);
4241 /* Fix uses of converted REG in debug insns. */
4243 void
4244 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4246 if (!flag_var_tracking)
4247 return;
4249 df_ref ref, next;
4250 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4252 rtx_insn *insn = DF_REF_INSN (ref);
4253 /* Make sure the next ref is for a different instruction,
4254 so that we're not affected by the rescan. */
4255 next = DF_REF_NEXT_REG (ref);
4256 while (next && DF_REF_INSN (next) == insn)
4257 next = DF_REF_NEXT_REG (next);
4259 if (DEBUG_INSN_P (insn))
4261 /* It may be a debug insn with a TImode variable in
4262 register. */
4263 bool changed = false;
4264 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4266 rtx *loc = DF_REF_LOC (ref);
4267 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4269 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4270 changed = true;
4273 if (changed)
4274 df_insn_rescan (insn);
4279 /* Convert INSN from TImode to V1TImode. */
4281 void
4282 timode_scalar_chain::convert_insn (rtx_insn *insn)
4284 rtx def_set = single_set (insn);
4285 rtx src = SET_SRC (def_set);
4286 rtx dst = SET_DEST (def_set);
4288 switch (GET_CODE (dst))
4290 case REG:
4292 rtx tmp = find_reg_equal_equiv_note (insn);
4293 if (tmp)
4294 PUT_MODE (XEXP (tmp, 0), V1TImode);
4295 PUT_MODE (dst, V1TImode);
4296 fix_debug_reg_uses (dst);
4298 break;
4299 case MEM:
4300 PUT_MODE (dst, V1TImode);
4301 break;
4303 default:
4304 gcc_unreachable ();
4307 switch (GET_CODE (src))
4309 case REG:
4310 PUT_MODE (src, V1TImode);
4311 /* Call fix_debug_reg_uses only if SRC is never defined. */
4312 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4313 fix_debug_reg_uses (src);
4314 break;
4316 case MEM:
4317 PUT_MODE (src, V1TImode);
4318 break;
4320 case CONST_WIDE_INT:
4321 if (NONDEBUG_INSN_P (insn))
4323 /* Since there are no instructions to store a 128-bit constant,
4324 a temporary register is required. */
4325 rtx tmp = gen_reg_rtx (V1TImode);
4326 start_sequence ();
4327 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4328 src = validize_mem (force_const_mem (V1TImode, src));
4329 rtx_insn *seq = get_insns ();
4330 end_sequence ();
4331 if (seq)
4332 emit_insn_before (seq, insn);
4333 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4334 dst = tmp;
4336 break;
4338 case CONST_INT:
4339 switch (standard_sse_constant_p (src, TImode))
4341 case 1:
4342 src = CONST0_RTX (GET_MODE (dst));
4343 break;
4344 case 2:
4345 src = CONSTM1_RTX (GET_MODE (dst));
4346 break;
4347 default:
4348 gcc_unreachable ();
4350 if (NONDEBUG_INSN_P (insn))
4352 rtx tmp = gen_reg_rtx (V1TImode);
4353 /* Since there are no instructions to store a standard SSE
4354 constant, a temporary register is required. */
4355 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4356 dst = tmp;
4358 break;
4360 default:
4361 gcc_unreachable ();
4364 SET_SRC (def_set) = src;
4365 SET_DEST (def_set) = dst;
4367 /* Drop possible dead definitions. */
4368 PATTERN (insn) = def_set;
4370 INSN_CODE (insn) = -1;
4371 recog_memoized (insn);
4372 df_insn_rescan (insn);
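/* Convert all registers defined within the chain, and make vector
   copies of registers that are used by the chain but defined outside
   of it.  */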
4375 void
4376 dimode_scalar_chain::convert_registers ()
4378 bitmap_iterator bi;
4379 unsigned id;
4381 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4382 convert_reg (id);
4384 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4385 make_vector_copies (id);
4388 /* Convert the whole chain, creating the required register
4389 conversions and copies. */
4392 scalar_chain::convert ()
4394 bitmap_iterator bi;
4395 unsigned id;
4396 int converted_insns = 0;
4398 if (!dbg_cnt (stv_conversion))
4399 return 0;
4401 if (dump_file)
4402 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4404 convert_registers ();
4406 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4408 convert_insn (DF_INSN_UID_GET (id)->insn);
4409 converted_insns++;
4412 return converted_insns;
4415 /* Main STV pass function. Find and convert scalar
4416 instructions into vector mode when profitable. */
4418 static unsigned int
4419 convert_scalars_to_vector ()
4421 basic_block bb;
4422 bitmap candidates;
4423 int converted_insns = 0;
4425 bitmap_obstack_initialize (NULL);
4426 candidates = BITMAP_ALLOC (NULL);
4428 calculate_dominance_info (CDI_DOMINATORS);
4429 df_set_flags (DF_DEFER_INSN_RESCAN);
4430 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4431 df_md_add_problem ();
4432 df_analyze ();
4434 /* Find all instructions we want to convert into vector mode. */
4435 if (dump_file)
4436 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4438 FOR_EACH_BB_FN (bb, cfun)
4440 rtx_insn *insn;
4441 FOR_BB_INSNS (bb, insn)
4442 if (scalar_to_vector_candidate_p (insn))
4444 if (dump_file)
4445 fprintf (dump_file, " insn %d is marked as a candidate\n",
4446 INSN_UID (insn));
4448 bitmap_set_bit (candidates, INSN_UID (insn));
4452 remove_non_convertible_regs (candidates);
4454 if (bitmap_empty_p (candidates))
4455 if (dump_file)
4456 fprintf (dump_file, "There are no candidates for optimization.\n");
4458 while (!bitmap_empty_p (candidates))
4460 unsigned uid = bitmap_first_set_bit (candidates);
4461 scalar_chain *chain;
4463 if (TARGET_64BIT)
4464 chain = new timode_scalar_chain;
4465 else
4466 chain = new dimode_scalar_chain;
4468 /* Find instructions chain we want to convert to vector mode.
4469 Check all uses and definitions to estimate all required
4470 conversions. */
4471 chain->build (candidates, uid);
4473 if (chain->compute_convert_gain () > 0)
4474 converted_insns += chain->convert ();
4475 else
4476 if (dump_file)
4477 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4478 chain->chain_id);
4480 delete chain;
4483 if (dump_file)
4484 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4486 BITMAP_FREE (candidates);
4487 bitmap_obstack_release (NULL);
4488 df_process_deferred_rescans ();
4490 /* Conversion means we may have 128bit register spills/fills
4491 which require an aligned stack. */
4492 if (converted_insns)
4494 if (crtl->stack_alignment_needed < 128)
4495 crtl->stack_alignment_needed = 128;
4496 if (crtl->stack_alignment_estimated < 128)
4497 crtl->stack_alignment_estimated = 128;
4498 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4499 if (TARGET_64BIT)
4500 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4501 parm; parm = DECL_CHAIN (parm))
4503 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4504 continue;
4505 if (DECL_RTL_SET_P (parm)
4506 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4508 rtx r = DECL_RTL (parm);
4509 if (REG_P (r))
4510 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4512 if (DECL_INCOMING_RTL (parm)
4513 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4515 rtx r = DECL_INCOMING_RTL (parm);
4516 if (REG_P (r))
4517 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4522 return 0;
4525 namespace {
4527 const pass_data pass_data_insert_vzeroupper =
4529 RTL_PASS, /* type */
4530 "vzeroupper", /* name */
4531 OPTGROUP_NONE, /* optinfo_flags */
4532 TV_MACH_DEP, /* tv_id */
4533 0, /* properties_required */
4534 0, /* properties_provided */
4535 0, /* properties_destroyed */
4536 0, /* todo_flags_start */
4537 TODO_df_finish, /* todo_flags_finish */
4540 class pass_insert_vzeroupper : public rtl_opt_pass
4542 public:
4543 pass_insert_vzeroupper(gcc::context *ctxt)
4544 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4547 /* opt_pass methods: */
4548 virtual bool gate (function *)
4550 return TARGET_AVX && !TARGET_AVX512F
4551 && TARGET_VZEROUPPER && flag_expensive_optimizations
4552 && !optimize_size;
4555 virtual unsigned int execute (function *)
4557 return rest_of_handle_insert_vzeroupper ();
4560 }; // class pass_insert_vzeroupper
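/* The STV (scalar-to-vector) pass.  It is instantiated twice: once for
   the DImode transform on 32-bit targets and once for the TImode
   transform on 64-bit targets; set_pass_param below selects which
   variant a given instance performs.  */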
4562 const pass_data pass_data_stv =
4564 RTL_PASS, /* type */
4565 "stv", /* name */
4566 OPTGROUP_NONE, /* optinfo_flags */
4567 TV_MACH_DEP, /* tv_id */
4568 0, /* properties_required */
4569 0, /* properties_provided */
4570 0, /* properties_destroyed */
4571 0, /* todo_flags_start */
4572 TODO_df_finish, /* todo_flags_finish */
4575 class pass_stv : public rtl_opt_pass
4577 public:
4578 pass_stv (gcc::context *ctxt)
4579 : rtl_opt_pass (pass_data_stv, ctxt),
4580 timode_p (false)
4583 /* opt_pass methods: */
4584 virtual bool gate (function *)
4586 return (timode_p == !!TARGET_64BIT
4587 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4590 virtual unsigned int execute (function *)
4592 return convert_scalars_to_vector ();
4595 opt_pass *clone ()
4597 return new pass_stv (m_ctxt);
4600 void set_pass_param (unsigned int n, bool param)
4602 gcc_assert (n == 0);
4603 timode_p = param;
4606 private:
4607 bool timode_p;
4608 }; // class pass_stv
4610 } // anon namespace
4612 rtl_opt_pass *
4613 make_pass_insert_vzeroupper (gcc::context *ctxt)
4615 return new pass_insert_vzeroupper (ctxt);
4618 rtl_opt_pass *
4619 make_pass_stv (gcc::context *ctxt)
4621 return new pass_stv (ctxt);
4624 /* Return true if a red-zone is in use. */
4626 bool
4627 ix86_using_red_zone (void)
4629 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4632 /* Return a string that documents the current -m options. The caller is
4633 responsible for freeing the string. */
4635 static char *
4636 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4637 int flags, int flags2,
4638 const char *arch, const char *tune,
4639 enum fpmath_unit fpmath, bool add_nl_p)
4641 struct ix86_target_opts
4643 const char *option; /* option string */
4644 HOST_WIDE_INT mask; /* isa mask options */
4647 /* This table is ordered so that options like -msse4.2 that imply other
4648 ISAs come first. Target string will be displayed in the same order. */
4649 static struct ix86_target_opts isa2_opts[] =
4651 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4652 { "-msgx", OPTION_MASK_ISA_SGX },
4653 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4654 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4655 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4657 static struct ix86_target_opts isa_opts[] =
4659 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4660 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4661 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4662 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4663 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4664 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4665 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4666 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4667 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4668 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4669 { "-mfma", OPTION_MASK_ISA_FMA },
4670 { "-mxop", OPTION_MASK_ISA_XOP },
4671 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4672 { "-mf16c", OPTION_MASK_ISA_F16C },
4673 { "-mavx", OPTION_MASK_ISA_AVX },
4674 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4675 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4676 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4677 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4678 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4679 { "-msse3", OPTION_MASK_ISA_SSE3 },
4680 { "-maes", OPTION_MASK_ISA_AES },
4681 { "-msha", OPTION_MASK_ISA_SHA },
4682 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4683 { "-msse2", OPTION_MASK_ISA_SSE2 },
4684 { "-msse", OPTION_MASK_ISA_SSE },
4685 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4686 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4687 { "-mmmx", OPTION_MASK_ISA_MMX },
4688 { "-mrtm", OPTION_MASK_ISA_RTM },
4689 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4690 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4691 { "-madx", OPTION_MASK_ISA_ADX },
4692 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4693 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4694 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4695 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4696 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4697 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4698 { "-mabm", OPTION_MASK_ISA_ABM },
4699 { "-mbmi", OPTION_MASK_ISA_BMI },
4700 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4701 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4702 { "-mtbm", OPTION_MASK_ISA_TBM },
4703 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4704 { "-mcx16", OPTION_MASK_ISA_CX16 },
4705 { "-msahf", OPTION_MASK_ISA_SAHF },
4706 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4707 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4708 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4709 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4710 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4711 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4712 { "-mpku", OPTION_MASK_ISA_PKU },
4713 { "-mlwp", OPTION_MASK_ISA_LWP },
4714 { "-mhle", OPTION_MASK_ISA_HLE },
4715 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4716 { "-mmpx", OPTION_MASK_ISA_MPX },
4717 { "-mclwb", OPTION_MASK_ISA_CLWB }
4720 /* Flag options. */
4721 static struct ix86_target_opts flag_opts[] =
4723 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4724 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4725 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4726 { "-m80387", MASK_80387 },
4727 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4728 { "-malign-double", MASK_ALIGN_DOUBLE },
4729 { "-mcld", MASK_CLD },
4730 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4731 { "-mieee-fp", MASK_IEEE_FP },
4732 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4733 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4734 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4735 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4736 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4737 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4738 { "-mno-red-zone", MASK_NO_RED_ZONE },
4739 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4740 { "-mrecip", MASK_RECIP },
4741 { "-mrtd", MASK_RTD },
4742 { "-msseregparm", MASK_SSEREGPARM },
4743 { "-mstack-arg-probe", MASK_STACK_PROBE },
4744 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4745 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4746 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4747 { "-mvzeroupper", MASK_VZEROUPPER },
4748 { "-mstv", MASK_STV },
4749 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4750 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4751 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4752 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4755 /* Additional flag options. */
4756 static struct ix86_target_opts flag2_opts[] =
4758 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4761 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4762 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4764 char isa_other[40];
4765 char isa2_other[40];
4766 char flags_other[40];
4767 char flags2_other[40];
4768 unsigned num = 0;
4769 unsigned i, j;
4770 char *ret;
4771 char *ptr;
4772 size_t len;
4773 size_t line_len;
4774 size_t sep_len;
4775 const char *abi;
4777 memset (opts, '\0', sizeof (opts));
4779 /* Add -march= option. */
4780 if (arch)
4782 opts[num][0] = "-march=";
4783 opts[num++][1] = arch;
4786 /* Add -mtune= option. */
4787 if (tune)
4789 opts[num][0] = "-mtune=";
4790 opts[num++][1] = tune;
4793 /* Add -m32/-m64/-mx32. */
4794 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4796 if ((isa & OPTION_MASK_ABI_64) != 0)
4797 abi = "-m64";
4798 else
4799 abi = "-mx32";
4800 isa &= ~ (OPTION_MASK_ISA_64BIT
4801 | OPTION_MASK_ABI_64
4802 | OPTION_MASK_ABI_X32);
4804 else
4805 abi = "-m32";
4806 opts[num++][0] = abi;
4808 /* Pick out the options in isa2 options. */
4809 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4811 if ((isa2 & isa2_opts[i].mask) != 0)
4813 opts[num++][0] = isa2_opts[i].option;
4814 isa2 &= ~ isa2_opts[i].mask;
4818 if (isa2 && add_nl_p)
4820 opts[num++][0] = isa2_other;
4821 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4824 /* Pick out the options in isa options. */
4825 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4827 if ((isa & isa_opts[i].mask) != 0)
4829 opts[num++][0] = isa_opts[i].option;
4830 isa &= ~ isa_opts[i].mask;
4834 if (isa && add_nl_p)
4836 opts[num++][0] = isa_other;
4837 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4840 /* Add flag options. */
4841 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4843 if ((flags & flag_opts[i].mask) != 0)
4845 opts[num++][0] = flag_opts[i].option;
4846 flags &= ~ flag_opts[i].mask;
4850 if (flags && add_nl_p)
4852 opts[num++][0] = flags_other;
4853 sprintf (flags_other, "(other flags: %#x)", flags);
4856 /* Add additional flag options. */
4857 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4859 if ((flags2 & flag2_opts[i].mask) != 0)
4861 opts[num++][0] = flag2_opts[i].option;
4862 flags2 &= ~ flag2_opts[i].mask;
4866 if (flags2 && add_nl_p)
4868 opts[num++][0] = flags2_other;
4869 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4872 /* Add -fpmath= option. */
4873 if (fpmath)
4875 opts[num][0] = "-mfpmath=";
4876 switch ((int) fpmath)
4878 case FPMATH_387:
4879 opts[num++][1] = "387";
4880 break;
4882 case FPMATH_SSE:
4883 opts[num++][1] = "sse";
4884 break;
4886 case FPMATH_387 | FPMATH_SSE:
4887 opts[num++][1] = "sse+387";
4888 break;
4890 default:
4891 gcc_unreachable ();
4895 /* Any options? */
4896 if (num == 0)
4897 return NULL;
4899 gcc_assert (num < ARRAY_SIZE (opts));
4901 /* Size the string. */
4902 len = 0;
4903 sep_len = (add_nl_p) ? 3 : 1;
4904 for (i = 0; i < num; i++)
4906 len += sep_len;
4907 for (j = 0; j < 2; j++)
4908 if (opts[i][j])
4909 len += strlen (opts[i][j]);
4912 /* Build the string. */
4913 ret = ptr = (char *) xmalloc (len);
4914 line_len = 0;
4916 for (i = 0; i < num; i++)
4918 size_t len2[2];
4920 for (j = 0; j < 2; j++)
4921 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4923 if (i != 0)
4925 *ptr++ = ' ';
4926 line_len++;
4928 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4930 *ptr++ = '\\';
4931 *ptr++ = '\n';
4932 line_len = 0;
4936 for (j = 0; j < 2; j++)
4937 if (opts[i][j])
4939 memcpy (ptr, opts[i][j], len2[j]);
4940 ptr += len2[j];
4941 line_len += len2[j];
4945 *ptr = '\0';
4946 gcc_assert (ret + len >= ptr);
4948 return ret;
4951 /* Return true if profiling code should be emitted before the
4952 prologue, and false otherwise.
4953 On x86 this is the case when -mfentry is in effect, as used for hot patching. */
4954 static bool
4955 ix86_profile_before_prologue (void)
4957 return flag_fentry != 0;
4960 /* Function that is callable from the debugger to print the current
4961 options. */
4962 void ATTRIBUTE_UNUSED
4963 ix86_debug_options (void)
4965 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4966 target_flags, ix86_target_flags,
4967 ix86_arch_string,ix86_tune_string,
4968 ix86_fpmath, true);
4970 if (opts)
4972 fprintf (stderr, "%s\n\n", opts);
4973 free (opts);
4975 else
4976 fputs ("<no options>\n\n", stderr);
4978 return;
4981 /* Return true if T is one of the bytes we should avoid with
4982 -fmitigate-rop. */
4984 static bool
4985 ix86_rop_should_change_byte_p (int t)
4987 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
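/* Names of the string operation algorithms, generated from stringop.def.
   Used below to parse -mmemcpy-strategy= and -mmemset-strategy=.  */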
4990 static const char *stringop_alg_names[] = {
4991 #define DEF_ENUM
4992 #define DEF_ALG(alg, name) #name,
4993 #include "stringop.def"
4994 #undef DEF_ENUM
4995 #undef DEF_ALG
4998 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4999 The string is of the following form (or comma separated list of it):
5001 strategy_alg:max_size:[align|noalign]
5003 where the full size range for the strategy is either [0, max_size] or
5004 [min_size, max_size], in which min_size is the max_size + 1 of the
5005 preceding range. The last size range must have max_size == -1.
5007 Examples:
5010 -mmemcpy-strategy=libcall:-1:noalign
5012 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5016 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5018 This is to tell the compiler to use the following strategy for memset
5019 1) when the expected size is between [1, 16], use rep_8byte strategy;
5020 2) when the size is between [17, 2048], use vector_loop;
5021 3) when the size is > 2048, use libcall. */
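/* One size range parsed from the strategy string: the upper size bound,
   the algorithm to use for it, and whether the "noalign" variant was
   requested.  */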
5023 struct stringop_size_range
5025 int max;
5026 stringop_alg alg;
5027 bool noalign;
5030 static void
5031 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5033 const struct stringop_algs *default_algs;
5034 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5035 char *curr_range_str, *next_range_str;
5036 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
5037 int i = 0, n = 0;
5039 if (is_memset)
5040 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5041 else
5042 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5044 curr_range_str = strategy_str;
5048 int maxs;
5049 char alg_name[128];
5050 char align[16];
5051 next_range_str = strchr (curr_range_str, ',');
5052 if (next_range_str)
5053 *next_range_str++ = '\0';
5055 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5056 alg_name, &maxs, align))
5058 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5059 return;
5062 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5064 error ("size ranges of option %qs should be increasing", opt);
5065 return;
5068 for (i = 0; i < last_alg; i++)
5069 if (!strcmp (alg_name, stringop_alg_names[i]))
5070 break;
5072 if (i == last_alg)
5074 error ("wrong strategy name %qs specified for option %qs",
5075 alg_name, opt);
5077 auto_vec <const char *> candidates;
5078 for (i = 0; i < last_alg; i++)
5079 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5080 candidates.safe_push (stringop_alg_names[i]);
5082 char *s;
5083 const char *hint
5084 = candidates_list_and_hint (alg_name, s, candidates);
5085 if (hint)
5086 inform (input_location,
5087 "valid arguments to %qs are: %s; did you mean %qs?",
5088 opt, s, hint);
5089 else
5090 inform (input_location, "valid arguments to %qs are: %s",
5091 opt, s);
5092 XDELETEVEC (s);
5093 return;
5096 if ((stringop_alg) i == rep_prefix_8_byte
5097 && !TARGET_64BIT)
5099 /* rep; movq isn't available in 32-bit code. */
5100 error ("strategy name %qs specified for option %qs "
5101 "not supported for 32-bit code", alg_name, opt);
5102 return;
5105 input_ranges[n].max = maxs;
5106 input_ranges[n].alg = (stringop_alg) i;
5107 if (!strcmp (align, "align"))
5108 input_ranges[n].noalign = false;
5109 else if (!strcmp (align, "noalign"))
5110 input_ranges[n].noalign = true;
5111 else
5113 error ("unknown alignment %qs specified for option %qs", align, opt);
5114 return;
5116 n++;
5117 curr_range_str = next_range_str;
5119 while (curr_range_str);
5121 if (input_ranges[n - 1].max != -1)
5123 error ("the max value for the last size range should be -1"
5124 " for option %qs", opt);
5125 return;
5128 if (n > MAX_STRINGOP_ALGS)
5130 error ("too many size ranges specified in option %qs", opt);
5131 return;
5134 /* Now override the default algs array. */
5135 for (i = 0; i < n; i++)
5137 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5138 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5139 = input_ranges[i].alg;
5140 *const_cast<int *>(&default_algs->size[i].noalign)
5141 = input_ranges[i].noalign;
5146 /* Parse the -mtune-ctrl= option. When DUMP is true,
5147 print the features that are explicitly set. */
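/* For example, -mtune-ctrl=use_incdec,^partial_reg_stall would set the
   first feature and clear the second, since a leading '^' negates an
   entry (the feature names here are only illustrative; the valid set
   comes from ix86_tune_feature_names).  */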
5149 static void
5150 parse_mtune_ctrl_str (bool dump)
5152 if (!ix86_tune_ctrl_string)
5153 return;
5155 char *next_feature_string = NULL;
5156 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5157 char *orig = curr_feature_string;
5158 int i;
5161 bool clear = false;
5163 next_feature_string = strchr (curr_feature_string, ',');
5164 if (next_feature_string)
5165 *next_feature_string++ = '\0';
5166 if (*curr_feature_string == '^')
5168 curr_feature_string++;
5169 clear = true;
5171 for (i = 0; i < X86_TUNE_LAST; i++)
5173 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5175 ix86_tune_features[i] = !clear;
5176 if (dump)
5177 fprintf (stderr, "Explicitly %s feature %s\n",
5178 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5179 break;
5182 if (i == X86_TUNE_LAST)
5183 error ("unknown parameter to option -mtune-ctrl: %s",
5184 clear ? curr_feature_string - 1 : curr_feature_string);
5185 curr_feature_string = next_feature_string;
5187 while (curr_feature_string);
5188 free (orig);
5191 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5192 processor type. */
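/* Each entry of initial_ix86_tune_features is a bitmask over processor
   types, so testing it against (1u << ix86_tune) yields the default
   on/off state of that tuning knob for the selected processor, unless
   ix86_tune_no_default or the -mtune-ctrl= string overrides it below.  */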
5194 static void
5195 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5197 unsigned int ix86_tune_mask = 1u << ix86_tune;
5198 int i;
5200 for (i = 0; i < X86_TUNE_LAST; ++i)
5202 if (ix86_tune_no_default)
5203 ix86_tune_features[i] = 0;
5204 else
5205 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5208 if (dump)
5210 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5211 for (i = 0; i < X86_TUNE_LAST; i++)
5212 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5213 ix86_tune_features[i] ? "on" : "off");
5216 parse_mtune_ctrl_str (dump);
5220 /* Default align_* from the processor table. */
5222 static void
5223 ix86_default_align (struct gcc_options *opts)
5225 if (opts->x_align_loops == 0)
5227 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5228 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5230 if (opts->x_align_jumps == 0)
5232 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5233 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5235 if (opts->x_align_functions == 0)
5237 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5241 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5243 static void
5244 ix86_override_options_after_change (void)
5246 ix86_default_align (&global_options);
5249 /* Override various settings based on options. If MAIN_ARGS_P, the
5250 options are from the command line, otherwise they are from
5251 attributes. Return true if there's an error related to march
5252 option. */
5254 static bool
5255 ix86_option_override_internal (bool main_args_p,
5256 struct gcc_options *opts,
5257 struct gcc_options *opts_set)
5259 int i;
5260 unsigned int ix86_arch_mask;
5261 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
5263 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5264 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5265 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5266 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5267 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5268 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5269 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5270 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5271 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5272 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5273 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5274 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5275 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5276 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5277 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5278 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5279 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5280 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5281 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5282 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5283 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5284 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5285 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5286 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5287 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5288 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5289 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5290 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5291 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5292 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5293 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5294 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5295 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5296 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5297 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5298 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5299 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5300 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5301 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5302 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5303 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5304 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5305 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5306 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5307 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5308 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5309 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5310 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5311 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5312 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5313 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5314 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5315 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5316 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5317 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5318 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5319 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5320 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5321 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5322 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5323 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5324 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5325 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5326 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5328 #define PTA_CORE2 \
5329 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5330 | PTA_CX16 | PTA_FXSR)
5331 #define PTA_NEHALEM \
5332 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5333 #define PTA_WESTMERE \
5334 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5335 #define PTA_SANDYBRIDGE \
5336 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5337 #define PTA_IVYBRIDGE \
5338 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5339 #define PTA_HASWELL \
5340 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5341 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5342 #define PTA_BROADWELL \
5343 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5344 #define PTA_SKYLAKE \
5345 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5346 #define PTA_SKYLAKE_AVX512 \
5347 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5348 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5349 #define PTA_KNL \
5350 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5351 #define PTA_BONNELL \
5352 (PTA_CORE2 | PTA_MOVBE)
5353 #define PTA_SILVERMONT \
5354 (PTA_WESTMERE | PTA_MOVBE)
5356 /* If this reaches 64, we need to widen the struct pta flags field below. */
5358 static struct pta
5360 const char *const name; /* processor name or nickname. */
5361 const enum processor_type processor;
5362 const enum attr_cpu schedule;
5363 const unsigned HOST_WIDE_INT flags;
5365 const processor_alias_table[] =
5367 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5368 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5369 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5370 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5371 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5372 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5373 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5374 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5375 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5376 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5377 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5378 PTA_MMX | PTA_SSE | PTA_FXSR},
5379 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5380 PTA_MMX | PTA_SSE | PTA_FXSR},
5381 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5382 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5383 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5384 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5385 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5386 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5387 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5388 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5389 PTA_MMX | PTA_SSE | PTA_FXSR},
5390 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5391 PTA_MMX | PTA_SSE | PTA_FXSR},
5392 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5393 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5394 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5395 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
5396 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5397 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5398 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5399 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5400 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5401 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5402 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5403 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5404 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5405 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5406 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5407 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5408 PTA_SANDYBRIDGE},
5409 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5410 PTA_SANDYBRIDGE},
5411 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5412 PTA_IVYBRIDGE},
5413 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5414 PTA_IVYBRIDGE},
5415 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5416 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5417 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5418 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5419 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5420 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5421 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5422 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5423 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5424 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5425 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5426 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5427 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5428 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5429 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5430 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5431 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5432 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5433 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5434 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5435 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5436 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5437 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5438 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5439 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5440 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5441 {"x86-64", PROCESSOR_K8, CPU_K8,
5442 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5443 {"eden-x2", PROCESSOR_K8, CPU_K8,
5444 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5445 {"nano", PROCESSOR_K8, CPU_K8,
5446 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5447 | PTA_SSSE3 | PTA_FXSR},
5448 {"nano-1000", PROCESSOR_K8, CPU_K8,
5449 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5450 | PTA_SSSE3 | PTA_FXSR},
5451 {"nano-2000", PROCESSOR_K8, CPU_K8,
5452 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5453 | PTA_SSSE3 | PTA_FXSR},
5454 {"nano-3000", PROCESSOR_K8, CPU_K8,
5455 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5456 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5457 {"nano-x2", PROCESSOR_K8, CPU_K8,
5458 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5459 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5460 {"eden-x4", PROCESSOR_K8, CPU_K8,
5461 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5462 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5463 {"nano-x4", PROCESSOR_K8, CPU_K8,
5464 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5465 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5466 {"k8", PROCESSOR_K8, CPU_K8,
5467 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5468 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5469 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5470 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5471 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5472 {"opteron", PROCESSOR_K8, CPU_K8,
5473 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5474 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5475 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5476 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5477 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5478 {"athlon64", PROCESSOR_K8, CPU_K8,
5479 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5480 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5481 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5482 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5483 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5484 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5485 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5486 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5487 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5488 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5489 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5490 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5491 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5492 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5493 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5494 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5495 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5496 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5497 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5498 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5499 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5500 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5501 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5502 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5503 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5504 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5505 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5506 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5507 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5508 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5509 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5510 | PTA_XSAVEOPT | PTA_FSGSBASE},
5511 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5512 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5513 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5514 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5515 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5516 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5517 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5518 | PTA_MOVBE | PTA_MWAITX},
5519 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5520 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5521 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5522 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5523 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5524 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5525 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5526 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5527 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5528 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5529 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5530 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
5531 | PTA_FXSR | PTA_XSAVE},
5532 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5533 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5534 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
5535 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5536 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5537 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5539 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5540 PTA_64BIT
5541 | PTA_HLE /* flags are only used for -march switch. */ },
5544 /* -mrecip options. */
5545 static struct
5547 const char *string; /* option name */
5548 unsigned int mask; /* mask bits to set */
5550 const recip_options[] =
5552 { "all", RECIP_MASK_ALL },
5553 { "none", RECIP_MASK_NONE },
5554 { "div", RECIP_MASK_DIV },
5555 { "sqrt", RECIP_MASK_SQRT },
5556 { "vec-div", RECIP_MASK_VEC_DIV },
5557 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5560 int const pta_size = ARRAY_SIZE (processor_alias_table);
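/* Sketch of how the tables above are consumed: matching -march=haswell
   against processor_alias_table picks PROCESSOR_HASWELL for ix86_arch,
   CPU_HASWELL for ix86_schedule, and its PTA_HASWELL flags are then
   translated into the corresponding OPTION_MASK_ISA_* bits in the loop
   over the table further below.  */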
5562 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5563 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5564 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5565 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5566 #ifdef TARGET_BI_ARCH
5567 else
5569 #if TARGET_BI_ARCH == 1
5570 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5571 is on and OPTION_MASK_ABI_X32 is off. We turn off
5572 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5573 -mx32. */
5574 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5575 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5576 #else
5577 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5578 on and OPTION_MASK_ABI_64 is off. We turn off
5579 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5580 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5581 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5582 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5583 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5584 #endif
5585 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5586 && TARGET_IAMCU_P (opts->x_target_flags))
5587 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5588 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5590 #endif
5592 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5594 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5595 OPTION_MASK_ABI_64 for TARGET_X32. */
5596 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5597 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5599 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5600 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5601 | OPTION_MASK_ABI_X32
5602 | OPTION_MASK_ABI_64);
5603 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5605 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5606 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5608 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
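/* After the adjustments above the ABI selection is: x32 ends up with
   OPTION_MASK_ISA_64BIT set and OPTION_MASK_ABI_64 clear, LP64 (-m64)
   with OPTION_MASK_ISA_64BIT set and OPTION_MASK_ABI_X32 clear, and
   -m16 with ISA_64BIT, ABI_X32 and ABI_64 all clear.  */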
5611 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5612 SUBTARGET_OVERRIDE_OPTIONS;
5613 #endif
5615 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5616 SUBSUBTARGET_OVERRIDE_OPTIONS;
5617 #endif
5619 /* -fPIC is the default for x86_64. */
5620 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5621 opts->x_flag_pic = 2;
5623 /* Need to check -mtune=generic first. */
5624 if (opts->x_ix86_tune_string)
5626 /* As special support for cross compilers we read -mtune=native
5627 as -mtune=generic. With native compilers we won't see the
5628 -mtune=native, as it was changed by the driver. */
5629 if (!strcmp (opts->x_ix86_tune_string, "native"))
5631 opts->x_ix86_tune_string = "generic";
5633 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5634 warning (OPT_Wdeprecated,
5635 main_args_p
5636 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5637 "or %<-mtune=generic%> instead as appropriate")
5638 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5639 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5640 " instead as appropriate"));
5642 else
5644 if (opts->x_ix86_arch_string)
5645 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5646 if (!opts->x_ix86_tune_string)
5648 opts->x_ix86_tune_string
5649 = processor_target_table[TARGET_CPU_DEFAULT].name;
5650 ix86_tune_defaulted = 1;
5653 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5654 or defaulted. We need to use a sensible tune option. */
5655 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5657 opts->x_ix86_tune_string = "generic";
5661 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5662 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5664 /* rep; movq isn't available in 32-bit code. */
5665 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5666 opts->x_ix86_stringop_alg = no_stringop;
5669 if (!opts->x_ix86_arch_string)
5670 opts->x_ix86_arch_string
5671 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5672 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5673 else
5674 ix86_arch_specified = 1;
5676 if (opts_set->x_ix86_pmode)
5678 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5679 && opts->x_ix86_pmode == PMODE_SI)
5680 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5681 && opts->x_ix86_pmode == PMODE_DI))
5682 error ("address mode %qs not supported in the %s bit mode",
5683 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5684 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5686 else
5687 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5688 ? PMODE_DI : PMODE_SI;
5690 if (!opts_set->x_ix86_abi)
5691 opts->x_ix86_abi = DEFAULT_ABI;
5693 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
5694 error ("-mabi=ms not supported with X32 ABI");
5695 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
5697 /* For targets using ms ABI enable ms-extensions, if not
5698 explicitly turned off. For non-ms ABI we turn off this
5699 option. */
5700 if (!opts_set->x_flag_ms_extensions)
5701 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5703 if (opts_set->x_ix86_cmodel)
5705 switch (opts->x_ix86_cmodel)
5707 case CM_SMALL:
5708 case CM_SMALL_PIC:
5709 if (opts->x_flag_pic)
5710 opts->x_ix86_cmodel = CM_SMALL_PIC;
5711 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5712 error ("code model %qs not supported in the %s bit mode",
5713 "small", "32");
5714 break;
5716 case CM_MEDIUM:
5717 case CM_MEDIUM_PIC:
5718 if (opts->x_flag_pic)
5719 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5720 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5721 error ("code model %qs not supported in the %s bit mode",
5722 "medium", "32");
5723 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5724 error ("code model %qs not supported in x32 mode",
5725 "medium");
5726 break;
5728 case CM_LARGE:
5729 case CM_LARGE_PIC:
5730 if (opts->x_flag_pic)
5731 opts->x_ix86_cmodel = CM_LARGE_PIC;
5732 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5733 error ("code model %qs not supported in the %s bit mode",
5734 "large", "32");
5735 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5736 error ("code model %qs not supported in x32 mode",
5737 "large");
5738 break;
5740 case CM_32:
5741 if (opts->x_flag_pic)
5742 error ("code model %s does not support PIC mode", "32");
5743 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5744 error ("code model %qs not supported in the %s bit mode",
5745 "32", "64");
5746 break;
5748 case CM_KERNEL:
5749 if (opts->x_flag_pic)
5751 error ("code model %s does not support PIC mode", "kernel");
5752 opts->x_ix86_cmodel = CM_32;
5754 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5755 error ("code model %qs not supported in the %s bit mode",
5756 "kernel", "32");
5757 break;
5759 default:
5760 gcc_unreachable ();
5763 else
5765 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5766 use of rip-relative addressing. This eliminates fixups that
5767 would otherwise be needed if this object is to be placed in a
5768 DLL, and is essentially just as efficient as direct addressing. */
5769 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5770 && (TARGET_RDOS || TARGET_PECOFF))
5771 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5772 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5773 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5774 else
5775 opts->x_ix86_cmodel = CM_32;
5777 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5779 error ("-masm=intel not supported in this configuration");
5780 opts->x_ix86_asm_dialect = ASM_ATT;
5782 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5783 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5784 sorry ("%i-bit mode not compiled in",
5785 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5787 for (i = 0; i < pta_size; i++)
5788 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5790 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5792 error (main_args_p
5793 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5794 "switch")
5795 : G_("%<generic%> CPU can be used only for "
5796 "%<target(\"tune=\")%> attribute"));
5797 return false;
5799 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5801 error (main_args_p
5802 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5803 "switch")
5804 : G_("%<intel%> CPU can be used only for "
5805 "%<target(\"tune=\")%> attribute"));
5806 return false;
5809 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5810 && !(processor_alias_table[i].flags & PTA_64BIT))
5812 error ("CPU you selected does not support x86-64 "
5813 "instruction set");
5814 return false;
5817 ix86_schedule = processor_alias_table[i].schedule;
5818 ix86_arch = processor_alias_table[i].processor;
5819 /* Default cpu tuning to the architecture. */
5820 ix86_tune = ix86_arch;
5822 if (processor_alias_table[i].flags & PTA_MMX
5823 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5824 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5825 if (processor_alias_table[i].flags & PTA_3DNOW
5826 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5827 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5828 if (processor_alias_table[i].flags & PTA_3DNOW_A
5829 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5830 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5831 if (processor_alias_table[i].flags & PTA_SSE
5832 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5833 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5834 if (processor_alias_table[i].flags & PTA_SSE2
5835 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5836 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5837 if (processor_alias_table[i].flags & PTA_SSE3
5838 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5839 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5840 if (processor_alias_table[i].flags & PTA_SSSE3
5841 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5842 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5843 if (processor_alias_table[i].flags & PTA_SSE4_1
5844 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5845 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5846 if (processor_alias_table[i].flags & PTA_SSE4_2
5847 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5848 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5849 if (processor_alias_table[i].flags & PTA_AVX
5850 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5851 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5852 if (processor_alias_table[i].flags & PTA_AVX2
5853 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5854 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5855 if (processor_alias_table[i].flags & PTA_FMA
5856 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5857 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5858 if (processor_alias_table[i].flags & PTA_SSE4A
5859 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5860 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5861 if (processor_alias_table[i].flags & PTA_FMA4
5862 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5863 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5864 if (processor_alias_table[i].flags & PTA_XOP
5865 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5866 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5867 if (processor_alias_table[i].flags & PTA_LWP
5868 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5869 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5870 if (processor_alias_table[i].flags & PTA_ABM
5871 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5872 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5873 if (processor_alias_table[i].flags & PTA_BMI
5874 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5875 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5876 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5877 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5878 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5879 if (processor_alias_table[i].flags & PTA_TBM
5880 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5881 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5882 if (processor_alias_table[i].flags & PTA_BMI2
5883 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5884 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5885 if (processor_alias_table[i].flags & PTA_CX16
5886 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5887 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5888 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5889 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5890 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5891 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5892 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5893 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5894 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5895 if (processor_alias_table[i].flags & PTA_MOVBE
5896 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5897 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5898 if (processor_alias_table[i].flags & PTA_AES
5899 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5900 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5901 if (processor_alias_table[i].flags & PTA_SHA
5902 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5903 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5904 if (processor_alias_table[i].flags & PTA_PCLMUL
5905 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5906 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5907 if (processor_alias_table[i].flags & PTA_FSGSBASE
5908 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5909 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5910 if (processor_alias_table[i].flags & PTA_RDRND
5911 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5912 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5913 if (processor_alias_table[i].flags & PTA_F16C
5914 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5915 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5916 if (processor_alias_table[i].flags & PTA_RTM
5917 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5918 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5919 if (processor_alias_table[i].flags & PTA_HLE
5920 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5921 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5922 if (processor_alias_table[i].flags & PTA_PRFCHW
5923 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5924 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5925 if (processor_alias_table[i].flags & PTA_RDSEED
5926 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5927 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5928 if (processor_alias_table[i].flags & PTA_ADX
5929 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5930 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5931 if (processor_alias_table[i].flags & PTA_FXSR
5932 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5933 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5934 if (processor_alias_table[i].flags & PTA_XSAVE
5935 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5937 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5940 if (processor_alias_table[i].flags & PTA_AVX512F
5941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5943 if (processor_alias_table[i].flags & PTA_AVX512ER
5944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5946 if (processor_alias_table[i].flags & PTA_AVX512PF
5947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5949 if (processor_alias_table[i].flags & PTA_AVX512CD
5950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5952 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5955 if (processor_alias_table[i].flags & PTA_CLWB
5956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5958 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5961 if (processor_alias_table[i].flags & PTA_CLZERO
5962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5964 if (processor_alias_table[i].flags & PTA_XSAVEC
5965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5967 if (processor_alias_table[i].flags & PTA_XSAVES
5968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5970 if (processor_alias_table[i].flags & PTA_AVX512DQ
5971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5973 if (processor_alias_table[i].flags & PTA_AVX512BW
5974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5976 if (processor_alias_table[i].flags & PTA_AVX512VL
5977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5979 if (processor_alias_table[i].flags & PTA_MPX
5980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5982 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5985 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5986 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5987 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5989 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5990 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5991 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5992 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5993 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5994 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5995 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5996 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5997 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5998 if (processor_alias_table[i].flags & PTA_SGX
5999 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
6000 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
6002 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
6003 x86_prefetch_sse = true;
6004 if (processor_alias_table[i].flags & PTA_MWAITX
6005 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
6006 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
6007 if (processor_alias_table[i].flags & PTA_PKU
6008 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
6009 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6011 /* Don't enable x87 instructions if only
6012 general registers are allowed. */
6013 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6014 && !(opts_set->x_target_flags & MASK_80387))
6016 if (processor_alias_table[i].flags & PTA_NO_80387)
6017 opts->x_target_flags &= ~MASK_80387;
6018 else
6019 opts->x_target_flags |= MASK_80387;
6021 break;
6024 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6025 error ("Intel MPX does not support x32");
6027 if (TARGET_X32 && (ix86_isa_flags & OPTION_MASK_ISA_MPX))
6028 error ("Intel MPX does not support x32");
6030 if (i == pta_size)
6032 error (main_args_p
6033 ? G_("bad value (%qs) for %<-march=%> switch")
6034 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6035 opts->x_ix86_arch_string);
6037 auto_vec <const char *> candidates;
6038 for (i = 0; i < pta_size; i++)
6039 if (strcmp (processor_alias_table[i].name, "generic")
6040 && strcmp (processor_alias_table[i].name, "intel")
6041 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6042 || (processor_alias_table[i].flags & PTA_64BIT)))
6043 candidates.safe_push (processor_alias_table[i].name);
6045 char *s;
6046 const char *hint
6047 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6048 if (hint)
6049 inform (input_location,
6050 main_args_p
6051 ? G_("valid arguments to %<-march=%> switch are: "
6052 "%s; did you mean %qs?")
6053 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6054 "%s; did you mean %qs?"), s, hint);
6055 else
6056 inform (input_location,
6057 main_args_p
6058 ? G_("valid arguments to %<-march=%> switch are: %s")
6059 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6060 "are: %s"), s);
6061 XDELETEVEC (s);
6064 ix86_arch_mask = 1u << ix86_arch;
6065 for (i = 0; i < X86_ARCH_LAST; ++i)
6066 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6068 for (i = 0; i < pta_size; i++)
6069 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6071 ix86_schedule = processor_alias_table[i].schedule;
6072 ix86_tune = processor_alias_table[i].processor;
6073 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6075 if (!(processor_alias_table[i].flags & PTA_64BIT))
6077 if (ix86_tune_defaulted)
6079 opts->x_ix86_tune_string = "x86-64";
6080 for (i = 0; i < pta_size; i++)
6081 if (! strcmp (opts->x_ix86_tune_string,
6082 processor_alias_table[i].name))
6083 break;
6084 ix86_schedule = processor_alias_table[i].schedule;
6085 ix86_tune = processor_alias_table[i].processor;
6087 else
6088 error ("CPU you selected does not support x86-64 "
6089 "instruction set");
6092 /* Intel CPUs have always interpreted SSE prefetch instructions as
6093 NOPs; so, we can enable SSE prefetch instructions even when
6094 -mtune (rather than -march) points us to a processor that has them.
6095 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6096 higher processors. */
6097 if (TARGET_CMOV
6098 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6099 x86_prefetch_sse = true;
6100 break;
6103 if (ix86_tune_specified && i == pta_size)
6105 error (main_args_p
6106 ? G_("bad value (%qs) for %<-mtune=%> switch")
6107 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6108 opts->x_ix86_tune_string);
6110 auto_vec <const char *> candidates;
6111 for (i = 0; i < pta_size; i++)
6112 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6113 || (processor_alias_table[i].flags & PTA_64BIT))
6114 candidates.safe_push (processor_alias_table[i].name);
6116 char *s;
6117 const char *hint
6118 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6119 if (hint)
6120 inform (input_location,
6121 main_args_p
6122 ? G_("valid arguments to %<-mtune=%> switch are: "
6123 "%s; did you mean %qs?")
6124 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6125 "%s; did you mean %qs?"), s, hint);
6126 else
6127 inform (input_location,
6128 main_args_p
6129 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6130 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6131 "are: %s"), s);
6132 XDELETEVEC (s);
6135 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6137 #ifndef USE_IX86_FRAME_POINTER
6138 #define USE_IX86_FRAME_POINTER 0
6139 #endif
6141 #ifndef USE_X86_64_FRAME_POINTER
6142 #define USE_X86_64_FRAME_POINTER 0
6143 #endif
6145 /* Set the default values for switches whose default depends on TARGET_64BIT
6146 in case they weren't overwritten by command line options. */
6147 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6149 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6150 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6151 if (opts->x_flag_asynchronous_unwind_tables
6152 && !opts_set->x_flag_unwind_tables
6153 && TARGET_64BIT_MS_ABI)
6154 opts->x_flag_unwind_tables = 1;
6155 if (opts->x_flag_asynchronous_unwind_tables == 2)
6156 opts->x_flag_unwind_tables
6157 = opts->x_flag_asynchronous_unwind_tables = 1;
6158 if (opts->x_flag_pcc_struct_return == 2)
6159 opts->x_flag_pcc_struct_return = 0;
6161 else
6163 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6164 opts->x_flag_omit_frame_pointer
6165 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6166 if (opts->x_flag_asynchronous_unwind_tables == 2)
6167 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6168 if (opts->x_flag_pcc_struct_return == 2)
6170 /* Intel MCU psABI specifies that -freg-struct-return should
6171 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6172 we check -miamcu so that -freg-struct-return is always
6173 turned on if -miamcu is used. */
6174 if (TARGET_IAMCU_P (opts->x_target_flags))
6175 opts->x_flag_pcc_struct_return = 0;
6176 else
6177 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6181 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6182 /* TODO: ix86_cost should be chosen at instruction or function granularity
6183 so for cold code we use size_cost even in !optimize_size compilation. */
6184 if (opts->x_optimize_size)
6185 ix86_cost = &ix86_size_cost;
6186 else
6187 ix86_cost = ix86_tune_cost;
6189 /* Arrange to set up i386_stack_locals for all functions. */
6190 init_machine_status = ix86_init_machine_status;
6192 /* Validate -mregparm= value. */
6193 if (opts_set->x_ix86_regparm)
6195 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6196 warning (0, "-mregparm is ignored in 64-bit mode");
6197 else if (TARGET_IAMCU_P (opts->x_target_flags))
6198 warning (0, "-mregparm is ignored for Intel MCU psABI");
6199 if (opts->x_ix86_regparm > REGPARM_MAX)
6201 error ("-mregparm=%d is not between 0 and %d",
6202 opts->x_ix86_regparm, REGPARM_MAX);
6203 opts->x_ix86_regparm = 0;
6206 if (TARGET_IAMCU_P (opts->x_target_flags)
6207 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6208 opts->x_ix86_regparm = REGPARM_MAX;
6210 /* Default align_* from the processor table. */
6211 ix86_default_align (opts);
6213 /* Provide default for -mbranch-cost= value. */
6214 if (!opts_set->x_ix86_branch_cost)
6215 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6217 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6219 opts->x_target_flags
6220 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6222 /* Enable by default the SSE and MMX builtins. Do allow the user to
6223 explicitly disable any of these. In particular, disabling SSE and
6224 MMX for kernel code is extremely useful. */
6225 if (!ix86_arch_specified)
6226 opts->x_ix86_isa_flags
6227 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6228 | TARGET_SUBTARGET64_ISA_DEFAULT)
6229 & ~opts->x_ix86_isa_flags_explicit);
6231 if (TARGET_RTD_P (opts->x_target_flags))
6232 warning (0,
6233 main_args_p
6234 ? G_("%<-mrtd%> is ignored in 64bit mode")
6235 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
6237 else
6239 opts->x_target_flags
6240 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6242 if (!ix86_arch_specified)
6243 opts->x_ix86_isa_flags
6244 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
6246 /* The i386 ABI does not specify a red zone. It still makes sense to use it
6247 when the programmer takes care to keep the stack from being destroyed. */
6248 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6249 opts->x_target_flags |= MASK_NO_RED_ZONE;
6252 /* Keep nonleaf frame pointers. */
6253 if (opts->x_flag_omit_frame_pointer)
6254 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6255 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6256 opts->x_flag_omit_frame_pointer = 1;
6258 /* If we're doing fast math, we don't care about comparison order
6259 wrt NaNs. This lets us use a shorter comparison sequence. */
6260 if (opts->x_flag_finite_math_only)
6261 opts->x_target_flags &= ~MASK_IEEE_FP;
6263 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6264 since the insns won't need emulation. */
6265 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6266 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6268 /* Likewise, if the target doesn't have a 387, or we've specified
6269 software floating point, don't use 387 inline intrinsics. */
6270 if (!TARGET_80387_P (opts->x_target_flags))
6271 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6273 /* Turn on MMX builtins for -msse. */
6274 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6275 opts->x_ix86_isa_flags
6276 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6278 /* Enable SSE prefetch. */
6279 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6280 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6281 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6282 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6283 x86_prefetch_sse = true;
6285 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6286 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6287 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6288 opts->x_ix86_isa_flags
6289 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6291 /* Enable lzcnt instruction for -mabm. */
6292 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
6293 opts->x_ix86_isa_flags
6294 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6296 /* Disable BMI, BMI2 and TBM instructions for -m16. */
6297 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
6298 opts->x_ix86_isa_flags
6299 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
6300 & ~opts->x_ix86_isa_flags_explicit);
6302 /* Validate -mpreferred-stack-boundary= value or default it to
6303 PREFERRED_STACK_BOUNDARY_DEFAULT. */
6304 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6305 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6307 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
6308 int max = TARGET_SEH ? 4 : 12;
6310 if (opts->x_ix86_preferred_stack_boundary_arg < min
6311 || opts->x_ix86_preferred_stack_boundary_arg > max)
6313 if (min == max)
6314 error ("-mpreferred-stack-boundary is not supported "
6315 "for this target");
6316 else
6317 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6318 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6320 else
6321 ix86_preferred_stack_boundary
6322 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
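/* Worked example of the conversion above: -mpreferred-stack-boundary=4
   gives (1 << 4) * BITS_PER_UNIT = 16 bytes, i.e. a 128-bit boundary,
   while the 64-bit minimum of 3 corresponds to 8-byte alignment.  */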
6325 /* Set the default value for -mstackrealign. */
6326 if (!opts_set->x_ix86_force_align_arg_pointer)
6327 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6329 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6331 /* Validate -mincoming-stack-boundary= value or default it to
6332 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6333 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6334 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6336 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6338 if (opts->x_ix86_incoming_stack_boundary_arg < min
6339 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6340 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6341 opts->x_ix86_incoming_stack_boundary_arg, min);
6342 else
6344 ix86_user_incoming_stack_boundary
6345 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6346 ix86_incoming_stack_boundary
6347 = ix86_user_incoming_stack_boundary;
6351 #ifndef NO_PROFILE_COUNTERS
6352 if (flag_nop_mcount)
6353 error ("-mnop-mcount is not compatible with this target");
6354 #endif
6355 if (flag_nop_mcount && flag_pic)
6356 error ("-mnop-mcount is not implemented for -fPIC");
6358 /* Accept -msseregparm only if at least SSE support is enabled. */
6359 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6360 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6361 error (main_args_p
6362 ? G_("%<-msseregparm%> used without SSE enabled")
6363 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6365 if (opts_set->x_ix86_fpmath)
6367 if (opts->x_ix86_fpmath & FPMATH_SSE)
6369 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6371 if (TARGET_80387_P (opts->x_target_flags))
6373 warning (0, "SSE instruction set disabled, using 387 arithmetics");
6374 opts->x_ix86_fpmath = FPMATH_387;
6377 else if ((opts->x_ix86_fpmath & FPMATH_387)
6378 && !TARGET_80387_P (opts->x_target_flags))
6380 warning (0, "387 instruction set disabled, using SSE arithmetics");
6381 opts->x_ix86_fpmath = FPMATH_SSE;
6385 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6386 fpmath=387. The second is however default at many targets since the
6387 extra 80bit precision of temporaries is considered to be part of ABI.
6388 Overwrite the default at least for -ffast-math.
6389 TODO: -mfpmath=both seems to produce similarly performing code with slightly
6390 smaller binaries. It is however not clear if register allocation is
6391 ready for this setting.
6392 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
6393 codegen. We may switch to 387 with -ffast-math for size optimized
6394 functions. */
6395 else if (fast_math_flags_set_p (&global_options)
6396 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6397 opts->x_ix86_fpmath = FPMATH_SSE;
6398 else
6399 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6401 /* Use external vectorized library in vectorizing intrinsics. */
6402 if (opts_set->x_ix86_veclibabi_type)
6403 switch (opts->x_ix86_veclibabi_type)
6405 case ix86_veclibabi_type_svml:
6406 ix86_veclib_handler = ix86_veclibabi_svml;
6407 break;
6409 case ix86_veclibabi_type_acml:
6410 ix86_veclib_handler = ix86_veclibabi_acml;
6411 break;
6413 default:
6414 gcc_unreachable ();
6417 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6418 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6419 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6421 /* If stack probes are required, the space used for large function
6422 arguments on the stack must also be probed, so enable
6423 -maccumulate-outgoing-args so this happens in the prologue. */
6424 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6425 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6427 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6428 warning (0,
6429 main_args_p
6430 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6431 "for correctness")
6432 : G_("stack probing requires "
6433 "%<target(\"accumulate-outgoing-args\")%> for "
6434 "correctness"));
6435 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6438 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6439 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6440 if (fixed_regs[BP_REG]
6441 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6443 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6444 warning (0,
6445 main_args_p
6446 ? G_("fixed ebp register requires "
6447 "%<-maccumulate-outgoing-args%>")
6448 : G_("fixed ebp register requires "
6449 "%<target(\"accumulate-outgoing-args\")%>"));
6450 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6453 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6455 char *p;
6456 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6457 p = strchr (internal_label_prefix, 'X');
6458 internal_label_prefix_len = p - internal_label_prefix;
6459 *p = '\0';
6462 /* When no scheduling description is available, disable the scheduler passes
6463 so they won't slow down the compilation and make x87 code slower. */
6464 if (!TARGET_SCHEDULE)
6465 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6467 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6468 ix86_tune_cost->simultaneous_prefetches,
6469 opts->x_param_values,
6470 opts_set->x_param_values);
6471 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6472 ix86_tune_cost->prefetch_block,
6473 opts->x_param_values,
6474 opts_set->x_param_values);
6475 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6476 ix86_tune_cost->l1_cache_size,
6477 opts->x_param_values,
6478 opts_set->x_param_values);
6479 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6480 ix86_tune_cost->l2_cache_size,
6481 opts->x_param_values,
6482 opts_set->x_param_values);
6484 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6485 if (opts->x_flag_prefetch_loop_arrays < 0
6486 && HAVE_prefetch
6487 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6488 && !opts->x_optimize_size
6489 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6490 opts->x_flag_prefetch_loop_arrays = 1;
6492 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6493 can be optimized to ap = __builtin_next_arg (0). */
6494 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6495 targetm.expand_builtin_va_start = NULL;
6497 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6499 ix86_gen_leave = gen_leave_rex64;
6500 if (Pmode == DImode)
6502 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6503 ix86_gen_tls_local_dynamic_base_64
6504 = gen_tls_local_dynamic_base_64_di;
6506 else
6508 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6509 ix86_gen_tls_local_dynamic_base_64
6510 = gen_tls_local_dynamic_base_64_si;
6513 else
6514 ix86_gen_leave = gen_leave;
6516 if (Pmode == DImode)
6518 ix86_gen_add3 = gen_adddi3;
6519 ix86_gen_sub3 = gen_subdi3;
6520 ix86_gen_sub3_carry = gen_subdi3_carry;
6521 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6522 ix86_gen_andsp = gen_anddi3;
6523 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6524 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6525 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6526 ix86_gen_monitor = gen_sse3_monitor_di;
6527 ix86_gen_monitorx = gen_monitorx_di;
6528 ix86_gen_clzero = gen_clzero_di;
6530 else
6532 ix86_gen_add3 = gen_addsi3;
6533 ix86_gen_sub3 = gen_subsi3;
6534 ix86_gen_sub3_carry = gen_subsi3_carry;
6535 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6536 ix86_gen_andsp = gen_andsi3;
6537 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6538 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6539 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6540 ix86_gen_monitor = gen_sse3_monitor_si;
6541 ix86_gen_monitorx = gen_monitorx_si;
6542 ix86_gen_clzero = gen_clzero_si;
6545 #ifdef USE_IX86_CLD
6546 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6547 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6548 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
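/* The "& ~opts_set->x_target_flags" above keeps this default from
   overriding an explicit -mcld or -mno-cld on the command line: the bit
   is only turned on when the user did not set it either way.  */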
6549 #endif
6551 /* Set the default value for -mfentry. */
6552 if (!opts_set->x_flag_fentry)
6553 opts->x_flag_fentry = TARGET_SEH;
6554 else
6556 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
6557 && opts->x_flag_fentry)
6558 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6559 "with -fpic");
6560 else if (TARGET_SEH && !opts->x_flag_fentry)
6561 sorry ("-mno-fentry isn%'t compatible with SEH");
6564 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6565 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6567 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6568 opts->x_target_flags |= MASK_VZEROUPPER;
6569 if (!(opts_set->x_target_flags & MASK_STV))
6570 opts->x_target_flags |= MASK_STV;
6571 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6572 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
6573 stack realignment would be an extra cost the pass doesn't take into
6574 account, and the pass can't realign the stack. */
6575 if (ix86_preferred_stack_boundary < 128
6576 || ix86_incoming_stack_boundary < 128
6577 || opts->x_ix86_force_align_arg_pointer)
6578 opts->x_target_flags &= ~MASK_STV;
6579 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6580 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6581 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6582 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6583 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6584 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6585 /* Enable 128-bit AVX instruction generation
6586 for the auto-vectorizer. */
6587 if (TARGET_AVX128_OPTIMAL
6588 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6589 opts->x_target_flags |= MASK_PREFER_AVX128;
6591 if (opts->x_ix86_recip_name)
6593 char *p = ASTRDUP (opts->x_ix86_recip_name);
6594 char *q;
6595 unsigned int mask, i;
6596 bool invert;
6598 while ((q = strtok (p, ",")) != NULL)
6600 p = NULL;
6601 if (*q == '!')
6603 invert = true;
6604 q++;
6606 else
6607 invert = false;
6609 if (!strcmp (q, "default"))
6610 mask = RECIP_MASK_ALL;
6611 else
6613 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6614 if (!strcmp (q, recip_options[i].string))
6616 mask = recip_options[i].mask;
6617 break;
6620 if (i == ARRAY_SIZE (recip_options))
6622 error ("unknown option for -mrecip=%s", q);
6623 invert = false;
6624 mask = RECIP_MASK_NONE;
6628 opts->x_recip_mask_explicit |= mask;
6629 if (invert)
6630 opts->x_recip_mask &= ~mask;
6631 else
6632 opts->x_recip_mask |= mask;
6636 if (TARGET_RECIP_P (opts->x_target_flags))
6637 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6638 else if (opts_set->x_target_flags & MASK_RECIP)
6639 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
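/* To illustrate the -mrecip syntax handled above: the argument is a
   comma-separated list of names, each optionally prefixed with '!' to turn
   that approximation off, and "default" stands for RECIP_MASK_ALL.  So
   something like -mrecip=default,!div would enable everything except the
   scalar division approximation (assuming "div" is one of the entries in
   recip_options; see that table for the exact names).  */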
6641 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6642 for 64-bit Bionic. Also default long double to 64-bit for Intel
6643 MCU psABI. */
6644 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6645 && !(opts_set->x_target_flags
6646 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6647 opts->x_target_flags |= (TARGET_64BIT
6648 ? MASK_LONG_DOUBLE_128
6649 : MASK_LONG_DOUBLE_64);
6651 /* Only one of them can be active. */
6652 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6653 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6655 /* Handle stack protector */
6656 if (!opts_set->x_ix86_stack_protector_guard)
6657 opts->x_ix86_stack_protector_guard
6658 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6660 #ifdef TARGET_THREAD_SSP_OFFSET
6661 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
6662 #endif
6664 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
6666 char *endp;
6667 const char *str = ix86_stack_protector_guard_offset_str;
6669 errno = 0;
6670 int64_t offset;
6672 #if defined(INT64_T_IS_LONG)
6673 offset = strtol (str, &endp, 0);
6674 #else
6675 offset = strtoll (str, &endp, 0);
6676 #endif
6678 if (!*str || *endp || errno)
6679 error ("%qs is not a valid number "
6680 "in -mstack-protector-guard-offset=", str);
6682 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
6683 HOST_WIDE_INT_C (0x7fffffff)))
6684 error ("%qs is not a valid offset "
6685 "in -mstack-protector-guard-offset=", str);
6687 ix86_stack_protector_guard_offset = offset;
6690 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
6692 /* The kernel uses a different segment register for performance
6693 reasons; this way a system call does not have to trash the userspace
6694 segment register, which would be expensive. */
6695 if (ix86_cmodel == CM_KERNEL)
6696 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
6698 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
6700 const char *str = ix86_stack_protector_guard_reg_str;
6701 addr_space_t seg = ADDR_SPACE_GENERIC;
6703 /* Discard optional register prefix. */
6704 if (str[0] == '%')
6705 str++;
6707 if (strlen (str) == 2 && str[1] == 's')
6709 if (str[0] == 'f')
6710 seg = ADDR_SPACE_SEG_FS;
6711 else if (str[0] == 'g')
6712 seg = ADDR_SPACE_SEG_GS;
6715 if (seg == ADDR_SPACE_GENERIC)
6716 error ("%qs is not a valid base register "
6717 "in -mstack-protector-guard-reg=",
6718 ix86_stack_protector_guard_reg_str);
6720 ix86_stack_protector_guard_reg = seg;
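/* Putting the two options above together, usage looks like

     -mstack-protector-guard-reg=gs -mstack-protector-guard-offset=40

   (the particular offset is only illustrative), which makes the canary
   access use %gs with that offset instead of DEFAULT_TLS_SEG_REG and
   TARGET_THREAD_SSP_OFFSET.  */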
6723 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6724 if (opts->x_ix86_tune_memcpy_strategy)
6726 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6727 ix86_parse_stringop_strategy_string (str, false);
6728 free (str);
6731 if (opts->x_ix86_tune_memset_strategy)
6733 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6734 ix86_parse_stringop_strategy_string (str, true);
6735 free (str);
6738 /* Save the initial options in case the user uses function-specific
6739 options later. */
6740 if (main_args_p)
6741 target_option_default_node = target_option_current_node
6742 = build_target_option_node (opts);
6744 return true;
6747 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6749 static void
6750 ix86_option_override (void)
6752 ix86_option_override_internal (true, &global_options, &global_options_set);
6755 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6756 static char *
6757 ix86_offload_options (void)
6759 if (TARGET_LP64)
6760 return xstrdup ("-foffload-abi=lp64");
6761 return xstrdup ("-foffload-abi=ilp32");
6764 /* Update register usage after having seen the compiler flags. */
6766 static void
6767 ix86_conditional_register_usage (void)
6769 int i, c_mask;
6771 /* If there are no caller-saved registers, preserve all registers
6772 except fixed_regs and registers used for the function return value,
6773 since aggregate_value_p checks call_used_regs[regno] on the return
6774 value. */
6775 if (cfun && cfun->machine->no_caller_saved_registers)
6776 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6777 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6778 call_used_regs[i] = 0;
6780 /* For 32-bit targets, squash the REX registers. */
6781 if (! TARGET_64BIT)
6783 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6784 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6785 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6786 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6787 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6788 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6791 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6792 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6794 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6796 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6798 /* Set/reset conditionally defined registers from
6799 CALL_USED_REGISTERS initializer. */
6800 if (call_used_regs[i] > 1)
6801 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6803 /* Calculate registers of CLOBBERED_REGS register set
6804 as call used registers from GENERAL_REGS register set. */
6805 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6806 && call_used_regs[i])
6807 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6810 /* If MMX is disabled, squash the registers. */
6811 if (! TARGET_MMX)
6812 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6813 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6814 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6816 /* If SSE is disabled, squash the registers. */
6817 if (! TARGET_SSE)
6818 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6819 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6820 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6822 /* If the FPU is disabled, squash the registers. */
6823 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6824 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6825 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6826 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6828 /* If AVX512F is disabled, squash the registers. */
6829 if (! TARGET_AVX512F)
6831 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6832 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6834 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6835 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6838 /* If MPX is disabled, squash the registers. */
6839 if (! TARGET_MPX)
6840 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6841 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6845 /* Save the current options */
6847 static void
6848 ix86_function_specific_save (struct cl_target_option *ptr,
6849 struct gcc_options *opts)
6851 ptr->arch = ix86_arch;
6852 ptr->schedule = ix86_schedule;
6853 ptr->prefetch_sse = x86_prefetch_sse;
6854 ptr->tune = ix86_tune;
6855 ptr->branch_cost = ix86_branch_cost;
6856 ptr->tune_defaulted = ix86_tune_defaulted;
6857 ptr->arch_specified = ix86_arch_specified;
6858 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6859 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6860 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6861 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6862 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6863 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6864 ptr->x_ix86_abi = opts->x_ix86_abi;
6865 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6866 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6867 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6868 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6869 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6870 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6871 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6872 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6873 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6874 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6875 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6876 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6877 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6878 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6879 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6880 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6881 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6882 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6883 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6884 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6886 /* The fields are char but the variables are not; make sure the
6887 values fit in the fields. */
6888 gcc_assert (ptr->arch == ix86_arch);
6889 gcc_assert (ptr->schedule == ix86_schedule);
6890 gcc_assert (ptr->tune == ix86_tune);
6891 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6894 /* Restore the current options */
6896 static void
6897 ix86_function_specific_restore (struct gcc_options *opts,
6898 struct cl_target_option *ptr)
6900 enum processor_type old_tune = ix86_tune;
6901 enum processor_type old_arch = ix86_arch;
6902 unsigned int ix86_arch_mask;
6903 int i;
6905 /* We don't change -fPIC. */
6906 opts->x_flag_pic = flag_pic;
6908 ix86_arch = (enum processor_type) ptr->arch;
6909 ix86_schedule = (enum attr_cpu) ptr->schedule;
6910 ix86_tune = (enum processor_type) ptr->tune;
6911 x86_prefetch_sse = ptr->prefetch_sse;
6912 opts->x_ix86_branch_cost = ptr->branch_cost;
6913 ix86_tune_defaulted = ptr->tune_defaulted;
6914 ix86_arch_specified = ptr->arch_specified;
6915 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6916 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6917 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6918 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6919 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6920 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6921 opts->x_ix86_abi = ptr->x_ix86_abi;
6922 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6923 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6924 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6925 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6926 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6927 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6928 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6929 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6930 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6931 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6932 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6933 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6934 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6935 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6936 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6937 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6938 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6939 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6940 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6941 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6942 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6943 /* TODO: ix86_cost should be chosen at instruction or function granularity
6944 so that for cold code we use size_cost even in !optimize_size compilation. */
6945 if (opts->x_optimize_size)
6946 ix86_cost = &ix86_size_cost;
6947 else
6948 ix86_cost = ix86_tune_cost;
6950 /* Recreate the arch feature tests if the arch changed */
6951 if (old_arch != ix86_arch)
6953 ix86_arch_mask = 1u << ix86_arch;
6954 for (i = 0; i < X86_ARCH_LAST; ++i)
6955 ix86_arch_features[i]
6956 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6959 /* Recreate the tune optimization tests */
6960 if (old_tune != ix86_tune)
6961 set_ix86_tune_features (ix86_tune, false);
6964 /* Adjust target options after streaming them in. This is mainly about
6965 reconciling them with global options. */
6967 static void
6968 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6970 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6971 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6972 for PIC, or error out. */
6973 if (flag_pic)
6974 switch (ptr->x_ix86_cmodel)
6976 case CM_SMALL:
6977 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6978 break;
6980 case CM_MEDIUM:
6981 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6982 break;
6984 case CM_LARGE:
6985 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6986 break;
6988 case CM_KERNEL:
6989 error ("code model %s does not support PIC mode", "kernel");
6990 break;
6992 default:
6993 break;
6995 else
6996 switch (ptr->x_ix86_cmodel)
6998 case CM_SMALL_PIC:
6999 ptr->x_ix86_cmodel = CM_SMALL;
7000 break;
7002 case CM_MEDIUM_PIC:
7003 ptr->x_ix86_cmodel = CM_MEDIUM;
7004 break;
7006 case CM_LARGE_PIC:
7007 ptr->x_ix86_cmodel = CM_LARGE;
7008 break;
7010 default:
7011 break;
7015 /* Print the current options */
7017 static void
7018 ix86_function_specific_print (FILE *file, int indent,
7019 struct cl_target_option *ptr)
7021 char *target_string
7022 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
7023 ptr->x_target_flags, ptr->x_ix86_target_flags,
7024 NULL, NULL, ptr->x_ix86_fpmath, false);
7026 gcc_assert (ptr->arch < PROCESSOR_max);
7027 fprintf (file, "%*sarch = %d (%s)\n",
7028 indent, "",
7029 ptr->arch, processor_target_table[ptr->arch].name);
7031 gcc_assert (ptr->tune < PROCESSOR_max);
7032 fprintf (file, "%*stune = %d (%s)\n",
7033 indent, "",
7034 ptr->tune, processor_target_table[ptr->tune].name);
7036 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
7038 if (target_string)
7040 fprintf (file, "%*s%s\n", indent, "", target_string);
7041 free (target_string);
7046 /* Inner function to process the attribute((target(...))), take an argument and
7047 set the current options from the argument. If we have a list, recursively go
7048 over the list. */
7050 static bool
7051 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
7052 struct gcc_options *opts,
7053 struct gcc_options *opts_set,
7054 struct gcc_options *enum_opts_set)
7056 char *next_optstr;
7057 bool ret = true;
7059 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
7060 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
7061 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
7062 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
7063 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
7065 enum ix86_opt_type
7067 ix86_opt_unknown,
7068 ix86_opt_yes,
7069 ix86_opt_no,
7070 ix86_opt_str,
7071 ix86_opt_enum,
7072 ix86_opt_isa
7075 static const struct
7077 const char *string;
7078 size_t len;
7079 enum ix86_opt_type type;
7080 int opt;
7081 int mask;
7082 } attrs[] = {
7083 /* isa options */
7084 IX86_ATTR_ISA ("sgx", OPT_msgx),
7085 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7086 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7087 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7089 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7090 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7091 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7092 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7093 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7094 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7095 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7096 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7097 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7098 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7099 IX86_ATTR_ISA ("fma", OPT_mfma),
7100 IX86_ATTR_ISA ("xop", OPT_mxop),
7101 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7102 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7103 IX86_ATTR_ISA ("avx", OPT_mavx),
7104 IX86_ATTR_ISA ("sse4", OPT_msse4),
7105 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7106 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7107 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7108 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7109 IX86_ATTR_ISA ("sse3", OPT_msse3),
7110 IX86_ATTR_ISA ("aes", OPT_maes),
7111 IX86_ATTR_ISA ("sha", OPT_msha),
7112 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7113 IX86_ATTR_ISA ("sse2", OPT_msse2),
7114 IX86_ATTR_ISA ("sse", OPT_msse),
7115 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7116 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7117 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7118 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7119 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7120 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7121 IX86_ATTR_ISA ("adx", OPT_madx),
7122 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7123 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7124 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7125 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7126 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7127 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7128 IX86_ATTR_ISA ("abm", OPT_mabm),
7129 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7130 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7131 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7132 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7133 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7134 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7135 IX86_ATTR_ISA ("sahf", OPT_msahf),
7136 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7137 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7138 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7139 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7140 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7141 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7142 IX86_ATTR_ISA ("pku", OPT_mpku),
7143 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7144 IX86_ATTR_ISA ("hle", OPT_mhle),
7145 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7146 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7147 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7148 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7150 /* enum options */
7151 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7153 /* string options */
7154 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7155 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7157 /* flag options */
7158 IX86_ATTR_YES ("cld",
7159 OPT_mcld,
7160 MASK_CLD),
7162 IX86_ATTR_NO ("fancy-math-387",
7163 OPT_mfancy_math_387,
7164 MASK_NO_FANCY_MATH_387),
7166 IX86_ATTR_YES ("ieee-fp",
7167 OPT_mieee_fp,
7168 MASK_IEEE_FP),
7170 IX86_ATTR_YES ("inline-all-stringops",
7171 OPT_minline_all_stringops,
7172 MASK_INLINE_ALL_STRINGOPS),
7174 IX86_ATTR_YES ("inline-stringops-dynamically",
7175 OPT_minline_stringops_dynamically,
7176 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7178 IX86_ATTR_NO ("align-stringops",
7179 OPT_mno_align_stringops,
7180 MASK_NO_ALIGN_STRINGOPS),
7182 IX86_ATTR_YES ("recip",
7183 OPT_mrecip,
7184 MASK_RECIP),
7188 /* If this is a list, recurse to get the options. */
7189 if (TREE_CODE (args) == TREE_LIST)
7191 bool ret = true;
7193 for (; args; args = TREE_CHAIN (args))
7194 if (TREE_VALUE (args)
7195 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7196 p_strings, opts, opts_set,
7197 enum_opts_set))
7198 ret = false;
7200 return ret;
7203 else if (TREE_CODE (args) != STRING_CST)
7205 error ("attribute %<target%> argument not a string");
7206 return false;
7209 /* Handle multiple arguments separated by commas. */
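/* For instance, an attribute such as
     __attribute__ ((target ("arch=haswell,no-avx512f")))
   arrives here as the single string "arch=haswell,no-avx512f"; the loop
   below splits it at the commas, treats "arch=" as a string option from
   the table above, and strips the "no-" prefix from ISA options.  (The
   particular names are only an example.)  */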
7210 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7212 while (next_optstr && *next_optstr != '\0')
7214 char *p = next_optstr;
7215 char *orig_p = p;
7216 char *comma = strchr (next_optstr, ',');
7217 const char *opt_string;
7218 size_t len, opt_len;
7219 int opt;
7220 bool opt_set_p;
7221 char ch;
7222 unsigned i;
7223 enum ix86_opt_type type = ix86_opt_unknown;
7224 int mask = 0;
7226 if (comma)
7228 *comma = '\0';
7229 len = comma - next_optstr;
7230 next_optstr = comma + 1;
7232 else
7234 len = strlen (p);
7235 next_optstr = NULL;
7238 /* Recognize no-xxx. */
7239 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7241 opt_set_p = false;
7242 p += 3;
7243 len -= 3;
7245 else
7246 opt_set_p = true;
7248 /* Find the option. */
7249 ch = *p;
7250 opt = N_OPTS;
7251 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7253 type = attrs[i].type;
7254 opt_len = attrs[i].len;
7255 if (ch == attrs[i].string[0]
7256 && ((type != ix86_opt_str && type != ix86_opt_enum)
7257 ? len == opt_len
7258 : len > opt_len)
7259 && memcmp (p, attrs[i].string, opt_len) == 0)
7261 opt = attrs[i].opt;
7262 mask = attrs[i].mask;
7263 opt_string = attrs[i].string;
7264 break;
7268 /* Process the option. */
7269 if (opt == N_OPTS)
7271 error ("attribute(target(\"%s\")) is unknown", orig_p);
7272 ret = false;
7275 else if (type == ix86_opt_isa)
7277 struct cl_decoded_option decoded;
7279 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7280 ix86_handle_option (opts, opts_set,
7281 &decoded, input_location);
7284 else if (type == ix86_opt_yes || type == ix86_opt_no)
7286 if (type == ix86_opt_no)
7287 opt_set_p = !opt_set_p;
7289 if (opt_set_p)
7290 opts->x_target_flags |= mask;
7291 else
7292 opts->x_target_flags &= ~mask;
7295 else if (type == ix86_opt_str)
7297 if (p_strings[opt])
7299 error ("option(\"%s\") was already specified", opt_string);
7300 ret = false;
7302 else
7303 p_strings[opt] = xstrdup (p + opt_len);
7306 else if (type == ix86_opt_enum)
7308 bool arg_ok;
7309 int value;
7311 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7312 if (arg_ok)
7313 set_option (opts, enum_opts_set, opt, value,
7314 p + opt_len, DK_UNSPECIFIED, input_location,
7315 global_dc);
7316 else
7318 error ("attribute(target(\"%s\")) is unknown", orig_p);
7319 ret = false;
7323 else
7324 gcc_unreachable ();
7327 return ret;
7330 /* Release allocated strings. */
7331 static void
7332 release_options_strings (char **option_strings)
7334 /* Free up memory allocated to hold the strings */
7335 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7336 free (option_strings[i]);
7339 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
7341 tree
7342 ix86_valid_target_attribute_tree (tree args,
7343 struct gcc_options *opts,
7344 struct gcc_options *opts_set)
7346 const char *orig_arch_string = opts->x_ix86_arch_string;
7347 const char *orig_tune_string = opts->x_ix86_tune_string;
7348 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7349 int orig_tune_defaulted = ix86_tune_defaulted;
7350 int orig_arch_specified = ix86_arch_specified;
7351 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7352 tree t = NULL_TREE;
7353 struct cl_target_option *def
7354 = TREE_TARGET_OPTION (target_option_default_node);
7355 struct gcc_options enum_opts_set;
7357 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7359 /* Process each of the options on the chain. */
7360 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7361 opts_set, &enum_opts_set))
7362 return error_mark_node;
7364 /* If the changed options are different from the default, rerun
7365 ix86_option_override_internal, and then save the options away.
7366 The string options are attribute options, and will be undone
7367 when we copy the save structure. */
7368 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7369 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7370 || opts->x_target_flags != def->x_target_flags
7371 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7372 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7373 || enum_opts_set.x_ix86_fpmath)
7375 /* If we are using the default tune= or arch=, undo the string assigned,
7376 and use the default. */
7377 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7379 opts->x_ix86_arch_string
7380 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7382 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7383 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7384 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7385 | OPTION_MASK_ABI_64
7386 | OPTION_MASK_ABI_X32
7387 | OPTION_MASK_CODE16);
7388 opts->x_ix86_isa_flags2 = 0;
7390 else if (!orig_arch_specified)
7391 opts->x_ix86_arch_string = NULL;
7393 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7394 opts->x_ix86_tune_string
7395 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7396 else if (orig_tune_defaulted)
7397 opts->x_ix86_tune_string = NULL;
7399 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
7400 if (enum_opts_set.x_ix86_fpmath)
7401 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7403 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7404 bool r = ix86_option_override_internal (false, opts, opts_set);
7405 if (!r)
7407 release_options_strings (option_strings);
7408 return error_mark_node;
7411 /* Add any builtin functions with the new isa if any. */
7412 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7414 /* Save the current options unless we are validating options for
7415 #pragma. */
7416 t = build_target_option_node (opts);
7418 opts->x_ix86_arch_string = orig_arch_string;
7419 opts->x_ix86_tune_string = orig_tune_string;
7420 opts_set->x_ix86_fpmath = orig_fpmath_set;
7422 release_options_strings (option_strings);
7425 return t;
7428 /* Hook to validate attribute((target("string"))). */
7430 static bool
7431 ix86_valid_target_attribute_p (tree fndecl,
7432 tree ARG_UNUSED (name),
7433 tree args,
7434 int ARG_UNUSED (flags))
7436 struct gcc_options func_options;
7437 tree new_target, new_optimize;
7438 bool ret = true;
7440 /* attribute((target("default"))) does nothing, beyond
7441 affecting multi-versioning. */
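/* The "default" string typically comes from the function multi-versioning
   idiom, e.g.

     __attribute__ ((target ("default"))) int foo (void) { return 1; }
     __attribute__ ((target ("avx2"))) int foo (void) { return 2; }

   where the "default" version carries no target-specific options of its
   own (illustrative example only).  */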
7442 if (TREE_VALUE (args)
7443 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7444 && TREE_CHAIN (args) == NULL_TREE
7445 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7446 return true;
7448 tree old_optimize = build_optimization_node (&global_options);
7450 /* Get the optimization options of the current function. */
7451 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7453 if (!func_optimize)
7454 func_optimize = old_optimize;
7456 /* Init func_options. */
7457 memset (&func_options, 0, sizeof (func_options));
7458 init_options_struct (&func_options, NULL);
7459 lang_hooks.init_options_struct (&func_options);
7461 cl_optimization_restore (&func_options,
7462 TREE_OPTIMIZATION (func_optimize));
7464 /* Initialize func_options to the default before its target options can
7465 be set. */
7466 cl_target_option_restore (&func_options,
7467 TREE_TARGET_OPTION (target_option_default_node));
7469 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7470 &global_options_set);
7472 new_optimize = build_optimization_node (&func_options);
7474 if (new_target == error_mark_node)
7475 ret = false;
7477 else if (fndecl && new_target)
7479 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7481 if (old_optimize != new_optimize)
7482 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7485 finalize_options_struct (&func_options);
7487 return ret;
7491 /* Hook to determine if one function can safely inline another. */
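/* As a rough illustration (assuming all of the other checked options
   match): a caller built with -mavx2 may inline a callee declared with
   __attribute__ ((target ("sse4.2"))), since the callee's ISA flags are a
   subset of the caller's, while the reverse combination is rejected.  */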
7493 static bool
7494 ix86_can_inline_p (tree caller, tree callee)
7496 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7497 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7498 if (!callee_tree)
7499 callee_tree = target_option_default_node;
7500 if (!caller_tree)
7501 caller_tree = target_option_default_node;
7502 if (callee_tree == caller_tree)
7503 return true;
7505 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7506 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7507 bool ret = false;
7509 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
7510 function can inline an SSE2 function, but an SSE2 function can't inline
7511 an SSE4 function. */
7512 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7513 != callee_opts->x_ix86_isa_flags)
7514 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7515 != callee_opts->x_ix86_isa_flags2))
7516 ret = false;
7518 /* See if we have the same non-isa options. */
7519 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7520 ret = false;
7522 /* See if arch, tune, etc. are the same. */
7523 else if (caller_opts->arch != callee_opts->arch)
7524 ret = false;
7526 else if (caller_opts->tune != callee_opts->tune)
7527 ret = false;
7529 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
7530 /* If the callee doesn't use FP expressions, differences in
7531 ix86_fpmath can be ignored. We are called from FEs
7532 for multi-versioning call optimization, so beware of
7533 ipa_fn_summaries not being available. */
7534 && (! ipa_fn_summaries
7535 || ipa_fn_summaries->get
7536 (cgraph_node::get (callee))->fp_expressions))
7537 ret = false;
7539 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7540 ret = false;
7542 else
7543 ret = true;
7545 return ret;
7549 /* Remember the last target of ix86_set_current_function. */
7550 static GTY(()) tree ix86_previous_fndecl;
7552 /* Set targets globals to the default (or current #pragma GCC target
7553 if active). Invalidate ix86_previous_fndecl cache. */
7555 void
7556 ix86_reset_previous_fndecl (void)
7558 tree new_tree = target_option_current_node;
7559 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7560 if (TREE_TARGET_GLOBALS (new_tree))
7561 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7562 else if (new_tree == target_option_default_node)
7563 restore_target_globals (&default_target_globals);
7564 else
7565 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7566 ix86_previous_fndecl = NULL_TREE;
7569 /* Set the func_type field from the function FNDECL. */
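/* The distinction drawn below follows the documented x86 interrupt
   attribute convention: a handler taking one argument,

     void handler (struct interrupt_frame *frame);

   is treated as TYPE_INTERRUPT, while a handler taking two,

     void handler (struct interrupt_frame *frame, uword_t error_code);

   is treated as TYPE_EXCEPTION.  (Signatures shown for illustration.)  */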
7571 static void
7572 ix86_set_func_type (tree fndecl)
7574 if (cfun->machine->func_type == TYPE_UNKNOWN)
7576 if (lookup_attribute ("interrupt",
7577 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7579 if (ix86_function_naked (fndecl))
7580 error_at (DECL_SOURCE_LOCATION (fndecl),
7581 "interrupt and naked attributes are not compatible");
7583 int nargs = 0;
7584 for (tree arg = DECL_ARGUMENTS (fndecl);
7585 arg;
7586 arg = TREE_CHAIN (arg))
7587 nargs++;
7588 cfun->machine->no_caller_saved_registers = true;
7589 cfun->machine->func_type
7590 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7592 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7594 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7595 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7596 sorry ("Only DWARF debug format is supported for interrupt "
7597 "service routine.");
7599 else
7601 cfun->machine->func_type = TYPE_NORMAL;
7602 if (lookup_attribute ("no_caller_saved_registers",
7603 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7604 cfun->machine->no_caller_saved_registers = true;
7609 /* Establish appropriate back-end context for processing the function
7610 FNDECL. The argument might be NULL to indicate processing at top
7611 level, outside of any function scope. */
7612 static void
7613 ix86_set_current_function (tree fndecl)
7615 /* Only change the context if the function changes. This hook is called
7616 several times in the course of compiling a function, and we don't want to
7617 slow things down too much or call target_reinit when it isn't safe. */
7618 if (fndecl == ix86_previous_fndecl)
7620 /* There may be 2 function bodies for the same function FNDECL,
7621 one is extern inline and one isn't. Call ix86_set_func_type
7622 to set the func_type field. */
7623 if (fndecl != NULL_TREE)
7624 ix86_set_func_type (fndecl);
7625 return;
7628 tree old_tree;
7629 if (ix86_previous_fndecl == NULL_TREE)
7630 old_tree = target_option_current_node;
7631 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7632 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7633 else
7634 old_tree = target_option_default_node;
7636 if (fndecl == NULL_TREE)
7638 if (old_tree != target_option_current_node)
7639 ix86_reset_previous_fndecl ();
7640 return;
7643 ix86_set_func_type (fndecl);
7645 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7646 if (new_tree == NULL_TREE)
7647 new_tree = target_option_default_node;
7649 if (old_tree != new_tree)
7651 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7652 if (TREE_TARGET_GLOBALS (new_tree))
7653 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7654 else if (new_tree == target_option_default_node)
7655 restore_target_globals (&default_target_globals);
7656 else
7657 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7659 ix86_previous_fndecl = fndecl;
7661 static bool prev_no_caller_saved_registers;
7663 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7664 Avoid expensive re-initialization of init_regs each time we switch
7665 function context. */
7666 if (TARGET_64BIT
7667 && (call_used_regs[SI_REG]
7668 == (cfun->machine->call_abi == MS_ABI)))
7669 reinit_regs ();
7670 /* Need to re-initialize init_regs if caller-saved registers are
7671 changed. */
7672 else if (prev_no_caller_saved_registers
7673 != cfun->machine->no_caller_saved_registers)
7674 reinit_regs ();
7676 if (cfun->machine->func_type != TYPE_NORMAL
7677 || cfun->machine->no_caller_saved_registers)
7679 /* Don't allow MPX, SSE, MMX, or x87 instructions since they
7680 may change the processor state. */
7681 const char *isa;
7682 if (TARGET_MPX)
7683 isa = "MPX";
7684 else if (TARGET_SSE)
7685 isa = "SSE";
7686 else if (TARGET_MMX)
7687 isa = "MMX/3Dnow";
7688 else if (TARGET_80387)
7689 isa = "80387";
7690 else
7691 isa = NULL;
7692 if (isa != NULL)
7694 if (cfun->machine->func_type != TYPE_NORMAL)
7695 sorry ("%s instructions aren't allowed in %s service routine",
7696 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7697 ? "exception" : "interrupt"));
7698 else
7699 sorry ("%s instructions aren't allowed in function with "
7700 "no_caller_saved_registers attribute", isa);
7701 /* Don't issue the same error twice. */
7702 cfun->machine->func_type = TYPE_NORMAL;
7703 cfun->machine->no_caller_saved_registers = false;
7707 prev_no_caller_saved_registers
7708 = cfun->machine->no_caller_saved_registers;
7712 /* Return true if this goes in large data/bss. */
7714 static bool
7715 ix86_in_large_data_p (tree exp)
7717 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7718 return false;
7720 if (exp == NULL_TREE)
7721 return false;
7723 /* Functions are never large data. */
7724 if (TREE_CODE (exp) == FUNCTION_DECL)
7725 return false;
7727 /* Automatic variables are never large data. */
7728 if (VAR_P (exp) && !is_global_var (exp))
7729 return false;
7731 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7733 const char *section = DECL_SECTION_NAME (exp);
7734 if (strcmp (section, ".ldata") == 0
7735 || strcmp (section, ".lbss") == 0)
7736 return true;
7737 return false;
7739 else
7741 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7743 /* If this is an incomplete type with size 0, then we can't put it
7744 in data because it might be too big when completed. Also,
7745 int_size_in_bytes returns -1 if the size can vary or is larger than
7746 an integer, in which case it is also safer to assume that it goes in
7747 large data. */
7748 if (size <= 0 || size > ix86_section_threshold)
7749 return true;
7752 return false;
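/* ix86_section_threshold used above corresponds to the
   -mlarge-data-threshold option, so only medium-model objects larger than
   that threshold (or of unknown size) are sent to the .ldata/.lbss
   sections.  */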
7755 /* i386-specific section flag to mark large sections. */
7756 #define SECTION_LARGE SECTION_MACH_DEP
7758 /* Switch to the appropriate section for output of DECL.
7759 DECL is either a `VAR_DECL' node or a constant of some sort.
7760 RELOC indicates whether forming the initial value of DECL requires
7761 link-time relocations. */
7763 ATTRIBUTE_UNUSED static section *
7764 x86_64_elf_select_section (tree decl, int reloc,
7765 unsigned HOST_WIDE_INT align)
7767 if (ix86_in_large_data_p (decl))
7769 const char *sname = NULL;
7770 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7771 switch (categorize_decl_for_section (decl, reloc))
7773 case SECCAT_DATA:
7774 sname = ".ldata";
7775 break;
7776 case SECCAT_DATA_REL:
7777 sname = ".ldata.rel";
7778 break;
7779 case SECCAT_DATA_REL_LOCAL:
7780 sname = ".ldata.rel.local";
7781 break;
7782 case SECCAT_DATA_REL_RO:
7783 sname = ".ldata.rel.ro";
7784 break;
7785 case SECCAT_DATA_REL_RO_LOCAL:
7786 sname = ".ldata.rel.ro.local";
7787 break;
7788 case SECCAT_BSS:
7789 sname = ".lbss";
7790 flags |= SECTION_BSS;
7791 break;
7792 case SECCAT_RODATA:
7793 case SECCAT_RODATA_MERGE_STR:
7794 case SECCAT_RODATA_MERGE_STR_INIT:
7795 case SECCAT_RODATA_MERGE_CONST:
7796 sname = ".lrodata";
7797 flags &= ~SECTION_WRITE;
7798 break;
7799 case SECCAT_SRODATA:
7800 case SECCAT_SDATA:
7801 case SECCAT_SBSS:
7802 gcc_unreachable ();
7803 case SECCAT_TEXT:
7804 case SECCAT_TDATA:
7805 case SECCAT_TBSS:
7806 /* We don't split these for the medium model. Place them into
7807 default sections and hope for the best. */
7808 break;
7810 if (sname)
7812 /* We might get called with string constants, but get_named_section
7813 doesn't like them as they are not DECLs. Also, we need to set
7814 flags in that case. */
7815 if (!DECL_P (decl))
7816 return get_section (sname, flags, NULL);
7817 return get_named_section (decl, sname, reloc);
7820 return default_elf_select_section (decl, reloc, align);
7823 /* Select a set of attributes for section NAME based on the properties
7824 of DECL and whether or not RELOC indicates that DECL's initializer
7825 might contain runtime relocations. */
7827 static unsigned int ATTRIBUTE_UNUSED
7828 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7830 unsigned int flags = default_section_type_flags (decl, name, reloc);
7832 if (ix86_in_large_data_p (decl))
7833 flags |= SECTION_LARGE;
7835 if (decl == NULL_TREE
7836 && (strcmp (name, ".ldata.rel.ro") == 0
7837 || strcmp (name, ".ldata.rel.ro.local") == 0))
7838 flags |= SECTION_RELRO;
7840 if (strcmp (name, ".lbss") == 0
7841 || strncmp (name, ".lbss.", 6) == 0
7842 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7843 flags |= SECTION_BSS;
7845 return flags;
7848 /* Build up a unique section name, expressed as a
7849 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7850 RELOC indicates whether the initial value of EXP requires
7851 link-time relocations. */
7853 static void ATTRIBUTE_UNUSED
7854 x86_64_elf_unique_section (tree decl, int reloc)
7856 if (ix86_in_large_data_p (decl))
7858 const char *prefix = NULL;
7859 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7860 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7862 switch (categorize_decl_for_section (decl, reloc))
7864 case SECCAT_DATA:
7865 case SECCAT_DATA_REL:
7866 case SECCAT_DATA_REL_LOCAL:
7867 case SECCAT_DATA_REL_RO:
7868 case SECCAT_DATA_REL_RO_LOCAL:
7869 prefix = one_only ? ".ld" : ".ldata";
7870 break;
7871 case SECCAT_BSS:
7872 prefix = one_only ? ".lb" : ".lbss";
7873 break;
7874 case SECCAT_RODATA:
7875 case SECCAT_RODATA_MERGE_STR:
7876 case SECCAT_RODATA_MERGE_STR_INIT:
7877 case SECCAT_RODATA_MERGE_CONST:
7878 prefix = one_only ? ".lr" : ".lrodata";
7879 break;
7880 case SECCAT_SRODATA:
7881 case SECCAT_SDATA:
7882 case SECCAT_SBSS:
7883 gcc_unreachable ();
7884 case SECCAT_TEXT:
7885 case SECCAT_TDATA:
7886 case SECCAT_TBSS:
7887 /* We don't split these for the medium model. Place them into
7888 default sections and hope for the best. */
7889 break;
7891 if (prefix)
7893 const char *name, *linkonce;
7894 char *string;
7896 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7897 name = targetm.strip_name_encoding (name);
7899 /* If we're using one_only, then there needs to be a .gnu.linkonce
7900 prefix to the section name. */
7901 linkonce = one_only ? ".gnu.linkonce" : "";
7903 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
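/* E.g. a variable "foo" in the data category ends up in the section
   ".ldata.foo", or ".gnu.linkonce.ld.foo" when one_only is set.  */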
7905 set_decl_section_name (decl, string);
7906 return;
7909 default_unique_section (decl, reloc);
7912 #ifdef COMMON_ASM_OP
7914 #ifndef LARGECOMM_SECTION_ASM_OP
7915 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7916 #endif
7918 /* This says how to output assembler code to declare an
7919 uninitialized external linkage data object.
7921 For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
7922 for large objects. */
7923 void
7924 x86_elf_aligned_decl_common (FILE *file, tree decl,
7925 const char *name, unsigned HOST_WIDE_INT size,
7926 int align)
7928 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7929 && size > (unsigned int)ix86_section_threshold)
7931 switch_to_section (get_named_section (decl, ".lbss", 0));
7932 fputs (LARGECOMM_SECTION_ASM_OP, file);
7934 else
7935 fputs (COMMON_ASM_OP, file);
7936 assemble_name (file, name);
7937 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7938 size, align / BITS_PER_UNIT);
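/* The resulting directive looks like
     .largecomm	foo,100000,32
   for large-model objects, or the usual
     .comm	foo,100000,32
   otherwise (name, size and alignment shown are only an example).  */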
7940 #endif
7942 /* Utility function for targets to use in implementing
7943 ASM_OUTPUT_ALIGNED_BSS. */
7945 void
7946 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7947 unsigned HOST_WIDE_INT size, int align)
7949 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7950 && size > (unsigned int)ix86_section_threshold)
7951 switch_to_section (get_named_section (decl, ".lbss", 0));
7952 else
7953 switch_to_section (bss_section);
7954 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7955 #ifdef ASM_DECLARE_OBJECT_NAME
7956 last_assemble_variable_decl = decl;
7957 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7958 #else
7959 /* The standard thing is to just output a label for the object. */
7960 ASM_OUTPUT_LABEL (file, name);
7961 #endif /* ASM_DECLARE_OBJECT_NAME */
7962 ASM_OUTPUT_SKIP (file, size ? size : 1);
7965 /* Decide whether we must probe the stack before any space allocation
7966 on this target. It's essentially TARGET_STACK_PROBE except when
7967 -fstack-check causes the stack to be already probed differently. */
7969 bool
7970 ix86_target_stack_probe (void)
7972 /* Do not probe the stack twice if static stack checking is enabled. */
7973 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7974 return false;
7976 return TARGET_STACK_PROBE;
7979 /* Decide whether we can make a sibling call to a function. DECL is the
7980 declaration of the function being targeted by the call and EXP is the
7981 CALL_EXPR representing the call. */
7983 static bool
7984 ix86_function_ok_for_sibcall (tree decl, tree exp)
7986 tree type, decl_or_type;
7987 rtx a, b;
7988 bool bind_global = decl && !targetm.binds_local_p (decl);
7990 if (ix86_function_naked (current_function_decl))
7991 return false;
7993 /* Sibling call isn't OK if there are no caller-saved registers
7994 since all registers must be preserved before return. */
7995 if (cfun->machine->no_caller_saved_registers)
7996 return false;
7998 /* If we are generating position-independent code, we cannot sibcall
7999 optimize direct calls to global functions, as the PLT requires
8000 %ebx be live. (Darwin does not have a PLT.) */
8001 if (!TARGET_MACHO
8002 && !TARGET_64BIT
8003 && flag_pic
8004 && flag_plt
8005 && bind_global)
8006 return false;
8008 /* If we need to align the outgoing stack, then sibcalling would
8009 unalign the stack, which may break the called function. */
8010 if (ix86_minimum_incoming_stack_boundary (true)
8011 < PREFERRED_STACK_BOUNDARY)
8012 return false;
8014 if (decl)
8016 decl_or_type = decl;
8017 type = TREE_TYPE (decl);
8019 else
8021 /* We're looking at the CALL_EXPR, we need the type of the function. */
8022 type = CALL_EXPR_FN (exp); /* pointer expression */
8023 type = TREE_TYPE (type); /* pointer type */
8024 type = TREE_TYPE (type); /* function type */
8025 decl_or_type = type;
8028 /* Check that the return value locations are the same. For example,
8029 if we are returning floats on the 80387 register stack, we cannot
8030 make a sibcall from a function that doesn't return a float to a
8031 function that does or, conversely, from a function that does return
8032 a float to a function that doesn't; the necessary stack adjustment
8033 would not be executed. This is also the place we notice
8034 differences in the return value ABI. Note that it is ok for one
8035 of the functions to have void return type as long as the return
8036 value of the other is passed in a register. */
8037 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
8038 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
8039 cfun->decl, false);
8040 if (STACK_REG_P (a) || STACK_REG_P (b))
8042 if (!rtx_equal_p (a, b))
8043 return false;
8045 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
8047 else if (!rtx_equal_p (a, b))
8048 return false;
8050 if (TARGET_64BIT)
8052 /* The SYSV ABI has more call-clobbered registers;
8053 disallow sibcalls from MS to SYSV. */
8054 if (cfun->machine->call_abi == MS_ABI
8055 && ix86_function_type_abi (type) == SYSV_ABI)
8056 return false;
8058 else
8060 /* If this call is indirect, we'll need to be able to use a
8061 call-clobbered register for the address of the target function.
8062 Make sure that all such registers are not used for passing
8063 parameters. Note that DLLIMPORT functions and calls to global
8064 functions via the GOT slot are indirect. */
8065 if (!decl
8066 || (bind_global && flag_pic && !flag_plt)
8067 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8069 /* Check if regparm >= 3 since arg_reg_available is set to
8070 false if regparm == 0. If regparm is 1 or 2, there is
8071 always a call-clobbered register available.
8073 ??? The symbol indirect call doesn't need a call-clobbered
8074 register. But we don't know if this is a symbol indirect
8075 call or not here. */
8076 if (ix86_function_regparm (type, NULL) >= 3
8077 && !cfun->machine->arg_reg_available)
8078 return false;
8082 /* Otherwise okay. That also includes certain types of indirect calls. */
8083 return true;
8086 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8087 and "sseregparm" calling convention attributes;
8088 arguments as in struct attribute_spec.handler. */
8090 static tree
8091 ix86_handle_cconv_attribute (tree *node, tree name,
8092 tree args,
8093 int,
8094 bool *no_add_attrs)
8096 if (TREE_CODE (*node) != FUNCTION_TYPE
8097 && TREE_CODE (*node) != METHOD_TYPE
8098 && TREE_CODE (*node) != FIELD_DECL
8099 && TREE_CODE (*node) != TYPE_DECL)
8101 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8102 name);
8103 *no_add_attrs = true;
8104 return NULL_TREE;
8107 /* Can combine regparm with all attributes but fastcall, and thiscall. */
8108 if (is_attribute_p ("regparm", name))
8110 tree cst;
8112 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8114 error ("fastcall and regparm attributes are not compatible");
8117 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8119 error ("regparam and thiscall attributes are not compatible");
8122 cst = TREE_VALUE (args);
8123 if (TREE_CODE (cst) != INTEGER_CST)
8125 warning (OPT_Wattributes,
8126 "%qE attribute requires an integer constant argument",
8127 name);
8128 *no_add_attrs = true;
8130 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8132 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8133 name, REGPARM_MAX);
8134 *no_add_attrs = true;
8137 return NULL_TREE;
8140 if (TARGET_64BIT)
8142 /* Do not warn when emulating the MS ABI. */
8143 if ((TREE_CODE (*node) != FUNCTION_TYPE
8144 && TREE_CODE (*node) != METHOD_TYPE)
8145 || ix86_function_type_abi (*node) != MS_ABI)
8146 warning (OPT_Wattributes, "%qE attribute ignored",
8147 name);
8148 *no_add_attrs = true;
8149 return NULL_TREE;
8152 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
8153 if (is_attribute_p ("fastcall", name))
8155 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8157 error ("fastcall and cdecl attributes are not compatible");
8159 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8161 error ("fastcall and stdcall attributes are not compatible");
8163 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8165 error ("fastcall and regparm attributes are not compatible");
8167 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8169 error ("fastcall and thiscall attributes are not compatible");
8173 /* Can combine stdcall with fastcall (redundant), regparm and
8174 sseregparm. */
8175 else if (is_attribute_p ("stdcall", name))
8177 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8179 error ("stdcall and cdecl attributes are not compatible");
8181 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8183 error ("stdcall and fastcall attributes are not compatible");
8185 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8187 error ("stdcall and thiscall attributes are not compatible");
8191 /* Can combine cdecl with regparm and sseregparm. */
8192 else if (is_attribute_p ("cdecl", name))
8194 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8196 error ("stdcall and cdecl attributes are not compatible");
8198 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8200 error ("fastcall and cdecl attributes are not compatible");
8202 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8204 error ("cdecl and thiscall attributes are not compatible");
8207 else if (is_attribute_p ("thiscall", name))
8209 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8210 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8211 name);
8212 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8214 error ("stdcall and thiscall attributes are not compatible");
8216 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8218 error ("fastcall and thiscall attributes are not compatible");
8220 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8222 error ("cdecl and thiscall attributes are not compatible");
8226 /* Can combine sseregparm with all attributes. */
8228 return NULL_TREE;
8231 /* The transactional memory builtins are implicitly regparm or fastcall
8232 depending on the ABI. Override the generic do-nothing attribute that
8233 these builtins were declared with, and replace it with one of the two
8234 attributes that we expect elsewhere. */
8236 static tree
8237 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8238 int flags, bool *no_add_attrs)
8240 tree alt;
8242 /* In no case do we want to add the placeholder attribute. */
8243 *no_add_attrs = true;
8245 /* The 64-bit ABI is unchanged for transactional memory. */
8246 if (TARGET_64BIT)
8247 return NULL_TREE;
8249 /* ??? Is there a better way to validate 32-bit Windows? We have
8250 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8251 if (CHECK_STACK_LIMIT > 0)
8252 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8253 else
8255 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8256 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8258 decl_attributes (node, alt, flags);
8260 return NULL_TREE;
8263 /* This function determines from TYPE the calling-convention. */
8265 unsigned int
8266 ix86_get_callcvt (const_tree type)
8268 unsigned int ret = 0;
8269 bool is_stdarg;
8270 tree attrs;
8272 if (TARGET_64BIT)
8273 return IX86_CALLCVT_CDECL;
8275 attrs = TYPE_ATTRIBUTES (type);
8276 if (attrs != NULL_TREE)
8278 if (lookup_attribute ("cdecl", attrs))
8279 ret |= IX86_CALLCVT_CDECL;
8280 else if (lookup_attribute ("stdcall", attrs))
8281 ret |= IX86_CALLCVT_STDCALL;
8282 else if (lookup_attribute ("fastcall", attrs))
8283 ret |= IX86_CALLCVT_FASTCALL;
8284 else if (lookup_attribute ("thiscall", attrs))
8285 ret |= IX86_CALLCVT_THISCALL;
8287 /* Regparm isn't allowed for thiscall and fastcall. */
8288 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8290 if (lookup_attribute ("regparm", attrs))
8291 ret |= IX86_CALLCVT_REGPARM;
8292 if (lookup_attribute ("sseregparm", attrs))
8293 ret |= IX86_CALLCVT_SSEREGPARM;
8296 if (IX86_BASE_CALLCVT(ret) != 0)
8297 return ret;
8300 is_stdarg = stdarg_p (type);
8301 if (TARGET_RTD && !is_stdarg)
8302 return IX86_CALLCVT_STDCALL | ret;
8304 if (ret != 0
8305 || is_stdarg
8306 || TREE_CODE (type) != METHOD_TYPE
8307 || ix86_function_type_abi (type) != MS_ABI)
8308 return IX86_CALLCVT_CDECL | ret;
8310 return IX86_CALLCVT_THISCALL;
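/* So, for example, with -mrtd a non-stdarg function without any explicit
   calling-convention attribute is treated as stdcall, and a non-static C++
   member function under the 32-bit MS ABI defaults to thiscall.  */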
8313 /* Return 0 if the attributes for two types are incompatible, 1 if they
8314 are compatible, and 2 if they are nearly compatible (which causes a
8315 warning to be generated). */
8317 static int
8318 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8320 unsigned int ccvt1, ccvt2;
8322 if (TREE_CODE (type1) != FUNCTION_TYPE
8323 && TREE_CODE (type1) != METHOD_TYPE)
8324 return 1;
8326 ccvt1 = ix86_get_callcvt (type1);
8327 ccvt2 = ix86_get_callcvt (type2);
8328 if (ccvt1 != ccvt2)
8329 return 0;
8330 if (ix86_function_regparm (type1, NULL)
8331 != ix86_function_regparm (type2, NULL))
8332 return 0;
8334 return 1;
8337 /* Return the regparm value for a function with the indicated TYPE and DECL.
8338 DECL may be NULL when calling function indirectly
8339 or considering a libcall. */
8341 static int
8342 ix86_function_regparm (const_tree type, const_tree decl)
8344 tree attr;
8345 int regparm;
8346 unsigned int ccvt;
8348 if (TARGET_64BIT)
8349 return (ix86_function_type_abi (type) == SYSV_ABI
8350 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8351 ccvt = ix86_get_callcvt (type);
8352 regparm = ix86_regparm;
8354 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8356 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8357 if (attr)
8359 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8360 return regparm;
8363 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8364 return 2;
8365 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8366 return 1;
8368 /* Use register calling convention for local functions when possible. */
8369 if (decl
8370 && TREE_CODE (decl) == FUNCTION_DECL)
8372 cgraph_node *target = cgraph_node::get (decl);
8373 if (target)
8374 target = target->function_symbol ();
8376 /* Caller and callee must agree on the calling convention, so
8377 checking just the caller's optimize setting here would mean that with
8378 __attribute__((optimize (...))) the caller could use the regparm convention
8379 while the callee does not, or vice versa. Instead look at whether the
8380 callee is optimized or not. */
8381 if (target && opt_for_fn (target->decl, optimize)
8382 && !(profile_flag && !flag_fentry))
8384 cgraph_local_info *i = &target->local;
8385 if (i && i->local && i->can_change_signature)
8387 int local_regparm, globals = 0, regno;
8389 /* Make sure no regparm register is taken by a
8390 fixed register variable. */
8391 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8392 local_regparm++)
8393 if (fixed_regs[local_regparm])
8394 break;
8396 /* We don't want to use regparm(3) for nested functions as
8397 these use a static chain pointer in the third argument. */
8398 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8399 local_regparm = 2;
8401 /* Save a register for the split stack. */
8402 if (flag_split_stack)
8404 if (local_regparm == 3)
8405 local_regparm = 2;
8406 else if (local_regparm == 2
8407 && DECL_STATIC_CHAIN (target->decl))
8408 local_regparm = 1;
8411 /* Each fixed register usage increases register pressure,
8412 so fewer registers should be used for argument passing.
8413 This behavior can be overridden by an explicit
8414 regparm value. */
8415 for (regno = AX_REG; regno <= DI_REG; regno++)
8416 if (fixed_regs[regno])
8417 globals++;
8419 local_regparm
8420 = globals < local_regparm ? local_regparm - globals : 0;
8422 if (local_regparm > regparm)
8423 regparm = local_regparm;
8428 return regparm;
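/* Illustrative example, hypothetical user code rather than GCC
   internals, of the two paths above:

     int __attribute__ ((regparm (3))) add3 (int a, int b, int c);
       With -m32 the arguments are expected in %eax, %edx and %ecx.

     static int helper (int a, int b) { return a + b; }
       If HELPER is local, can change its signature and the unit is
       optimized, its regparm value may be raised here up to REGPARM_MAX
       minus any fixed registers, without any attribute being written.  */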
8431 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8432 DFmode (2) arguments in SSE registers for a function with the
8433 indicated TYPE and DECL. DECL may be NULL when calling a function
8434 indirectly or considering a libcall. Return -1 if any FP parameter
8435 should be rejected with an error; this is used in the situation where we
8436 imply the SSE calling convention but the function is called from another
8437 function with SSE disabled. Otherwise return 0. */
8439 static int
8440 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8442 gcc_assert (!TARGET_64BIT);
8444 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8445 by the sseregparm attribute. */
8446 if (TARGET_SSEREGPARM
8447 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8449 if (!TARGET_SSE)
8451 if (warn)
8453 if (decl)
8454 error ("calling %qD with attribute sseregparm without "
8455 "SSE/SSE2 enabled", decl);
8456 else
8457 error ("calling %qT with attribute sseregparm without "
8458 "SSE/SSE2 enabled", type);
8460 return 0;
8463 return 2;
8466 if (!decl)
8467 return 0;
8469 cgraph_node *target = cgraph_node::get (decl);
8470 if (target)
8471 target = target->function_symbol ();
8473 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8474 (and DFmode for SSE2) arguments in SSE registers. */
8475 if (target
8476 /* TARGET_SSE_MATH */
8477 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8478 && opt_for_fn (target->decl, optimize)
8479 && !(profile_flag && !flag_fentry))
8481 cgraph_local_info *i = &target->local;
8482 if (i && i->local && i->can_change_signature)
8484 /* Refuse to produce wrong code when a local function with SSE enabled
8485 is called from an SSE-disabled function.
8486 FIXME: We need a way to detect these cases across ltrans partitions
8487 and avoid using SSE calling conventions on local functions called
8488 from functions with SSE disabled. For now at least delay the
8489 warning until we know we are going to produce wrong code.
8490 See PR66047. */
8491 if (!TARGET_SSE && warn)
8492 return -1;
8493 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8494 ->x_ix86_isa_flags) ? 2 : 1;
8498 return 0;
8501 /* Return true if EAX is live at the start of the function. Used by
8502 ix86_expand_prologue to determine if we need special help before
8503 calling allocate_stack_worker. */
8505 static bool
8506 ix86_eax_live_at_start_p (void)
8508 /* Cheat. Don't bother working forward from ix86_function_regparm
8509 to the function type to whether an actual argument is located in
8510 eax. Instead just look at cfg info, which is still close enough
8511 to correct at this point. This gives false positives for broken
8512 functions that might use uninitialized data that happens to be
8513 allocated in eax, but who cares? */
8514 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8517 static bool
8518 ix86_keep_aggregate_return_pointer (tree fntype)
8520 tree attr;
8522 if (!TARGET_64BIT)
8524 attr = lookup_attribute ("callee_pop_aggregate_return",
8525 TYPE_ATTRIBUTES (fntype));
8526 if (attr)
8527 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8529 /* For 32-bit MS-ABI the default is to keep aggregate
8530 return pointer. */
8531 if (ix86_function_type_abi (fntype) == MS_ABI)
8532 return true;
8534 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8537 /* Value is the number of bytes of arguments automatically
8538 popped when returning from a subroutine call.
8539 FUNDECL is the declaration node of the function (as a tree),
8540 FUNTYPE is the data type of the function (as a tree),
8541 or for a library call it is an identifier node for the subroutine name.
8542 SIZE is the number of bytes of arguments passed on the stack.
8544 On the 80386, the RTD insn may be used to pop them if the number
8545 of args is fixed, but if the number is variable then the caller
8546 must pop them all. RTD can't be used for library calls now
8547 because the library is compiled with the Unix compiler.
8548 Use of RTD is a selectable option, since it is incompatible with
8549 standard Unix calling sequences. If the option is not selected,
8550 the caller must always pop the args.
8552 The attribute stdcall is equivalent to RTD on a per module basis. */
8554 static int
8555 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8557 unsigned int ccvt;
8559 /* None of the 64-bit ABIs pop arguments. */
8560 if (TARGET_64BIT)
8561 return 0;
8563 ccvt = ix86_get_callcvt (funtype);
8565 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8566 | IX86_CALLCVT_THISCALL)) != 0
8567 && ! stdarg_p (funtype))
8568 return size;
8570 /* Lose any fake structure return argument if it is passed on the stack. */
8571 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8572 && !ix86_keep_aggregate_return_pointer (funtype))
8574 int nregs = ix86_function_regparm (funtype, fundecl);
8575 if (nregs == 0)
8576 return GET_MODE_SIZE (Pmode);
8579 return 0;
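/* Illustrative sketch, assuming a hypothetical 32-bit prototype:

     int __attribute__ ((stdcall)) f (int a, int b);

   For F this function returns 8, so the callee pops its arguments
   ("ret $8") and the caller does not adjust %esp after the call.  A
   variadic prototype would instead return 0 and leave the pop to the
   caller.  */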
8582 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8584 static bool
8585 ix86_legitimate_combined_insn (rtx_insn *insn)
8587 int i;
8589 /* Check operand constraints in case hard registers were propagated
8590 into insn pattern. This check prevents combine pass from
8591 generating insn patterns with invalid hard register operands.
8592 These invalid insns can eventually confuse reload to error out
8593 with a spill failure. See also PRs 46829 and 46843. */
8595 gcc_assert (INSN_CODE (insn) >= 0);
8597 extract_insn (insn);
8598 preprocess_constraints (insn);
8600 int n_operands = recog_data.n_operands;
8601 int n_alternatives = recog_data.n_alternatives;
8602 for (i = 0; i < n_operands; i++)
8604 rtx op = recog_data.operand[i];
8605 machine_mode mode = GET_MODE (op);
8606 const operand_alternative *op_alt;
8607 int offset = 0;
8608 bool win;
8609 int j;
8611 /* A unary operator may be accepted by the predicate, but it
8612 is irrelevant for matching constraints. */
8613 if (UNARY_P (op))
8614 op = XEXP (op, 0);
8616 if (SUBREG_P (op))
8618 if (REG_P (SUBREG_REG (op))
8619 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8620 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8621 GET_MODE (SUBREG_REG (op)),
8622 SUBREG_BYTE (op),
8623 GET_MODE (op));
8624 op = SUBREG_REG (op);
8627 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8628 continue;
8630 op_alt = recog_op_alt;
8632 /* Operand has no constraints, anything is OK. */
8633 win = !n_alternatives;
8635 alternative_mask preferred = get_preferred_alternatives (insn);
8636 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8638 if (!TEST_BIT (preferred, j))
8639 continue;
8640 if (op_alt[i].anything_ok
8641 || (op_alt[i].matches != -1
8642 && operands_match_p
8643 (recog_data.operand[i],
8644 recog_data.operand[op_alt[i].matches]))
8645 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8647 win = true;
8648 break;
8652 if (!win)
8653 return false;
8656 return true;
8659 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8661 static unsigned HOST_WIDE_INT
8662 ix86_asan_shadow_offset (void)
8664 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8665 : HOST_WIDE_INT_C (0x7fff8000))
8666 : (HOST_WIDE_INT_1 << 29);
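/* Illustrative note, the standard ASan mapping rather than code from
   this file: the offset above is combined as
   shadow_addr = (addr >> 3) + offset, so on Linux/x86-64 (LP64,
   non-Mach-O) the shadow byte for address A lives at
   (A >> 3) + 0x7fff8000.  */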
8669 /* Argument support functions. */
8671 /* Return true when REGNO may be used to pass function parameters. */
8672 bool
8673 ix86_function_arg_regno_p (int regno)
8675 int i;
8676 enum calling_abi call_abi;
8677 const int *parm_regs;
8679 if (TARGET_MPX && BND_REGNO_P (regno))
8680 return true;
8682 if (!TARGET_64BIT)
8684 if (TARGET_MACHO)
8685 return (regno < REGPARM_MAX
8686 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8687 else
8688 return (regno < REGPARM_MAX
8689 || (TARGET_MMX && MMX_REGNO_P (regno)
8690 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8691 || (TARGET_SSE && SSE_REGNO_P (regno)
8692 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8695 if (TARGET_SSE && SSE_REGNO_P (regno)
8696 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8697 return true;
8699 /* TODO: The function should depend on the current function's ABI, but
8700 builtins.c would need updating then. Therefore we use the
8701 default ABI. */
8702 call_abi = ix86_cfun_abi ();
8704 /* RAX is used as hidden argument to va_arg functions. */
8705 if (call_abi == SYSV_ABI && regno == AX_REG)
8706 return true;
8708 if (call_abi == MS_ABI)
8709 parm_regs = x86_64_ms_abi_int_parameter_registers;
8710 else
8711 parm_regs = x86_64_int_parameter_registers;
8713 for (i = 0; i < (call_abi == MS_ABI
8714 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8715 if (regno == parm_regs[i])
8716 return true;
8717 return false;
8720 /* Return true if we do not know how to pass TYPE solely in registers. */
8722 static bool
8723 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8725 if (must_pass_in_stack_var_size_or_pad (mode, type))
8726 return true;
8728 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8729 The layout_type routine is crafty and tries to trick us into passing
8730 currently unsupported vector types on the stack by using TImode. */
8731 return (!TARGET_64BIT && mode == TImode
8732 && type && TREE_CODE (type) != VECTOR_TYPE);
8735 /* Return the size, in bytes, of the area reserved for arguments passed
8736 in registers for the function represented by FNDECL, depending on the
8737 ABI format used. */
8739 ix86_reg_parm_stack_space (const_tree fndecl)
8741 enum calling_abi call_abi = SYSV_ABI;
8742 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8743 call_abi = ix86_function_abi (fndecl);
8744 else
8745 call_abi = ix86_function_type_abi (fndecl);
8746 if (TARGET_64BIT && call_abi == MS_ABI)
8747 return 32;
8748 return 0;
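/* Illustrative note, an ABI summary rather than code from this file:
   the 32 bytes above correspond to the MS 64-bit "home area" that the
   caller must reserve so the callee can spill RCX, RDX, R8 and R9;
   SysV callers reserve no such area, hence 0.  */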
8751 /* We add this as a workaround in order to use libc_has_function
8752 hook in i386.md. */
8753 bool
8754 ix86_libc_has_function (enum function_class fn_class)
8756 return targetm.libc_has_function (fn_class);
8759 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
8760 specifying the calling ABI used. */
8761 enum calling_abi
8762 ix86_function_type_abi (const_tree fntype)
8764 enum calling_abi abi = ix86_abi;
8766 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8767 return abi;
8769 if (abi == SYSV_ABI
8770 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8772 static int warned;
8773 if (TARGET_X32 && !warned)
8775 error ("X32 does not support ms_abi attribute");
8776 warned = 1;
8779 abi = MS_ABI;
8781 else if (abi == MS_ABI
8782 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8783 abi = SYSV_ABI;
8785 return abi;
8788 static enum calling_abi
8789 ix86_function_abi (const_tree fndecl)
8791 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8794 /* Return SYSV_ABI or MS_ABI, depending on cfun,
8795 specifying the calling ABI used. */
8796 enum calling_abi
8797 ix86_cfun_abi (void)
8799 return cfun ? cfun->machine->call_abi : ix86_abi;
8802 static bool
8803 ix86_function_ms_hook_prologue (const_tree fn)
8805 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8807 if (decl_function_context (fn) != NULL_TREE)
8808 error_at (DECL_SOURCE_LOCATION (fn),
8809 "ms_hook_prologue is not compatible with nested function");
8810 else
8811 return true;
8813 return false;
8816 static bool
8817 ix86_function_naked (const_tree fn)
8819 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
8820 return true;
8822 return false;
8825 /* Write the extra assembler code needed to declare a function properly. */
8827 void
8828 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8829 tree decl)
8831 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8833 if (is_ms_hook)
8835 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8836 unsigned int filler_cc = 0xcccccccc;
8838 for (i = 0; i < filler_count; i += 4)
8839 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8842 #ifdef SUBTARGET_ASM_UNWIND_INIT
8843 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8844 #endif
8846 ASM_OUTPUT_LABEL (asm_out_file, fname);
8848 /* Output magic byte marker, if hot-patch attribute is set. */
8849 if (is_ms_hook)
8851 if (TARGET_64BIT)
8853 /* leaq [%rsp + 0], %rsp */
8854 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
8855 asm_out_file);
8857 else
8859 /* movl.s %edi, %edi
8860 push %ebp
8861 movl.s %esp, %ebp */
8862 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
8867 /* Implementation of the call ABI switching target hook. The call
8868 register sets specific to FNDECL are selected. See also
8869 ix86_conditional_register_usage for more details. */
8870 void
8871 ix86_call_abi_override (const_tree fndecl)
8873 cfun->machine->call_abi = ix86_function_abi (fndecl);
8876 /* Return true if a pseudo register should be created and used to hold
8877 the GOT address for PIC code. */
8878 bool
8879 ix86_use_pseudo_pic_reg (void)
8881 if ((TARGET_64BIT
8882 && (ix86_cmodel == CM_SMALL_PIC
8883 || TARGET_PECOFF))
8884 || !flag_pic)
8885 return false;
8886 return true;
8889 /* Initialize large model PIC register. */
8891 static void
8892 ix86_init_large_pic_reg (unsigned int tmp_regno)
8894 rtx_code_label *label;
8895 rtx tmp_reg;
8897 gcc_assert (Pmode == DImode);
8898 label = gen_label_rtx ();
8899 emit_label (label);
8900 LABEL_PRESERVE_P (label) = 1;
8901 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8902 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8903 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8904 label));
8905 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8906 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8907 pic_offset_table_rtx, tmp_reg));
8908 const char *name = LABEL_NAME (label);
8909 PUT_CODE (label, NOTE);
8910 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
8911 NOTE_DELETED_LABEL_NAME (label) = name;
8914 /* Create and initialize PIC register if required. */
8915 static void
8916 ix86_init_pic_reg (void)
8918 edge entry_edge;
8919 rtx_insn *seq;
8921 if (!ix86_use_pseudo_pic_reg ())
8922 return;
8924 start_sequence ();
8926 if (TARGET_64BIT)
8928 if (ix86_cmodel == CM_LARGE_PIC)
8929 ix86_init_large_pic_reg (R11_REG);
8930 else
8931 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8933 else
8935 /* If there is a future mcount call in the function, it is more profitable
8936 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8937 rtx reg = crtl->profile
8938 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8939 : pic_offset_table_rtx;
8940 rtx_insn *insn = emit_insn (gen_set_got (reg));
8941 RTX_FRAME_RELATED_P (insn) = 1;
8942 if (crtl->profile)
8943 emit_move_insn (pic_offset_table_rtx, reg);
8944 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8947 seq = get_insns ();
8948 end_sequence ();
8950 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8951 insert_insn_on_edge (seq, entry_edge);
8952 commit_one_edge_insertion (entry_edge);
8955 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8956 for a call to a function whose data type is FNTYPE.
8957 For a library call, FNTYPE is 0. */
8959 void
8960 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8961 tree fntype, /* tree ptr for function decl */
8962 rtx libname, /* SYMBOL_REF of library name or 0 */
8963 tree fndecl,
8964 int caller)
8966 struct cgraph_local_info *i = NULL;
8967 struct cgraph_node *target = NULL;
8969 memset (cum, 0, sizeof (*cum));
8971 if (fndecl)
8973 target = cgraph_node::get (fndecl);
8974 if (target)
8976 target = target->function_symbol ();
8977 i = cgraph_node::local_info (target->decl);
8978 cum->call_abi = ix86_function_abi (target->decl);
8980 else
8981 cum->call_abi = ix86_function_abi (fndecl);
8983 else
8984 cum->call_abi = ix86_function_type_abi (fntype);
8986 cum->caller = caller;
8988 /* Set up the number of registers to use for passing arguments. */
8989 cum->nregs = ix86_regparm;
8990 if (TARGET_64BIT)
8992 cum->nregs = (cum->call_abi == SYSV_ABI
8993 ? X86_64_REGPARM_MAX
8994 : X86_64_MS_REGPARM_MAX);
8996 if (TARGET_SSE)
8998 cum->sse_nregs = SSE_REGPARM_MAX;
8999 if (TARGET_64BIT)
9001 cum->sse_nregs = (cum->call_abi == SYSV_ABI
9002 ? X86_64_SSE_REGPARM_MAX
9003 : X86_64_MS_SSE_REGPARM_MAX);
9006 if (TARGET_MMX)
9007 cum->mmx_nregs = MMX_REGPARM_MAX;
9008 cum->warn_avx512f = true;
9009 cum->warn_avx = true;
9010 cum->warn_sse = true;
9011 cum->warn_mmx = true;
9013 /* Because the type might mismatch between caller and callee, we need to
9014 use the actual function type for local calls.
9015 FIXME: cgraph_analyze can be told to actually record whether a function
9016 uses va_start, so for local functions maybe_vaarg can be made more
9017 aggressive, helping K&R code.
9018 FIXME: once the typesystem is fixed, we won't need this code anymore. */
9019 if (i && i->local && i->can_change_signature)
9020 fntype = TREE_TYPE (target->decl);
9021 cum->stdarg = stdarg_p (fntype);
9022 cum->maybe_vaarg = (fntype
9023 ? (!prototype_p (fntype) || stdarg_p (fntype))
9024 : !libname);
9026 cum->bnd_regno = FIRST_BND_REG;
9027 cum->bnds_in_bt = 0;
9028 cum->force_bnd_pass = 0;
9029 cum->decl = fndecl;
9031 if (!TARGET_64BIT)
9033 /* If there are variable arguments, then we won't pass anything
9034 in registers in 32-bit mode. */
9035 if (stdarg_p (fntype))
9037 cum->nregs = 0;
9038 /* Since, in 32-bit mode, variable arguments are always passed on
9039 the stack, there is a scratch register available for an indirect
9040 sibcall. */
9041 cfun->machine->arg_reg_available = true;
9042 cum->sse_nregs = 0;
9043 cum->mmx_nregs = 0;
9044 cum->warn_avx512f = false;
9045 cum->warn_avx = false;
9046 cum->warn_sse = false;
9047 cum->warn_mmx = false;
9048 return;
9051 /* Use ecx and edx registers if function has fastcall attribute,
9052 else look for regparm information. */
9053 if (fntype)
9055 unsigned int ccvt = ix86_get_callcvt (fntype);
9056 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
9058 cum->nregs = 1;
9059 cum->fastcall = 1; /* Same first register as in fastcall. */
9061 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
9063 cum->nregs = 2;
9064 cum->fastcall = 1;
9066 else
9067 cum->nregs = ix86_function_regparm (fntype, fndecl);
9070 /* Set up the number of SSE registers used for passing SFmode
9071 and DFmode arguments. Warn for mismatching ABI. */
9072 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
9075 cfun->machine->arg_reg_available = (cum->nregs > 0);
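/* Illustrative example, hypothetical 32-bit prototypes and the state
   init_cumulative_args leaves behind for them:

     void __attribute__ ((fastcall)) f (int, int);   nregs == 2, fastcall set
     void __attribute__ ((thiscall)) g (void *);     nregs == 1
     void h (int, ...);                              nregs == 0; everything
                                                     goes on the stack  */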
9078 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
9079 But in the case of vector types, it is some vector mode.
9081 When we have only some of our vector isa extensions enabled, then there
9082 are some modes for which vector_mode_supported_p is false. For these
9083 modes, the generic vector support in gcc will choose some non-vector mode
9084 in order to implement the type. By computing the natural mode, we'll
9085 select the proper ABI location for the operand and not depend on whatever
9086 the middle-end decides to do with these vector types.
9088 The middle-end can't deal with vector types > 16 bytes. In this
9089 case, we return the original mode and warn about the ABI change if CUM
9090 isn't NULL.
9092 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
9093 available for the function return value. */
9095 static machine_mode
9096 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9097 bool in_return)
9099 machine_mode mode = TYPE_MODE (type);
9101 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9103 HOST_WIDE_INT size = int_size_in_bytes (type);
9104 if ((size == 8 || size == 16 || size == 32 || size == 64)
9105 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9106 && TYPE_VECTOR_SUBPARTS (type) > 1)
9108 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9110 /* There are no XFmode vector modes. */
9111 if (innermode == XFmode)
9112 return mode;
9114 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9115 mode = MIN_MODE_VECTOR_FLOAT;
9116 else
9117 mode = MIN_MODE_VECTOR_INT;
9119 /* Get the mode which has this inner mode and number of units. */
9120 FOR_EACH_MODE_FROM (mode, mode)
9121 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9122 && GET_MODE_INNER (mode) == innermode)
9124 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9126 static bool warnedavx512f;
9127 static bool warnedavx512f_ret;
9129 if (cum && cum->warn_avx512f && !warnedavx512f)
9131 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9132 "without AVX512F enabled changes the ABI"))
9133 warnedavx512f = true;
9135 else if (in_return && !warnedavx512f_ret)
9137 if (warning (OPT_Wpsabi, "AVX512F vector return "
9138 "without AVX512F enabled changes the ABI"))
9139 warnedavx512f_ret = true;
9142 return TYPE_MODE (type);
9144 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9146 static bool warnedavx;
9147 static bool warnedavx_ret;
9149 if (cum && cum->warn_avx && !warnedavx)
9151 if (warning (OPT_Wpsabi, "AVX vector argument "
9152 "without AVX enabled changes the ABI"))
9153 warnedavx = true;
9155 else if (in_return && !warnedavx_ret)
9157 if (warning (OPT_Wpsabi, "AVX vector return "
9158 "without AVX enabled changes the ABI"))
9159 warnedavx_ret = true;
9162 return TYPE_MODE (type);
9164 else if (((size == 8 && TARGET_64BIT) || size == 16)
9165 && !TARGET_SSE
9166 && !TARGET_IAMCU)
9168 static bool warnedsse;
9169 static bool warnedsse_ret;
9171 if (cum && cum->warn_sse && !warnedsse)
9173 if (warning (OPT_Wpsabi, "SSE vector argument "
9174 "without SSE enabled changes the ABI"))
9175 warnedsse = true;
9177 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9179 if (warning (OPT_Wpsabi, "SSE vector return "
9180 "without SSE enabled changes the ABI"))
9181 warnedsse_ret = true;
9184 else if ((size == 8 && !TARGET_64BIT)
9185 && (!cfun
9186 || cfun->machine->func_type == TYPE_NORMAL)
9187 && !TARGET_MMX
9188 && !TARGET_IAMCU)
9190 static bool warnedmmx;
9191 static bool warnedmmx_ret;
9193 if (cum && cum->warn_mmx && !warnedmmx)
9195 if (warning (OPT_Wpsabi, "MMX vector argument "
9196 "without MMX enabled changes the ABI"))
9197 warnedmmx = true;
9199 else if (in_return && !warnedmmx_ret)
9201 if (warning (OPT_Wpsabi, "MMX vector return "
9202 "without MMX enabled changes the ABI"))
9203 warnedmmx_ret = true;
9206 return mode;
9209 gcc_unreachable ();
9213 return mode;
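/* Illustrative example of the -Wpsabi diagnostics above, assuming a
   unit compiled for x86-64 without -mavx:

     typedef int v8si __attribute__ ((vector_size (32)));
     v8si add (v8si a, v8si b) { return a + b; }

   Passing V8SI here warns "AVX vector argument without AVX enabled
   changes the ABI" (the return path uses the analogous message) and
   the non-vector TYPE_MODE is returned instead of V8SImode.  */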
9216 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9217 this may not agree with the mode that the type system has chosen for the
9218 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9219 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9221 static rtx
9222 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9223 unsigned int regno)
9225 rtx tmp;
9227 if (orig_mode != BLKmode)
9228 tmp = gen_rtx_REG (orig_mode, regno);
9229 else
9231 tmp = gen_rtx_REG (mode, regno);
9232 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9233 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9236 return tmp;
9239 /* x86-64 register passing implementation. See the x86-64 ABI for details.
9240 The goal of this code is to classify each eightbyte of an incoming argument
9241 by register class and assign registers accordingly. */
9243 /* Return the union class of CLASS1 and CLASS2.
9244 See the x86-64 PS ABI for details. */
9246 static enum x86_64_reg_class
9247 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9249 /* Rule #1: If both classes are equal, this is the resulting class. */
9250 if (class1 == class2)
9251 return class1;
9253 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9254 the other class. */
9255 if (class1 == X86_64_NO_CLASS)
9256 return class2;
9257 if (class2 == X86_64_NO_CLASS)
9258 return class1;
9260 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9261 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9262 return X86_64_MEMORY_CLASS;
9264 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9265 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9266 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9267 return X86_64_INTEGERSI_CLASS;
9268 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9269 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9270 return X86_64_INTEGER_CLASS;
9272 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9273 MEMORY is used. */
9274 if (class1 == X86_64_X87_CLASS
9275 || class1 == X86_64_X87UP_CLASS
9276 || class1 == X86_64_COMPLEX_X87_CLASS
9277 || class2 == X86_64_X87_CLASS
9278 || class2 == X86_64_X87UP_CLASS
9279 || class2 == X86_64_COMPLEX_X87_CLASS)
9280 return X86_64_MEMORY_CLASS;
9282 /* Rule #6: Otherwise class SSE is used. */
9283 return X86_64_SSE_CLASS;
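/* Illustrative walk-through of the rules above, using a made-up struct:

     struct s { float f; int i; };

   Both fields land in the first eightbyte; the float contributes an SSE
   class and the int an integer class, so rule #4 merges them to
   X86_64_INTEGER_CLASS and the whole struct travels in one
   general-purpose register.  */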
9286 /* Classify the argument of type TYPE and mode MODE.
9287 CLASSES will be filled by the register class used to pass each word
9288 of the operand. The number of words is returned. In case the parameter
9289 should be passed in memory, 0 is returned. As a special case for zero
9290 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9292 BIT_OFFSET is used internally for handling records and specifies the
9293 offset, in bits modulo 512, to avoid overflow cases.
9295 See the x86-64 PS ABI for details.
9298 static int
9299 classify_argument (machine_mode mode, const_tree type,
9300 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9302 HOST_WIDE_INT bytes =
9303 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9304 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9306 /* Variable sized entities are always passed/returned in memory. */
9307 if (bytes < 0)
9308 return 0;
9310 if (mode != VOIDmode
9311 && targetm.calls.must_pass_in_stack (mode, type))
9312 return 0;
9314 if (type && AGGREGATE_TYPE_P (type))
9316 int i;
9317 tree field;
9318 enum x86_64_reg_class subclasses[MAX_CLASSES];
9320 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9321 if (bytes > 64)
9322 return 0;
9324 for (i = 0; i < words; i++)
9325 classes[i] = X86_64_NO_CLASS;
9327 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
9328 signal the memory class, so handle this as a special case. */
9329 if (!words)
9331 classes[0] = X86_64_NO_CLASS;
9332 return 1;
9335 /* Classify each field of record and merge classes. */
9336 switch (TREE_CODE (type))
9338 case RECORD_TYPE:
9339 /* And now merge the fields of structure. */
9340 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9342 if (TREE_CODE (field) == FIELD_DECL)
9344 int num;
9346 if (TREE_TYPE (field) == error_mark_node)
9347 continue;
9349 /* Bitfields are always classified as integer. Handle them
9350 early, since later code would consider them to be
9351 misaligned integers. */
9352 if (DECL_BIT_FIELD (field))
9354 for (i = (int_bit_position (field)
9355 + (bit_offset % 64)) / 8 / 8;
9356 i < ((int_bit_position (field) + (bit_offset % 64))
9357 + tree_to_shwi (DECL_SIZE (field))
9358 + 63) / 8 / 8; i++)
9359 classes[i] =
9360 merge_classes (X86_64_INTEGER_CLASS,
9361 classes[i]);
9363 else
9365 int pos;
9367 type = TREE_TYPE (field);
9369 /* Flexible array member is ignored. */
9370 if (TYPE_MODE (type) == BLKmode
9371 && TREE_CODE (type) == ARRAY_TYPE
9372 && TYPE_SIZE (type) == NULL_TREE
9373 && TYPE_DOMAIN (type) != NULL_TREE
9374 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9375 == NULL_TREE))
9377 static bool warned;
9379 if (!warned && warn_psabi)
9381 warned = true;
9382 inform (input_location,
9383 "the ABI of passing struct with"
9384 " a flexible array member has"
9385 " changed in GCC 4.4");
9387 continue;
9389 num = classify_argument (TYPE_MODE (type), type,
9390 subclasses,
9391 (int_bit_position (field)
9392 + bit_offset) % 512);
9393 if (!num)
9394 return 0;
9395 pos = (int_bit_position (field)
9396 + (bit_offset % 64)) / 8 / 8;
9397 for (i = 0; i < num && (i + pos) < words; i++)
9398 classes[i + pos] =
9399 merge_classes (subclasses[i], classes[i + pos]);
9403 break;
9405 case ARRAY_TYPE:
9406 /* Arrays are handled as small records. */
9408 int num;
9409 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9410 TREE_TYPE (type), subclasses, bit_offset);
9411 if (!num)
9412 return 0;
9414 /* The partial classes are now full classes. */
9415 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9416 subclasses[0] = X86_64_SSE_CLASS;
9417 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9418 && !((bit_offset % 64) == 0 && bytes == 4))
9419 subclasses[0] = X86_64_INTEGER_CLASS;
9421 for (i = 0; i < words; i++)
9422 classes[i] = subclasses[i % num];
9424 break;
9426 case UNION_TYPE:
9427 case QUAL_UNION_TYPE:
9428 /* Unions are similar to RECORD_TYPE but offset is always 0.
9430 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9432 if (TREE_CODE (field) == FIELD_DECL)
9434 int num;
9436 if (TREE_TYPE (field) == error_mark_node)
9437 continue;
9439 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9440 TREE_TYPE (field), subclasses,
9441 bit_offset);
9442 if (!num)
9443 return 0;
9444 for (i = 0; i < num && i < words; i++)
9445 classes[i] = merge_classes (subclasses[i], classes[i]);
9448 break;
9450 default:
9451 gcc_unreachable ();
9454 if (words > 2)
9456 /* When size > 16 bytes, if the first one isn't
9457 X86_64_SSE_CLASS or any other ones aren't
9458 X86_64_SSEUP_CLASS, everything should be passed in
9459 memory. */
9460 if (classes[0] != X86_64_SSE_CLASS)
9461 return 0;
9463 for (i = 1; i < words; i++)
9464 if (classes[i] != X86_64_SSEUP_CLASS)
9465 return 0;
9468 /* Final merger cleanup. */
9469 for (i = 0; i < words; i++)
9471 /* If one class is MEMORY, everything should be passed in
9472 memory. */
9473 if (classes[i] == X86_64_MEMORY_CLASS)
9474 return 0;
9476 /* The X86_64_SSEUP_CLASS should be always preceded by
9477 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9478 if (classes[i] == X86_64_SSEUP_CLASS
9479 && classes[i - 1] != X86_64_SSE_CLASS
9480 && classes[i - 1] != X86_64_SSEUP_CLASS)
9482 /* The first one should never be X86_64_SSEUP_CLASS. */
9483 gcc_assert (i != 0);
9484 classes[i] = X86_64_SSE_CLASS;
9487 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9488 everything should be passed in memory. */
9489 if (classes[i] == X86_64_X87UP_CLASS
9490 && (classes[i - 1] != X86_64_X87_CLASS))
9492 static bool warned;
9494 /* The first one should never be X86_64_X87UP_CLASS. */
9495 gcc_assert (i != 0);
9496 if (!warned && warn_psabi)
9498 warned = true;
9499 inform (input_location,
9500 "the ABI of passing union with long double"
9501 " has changed in GCC 4.4");
9503 return 0;
9506 return words;
9509 /* Compute the alignment needed. We align all types to natural boundaries
9510 with the exception of XFmode, which is aligned to 64 bits. */
9511 if (mode != VOIDmode && mode != BLKmode)
9513 int mode_alignment = GET_MODE_BITSIZE (mode);
9515 if (mode == XFmode)
9516 mode_alignment = 128;
9517 else if (mode == XCmode)
9518 mode_alignment = 256;
9519 if (COMPLEX_MODE_P (mode))
9520 mode_alignment /= 2;
9521 /* Misaligned fields are always returned in memory. */
9522 if (bit_offset % mode_alignment)
9523 return 0;
9526 /* For V1xx modes, just use the base mode. */
9527 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9528 && GET_MODE_UNIT_SIZE (mode) == bytes)
9529 mode = GET_MODE_INNER (mode);
9531 /* Classification of atomic types. */
9532 switch (mode)
9534 case E_SDmode:
9535 case E_DDmode:
9536 classes[0] = X86_64_SSE_CLASS;
9537 return 1;
9538 case E_TDmode:
9539 classes[0] = X86_64_SSE_CLASS;
9540 classes[1] = X86_64_SSEUP_CLASS;
9541 return 2;
9542 case E_DImode:
9543 case E_SImode:
9544 case E_HImode:
9545 case E_QImode:
9546 case E_CSImode:
9547 case E_CHImode:
9548 case E_CQImode:
9550 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9552 /* Analyze last 128 bits only. */
9553 size = (size - 1) & 0x7f;
9555 if (size < 32)
9557 classes[0] = X86_64_INTEGERSI_CLASS;
9558 return 1;
9560 else if (size < 64)
9562 classes[0] = X86_64_INTEGER_CLASS;
9563 return 1;
9565 else if (size < 64+32)
9567 classes[0] = X86_64_INTEGER_CLASS;
9568 classes[1] = X86_64_INTEGERSI_CLASS;
9569 return 2;
9571 else if (size < 64+64)
9573 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9574 return 2;
9576 else
9577 gcc_unreachable ();
9579 case E_CDImode:
9580 case E_TImode:
9581 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9582 return 2;
9583 case E_COImode:
9584 case E_OImode:
9585 /* OImode shouldn't be used directly. */
9586 gcc_unreachable ();
9587 case E_CTImode:
9588 return 0;
9589 case E_SFmode:
9590 if (!(bit_offset % 64))
9591 classes[0] = X86_64_SSESF_CLASS;
9592 else
9593 classes[0] = X86_64_SSE_CLASS;
9594 return 1;
9595 case E_DFmode:
9596 classes[0] = X86_64_SSEDF_CLASS;
9597 return 1;
9598 case E_XFmode:
9599 classes[0] = X86_64_X87_CLASS;
9600 classes[1] = X86_64_X87UP_CLASS;
9601 return 2;
9602 case E_TFmode:
9603 classes[0] = X86_64_SSE_CLASS;
9604 classes[1] = X86_64_SSEUP_CLASS;
9605 return 2;
9606 case E_SCmode:
9607 classes[0] = X86_64_SSE_CLASS;
9608 if (!(bit_offset % 64))
9609 return 1;
9610 else
9612 static bool warned;
9614 if (!warned && warn_psabi)
9616 warned = true;
9617 inform (input_location,
9618 "the ABI of passing structure with complex float"
9619 " member has changed in GCC 4.4");
9621 classes[1] = X86_64_SSESF_CLASS;
9622 return 2;
9624 case E_DCmode:
9625 classes[0] = X86_64_SSEDF_CLASS;
9626 classes[1] = X86_64_SSEDF_CLASS;
9627 return 2;
9628 case E_XCmode:
9629 classes[0] = X86_64_COMPLEX_X87_CLASS;
9630 return 1;
9631 case E_TCmode:
9632 /* This mode is larger than 16 bytes. */
9633 return 0;
9634 case E_V8SFmode:
9635 case E_V8SImode:
9636 case E_V32QImode:
9637 case E_V16HImode:
9638 case E_V4DFmode:
9639 case E_V4DImode:
9640 classes[0] = X86_64_SSE_CLASS;
9641 classes[1] = X86_64_SSEUP_CLASS;
9642 classes[2] = X86_64_SSEUP_CLASS;
9643 classes[3] = X86_64_SSEUP_CLASS;
9644 return 4;
9645 case E_V8DFmode:
9646 case E_V16SFmode:
9647 case E_V8DImode:
9648 case E_V16SImode:
9649 case E_V32HImode:
9650 case E_V64QImode:
9651 classes[0] = X86_64_SSE_CLASS;
9652 classes[1] = X86_64_SSEUP_CLASS;
9653 classes[2] = X86_64_SSEUP_CLASS;
9654 classes[3] = X86_64_SSEUP_CLASS;
9655 classes[4] = X86_64_SSEUP_CLASS;
9656 classes[5] = X86_64_SSEUP_CLASS;
9657 classes[6] = X86_64_SSEUP_CLASS;
9658 classes[7] = X86_64_SSEUP_CLASS;
9659 return 8;
9660 case E_V4SFmode:
9661 case E_V4SImode:
9662 case E_V16QImode:
9663 case E_V8HImode:
9664 case E_V2DFmode:
9665 case E_V2DImode:
9666 classes[0] = X86_64_SSE_CLASS;
9667 classes[1] = X86_64_SSEUP_CLASS;
9668 return 2;
9669 case E_V1TImode:
9670 case E_V1DImode:
9671 case E_V2SFmode:
9672 case E_V2SImode:
9673 case E_V4HImode:
9674 case E_V8QImode:
9675 classes[0] = X86_64_SSE_CLASS;
9676 return 1;
9677 case E_BLKmode:
9678 case E_VOIDmode:
9679 return 0;
9680 default:
9681 gcc_assert (VECTOR_MODE_P (mode));
9683 if (bytes > 16)
9684 return 0;
9686 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9688 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9689 classes[0] = X86_64_INTEGERSI_CLASS;
9690 else
9691 classes[0] = X86_64_INTEGER_CLASS;
9692 classes[1] = X86_64_INTEGER_CLASS;
9693 return 1 + (bytes > 8);
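/* Illustrative example, a hypothetical struct run through the
   classification above:

     struct s { double d; long l; };

   classify_argument returns 2 with classes[0] = X86_64_SSEDF_CLASS for
   the double and classes[1] = X86_64_INTEGER_CLASS for the long, so as
   the first SysV argument S is split between %xmm0 and %rdi.  */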
9697 /* Examine the argument and set the number of registers required in each
9698 class. Return true iff the parameter should be passed in memory. */
9700 static bool
9701 examine_argument (machine_mode mode, const_tree type, int in_return,
9702 int *int_nregs, int *sse_nregs)
9704 enum x86_64_reg_class regclass[MAX_CLASSES];
9705 int n = classify_argument (mode, type, regclass, 0);
9707 *int_nregs = 0;
9708 *sse_nregs = 0;
9710 if (!n)
9711 return true;
9712 for (n--; n >= 0; n--)
9713 switch (regclass[n])
9715 case X86_64_INTEGER_CLASS:
9716 case X86_64_INTEGERSI_CLASS:
9717 (*int_nregs)++;
9718 break;
9719 case X86_64_SSE_CLASS:
9720 case X86_64_SSESF_CLASS:
9721 case X86_64_SSEDF_CLASS:
9722 (*sse_nregs)++;
9723 break;
9724 case X86_64_NO_CLASS:
9725 case X86_64_SSEUP_CLASS:
9726 break;
9727 case X86_64_X87_CLASS:
9728 case X86_64_X87UP_CLASS:
9729 case X86_64_COMPLEX_X87_CLASS:
9730 if (!in_return)
9731 return true;
9732 break;
9733 case X86_64_MEMORY_CLASS:
9734 gcc_unreachable ();
9737 return false;
9740 /* Construct a container for the argument used by the GCC interface. See
9741 FUNCTION_ARG for the detailed description. */
9743 static rtx
9744 construct_container (machine_mode mode, machine_mode orig_mode,
9745 const_tree type, int in_return, int nintregs, int nsseregs,
9746 const int *intreg, int sse_regno)
9748 /* The following variables hold the static issued_error state. */
9749 static bool issued_sse_arg_error;
9750 static bool issued_sse_ret_error;
9751 static bool issued_x87_ret_error;
9753 machine_mode tmpmode;
9754 int bytes =
9755 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9756 enum x86_64_reg_class regclass[MAX_CLASSES];
9757 int n;
9758 int i;
9759 int nexps = 0;
9760 int needed_sseregs, needed_intregs;
9761 rtx exp[MAX_CLASSES];
9762 rtx ret;
9764 n = classify_argument (mode, type, regclass, 0);
9765 if (!n)
9766 return NULL;
9767 if (examine_argument (mode, type, in_return, &needed_intregs,
9768 &needed_sseregs))
9769 return NULL;
9770 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9771 return NULL;
9773 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9774 some less clueful developer tries to use floating-point anyway. */
9775 if (needed_sseregs && !TARGET_SSE)
9777 if (in_return)
9779 if (!issued_sse_ret_error)
9781 error ("SSE register return with SSE disabled");
9782 issued_sse_ret_error = true;
9785 else if (!issued_sse_arg_error)
9787 error ("SSE register argument with SSE disabled");
9788 issued_sse_arg_error = true;
9790 return NULL;
9793 /* Likewise, error if the ABI requires us to return values in the
9794 x87 registers and the user specified -mno-80387. */
9795 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9796 for (i = 0; i < n; i++)
9797 if (regclass[i] == X86_64_X87_CLASS
9798 || regclass[i] == X86_64_X87UP_CLASS
9799 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9801 if (!issued_x87_ret_error)
9803 error ("x87 register return with x87 disabled");
9804 issued_x87_ret_error = true;
9806 return NULL;
9809 /* First construct simple cases. Avoid SCmode, since we want to use a
9810 single register to pass this type. */
9811 if (n == 1 && mode != SCmode)
9812 switch (regclass[0])
9814 case X86_64_INTEGER_CLASS:
9815 case X86_64_INTEGERSI_CLASS:
9816 return gen_rtx_REG (mode, intreg[0]);
9817 case X86_64_SSE_CLASS:
9818 case X86_64_SSESF_CLASS:
9819 case X86_64_SSEDF_CLASS:
9820 if (mode != BLKmode)
9821 return gen_reg_or_parallel (mode, orig_mode,
9822 SSE_REGNO (sse_regno));
9823 break;
9824 case X86_64_X87_CLASS:
9825 case X86_64_COMPLEX_X87_CLASS:
9826 return gen_rtx_REG (mode, FIRST_STACK_REG);
9827 case X86_64_NO_CLASS:
9828 /* Zero sized array, struct or class. */
9829 return NULL;
9830 default:
9831 gcc_unreachable ();
9833 if (n == 2
9834 && regclass[0] == X86_64_SSE_CLASS
9835 && regclass[1] == X86_64_SSEUP_CLASS
9836 && mode != BLKmode)
9837 return gen_reg_or_parallel (mode, orig_mode,
9838 SSE_REGNO (sse_regno));
9839 if (n == 4
9840 && regclass[0] == X86_64_SSE_CLASS
9841 && regclass[1] == X86_64_SSEUP_CLASS
9842 && regclass[2] == X86_64_SSEUP_CLASS
9843 && regclass[3] == X86_64_SSEUP_CLASS
9844 && mode != BLKmode)
9845 return gen_reg_or_parallel (mode, orig_mode,
9846 SSE_REGNO (sse_regno));
9847 if (n == 8
9848 && regclass[0] == X86_64_SSE_CLASS
9849 && regclass[1] == X86_64_SSEUP_CLASS
9850 && regclass[2] == X86_64_SSEUP_CLASS
9851 && regclass[3] == X86_64_SSEUP_CLASS
9852 && regclass[4] == X86_64_SSEUP_CLASS
9853 && regclass[5] == X86_64_SSEUP_CLASS
9854 && regclass[6] == X86_64_SSEUP_CLASS
9855 && regclass[7] == X86_64_SSEUP_CLASS
9856 && mode != BLKmode)
9857 return gen_reg_or_parallel (mode, orig_mode,
9858 SSE_REGNO (sse_regno));
9859 if (n == 2
9860 && regclass[0] == X86_64_X87_CLASS
9861 && regclass[1] == X86_64_X87UP_CLASS)
9862 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9864 if (n == 2
9865 && regclass[0] == X86_64_INTEGER_CLASS
9866 && regclass[1] == X86_64_INTEGER_CLASS
9867 && (mode == CDImode || mode == TImode)
9868 && intreg[0] + 1 == intreg[1])
9869 return gen_rtx_REG (mode, intreg[0]);
9871 /* Otherwise figure out the entries of the PARALLEL. */
9872 for (i = 0; i < n; i++)
9874 int pos;
9876 switch (regclass[i])
9878 case X86_64_NO_CLASS:
9879 break;
9880 case X86_64_INTEGER_CLASS:
9881 case X86_64_INTEGERSI_CLASS:
9882 /* Merge TImodes on aligned occasions here too. */
9883 if (i * 8 + 8 > bytes)
9885 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
9886 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
9887 /* We've requested 24 bytes for which we
9888 don't have a mode. Use DImode. */
9889 tmpmode = DImode;
9891 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9892 tmpmode = SImode;
9893 else
9894 tmpmode = DImode;
9895 exp [nexps++]
9896 = gen_rtx_EXPR_LIST (VOIDmode,
9897 gen_rtx_REG (tmpmode, *intreg),
9898 GEN_INT (i*8));
9899 intreg++;
9900 break;
9901 case X86_64_SSESF_CLASS:
9902 exp [nexps++]
9903 = gen_rtx_EXPR_LIST (VOIDmode,
9904 gen_rtx_REG (SFmode,
9905 SSE_REGNO (sse_regno)),
9906 GEN_INT (i*8));
9907 sse_regno++;
9908 break;
9909 case X86_64_SSEDF_CLASS:
9910 exp [nexps++]
9911 = gen_rtx_EXPR_LIST (VOIDmode,
9912 gen_rtx_REG (DFmode,
9913 SSE_REGNO (sse_regno)),
9914 GEN_INT (i*8));
9915 sse_regno++;
9916 break;
9917 case X86_64_SSE_CLASS:
9918 pos = i;
9919 switch (n)
9921 case 1:
9922 tmpmode = DImode;
9923 break;
9924 case 2:
9925 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9927 tmpmode = TImode;
9928 i++;
9930 else
9931 tmpmode = DImode;
9932 break;
9933 case 4:
9934 gcc_assert (i == 0
9935 && regclass[1] == X86_64_SSEUP_CLASS
9936 && regclass[2] == X86_64_SSEUP_CLASS
9937 && regclass[3] == X86_64_SSEUP_CLASS);
9938 tmpmode = OImode;
9939 i += 3;
9940 break;
9941 case 8:
9942 gcc_assert (i == 0
9943 && regclass[1] == X86_64_SSEUP_CLASS
9944 && regclass[2] == X86_64_SSEUP_CLASS
9945 && regclass[3] == X86_64_SSEUP_CLASS
9946 && regclass[4] == X86_64_SSEUP_CLASS
9947 && regclass[5] == X86_64_SSEUP_CLASS
9948 && regclass[6] == X86_64_SSEUP_CLASS
9949 && regclass[7] == X86_64_SSEUP_CLASS);
9950 tmpmode = XImode;
9951 i += 7;
9952 break;
9953 default:
9954 gcc_unreachable ();
9956 exp [nexps++]
9957 = gen_rtx_EXPR_LIST (VOIDmode,
9958 gen_rtx_REG (tmpmode,
9959 SSE_REGNO (sse_regno)),
9960 GEN_INT (pos*8));
9961 sse_regno++;
9962 break;
9963 default:
9964 gcc_unreachable ();
9968 /* Empty aligned struct, union or class. */
9969 if (nexps == 0)
9970 return NULL;
9972 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9973 for (i = 0; i < nexps; i++)
9974 XVECEXP (ret, 0, i) = exp [i];
9975 return ret;
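/* Illustrative sketch of the RTL built above for the struct from the
   previous example, struct s { double d; long l; }, as the first SysV
   argument; the exact form is an assumption for illustration:

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   one EXPR_LIST per eightbyte, each carrying its byte offset.  */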
9978 /* Update the data in CUM to advance over an argument of mode MODE
9979 and data type TYPE. (TYPE is null for libcalls where that information
9980 may not be available.)
9982 Return the number of integer registers advanced over. */
9984 static int
9985 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9986 const_tree type, HOST_WIDE_INT bytes,
9987 HOST_WIDE_INT words)
9989 int res = 0;
9990 bool error_p = false;
9992 if (TARGET_IAMCU)
9994 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9995 bytes in registers. */
9996 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9997 goto pass_in_reg;
9998 return res;
10001 switch (mode)
10003 default:
10004 break;
10006 case E_BLKmode:
10007 if (bytes < 0)
10008 break;
10009 /* FALLTHRU */
10011 case E_DImode:
10012 case E_SImode:
10013 case E_HImode:
10014 case E_QImode:
10015 pass_in_reg:
10016 cum->words += words;
10017 cum->nregs -= words;
10018 cum->regno += words;
10019 if (cum->nregs >= 0)
10020 res = words;
10021 if (cum->nregs <= 0)
10023 cum->nregs = 0;
10024 cfun->machine->arg_reg_available = false;
10025 cum->regno = 0;
10027 break;
10029 case E_OImode:
10030 /* OImode shouldn't be used directly. */
10031 gcc_unreachable ();
10033 case E_DFmode:
10034 if (cum->float_in_sse == -1)
10035 error_p = true;
10036 if (cum->float_in_sse < 2)
10037 break;
10038 /* FALLTHRU */
10039 case E_SFmode:
10040 if (cum->float_in_sse == -1)
10041 error_p = true;
10042 if (cum->float_in_sse < 1)
10043 break;
10044 /* FALLTHRU */
10046 case E_V8SFmode:
10047 case E_V8SImode:
10048 case E_V64QImode:
10049 case E_V32HImode:
10050 case E_V16SImode:
10051 case E_V8DImode:
10052 case E_V16SFmode:
10053 case E_V8DFmode:
10054 case E_V32QImode:
10055 case E_V16HImode:
10056 case E_V4DFmode:
10057 case E_V4DImode:
10058 case E_TImode:
10059 case E_V16QImode:
10060 case E_V8HImode:
10061 case E_V4SImode:
10062 case E_V2DImode:
10063 case E_V4SFmode:
10064 case E_V2DFmode:
10065 if (!type || !AGGREGATE_TYPE_P (type))
10067 cum->sse_words += words;
10068 cum->sse_nregs -= 1;
10069 cum->sse_regno += 1;
10070 if (cum->sse_nregs <= 0)
10072 cum->sse_nregs = 0;
10073 cum->sse_regno = 0;
10076 break;
10078 case E_V8QImode:
10079 case E_V4HImode:
10080 case E_V2SImode:
10081 case E_V2SFmode:
10082 case E_V1TImode:
10083 case E_V1DImode:
10084 if (!type || !AGGREGATE_TYPE_P (type))
10086 cum->mmx_words += words;
10087 cum->mmx_nregs -= 1;
10088 cum->mmx_regno += 1;
10089 if (cum->mmx_nregs <= 0)
10091 cum->mmx_nregs = 0;
10092 cum->mmx_regno = 0;
10095 break;
10097 if (error_p)
10099 cum->float_in_sse = 0;
10100 error ("calling %qD with SSE calling convention without "
10101 "SSE/SSE2 enabled", cum->decl);
10102 sorry ("this is a GCC bug that can be worked around by adding "
10103 "attribute used to function called");
10106 return res;
10109 static int
10110 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10111 const_tree type, HOST_WIDE_INT words, bool named)
10113 int int_nregs, sse_nregs;
10115 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
10116 if (!named && (VALID_AVX512F_REG_MODE (mode)
10117 || VALID_AVX256_REG_MODE (mode)))
10118 return 0;
10120 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10121 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10123 cum->nregs -= int_nregs;
10124 cum->sse_nregs -= sse_nregs;
10125 cum->regno += int_nregs;
10126 cum->sse_regno += sse_nregs;
10127 return int_nregs;
10129 else
10131 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10132 cum->words = ROUND_UP (cum->words, align);
10133 cum->words += words;
10134 return 0;
10138 static int
10139 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10140 HOST_WIDE_INT words)
10142 /* Otherwise, this should be passed indirect. */
10143 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10145 cum->words += words;
10146 if (cum->nregs > 0)
10148 cum->nregs -= 1;
10149 cum->regno += 1;
10150 return 1;
10152 return 0;
10155 /* Update the data in CUM to advance over an argument of mode MODE and
10156 data type TYPE. (TYPE is null for libcalls where that information
10157 may not be available.) */
10159 static void
10160 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10161 const_tree type, bool named)
10163 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10164 HOST_WIDE_INT bytes, words;
10165 int nregs;
10167 /* The argument of interrupt handler is a special case and is
10168 handled in ix86_function_arg. */
10169 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10170 return;
10172 if (mode == BLKmode)
10173 bytes = int_size_in_bytes (type);
10174 else
10175 bytes = GET_MODE_SIZE (mode);
10176 words = CEIL (bytes, UNITS_PER_WORD);
10178 if (type)
10179 mode = type_natural_mode (type, NULL, false);
10181 if ((type && POINTER_BOUNDS_TYPE_P (type))
10182 || POINTER_BOUNDS_MODE_P (mode))
10184 /* If we pass bounds in BT then just update the remaining bounds count. */
10185 if (cum->bnds_in_bt)
10187 cum->bnds_in_bt--;
10188 return;
10191 /* Update the remaining number of bounds to force. */
10192 if (cum->force_bnd_pass)
10193 cum->force_bnd_pass--;
10195 cum->bnd_regno++;
10197 return;
10200 /* The first arg not going to Bounds Tables resets this counter. */
10201 cum->bnds_in_bt = 0;
10202 /* For unnamed args we always pass bounds to avoid a bounds mess when
10203 the passed and received types do not match. If bounds do not follow an
10204 unnamed arg, still pretend the required number of bounds were passed. */
10205 if (cum->force_bnd_pass)
10207 cum->bnd_regno += cum->force_bnd_pass;
10208 cum->force_bnd_pass = 0;
10211 if (TARGET_64BIT)
10213 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10215 if (call_abi == MS_ABI)
10216 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10217 else
10218 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10220 else
10221 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10223 /* For stdarg we expect bounds to be passed for each value passed
10224 in register. */
10225 if (cum->stdarg)
10226 cum->force_bnd_pass = nregs;
10227 /* For pointers passed in memory we expect bounds passed in Bounds
10228 Table. */
10229 if (!nregs)
10231 /* Track if there are outgoing arguments on stack. */
10232 if (cum->caller)
10233 cfun->machine->outgoing_args_on_stack = true;
10235 cum->bnds_in_bt = chkp_type_bounds_count (type);
10239 /* Define where to put the arguments to a function.
10240 Value is zero to push the argument on the stack,
10241 or a hard register in which to store the argument.
10243 MODE is the argument's machine mode.
10244 TYPE is the data type of the argument (as a tree).
10245 This is null for libcalls where that information may
10246 not be available.
10247 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10248 the preceding args and about the function being called.
10249 NAMED is nonzero if this argument is a named parameter
10250 (otherwise it is an extra parameter matching an ellipsis). */
10252 static rtx
10253 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10254 machine_mode orig_mode, const_tree type,
10255 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10257 bool error_p = false;
10259 /* Avoid the AL settings for the Unix64 ABI. */
10260 if (mode == VOIDmode)
10261 return constm1_rtx;
10263 if (TARGET_IAMCU)
10265 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10266 bytes in registers. */
10267 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10268 goto pass_in_reg;
10269 return NULL_RTX;
10272 switch (mode)
10274 default:
10275 break;
10277 case E_BLKmode:
10278 if (bytes < 0)
10279 break;
10280 /* FALLTHRU */
10281 case E_DImode:
10282 case E_SImode:
10283 case E_HImode:
10284 case E_QImode:
10285 pass_in_reg:
10286 if (words <= cum->nregs)
10288 int regno = cum->regno;
10290 /* Fastcall allocates the first two DWORD (SImode) or
10291 smaller arguments to ECX and EDX, provided they are not
10292 aggregate types. */
10293 if (cum->fastcall)
10295 if (mode == BLKmode
10296 || mode == DImode
10297 || (type && AGGREGATE_TYPE_P (type)))
10298 break;
10300 /* ECX, not EAX, is the first allocated register. */
10301 if (regno == AX_REG)
10302 regno = CX_REG;
10304 return gen_rtx_REG (mode, regno);
10306 break;
10308 case E_DFmode:
10309 if (cum->float_in_sse == -1)
10310 error_p = true;
10311 if (cum->float_in_sse < 2)
10312 break;
10313 /* FALLTHRU */
10314 case E_SFmode:
10315 if (cum->float_in_sse == -1)
10316 error_p = true;
10317 if (cum->float_in_sse < 1)
10318 break;
10319 /* FALLTHRU */
10320 case E_TImode:
10321 /* In 32-bit mode, we pass TImode in xmm registers. */
10322 case E_V16QImode:
10323 case E_V8HImode:
10324 case E_V4SImode:
10325 case E_V2DImode:
10326 case E_V4SFmode:
10327 case E_V2DFmode:
10328 if (!type || !AGGREGATE_TYPE_P (type))
10330 if (cum->sse_nregs)
10331 return gen_reg_or_parallel (mode, orig_mode,
10332 cum->sse_regno + FIRST_SSE_REG);
10334 break;
10336 case E_OImode:
10337 case E_XImode:
10338 /* OImode and XImode shouldn't be used directly. */
10339 gcc_unreachable ();
10341 case E_V64QImode:
10342 case E_V32HImode:
10343 case E_V16SImode:
10344 case E_V8DImode:
10345 case E_V16SFmode:
10346 case E_V8DFmode:
10347 case E_V8SFmode:
10348 case E_V8SImode:
10349 case E_V32QImode:
10350 case E_V16HImode:
10351 case E_V4DFmode:
10352 case E_V4DImode:
10353 if (!type || !AGGREGATE_TYPE_P (type))
10355 if (cum->sse_nregs)
10356 return gen_reg_or_parallel (mode, orig_mode,
10357 cum->sse_regno + FIRST_SSE_REG);
10359 break;
10361 case E_V8QImode:
10362 case E_V4HImode:
10363 case E_V2SImode:
10364 case E_V2SFmode:
10365 case E_V1TImode:
10366 case E_V1DImode:
10367 if (!type || !AGGREGATE_TYPE_P (type))
10369 if (cum->mmx_nregs)
10370 return gen_reg_or_parallel (mode, orig_mode,
10371 cum->mmx_regno + FIRST_MMX_REG);
10373 break;
10375 if (error_p)
10377 cum->float_in_sse = 0;
10378 error ("calling %qD with SSE calling convention without "
10379 "SSE/SSE2 enabled", cum->decl);
10380 sorry ("this is a GCC bug that can be worked around by adding "
10381 "attribute used to function called");
10384 return NULL_RTX;
10387 static rtx
10388 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10389 machine_mode orig_mode, const_tree type, bool named)
10391 /* Handle a hidden AL argument containing number of registers
10392 for varargs x86-64 functions. */
10393 if (mode == VOIDmode)
10394 return GEN_INT (cum->maybe_vaarg
10395 ? (cum->sse_nregs < 0
10396 ? X86_64_SSE_REGPARM_MAX
10397 : cum->sse_regno)
10398 : -1);
10400 switch (mode)
10402 default:
10403 break;
10405 case E_V8SFmode:
10406 case E_V8SImode:
10407 case E_V32QImode:
10408 case E_V16HImode:
10409 case E_V4DFmode:
10410 case E_V4DImode:
10411 case E_V16SFmode:
10412 case E_V16SImode:
10413 case E_V64QImode:
10414 case E_V32HImode:
10415 case E_V8DFmode:
10416 case E_V8DImode:
10417 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
10418 if (!named)
10419 return NULL;
10420 break;
10423 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10424 cum->sse_nregs,
10425 &x86_64_int_parameter_registers [cum->regno],
10426 cum->sse_regno);
10429 static rtx
10430 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10431 machine_mode orig_mode, bool named,
10432 HOST_WIDE_INT bytes)
10434 unsigned int regno;
10436 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
10437 We use the value -2 to specify that the current function call is MS_ABI. */
10438 if (mode == VOIDmode)
10439 return GEN_INT (-2);
10441 /* If we've run out of registers, it goes on the stack. */
10442 if (cum->nregs == 0)
10443 return NULL_RTX;
10445 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10447 /* Only floating point modes are passed in anything but integer regs. */
10448 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10450 if (named)
10451 regno = cum->regno + FIRST_SSE_REG;
10452 else
10454 rtx t1, t2;
10456 /* Unnamed floating parameters are passed in both the
10457 SSE and integer registers. */
10458 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10459 t2 = gen_rtx_REG (mode, regno);
10460 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10461 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10462 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10465 /* Handle aggregate types passed in registers. */
10466 if (orig_mode == BLKmode)
10468 if (bytes > 0 && bytes <= 8)
10469 mode = (bytes > 4 ? DImode : SImode);
10470 if (mode == BLKmode)
10471 mode = DImode;
10474 return gen_reg_or_parallel (mode, orig_mode, regno);
10477 /* Return where to put the arguments to a function.
10478 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10480 MODE is the argument's machine mode. TYPE is the data type of the
10481 argument. It is null for libcalls where that information may not be
10482 available. CUM gives information about the preceding args and about
10483 the function being called. NAMED is nonzero if this argument is a
10484 named parameter (otherwise it is an extra parameter matching an
10485 ellipsis). */
10487 static rtx
10488 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10489 const_tree type, bool named)
10491 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10492 machine_mode mode = omode;
10493 HOST_WIDE_INT bytes, words;
10494 rtx arg;
10496 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10498 gcc_assert (type != NULL_TREE);
10499 if (POINTER_TYPE_P (type))
10501 /* This is the pointer argument. */
10502 gcc_assert (TYPE_MODE (type) == Pmode);
10503 /* It is at -WORD(AP) in the current frame in interrupt and
10504 exception handlers. */
10505 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
10507 else
10509 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10510 && TREE_CODE (type) == INTEGER_TYPE
10511 && TYPE_MODE (type) == word_mode);
10512 /* The error code is the word-mode integer argument at
10513 -2 * WORD(AP) in the current frame of the exception
10514 handler. */
10515 arg = gen_rtx_MEM (word_mode,
10516 plus_constant (Pmode,
10517 arg_pointer_rtx,
10518 -2 * UNITS_PER_WORD));
10520 return arg;
10523 /* All pointer bounds arguments are handled separately here. */
10524 if ((type && POINTER_BOUNDS_TYPE_P (type))
10525 || POINTER_BOUNDS_MODE_P (mode))
10527 /* Return NULL if bounds are forced to go in Bounds Table. */
10528 if (cum->bnds_in_bt)
10529 arg = NULL;
10530 /* Return the next available bound reg if any. */
10531 else if (cum->bnd_regno <= LAST_BND_REG)
10532 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10533 /* Return the next special slot number otherwise. */
10534 else
10535 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10537 return arg;
10540 if (mode == BLKmode)
10541 bytes = int_size_in_bytes (type);
10542 else
10543 bytes = GET_MODE_SIZE (mode);
10544 words = CEIL (bytes, UNITS_PER_WORD);
10546 /* To simplify the code below, represent vector types with a vector mode
10547 even if MMX/SSE are not active. */
10548 if (type && TREE_CODE (type) == VECTOR_TYPE)
10549 mode = type_natural_mode (type, cum, false);
10551 if (TARGET_64BIT)
10553 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10555 if (call_abi == MS_ABI)
10556 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10557 else
10558 arg = function_arg_64 (cum, mode, omode, type, named);
10560 else
10561 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10563 /* Track if there are outgoing arguments on stack. */
10564 if (arg == NULL_RTX && cum->caller)
10565 cfun->machine->outgoing_args_on_stack = true;
10567 return arg;
10570 /* A C expression that indicates when an argument must be passed by
10571 reference. If nonzero for an argument, a copy of that argument is
10572 made in memory and a pointer to the argument is passed instead of
10573 the argument itself. The pointer is passed in whatever way is
10574 appropriate for passing a pointer to that type. */
10576 static bool
10577 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10578 const_tree type, bool)
10580 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10582 /* Bounds are never passed by reference. */
10583 if ((type && POINTER_BOUNDS_TYPE_P (type))
10584 || POINTER_BOUNDS_MODE_P (mode))
10585 return false;
10587 if (TARGET_64BIT)
10589 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10591 /* See Windows x64 Software Convention. */
10592 if (call_abi == MS_ABI)
10594 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10596 if (type)
10598 /* Arrays are passed by reference. */
10599 if (TREE_CODE (type) == ARRAY_TYPE)
10600 return true;
10602 if (RECORD_OR_UNION_TYPE_P (type))
10604 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10605 are passed by reference. */
10606 msize = int_size_in_bytes (type);
10610 /* __m128 is passed by reference. */
10611 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10613 else if (type && int_size_in_bytes (type) == -1)
10614 return true;
10617 return false;
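/* A rough picture of the MS ABI rules implemented above, at the C level
   (struct names invented; an x86-64 MS-ABI target is assumed):

     struct s1  { char c; };        // 1 byte   -> by value, in a GPR
     struct s8  { int a, b; };      // 8 bytes  -> by value, in a GPR
     struct s12 { int a, b, c; };   // 12 bytes -> by reference
     __m128 v;                      // 16 bytes -> by reference

   "By reference" means the caller makes a copy and passes its address,
   which is why function_arg_ms_64 only ever sees BLKmode blocks of
   1, 2, 4 or 8 bytes.  */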
10620 /* Return true when TYPE should be 128bit aligned for 32bit argument
10621 passing ABI. XXX: This function is obsolete and is only used for
10622 checking psABI compatibility with previous versions of GCC. */
10624 static bool
10625 ix86_compat_aligned_value_p (const_tree type)
10627 machine_mode mode = TYPE_MODE (type);
10628 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10629 || mode == TDmode
10630 || mode == TFmode
10631 || mode == TCmode)
10632 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10633 return true;
10634 if (TYPE_ALIGN (type) < 128)
10635 return false;
10637 if (AGGREGATE_TYPE_P (type))
10639 /* Walk the aggregates recursively. */
10640 switch (TREE_CODE (type))
10642 case RECORD_TYPE:
10643 case UNION_TYPE:
10644 case QUAL_UNION_TYPE:
10646 tree field;
10648 /* Walk all the structure fields. */
10649 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10651 if (TREE_CODE (field) == FIELD_DECL
10652 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10653 return true;
10655 break;
10658 case ARRAY_TYPE:
10659 /* Just for use if some languages pass arrays by value. */
10660 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10661 return true;
10662 break;
10664 default:
10665 gcc_unreachable ();
10668 return false;
10671 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10672 XXX: This function is obsolete and is only used for checking psABI
10673 compatibility with previous versions of GCC. */
10675 static unsigned int
10676 ix86_compat_function_arg_boundary (machine_mode mode,
10677 const_tree type, unsigned int align)
10679 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10680 natural boundaries. */
10681 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10683 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10684 make an exception for SSE modes since these require 128bit
10685 alignment.
10687 The handling here differs from field_alignment. ICC aligns MMX
10688 arguments to 4 byte boundaries, while structure fields are aligned
10689 to 8 byte boundaries. */
10690 if (!type)
10692 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10693 align = PARM_BOUNDARY;
10695 else
10697 if (!ix86_compat_aligned_value_p (type))
10698 align = PARM_BOUNDARY;
10701 if (align > BIGGEST_ALIGNMENT)
10702 align = BIGGEST_ALIGNMENT;
10703 return align;
10706 /* Return true when TYPE should be 128bit aligned for 32bit argument
10707 passing ABI. */
10709 static bool
10710 ix86_contains_aligned_value_p (const_tree type)
10712 machine_mode mode = TYPE_MODE (type);
10714 if (mode == XFmode || mode == XCmode)
10715 return false;
10717 if (TYPE_ALIGN (type) < 128)
10718 return false;
10720 if (AGGREGATE_TYPE_P (type))
10722 /* Walk the aggregates recursively. */
10723 switch (TREE_CODE (type))
10725 case RECORD_TYPE:
10726 case UNION_TYPE:
10727 case QUAL_UNION_TYPE:
10729 tree field;
10731 /* Walk all the structure fields. */
10732 for (field = TYPE_FIELDS (type);
10733 field;
10734 field = DECL_CHAIN (field))
10736 if (TREE_CODE (field) == FIELD_DECL
10737 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10738 return true;
10740 break;
10743 case ARRAY_TYPE:
10744 /* Just for use if some languages pass arrays by value. */
10745 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10746 return true;
10747 break;
10749 default:
10750 gcc_unreachable ();
10753 else
10754 return TYPE_ALIGN (type) >= 128;
10756 return false;
10759 /* Gives the alignment boundary, in bits, of an argument with the
10760 specified mode and type. */
10762 static unsigned int
10763 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10765 unsigned int align;
10766 if (type)
10768 /* Since the main variant type is used for the call, convert the
10769 type to its main variant. */
10770 type = TYPE_MAIN_VARIANT (type);
10771 align = TYPE_ALIGN (type);
10773 else
10774 align = GET_MODE_ALIGNMENT (mode);
10775 if (align < PARM_BOUNDARY)
10776 align = PARM_BOUNDARY;
10777 else
10779 static bool warned;
10780 unsigned int saved_align = align;
10782 if (!TARGET_64BIT)
10784 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10785 if (!type)
10787 if (mode == XFmode || mode == XCmode)
10788 align = PARM_BOUNDARY;
10790 else if (!ix86_contains_aligned_value_p (type))
10791 align = PARM_BOUNDARY;
10793 if (align < 128)
10794 align = PARM_BOUNDARY;
10797 if (warn_psabi
10798 && !warned
10799 && align != ix86_compat_function_arg_boundary (mode, type,
10800 saved_align))
10802 warned = true;
10803 inform (input_location,
10804 "The ABI for passing parameters with %d-byte"
10805 " alignment has changed in GCC 4.6",
10806 align / BITS_PER_UNIT);
10810 return align;
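/* A small illustration of the boundary computed above for ia32
   (alignments in bits; types as in <xmmintrin.h>):

     void f (int i, __m128 v, long double x);

     i -> 32  : plain scalars get PARM_BOUNDARY
     v -> 128 : SSE vector types keep their 16-byte alignment
     x -> 32  : XFmode arguments are explicitly 4-byte aligned on ia32

   When the result differs from the pre-GCC-4.6 computation in
   ix86_compat_function_arg_boundary, the -Wpsabi note above is emitted
   once per compilation.  */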
10813 /* Return true if N is a possible register number of function value. */
10815 static bool
10816 ix86_function_value_regno_p (const unsigned int regno)
10818 switch (regno)
10820 case AX_REG:
10821 return true;
10822 case DX_REG:
10823 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10824 case DI_REG:
10825 case SI_REG:
10826 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10828 case BND0_REG:
10829 case BND1_REG:
10830 return chkp_function_instrumented_p (current_function_decl);
10832 /* Complex values are returned in %st(0)/%st(1) pair. */
10833 case ST0_REG:
10834 case ST1_REG:
10835 /* TODO: The function should depend on current function ABI but
10836 builtins.c would need updating then. Therefore we use the
10837 default ABI. */
10838 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10839 return false;
10840 return TARGET_FLOAT_RETURNS_IN_80387;
10842 /* Complex values are returned in %xmm0/%xmm1 pair. */
10843 case XMM0_REG:
10844 case XMM1_REG:
10845 return TARGET_SSE;
10847 case MM0_REG:
10848 if (TARGET_MACHO || TARGET_64BIT)
10849 return false;
10850 return TARGET_MMX;
10853 return false;
10856 /* Define how to find the value returned by a function.
10857 VALTYPE is the data type of the value (as a tree).
10858 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10859 otherwise, FUNC is 0. */
10861 static rtx
10862 function_value_32 (machine_mode orig_mode, machine_mode mode,
10863 const_tree fntype, const_tree fn)
10865 unsigned int regno;
10867 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10868 we normally prevent this case when mmx is not available. However
10869 some ABIs may require the result to be returned like DImode. */
10870 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10871 regno = FIRST_MMX_REG;
10873 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10874 we prevent this case when sse is not available. However some ABIs
10875 may require the result to be returned like integer TImode. */
10876 else if (mode == TImode
10877 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10878 regno = FIRST_SSE_REG;
10880 /* 32-byte vector modes in %ymm0. */
10881 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10882 regno = FIRST_SSE_REG;
10884 /* 64-byte vector modes in %zmm0. */
10885 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10886 regno = FIRST_SSE_REG;
10888 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10889 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10890 regno = FIRST_FLOAT_REG;
10891 else
10892 /* Most things go in %eax. */
10893 regno = AX_REG;
10895 /* Override FP return register with %xmm0 for local functions when
10896 SSE math is enabled or for functions with sseregparm attribute. */
10897 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10899 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10900 if (sse_level == -1)
10902 error ("calling %qD with SSE calling convention without "
10903 "SSE/SSE2 enabled", fn);
10904 sorry ("this is a GCC bug that can be worked around by adding "
10905 "attribute used to function called");
10907 else if ((sse_level >= 1 && mode == SFmode)
10908 || (sse_level == 2 && mode == DFmode))
10909 regno = FIRST_SSE_REG;
10912 /* OImode shouldn't be used directly. */
10913 gcc_assert (mode != OImode);
10915 return gen_rtx_REG (orig_mode, regno);
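/* An illustrative mapping of the ia32 return registers chosen above
   (default flags, 80387 and SSE/MMX available; function names invented):

     int    f1 (void);   // %eax
     double f2 (void);   // %st(0), via TARGET_FLOAT_RETURNS_IN_80387
     __m128 f3 (void);   // %xmm0
     __m64  f4 (void);   // %mm0

   With -mfpmath=sse, a local function such as f2 would typically get
   sse_level 2 from ix86_function_sseregparm and return in %xmm0
   instead.  */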
10918 static rtx
10919 function_value_64 (machine_mode orig_mode, machine_mode mode,
10920 const_tree valtype)
10922 rtx ret;
10924 /* Handle libcalls, which don't provide a type node. */
10925 if (valtype == NULL)
10927 unsigned int regno;
10929 switch (mode)
10931 case E_SFmode:
10932 case E_SCmode:
10933 case E_DFmode:
10934 case E_DCmode:
10935 case E_TFmode:
10936 case E_SDmode:
10937 case E_DDmode:
10938 case E_TDmode:
10939 regno = FIRST_SSE_REG;
10940 break;
10941 case E_XFmode:
10942 case E_XCmode:
10943 regno = FIRST_FLOAT_REG;
10944 break;
10945 case E_TCmode:
10946 return NULL;
10947 default:
10948 regno = AX_REG;
10951 return gen_rtx_REG (mode, regno);
10953 else if (POINTER_TYPE_P (valtype))
10955 /* Pointers are always returned in word_mode. */
10956 mode = word_mode;
10959 ret = construct_container (mode, orig_mode, valtype, 1,
10960 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10961 x86_64_int_return_registers, 0);
10963 /* For zero-sized structures, construct_container returns NULL, but we
10964 need to keep the rest of the compiler happy by returning a meaningful value. */
10965 if (!ret)
10966 ret = gen_rtx_REG (orig_mode, AX_REG);
10968 return ret;
10971 static rtx
10972 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10973 const_tree valtype)
10975 unsigned int regno = AX_REG;
10977 if (TARGET_SSE)
10979 switch (GET_MODE_SIZE (mode))
10981 case 16:
10982 if (valtype != NULL_TREE
10983 && !VECTOR_INTEGER_TYPE_P (valtype)
10985 && !INTEGRAL_TYPE_P (valtype)
10986 && !VECTOR_FLOAT_TYPE_P (valtype))
10987 break;
10988 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10989 && !COMPLEX_MODE_P (mode))
10990 regno = FIRST_SSE_REG;
10991 break;
10992 case 8:
10993 case 4:
10994 if (mode == SFmode || mode == DFmode)
10995 regno = FIRST_SSE_REG;
10996 break;
10997 default:
10998 break;
11001 return gen_rtx_REG (orig_mode, regno);
11004 static rtx
11005 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
11006 machine_mode orig_mode, machine_mode mode)
11008 const_tree fn, fntype;
11010 fn = NULL_TREE;
11011 if (fntype_or_decl && DECL_P (fntype_or_decl))
11012 fn = fntype_or_decl;
11013 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
11015 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
11016 || POINTER_BOUNDS_MODE_P (mode))
11017 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
11018 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
11019 return function_value_ms_64 (orig_mode, mode, valtype);
11020 else if (TARGET_64BIT)
11021 return function_value_64 (orig_mode, mode, valtype);
11022 else
11023 return function_value_32 (orig_mode, mode, fntype, fn);
11026 static rtx
11027 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
11029 machine_mode mode, orig_mode;
11031 orig_mode = TYPE_MODE (valtype);
11032 mode = type_natural_mode (valtype, NULL, true);
11033 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
11036 /* Return an RTX representing a place where a function returns
11037 or receives pointer bounds, or NULL if no bounds are returned.
11039 VALTYPE is a data type of a value returned by the function.
11041 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
11042 or FUNCTION_TYPE of the function.
11044 If OUTGOING is false, return a place in which the caller will
11045 see the return value. Otherwise, return a place where a
11046 function returns a value. */
11048 static rtx
11049 ix86_function_value_bounds (const_tree valtype,
11050 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
11051 bool outgoing ATTRIBUTE_UNUSED)
11053 rtx res = NULL_RTX;
11055 if (BOUNDED_TYPE_P (valtype))
11056 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
11057 else if (chkp_type_has_pointer (valtype))
11059 bitmap slots;
11060 rtx bounds[2];
11061 bitmap_iterator bi;
11062 unsigned i, bnd_no = 0;
11064 bitmap_obstack_initialize (NULL);
11065 slots = BITMAP_ALLOC (NULL);
11066 chkp_find_bound_slots (valtype, slots);
11068 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
11070 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
11071 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
11072 gcc_assert (bnd_no < 2);
11073 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
11076 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
11078 BITMAP_FREE (slots);
11079 bitmap_obstack_release (NULL);
11081 else
11082 res = NULL_RTX;
11084 return res;
11087 /* Pointer function arguments and return values are promoted to
11088 word_mode for normal functions. */
11090 static machine_mode
11091 ix86_promote_function_mode (const_tree type, machine_mode mode,
11092 int *punsignedp, const_tree fntype,
11093 int for_return)
11095 if (cfun->machine->func_type == TYPE_NORMAL
11096 && type != NULL_TREE
11097 && POINTER_TYPE_P (type))
11099 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11100 return word_mode;
11102 return default_promote_function_mode (type, mode, punsignedp, fntype,
11103 for_return);
11106 /* Return true if a structure, union or array with MODE containing FIELD
11107 should be accessed using BLKmode. */
11109 static bool
11110 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11112 /* Union with XFmode must be in BLKmode. */
11113 return (mode == XFmode
11114 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11115 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11119 ix86_libcall_value (machine_mode mode)
11121 return ix86_function_value_1 (NULL, NULL, mode, mode);
11124 /* Return true iff type is returned in memory. */
11126 static bool
11127 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11129 #ifdef SUBTARGET_RETURN_IN_MEMORY
11130 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11131 #else
11132 const machine_mode mode = type_natural_mode (type, NULL, true);
11133 HOST_WIDE_INT size;
11135 if (POINTER_BOUNDS_TYPE_P (type))
11136 return false;
11138 if (TARGET_64BIT)
11140 if (ix86_function_type_abi (fntype) == MS_ABI)
11142 size = int_size_in_bytes (type);
11144 /* __m128 is returned in xmm0. */
11145 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11146 || INTEGRAL_TYPE_P (type)
11147 || VECTOR_FLOAT_TYPE_P (type))
11148 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11149 && !COMPLEX_MODE_P (mode)
11150 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11151 return false;
11153 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
11154 return size != 1 && size != 2 && size != 4 && size != 8;
11156 else
11158 int needed_intregs, needed_sseregs;
11160 return examine_argument (mode, type, 1,
11161 &needed_intregs, &needed_sseregs);
11164 else
11166 size = int_size_in_bytes (type);
11168 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11169 bytes in registers. */
11170 if (TARGET_IAMCU)
11171 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11173 if (mode == BLKmode)
11174 return true;
11176 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11177 return false;
11179 if (VECTOR_MODE_P (mode) || mode == TImode)
11181 /* User-created vectors small enough to fit in EAX. */
11182 if (size < 8)
11183 return false;
11185 /* Unless the ABI prescribes otherwise,
11186 MMX/3dNow values are returned in MM0 if available. */
11188 if (size == 8)
11189 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11191 /* SSE values are returned in XMM0 if available. */
11192 if (size == 16)
11193 return !TARGET_SSE;
11195 /* AVX values are returned in YMM0 if available. */
11196 if (size == 32)
11197 return !TARGET_AVX;
11199 /* AVX512F values are returned in ZMM0 if available. */
11200 if (size == 64)
11201 return !TARGET_AVX512F;
11204 if (mode == XFmode)
11205 return false;
11207 if (size > 12)
11208 return true;
11210 /* OImode shouldn't be used directly. */
11211 gcc_assert (mode != OImode);
11213 return false;
11215 #endif
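/* A few ia32 examples of the classification above (default flags, no
   SUBTARGET_RETURN_IN_MEMORY override; struct name invented):

     struct s16 { int a[4]; };   // BLKmode, 16 bytes -> in memory
     __m128      v16;            // %xmm0 with SSE, else in memory
     __m64       v8;             // %mm0 with MMX, unless
                                 //   TARGET_VECT8_RETURNS says otherwise
     long double x;              // XFmode -> %st(0)

   Small aggregates may additionally be forced to memory by
   -fpcc-struct-return in aggregate_value_p, independently of this
   hook.  */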
11219 /* Create the va_list data type. */
11221 static tree
11222 ix86_build_builtin_va_list_64 (void)
11224 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11226 record = lang_hooks.types.make_type (RECORD_TYPE);
11227 type_decl = build_decl (BUILTINS_LOCATION,
11228 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11230 f_gpr = build_decl (BUILTINS_LOCATION,
11231 FIELD_DECL, get_identifier ("gp_offset"),
11232 unsigned_type_node);
11233 f_fpr = build_decl (BUILTINS_LOCATION,
11234 FIELD_DECL, get_identifier ("fp_offset"),
11235 unsigned_type_node);
11236 f_ovf = build_decl (BUILTINS_LOCATION,
11237 FIELD_DECL, get_identifier ("overflow_arg_area"),
11238 ptr_type_node);
11239 f_sav = build_decl (BUILTINS_LOCATION,
11240 FIELD_DECL, get_identifier ("reg_save_area"),
11241 ptr_type_node);
11243 va_list_gpr_counter_field = f_gpr;
11244 va_list_fpr_counter_field = f_fpr;
11246 DECL_FIELD_CONTEXT (f_gpr) = record;
11247 DECL_FIELD_CONTEXT (f_fpr) = record;
11248 DECL_FIELD_CONTEXT (f_ovf) = record;
11249 DECL_FIELD_CONTEXT (f_sav) = record;
11251 TYPE_STUB_DECL (record) = type_decl;
11252 TYPE_NAME (record) = type_decl;
11253 TYPE_FIELDS (record) = f_gpr;
11254 DECL_CHAIN (f_gpr) = f_fpr;
11255 DECL_CHAIN (f_fpr) = f_ovf;
11256 DECL_CHAIN (f_ovf) = f_sav;
11258 layout_type (record);
11260 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11261 NULL_TREE, TYPE_ATTRIBUTES (record));
11263 /* The correct type is an array type of one element. */
11264 return build_array_type (record, build_index_type (size_zero_node));
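/* The record built above corresponds roughly to this C declaration from
   the SysV x86-64 psABI (shown only for illustration):

     typedef struct {
       unsigned int  gp_offset;
       unsigned int  fp_offset;
       void         *overflow_arg_area;
       void         *reg_save_area;
     } __va_list_tag;

     typedef __va_list_tag __builtin_va_list[1];

   gp_offset and fp_offset index into reg_save_area, while
   overflow_arg_area walks the stack-passed arguments.  */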
11267 /* Setup the builtin va_list data type and for 64-bit the additional
11268 calling convention specific va_list data types. */
11270 static tree
11271 ix86_build_builtin_va_list (void)
11273 if (TARGET_64BIT)
11275 /* Initialize ABI specific va_list builtin types.
11277 In lto1, we can encounter two va_list types:
11278 - one as a result of the type-merge across TUs, and
11279 - the one constructed here.
11280 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11281 a type identity check in canonical_va_list_type based on
11282 TYPE_MAIN_VARIANT (which we used to have) will not work.
11283 Instead, we tag each va_list_type_node with its unique attribute, and
11284 look for the attribute in the type identity check in
11285 canonical_va_list_type.
11287 Tagging sysv_va_list_type_node directly with the attribute is
11288 problematic since it's an array of one record, which will degrade into a
11289 pointer to the record when used as a parameter (see build_va_arg comments
11290 for an example), dropping the attribute in the process. So we tag the
11291 record instead. */
11293 /* For SYSV_ABI we use an array of one record. */
11294 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11296 /* For MS_ABI we use plain pointer to argument area. */
11297 tree char_ptr_type = build_pointer_type (char_type_node);
11298 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11299 TYPE_ATTRIBUTES (char_ptr_type));
11300 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11302 return ((ix86_abi == MS_ABI)
11303 ? ms_va_list_type_node
11304 : sysv_va_list_type_node);
11306 else
11308 /* For i386 we use plain pointer to argument area. */
11309 return build_pointer_type (char_type_node);
11313 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11315 static void
11316 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11318 rtx save_area, mem;
11319 alias_set_type set;
11320 int i, max;
11322 /* GPR size of varargs save area. */
11323 if (cfun->va_list_gpr_size)
11324 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11325 else
11326 ix86_varargs_gpr_size = 0;
11328 /* FPR size of varargs save area. We don't need it if we don't pass
11329 anything in SSE registers. */
11330 if (TARGET_SSE && cfun->va_list_fpr_size)
11331 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11332 else
11333 ix86_varargs_fpr_size = 0;
11335 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11336 return;
11338 save_area = frame_pointer_rtx;
11339 set = get_varargs_alias_set ();
11341 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11342 if (max > X86_64_REGPARM_MAX)
11343 max = X86_64_REGPARM_MAX;
11345 for (i = cum->regno; i < max; i++)
11347 mem = gen_rtx_MEM (word_mode,
11348 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11349 MEM_NOTRAP_P (mem) = 1;
11350 set_mem_alias_set (mem, set);
11351 emit_move_insn (mem,
11352 gen_rtx_REG (word_mode,
11353 x86_64_int_parameter_registers[i]));
11356 if (ix86_varargs_fpr_size)
11358 machine_mode smode;
11359 rtx_code_label *label;
11360 rtx test;
11362 /* Now emit code to save SSE registers. The AX parameter contains number
11363 of SSE parameter registers used to call this function, though all we
11364 actually check here is the zero/non-zero status. */
11366 label = gen_label_rtx ();
11367 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11368 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11369 label));
11371 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11372 we used movdqa (i.e. TImode) instead? Perhaps even better would
11373 be if we could determine the real mode of the data, via a hook
11374 into pass_stdarg. Ignore all that for now. */
11375 smode = V4SFmode;
11376 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11377 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11379 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11380 if (max > X86_64_SSE_REGPARM_MAX)
11381 max = X86_64_SSE_REGPARM_MAX;
11383 for (i = cum->sse_regno; i < max; ++i)
11385 mem = plus_constant (Pmode, save_area,
11386 i * 16 + ix86_varargs_gpr_size);
11387 mem = gen_rtx_MEM (smode, mem);
11388 MEM_NOTRAP_P (mem) = 1;
11389 set_mem_alias_set (mem, set);
11390 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11392 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11395 emit_label (label);
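/* Sketch of the register save area laid out above, relative to
   frame_pointer_rtx (offsets assume the default maxima of 6 GPRs and
   8 SSE registers):

     offset   0 ..  47 : %rdi %rsi %rdx %rcx %r8 %r9   (8 bytes each)
     offset  48 .. 175 : %xmm0 .. %xmm7                (16 bytes each)

   va_arg later indexes this block through gp_offset and fp_offset, so
   the layout here must agree with ix86_va_start and
   ix86_gimplify_va_arg below.  */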
11399 static void
11400 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11402 alias_set_type set = get_varargs_alias_set ();
11403 int i;
11405 /* Reset to zero, as a sysv va_arg may have been
11406 used before. */
11407 ix86_varargs_gpr_size = 0;
11408 ix86_varargs_fpr_size = 0;
11410 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11412 rtx reg, mem;
11414 mem = gen_rtx_MEM (Pmode,
11415 plus_constant (Pmode, virtual_incoming_args_rtx,
11416 i * UNITS_PER_WORD));
11417 MEM_NOTRAP_P (mem) = 1;
11418 set_mem_alias_set (mem, set);
11420 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11421 emit_move_insn (mem, reg);
11425 static void
11426 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11427 tree type, int *, int no_rtl)
11429 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11430 CUMULATIVE_ARGS next_cum;
11431 tree fntype;
11433 /* This argument doesn't appear to be used anymore. Which is good,
11434 because the old code here didn't suppress rtl generation. */
11435 gcc_assert (!no_rtl);
11437 if (!TARGET_64BIT)
11438 return;
11440 fntype = TREE_TYPE (current_function_decl);
11442 /* For varargs, we do not want to skip the dummy va_dcl argument.
11443 For stdargs, we do want to skip the last named argument. */
11444 next_cum = *cum;
11445 if (stdarg_p (fntype))
11446 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11447 true);
11449 if (cum->call_abi == MS_ABI)
11450 setup_incoming_varargs_ms_64 (&next_cum);
11451 else
11452 setup_incoming_varargs_64 (&next_cum);
11455 static void
11456 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11457 machine_mode mode,
11458 tree type,
11459 int *pretend_size ATTRIBUTE_UNUSED,
11460 int no_rtl)
11462 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11463 CUMULATIVE_ARGS next_cum;
11464 tree fntype;
11465 rtx save_area;
11466 int bnd_reg, i, max;
11468 gcc_assert (!no_rtl);
11470 /* Do nothing if we use plain pointer to argument area. */
11471 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11472 return;
11474 fntype = TREE_TYPE (current_function_decl);
11476 /* For varargs, we do not want to skip the dummy va_dcl argument.
11477 For stdargs, we do want to skip the last named argument. */
11478 next_cum = *cum;
11479 if (stdarg_p (fntype))
11480 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11481 true);
11482 save_area = frame_pointer_rtx;
11484 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11485 if (max > X86_64_REGPARM_MAX)
11486 max = X86_64_REGPARM_MAX;
11488 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11489 if (chkp_function_instrumented_p (current_function_decl))
11490 for (i = cum->regno; i < max; i++)
11492 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11493 rtx ptr = gen_rtx_REG (Pmode,
11494 x86_64_int_parameter_registers[i]);
11495 rtx bounds;
11497 if (bnd_reg <= LAST_BND_REG)
11498 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11499 else
11501 rtx ldx_addr =
11502 plus_constant (Pmode, arg_pointer_rtx,
11503 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11504 bounds = gen_reg_rtx (BNDmode);
11505 emit_insn (BNDmode == BND64mode
11506 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11507 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11510 emit_insn (BNDmode == BND64mode
11511 ? gen_bnd64_stx (addr, ptr, bounds)
11512 : gen_bnd32_stx (addr, ptr, bounds));
11514 bnd_reg++;
11519 /* Check whether TYPE is a char * kind of va_list. */
11521 static bool
11522 is_va_list_char_pointer (tree type)
11524 tree canonic;
11526 /* For 32-bit it is always true. */
11527 if (!TARGET_64BIT)
11528 return true;
11529 canonic = ix86_canonical_va_list_type (type);
11530 return (canonic == ms_va_list_type_node
11531 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11534 /* Implement va_start. */
11536 static void
11537 ix86_va_start (tree valist, rtx nextarg)
11539 HOST_WIDE_INT words, n_gpr, n_fpr;
11540 tree f_gpr, f_fpr, f_ovf, f_sav;
11541 tree gpr, fpr, ovf, sav, t;
11542 tree type;
11543 rtx ovf_rtx;
11545 if (flag_split_stack
11546 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11548 unsigned int scratch_regno;
11550 /* When we are splitting the stack, we can't refer to the stack
11551 arguments using internal_arg_pointer, because they may be on
11552 the old stack. The split stack prologue will arrange to
11553 leave a pointer to the old stack arguments in a scratch
11554 register, which we here copy to a pseudo-register. The split
11555 stack prologue can't set the pseudo-register directly because
11556 it (the prologue) runs before any registers have been saved. */
11558 scratch_regno = split_stack_prologue_scratch_regno ();
11559 if (scratch_regno != INVALID_REGNUM)
11561 rtx reg;
11562 rtx_insn *seq;
11564 reg = gen_reg_rtx (Pmode);
11565 cfun->machine->split_stack_varargs_pointer = reg;
11567 start_sequence ();
11568 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11569 seq = get_insns ();
11570 end_sequence ();
11572 push_topmost_sequence ();
11573 emit_insn_after (seq, entry_of_function ());
11574 pop_topmost_sequence ();
11578 /* Only 64bit target needs something special. */
11579 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11581 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11582 std_expand_builtin_va_start (valist, nextarg);
11583 else
11585 rtx va_r, next;
11587 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11588 next = expand_binop (ptr_mode, add_optab,
11589 cfun->machine->split_stack_varargs_pointer,
11590 crtl->args.arg_offset_rtx,
11591 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11592 convert_move (va_r, next, 0);
11594 /* Store zero bounds for va_list. */
11595 if (chkp_function_instrumented_p (current_function_decl))
11596 chkp_expand_bounds_reset_for_mem (valist,
11597 make_tree (TREE_TYPE (valist),
11598 next));
11601 return;
11604 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11605 f_fpr = DECL_CHAIN (f_gpr);
11606 f_ovf = DECL_CHAIN (f_fpr);
11607 f_sav = DECL_CHAIN (f_ovf);
11609 valist = build_simple_mem_ref (valist);
11610 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11611 /* The following should be folded into the MEM_REF offset. */
11612 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11613 f_gpr, NULL_TREE);
11614 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11615 f_fpr, NULL_TREE);
11616 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11617 f_ovf, NULL_TREE);
11618 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11619 f_sav, NULL_TREE);
11621 /* Count number of gp and fp argument registers used. */
11622 words = crtl->args.info.words;
11623 n_gpr = crtl->args.info.regno;
11624 n_fpr = crtl->args.info.sse_regno;
11626 if (cfun->va_list_gpr_size)
11628 type = TREE_TYPE (gpr);
11629 t = build2 (MODIFY_EXPR, type,
11630 gpr, build_int_cst (type, n_gpr * 8));
11631 TREE_SIDE_EFFECTS (t) = 1;
11632 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11635 if (TARGET_SSE && cfun->va_list_fpr_size)
11637 type = TREE_TYPE (fpr);
11638 t = build2 (MODIFY_EXPR, type, fpr,
11639 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11640 TREE_SIDE_EFFECTS (t) = 1;
11641 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11644 /* Find the overflow area. */
11645 type = TREE_TYPE (ovf);
11646 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11647 ovf_rtx = crtl->args.internal_arg_pointer;
11648 else
11649 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11650 t = make_tree (type, ovf_rtx);
11651 if (words != 0)
11652 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11654 /* Store zero bounds for overflow area pointer. */
11655 if (chkp_function_instrumented_p (current_function_decl))
11656 chkp_expand_bounds_reset_for_mem (ovf, t);
11658 t = build2 (MODIFY_EXPR, type, ovf, t);
11659 TREE_SIDE_EFFECTS (t) = 1;
11660 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11662 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11664 /* Find the register save area.
11665 The function prologue saves it right above the stack frame. */
11666 type = TREE_TYPE (sav);
11667 t = make_tree (type, frame_pointer_rtx);
11668 if (!ix86_varargs_gpr_size)
11669 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11671 /* Store zero bounds for save area pointer. */
11672 if (chkp_function_instrumented_p (current_function_decl))
11673 chkp_expand_bounds_reset_for_mem (sav, t);
11675 t = build2 (MODIFY_EXPR, type, sav, t);
11676 TREE_SIDE_EFFECTS (t) = 1;
11677 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
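/* As an example of the assignments emitted above, for

     void f (int a, double b, ...);

   where the named arguments consume one GPR and one SSE register, the
   fields are initialized roughly as

     gp_offset         = 1 * 8 = 8
     fp_offset         = 48 + 1 * 16 = 64
     overflow_arg_area = address of the first stack-passed argument
     reg_save_area     = base of the block saved by
                         setup_incoming_varargs_64

   (offsets assume the default register counts).  */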
11681 /* Implement va_arg. */
11683 static tree
11684 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11685 gimple_seq *post_p)
11687 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11688 tree f_gpr, f_fpr, f_ovf, f_sav;
11689 tree gpr, fpr, ovf, sav, t;
11690 int size, rsize;
11691 tree lab_false, lab_over = NULL_TREE;
11692 tree addr, t2;
11693 rtx container;
11694 int indirect_p = 0;
11695 tree ptrtype;
11696 machine_mode nat_mode;
11697 unsigned int arg_boundary;
11699 /* Only 64bit target needs something special. */
11700 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11701 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11703 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11704 f_fpr = DECL_CHAIN (f_gpr);
11705 f_ovf = DECL_CHAIN (f_fpr);
11706 f_sav = DECL_CHAIN (f_ovf);
11708 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11709 valist, f_gpr, NULL_TREE);
11711 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11712 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11713 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11715 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11716 if (indirect_p)
11717 type = build_pointer_type (type);
11718 size = int_size_in_bytes (type);
11719 rsize = CEIL (size, UNITS_PER_WORD);
11721 nat_mode = type_natural_mode (type, NULL, false);
11722 switch (nat_mode)
11724 case E_V8SFmode:
11725 case E_V8SImode:
11726 case E_V32QImode:
11727 case E_V16HImode:
11728 case E_V4DFmode:
11729 case E_V4DImode:
11730 case E_V16SFmode:
11731 case E_V16SImode:
11732 case E_V64QImode:
11733 case E_V32HImode:
11734 case E_V8DFmode:
11735 case E_V8DImode:
11736 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
11737 if (!TARGET_64BIT_MS_ABI)
11739 container = NULL;
11740 break;
11742 /* FALLTHRU */
11744 default:
11745 container = construct_container (nat_mode, TYPE_MODE (type),
11746 type, 0, X86_64_REGPARM_MAX,
11747 X86_64_SSE_REGPARM_MAX, intreg,
11749 break;
11752 /* Pull the value out of the saved registers. */
11754 addr = create_tmp_var (ptr_type_node, "addr");
11756 if (container)
11758 int needed_intregs, needed_sseregs;
11759 bool need_temp;
11760 tree int_addr, sse_addr;
11762 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11763 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11765 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11767 need_temp = (!REG_P (container)
11768 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11769 || TYPE_ALIGN (type) > 128));
11771 /* In case we are passing a structure, verify that it is a consecutive
11772 block on the register save area. If not, we need to do moves. */
11773 if (!need_temp && !REG_P (container))
11775 /* Verify that all registers are strictly consecutive. */
11776 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11778 int i;
11780 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11782 rtx slot = XVECEXP (container, 0, i);
11783 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11784 || INTVAL (XEXP (slot, 1)) != i * 16)
11785 need_temp = true;
11788 else
11790 int i;
11792 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11794 rtx slot = XVECEXP (container, 0, i);
11795 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11796 || INTVAL (XEXP (slot, 1)) != i * 8)
11797 need_temp = true;
11801 if (!need_temp)
11803 int_addr = addr;
11804 sse_addr = addr;
11806 else
11808 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11809 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11812 /* First ensure that we fit completely in registers. */
11813 if (needed_intregs)
11815 t = build_int_cst (TREE_TYPE (gpr),
11816 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11817 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11818 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11819 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11820 gimplify_and_add (t, pre_p);
11822 if (needed_sseregs)
11824 t = build_int_cst (TREE_TYPE (fpr),
11825 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11826 + X86_64_REGPARM_MAX * 8);
11827 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11828 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11829 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11830 gimplify_and_add (t, pre_p);
11833 /* Compute index to start of area used for integer regs. */
11834 if (needed_intregs)
11836 /* int_addr = gpr + sav; */
11837 t = fold_build_pointer_plus (sav, gpr);
11838 gimplify_assign (int_addr, t, pre_p);
11840 if (needed_sseregs)
11842 /* sse_addr = fpr + sav; */
11843 t = fold_build_pointer_plus (sav, fpr);
11844 gimplify_assign (sse_addr, t, pre_p);
11846 if (need_temp)
11848 int i, prev_size = 0;
11849 tree temp = create_tmp_var (type, "va_arg_tmp");
11851 /* addr = &temp; */
11852 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11853 gimplify_assign (addr, t, pre_p);
11855 for (i = 0; i < XVECLEN (container, 0); i++)
11857 rtx slot = XVECEXP (container, 0, i);
11858 rtx reg = XEXP (slot, 0);
11859 machine_mode mode = GET_MODE (reg);
11860 tree piece_type;
11861 tree addr_type;
11862 tree daddr_type;
11863 tree src_addr, src;
11864 int src_offset;
11865 tree dest_addr, dest;
11866 int cur_size = GET_MODE_SIZE (mode);
11868 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11869 prev_size = INTVAL (XEXP (slot, 1));
11870 if (prev_size + cur_size > size)
11872 cur_size = size - prev_size;
11873 unsigned int nbits = cur_size * BITS_PER_UNIT;
11874 if (!int_mode_for_size (nbits, 1).exists (&mode))
11875 mode = QImode;
11877 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11878 if (mode == GET_MODE (reg))
11879 addr_type = build_pointer_type (piece_type);
11880 else
11881 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11882 true);
11883 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11884 true);
11886 if (SSE_REGNO_P (REGNO (reg)))
11888 src_addr = sse_addr;
11889 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11891 else
11893 src_addr = int_addr;
11894 src_offset = REGNO (reg) * 8;
11896 src_addr = fold_convert (addr_type, src_addr);
11897 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11899 dest_addr = fold_convert (daddr_type, addr);
11900 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11901 if (cur_size == GET_MODE_SIZE (mode))
11903 src = build_va_arg_indirect_ref (src_addr);
11904 dest = build_va_arg_indirect_ref (dest_addr);
11906 gimplify_assign (dest, src, pre_p);
11908 else
11910 tree copy
11911 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11912 3, dest_addr, src_addr,
11913 size_int (cur_size));
11914 gimplify_and_add (copy, pre_p);
11916 prev_size += cur_size;
11920 if (needed_intregs)
11922 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11923 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11924 gimplify_assign (gpr, t, pre_p);
11927 if (needed_sseregs)
11929 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11930 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11931 gimplify_assign (unshare_expr (fpr), t, pre_p);
11934 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11936 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11939 /* ... otherwise out of the overflow area. */
11941 /* When we align a parameter on the stack for the caller, if its
11942 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11943 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
11944 with the caller. */
11945 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11946 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11947 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11949 /* Care for on-stack alignment if needed. */
11950 if (arg_boundary <= 64 || size == 0)
11951 t = ovf;
11952 else
11954 HOST_WIDE_INT align = arg_boundary / 8;
11955 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11956 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11957 build_int_cst (TREE_TYPE (t), -align));
11960 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11961 gimplify_assign (addr, t, pre_p);
11963 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11964 gimplify_assign (unshare_expr (ovf), t, pre_p);
11966 if (container)
11967 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11969 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11970 addr = fold_convert (ptrtype, addr);
11972 if (indirect_p)
11973 addr = build_va_arg_indirect_ref (addr);
11974 return build_va_arg_indirect_ref (addr);
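/* For a scalar integer TYPE, the gimplified sequence built above is
   roughly equivalent to this C sketch (field names as in
   ix86_build_builtin_va_list_64; alignment handling omitted):

     if (ap->gp_offset < 6 * 8)
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     else
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     result = *(int *) addr;

   SSE-classified types use fp_offset and 16-byte slots instead, and
   multi-register aggregates may need the temporary copy generated
   above.  */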
11977 /* Return true if OPNUM's MEM should be matched
11978 in movabs* patterns. */
11980 bool
11981 ix86_check_movabs (rtx insn, int opnum)
11983 rtx set, mem;
11985 set = PATTERN (insn);
11986 if (GET_CODE (set) == PARALLEL)
11987 set = XVECEXP (set, 0, 0);
11988 gcc_assert (GET_CODE (set) == SET);
11989 mem = XEXP (set, opnum);
11990 while (SUBREG_P (mem))
11991 mem = SUBREG_REG (mem);
11992 gcc_assert (MEM_P (mem));
11993 return volatile_ok || !MEM_VOLATILE_P (mem);
11996 /* Return false if INSN contains a MEM with a non-default address space. */
11997 bool
11998 ix86_check_no_addr_space (rtx insn)
12000 subrtx_var_iterator::array_type array;
12001 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
12003 rtx x = *iter;
12004 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
12005 return false;
12007 return true;
12010 /* Initialize the table of extra 80387 mathematical constants. */
12012 static void
12013 init_ext_80387_constants (void)
12015 static const char * cst[5] =
12017 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
12018 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
12019 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
12020 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
12021 "3.1415926535897932385128089594061862044", /* 4: fldpi */
12023 int i;
12025 for (i = 0; i < 5; i++)
12027 real_from_string (&ext_80387_constants_table[i], cst[i]);
12028 /* Ensure each constant is rounded to XFmode precision. */
12029 real_convert (&ext_80387_constants_table[i],
12030 XFmode, &ext_80387_constants_table[i]);
12033 ext_80387_constants_init = 1;
12036 /* Return non-zero if the constant is something that
12037 can be loaded with a special instruction. */
12040 standard_80387_constant_p (rtx x)
12042 machine_mode mode = GET_MODE (x);
12044 const REAL_VALUE_TYPE *r;
12046 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
12047 return -1;
12049 if (x == CONST0_RTX (mode))
12050 return 1;
12051 if (x == CONST1_RTX (mode))
12052 return 2;
12054 r = CONST_DOUBLE_REAL_VALUE (x);
12056 /* For XFmode constants, try to find a special 80387 instruction when
12057 optimizing for size or on those CPUs that benefit from them. */
12058 if (mode == XFmode
12059 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
12061 int i;
12063 if (! ext_80387_constants_init)
12064 init_ext_80387_constants ();
12066 for (i = 0; i < 5; i++)
12067 if (real_identical (r, &ext_80387_constants_table[i]))
12068 return i + 3;
12071 /* Load of the constant -0.0 or -1.0 will be split as
12072 fldz;fchs or fld1;fchs sequence. */
12073 if (real_isnegzero (r))
12074 return 8;
12075 if (real_identical (r, &dconstm1))
12076 return 9;
12078 return 0;
12081 /* Return the opcode of the special instruction to be used to load
12082 the constant X. */
12084 const char *
12085 standard_80387_constant_opcode (rtx x)
12087 switch (standard_80387_constant_p (x))
12089 case 1:
12090 return "fldz";
12091 case 2:
12092 return "fld1";
12093 case 3:
12094 return "fldlg2";
12095 case 4:
12096 return "fldln2";
12097 case 5:
12098 return "fldl2e";
12099 case 6:
12100 return "fldl2t";
12101 case 7:
12102 return "fldpi";
12103 case 8:
12104 case 9:
12105 return "#";
12106 default:
12107 gcc_unreachable ();
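/* For reference, the classification above maps to the loaded constants
   as follows (values follow init_ext_80387_constants):

     1 -> fldz    0.0            5 -> fldl2e  log2(e)
     2 -> fld1    1.0            6 -> fldl2t  log2(10)
     3 -> fldlg2  log10(2)       7 -> fldpi   pi
     4 -> fldln2  ln(2)          8, 9 -> fldz/fld1 followed by fchs  */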
12111 /* Return the CONST_DOUBLE representing the 80387 constant that is
12112 loaded by the specified special instruction. The argument IDX
12113 matches the return value from standard_80387_constant_p. */
12116 standard_80387_constant_rtx (int idx)
12118 int i;
12120 if (! ext_80387_constants_init)
12121 init_ext_80387_constants ();
12123 switch (idx)
12125 case 3:
12126 case 4:
12127 case 5:
12128 case 6:
12129 case 7:
12130 i = idx - 3;
12131 break;
12133 default:
12134 gcc_unreachable ();
12137 return const_double_from_real_value (ext_80387_constants_table[i],
12138 XFmode);
12141 /* Return 1 if X is all zero bits and 2 if X is all one bits
12142 in a supported SSE/AVX vector mode. */
12145 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12147 machine_mode mode;
12149 if (!TARGET_SSE)
12150 return 0;
12152 mode = GET_MODE (x);
12154 if (x == const0_rtx || const0_operand (x, mode))
12155 return 1;
12157 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12159 /* VOIDmode integer constant, get mode from the predicate. */
12160 if (mode == VOIDmode)
12161 mode = pred_mode;
12163 switch (GET_MODE_SIZE (mode))
12165 case 64:
12166 if (TARGET_AVX512F)
12167 return 2;
12168 break;
12169 case 32:
12170 if (TARGET_AVX2)
12171 return 2;
12172 break;
12173 case 16:
12174 if (TARGET_SSE2)
12175 return 2;
12176 break;
12177 case 0:
12178 /* VOIDmode */
12179 gcc_unreachable ();
12180 default:
12181 break;
12185 return 0;
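/* For instance, with -mavx2 the constants behind

     __m256i zeros = _mm256_setzero_si256 ();   // classified as 1
     __m256i ones  = _mm256_set1_epi32 (-1);    // classified as 2

   are typically materialized by standard_sse_constant_opcode below as
   vpxor and vpcmpeqd rather than being loaded from memory (intrinsics
   shown only for illustration).  */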
12188 /* Return the opcode of the special instruction to be used to load
12189 the constant X. */
12191 const char *
12192 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12194 machine_mode mode;
12196 gcc_assert (TARGET_SSE);
12198 mode = GET_MODE (x);
12200 if (x == const0_rtx || const0_operand (x, mode))
12202 switch (get_attr_mode (insn))
12204 case MODE_XI:
12205 return "vpxord\t%g0, %g0, %g0";
12206 case MODE_OI:
12207 return (TARGET_AVX512VL
12208 ? "vpxord\t%x0, %x0, %x0"
12209 : "vpxor\t%x0, %x0, %x0");
12210 case MODE_TI:
12211 return (TARGET_AVX512VL
12212 ? "vpxord\t%t0, %t0, %t0"
12213 : "%vpxor\t%0, %d0");
12215 case MODE_V8DF:
12216 return (TARGET_AVX512DQ
12217 ? "vxorpd\t%g0, %g0, %g0"
12218 : "vpxorq\t%g0, %g0, %g0");
12219 case MODE_V4DF:
12220 return "vxorpd\t%x0, %x0, %x0";
12221 case MODE_V2DF:
12222 return "%vxorpd\t%0, %d0";
12224 case MODE_V16SF:
12225 return (TARGET_AVX512DQ
12226 ? "vxorps\t%g0, %g0, %g0"
12227 : "vpxord\t%g0, %g0, %g0");
12228 case MODE_V8SF:
12229 return "vxorps\t%x0, %x0, %x0";
12230 case MODE_V4SF:
12231 return "%vxorps\t%0, %d0";
12233 default:
12234 gcc_unreachable ();
12237 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12239 enum attr_mode insn_mode = get_attr_mode (insn);
12241 switch (insn_mode)
12243 case MODE_XI:
12244 case MODE_V8DF:
12245 case MODE_V16SF:
12246 gcc_assert (TARGET_AVX512F);
12247 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12249 case MODE_OI:
12250 case MODE_V4DF:
12251 case MODE_V8SF:
12252 gcc_assert (TARGET_AVX2);
12253 /* FALLTHRU */
12254 case MODE_TI:
12255 case MODE_V2DF:
12256 case MODE_V4SF:
12257 gcc_assert (TARGET_SSE2);
12258 return (TARGET_AVX
12259 ? "vpcmpeqd\t%0, %0, %0"
12260 : "pcmpeqd\t%0, %0");
12262 default:
12263 gcc_unreachable ();
12267 gcc_unreachable ();
12270 /* Returns true if INSN can be transformed from a memory load
12271 to a supported FP constant load. */
12273 bool
12274 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12276 rtx src = find_constant_src (insn);
12278 gcc_assert (REG_P (dst));
12280 if (src == NULL
12281 || (SSE_REGNO_P (REGNO (dst))
12282 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12283 || (STACK_REGNO_P (REGNO (dst))
12284 && standard_80387_constant_p (src) < 1))
12285 return false;
12287 return true;
12292 /* Returns true if OP contains a symbol reference. */
12292 bool
12293 symbolic_reference_mentioned_p (rtx op)
12295 const char *fmt;
12296 int i;
12298 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12299 return true;
12301 fmt = GET_RTX_FORMAT (GET_CODE (op));
12302 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12304 if (fmt[i] == 'E')
12306 int j;
12308 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12309 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12310 return true;
12313 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12314 return true;
12317 return false;
12320 /* Return true if it is appropriate to emit `ret' instructions in the
12321 body of a function. Do this only if the epilogue is simple, needing a
12322 couple of insns. Prior to reloading, we can't tell how many registers
12323 must be saved, so return false then. Return false if there is no frame
12324 marker to de-allocate. */
12326 bool
12327 ix86_can_use_return_insn_p (void)
12329 struct ix86_frame frame;
12331 if (ix86_function_naked (current_function_decl))
12332 return false;
12334 /* Don't use `ret' instruction in interrupt handler. */
12335 if (! reload_completed
12336 || frame_pointer_needed
12337 || cfun->machine->func_type != TYPE_NORMAL)
12338 return 0;
12340 /* Don't allow more than 32k pop, since that's all we can do
12341 with one instruction. */
12342 if (crtl->args.pops_args && crtl->args.size >= 32768)
12343 return 0;
12345 frame = cfun->machine->frame;
12346 return (frame.stack_pointer_offset == UNITS_PER_WORD
12347 && (frame.nregs + frame.nsseregs) == 0);
12350 /* Value should be nonzero if functions must have frame pointers.
12351 Zero means the frame pointer need not be set up (and parms may
12352 be accessed via the stack pointer) in functions that seem suitable. */
12354 static bool
12355 ix86_frame_pointer_required (void)
12357 /* If we accessed previous frames, then the generated code expects
12358 to be able to access the saved ebp value in our frame. */
12359 if (cfun->machine->accesses_prev_frame)
12360 return true;
12362 /* Several x86 OSes need a frame pointer for other reasons,
12363 usually pertaining to setjmp. */
12364 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12365 return true;
12367 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
12368 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12369 return true;
12371 /* Under Win64 SEH, very large frames need a frame pointer, as the
12372 maximum stack allocation is 4GB. */
12373 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12374 return true;
12376 /* SSE saves require frame-pointer when stack is misaligned. */
12377 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12378 return true;
12380 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12381 turns off the frame pointer by default. Turn it back on now if
12382 we've not got a leaf function. */
12383 if (TARGET_OMIT_LEAF_FRAME_POINTER
12384 && (!crtl->is_leaf
12385 || ix86_current_function_calls_tls_descriptor))
12386 return true;
12388 if (crtl->profile && !flag_fentry)
12389 return true;
12391 return false;
12394 /* Record that the current function accesses previous call frames. */
12396 void
12397 ix86_setup_frame_addresses (void)
12399 cfun->machine->accesses_prev_frame = 1;
12402 #ifndef USE_HIDDEN_LINKONCE
12403 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12404 # define USE_HIDDEN_LINKONCE 1
12405 # else
12406 # define USE_HIDDEN_LINKONCE 0
12407 # endif
12408 #endif
12410 static int pic_labels_used;
12412 /* Fills in the label name that should be used for a pc thunk for
12413 the given register. */
12415 static void
12416 get_pc_thunk_name (char name[32], unsigned int regno)
12418 gcc_assert (!TARGET_64BIT);
12420 if (USE_HIDDEN_LINKONCE)
12421 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12422 else
12423 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
12427 /* This function generates code for -fpic that loads %ebx with
12428 the return address of the caller and then returns. */
12430 static void
12431 ix86_code_end (void)
12433 rtx xops[2];
12434 int regno;
12436 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12438 char name[32];
12439 tree decl;
12441 if (!(pic_labels_used & (1 << regno)))
12442 continue;
12444 get_pc_thunk_name (name, regno);
12446 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12447 get_identifier (name),
12448 build_function_type_list (void_type_node, NULL_TREE));
12449 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12450 NULL_TREE, void_type_node);
12451 TREE_PUBLIC (decl) = 1;
12452 TREE_STATIC (decl) = 1;
12453 DECL_IGNORED_P (decl) = 1;
12455 #if TARGET_MACHO
12456 if (TARGET_MACHO)
12458 switch_to_section (darwin_sections[picbase_thunk_section]);
12459 fputs ("\t.weak_definition\t", asm_out_file);
12460 assemble_name (asm_out_file, name);
12461 fputs ("\n\t.private_extern\t", asm_out_file);
12462 assemble_name (asm_out_file, name);
12463 putc ('\n', asm_out_file);
12464 ASM_OUTPUT_LABEL (asm_out_file, name);
12465 DECL_WEAK (decl) = 1;
12467 else
12468 #endif
12469 if (USE_HIDDEN_LINKONCE)
12471 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12473 targetm.asm_out.unique_section (decl, 0);
12474 switch_to_section (get_named_section (decl, NULL, 0));
12476 targetm.asm_out.globalize_label (asm_out_file, name);
12477 fputs ("\t.hidden\t", asm_out_file);
12478 assemble_name (asm_out_file, name);
12479 putc ('\n', asm_out_file);
12480 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12482 else
12484 switch_to_section (text_section);
12485 ASM_OUTPUT_LABEL (asm_out_file, name);
12488 DECL_INITIAL (decl) = make_node (BLOCK);
12489 current_function_decl = decl;
12490 allocate_struct_function (decl, false);
12491 init_function_start (decl);
12492 /* We're about to hide the function body from callees of final_* by
12493 emitting it directly; tell them we're a thunk, if they care. */
12494 cfun->is_thunk = true;
12495 first_function_block_is_cold = false;
12496 /* Make sure unwind info is emitted for the thunk if needed. */
12497 final_start_function (emit_barrier (), asm_out_file, 1);
12499 /* Pad stack IP move with 4 instructions (two NOPs count
12500 as one instruction). */
12501 if (TARGET_PAD_SHORT_FUNCTION)
12503 int i = 8;
12505 while (i--)
12506 fputs ("\tnop\n", asm_out_file);
12509 xops[0] = gen_rtx_REG (Pmode, regno);
12510 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12511 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12512 output_asm_insn ("%!ret", NULL);
12513 final_end_function ();
12514 init_insn_lengths ();
12515 free_after_compilation (cfun);
12516 set_cfun (NULL);
12517 current_function_decl = NULL;
12520 if (flag_split_stack)
12521 file_end_indicate_split_stack ();
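/* On a typical 32-bit ELF target the thunk emitted above for %ebx looks
   roughly like this:

     __x86.get_pc_thunk.bx:
             movl    (%esp), %ebx
             ret

   i.e. it copies the return address (the address of the instruction
   following the call) into the PIC register and returns.  */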
12524 /* Emit code for the SET_GOT patterns. */
12526 const char *
12527 output_set_got (rtx dest, rtx label)
12529 rtx xops[3];
12531 xops[0] = dest;
12533 if (TARGET_VXWORKS_RTP && flag_pic)
12535 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12536 xops[2] = gen_rtx_MEM (Pmode,
12537 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12538 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12540 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12541 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12542 an unadorned address. */
12543 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12544 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12545 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12546 return "";
12549 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12551 if (flag_pic)
12553 char name[32];
12554 get_pc_thunk_name (name, REGNO (dest));
12555 pic_labels_used |= 1 << REGNO (dest);
12557 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12558 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12559 output_asm_insn ("%!call\t%X2", xops);
12561 #if TARGET_MACHO
12562 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12563 This is what will be referenced by the Mach-O PIC subsystem. */
12564 if (machopic_should_output_picbase_label () || !label)
12565 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12567 /* When we are restoring the pic base at the site of a nonlocal label,
12568 and we decided to emit the pic base above, we will still output a
12569 local label used for calculating the correction offset (even though
12570 the offset will be 0 in that case). */
12571 if (label)
12572 targetm.asm_out.internal_label (asm_out_file, "L",
12573 CODE_LABEL_NUMBER (label));
12574 #endif
12576 else
12578 if (TARGET_MACHO)
12579 /* We don't need a pic base, we're not producing pic. */
12580 gcc_unreachable ();
12582 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12583 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12584 targetm.asm_out.internal_label (asm_out_file, "L",
12585 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12588 if (!TARGET_MACHO)
12589 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12591 return "";
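/* In the common ELF -fpic case the instructions printed above amount to
   roughly:

     call    __x86.get_pc_thunk.bx
     addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   which leaves the address of the GOT in the destination register.  */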
12594 /* Generate an "push" pattern for input ARG. */
12596 static rtx
12597 gen_push (rtx arg)
12599 struct machine_function *m = cfun->machine;
12601 if (m->fs.cfa_reg == stack_pointer_rtx)
12602 m->fs.cfa_offset += UNITS_PER_WORD;
12603 m->fs.sp_offset += UNITS_PER_WORD;
12605 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12606 arg = gen_rtx_REG (word_mode, REGNO (arg));
12608 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12609 gen_rtx_PRE_DEC (Pmode,
12610 stack_pointer_rtx)),
12611 arg);
12614 /* Generate an "pop" pattern for input ARG. */
12616 static rtx
12617 gen_pop (rtx arg)
12619 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12620 arg = gen_rtx_REG (word_mode, REGNO (arg));
12622 return gen_rtx_SET (arg,
12623 gen_rtx_MEM (word_mode,
12624 gen_rtx_POST_INC (Pmode,
12625 stack_pointer_rtx)));
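/* As a sketch, on a 32-bit target gen_push (bx) builds RTL of the form

     (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI bx))

   and gen_pop builds the mirror image using post_inc; on 64-bit targets
   the memory and register use DImode (word_mode) instead.  */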
12628 /* Return the number of an unused call-clobbered register available
12629 for the entire function, or INVALID_REGNUM if there is none. */
12631 static unsigned int
12632 ix86_select_alt_pic_regnum (void)
12634 if (ix86_use_pseudo_pic_reg ())
12635 return INVALID_REGNUM;
12637 if (crtl->is_leaf
12638 && !crtl->profile
12639 && !ix86_current_function_calls_tls_descriptor)
12641 int i, drap;
12642 /* Can't use the same register for both PIC and DRAP. */
12643 if (crtl->drap_reg)
12644 drap = REGNO (crtl->drap_reg);
12645 else
12646 drap = -1;
12647 for (i = 2; i >= 0; --i)
12648 if (i != drap && !df_regs_ever_live_p (i))
12649 return i;
12652 return INVALID_REGNUM;
12655 /* Return true if REGNO is used by the epilogue. */
12657 bool
12658 ix86_epilogue_uses (int regno)
12660 /* If there are no caller-saved registers, we preserve all registers,
12661 except for MMX and x87 registers which aren't supported when saving
12662 and restoring registers. Don't explicitly save SP register since
12663 it is always preserved. */
12664 return (epilogue_completed
12665 && cfun->machine->no_caller_saved_registers
12666 && !fixed_regs[regno]
12667 && !STACK_REGNO_P (regno)
12668 && !MMX_REGNO_P (regno));
12671 /* Return nonzero if register REGNO can be used as a scratch register
12672 in peephole2. */
12674 static bool
12675 ix86_hard_regno_scratch_ok (unsigned int regno)
12677 /* If there are no caller-saved registers, we can't use any register
12678 as a scratch register after epilogue and use REGNO as scratch
12679 register only if it has been used before to avoid saving and
12680 restoring it. */
12681 return (!cfun->machine->no_caller_saved_registers
12682 || (!epilogue_completed
12683 && df_regs_ever_live_p (regno)));
12686 /* Return true if register class CL should be an additional allocno
12687 class. */
12689 static bool
12690 ix86_additional_allocno_class_p (reg_class_t cl)
12692 return cl == MOD4_SSE_REGS;
12695 /* Return TRUE if we need to save REGNO. */
12697 static bool
12698 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12700 /* If there are no caller-saved registers, we preserve all registers,
12701 except for MMX and x87 registers which aren't supported when saving
12702 and restoring registers. Don't explicitly save SP register since
12703 it is always preserved. */
12704 if (cfun->machine->no_caller_saved_registers)
12706 /* Don't preserve registers used for function return value. */
12707 rtx reg = crtl->return_rtx;
12708 if (reg)
12710 unsigned int i = REGNO (reg);
12711 unsigned int nregs = REG_NREGS (reg);
12712 while (nregs-- > 0)
12713 if ((i + nregs) == regno)
12714 return false;
12716 reg = crtl->return_bnd;
12717 if (reg)
12719 i = REGNO (reg);
12720 nregs = REG_NREGS (reg);
12721 while (nregs-- > 0)
12722 if ((i + nregs) == regno)
12723 return false;
12727 return (df_regs_ever_live_p (regno)
12728 && !fixed_regs[regno]
12729 && !STACK_REGNO_P (regno)
12730 && !MMX_REGNO_P (regno)
12731 && (regno != HARD_FRAME_POINTER_REGNUM
12732 || !frame_pointer_needed));
12735 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12736 && pic_offset_table_rtx)
12738 if (ix86_use_pseudo_pic_reg ())
12740 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12741 _mcount in prologue. */
12742 if (!TARGET_64BIT && flag_pic && crtl->profile)
12743 return true;
12745 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12746 || crtl->profile
12747 || crtl->calls_eh_return
12748 || crtl->uses_const_pool
12749 || cfun->has_nonlocal_label)
12750 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12753 if (crtl->calls_eh_return && maybe_eh_return)
12755 unsigned i;
12756 for (i = 0; ; i++)
12758 unsigned test = EH_RETURN_DATA_REGNO (i);
12759 if (test == INVALID_REGNUM)
12760 break;
12761 if (test == regno)
12762 return true;
12766 if (ignore_outlined && cfun->machine->call_ms2sysv)
12768 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12769 + xlogue_layout::MIN_REGS;
12770 if (xlogue_layout::is_stub_managed_reg (regno, count))
12771 return false;
12774 if (crtl->drap_reg
12775 && regno == REGNO (crtl->drap_reg)
12776 && !cfun->machine->no_drap_save_restore)
12777 return true;
12779 return (df_regs_ever_live_p (regno)
12780 && !call_used_regs[regno]
12781 && !fixed_regs[regno]
12782 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
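/* The no_caller_saved_registers path above is exercised, for instance, by
   a function declared along the lines of

     __attribute__ ((no_caller_saved_registers))
     void isr_helper (void);

   (the name is only illustrative), where every live, saveable register
   must be preserved by the callee.  */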
12785 /* Return the number of saved general purpose registers. */
12787 static int
12788 ix86_nsaved_regs (void)
12790 int nregs = 0;
12791 int regno;
12793 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12794 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12795 nregs ++;
12796 return nregs;
12799 /* Return the number of saved SSE registers. */
12801 static int
12802 ix86_nsaved_sseregs (void)
12804 int nregs = 0;
12805 int regno;
12807 if (!TARGET_64BIT_MS_ABI)
12808 return 0;
12809 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12810 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12811 nregs ++;
12812 return nregs;
12815 /* Given FROM and TO register numbers, say whether this elimination is
12816 allowed. If stack alignment is needed, we can only replace argument
12817 pointer with hard frame pointer, or replace frame pointer with stack
12818 pointer. Otherwise, frame pointer elimination is automatically
12819 handled and all other eliminations are valid. */
12821 static bool
12822 ix86_can_eliminate (const int from, const int to)
12824 if (stack_realign_fp)
12825 return ((from == ARG_POINTER_REGNUM
12826 && to == HARD_FRAME_POINTER_REGNUM)
12827 || (from == FRAME_POINTER_REGNUM
12828 && to == STACK_POINTER_REGNUM));
12829 else
12830 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12833 /* Return the offset between two registers, one to be eliminated, and the other
12834 its replacement, at the start of a routine. */
12836 HOST_WIDE_INT
12837 ix86_initial_elimination_offset (int from, int to)
12839 struct ix86_frame frame = cfun->machine->frame;
12841 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12842 return frame.hard_frame_pointer_offset;
12843 else if (from == FRAME_POINTER_REGNUM
12844 && to == HARD_FRAME_POINTER_REGNUM)
12845 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12846 else
12848 gcc_assert (to == STACK_POINTER_REGNUM);
12850 if (from == ARG_POINTER_REGNUM)
12851 return frame.stack_pointer_offset;
12853 gcc_assert (from == FRAME_POINTER_REGNUM);
12854 return frame.stack_pointer_offset - frame.frame_pointer_offset;
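/* As a worked example with made-up numbers: if frame.stack_pointer_offset
   is 40 and frame.frame_pointer_offset is 16, then eliminating
   FRAME_POINTER_REGNUM to STACK_POINTER_REGNUM yields 40 - 16 = 24,
   i.e. the soft frame pointer corresponds to the stack pointer plus 24
   bytes at the start of the routine.  */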
12858 /* In a dynamically-aligned function, we can't know the offset from
12859 stack pointer to frame pointer, so we must ensure that setjmp
12860 eliminates fp against the hard fp (%ebp) rather than trying to
12861 index from %esp up to the top of the frame across a gap that is
12862 of unknown (at compile-time) size. */
12863 static rtx
12864 ix86_builtin_setjmp_frame_value (void)
12866 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12869 /* Emit a warning for unsupported ms_abi to sysv prologues/epilogues. */
12870 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12872 static bool warned_once = false;
12873 if (!warned_once)
12875 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12876 feature);
12877 warned_once = true;
12881 /* When using -fsplit-stack, the allocation routines set a field in
12882 the TCB to the bottom of the stack plus this much space, measured
12883 in bytes. */
12885 #define SPLIT_STACK_AVAILABLE 256
12887 /* Fill structure ix86_frame about frame of currently computed function. */
12889 static void
12890 ix86_compute_frame_layout (void)
12892 struct ix86_frame *frame = &cfun->machine->frame;
12893 struct machine_function *m = cfun->machine;
12894 unsigned HOST_WIDE_INT stack_alignment_needed;
12895 HOST_WIDE_INT offset;
12896 unsigned HOST_WIDE_INT preferred_alignment;
12897 HOST_WIDE_INT size = get_frame_size ();
12898 HOST_WIDE_INT to_allocate;
12900 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12901 * ms_abi functions that call a sysv function. We now need to prune away
12902 * cases where it should be disabled. */
12903 if (TARGET_64BIT && m->call_ms2sysv)
12905 gcc_assert (TARGET_64BIT_MS_ABI);
12906 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12907 gcc_assert (!TARGET_SEH);
12908 gcc_assert (TARGET_SSE);
12909 gcc_assert (!ix86_using_red_zone ());
12911 if (crtl->calls_eh_return)
12913 gcc_assert (!reload_completed);
12914 m->call_ms2sysv = false;
12915 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12918 else if (ix86_static_chain_on_stack)
12920 gcc_assert (!reload_completed);
12921 m->call_ms2sysv = false;
12922 warn_once_call_ms2sysv_xlogues ("static call chains");
12925 /* Finally, compute which registers the stub will manage. */
12926 else
12928 unsigned count = xlogue_layout::count_stub_managed_regs ();
12929 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12930 m->call_ms2sysv_pad_in = 0;
12934 frame->nregs = ix86_nsaved_regs ();
12935 frame->nsseregs = ix86_nsaved_sseregs ();
12937 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12938 except for function prologues, leaf functions, and when the default
12939 incoming stack boundary is overridden on the command line or via the
12940 force_align_arg_pointer attribute. */
12941 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12942 && (!crtl->is_leaf || cfun->calls_alloca != 0
12943 || ix86_current_function_calls_tls_descriptor
12944 || ix86_incoming_stack_boundary < 128))
12946 crtl->preferred_stack_boundary = 128;
12947 crtl->stack_alignment_needed = 128;
12950 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12951 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12953 gcc_assert (!size || stack_alignment_needed);
12954 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12955 gcc_assert (preferred_alignment <= stack_alignment_needed);
12957 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
12958 gcc_assert (TARGET_64BIT || !frame->nsseregs);
12959 if (TARGET_64BIT && m->call_ms2sysv)
12961 gcc_assert (stack_alignment_needed >= 16);
12962 gcc_assert (!frame->nsseregs);
12965 /* For SEH we have to limit the amount of code movement into the prologue.
12966 At present we do this via a BLOCKAGE, at which point there's very little
12967 scheduling that can be done, which means that there's very little point
12968 in doing anything except PUSHs. */
12969 if (TARGET_SEH)
12970 m->use_fast_prologue_epilogue = false;
12971 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12973 int count = frame->nregs;
12974 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12976 /* The fast prologue uses move instead of push to save registers. This
12977 is significantly longer, but also executes faster as modern hardware
12978 can execute the moves in parallel, but can't do that for push/pop.
12980 Be careful about choosing what prologue to emit: When function takes
12981 many instructions to execute we may use slow version as well as in
12982 case function is known to be outside hot spot (this is known with
12983 feedback only). Weight the size of function by number of registers
12984 to save as it is cheap to use one or two push instructions but very
12985 slow to use many of them. */
12986 if (count)
12987 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12988 if (node->frequency < NODE_FREQUENCY_NORMAL
12989 || (flag_branch_probabilities
12990 && node->frequency < NODE_FREQUENCY_HOT))
12991 m->use_fast_prologue_epilogue = false;
12992 else
12993 m->use_fast_prologue_epilogue
12994 = !expensive_function_p (count);
12997 frame->save_regs_using_mov
12998 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12999 /* If static stack checking is enabled and done with probes,
13000 the registers need to be saved before allocating the frame. */
13001 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
13003 /* Skip return address and error code in exception handler. */
13004 offset = INCOMING_FRAME_SP_OFFSET;
13006 /* Skip pushed static chain. */
13007 if (ix86_static_chain_on_stack)
13008 offset += UNITS_PER_WORD;
13010 /* Skip saved base pointer. */
13011 if (frame_pointer_needed)
13012 offset += UNITS_PER_WORD;
13013 frame->hfp_save_offset = offset;
13015 /* The traditional frame pointer location is at the top of the frame. */
13016 frame->hard_frame_pointer_offset = offset;
13018 /* Register save area */
13019 offset += frame->nregs * UNITS_PER_WORD;
13020 frame->reg_save_offset = offset;
13022 /* On SEH target, registers are pushed just before the frame pointer
13023 location. */
13024 if (TARGET_SEH)
13025 frame->hard_frame_pointer_offset = offset;
13027 /* Calculate the size of the va-arg area (not including padding, if any). */
13028 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
13030 if (stack_realign_fp)
13032 /* We may need a 16-byte aligned stack for the remainder of the
13033 register save area, but the stack frame for the local function
13034 may require a greater alignment if using AVX, AVX2 or AVX-512. In order
13035 to avoid wasting space, we first calculate the space needed for
13036 the rest of the register saves, add that to the stack pointer,
13037 and then realign the stack to the boundary of the start of the
13038 frame for the local function. */
13039 HOST_WIDE_INT space_needed = 0;
13040 HOST_WIDE_INT sse_reg_space_needed = 0;
13042 if (TARGET_64BIT)
13044 if (m->call_ms2sysv)
13046 m->call_ms2sysv_pad_in = 0;
13047 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
13050 else if (frame->nsseregs)
13051 /* The only ABI that has saved SSE registers (Win64) also has a
13052 16-byte aligned default stack. However, many programs violate
13053 the ABI, and Wine64 forces stack realignment to compensate. */
13054 space_needed = frame->nsseregs * 16;
13056 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
13058 /* In 64-bit mode frame->va_arg_size should always be a multiple of 16,
13059 but round up anyway to be pedantic. */
13060 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
13062 else
13063 space_needed = frame->va_arg_size;
13065 /* Record the allocation size required prior to the realignment AND. */
13066 frame->stack_realign_allocate = space_needed;
13068 /* The re-aligned stack starts at frame->stack_realign_offset. Values
13069 before this point are not directly comparable with values below
13070 this point. Use sp_valid_at to determine if the stack pointer is
13071 valid for a given offset, fp_valid_at for the frame pointer, or
13072 choose_baseaddr to have a base register chosen for you.
13074 Note that the result of (frame->stack_realign_offset
13075 & (stack_alignment_needed - 1)) may not equal zero. */
13076 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
13077 frame->stack_realign_offset = offset - space_needed;
13078 frame->sse_reg_save_offset = frame->stack_realign_offset
13079 + sse_reg_space_needed;
13081 else
13083 frame->stack_realign_offset = offset;
13085 if (TARGET_64BIT && m->call_ms2sysv)
13087 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
13088 offset += xlogue_layout::get_instance ().get_stack_space_used ();
13091 /* Align and set SSE register save area. */
13092 else if (frame->nsseregs)
13094 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
13095 required and the DRAP re-alignment boundary is at least 16 bytes,
13096 then we want the SSE register save area properly aligned. */
13097 if (ix86_incoming_stack_boundary >= 128
13098 || (stack_realign_drap && stack_alignment_needed >= 16))
13099 offset = ROUND_UP (offset, 16);
13100 offset += frame->nsseregs * 16;
13102 frame->sse_reg_save_offset = offset;
13103 offset += frame->va_arg_size;
13106 /* Align start of frame for local function. */
13107 if (m->call_ms2sysv
13108 || frame->va_arg_size != 0
13109 || size != 0
13110 || !crtl->is_leaf
13111 || cfun->calls_alloca
13112 || ix86_current_function_calls_tls_descriptor)
13113 offset = ROUND_UP (offset, stack_alignment_needed);
13115 /* Frame pointer points here. */
13116 frame->frame_pointer_offset = offset;
13118 offset += size;
13120 /* Add outgoing arguments area. Can be skipped if we eliminated
13121 all the function calls as dead code.
13122 Skipping is, however, impossible when the function calls alloca; the
13123 alloca expander assumes that the last crtl->outgoing_args_size
13124 bytes of the stack frame are unused. */
13125 if (ACCUMULATE_OUTGOING_ARGS
13126 && (!crtl->is_leaf || cfun->calls_alloca
13127 || ix86_current_function_calls_tls_descriptor))
13129 offset += crtl->outgoing_args_size;
13130 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13132 else
13133 frame->outgoing_arguments_size = 0;
13135 /* Align stack boundary. Only needed if we're calling another function
13136 or using alloca. */
13137 if (!crtl->is_leaf || cfun->calls_alloca
13138 || ix86_current_function_calls_tls_descriptor)
13139 offset = ROUND_UP (offset, preferred_alignment);
13141 /* We've reached end of stack frame. */
13142 frame->stack_pointer_offset = offset;
13144 /* Size prologue needs to allocate. */
13145 to_allocate = offset - frame->sse_reg_save_offset;
13147 if ((!to_allocate && frame->nregs <= 1)
13148 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13149 frame->save_regs_using_mov = false;
13151 if (ix86_using_red_zone ()
13152 && crtl->sp_is_unchanging
13153 && crtl->is_leaf
13154 && !ix86_pc_thunk_call_expanded
13155 && !ix86_current_function_calls_tls_descriptor)
13157 frame->red_zone_size = to_allocate;
13158 if (frame->save_regs_using_mov)
13159 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13160 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13161 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13163 else
13164 frame->red_zone_size = 0;
13165 frame->stack_pointer_offset -= frame->red_zone_size;
13167 /* The SEH frame pointer location is near the bottom of the frame.
13168 This is enforced by the fact that the difference between the
13169 stack pointer and the frame pointer is limited to 240 bytes in
13170 the unwind data structure. */
13171 if (TARGET_SEH)
13173 HOST_WIDE_INT diff;
13175 /* If we can leave the frame pointer where it is, do so. This also
13176 gives the establisher frame for __builtin_frame_address (0). */
13177 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13178 if (diff <= SEH_MAX_FRAME_SIZE
13179 && (diff > 240 || (diff & 15) != 0)
13180 && !crtl->accesses_prior_frames)
13182 /* Ideally we'd determine what portion of the local stack frame
13183 (within the constraint of the lowest 240) is most heavily used.
13184 But without that complication, simply bias the frame pointer
13185 by 128 bytes so as to maximize the amount of the local stack
13186 frame that is addressable with 8-bit offsets. */
13187 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
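/* A rough picture of the layout computed above, from higher to lower
   addresses (bracketed areas may be absent; exact offsets vary per
   function and target):

     return address [+ error code]   <- INCOMING_FRAME_SP_OFFSET
     [pushed static chain]
     [saved frame pointer]           <- hard_frame_pointer_offset
     GP register save area           <- reg_save_offset
     [SSE register save area]        <- sse_reg_save_offset
     [va_arg register save area]
     local variables                 <- frame_pointer_offset
     [outgoing argument area]
     end of frame                    <- stack_pointer_offset

   All offsets are measured downward from the CFA.  */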
13192 /* This is semi-inlined memory_address_length, but simplified
13193 since we know that we're always dealing with reg+offset, and
13194 to avoid having to create and discard all that rtl. */
13196 static inline int
13197 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13199 int len = 4;
13201 if (offset == 0)
13203 /* EBP and R13 cannot be encoded without an offset. */
13204 len = (regno == BP_REG || regno == R13_REG);
13206 else if (IN_RANGE (offset, -128, 127))
13207 len = 1;
13209 /* ESP and R12 must be encoded with a SIB byte. */
13210 if (regno == SP_REG || regno == R12_REG)
13211 len++;
13213 return len;
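/* A few concrete cases of the length computed above: 0(%ebp) and 0(%r13)
   still need a one-byte displacement, so len is 1; 8(%esp) needs a disp8
   plus a SIB byte, so len is 2; 512(%eax) needs a four-byte displacement,
   so len is 4.  */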
13216 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
13217 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13219 static bool
13220 sp_valid_at (HOST_WIDE_INT cfa_offset)
13222 const struct machine_frame_state &fs = cfun->machine->fs;
13223 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
13225 /* Validate that the cfa_offset isn't in a "no-man's land". */
13226 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
13227 return false;
13229 return fs.sp_valid;
13232 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
13233 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13235 static inline bool
13236 fp_valid_at (HOST_WIDE_INT cfa_offset)
13238 const struct machine_frame_state &fs = cfun->machine->fs;
13239 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
13241 /* Validate that the cfa_offset isn't in a "no-man's land". */
13242 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
13243 return false;
13245 return fs.fp_valid;
13248 /* Choose a base register based upon alignment requested, speed and/or
13249 size. */
13251 static void
13252 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13253 HOST_WIDE_INT &base_offset,
13254 unsigned int align_requested, unsigned int *align)
13256 const struct machine_function *m = cfun->machine;
13257 unsigned int hfp_align;
13258 unsigned int drap_align;
13259 unsigned int sp_align;
13260 bool hfp_ok = fp_valid_at (cfa_offset);
13261 bool drap_ok = m->fs.drap_valid;
13262 bool sp_ok = sp_valid_at (cfa_offset);
13264 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13266 /* Filter out any registers that don't meet the requested alignment
13267 criteria. */
13268 if (align_requested)
13270 if (m->fs.realigned)
13271 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13272 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
13273 notes (which we would need in order to use a realigned stack pointer),
13274 so disable this on SEH targets. */
13275 else if (m->fs.sp_realigned)
13276 sp_align = crtl->stack_alignment_needed;
13278 hfp_ok = hfp_ok && hfp_align >= align_requested;
13279 drap_ok = drap_ok && drap_align >= align_requested;
13280 sp_ok = sp_ok && sp_align >= align_requested;
13283 if (m->use_fast_prologue_epilogue)
13285 /* Choose the base register most likely to allow the most scheduling
13286 opportunities. Generally FP is valid throughout the function,
13287 while DRAP must be reloaded within the epilogue. But choose either
13288 over the SP due to increased encoding size. */
13290 if (hfp_ok)
13292 base_reg = hard_frame_pointer_rtx;
13293 base_offset = m->fs.fp_offset - cfa_offset;
13295 else if (drap_ok)
13297 base_reg = crtl->drap_reg;
13298 base_offset = 0 - cfa_offset;
13300 else if (sp_ok)
13302 base_reg = stack_pointer_rtx;
13303 base_offset = m->fs.sp_offset - cfa_offset;
13306 else
13308 HOST_WIDE_INT toffset;
13309 int len = 16, tlen;
13311 /* Choose the base register with the smallest address encoding.
13312 With a tie, choose FP > DRAP > SP. */
13313 if (sp_ok)
13315 base_reg = stack_pointer_rtx;
13316 base_offset = m->fs.sp_offset - cfa_offset;
13317 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13319 if (drap_ok)
13321 toffset = 0 - cfa_offset;
13322 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13323 if (tlen <= len)
13325 base_reg = crtl->drap_reg;
13326 base_offset = toffset;
13327 len = tlen;
13330 if (hfp_ok)
13332 toffset = m->fs.fp_offset - cfa_offset;
13333 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13334 if (tlen <= len)
13336 base_reg = hard_frame_pointer_rtx;
13337 base_offset = toffset;
13338 len = tlen;
13343 /* Set the align return value. */
13344 if (align)
13346 if (base_reg == stack_pointer_rtx)
13347 *align = sp_align;
13348 else if (base_reg == crtl->drap_reg)
13349 *align = drap_align;
13350 else if (base_reg == hard_frame_pointer_rtx)
13351 *align = hfp_align;
13355 /* Return an RTX that points to CFA_OFFSET within the stack frame. If
13356 ALIGN is non-null, it should point to an alignment value (in bits)
13357 that is preferred or zero, and it will receive the alignment of the
13358 base register that was selected, irrespective of whether or not
13359 CFA_OFFSET is a multiple of that alignment value.
13362 The valid base registers are taken from CFUN->MACHINE->FS. */
13364 static rtx
13365 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13367 rtx base_reg = NULL;
13368 HOST_WIDE_INT base_offset = 0;
13370 /* If a specific alignment is requested, try to get a base register
13371 with that alignment first. */
13372 if (align && *align)
13373 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13375 if (!base_reg)
13376 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13378 gcc_assert (base_reg != NULL);
13379 return plus_constant (Pmode, base_reg, base_offset);
13382 /* Emit code to save registers in the prologue. */
13384 static void
13385 ix86_emit_save_regs (void)
13387 unsigned int regno;
13388 rtx_insn *insn;
13390 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13391 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13393 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13394 RTX_FRAME_RELATED_P (insn) = 1;
13398 /* Emit a single register save at CFA - CFA_OFFSET. */
13400 static void
13401 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13402 HOST_WIDE_INT cfa_offset)
13404 struct machine_function *m = cfun->machine;
13405 rtx reg = gen_rtx_REG (mode, regno);
13406 rtx mem, addr, base, insn;
13407 unsigned int align = GET_MODE_ALIGNMENT (mode);
13409 addr = choose_baseaddr (cfa_offset, &align);
13410 mem = gen_frame_mem (mode, addr);
13412 /* The location alignment depends upon the base register. */
13413 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13414 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13415 set_mem_align (mem, align);
13417 insn = emit_insn (gen_rtx_SET (mem, reg));
13418 RTX_FRAME_RELATED_P (insn) = 1;
13420 base = addr;
13421 if (GET_CODE (base) == PLUS)
13422 base = XEXP (base, 0);
13423 gcc_checking_assert (REG_P (base));
13425 /* When saving registers into a re-aligned local stack frame, avoid
13426 any tricky guessing by dwarf2out. */
13427 if (m->fs.realigned)
13429 gcc_checking_assert (stack_realign_drap);
13431 if (regno == REGNO (crtl->drap_reg))
13433 /* A bit of a hack. We force the DRAP register to be saved in
13434 the re-aligned stack frame, which provides us with a copy
13435 of the CFA that will last past the prologue. Install it. */
13436 gcc_checking_assert (cfun->machine->fs.fp_valid);
13437 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13438 cfun->machine->fs.fp_offset - cfa_offset);
13439 mem = gen_rtx_MEM (mode, addr);
13440 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13442 else
13444 /* The frame pointer is a stable reference within the
13445 aligned frame. Use it. */
13446 gcc_checking_assert (cfun->machine->fs.fp_valid);
13447 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13448 cfun->machine->fs.fp_offset - cfa_offset);
13449 mem = gen_rtx_MEM (mode, addr);
13450 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13454 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13455 && cfa_offset >= m->fs.sp_realigned_offset)
13457 gcc_checking_assert (stack_realign_fp);
13458 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13461 /* The memory may not be relative to the current CFA register,
13462 which means that we may need to generate a new pattern for
13463 use by the unwind info. */
13464 else if (base != m->fs.cfa_reg)
13466 addr = plus_constant (Pmode, m->fs.cfa_reg,
13467 m->fs.cfa_offset - cfa_offset);
13468 mem = gen_rtx_MEM (mode, addr);
13469 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13473 /* Emit code to save registers using MOV insns.
13474 First register is stored at CFA - CFA_OFFSET. */
13475 static void
13476 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13478 unsigned int regno;
13480 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13481 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13483 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13484 cfa_offset -= UNITS_PER_WORD;
13488 /* Emit code to save SSE registers using MOV insns.
13489 First register is stored at CFA - CFA_OFFSET. */
13490 static void
13491 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13493 unsigned int regno;
13495 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13496 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13498 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13499 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13503 static GTY(()) rtx queued_cfa_restores;
13505 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
13506 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13507 Don't add the note if the previously saved value will be left untouched
13508 within stack red-zone till return, as unwinders can find the same value
13509 in the register and on the stack. */
13511 static void
13512 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13514 if (!crtl->shrink_wrapped
13515 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13516 return;
13518 if (insn)
13520 add_reg_note (insn, REG_CFA_RESTORE, reg);
13521 RTX_FRAME_RELATED_P (insn) = 1;
13523 else
13524 queued_cfa_restores
13525 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13528 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13530 static void
13531 ix86_add_queued_cfa_restore_notes (rtx insn)
13533 rtx last;
13534 if (!queued_cfa_restores)
13535 return;
13536 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13538 XEXP (last, 1) = REG_NOTES (insn);
13539 REG_NOTES (insn) = queued_cfa_restores;
13540 queued_cfa_restores = NULL_RTX;
13541 RTX_FRAME_RELATED_P (insn) = 1;
13544 /* Expand prologue or epilogue stack adjustment.
13545 The pattern exists to put a dependency on all ebp-based memory accesses.
13546 STYLE should be negative if instructions should be marked as frame related,
13547 zero if %r11 register is live and cannot be freely used and positive
13548 otherwise. */
13550 static void
13551 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13552 int style, bool set_cfa)
13554 struct machine_function *m = cfun->machine;
13555 rtx insn;
13556 bool add_frame_related_expr = false;
13558 if (Pmode == SImode)
13559 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13560 else if (x86_64_immediate_operand (offset, DImode))
13561 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13562 else
13564 rtx tmp;
13565 /* r11 is used by indirect sibcall return as well, set before the
13566 epilogue and used after the epilogue. */
13567 if (style)
13568 tmp = gen_rtx_REG (DImode, R11_REG);
13569 else
13571 gcc_assert (src != hard_frame_pointer_rtx
13572 && dest != hard_frame_pointer_rtx);
13573 tmp = hard_frame_pointer_rtx;
13575 insn = emit_insn (gen_rtx_SET (tmp, offset));
13576 if (style < 0)
13577 add_frame_related_expr = true;
13579 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13582 insn = emit_insn (insn);
13583 if (style >= 0)
13584 ix86_add_queued_cfa_restore_notes (insn);
13586 if (set_cfa)
13588 rtx r;
13590 gcc_assert (m->fs.cfa_reg == src);
13591 m->fs.cfa_offset += INTVAL (offset);
13592 m->fs.cfa_reg = dest;
13594 r = gen_rtx_PLUS (Pmode, src, offset);
13595 r = gen_rtx_SET (dest, r);
13596 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13597 RTX_FRAME_RELATED_P (insn) = 1;
13599 else if (style < 0)
13601 RTX_FRAME_RELATED_P (insn) = 1;
13602 if (add_frame_related_expr)
13604 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13605 r = gen_rtx_SET (dest, r);
13606 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13610 if (dest == stack_pointer_rtx)
13612 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13613 bool valid = m->fs.sp_valid;
13614 bool realigned = m->fs.sp_realigned;
13616 if (src == hard_frame_pointer_rtx)
13618 valid = m->fs.fp_valid;
13619 realigned = false;
13620 ooffset = m->fs.fp_offset;
13622 else if (src == crtl->drap_reg)
13624 valid = m->fs.drap_valid;
13625 realigned = false;
13626 ooffset = 0;
13628 else
13630 /* Else there are two possibilities: SP itself, which we set
13631 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
13632 taken care of by hand along the eh_return path. */
13633 gcc_checking_assert (src == stack_pointer_rtx
13634 || offset == const0_rtx);
13637 m->fs.sp_offset = ooffset - INTVAL (offset);
13638 m->fs.sp_valid = valid;
13639 m->fs.sp_realigned = realigned;
13643 /* Find an available register to be used as the dynamic realign argument
13644 pointer register. Such a register will be written in the prologue and
13645 used at the beginning of the body, so it must not be
13646 1. a parameter passing register.
13647 2. the GOT pointer.
13648 We reuse the static-chain register if it is available. Otherwise, we
13649 use DI for i386 and R13 for x86-64. We chose R13 since it has a
13650 shorter encoding.
13652 Return: the regno of the chosen register. */
13654 static unsigned int
13655 find_drap_reg (void)
13657 tree decl = cfun->decl;
13659 /* Always use callee-saved register if there are no caller-saved
13660 registers. */
13661 if (TARGET_64BIT)
13663 /* Use R13 for a nested function or a function that needs a static chain.
13664 Since a function with a tail call may use any caller-saved
13665 register in the epilogue, DRAP must not use a caller-saved
13666 register in that case. */
13667 if (DECL_STATIC_CHAIN (decl)
13668 || cfun->machine->no_caller_saved_registers
13669 || crtl->tail_call_emit)
13670 return R13_REG;
13672 return R10_REG;
13674 else
13676 /* Use DI for a nested function or a function that needs a static chain.
13677 Since a function with a tail call may use any caller-saved
13678 register in the epilogue, DRAP must not use a caller-saved
13679 register in that case. */
13680 if (DECL_STATIC_CHAIN (decl)
13681 || cfun->machine->no_caller_saved_registers
13682 || crtl->tail_call_emit)
13683 return DI_REG;
13685 /* Reuse the static chain register if it isn't used for parameter
13686 passing. */
13687 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13689 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13690 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13691 return CX_REG;
13693 return DI_REG;
13697 /* Handle a "force_align_arg_pointer" attribute. */
13699 static tree
13700 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13701 tree, int, bool *no_add_attrs)
13703 if (TREE_CODE (*node) != FUNCTION_TYPE
13704 && TREE_CODE (*node) != METHOD_TYPE
13705 && TREE_CODE (*node) != FIELD_DECL
13706 && TREE_CODE (*node) != TYPE_DECL)
13708 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13709 name);
13710 *no_add_attrs = true;
13713 return NULL_TREE;
13716 /* Return minimum incoming stack alignment. */
13718 static unsigned int
13719 ix86_minimum_incoming_stack_boundary (bool sibcall)
13721 unsigned int incoming_stack_boundary;
13723 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
13724 if (cfun->machine->func_type != TYPE_NORMAL)
13725 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13726 /* Prefer the one specified at command line. */
13727 else if (ix86_user_incoming_stack_boundary)
13728 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13729 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13730 if -mstackrealign is used, this is not a sibcall check, and the
13731 estimated stack alignment is 128 bits. */
13732 else if (!sibcall
13733 && ix86_force_align_arg_pointer
13734 && crtl->stack_alignment_estimated == 128)
13735 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13736 else
13737 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13739 /* Incoming stack alignment can be changed on individual functions
13740 via force_align_arg_pointer attribute. We use the smallest
13741 incoming stack boundary. */
13742 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13743 && lookup_attribute (ix86_force_align_arg_pointer_string,
13744 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13745 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13747 /* The incoming stack frame has to be aligned at least at
13748 parm_stack_boundary. */
13749 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13750 incoming_stack_boundary = crtl->parm_stack_boundary;
13752 /* The stack at the entry of main is aligned by the runtime. We use the
13753 smallest incoming stack boundary. */
13754 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13755 && DECL_NAME (current_function_decl)
13756 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13757 && DECL_FILE_SCOPE_P (current_function_decl))
13758 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13760 return incoming_stack_boundary;
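/* For example, a callback that may be entered from legacy code with only
   word alignment of the stack can be declared roughly as

     __attribute__ ((force_align_arg_pointer))
     void legacy_callback (void);

   (the name is only illustrative); the attribute lookup above then lowers
   the assumed incoming boundary to MIN_STACK_BOUNDARY.  */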
13763 /* Update incoming stack boundary and estimated stack alignment. */
13765 static void
13766 ix86_update_stack_boundary (void)
13768 ix86_incoming_stack_boundary
13769 = ix86_minimum_incoming_stack_boundary (false);
13771 /* x86_64 varargs need 16-byte stack alignment for the register save
13772 area. */
13773 if (TARGET_64BIT
13774 && cfun->stdarg
13775 && crtl->stack_alignment_estimated < 128)
13776 crtl->stack_alignment_estimated = 128;
13778 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13779 if (ix86_tls_descriptor_calls_expanded_in_cfun
13780 && crtl->preferred_stack_boundary < 128)
13781 crtl->preferred_stack_boundary = 128;
13784 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13785 needed or an rtx for DRAP otherwise. */
13787 static rtx
13788 ix86_get_drap_rtx (void)
13790 /* We must use DRAP if there are outgoing arguments on stack and
13791 ACCUMULATE_OUTGOING_ARGS is false. */
13792 if (ix86_force_drap
13793 || (cfun->machine->outgoing_args_on_stack
13794 && !ACCUMULATE_OUTGOING_ARGS))
13795 crtl->need_drap = true;
13797 if (stack_realign_drap)
13799 /* Assign DRAP to vDRAP and return vDRAP. */
13800 unsigned int regno = find_drap_reg ();
13801 rtx drap_vreg;
13802 rtx arg_ptr;
13803 rtx_insn *seq, *insn;
13805 arg_ptr = gen_rtx_REG (Pmode, regno);
13806 crtl->drap_reg = arg_ptr;
13808 start_sequence ();
13809 drap_vreg = copy_to_reg (arg_ptr);
13810 seq = get_insns ();
13811 end_sequence ();
13813 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13814 if (!optimize)
13816 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13817 RTX_FRAME_RELATED_P (insn) = 1;
13819 return drap_vreg;
13821 else
13822 return NULL;
13825 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13827 static rtx
13828 ix86_internal_arg_pointer (void)
13830 return virtual_incoming_args_rtx;
13833 struct scratch_reg {
13834 rtx reg;
13835 bool saved;
13838 /* Return a short-lived scratch register for use on function entry.
13839 In 32-bit mode, it is valid only after the registers are saved
13840 in the prologue. This register must be released by means of
13841 release_scratch_register_on_entry once it is dead. */
13843 static void
13844 get_scratch_register_on_entry (struct scratch_reg *sr)
13846 int regno;
13848 sr->saved = false;
13850 if (TARGET_64BIT)
13852 /* We always use R11 in 64-bit mode. */
13853 regno = R11_REG;
13855 else
13857 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13858 bool fastcall_p
13859 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13860 bool thiscall_p
13861 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13862 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13863 int regparm = ix86_function_regparm (fntype, decl);
13864 int drap_regno
13865 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13867 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13868 for the static chain register. */
13869 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13870 && drap_regno != AX_REG)
13871 regno = AX_REG;
13872 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13873 for the static chain register. */
13874 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13875 regno = AX_REG;
13876 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13877 regno = DX_REG;
13878 /* ecx is the static chain register. */
13879 else if (regparm < 3 && !fastcall_p && !thiscall_p
13880 && !static_chain_p
13881 && drap_regno != CX_REG)
13882 regno = CX_REG;
13883 else if (ix86_save_reg (BX_REG, true, false))
13884 regno = BX_REG;
13885 /* esi is the static chain register. */
13886 else if (!(regparm == 3 && static_chain_p)
13887 && ix86_save_reg (SI_REG, true, false))
13888 regno = SI_REG;
13889 else if (ix86_save_reg (DI_REG, true, false))
13890 regno = DI_REG;
13891 else
13893 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13894 sr->saved = true;
13898 sr->reg = gen_rtx_REG (Pmode, regno);
13899 if (sr->saved)
13901 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13902 RTX_FRAME_RELATED_P (insn) = 1;
13906 /* Release a scratch register obtained from the preceding function. */
13908 static void
13909 release_scratch_register_on_entry (struct scratch_reg *sr)
13911 if (sr->saved)
13913 struct machine_function *m = cfun->machine;
13914 rtx x, insn = emit_insn (gen_pop (sr->reg));
13916 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13917 RTX_FRAME_RELATED_P (insn) = 1;
13918 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13919 x = gen_rtx_SET (stack_pointer_rtx, x);
13920 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13921 m->fs.sp_offset -= UNITS_PER_WORD;
13925 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13927 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13929 static void
13930 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13932 /* We skip the probe for the first interval + a small dope of 4 words and
13933 probe that many bytes past the specified size to maintain a protection
13934 area at the bottom of the stack. */
13935 const int dope = 4 * UNITS_PER_WORD;
13936 rtx size_rtx = GEN_INT (size), last;
13938 /* See if we have a constant small number of probes to generate. If so,
13939 that's the easy case. The run-time loop is made up of 9 insns in the
13940 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13941 for n # of intervals. */
13942 if (size <= 4 * PROBE_INTERVAL)
13944 HOST_WIDE_INT i, adjust;
13945 bool first_probe = true;
13947 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13948 values of N from 1 until it exceeds SIZE. If only one probe is
13949 needed, this will not generate any code. Then adjust and probe
13950 to PROBE_INTERVAL + SIZE. */
13951 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13953 if (first_probe)
13955 adjust = 2 * PROBE_INTERVAL + dope;
13956 first_probe = false;
13958 else
13959 adjust = PROBE_INTERVAL;
13961 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13962 plus_constant (Pmode, stack_pointer_rtx,
13963 -adjust)));
13964 emit_stack_probe (stack_pointer_rtx);
13967 if (first_probe)
13968 adjust = size + PROBE_INTERVAL + dope;
13969 else
13970 adjust = size + PROBE_INTERVAL - i;
13972 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13973 plus_constant (Pmode, stack_pointer_rtx,
13974 -adjust)));
13975 emit_stack_probe (stack_pointer_rtx);
13977 /* Adjust back to account for the additional first interval. */
13978 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13979 plus_constant (Pmode, stack_pointer_rtx,
13980 PROBE_INTERVAL + dope)));
13983 /* Otherwise, do the same as above, but in a loop. Note that we must be
13984 extra careful with variables wrapping around because we might be at
13985 the very top (or the very bottom) of the address space and we have
13986 to be able to handle this case properly; in particular, we use an
13987 equality test for the loop condition. */
13988 else
13990 HOST_WIDE_INT rounded_size;
13991 struct scratch_reg sr;
13993 get_scratch_register_on_entry (&sr);
13996 /* Step 1: round SIZE to the previous multiple of the interval. */
13998 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14001 /* Step 2: compute initial and final value of the loop counter. */
14003 /* SP = SP_0 + PROBE_INTERVAL. */
14004 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14005 plus_constant (Pmode, stack_pointer_rtx,
14006 - (PROBE_INTERVAL + dope))));
14008 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
14009 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
14010 emit_insn (gen_rtx_SET (sr.reg,
14011 plus_constant (Pmode, stack_pointer_rtx,
14012 -rounded_size)));
14013 else
14015 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
14016 emit_insn (gen_rtx_SET (sr.reg,
14017 gen_rtx_PLUS (Pmode, sr.reg,
14018 stack_pointer_rtx)));
14022 /* Step 3: the loop
14026 SP = SP + PROBE_INTERVAL
14027 probe at SP
14029 while (SP != LAST_ADDR)
14031 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
14032 values of N from 1 until it is equal to ROUNDED_SIZE. */
14034 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
14037 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
14038 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
14040 if (size != rounded_size)
14042 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14043 plus_constant (Pmode, stack_pointer_rtx,
14044 rounded_size - size)));
14045 emit_stack_probe (stack_pointer_rtx);
14048 /* Adjust back to account for the additional first interval. */
14049 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
14050 plus_constant (Pmode, stack_pointer_rtx,
14051 PROBE_INTERVAL + dope)));
14053 release_scratch_register_on_entry (&sr);
14056 /* Even if the stack pointer isn't the CFA register, we need to correctly
14057 describe the adjustments made to it, in particular differentiate the
14058 frame-related ones from the frame-unrelated ones. */
14059 if (size > 0)
14061 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
14062 XVECEXP (expr, 0, 0)
14063 = gen_rtx_SET (stack_pointer_rtx,
14064 plus_constant (Pmode, stack_pointer_rtx, -size));
14065 XVECEXP (expr, 0, 1)
14066 = gen_rtx_SET (stack_pointer_rtx,
14067 plus_constant (Pmode, stack_pointer_rtx,
14068 PROBE_INTERVAL + dope + size));
14069 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
14070 RTX_FRAME_RELATED_P (last) = 1;
14072 cfun->machine->fs.sp_offset += size;
14075 /* Make sure nothing is scheduled before we are done. */
14076 emit_insn (gen_blockage ());
14081 /* Output assembly to adjust the stack pointer to REG while probing it. */
14081 const char *
14082 output_adjust_stack_and_probe (rtx reg)
14084 static int labelno = 0;
14085 char loop_lab[32];
14086 rtx xops[2];
14088 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14090 /* Loop. */
14091 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14093 /* SP = SP + PROBE_INTERVAL. */
14094 xops[0] = stack_pointer_rtx;
14095 xops[1] = GEN_INT (PROBE_INTERVAL);
14096 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14098 /* Probe at SP. */
14099 xops[1] = const0_rtx;
14100 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
14102 /* Test if SP == LAST_ADDR. */
14103 xops[0] = stack_pointer_rtx;
14104 xops[1] = reg;
14105 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14107 /* Branch. */
14108 fputs ("\tjne\t", asm_out_file);
14109 assemble_name_raw (asm_out_file, loop_lab);
14110 fputc ('\n', asm_out_file);
14112 return "";
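/* With the usual 4096-byte probe interval, the loop printed above comes
   out roughly as (assuming %eax holds LAST_ADDR):

     .LPSRL0:
             subl    $4096, %esp
             orl     $0, (%esp)
             cmpl    %eax, %esp
             jne     .LPSRL0
*/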
14115 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
14116 inclusive. These are offsets from the current stack pointer. */
14118 static void
14119 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
14121 /* See if we have a constant small number of probes to generate. If so,
14122 that's the easy case. The run-time loop is made up of 6 insns in the
14123 generic case while the compile-time loop is made up of n insns for n #
14124 of intervals. */
14125 if (size <= 6 * PROBE_INTERVAL)
14127 HOST_WIDE_INT i;
14129 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
14130 it exceeds SIZE. If only one probe is needed, this will not
14131 generate any code. Then probe at FIRST + SIZE. */
14132 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14133 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14134 -(first + i)));
14136 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14137 -(first + size)));
14140 /* Otherwise, do the same as above, but in a loop. Note that we must be
14141 extra careful with variables wrapping around because we might be at
14142 the very top (or the very bottom) of the address space and we have
14143 to be able to handle this case properly; in particular, we use an
14144 equality test for the loop condition. */
14145 else
14147 HOST_WIDE_INT rounded_size, last;
14148 struct scratch_reg sr;
14150 get_scratch_register_on_entry (&sr);
14153 /* Step 1: round SIZE to the previous multiple of the interval. */
14155 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14158 /* Step 2: compute initial and final value of the loop counter. */
14160 /* TEST_OFFSET = FIRST. */
14161 emit_move_insn (sr.reg, GEN_INT (-first));
14163 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14164 last = first + rounded_size;
14167 /* Step 3: the loop
14171 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14172 probe at TEST_ADDR
14174 while (TEST_ADDR != LAST_ADDR)
14176 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14177 until it is equal to ROUNDED_SIZE. */
14179 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14182 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14183 that SIZE is equal to ROUNDED_SIZE. */
14185 if (size != rounded_size)
14186 emit_stack_probe (plus_constant (Pmode,
14187 gen_rtx_PLUS (Pmode,
14188 stack_pointer_rtx,
14189 sr.reg),
14190 rounded_size - size));
14192 release_scratch_register_on_entry (&sr);
14195 /* Make sure nothing is scheduled before we are done. */
14196 emit_insn (gen_blockage ());
14199 /* Probe a range of stack addresses from REG to END, inclusive. These are
14200 offsets from the current stack pointer. */
14202 const char *
14203 output_probe_stack_range (rtx reg, rtx end)
14205 static int labelno = 0;
14206 char loop_lab[32];
14207 rtx xops[3];
14209 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14211 /* Loop. */
14212 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14214 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14215 xops[0] = reg;
14216 xops[1] = GEN_INT (PROBE_INTERVAL);
14217 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14219 /* Probe at TEST_ADDR. */
14220 xops[0] = stack_pointer_rtx;
14221 xops[1] = reg;
14222 xops[2] = const0_rtx;
14223 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14225 /* Test if TEST_ADDR == LAST_ADDR. */
14226 xops[0] = reg;
14227 xops[1] = end;
14228 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14230 /* Branch. */
14231 fputs ("\tjne\t", asm_out_file);
14232 assemble_name_raw (asm_out_file, loop_lab);
14233 fputc ('\n', asm_out_file);
14235 return "";
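/* Likewise, for a constant range this loop typically comes out as
   something like (scratch register %eax, with an end offset of -16384
   chosen only as an example):

     .LPSRL1:
             subl    $4096, %eax
             orl     $0, (%esp,%eax)
             cmpl    $-16384, %eax
             jne     .LPSRL1
*/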
14238 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
14239 will guide prologue/epilogue to be generated in correct form. */
14241 static void
14242 ix86_finalize_stack_frame_flags (void)
14244 /* Check if stack realignment is really needed after reload, and
14245 store the result in cfun. */
14246 unsigned int incoming_stack_boundary
14247 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14248 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14249 unsigned int stack_alignment
14250 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14251 ? crtl->max_used_stack_slot_alignment
14252 : crtl->stack_alignment_needed);
14253 unsigned int stack_realign
14254 = (incoming_stack_boundary < stack_alignment);
14255 bool recompute_frame_layout_p = false;
14257 if (crtl->stack_realign_finalized)
14259 /* After stack_realign_needed is finalized, we can no longer
14260 change it. */
14261 gcc_assert (crtl->stack_realign_needed == stack_realign);
14262 return;
14265 /* If the only reason for frame_pointer_needed is that we conservatively
14266 assumed stack realignment might be needed or -fno-omit-frame-pointer
14267 is used, but in the end nothing that needed the stack alignment was
14268 spilled and the stack was not accessed, clear frame_pointer_needed and
14269 say we don't need stack realignment. */
14270 if ((stack_realign || !flag_omit_frame_pointer)
14271 && frame_pointer_needed
14272 && crtl->is_leaf
14273 && crtl->sp_is_unchanging
14274 && !ix86_current_function_calls_tls_descriptor
14275 && !crtl->accesses_prior_frames
14276 && !cfun->calls_alloca
14277 && !crtl->calls_eh_return
14278 /* See ira_setup_eliminable_regset for the rationale. */
14279 && !(STACK_CHECK_MOVING_SP
14280 && flag_stack_check
14281 && flag_exceptions
14282 && cfun->can_throw_non_call_exceptions)
14283 && !ix86_frame_pointer_required ()
14284 && get_frame_size () == 0
14285 && ix86_nsaved_sseregs () == 0
14286 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14288 HARD_REG_SET set_up_by_prologue, prologue_used;
14289 basic_block bb;
14291 CLEAR_HARD_REG_SET (prologue_used);
14292 CLEAR_HARD_REG_SET (set_up_by_prologue);
14293 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14294 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14295 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14296 HARD_FRAME_POINTER_REGNUM);
14298 /* The preferred stack alignment is the minimum stack alignment. */
14299 if (stack_alignment > crtl->preferred_stack_boundary)
14300 stack_alignment = crtl->preferred_stack_boundary;
14302 bool require_stack_frame = false;
14304 FOR_EACH_BB_FN (bb, cfun)
14306 rtx_insn *insn;
14307 FOR_BB_INSNS (bb, insn)
14308 if (NONDEBUG_INSN_P (insn)
14309 && requires_stack_frame_p (insn, prologue_used,
14310 set_up_by_prologue))
14312 require_stack_frame = true;
14314 if (stack_realign)
14316 /* Find the maximum stack alignment. */
14317 subrtx_iterator::array_type array;
14318 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
14319 if (MEM_P (*iter)
14320 && (reg_mentioned_p (stack_pointer_rtx,
14321 *iter)
14322 || reg_mentioned_p (frame_pointer_rtx,
14323 *iter)))
14325 unsigned int alignment = MEM_ALIGN (*iter);
14326 if (alignment > stack_alignment)
14327 stack_alignment = alignment;
14333 if (require_stack_frame)
14335 /* Stack frame is required. If stack alignment needed is less
14336 than incoming stack boundary, don't realign stack. */
14337 stack_realign = incoming_stack_boundary < stack_alignment;
14338 if (!stack_realign)
14340 crtl->max_used_stack_slot_alignment
14341 = incoming_stack_boundary;
14342 crtl->stack_alignment_needed
14343 = incoming_stack_boundary;
14344 /* Also update preferred_stack_boundary for leaf
14345 functions. */
14346 crtl->preferred_stack_boundary
14347 = incoming_stack_boundary;
14350 else
14352 /* If drap has been set, but it actually isn't live at the
14353 start of the function, there is no reason to set it up. */
14354 if (crtl->drap_reg)
14356 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14357 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
14358 REGNO (crtl->drap_reg)))
14360 crtl->drap_reg = NULL_RTX;
14361 crtl->need_drap = false;
14364 else
14365 cfun->machine->no_drap_save_restore = true;
14367 frame_pointer_needed = false;
14368 stack_realign = false;
14369 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14370 crtl->stack_alignment_needed = incoming_stack_boundary;
14371 crtl->stack_alignment_estimated = incoming_stack_boundary;
14372 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14373 crtl->preferred_stack_boundary = incoming_stack_boundary;
14374 df_finish_pass (true);
14375 df_scan_alloc (NULL);
14376 df_scan_blocks ();
14377 df_compute_regs_ever_live (true);
14378 df_analyze ();
14380 if (flag_var_tracking)
14382 /* Since frame pointer is no longer available, replace it with
14383 stack pointer - UNITS_PER_WORD in debug insns. */
14384 df_ref ref, next;
14385 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
14386 ref; ref = next)
14388 rtx_insn *insn = DF_REF_INSN (ref);
14389 /* Make sure the next ref is for a different instruction,
14390 so that we're not affected by the rescan. */
14391 next = DF_REF_NEXT_REG (ref);
14392 while (next && DF_REF_INSN (next) == insn)
14393 next = DF_REF_NEXT_REG (next);
14395 if (DEBUG_INSN_P (insn))
14397 bool changed = false;
14398 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
14400 rtx *loc = DF_REF_LOC (ref);
14401 if (*loc == hard_frame_pointer_rtx)
14403 *loc = plus_constant (Pmode,
14404 stack_pointer_rtx,
14405 -UNITS_PER_WORD);
14406 changed = true;
14409 if (changed)
14410 df_insn_rescan (insn);
14415 recompute_frame_layout_p = true;
14419 if (crtl->stack_realign_needed != stack_realign)
14420 recompute_frame_layout_p = true;
14421 crtl->stack_realign_needed = stack_realign;
14422 crtl->stack_realign_finalized = true;
14423 if (recompute_frame_layout_p)
14424 ix86_compute_frame_layout ();
14427 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14429 static void
14430 ix86_elim_entry_set_got (rtx reg)
14432 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14433 rtx_insn *c_insn = BB_HEAD (bb);
14434 if (!NONDEBUG_INSN_P (c_insn))
14435 c_insn = next_nonnote_nondebug_insn (c_insn);
14436 if (c_insn && NONJUMP_INSN_P (c_insn))
14438 rtx pat = PATTERN (c_insn);
14439 if (GET_CODE (pat) == PARALLEL)
14441 rtx vec = XVECEXP (pat, 0, 0);
14442 if (GET_CODE (vec) == SET
14443 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14444 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14445 delete_insn (c_insn);
14450 static rtx
14451 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14453 rtx addr, mem;
14455 if (offset)
14456 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14457 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14458 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14461 static inline rtx
14462 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14464 return gen_frame_set (reg, frame_reg, offset, false);
14467 static inline rtx
14468 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14470 return gen_frame_set (reg, frame_reg, offset, true);
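/* For example, with a DImode REG and a base register of RAX,
   gen_frame_store (reg, rax, -16) yields
   (set (mem:DI (plus:DI (reg:DI ax) (const_int -16))) (reg:DI ...)),
   i.e. a store of REG at offset -16 from the base register;
   gen_frame_load builds the mirror-image load.  (Illustrative only.)  */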
14473 static void
14474 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14476 struct machine_function *m = cfun->machine;
14477 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14478 + m->call_ms2sysv_extra_regs;
14479 rtvec v = rtvec_alloc (ncregs + 1);
14480 unsigned int align, i, vi = 0;
14481 rtx_insn *insn;
14482 rtx sym, addr;
14483 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14484 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14485 HOST_WIDE_INT allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14487 /* AL should only be live with sysv_abi. */
14488 gcc_assert (!ix86_eax_live_at_start_p ());
14490 /* Setup RAX as the stub's base pointer. We use stack_realign_offset
14491 regardless of whether we've actually realigned the stack or not. */
14492 align = GET_MODE_ALIGNMENT (V4SFmode);
14493 addr = choose_baseaddr (frame.stack_realign_offset
14494 + xlogue.get_stub_ptr_offset (), &align);
14495 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14496 emit_insn (gen_rtx_SET (rax, addr));
14498 /* Allocate stack if not already done. */
14499 if (allocate > 0)
14500 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14501 GEN_INT (-allocate), -1, false);
14503 /* Get the stub symbol. */
14504 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14505 : XLOGUE_STUB_SAVE);
14506 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14508 for (i = 0; i < ncregs; ++i)
14510 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14511 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14512 r.regno);
14513 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14516 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14518 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14519 RTX_FRAME_RELATED_P (insn) = true;
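/* Informally, the insn emitted above is a single frame-related PARALLEL of
   the form (parallel [(use (symbol_ref "stub")) (set (mem) (reg)) ...]),
   with one SET per saved register addressed relative to RAX.  It is matched
   by a pattern that invokes the out-of-line save stub, and the SETs let the
   unwinder describe the individual saves.  (A sketch, not the exact RTL.)  */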
14522 /* Expand the prologue into a bunch of separate insns. */
14524 void
14525 ix86_expand_prologue (void)
14527 struct machine_function *m = cfun->machine;
14528 rtx insn, t;
14529 struct ix86_frame frame;
14530 HOST_WIDE_INT allocate;
14531 bool int_registers_saved;
14532 bool sse_registers_saved;
14533 rtx static_chain = NULL_RTX;
14535 if (ix86_function_naked (current_function_decl))
14536 return;
14538 ix86_finalize_stack_frame_flags ();
14540 /* DRAP should not coexist with stack_realign_fp */
14541 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14543 memset (&m->fs, 0, sizeof (m->fs));
14545 /* Initialize CFA state for before the prologue. */
14546 m->fs.cfa_reg = stack_pointer_rtx;
14547 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14549 /* Track SP offset to the CFA. We continue tracking this after we've
14550 swapped the CFA register away from SP. In the case of re-alignment
14551 this is fudged; we're interested in offsets within the local frame. */
14552 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14553 m->fs.sp_valid = true;
14554 m->fs.sp_realigned = false;
14556 frame = m->frame;
14558 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14560 /* We should have already generated an error for any use of
14561 ms_hook on a nested function. */
14562 gcc_checking_assert (!ix86_static_chain_on_stack);
14564 /* Check if profiling is active and we shall use the profiling before
14565 prologue variant. If so, issue a sorry. */
14566 if (crtl->profile && flag_fentry != 0)
14567 sorry ("ms_hook_prologue attribute isn%'t compatible "
14568 "with -mfentry for 32-bit");
14570 /* In ix86_asm_output_function_label we emitted:
14571 8b ff movl.s %edi,%edi
14572 55 push %ebp
14573 8b ec movl.s %esp,%ebp
14575 This matches the hookable function prologue in Win32 API
14576 functions in Microsoft Windows XP Service Pack 2 and newer.
14577 Wine uses this to enable Windows apps to hook the Win32 API
14578 functions provided by Wine.
14580 What that means is that we've already set up the frame pointer. */
14582 if (frame_pointer_needed
14583 && !(crtl->drap_reg && crtl->stack_realign_needed))
14585 rtx push, mov;
14587 /* We've decided to use the frame pointer already set up.
14588 Describe this to the unwinder by pretending that both
14589 push and mov insns happen right here.
14591 Putting the unwind info here at the end of the ms_hook
14592 is done so that we can make absolutely certain we get
14593 the required byte sequence at the start of the function,
14594 rather than relying on an assembler that can produce
14595 the exact encoding required.
14597 However it does mean (in the unpatched case) that we have
14598 a 1 insn window where the asynchronous unwind info is
14599 incorrect. However, if we placed the unwind info at
14600 its correct location we would have incorrect unwind info
14601 in the patched case. Which is probably all moot since
14602 I don't expect Wine generates dwarf2 unwind info for the
14603 system libraries that use this feature. */
14605 insn = emit_insn (gen_blockage ());
14607 push = gen_push (hard_frame_pointer_rtx);
14608 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14609 stack_pointer_rtx);
14610 RTX_FRAME_RELATED_P (push) = 1;
14611 RTX_FRAME_RELATED_P (mov) = 1;
14613 RTX_FRAME_RELATED_P (insn) = 1;
14614 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14615 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14617 /* Note that gen_push incremented m->fs.cfa_offset, even
14618 though we didn't emit the push insn here. */
14619 m->fs.cfa_reg = hard_frame_pointer_rtx;
14620 m->fs.fp_offset = m->fs.cfa_offset;
14621 m->fs.fp_valid = true;
14623 else
14625 /* The frame pointer is not needed so pop %ebp again.
14626 This leaves us with a pristine state. */
14627 emit_insn (gen_pop (hard_frame_pointer_rtx));
14631 /* The first insn of a function that accepts its static chain on the
14632 stack is to push the register that would be filled in by a direct
14633 call. This insn will be skipped by the trampoline. */
14634 else if (ix86_static_chain_on_stack)
14636 static_chain = ix86_static_chain (cfun->decl, false);
14637 insn = emit_insn (gen_push (static_chain));
14638 emit_insn (gen_blockage ());
14640 /* We don't want to interpret this push insn as a register save,
14641 only as a stack adjustment. The real copy of the register as
14642 a save will be done later, if needed. */
14643 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14644 t = gen_rtx_SET (stack_pointer_rtx, t);
14645 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14646 RTX_FRAME_RELATED_P (insn) = 1;
14649 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
14650 DRAP is needed and stack realignment is really needed after reload. */
14651 if (stack_realign_drap)
14653 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14655 /* Can't use DRAP in interrupt function. */
14656 if (cfun->machine->func_type != TYPE_NORMAL)
14657 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14658 "in interrupt service routine. This may be worked "
14659 "around by avoiding functions with aggregate return.");
14661 /* Only need to push parameter pointer reg if it is caller saved. */
14662 if (!call_used_regs[REGNO (crtl->drap_reg)])
14664 /* Push arg pointer reg */
14665 insn = emit_insn (gen_push (crtl->drap_reg));
14666 RTX_FRAME_RELATED_P (insn) = 1;
14669 /* Grab the argument pointer. */
14670 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14671 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14672 RTX_FRAME_RELATED_P (insn) = 1;
14673 m->fs.cfa_reg = crtl->drap_reg;
14674 m->fs.cfa_offset = 0;
14676 /* Align the stack. */
14677 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14678 stack_pointer_rtx,
14679 GEN_INT (-align_bytes)));
14680 RTX_FRAME_RELATED_P (insn) = 1;
14682 /* Replicate the return address on the stack so that return
14683 address can be reached via (argp - 1) slot. This is needed
14684 to implement macro RETURN_ADDR_RTX and intrinsic function
14685 expand_builtin_return_addr etc. */
14686 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14687 t = gen_frame_mem (word_mode, t);
14688 insn = emit_insn (gen_push (t));
14689 RTX_FRAME_RELATED_P (insn) = 1;
14691 /* For the purposes of frame and register save area addressing,
14692 we've started over with a new frame. */
14693 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14694 m->fs.realigned = true;
14696 if (static_chain)
14698 /* Replicate static chain on the stack so that static chain
14699 can be reached via (argp - 2) slot. This is needed for
14700 nested function with stack realignment. */
14701 insn = emit_insn (gen_push (static_chain));
14702 RTX_FRAME_RELATED_P (insn) = 1;
14706 int_registers_saved = (frame.nregs == 0);
14707 sse_registers_saved = (frame.nsseregs == 0);
14709 if (frame_pointer_needed && !m->fs.fp_valid)
14711 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14712 slower on all targets. Also sdb doesn't like it. */
14713 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14714 RTX_FRAME_RELATED_P (insn) = 1;
14716 /* Push registers now, before setting the frame pointer
14717 on SEH target. */
14718 if (!int_registers_saved
14719 && TARGET_SEH
14720 && !frame.save_regs_using_mov)
14722 ix86_emit_save_regs ();
14723 int_registers_saved = true;
14724 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14727 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14729 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14730 RTX_FRAME_RELATED_P (insn) = 1;
14732 if (m->fs.cfa_reg == stack_pointer_rtx)
14733 m->fs.cfa_reg = hard_frame_pointer_rtx;
14734 m->fs.fp_offset = m->fs.sp_offset;
14735 m->fs.fp_valid = true;
14739 if (!int_registers_saved)
14741 /* If saving registers via PUSH, do so now. */
14742 if (!frame.save_regs_using_mov)
14744 ix86_emit_save_regs ();
14745 int_registers_saved = true;
14746 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14749 /* When using red zone we may start register saving before allocating
14750 the stack frame saving one cycle of the prologue. However, avoid
14751 doing this if we have to probe the stack; at least on x86_64 the
14752 stack probe can turn into a call that clobbers a red zone location. */
14753 else if (ix86_using_red_zone ()
14754 && (! TARGET_STACK_PROBE
14755 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14757 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14758 int_registers_saved = true;
14762 if (stack_realign_fp)
14764 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14765 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14767 /* Record last valid frame pointer offset. */
14768 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
14770 /* The computation of the size of the re-aligned stack frame means
14771 that we must allocate the size of the register save area before
14772 performing the actual alignment. Otherwise we cannot guarantee
14773 that there's enough storage above the realignment point. */
14774 allocate = frame.reg_save_offset - m->fs.sp_offset
14775 + frame.stack_realign_allocate;
14776 if (allocate)
14777 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14778 GEN_INT (-allocate), -1, false);
14780 /* Align the stack. */
14781 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14782 stack_pointer_rtx,
14783 GEN_INT (-align_bytes)));
14784 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14785 m->fs.sp_realigned_offset = m->fs.sp_offset
14786 - frame.stack_realign_allocate;
14787 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
14788 Beyond this point, stack access should be done via choose_baseaddr or
14789 by using sp_valid_at and fp_valid_at to determine the correct base
14790 register. Henceforth, any CFA offset should be thought of as logical
14791 and not physical. */
14792 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
14793 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
14794 m->fs.sp_realigned = true;
14796 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14797 is needed to describe where a register is saved using a realigned
14798 stack pointer, so we need to invalidate the stack pointer for that
14799 target. */
14800 if (TARGET_SEH)
14801 m->fs.sp_valid = false;
14804 if (m->call_ms2sysv)
14805 ix86_emit_outlined_ms2sysv_save (frame);
14807 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14809 if (flag_stack_usage_info)
14811 /* We start to count from ARG_POINTER. */
14812 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14814 /* If it was realigned, take into account the fake frame. */
14815 if (stack_realign_drap)
14817 if (ix86_static_chain_on_stack)
14818 stack_size += UNITS_PER_WORD;
14820 if (!call_used_regs[REGNO (crtl->drap_reg)])
14821 stack_size += UNITS_PER_WORD;
14823 /* This over-estimates by 1 minimal-stack-alignment-unit but
14824 mitigates that by counting in the new return address slot. */
14825 current_function_dynamic_stack_size
14826 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14829 current_function_static_stack_size = stack_size;
14832 /* On SEH target with very large frame size, allocate an area to save
14833 SSE registers (as the very large allocation won't be described). */
14834 if (TARGET_SEH
14835 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14836 && !sse_registers_saved)
14838 HOST_WIDE_INT sse_size =
14839 frame.sse_reg_save_offset - frame.reg_save_offset;
14841 gcc_assert (int_registers_saved);
14843 /* No need to do stack checking as the area will be immediately
14844 written. */
14845 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14846 GEN_INT (-sse_size), -1,
14847 m->fs.cfa_reg == stack_pointer_rtx);
14848 allocate -= sse_size;
14849 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14850 sse_registers_saved = true;
14853 /* The stack has already been decremented by the instruction calling us
14854 so probe if the size is non-negative to preserve the protection area. */
14855 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14857 /* We expect the GP registers to be saved when probes are used. */
14858 gcc_assert (int_registers_saved);
14860 if (STACK_CHECK_MOVING_SP)
14862 if (!(crtl->is_leaf && !cfun->calls_alloca
14863 && allocate <= PROBE_INTERVAL))
14865 ix86_adjust_stack_and_probe (allocate);
14866 allocate = 0;
14869 else
14871 HOST_WIDE_INT size = allocate;
14873 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14874 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14876 if (TARGET_STACK_PROBE)
14878 if (crtl->is_leaf && !cfun->calls_alloca)
14880 if (size > PROBE_INTERVAL)
14881 ix86_emit_probe_stack_range (0, size);
14883 else
14884 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14886 else
14888 if (crtl->is_leaf && !cfun->calls_alloca)
14890 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14891 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14892 size - STACK_CHECK_PROTECT);
14894 else
14895 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
14900 if (allocate == 0)
14902 else if (!ix86_target_stack_probe ()
14903 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14905 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14906 GEN_INT (-allocate), -1,
14907 m->fs.cfa_reg == stack_pointer_rtx);
14909 else
14911 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14912 rtx r10 = NULL;
14913 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14914 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14915 bool eax_live = ix86_eax_live_at_start_p ();
14916 bool r10_live = false;
14918 if (TARGET_64BIT)
14919 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14921 if (eax_live)
14923 insn = emit_insn (gen_push (eax));
14924 allocate -= UNITS_PER_WORD;
14925 /* Note that SEH directives need to continue tracking the stack
14926 pointer even after the frame pointer has been set up. */
14927 if (sp_is_cfa_reg || TARGET_SEH)
14929 if (sp_is_cfa_reg)
14930 m->fs.cfa_offset += UNITS_PER_WORD;
14931 RTX_FRAME_RELATED_P (insn) = 1;
14932 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14933 gen_rtx_SET (stack_pointer_rtx,
14934 plus_constant (Pmode, stack_pointer_rtx,
14935 -UNITS_PER_WORD)));
14939 if (r10_live)
14941 r10 = gen_rtx_REG (Pmode, R10_REG);
14942 insn = emit_insn (gen_push (r10));
14943 allocate -= UNITS_PER_WORD;
14944 if (sp_is_cfa_reg || TARGET_SEH)
14946 if (sp_is_cfa_reg)
14947 m->fs.cfa_offset += UNITS_PER_WORD;
14948 RTX_FRAME_RELATED_P (insn) = 1;
14949 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14950 gen_rtx_SET (stack_pointer_rtx,
14951 plus_constant (Pmode, stack_pointer_rtx,
14952 -UNITS_PER_WORD)));
14956 emit_move_insn (eax, GEN_INT (allocate));
14957 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14959 /* Use the fact that AX still contains ALLOCATE. */
14960 adjust_stack_insn = (Pmode == DImode
14961 ? gen_pro_epilogue_adjust_stack_di_sub
14962 : gen_pro_epilogue_adjust_stack_si_sub);
14964 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14965 stack_pointer_rtx, eax));
14967 if (sp_is_cfa_reg || TARGET_SEH)
14969 if (sp_is_cfa_reg)
14970 m->fs.cfa_offset += allocate;
14971 RTX_FRAME_RELATED_P (insn) = 1;
14972 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14973 gen_rtx_SET (stack_pointer_rtx,
14974 plus_constant (Pmode, stack_pointer_rtx,
14975 -allocate)));
14977 m->fs.sp_offset += allocate;
14979 /* Use stack_pointer_rtx for relative addressing so that code
14980 works for realigned stack, too. */
14981 if (r10_live && eax_live)
14983 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14984 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14985 gen_frame_mem (word_mode, t));
14986 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14987 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14988 gen_frame_mem (word_mode, t));
14990 else if (eax_live || r10_live)
14992 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14993 emit_move_insn (gen_rtx_REG (word_mode,
14994 (eax_live ? AX_REG : R10_REG)),
14995 gen_frame_mem (word_mode, t));
14998 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
15000 /* If we haven't already set up the frame pointer, do so now. */
15001 if (frame_pointer_needed && !m->fs.fp_valid)
15003 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
15004 GEN_INT (frame.stack_pointer_offset
15005 - frame.hard_frame_pointer_offset));
15006 insn = emit_insn (insn);
15007 RTX_FRAME_RELATED_P (insn) = 1;
15008 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
15010 if (m->fs.cfa_reg == stack_pointer_rtx)
15011 m->fs.cfa_reg = hard_frame_pointer_rtx;
15012 m->fs.fp_offset = frame.hard_frame_pointer_offset;
15013 m->fs.fp_valid = true;
15016 if (!int_registers_saved)
15017 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
15018 if (!sse_registers_saved)
15019 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
15021 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
15022 in the prologue. */
15023 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
15025 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
15026 insn = emit_insn (gen_set_got (pic));
15027 RTX_FRAME_RELATED_P (insn) = 1;
15028 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
15029 emit_insn (gen_prologue_use (pic));
15030 /* Delete the already emitted SET_GOT, if it exists and is allocated
15031 to REAL_PIC_OFFSET_TABLE_REGNUM. */
15032 ix86_elim_entry_set_got (pic);
15035 if (crtl->drap_reg && !crtl->stack_realign_needed)
15037 /* vDRAP is set up, but after reload it turns out stack realignment
15038 isn't necessary; emit the prologue code to set up DRAP
15039 without the stack realignment adjustment. */
15040 t = choose_baseaddr (0, NULL);
15041 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
15044 /* Prevent instructions from being scheduled into register save push
15045 sequence when access to the redzone area is done through frame pointer.
15046 The offset between the frame pointer and the stack pointer is calculated
15047 relative to the value of the stack pointer at the end of the function
15048 prologue, and moving instructions that access redzone area via frame
15049 pointer inside push sequence violates this assumption. */
15050 if (frame_pointer_needed && frame.red_zone_size)
15051 emit_insn (gen_memory_blockage ());
15053 /* SEH requires that the prologue end within 256 bytes of the start of
15054 the function. Prevent instruction schedules that would extend that.
15055 Further, prevent alloca modifications to the stack pointer from being
15056 combined with prologue modifications. */
15057 if (TARGET_SEH)
15058 emit_insn (gen_prologue_use (stack_pointer_rtx));
15061 /* Emit code to restore REG using a POP insn. */
15063 static void
15064 ix86_emit_restore_reg_using_pop (rtx reg)
15066 struct machine_function *m = cfun->machine;
15067 rtx_insn *insn = emit_insn (gen_pop (reg));
15069 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
15070 m->fs.sp_offset -= UNITS_PER_WORD;
15072 if (m->fs.cfa_reg == crtl->drap_reg
15073 && REGNO (reg) == REGNO (crtl->drap_reg))
15075 /* Previously we'd represented the CFA as an expression
15076 like *(%ebp - 8). We've just popped that value from
15077 the stack, which means we need to reset the CFA to
15078 the drap register. This will remain until we restore
15079 the stack pointer. */
15080 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15081 RTX_FRAME_RELATED_P (insn) = 1;
15083 /* This means that the DRAP register is valid for addressing too. */
15084 m->fs.drap_valid = true;
15085 return;
15088 if (m->fs.cfa_reg == stack_pointer_rtx)
15090 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15091 x = gen_rtx_SET (stack_pointer_rtx, x);
15092 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15093 RTX_FRAME_RELATED_P (insn) = 1;
15095 m->fs.cfa_offset -= UNITS_PER_WORD;
15098 /* When the frame pointer is the CFA, and we pop it, we are
15099 swapping back to the stack pointer as the CFA. This happens
15100 for stack frames that don't allocate other data, so we assume
15101 the stack pointer is now pointing at the return address, i.e.
15102 the function entry state, which makes the offset one word. */
15103 if (reg == hard_frame_pointer_rtx)
15105 m->fs.fp_valid = false;
15106 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15108 m->fs.cfa_reg = stack_pointer_rtx;
15109 m->fs.cfa_offset -= UNITS_PER_WORD;
15111 add_reg_note (insn, REG_CFA_DEF_CFA,
15112 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15113 GEN_INT (m->fs.cfa_offset)));
15114 RTX_FRAME_RELATED_P (insn) = 1;
15119 /* Emit code to restore saved registers using POP insns. */
15121 static void
15122 ix86_emit_restore_regs_using_pop (void)
15124 unsigned int regno;
15126 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15127 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
15128 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
15131 /* Emit code and notes for the LEAVE instruction. If INSN is non-null,
15132 omit the emit and only attach the notes. */
15134 static void
15135 ix86_emit_leave (rtx_insn *insn)
15137 struct machine_function *m = cfun->machine;
15138 if (!insn)
15139 insn = emit_insn (ix86_gen_leave ());
15141 ix86_add_queued_cfa_restore_notes (insn);
15143 gcc_assert (m->fs.fp_valid);
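  /* leave is equivalent to "mov %rbp, %rsp; pop %rbp", so afterwards the
     stack pointer sits one word above where the frame pointer pointed;
     the state updates below record exactly that.  */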
15144 m->fs.sp_valid = true;
15145 m->fs.sp_realigned = false;
15146 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
15147 m->fs.fp_valid = false;
15149 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15151 m->fs.cfa_reg = stack_pointer_rtx;
15152 m->fs.cfa_offset = m->fs.sp_offset;
15154 add_reg_note (insn, REG_CFA_DEF_CFA,
15155 plus_constant (Pmode, stack_pointer_rtx,
15156 m->fs.sp_offset));
15157 RTX_FRAME_RELATED_P (insn) = 1;
15159 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
15160 m->fs.fp_offset);
15163 /* Emit code to restore saved registers using MOV insns.
15164 First register is restored from CFA - CFA_OFFSET. */
15165 static void
15166 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
15167 bool maybe_eh_return)
15169 struct machine_function *m = cfun->machine;
15170 unsigned int regno;
15172 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15173 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15175 rtx reg = gen_rtx_REG (word_mode, regno);
15176 rtx mem;
15177 rtx_insn *insn;
15179 mem = choose_baseaddr (cfa_offset, NULL);
15180 mem = gen_frame_mem (word_mode, mem);
15181 insn = emit_move_insn (reg, mem);
15183 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
15185 /* Previously we'd represented the CFA as an expression
15186 like *(%ebp - 8). We've just restored that value from
15187 the stack, which means we need to reset the CFA to
15188 the drap register. This will remain until we restore
15189 the stack pointer. */
15190 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15191 RTX_FRAME_RELATED_P (insn) = 1;
15193 /* This means that the DRAP register is valid for addressing. */
15194 m->fs.drap_valid = true;
15196 else
15197 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15199 cfa_offset -= UNITS_PER_WORD;
15203 /* Emit code to restore saved SSE registers using MOV insns.
15204 First register is restored from CFA - CFA_OFFSET. */
15205 static void
15206 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
15207 bool maybe_eh_return)
15209 unsigned int regno;
15211 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15212 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15214 rtx reg = gen_rtx_REG (V4SFmode, regno);
15215 rtx mem;
15216 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
15218 mem = choose_baseaddr (cfa_offset, &align);
15219 mem = gen_rtx_MEM (V4SFmode, mem);
15221 /* The location alignment depends upon the base register. */
15222 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
15223 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
15224 set_mem_align (mem, align);
15225 emit_insn (gen_rtx_SET (reg, mem));
15227 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15229 cfa_offset -= GET_MODE_SIZE (V4SFmode);
15233 static void
15234 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15235 bool use_call, int style)
15237 struct machine_function *m = cfun->machine;
15238 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15239 + m->call_ms2sysv_extra_regs;
15240 rtvec v;
15241 unsigned int elems_needed, align, i, vi = 0;
15242 rtx_insn *insn;
15243 rtx sym, tmp;
15244 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15245 rtx r10 = NULL_RTX;
15246 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15247 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15248 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15249 rtx rsi_frame_load = NULL_RTX;
15250 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15251 enum xlogue_stub stub;
15253 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15255 /* If using a realigned stack, we should never start with padding. */
15256 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15258 /* Setup RSI as the stub's base pointer. */
15259 align = GET_MODE_ALIGNMENT (V4SFmode);
15260 tmp = choose_baseaddr (rsi_offset, &align);
15261 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15262 emit_insn (gen_rtx_SET (rsi, tmp));
15264 /* Get a symbol for the stub. */
15265 if (frame_pointer_needed)
15266 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15267 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15268 else
15269 stub = use_call ? XLOGUE_STUB_RESTORE
15270 : XLOGUE_STUB_RESTORE_TAIL;
15271 sym = xlogue.get_stub_rtx (stub);
15273 elems_needed = ncregs;
15274 if (use_call)
15275 elems_needed += 1;
15276 else
15277 elems_needed += frame_pointer_needed ? 5 : 3;
15278 v = rtvec_alloc (elems_needed);
15280 /* We call the epilogue stub when we need to pop incoming args or we are
15281 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
15282 epilogue stub and it is the tail-call. */
15283 if (use_call)
15284 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15285 else
15287 RTVEC_ELT (v, vi++) = ret_rtx;
15288 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15289 if (frame_pointer_needed)
15291 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15292 gcc_assert (m->fs.fp_valid);
15293 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15295 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15296 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15297 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15298 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15299 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15301 else
15303 /* If no hard frame pointer, we set R10 to the SP restore value. */
15304 gcc_assert (!m->fs.fp_valid);
15305 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15306 gcc_assert (m->fs.sp_valid);
15308 r10 = gen_rtx_REG (DImode, R10_REG);
15309 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15310 emit_insn (gen_rtx_SET (r10, tmp));
15312 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15316 /* Generate frame load insns and restore notes. */
15317 for (i = 0; i < ncregs; ++i)
15319 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15320 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15321 rtx reg, frame_load;
15323 reg = gen_rtx_REG (mode, r.regno);
15324 frame_load = gen_frame_load (reg, rsi, r.offset);
15326 /* Save RSI frame load insn & note to add last. */
15327 if (r.regno == SI_REG)
15329 gcc_assert (!rsi_frame_load);
15330 rsi_frame_load = frame_load;
15331 rsi_restore_offset = r.offset;
15333 else
15335 RTVEC_ELT (v, vi++) = frame_load;
15336 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15340 /* Add RSI frame load & restore note at the end. */
15341 gcc_assert (rsi_frame_load);
15342 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15343 RTVEC_ELT (v, vi++) = rsi_frame_load;
15344 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15345 rsi_restore_offset);
15347 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15348 if (!use_call && !frame_pointer_needed)
15350 gcc_assert (m->fs.sp_valid);
15351 gcc_assert (!m->fs.sp_realigned);
15353 /* At this point, R10 should point to frame.stack_realign_offset. */
15354 if (m->fs.cfa_reg == stack_pointer_rtx)
15355 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15356 m->fs.sp_offset = frame.stack_realign_offset;
15359 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15360 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15361 if (use_call)
15362 insn = emit_insn (tmp);
15363 else
15365 insn = emit_jump_insn (tmp);
15366 JUMP_LABEL (insn) = ret_rtx;
15368 if (frame_pointer_needed)
15369 ix86_emit_leave (insn);
15370 else
15372 /* Need CFA adjust note. */
15373 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15374 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15378 RTX_FRAME_RELATED_P (insn) = true;
15379 ix86_add_queued_cfa_restore_notes (insn);
15381 /* If we're not doing a tail-call, we need to adjust the stack. */
15382 if (use_call && m->fs.sp_valid)
15384 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15385 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15386 GEN_INT (dealloc), style,
15387 m->fs.cfa_reg == stack_pointer_rtx);
15391 /* Restore function stack, frame, and registers. */
15393 void
15394 ix86_expand_epilogue (int style)
15396 struct machine_function *m = cfun->machine;
15397 struct machine_frame_state frame_state_save = m->fs;
15398 struct ix86_frame frame;
15399 bool restore_regs_via_mov;
15400 bool using_drap;
15401 bool restore_stub_is_tail = false;
15403 if (ix86_function_naked (current_function_decl))
15405 /* The program should not reach this point. */
15406 emit_insn (gen_ud2 ());
15407 return;
15410 ix86_finalize_stack_frame_flags ();
15411 frame = m->frame;
15413 m->fs.sp_realigned = stack_realign_fp;
15414 m->fs.sp_valid = stack_realign_fp
15415 || !frame_pointer_needed
15416 || crtl->sp_is_unchanging;
15417 gcc_assert (!m->fs.sp_valid
15418 || m->fs.sp_offset == frame.stack_pointer_offset);
15420 /* The FP must be valid if the frame pointer is present. */
15421 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15422 gcc_assert (!m->fs.fp_valid
15423 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15425 /* We must have *some* valid pointer to the stack frame. */
15426 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15428 /* The DRAP is never valid at this point. */
15429 gcc_assert (!m->fs.drap_valid);
15431 /* See the comment about red zone and frame
15432 pointer usage in ix86_expand_prologue. */
15433 if (frame_pointer_needed && frame.red_zone_size)
15434 emit_insn (gen_memory_blockage ());
15436 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15437 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15439 /* Determine the CFA offset of the end of the red-zone. */
15440 m->fs.red_zone_offset = 0;
15441 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15443 /* The red zone begins below the return address and, in an
15444 exception handler, below the error code. */
15445 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
15447 /* When the register save area is in the aligned portion of
15448 the stack, determine the maximum runtime displacement that
15449 matches up with the aligned frame. */
15450 if (stack_realign_drap)
15451 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15452 + UNITS_PER_WORD);
15455 /* Special care must be taken for the normal return case of a function
15456 using eh_return: the eax and edx registers are marked as saved, but
15457 not restored along this path. Adjust the save location to match. */
15458 if (crtl->calls_eh_return && style != 2)
15459 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15461 /* EH_RETURN requires the use of moves to function properly. */
15462 if (crtl->calls_eh_return)
15463 restore_regs_via_mov = true;
15464 /* SEH requires the use of pops to identify the epilogue. */
15465 else if (TARGET_SEH)
15466 restore_regs_via_mov = false;
15467 /* If we're only restoring one register and sp cannot be used then
15468 use a move instruction to restore the register, since it's
15469 less work than reloading sp and popping the register. */
15470 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15471 restore_regs_via_mov = true;
15472 else if (TARGET_EPILOGUE_USING_MOVE
15473 && cfun->machine->use_fast_prologue_epilogue
15474 && (frame.nregs > 1
15475 || m->fs.sp_offset != frame.reg_save_offset))
15476 restore_regs_via_mov = true;
15477 else if (frame_pointer_needed
15478 && !frame.nregs
15479 && m->fs.sp_offset != frame.reg_save_offset)
15480 restore_regs_via_mov = true;
15481 else if (frame_pointer_needed
15482 && TARGET_USE_LEAVE
15483 && cfun->machine->use_fast_prologue_epilogue
15484 && frame.nregs == 1)
15485 restore_regs_via_mov = true;
15486 else
15487 restore_regs_via_mov = false;
15489 if (restore_regs_via_mov || frame.nsseregs)
15491 /* Ensure that the entire register save area is addressable via
15492 the stack pointer, if we will restore SSE regs via sp. */
15493 if (TARGET_64BIT
15494 && m->fs.sp_offset > 0x7fffffff
15495 && sp_valid_at (frame.stack_realign_offset)
15496 && (frame.nsseregs + frame.nregs) != 0)
15498 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15499 GEN_INT (m->fs.sp_offset
15500 - frame.sse_reg_save_offset),
15501 style,
15502 m->fs.cfa_reg == stack_pointer_rtx);
15506 /* If there are any SSE registers to restore, then we have to do it
15507 via moves, since there's obviously no pop for SSE regs. */
15508 if (frame.nsseregs)
15509 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15510 style == 2);
15512 if (m->call_ms2sysv)
15514 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15516 /* We cannot use a tail-call for the stub if:
15517 1. We have to pop incoming args,
15518 2. We have additional int regs to restore, or
15519 3. A sibling call will be the tail-call, or
15520 4. We are emitting an eh_return_internal epilogue.
15522 TODO: Item 4 has not yet been tested!
15524 If any of the above are true, we will call the stub rather than
15525 jump to it. */
15526 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15527 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15530 /* If using an out-of-line stub that is a tail call, then... */
15531 if (m->call_ms2sysv && restore_stub_is_tail)
15533 /* TODO: paranoid tests. (remove eventually) */
15534 gcc_assert (m->fs.sp_valid);
15535 gcc_assert (!m->fs.sp_realigned);
15536 gcc_assert (!m->fs.fp_valid);
15537 gcc_assert (!m->fs.realigned);
15538 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15539 gcc_assert (!crtl->drap_reg);
15540 gcc_assert (!frame.nregs);
15542 else if (restore_regs_via_mov)
15544 rtx t;
15546 if (frame.nregs)
15547 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15549 /* eh_return epilogues need %ecx added to the stack pointer. */
15550 if (style == 2)
15552 rtx sa = EH_RETURN_STACKADJ_RTX;
15553 rtx_insn *insn;
15555 /* %ecx can't be used for both DRAP register and eh_return. */
15556 if (crtl->drap_reg)
15557 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15559 /* regparm nested functions don't work with eh_return. */
15560 gcc_assert (!ix86_static_chain_on_stack);
15562 if (frame_pointer_needed)
15564 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15565 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15566 emit_insn (gen_rtx_SET (sa, t));
15568 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15569 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15571 /* Note that we use SA as a temporary CFA, as the return
15572 address is at the proper place relative to it. We
15573 pretend this happens at the FP restore insn because
15574 prior to this insn the FP would be stored at the wrong
15575 offset relative to SA, and after this insn we have no
15576 other reasonable register to use for the CFA. We don't
15577 bother resetting the CFA to the SP for the duration of
15578 the return insn. */
15579 add_reg_note (insn, REG_CFA_DEF_CFA,
15580 plus_constant (Pmode, sa, UNITS_PER_WORD));
15581 ix86_add_queued_cfa_restore_notes (insn);
15582 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15583 RTX_FRAME_RELATED_P (insn) = 1;
15585 m->fs.cfa_reg = sa;
15586 m->fs.cfa_offset = UNITS_PER_WORD;
15587 m->fs.fp_valid = false;
15589 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15590 const0_rtx, style, false);
15592 else
15594 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15595 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15596 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15597 ix86_add_queued_cfa_restore_notes (insn);
15599 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15600 if (m->fs.cfa_offset != UNITS_PER_WORD)
15602 m->fs.cfa_offset = UNITS_PER_WORD;
15603 add_reg_note (insn, REG_CFA_DEF_CFA,
15604 plus_constant (Pmode, stack_pointer_rtx,
15605 UNITS_PER_WORD));
15606 RTX_FRAME_RELATED_P (insn) = 1;
15609 m->fs.sp_offset = UNITS_PER_WORD;
15610 m->fs.sp_valid = true;
15611 m->fs.sp_realigned = false;
15614 else
15616 /* SEH requires that the function end with (1) a stack adjustment
15617 if necessary, (2) a sequence of pops, and (3) a return or
15618 jump instruction. Prevent insns from the function body from
15619 being scheduled into this sequence. */
15620 if (TARGET_SEH)
15622 /* Prevent a catch region from being adjacent to the standard
15623 epilogue sequence. Unfortunately neither crtl->uses_eh_lsda nor
15624 several other flags that would be interesting to test are
15625 set up yet. */
15626 if (flag_non_call_exceptions)
15627 emit_insn (gen_nops (const1_rtx));
15628 else
15629 emit_insn (gen_blockage ());
15632 /* First step is to deallocate the stack frame so that we can
15633 pop the registers. If the stack pointer was realigned, it needs
15634 to be restored now. Also do it on SEH target for very large
15635 frame as the emitted instructions aren't allowed by the ABI
15636 in epilogues. */
15637 if (!m->fs.sp_valid || m->fs.sp_realigned
15638 || (TARGET_SEH
15639 && (m->fs.sp_offset - frame.reg_save_offset
15640 >= SEH_MAX_FRAME_SIZE)))
15642 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15643 GEN_INT (m->fs.fp_offset
15644 - frame.reg_save_offset),
15645 style, false);
15647 else if (m->fs.sp_offset != frame.reg_save_offset)
15649 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15650 GEN_INT (m->fs.sp_offset
15651 - frame.reg_save_offset),
15652 style,
15653 m->fs.cfa_reg == stack_pointer_rtx);
15656 ix86_emit_restore_regs_using_pop ();
15659 /* If we used a frame pointer and haven't already got rid of it,
15660 then do so now. */
15661 if (m->fs.fp_valid)
15663 /* If the stack pointer is valid and pointing at the frame
15664 pointer store address, then we only need a pop. */
15665 if (sp_valid_at (frame.hfp_save_offset)
15666 && m->fs.sp_offset == frame.hfp_save_offset)
15667 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15668 /* Leave results in shorter dependency chains on CPUs that are
15669 able to grok it fast. */
15670 else if (TARGET_USE_LEAVE
15671 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15672 || !cfun->machine->use_fast_prologue_epilogue)
15673 ix86_emit_leave (NULL);
15674 else
15676 pro_epilogue_adjust_stack (stack_pointer_rtx,
15677 hard_frame_pointer_rtx,
15678 const0_rtx, style, !using_drap);
15679 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15683 if (using_drap)
15685 int param_ptr_offset = UNITS_PER_WORD;
15686 rtx_insn *insn;
15688 gcc_assert (stack_realign_drap);
15690 if (ix86_static_chain_on_stack)
15691 param_ptr_offset += UNITS_PER_WORD;
15692 if (!call_used_regs[REGNO (crtl->drap_reg)])
15693 param_ptr_offset += UNITS_PER_WORD;
15695 insn = emit_insn (gen_rtx_SET
15696 (stack_pointer_rtx,
15697 gen_rtx_PLUS (Pmode,
15698 crtl->drap_reg,
15699 GEN_INT (-param_ptr_offset))));
15700 m->fs.cfa_reg = stack_pointer_rtx;
15701 m->fs.cfa_offset = param_ptr_offset;
15702 m->fs.sp_offset = param_ptr_offset;
15703 m->fs.realigned = false;
15705 add_reg_note (insn, REG_CFA_DEF_CFA,
15706 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15707 GEN_INT (param_ptr_offset)));
15708 RTX_FRAME_RELATED_P (insn) = 1;
15710 if (!call_used_regs[REGNO (crtl->drap_reg)])
15711 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15714 /* At this point the stack pointer must be valid, and we must have
15715 restored all of the registers. We may not have deallocated the
15716 entire stack frame. We've delayed this until now because it may
15717 be possible to merge the local stack deallocation with the
15718 deallocation forced by ix86_static_chain_on_stack. */
15719 gcc_assert (m->fs.sp_valid);
15720 gcc_assert (!m->fs.sp_realigned);
15721 gcc_assert (!m->fs.fp_valid);
15722 gcc_assert (!m->fs.realigned);
15723 if (m->fs.sp_offset != UNITS_PER_WORD)
15725 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15726 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15727 style, true);
15729 else
15730 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15732 /* Sibcall epilogues don't want a return instruction. */
15733 if (style == 0)
15735 m->fs = frame_state_save;
15736 return;
15739 if (cfun->machine->func_type != TYPE_NORMAL)
15740 emit_jump_insn (gen_interrupt_return ());
15741 else if (crtl->args.pops_args && crtl->args.size)
15743 rtx popc = GEN_INT (crtl->args.pops_args);
15745 /* i386 can only pop 64K bytes. If asked to pop more, pop return
15746 address, do explicit add, and jump indirectly to the caller. */
15748 if (crtl->args.pops_args >= 65536)
15750 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15751 rtx_insn *insn;
15753 /* There is no "pascal" calling convention in any 64bit ABI. */
15754 gcc_assert (!TARGET_64BIT);
15756 insn = emit_insn (gen_pop (ecx));
15757 m->fs.cfa_offset -= UNITS_PER_WORD;
15758 m->fs.sp_offset -= UNITS_PER_WORD;
15760 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15761 x = gen_rtx_SET (stack_pointer_rtx, x);
15762 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15763 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15764 RTX_FRAME_RELATED_P (insn) = 1;
15766 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15767 popc, -1, true);
15768 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
15770 else
15771 emit_jump_insn (gen_simple_return_pop_internal (popc));
15773 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15774 emit_jump_insn (gen_simple_return_internal ());
15776 /* Restore the state back to the state from the prologue,
15777 so that it's correct for the next epilogue. */
15778 m->fs = frame_state_save;
15781 /* Reset from the function's potential modifications. */
15783 static void
15784 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
15786 if (pic_offset_table_rtx
15787 && !ix86_use_pseudo_pic_reg ())
15788 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15790 if (TARGET_MACHO)
15792 rtx_insn *insn = get_last_insn ();
15793 rtx_insn *deleted_debug_label = NULL;
15795 /* Mach-O doesn't support labels at the end of objects, so if
15796 it looks like we might want one, take special action.
15797 First, collect any sequence of deleted debug labels. */
15798 while (insn
15799 && NOTE_P (insn)
15800 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15802 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
15803 notes only, instead set their CODE_LABEL_NUMBER to -1,
15804 otherwise there would be code generation differences
15805 between -g and -g0. */
15806 if (NOTE_P (insn) && NOTE_KIND (insn)
15807 == NOTE_INSN_DELETED_DEBUG_LABEL)
15808 deleted_debug_label = insn;
15809 insn = PREV_INSN (insn);
15812 /* If we have:
15813 label:
15814 barrier
15815 then this needs to be detected, so skip past the barrier. */
15817 if (insn && BARRIER_P (insn))
15818 insn = PREV_INSN (insn);
15820 /* Up to now we've only seen notes or barriers. */
15821 if (insn)
15823 if (LABEL_P (insn)
15824 || (NOTE_P (insn)
15825 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15826 /* Trailing label. */
15827 fputs ("\tnop\n", file);
15828 else if (cfun && ! cfun->is_thunk)
15830 /* See if we have a completely empty function body, skipping
15831 the special case of the picbase thunk emitted as asm. */
15832 while (insn && ! INSN_P (insn))
15833 insn = PREV_INSN (insn);
15834 /* If we don't find any insns, we've got an empty function body;
15835 i.e. completely empty - without a return or branch. This is
15836 taken as the case where a function body has been removed
15837 because it contains an inline __builtin_unreachable(). GCC
15838 declares that reaching __builtin_unreachable() means UB so
15839 we're not obliged to do anything special; however, we want
15840 non-zero-sized function bodies. To meet this, and help the
15841 user out, let's trap the case. */
15842 if (insn == NULL)
15843 fputs ("\tud2\n", file);
15846 else if (deleted_debug_label)
15847 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
15848 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
15849 CODE_LABEL_NUMBER (insn) = -1;
15853 /* Return a scratch register to use in the split stack prologue. The
15854 split stack prologue is used for -fsplit-stack. It is the first
15855 instructions in the function, even before the regular prologue.
15856 The scratch register can be any caller-saved register which is not
15857 used for parameters or for the static chain. */
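/* In outline (a summary of the code below): 64-bit code uses R11; for
   32-bit code the choice depends on the calling convention, roughly AX for
   fastcall, DX or AX for thiscall, and CX or DX otherwise, with a sorry ()
   issued when no suitable register is free.  */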
15859 static unsigned int
15860 split_stack_prologue_scratch_regno (void)
15862 if (TARGET_64BIT)
15863 return R11_REG;
15864 else
15866 bool is_fastcall, is_thiscall;
15867 int regparm;
15869 is_fastcall = (lookup_attribute ("fastcall",
15870 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15871 != NULL);
15872 is_thiscall = (lookup_attribute ("thiscall",
15873 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15874 != NULL);
15875 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
15877 if (is_fastcall)
15879 if (DECL_STATIC_CHAIN (cfun->decl))
15881 sorry ("-fsplit-stack does not support fastcall with "
15882 "nested function");
15883 return INVALID_REGNUM;
15885 return AX_REG;
15887 else if (is_thiscall)
15889 if (!DECL_STATIC_CHAIN (cfun->decl))
15890 return DX_REG;
15891 return AX_REG;
15893 else if (regparm < 3)
15895 if (!DECL_STATIC_CHAIN (cfun->decl))
15896 return CX_REG;
15897 else
15899 if (regparm >= 2)
15901 sorry ("-fsplit-stack does not support 2 register "
15902 "parameters for a nested function");
15903 return INVALID_REGNUM;
15905 return DX_REG;
15908 else
15910 /* FIXME: We could make this work by pushing a register
15911 around the addition and comparison. */
15912 sorry ("-fsplit-stack does not support 3 register parameters");
15913 return INVALID_REGNUM;
15918 /* A SYMBOL_REF for the function which allocates new stack space for
15919 -fsplit-stack. */
15921 static GTY(()) rtx split_stack_fn;
15923 /* A SYMBOL_REF for the more stack function when using the large
15924 model. */
15926 static GTY(()) rtx split_stack_fn_large;
15928 /* Return location of the stack guard value in the TLS block. */
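/* The value returned is a MEM in the TLS address space; e.g. on x86-64
   GNU/Linux, assuming the usual TARGET_THREAD_SPLIT_STACK_OFFSET of 0x70,
   it denotes %fs:0x70.  */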
15931 rtx ix86_split_stack_guard (void)
15933 int offset;
15934 addr_space_t as = DEFAULT_TLS_SEG_REG;
15935 rtx r;
15937 gcc_assert (flag_split_stack);
15939 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15940 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15941 #else
15942 gcc_unreachable ();
15943 #endif
15945 r = GEN_INT (offset);
15946 r = gen_const_mem (Pmode, r);
15947 set_mem_addr_space (r, as);
15949 return r;
15952 /* Handle -fsplit-stack. These are the first instructions in the
15953 function, even before the regular prologue. */
15955 void
15956 ix86_expand_split_stack_prologue (void)
15958 struct ix86_frame frame;
15959 HOST_WIDE_INT allocate;
15960 unsigned HOST_WIDE_INT args_size;
15961 rtx_code_label *label;
15962 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15963 rtx scratch_reg = NULL_RTX;
15964 rtx_code_label *varargs_label = NULL;
15965 rtx fn;
15967 gcc_assert (flag_split_stack && reload_completed);
15969 ix86_finalize_stack_frame_flags ();
15970 frame = cfun->machine->frame;
15971 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15973 /* This is the label we will branch to if we have enough stack
15974 space. We expect the basic block reordering pass to reverse this
15975 branch if optimizing, so that we branch in the unlikely case. */
15976 label = gen_label_rtx ();
15978 /* We need to compare the stack pointer minus the frame size with
15979 the stack boundary in the TCB. The stack boundary always gives
15980 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15981 can compare directly. Otherwise we need to do an addition. */
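  /* In pseudo-code, the sequence emitted below is roughly:

	if (current >= limit)		(an unsigned comparison)
	  goto enough_space;
	call __morestack;
     enough_space:;

     where CURRENT is either the stack pointer itself or the stack pointer
     minus the frame size, as described above.  A sketch only; the actual
     branch and call are emitted by the code that follows.  */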
15983 limit = ix86_split_stack_guard ();
15985 if (allocate < SPLIT_STACK_AVAILABLE)
15986 current = stack_pointer_rtx;
15987 else
15989 unsigned int scratch_regno;
15990 rtx offset;
15992 /* We need a scratch register to hold the stack pointer minus
15993 the required frame size. Since this is the very start of the
15994 function, the scratch register can be any caller-saved
15995 register which is not used for parameters. */
15996 offset = GEN_INT (- allocate);
15997 scratch_regno = split_stack_prologue_scratch_regno ();
15998 if (scratch_regno == INVALID_REGNUM)
15999 return;
16000 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
16001 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
16003 /* We don't use ix86_gen_add3 in this case because it will
16004 want to split to lea, but when not optimizing the insn
16005 will not be split after this point. */
16006 emit_insn (gen_rtx_SET (scratch_reg,
16007 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
16008 offset)));
16010 else
16012 emit_move_insn (scratch_reg, offset);
16013 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
16014 stack_pointer_rtx));
16016 current = scratch_reg;
16019 ix86_expand_branch (GEU, current, limit, label);
16020 rtx_insn *jump_insn = get_last_insn ();
16021 JUMP_LABEL (jump_insn) = label;
16023 /* Mark the jump as very likely to be taken. */
16024 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
16026 if (split_stack_fn == NULL_RTX)
16028 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
16029 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
16031 fn = split_stack_fn;
16033 /* Get more stack space. We pass in the desired stack space and the
16034 size of the arguments to copy to the new stack. In 32-bit mode
16035 we push the parameters; __morestack will return on a new stack
16036 anyhow. In 64-bit mode we pass the parameters in r10 and
16037 r11. */
16038 allocate_rtx = GEN_INT (allocate);
16039 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
16040 call_fusage = NULL_RTX;
16041 rtx pop = NULL_RTX;
16042 if (TARGET_64BIT)
16044 rtx reg10, reg11;
16046 reg10 = gen_rtx_REG (Pmode, R10_REG);
16047 reg11 = gen_rtx_REG (Pmode, R11_REG);
16049 /* If this function uses a static chain, it will be in %r10.
16050 Preserve it across the call to __morestack. */
16051 if (DECL_STATIC_CHAIN (cfun->decl))
16053 rtx rax;
16055 rax = gen_rtx_REG (word_mode, AX_REG);
16056 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
16057 use_reg (&call_fusage, rax);
16060 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
16061 && !TARGET_PECOFF)
16063 HOST_WIDE_INT argval;
16065 gcc_assert (Pmode == DImode);
16066 /* When using the large model we need to load the address
16067 into a register, and we've run out of registers. So we
16068 switch to a different calling convention, and we call a
16069 different function: __morestack_large_model. We pass the
16070 argument size in the upper 32 bits of r10 and pass the
16071 frame size in the lower 32 bits. */
16072 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
16073 gcc_assert ((args_size & 0xffffffff) == args_size);
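/* Worked example (illustrative only): with args_size == 32 and
   allocate == 0x150, the value built below is

     argval = ((32 << 16) << 16) + 0x150 == 0x0000002000000150

   so __morestack_large_model sees the argument size (32) in the high
   32 bits of %r10 and the frame size (0x150) in the low 32 bits.  */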
16075 if (split_stack_fn_large == NULL_RTX)
16077 split_stack_fn_large =
16078 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
16079 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
16081 if (ix86_cmodel == CM_LARGE_PIC)
16083 rtx_code_label *label;
16084 rtx x;
16086 label = gen_label_rtx ();
16087 emit_label (label);
16088 LABEL_PRESERVE_P (label) = 1;
16089 emit_insn (gen_set_rip_rex64 (reg10, label));
16090 emit_insn (gen_set_got_offset_rex64 (reg11, label));
16091 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
16092 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
16093 UNSPEC_GOT);
16094 x = gen_rtx_CONST (Pmode, x);
16095 emit_move_insn (reg11, x);
16096 x = gen_rtx_PLUS (Pmode, reg10, reg11);
16097 x = gen_const_mem (Pmode, x);
16098 emit_move_insn (reg11, x);
16100 else
16101 emit_move_insn (reg11, split_stack_fn_large);
16103 fn = reg11;
16105 argval = ((args_size << 16) << 16) + allocate;
16106 emit_move_insn (reg10, GEN_INT (argval));
16108 else
16110 emit_move_insn (reg10, allocate_rtx);
16111 emit_move_insn (reg11, GEN_INT (args_size));
16112 use_reg (&call_fusage, reg11);
16115 use_reg (&call_fusage, reg10);
16117 else
16119 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
16120 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
16121 insn = emit_insn (gen_push (allocate_rtx));
16122 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
16123 pop = GEN_INT (2 * UNITS_PER_WORD);
16125 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
16126 GEN_INT (UNITS_PER_WORD), constm1_rtx,
16127 pop, false);
16128 add_function_usage_to (call_insn, call_fusage);
16129 if (!TARGET_64BIT)
16130 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
16131 /* Indicate that this function can't jump to non-local gotos. */
16132 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
16134 /* In order to make call/return prediction work right, we now need
16135 to execute a return instruction. See
16136 libgcc/config/i386/morestack.S for the details on how this works.
16138 For flow purposes gcc must not see this as a return
16139 instruction--we need control flow to continue at the subsequent
16140 label. Therefore, we use an unspec. */
16141 gcc_assert (crtl->args.pops_args < 65536);
16142 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
16144 /* If we are in 64-bit mode and this function uses a static chain,
16145 we saved %r10 in %rax before calling __morestack. */
16146 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
16147 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
16148 gen_rtx_REG (word_mode, AX_REG));
16150 /* If this function calls va_start, we need to store a pointer to
16151 the arguments on the old stack, because they may not have been
16152 all copied to the new stack. At this point the old stack can be
16153 found at the frame pointer value used by __morestack, because
16154 __morestack has set that up before calling back to us. Here we
16155 store that pointer in a scratch register, and in
16156 ix86_expand_prologue we store the scratch register in a stack
16157 slot. */
16158 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16160 unsigned int scratch_regno;
16161 rtx frame_reg;
16162 int words;
16164 scratch_regno = split_stack_prologue_scratch_regno ();
16165 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
16166 frame_reg = gen_rtx_REG (Pmode, BP_REG);
16168 /* 64-bit:
16169 fp -> old fp value
16170 return address within this function
16171 return address of caller of this function
16172 stack arguments
16173 So we add three words to get to the stack arguments.
16175 32-bit:
16176 fp -> old fp value
16177 return address within this function
16178 first argument to __morestack
16179 second argument to __morestack
16180 return address of caller of this function
16181 stack arguments
16182 So we add five words to get to the stack arguments.
16184 words = TARGET_64BIT ? 3 : 5;
16185 emit_insn (gen_rtx_SET (scratch_reg,
16186 gen_rtx_PLUS (Pmode, frame_reg,
16187 GEN_INT (words * UNITS_PER_WORD))));
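/* Illustrative example: in 64-bit mode words == 3 and UNITS_PER_WORD
   == 8, so the scratch register is set to the frame pointer plus 24,
   the address of the first stack argument in the layout shown above;
   in 32-bit mode words == 5 and the offset is 20 bytes.  */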
16189 varargs_label = gen_label_rtx ();
16190 emit_jump_insn (gen_jump (varargs_label));
16191 JUMP_LABEL (get_last_insn ()) = varargs_label;
16193 emit_barrier ();
16196 emit_label (label);
16197 LABEL_NUSES (label) = 1;
16199 /* If this function calls va_start, we now have to set the scratch
16200 register for the case where we do not call __morestack. In this
16201 case we need to set it based on the stack pointer. */
16202 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16204 emit_insn (gen_rtx_SET (scratch_reg,
16205 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
16206 GEN_INT (UNITS_PER_WORD))));
16208 emit_label (varargs_label);
16209 LABEL_NUSES (varargs_label) = 1;
16213 /* We may have to tell the dataflow pass that the split stack prologue
16214 is initializing a scratch register. */
16216 static void
16217 ix86_live_on_entry (bitmap regs)
16219 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16221 gcc_assert (flag_split_stack);
16222 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
16226 /* Extract the parts of an RTL expression that is a valid memory address
16227 for an instruction. Return 0 if the structure of the address is
16228 grossly off. Return -1 if the address contains ASHIFT, so it is not
16229 strictly valid, but still used for computing the length of the lea instruction. */
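/* Illustrative example (not from the original source): for

     (plus (plus (mult (reg B) (const_int 4)) (reg A)) (const_int 8))

   the decomposition below yields base = A, index = B, scale = 4,
   disp = 8 and returns 1.  A bare (ashift (reg B) (const_int 2))
   yields index = B, scale = 4 but returns -1, since ASHIFT is only
   accepted when computing the length of an lea insn.  */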
16232 ix86_decompose_address (rtx addr, struct ix86_address *out)
16234 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
16235 rtx base_reg, index_reg;
16236 HOST_WIDE_INT scale = 1;
16237 rtx scale_rtx = NULL_RTX;
16238 rtx tmp;
16239 int retval = 1;
16240 addr_space_t seg = ADDR_SPACE_GENERIC;
16242 /* Allow zero-extended SImode addresses,
16243 they will be emitted with addr32 prefix. */
16244 if (TARGET_64BIT && GET_MODE (addr) == DImode)
16246 if (GET_CODE (addr) == ZERO_EXTEND
16247 && GET_MODE (XEXP (addr, 0)) == SImode)
16249 addr = XEXP (addr, 0);
16250 if (CONST_INT_P (addr))
16251 return 0;
16253 else if (GET_CODE (addr) == AND
16254 && const_32bit_mask (XEXP (addr, 1), DImode))
16256 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16257 if (addr == NULL_RTX)
16258 return 0;
16260 if (CONST_INT_P (addr))
16261 return 0;
16265 /* Allow SImode subregs of DImode addresses,
16266 they will be emitted with addr32 prefix. */
16267 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16269 if (SUBREG_P (addr)
16270 && GET_MODE (SUBREG_REG (addr)) == DImode)
16272 addr = SUBREG_REG (addr);
16273 if (CONST_INT_P (addr))
16274 return 0;
16278 if (REG_P (addr))
16279 base = addr;
16280 else if (SUBREG_P (addr))
16282 if (REG_P (SUBREG_REG (addr)))
16283 base = addr;
16284 else
16285 return 0;
16287 else if (GET_CODE (addr) == PLUS)
16289 rtx addends[4], op;
16290 int n = 0, i;
16292 op = addr;
16295 if (n >= 4)
16296 return 0;
16297 addends[n++] = XEXP (op, 1);
16298 op = XEXP (op, 0);
16300 while (GET_CODE (op) == PLUS);
16301 if (n >= 4)
16302 return 0;
16303 addends[n] = op;
16305 for (i = n; i >= 0; --i)
16307 op = addends[i];
16308 switch (GET_CODE (op))
16310 case MULT:
16311 if (index)
16312 return 0;
16313 index = XEXP (op, 0);
16314 scale_rtx = XEXP (op, 1);
16315 break;
16317 case ASHIFT:
16318 if (index)
16319 return 0;
16320 index = XEXP (op, 0);
16321 tmp = XEXP (op, 1);
16322 if (!CONST_INT_P (tmp))
16323 return 0;
16324 scale = INTVAL (tmp);
16325 if ((unsigned HOST_WIDE_INT) scale > 3)
16326 return 0;
16327 scale = 1 << scale;
16328 break;
16330 case ZERO_EXTEND:
16331 op = XEXP (op, 0);
16332 if (GET_CODE (op) != UNSPEC)
16333 return 0;
16334 /* FALLTHRU */
16336 case UNSPEC:
16337 if (XINT (op, 1) == UNSPEC_TP
16338 && TARGET_TLS_DIRECT_SEG_REFS
16339 && seg == ADDR_SPACE_GENERIC)
16340 seg = DEFAULT_TLS_SEG_REG;
16341 else
16342 return 0;
16343 break;
16345 case SUBREG:
16346 if (!REG_P (SUBREG_REG (op)))
16347 return 0;
16348 /* FALLTHRU */
16350 case REG:
16351 if (!base)
16352 base = op;
16353 else if (!index)
16354 index = op;
16355 else
16356 return 0;
16357 break;
16359 case CONST:
16360 case CONST_INT:
16361 case SYMBOL_REF:
16362 case LABEL_REF:
16363 if (disp)
16364 return 0;
16365 disp = op;
16366 break;
16368 default:
16369 return 0;
16373 else if (GET_CODE (addr) == MULT)
16375 index = XEXP (addr, 0); /* index*scale */
16376 scale_rtx = XEXP (addr, 1);
16378 else if (GET_CODE (addr) == ASHIFT)
16380 /* We're called for lea too, which implements ashift on occasion. */
16381 index = XEXP (addr, 0);
16382 tmp = XEXP (addr, 1);
16383 if (!CONST_INT_P (tmp))
16384 return 0;
16385 scale = INTVAL (tmp);
16386 if ((unsigned HOST_WIDE_INT) scale > 3)
16387 return 0;
16388 scale = 1 << scale;
16389 retval = -1;
16391 else
16392 disp = addr; /* displacement */
16394 if (index)
16396 if (REG_P (index))
16398 else if (SUBREG_P (index)
16399 && REG_P (SUBREG_REG (index)))
16401 else
16402 return 0;
16405 /* Extract the integral value of scale. */
16406 if (scale_rtx)
16408 if (!CONST_INT_P (scale_rtx))
16409 return 0;
16410 scale = INTVAL (scale_rtx);
16413 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16414 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16416 /* Avoid useless 0 displacement. */
16417 if (disp == const0_rtx && (base || index))
16418 disp = NULL_RTX;
16420 /* Allow arg pointer and stack pointer as index if there is no scaling. */
16421 if (base_reg && index_reg && scale == 1
16422 && (REGNO (index_reg) == ARG_POINTER_REGNUM
16423 || REGNO (index_reg) == FRAME_POINTER_REGNUM
16424 || REGNO (index_reg) == SP_REG))
16426 std::swap (base, index);
16427 std::swap (base_reg, index_reg);
16430 /* Special case: %ebp cannot be encoded as a base without a displacement.
16431 Similarly %r13. */
16432 if (!disp && base_reg
16433 && (REGNO (base_reg) == ARG_POINTER_REGNUM
16434 || REGNO (base_reg) == FRAME_POINTER_REGNUM
16435 || REGNO (base_reg) == BP_REG
16436 || REGNO (base_reg) == R13_REG))
16437 disp = const0_rtx;
16439 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
16440 Avoid this by transforming to [%esi+0].
16441 Reload calls address legitimization without cfun defined, so we need
16442 to test cfun for being non-NULL. */
16443 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16444 && base_reg && !index_reg && !disp
16445 && REGNO (base_reg) == SI_REG)
16446 disp = const0_rtx;
16448 /* Special case: encode reg+reg instead of reg*2. */
16449 if (!base && index && scale == 2)
16450 base = index, base_reg = index_reg, scale = 1;
16452 /* Special case: scaling cannot be encoded without base or displacement. */
16453 if (!base && !disp && index && scale != 1)
16454 disp = const0_rtx;
16456 out->base = base;
16457 out->index = index;
16458 out->disp = disp;
16459 out->scale = scale;
16460 out->seg = seg;
16462 return retval;
16465 /* Return cost of the memory address x.
16466 For i386, it is better to use a complex address than let gcc copy
16467 the address into a reg and make a new pseudo. But not if the address
16468 requires two regs - that would mean more pseudos with longer
16469 lifetimes. */
16470 static int
16471 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16473 struct ix86_address parts;
16474 int cost = 1;
16475 int ok = ix86_decompose_address (x, &parts);
16477 gcc_assert (ok);
16479 if (parts.base && SUBREG_P (parts.base))
16480 parts.base = SUBREG_REG (parts.base);
16481 if (parts.index && SUBREG_P (parts.index))
16482 parts.index = SUBREG_REG (parts.index);
16484 /* Attempt to minimize the number of registers in the address by increasing
16485 the address cost for each register used. We don't increase the address
16486 cost for "pic_offset_table_rtx". When a memop with "pic_offset_table_rtx"
16487 is not invariant itself, it most likely means that the base or index is not
16488 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
16489 which is not profitable for x86. */
16490 if (parts.base
16491 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16492 && (current_pass->type == GIMPLE_PASS
16493 || !pic_offset_table_rtx
16494 || !REG_P (parts.base)
16495 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16496 cost++;
16498 if (parts.index
16499 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16500 && (current_pass->type == GIMPLE_PASS
16501 || !pic_offset_table_rtx
16502 || !REG_P (parts.index)
16503 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16504 cost++;
16506 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16507 since its predecode logic can't detect the length of such instructions
16508 and they degenerate to vector decoding. Increase the cost of such
16509 addresses here. The penalty is at least 2 cycles. It may be worthwhile
16510 to split such addresses or even refuse them at all.
16512 The following addressing modes are affected:
16513 [base+scale*index]
16514 [scale*index+disp]
16515 [base+index]
16517 The first and last cases may be avoidable by explicitly coding the zero in
16518 the memory address, but I don't have an AMD-K6 machine handy to check this
16519 theory. */
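/* Illustrative example (an encoding sketch, not from the original
   source): the address (%esi,%edi,2) is base+index*scale with no
   displacement, so it is encoded with ModR/M mod=00 and rm=100b and
   is penalized below; writing it as 0(%esi,%edi,2) forces a disp8
   encoding instead, which is the "explicitly coding the zero" idea
   mentioned above.  */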
16521 if (TARGET_K6
16522 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16523 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16524 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16525 cost += 10;
16527 return cost;
16530 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
16531 this is used to form addresses of local data when -fPIC is in
16532 use. */
16534 static bool
16535 darwin_local_data_pic (rtx disp)
16537 return (GET_CODE (disp) == UNSPEC
16538 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16541 /* True if operand X should be loaded from GOT. */
16543 bool
16544 ix86_force_load_from_GOT_p (rtx x)
16546 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16547 && !TARGET_PECOFF && !TARGET_MACHO
16548 && !flag_plt && !flag_pic
16549 && ix86_cmodel != CM_LARGE
16550 && GET_CODE (x) == SYMBOL_REF
16551 && SYMBOL_REF_FUNCTION_P (x)
16552 && !SYMBOL_REF_LOCAL_P (x));
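/* Illustrative example (sketch, assuming a GNU x86-64 target): when
   this predicate is true, e.g. with -fno-plt and -fno-pic, a call to
   an external function foo is emitted through the GOT, roughly

     call  *foo@GOTPCREL(%rip)

   instead of going through a PLT entry.  */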
16555 /* Determine if a given RTX is a valid constant. We already know this
16556 satisfies CONSTANT_P. */
16558 static bool
16559 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16561 /* Pointer bounds constants are not valid. */
16562 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16563 return false;
16565 switch (GET_CODE (x))
16567 case CONST:
16568 x = XEXP (x, 0);
16570 if (GET_CODE (x) == PLUS)
16572 if (!CONST_INT_P (XEXP (x, 1)))
16573 return false;
16574 x = XEXP (x, 0);
16577 if (TARGET_MACHO && darwin_local_data_pic (x))
16578 return true;
16580 /* Only some unspecs are valid as "constants". */
16581 if (GET_CODE (x) == UNSPEC)
16582 switch (XINT (x, 1))
16584 case UNSPEC_GOT:
16585 case UNSPEC_GOTOFF:
16586 case UNSPEC_PLTOFF:
16587 return TARGET_64BIT;
16588 case UNSPEC_TPOFF:
16589 case UNSPEC_NTPOFF:
16590 x = XVECEXP (x, 0, 0);
16591 return (GET_CODE (x) == SYMBOL_REF
16592 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16593 case UNSPEC_DTPOFF:
16594 x = XVECEXP (x, 0, 0);
16595 return (GET_CODE (x) == SYMBOL_REF
16596 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16597 default:
16598 return false;
16601 /* We must have drilled down to a symbol. */
16602 if (GET_CODE (x) == LABEL_REF)
16603 return true;
16604 if (GET_CODE (x) != SYMBOL_REF)
16605 return false;
16606 /* FALLTHRU */
16608 case SYMBOL_REF:
16609 /* TLS symbols are never valid. */
16610 if (SYMBOL_REF_TLS_MODEL (x))
16611 return false;
16613 /* DLLIMPORT symbols are never valid. */
16614 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16615 && SYMBOL_REF_DLLIMPORT_P (x))
16616 return false;
16618 #if TARGET_MACHO
16619 /* mdynamic-no-pic */
16620 if (MACHO_DYNAMIC_NO_PIC_P)
16621 return machopic_symbol_defined_p (x);
16622 #endif
16624 /* External function address should be loaded
16625 via the GOT slot to avoid PLT. */
16626 if (ix86_force_load_from_GOT_p (x))
16627 return false;
16629 break;
16631 CASE_CONST_SCALAR_INT:
16632 switch (mode)
16634 case E_TImode:
16635 if (TARGET_64BIT)
16636 return true;
16637 /* FALLTHRU */
16638 case E_OImode:
16639 case E_XImode:
16640 if (!standard_sse_constant_p (x, mode))
16641 return false;
16642 default:
16643 break;
16645 break;
16647 case CONST_VECTOR:
16648 if (!standard_sse_constant_p (x, mode))
16649 return false;
16651 default:
16652 break;
16655 /* Otherwise we handle everything else in the move patterns. */
16656 return true;
16659 /* Determine if it's legal to put X into the constant pool. This
16660 is not possible for the address of thread-local symbols, which
16661 is checked above. */
16663 static bool
16664 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16666 /* We can put any immediate constant in memory. */
16667 switch (GET_CODE (x))
16669 CASE_CONST_ANY:
16670 return false;
16672 default:
16673 break;
16676 return !ix86_legitimate_constant_p (mode, x);
16679 /* Nonzero if the symbol is marked as dllimport, or as a stub variable,
16680 otherwise zero. */
16682 static bool
16683 is_imported_p (rtx x)
16685 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16686 || GET_CODE (x) != SYMBOL_REF)
16687 return false;
16689 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16693 /* Nonzero if the constant value X is a legitimate general operand
16694 when generating PIC code. It is given that flag_pic is on and
16695 that X satisfies CONSTANT_P. */
16697 bool
16698 legitimate_pic_operand_p (rtx x)
16700 rtx inner;
16702 switch (GET_CODE (x))
16704 case CONST:
16705 inner = XEXP (x, 0);
16706 if (GET_CODE (inner) == PLUS
16707 && CONST_INT_P (XEXP (inner, 1)))
16708 inner = XEXP (inner, 0);
16710 /* Only some unspecs are valid as "constants". */
16711 if (GET_CODE (inner) == UNSPEC)
16712 switch (XINT (inner, 1))
16714 case UNSPEC_GOT:
16715 case UNSPEC_GOTOFF:
16716 case UNSPEC_PLTOFF:
16717 return TARGET_64BIT;
16718 case UNSPEC_TPOFF:
16719 x = XVECEXP (inner, 0, 0);
16720 return (GET_CODE (x) == SYMBOL_REF
16721 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16722 case UNSPEC_MACHOPIC_OFFSET:
16723 return legitimate_pic_address_disp_p (x);
16724 default:
16725 return false;
16727 /* FALLTHRU */
16729 case SYMBOL_REF:
16730 case LABEL_REF:
16731 return legitimate_pic_address_disp_p (x);
16733 default:
16734 return true;
16738 /* Determine if a given CONST RTX is a valid memory displacement
16739 in PIC mode. */
16741 bool
16742 legitimate_pic_address_disp_p (rtx disp)
16744 bool saw_plus;
16746 /* In 64bit mode we can allow direct addresses of symbols and labels
16747 when they are not dynamic symbols. */
16748 if (TARGET_64BIT)
16750 rtx op0 = disp, op1;
16752 switch (GET_CODE (disp))
16754 case LABEL_REF:
16755 return true;
16757 case CONST:
16758 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16759 break;
16760 op0 = XEXP (XEXP (disp, 0), 0);
16761 op1 = XEXP (XEXP (disp, 0), 1);
16762 if (!CONST_INT_P (op1)
16763 || INTVAL (op1) >= 16*1024*1024
16764 || INTVAL (op1) < -16*1024*1024)
16765 break;
16766 if (GET_CODE (op0) == LABEL_REF)
16767 return true;
16768 if (GET_CODE (op0) == CONST
16769 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16770 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16771 return true;
16772 if (GET_CODE (op0) == UNSPEC
16773 && XINT (op0, 1) == UNSPEC_PCREL)
16774 return true;
16775 if (GET_CODE (op0) != SYMBOL_REF)
16776 break;
16777 /* FALLTHRU */
16779 case SYMBOL_REF:
16780 /* TLS references should always be enclosed in UNSPEC.
16781 A dllimported symbol always needs to be resolved. */
16782 if (SYMBOL_REF_TLS_MODEL (op0)
16783 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16784 return false;
16786 if (TARGET_PECOFF)
16788 if (is_imported_p (op0))
16789 return true;
16791 if (SYMBOL_REF_FAR_ADDR_P (op0)
16792 || !SYMBOL_REF_LOCAL_P (op0))
16793 break;
16795 /* Function symbols need to be resolved only for
16796 the large model.
16797 For the small model we don't need to resolve anything
16798 here. */
16799 if ((ix86_cmodel != CM_LARGE_PIC
16800 && SYMBOL_REF_FUNCTION_P (op0))
16801 || ix86_cmodel == CM_SMALL_PIC)
16802 return true;
16803 /* Non-external symbols don't need to be resolved for
16804 the large and medium models. */
16805 if ((ix86_cmodel == CM_LARGE_PIC
16806 || ix86_cmodel == CM_MEDIUM_PIC)
16807 && !SYMBOL_REF_EXTERNAL_P (op0))
16808 return true;
16810 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16811 && (SYMBOL_REF_LOCAL_P (op0)
16812 || (HAVE_LD_PIE_COPYRELOC
16813 && flag_pie
16814 && !SYMBOL_REF_WEAK (op0)
16815 && !SYMBOL_REF_FUNCTION_P (op0)))
16816 && ix86_cmodel != CM_LARGE_PIC)
16817 return true;
16818 break;
16820 default:
16821 break;
16824 if (GET_CODE (disp) != CONST)
16825 return false;
16826 disp = XEXP (disp, 0);
16828 if (TARGET_64BIT)
16830 /* It is unsafe to allow PLUS expressions here; that would exceed the
16831 allowed distance of GOT references. We should not need these anyway. */
16832 if (GET_CODE (disp) != UNSPEC
16833 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16834 && XINT (disp, 1) != UNSPEC_GOTOFF
16835 && XINT (disp, 1) != UNSPEC_PCREL
16836 && XINT (disp, 1) != UNSPEC_PLTOFF))
16837 return false;
16839 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16840 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16841 return false;
16842 return true;
16845 saw_plus = false;
16846 if (GET_CODE (disp) == PLUS)
16848 if (!CONST_INT_P (XEXP (disp, 1)))
16849 return false;
16850 disp = XEXP (disp, 0);
16851 saw_plus = true;
16854 if (TARGET_MACHO && darwin_local_data_pic (disp))
16855 return true;
16857 if (GET_CODE (disp) != UNSPEC)
16858 return false;
16860 switch (XINT (disp, 1))
16862 case UNSPEC_GOT:
16863 if (saw_plus)
16864 return false;
16865 /* We need to check for both symbols and labels because VxWorks loads
16866 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
16867 details. */
16868 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16869 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
16870 case UNSPEC_GOTOFF:
16871 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
16872 While the ABI also specifies a 32bit relocation, we don't produce it in
16873 the small PIC model at all. */
16874 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16875 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
16876 && !TARGET_64BIT)
16877 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
16878 return false;
16879 case UNSPEC_GOTTPOFF:
16880 case UNSPEC_GOTNTPOFF:
16881 case UNSPEC_INDNTPOFF:
16882 if (saw_plus)
16883 return false;
16884 disp = XVECEXP (disp, 0, 0);
16885 return (GET_CODE (disp) == SYMBOL_REF
16886 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16887 case UNSPEC_NTPOFF:
16888 disp = XVECEXP (disp, 0, 0);
16889 return (GET_CODE (disp) == SYMBOL_REF
16890 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16891 case UNSPEC_DTPOFF:
16892 disp = XVECEXP (disp, 0, 0);
16893 return (GET_CODE (disp) == SYMBOL_REF
16894 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16897 return false;
16900 /* Determine if op is a suitable RTX for an address register.
16901 Return naked register if a register or a register subreg is
16902 found, otherwise return NULL_RTX. */
16904 static rtx
16905 ix86_validate_address_register (rtx op)
16907 machine_mode mode = GET_MODE (op);
16909 /* Only SImode or DImode registers can form the address. */
16910 if (mode != SImode && mode != DImode)
16911 return NULL_RTX;
16913 if (REG_P (op))
16914 return op;
16915 else if (SUBREG_P (op))
16917 rtx reg = SUBREG_REG (op);
16919 if (!REG_P (reg))
16920 return NULL_RTX;
16922 mode = GET_MODE (reg);
16924 /* Don't allow SUBREGs that span more than a word. It can
16925 lead to spill failures when the register is one word out
16926 of a two word structure. */
16927 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16928 return NULL_RTX;
16930 /* Allow only SUBREGs of non-eliminable hard registers. */
16931 if (register_no_elim_operand (reg, mode))
16932 return reg;
16935 /* Op is not a register. */
16936 return NULL_RTX;
16939 /* Recognizes RTL expressions that are valid memory addresses for an
16940 instruction. The MODE argument is the machine mode for the MEM
16941 expression that wants to use this address.
16943 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16944 convert common non-canonical forms to canonical form so that they will
16945 be recognized. */
16947 static bool
16948 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16950 struct ix86_address parts;
16951 rtx base, index, disp;
16952 HOST_WIDE_INT scale;
16953 addr_space_t seg;
16955 if (ix86_decompose_address (addr, &parts) <= 0)
16956 /* Decomposition failed. */
16957 return false;
16959 base = parts.base;
16960 index = parts.index;
16961 disp = parts.disp;
16962 scale = parts.scale;
16963 seg = parts.seg;
16965 /* Validate base register. */
16966 if (base)
16968 rtx reg = ix86_validate_address_register (base);
16970 if (reg == NULL_RTX)
16971 return false;
16973 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16974 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16975 /* Base is not valid. */
16976 return false;
16979 /* Validate index register. */
16980 if (index)
16982 rtx reg = ix86_validate_address_register (index);
16984 if (reg == NULL_RTX)
16985 return false;
16987 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16988 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16989 /* Index is not valid. */
16990 return false;
16993 /* Index and base should have the same mode. */
16994 if (base && index
16995 && GET_MODE (base) != GET_MODE (index))
16996 return false;
16998 /* Address override works only on the (%reg) part of %fs:(%reg). */
16999 if (seg != ADDR_SPACE_GENERIC
17000 && ((base && GET_MODE (base) != word_mode)
17001 || (index && GET_MODE (index) != word_mode)))
17002 return false;
17004 /* Validate scale factor. */
17005 if (scale != 1)
17007 if (!index)
17008 /* Scale without index. */
17009 return false;
17011 if (scale != 2 && scale != 4 && scale != 8)
17012 /* Scale is not a valid multiplier. */
17013 return false;
17016 /* Validate displacement. */
17017 if (disp)
17019 if (GET_CODE (disp) == CONST
17020 && GET_CODE (XEXP (disp, 0)) == UNSPEC
17021 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
17022 switch (XINT (XEXP (disp, 0), 1))
17024 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
17025 when used. While the ABI also specifies 32bit relocations, we
17026 don't produce them at all and use IP relative addressing instead.
17027 Allow GOT in 32bit mode for both PIC and non-PIC if symbol
17028 should be loaded via GOT. */
17029 case UNSPEC_GOT:
17030 if (!TARGET_64BIT
17031 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
17032 goto is_legitimate_pic;
17033 /* FALLTHRU */
17034 case UNSPEC_GOTOFF:
17035 gcc_assert (flag_pic);
17036 if (!TARGET_64BIT)
17037 goto is_legitimate_pic;
17039 /* 64bit address unspec. */
17040 return false;
17042 case UNSPEC_GOTPCREL:
17043 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
17044 goto is_legitimate_pic;
17045 /* FALLTHRU */
17046 case UNSPEC_PCREL:
17047 gcc_assert (flag_pic);
17048 goto is_legitimate_pic;
17050 case UNSPEC_GOTTPOFF:
17051 case UNSPEC_GOTNTPOFF:
17052 case UNSPEC_INDNTPOFF:
17053 case UNSPEC_NTPOFF:
17054 case UNSPEC_DTPOFF:
17055 break;
17057 default:
17058 /* Invalid address unspec. */
17059 return false;
17062 else if (SYMBOLIC_CONST (disp)
17063 && (flag_pic
17064 || (TARGET_MACHO
17065 #if TARGET_MACHO
17066 && MACHOPIC_INDIRECT
17067 && !machopic_operand_p (disp)
17068 #endif
17072 is_legitimate_pic:
17073 if (TARGET_64BIT && (index || base))
17075 /* foo@dtpoff(%rX) is ok. */
17076 if (GET_CODE (disp) != CONST
17077 || GET_CODE (XEXP (disp, 0)) != PLUS
17078 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
17079 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
17080 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
17081 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
17082 /* Non-constant pic memory reference. */
17083 return false;
17085 else if ((!TARGET_MACHO || flag_pic)
17086 && ! legitimate_pic_address_disp_p (disp))
17087 /* Displacement is an invalid pic construct. */
17088 return false;
17089 #if TARGET_MACHO
17090 else if (MACHO_DYNAMIC_NO_PIC_P
17091 && !ix86_legitimate_constant_p (Pmode, disp))
17092 /* displacement must be referenced via a non_lazy_pointer */
17093 return false;
17094 #endif
17096 /* This code used to verify that a symbolic pic displacement
17097 includes the pic_offset_table_rtx register.
17099 While this is a good idea, unfortunately these constructs may
17100 be created by "adds using lea" optimization for incorrect
17101 code like:
17103 int a;
17104 int foo(int i)
17106 return *(&a+i);
17109 This code is nonsensical, but results in addressing the
17110 GOT table with a pic_offset_table_rtx base. We can't
17111 just refuse it easily, since it gets matched by the
17112 "addsi3" pattern, which later gets split to lea when the
17113 output register differs from the input. While this
17114 could be handled by a separate addsi pattern for this case
17115 that never results in lea, disabling this test seems the
17116 easier and correct fix for the crash. */
17118 else if (GET_CODE (disp) != LABEL_REF
17119 && !CONST_INT_P (disp)
17120 && (GET_CODE (disp) != CONST
17121 || !ix86_legitimate_constant_p (Pmode, disp))
17122 && (GET_CODE (disp) != SYMBOL_REF
17123 || !ix86_legitimate_constant_p (Pmode, disp)))
17124 /* Displacement is not constant. */
17125 return false;
17126 else if (TARGET_64BIT
17127 && !x86_64_immediate_operand (disp, VOIDmode))
17128 /* Displacement is out of range. */
17129 return false;
17130 /* In x32 mode, constant addresses are sign extended to 64bit, so
17131 we have to prevent addresses from 0x80000000 to 0xffffffff. */
17132 else if (TARGET_X32 && !(index || base)
17133 && CONST_INT_P (disp)
17134 && val_signbit_known_set_p (SImode, INTVAL (disp)))
17135 return false;
17138 /* Everything looks valid. */
17139 return true;
17142 /* Determine if a given RTX is a valid constant address. */
17144 bool
17145 constant_address_p (rtx x)
17147 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
17150 /* Return a unique alias set for the GOT. */
17152 static alias_set_type
17153 ix86_GOT_alias_set (void)
17155 static alias_set_type set = -1;
17156 if (set == -1)
17157 set = new_alias_set ();
17158 return set;
17161 /* Return a legitimate reference for ORIG (an address) using the
17162 register REG. If REG is 0, a new pseudo is generated.
17164 There are two types of references that must be handled:
17166 1. Global data references must load the address from the GOT, via
17167 the PIC reg. An insn is emitted to do this load, and the reg is
17168 returned.
17170 2. Static data references, constant pool addresses, and code labels
17171 compute the address as an offset from the GOT, whose base is in
17172 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
17173 differentiate them from global data objects. The returned
17174 address is the PIC reg + an unspec constant.
17176 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
17177 reg also appears in the address. */
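/* Illustrative example (not part of the original source) of the two
   cases above for 32-bit PIC code:

     movl  foo@GOT(%ebx), %eax      (case 1: load the address from the GOT)
     leal  bar@GOTOFF(%ebx), %eax   (case 2: offset from the GOT base)

   where %ebx holds the PIC register.  */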
17179 static rtx
17180 legitimize_pic_address (rtx orig, rtx reg)
17182 rtx addr = orig;
17183 rtx new_rtx = orig;
17185 #if TARGET_MACHO
17186 if (TARGET_MACHO && !TARGET_64BIT)
17188 if (reg == 0)
17189 reg = gen_reg_rtx (Pmode);
17190 /* Use the generic Mach-O PIC machinery. */
17191 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
17193 #endif
17195 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17197 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17198 if (tmp)
17199 return tmp;
17202 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
17203 new_rtx = addr;
17204 else if ((!TARGET_64BIT
17205 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
17206 && !TARGET_PECOFF
17207 && gotoff_operand (addr, Pmode))
17209 /* This symbol may be referenced via a displacement
17210 from the PIC base address (@GOTOFF). */
17211 if (GET_CODE (addr) == CONST)
17212 addr = XEXP (addr, 0);
17214 if (GET_CODE (addr) == PLUS)
17216 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
17217 UNSPEC_GOTOFF);
17218 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
17220 else
17221 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
17223 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17225 if (TARGET_64BIT)
17226 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17228 if (reg != 0)
17230 gcc_assert (REG_P (reg));
17231 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
17232 new_rtx, reg, 1, OPTAB_DIRECT);
17234 else
17235 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17237 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
17238 /* We can't use @GOTOFF for text labels
17239 on VxWorks, see gotoff_operand. */
17240 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
17242 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17243 if (tmp)
17244 return tmp;
17246 /* For x64 PE-COFF there is no GOT table,
17247 so we use the address directly. */
17248 if (TARGET_64BIT && TARGET_PECOFF)
17250 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17251 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17253 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17255 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17256 UNSPEC_GOTPCREL);
17257 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17258 new_rtx = gen_const_mem (Pmode, new_rtx);
17259 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17261 else
17263 /* This symbol must be referenced via a load
17264 from the Global Offset Table (@GOT). */
17265 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17266 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17267 if (TARGET_64BIT)
17268 new_rtx = force_reg (Pmode, new_rtx);
17269 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17270 new_rtx = gen_const_mem (Pmode, new_rtx);
17271 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17274 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17276 else
17278 if (CONST_INT_P (addr)
17279 && !x86_64_immediate_operand (addr, VOIDmode))
17280 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17281 else if (GET_CODE (addr) == CONST)
17283 addr = XEXP (addr, 0);
17285 /* We must match stuff we generate before. Assume the only
17286 unspecs that can get here are ours. Not that we could do
17287 anything with them anyway.... */
17288 if (GET_CODE (addr) == UNSPEC
17289 || (GET_CODE (addr) == PLUS
17290 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17291 return orig;
17292 gcc_assert (GET_CODE (addr) == PLUS);
17295 if (GET_CODE (addr) == PLUS)
17297 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17299 /* Check first to see if this is a constant
17300 offset from a @GOTOFF symbol reference. */
17301 if (!TARGET_PECOFF
17302 && gotoff_operand (op0, Pmode)
17303 && CONST_INT_P (op1))
17305 if (!TARGET_64BIT)
17307 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17308 UNSPEC_GOTOFF);
17309 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17310 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17312 if (reg != 0)
17314 gcc_assert (REG_P (reg));
17315 new_rtx = expand_simple_binop (Pmode, PLUS,
17316 pic_offset_table_rtx,
17317 new_rtx, reg, 1,
17318 OPTAB_DIRECT);
17320 else
17321 new_rtx
17322 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17324 else
17326 if (INTVAL (op1) < -16*1024*1024
17327 || INTVAL (op1) >= 16*1024*1024)
17329 if (!x86_64_immediate_operand (op1, Pmode))
17330 op1 = force_reg (Pmode, op1);
17332 new_rtx
17333 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17337 else
17339 rtx base = legitimize_pic_address (op0, reg);
17340 machine_mode mode = GET_MODE (base);
17341 new_rtx
17342 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17344 if (CONST_INT_P (new_rtx))
17346 if (INTVAL (new_rtx) < -16*1024*1024
17347 || INTVAL (new_rtx) >= 16*1024*1024)
17349 if (!x86_64_immediate_operand (new_rtx, mode))
17350 new_rtx = force_reg (mode, new_rtx);
17352 new_rtx
17353 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17355 else
17356 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17358 else
17360 /* For %rip addressing, we have to use
17361 just disp32, with neither base nor index. */
17362 if (TARGET_64BIT
17363 && (GET_CODE (base) == SYMBOL_REF
17364 || GET_CODE (base) == LABEL_REF))
17365 base = force_reg (mode, base);
17366 if (GET_CODE (new_rtx) == PLUS
17367 && CONSTANT_P (XEXP (new_rtx, 1)))
17369 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17370 new_rtx = XEXP (new_rtx, 1);
17372 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17377 return new_rtx;
17380 /* Load the thread pointer. If TO_REG is true, force it into a register. */
17382 static rtx
17383 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17385 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17387 if (GET_MODE (tp) != tp_mode)
17389 gcc_assert (GET_MODE (tp) == SImode);
17390 gcc_assert (tp_mode == DImode);
17392 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17395 if (to_reg)
17396 tp = copy_to_mode_reg (tp_mode, tp);
17398 return tp;
17401 /* Construct the SYMBOL_REF for the tls_get_addr function. */
17403 static GTY(()) rtx ix86_tls_symbol;
17405 static rtx
17406 ix86_tls_get_addr (void)
17408 if (!ix86_tls_symbol)
17410 const char *sym
17411 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17412 ? "___tls_get_addr" : "__tls_get_addr");
17414 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17417 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17419 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17420 UNSPEC_PLTOFF);
17421 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17422 gen_rtx_CONST (Pmode, unspec));
17425 return ix86_tls_symbol;
17428 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
17430 static GTY(()) rtx ix86_tls_module_base_symbol;
17433 ix86_tls_module_base (void)
17435 if (!ix86_tls_module_base_symbol)
17437 ix86_tls_module_base_symbol
17438 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17440 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17441 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17444 return ix86_tls_module_base_symbol;
17447 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17448 false if we expect this to be used for a memory address and true if
17449 we expect to load the address into a register. */
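/* Illustrative access sequences (simplified sketch for a GNU x86-64
   target, not part of the original source):

     initial-exec:  movq  foo@gottpoff(%rip), %rax
                    movq  %fs:(%rax), %rdx

     local-exec:    movq  %fs:foo@tpoff, %rdx

   The global- and local-dynamic models instead go through
   __tls_get_addr, or the GNU2/TLSDESC sequences with TARGET_GNU2_TLS.  */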
17451 static rtx
17452 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17454 rtx dest, base, off;
17455 rtx pic = NULL_RTX, tp = NULL_RTX;
17456 machine_mode tp_mode = Pmode;
17457 int type;
17459 /* Fall back to the global dynamic model if the tool chain cannot
17460 support local dynamic. */
17461 if (TARGET_SUN_TLS && !TARGET_64BIT
17462 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17463 && model == TLS_MODEL_LOCAL_DYNAMIC)
17464 model = TLS_MODEL_GLOBAL_DYNAMIC;
17466 switch (model)
17468 case TLS_MODEL_GLOBAL_DYNAMIC:
17469 dest = gen_reg_rtx (Pmode);
17471 if (!TARGET_64BIT)
17473 if (flag_pic && !TARGET_PECOFF)
17474 pic = pic_offset_table_rtx;
17475 else
17477 pic = gen_reg_rtx (Pmode);
17478 emit_insn (gen_set_got (pic));
17482 if (TARGET_GNU2_TLS)
17484 if (TARGET_64BIT)
17485 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17486 else
17487 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17489 tp = get_thread_pointer (Pmode, true);
17490 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17492 if (GET_MODE (x) != Pmode)
17493 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17495 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17497 else
17499 rtx caddr = ix86_tls_get_addr ();
17501 if (TARGET_64BIT)
17503 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17504 rtx_insn *insns;
17506 start_sequence ();
17507 emit_call_insn
17508 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17509 insns = get_insns ();
17510 end_sequence ();
17512 if (GET_MODE (x) != Pmode)
17513 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17515 RTL_CONST_CALL_P (insns) = 1;
17516 emit_libcall_block (insns, dest, rax, x);
17518 else
17519 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17521 break;
17523 case TLS_MODEL_LOCAL_DYNAMIC:
17524 base = gen_reg_rtx (Pmode);
17526 if (!TARGET_64BIT)
17528 if (flag_pic)
17529 pic = pic_offset_table_rtx;
17530 else
17532 pic = gen_reg_rtx (Pmode);
17533 emit_insn (gen_set_got (pic));
17537 if (TARGET_GNU2_TLS)
17539 rtx tmp = ix86_tls_module_base ();
17541 if (TARGET_64BIT)
17542 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17543 else
17544 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17546 tp = get_thread_pointer (Pmode, true);
17547 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17548 gen_rtx_MINUS (Pmode, tmp, tp));
17550 else
17552 rtx caddr = ix86_tls_get_addr ();
17554 if (TARGET_64BIT)
17556 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17557 rtx_insn *insns;
17558 rtx eqv;
17560 start_sequence ();
17561 emit_call_insn
17562 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17563 insns = get_insns ();
17564 end_sequence ();
17566 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17567 share the LD_BASE result with other LD model accesses. */
17568 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17569 UNSPEC_TLS_LD_BASE);
17571 RTL_CONST_CALL_P (insns) = 1;
17572 emit_libcall_block (insns, base, rax, eqv);
17574 else
17575 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17578 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17579 off = gen_rtx_CONST (Pmode, off);
17581 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17583 if (TARGET_GNU2_TLS)
17585 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17587 if (GET_MODE (x) != Pmode)
17588 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17590 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17592 break;
17594 case TLS_MODEL_INITIAL_EXEC:
17595 if (TARGET_64BIT)
17597 if (TARGET_SUN_TLS && !TARGET_X32)
17599 /* The Sun linker took the AMD64 TLS spec literally
17600 and can only handle %rax as destination of the
17601 initial executable code sequence. */
17603 dest = gen_reg_rtx (DImode);
17604 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17605 return dest;
17608 /* Generate DImode references to avoid %fs:(%reg32)
17609 problems and the linker IE->LE relaxation bug. */
17610 tp_mode = DImode;
17611 pic = NULL;
17612 type = UNSPEC_GOTNTPOFF;
17614 else if (flag_pic)
17616 pic = pic_offset_table_rtx;
17617 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17619 else if (!TARGET_ANY_GNU_TLS)
17621 pic = gen_reg_rtx (Pmode);
17622 emit_insn (gen_set_got (pic));
17623 type = UNSPEC_GOTTPOFF;
17625 else
17627 pic = NULL;
17628 type = UNSPEC_INDNTPOFF;
17631 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17632 off = gen_rtx_CONST (tp_mode, off);
17633 if (pic)
17634 off = gen_rtx_PLUS (tp_mode, pic, off);
17635 off = gen_const_mem (tp_mode, off);
17636 set_mem_alias_set (off, ix86_GOT_alias_set ());
17638 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17640 base = get_thread_pointer (tp_mode,
17641 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17642 off = force_reg (tp_mode, off);
17643 dest = gen_rtx_PLUS (tp_mode, base, off);
17644 if (tp_mode != Pmode)
17645 dest = convert_to_mode (Pmode, dest, 1);
17647 else
17649 base = get_thread_pointer (Pmode, true);
17650 dest = gen_reg_rtx (Pmode);
17651 emit_insn (ix86_gen_sub3 (dest, base, off));
17653 break;
17655 case TLS_MODEL_LOCAL_EXEC:
17656 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17657 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17658 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17659 off = gen_rtx_CONST (Pmode, off);
17661 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17663 base = get_thread_pointer (Pmode,
17664 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17665 return gen_rtx_PLUS (Pmode, base, off);
17667 else
17669 base = get_thread_pointer (Pmode, true);
17670 dest = gen_reg_rtx (Pmode);
17671 emit_insn (ix86_gen_sub3 (dest, base, off));
17673 break;
17675 default:
17676 gcc_unreachable ();
17679 return dest;
17682 /* Return true if OP refers to a TLS address. */
17683 bool
17684 ix86_tls_address_pattern_p (rtx op)
17686 subrtx_var_iterator::array_type array;
17687 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
17689 rtx op = *iter;
17690 if (MEM_P (op))
17692 rtx *x = &XEXP (op, 0);
17693 while (GET_CODE (*x) == PLUS)
17695 int i;
17696 for (i = 0; i < 2; i++)
17698 rtx u = XEXP (*x, i);
17699 if (GET_CODE (u) == ZERO_EXTEND)
17700 u = XEXP (u, 0);
17701 if (GET_CODE (u) == UNSPEC
17702 && XINT (u, 1) == UNSPEC_TP)
17703 return true;
17705 x = &XEXP (*x, 0);
17708 iter.skip_subrtxes ();
17712 return false;
17715 /* Rewrite *LOC so that it refers to a default TLS address space. */
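/* Illustrative example (sketch): a MEM whose address is

     (plus (unspec [(const_int 0)] UNSPEC_TP) (reg R))

   is rewritten to a MEM with address (reg R) in the default TLS
   address space, so it is later printed with a %fs:/%gs: segment
   override instead of carrying the UNSPEC_TP term.  */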
17716 void
17717 ix86_rewrite_tls_address_1 (rtx *loc)
17719 subrtx_ptr_iterator::array_type array;
17720 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
17722 rtx *loc = *iter;
17723 if (MEM_P (*loc))
17725 rtx addr = XEXP (*loc, 0);
17726 rtx *x = &addr;
17727 while (GET_CODE (*x) == PLUS)
17729 int i;
17730 for (i = 0; i < 2; i++)
17732 rtx u = XEXP (*x, i);
17733 if (GET_CODE (u) == ZERO_EXTEND)
17734 u = XEXP (u, 0);
17735 if (GET_CODE (u) == UNSPEC
17736 && XINT (u, 1) == UNSPEC_TP)
17738 addr_space_t as = DEFAULT_TLS_SEG_REG;
17740 *x = XEXP (*x, 1 - i);
17742 *loc = replace_equiv_address_nv (*loc, addr, true);
17743 set_mem_addr_space (*loc, as);
17744 return;
17747 x = &XEXP (*x, 0);
17750 iter.skip_subrtxes ();
17755 /* Rewrite an instruction pattern involving a TLS address
17756 so that it refers to the default TLS address space. */
17758 ix86_rewrite_tls_address (rtx pattern)
17760 pattern = copy_insn (pattern);
17761 ix86_rewrite_tls_address_1 (&pattern);
17762 return pattern;
17765 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17766 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17767 unique refptr-DECL symbol corresponding to symbol DECL. */
17769 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17771 static inline hashval_t hash (tree_map *m) { return m->hash; }
17772 static inline bool
17773 equal (tree_map *a, tree_map *b)
17775 return a->base.from == b->base.from;
17778 static int
17779 keep_cache_entry (tree_map *&m)
17781 return ggc_marked_p (m->base.from);
17785 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17787 static tree
17788 get_dllimport_decl (tree decl, bool beimport)
17790 struct tree_map *h, in;
17791 const char *name;
17792 const char *prefix;
17793 size_t namelen, prefixlen;
17794 char *imp_name;
17795 tree to;
17796 rtx rtl;
17798 if (!dllimport_map)
17799 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17801 in.hash = htab_hash_pointer (decl);
17802 in.base.from = decl;
17803 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17804 h = *loc;
17805 if (h)
17806 return h->to;
17808 *loc = h = ggc_alloc<tree_map> ();
17809 h->hash = in.hash;
17810 h->base.from = decl;
17811 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17812 VAR_DECL, NULL, ptr_type_node);
17813 DECL_ARTIFICIAL (to) = 1;
17814 DECL_IGNORED_P (to) = 1;
17815 DECL_EXTERNAL (to) = 1;
17816 TREE_READONLY (to) = 1;
17818 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17819 name = targetm.strip_name_encoding (name);
17820 if (beimport)
17821 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17822 ? "*__imp_" : "*__imp__";
17823 else
17824 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17825 namelen = strlen (name);
17826 prefixlen = strlen (prefix);
17827 imp_name = (char *) alloca (namelen + prefixlen + 1);
17828 memcpy (imp_name, prefix, prefixlen);
17829 memcpy (imp_name + prefixlen, name, namelen + 1);
17831 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17832 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17833 SET_SYMBOL_REF_DECL (rtl, to);
17834 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17835 if (!beimport)
17837 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17838 #ifdef SUB_TARGET_RECORD_STUB
17839 SUB_TARGET_RECORD_STUB (name);
17840 #endif
17843 rtl = gen_const_mem (Pmode, rtl);
17844 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17846 SET_DECL_RTL (to, rtl);
17847 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
17849 return to;
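/* Illustrative example: for a dllimported symbol foo on a target whose
   user_label_prefix is "_", the code above builds "*__imp__foo"; with
   an empty user_label_prefix it builds "*__imp_foo", and the refptr
   variant builds "*.refptr.foo".  */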
17852 /* Expand SYMBOL into its corresponding far-address symbol.
17853 WANT_REG is true if we require the result be a register. */
17855 static rtx
17856 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
17858 tree imp_decl;
17859 rtx x;
17861 gcc_assert (SYMBOL_REF_DECL (symbol));
17862 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
17864 x = DECL_RTL (imp_decl);
17865 if (want_reg)
17866 x = force_reg (Pmode, x);
17867 return x;
17870 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
17871 true if we require the result be a register. */
17873 static rtx
17874 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
17876 tree imp_decl;
17877 rtx x;
17879 gcc_assert (SYMBOL_REF_DECL (symbol));
17880 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
17882 x = DECL_RTL (imp_decl);
17883 if (want_reg)
17884 x = force_reg (Pmode, x);
17885 return x;
17888 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
17889 is true if we require the result be a register. */
17891 static rtx
17892 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17894 if (!TARGET_PECOFF)
17895 return NULL_RTX;
17897 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17899 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17900 return legitimize_dllimport_symbol (addr, inreg);
17901 if (GET_CODE (addr) == CONST
17902 && GET_CODE (XEXP (addr, 0)) == PLUS
17903 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17904 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17906 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17907 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17911 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17912 return NULL_RTX;
17913 if (GET_CODE (addr) == SYMBOL_REF
17914 && !is_imported_p (addr)
17915 && SYMBOL_REF_EXTERNAL_P (addr)
17916 && SYMBOL_REF_DECL (addr))
17917 return legitimize_pe_coff_extern_decl (addr, inreg);
17919 if (GET_CODE (addr) == CONST
17920 && GET_CODE (XEXP (addr, 0)) == PLUS
17921 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17922 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17923 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17924 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17926 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17927 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17929 return NULL_RTX;
17932 /* Try machine-dependent ways of modifying an illegitimate address
17933 to be legitimate. If we find one, return the new, valid address.
17934 This macro is used in only one place: `memory_address' in explow.c.
17936 OLDX is the address as it was before break_out_memory_refs was called.
17937 In some cases it is useful to look at this to decide what needs to be done.
17939 It is always safe for this macro to do nothing. It exists to recognize
17940 opportunities to optimize the output.
17942 For the 80386, we handle X+REG by loading X into a register R and
17943 using R+REG. R will go in a general reg and indexing will be used.
17944 However, if REG is a broken-out memory address or multiplication,
17945 nothing needs to be done because REG can certainly go in a general reg.
17947 When -fpic is used, special handling is needed for symbolic references.
17948 See comments by legitimize_pic_address in i386.c for details. */
17950 static rtx
17951 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17953 bool changed = false;
17954 unsigned log;
17956 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17957 if (log)
17958 return legitimize_tls_address (x, (enum tls_model) log, false);
17959 if (GET_CODE (x) == CONST
17960 && GET_CODE (XEXP (x, 0)) == PLUS
17961 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17962 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17964 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17965 (enum tls_model) log, false);
17966 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17969 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17971 rtx tmp = legitimize_pe_coff_symbol (x, true);
17972 if (tmp)
17973 return tmp;
17976 if (flag_pic && SYMBOLIC_CONST (x))
17977 return legitimize_pic_address (x, 0);
17979 #if TARGET_MACHO
17980 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17981 return machopic_indirect_data_reference (x, 0);
17982 #endif
17984 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
17985 if (GET_CODE (x) == ASHIFT
17986 && CONST_INT_P (XEXP (x, 1))
17987 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17989 changed = true;
17990 log = INTVAL (XEXP (x, 1));
17991 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17992 GEN_INT (1 << log));
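/* Illustrative example: an address given as (ashift (reg R) (const_int 2))
   becomes (mult (reg R) (const_int 4)) here, so it can later be matched
   as the index*4 part of a scaled-index address.  */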
17995 if (GET_CODE (x) == PLUS)
17997 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17999 if (GET_CODE (XEXP (x, 0)) == ASHIFT
18000 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18001 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
18003 changed = true;
18004 log = INTVAL (XEXP (XEXP (x, 0), 1));
18005 XEXP (x, 0) = gen_rtx_MULT (Pmode,
18006 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
18007 GEN_INT (1 << log));
18010 if (GET_CODE (XEXP (x, 1)) == ASHIFT
18011 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
18012 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
18014 changed = true;
18015 log = INTVAL (XEXP (XEXP (x, 1), 1));
18016 XEXP (x, 1) = gen_rtx_MULT (Pmode,
18017 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
18018 GEN_INT (1 << log));
18021 /* Put multiply first if it isn't already. */
18022 if (GET_CODE (XEXP (x, 1)) == MULT)
18024 std::swap (XEXP (x, 0), XEXP (x, 1));
18025 changed = true;
18028 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
18029 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
18030 created by virtual register instantiation, register elimination, and
18031 similar optimizations. */
18032 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
18034 changed = true;
18035 x = gen_rtx_PLUS (Pmode,
18036 gen_rtx_PLUS (Pmode, XEXP (x, 0),
18037 XEXP (XEXP (x, 1), 0)),
18038 XEXP (XEXP (x, 1), 1));
18041 /* Canonicalize
18042 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
18043 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
18044 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
18045 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
18046 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
18047 && CONSTANT_P (XEXP (x, 1)))
18049 rtx constant;
18050 rtx other = NULL_RTX;
18052 if (CONST_INT_P (XEXP (x, 1)))
18054 constant = XEXP (x, 1);
18055 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
18057 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
18059 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
18060 other = XEXP (x, 1);
18062 else
18063 constant = 0;
18065 if (constant)
18067 changed = true;
18068 x = gen_rtx_PLUS (Pmode,
18069 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
18070 XEXP (XEXP (XEXP (x, 0), 1), 0)),
18071 plus_constant (Pmode, other,
18072 INTVAL (constant)));
18076 if (changed && ix86_legitimate_address_p (mode, x, false))
18077 return x;
18079 if (GET_CODE (XEXP (x, 0)) == MULT)
18081 changed = true;
18082 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
18085 if (GET_CODE (XEXP (x, 1)) == MULT)
18087 changed = true;
18088 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
18091 if (changed
18092 && REG_P (XEXP (x, 1))
18093 && REG_P (XEXP (x, 0)))
18094 return x;
18096 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
18098 changed = true;
18099 x = legitimize_pic_address (x, 0);
18102 if (changed && ix86_legitimate_address_p (mode, x, false))
18103 return x;
18105 if (REG_P (XEXP (x, 0)))
18107 rtx temp = gen_reg_rtx (Pmode);
18108 rtx val = force_operand (XEXP (x, 1), temp);
18109 if (val != temp)
18111 val = convert_to_mode (Pmode, val, 1);
18112 emit_move_insn (temp, val);
18115 XEXP (x, 1) = temp;
18116 return x;
18119 else if (REG_P (XEXP (x, 1)))
18121 rtx temp = gen_reg_rtx (Pmode);
18122 rtx val = force_operand (XEXP (x, 0), temp);
18123 if (val != temp)
18125 val = convert_to_mode (Pmode, val, 1);
18126 emit_move_insn (temp, val);
18129 XEXP (x, 0) = temp;
18130 return x;
18134 return x;
18137 /* Print an integer constant expression in assembler syntax. Addition
18138 and subtraction are the only arithmetic that may appear in these
18139 expressions. FILE is the stdio stream to write to, X is the rtx, and
18140 CODE is the operand print code from the output string. */
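/* For instance, a reference wrapped in UNSPEC_GOTOFF is printed as
   "foo@GOTOFF", and one wrapped in UNSPEC_GOTPCREL as
   "foo@GOTPCREL(%rip)" in AT&T syntax ("foo@GOTPCREL[rip]" for Intel).  */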
18142 static void
18143 output_pic_addr_const (FILE *file, rtx x, int code)
18145 char buf[256];
18147 switch (GET_CODE (x))
18149 case PC:
18150 gcc_assert (flag_pic);
18151 putc ('.', file);
18152 break;
18154 case SYMBOL_REF:
18155 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
18156 output_addr_const (file, x);
18157 else
18159 const char *name = XSTR (x, 0);
18161 /* Mark the decl as referenced so that cgraph will
18162 output the function. */
18163 if (SYMBOL_REF_DECL (x))
18164 mark_decl_referenced (SYMBOL_REF_DECL (x));
18166 #if TARGET_MACHO
18167 if (MACHOPIC_INDIRECT
18168 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
18169 name = machopic_indirection_name (x, /*stub_p=*/true);
18170 #endif
18171 assemble_name (file, name);
18173 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
18174 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
18175 fputs ("@PLT", file);
18176 break;
18178 case LABEL_REF:
18179 x = XEXP (x, 0);
18180 /* FALLTHRU */
18181 case CODE_LABEL:
18182 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
18183 assemble_name (asm_out_file, buf);
18184 break;
18186 case CONST_INT:
18187 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18188 break;
18190 case CONST:
18191 /* This used to output parentheses around the expression,
18192 but that does not work on the 386 (either ATT or BSD assembler). */
18193 output_pic_addr_const (file, XEXP (x, 0), code);
18194 break;
18196 case CONST_DOUBLE:
18197 /* We can't handle floating point constants;
18198 TARGET_PRINT_OPERAND must handle them. */
18199 output_operand_lossage ("floating constant misused");
18200 break;
18202 case PLUS:
18203 /* Some assemblers need integer constants to appear first. */
18204 if (CONST_INT_P (XEXP (x, 0)))
18206 output_pic_addr_const (file, XEXP (x, 0), code);
18207 putc ('+', file);
18208 output_pic_addr_const (file, XEXP (x, 1), code);
18210 else
18212 gcc_assert (CONST_INT_P (XEXP (x, 1)));
18213 output_pic_addr_const (file, XEXP (x, 1), code);
18214 putc ('+', file);
18215 output_pic_addr_const (file, XEXP (x, 0), code);
18217 break;
18219 case MINUS:
18220 if (!TARGET_MACHO)
18221 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
18222 output_pic_addr_const (file, XEXP (x, 0), code);
18223 putc ('-', file);
18224 output_pic_addr_const (file, XEXP (x, 1), code);
18225 if (!TARGET_MACHO)
18226 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
18227 break;
18229 case UNSPEC:
18230 gcc_assert (XVECLEN (x, 0) == 1);
18231 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
18232 switch (XINT (x, 1))
18234 case UNSPEC_GOT:
18235 fputs ("@GOT", file);
18236 break;
18237 case UNSPEC_GOTOFF:
18238 fputs ("@GOTOFF", file);
18239 break;
18240 case UNSPEC_PLTOFF:
18241 fputs ("@PLTOFF", file);
18242 break;
18243 case UNSPEC_PCREL:
18244 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18245 "(%rip)" : "[rip]", file);
18246 break;
18247 case UNSPEC_GOTPCREL:
18248 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18249 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
18250 break;
18251 case UNSPEC_GOTTPOFF:
18252 /* FIXME: This might be @TPOFF in Sun ld too. */
18253 fputs ("@gottpoff", file);
18254 break;
18255 case UNSPEC_TPOFF:
18256 fputs ("@tpoff", file);
18257 break;
18258 case UNSPEC_NTPOFF:
18259 if (TARGET_64BIT)
18260 fputs ("@tpoff", file);
18261 else
18262 fputs ("@ntpoff", file);
18263 break;
18264 case UNSPEC_DTPOFF:
18265 fputs ("@dtpoff", file);
18266 break;
18267 case UNSPEC_GOTNTPOFF:
18268 if (TARGET_64BIT)
18269 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18270 "@gottpoff(%rip)": "@gottpoff[rip]", file);
18271 else
18272 fputs ("@gotntpoff", file);
18273 break;
18274 case UNSPEC_INDNTPOFF:
18275 fputs ("@indntpoff", file);
18276 break;
18277 #if TARGET_MACHO
18278 case UNSPEC_MACHOPIC_OFFSET:
18279 putc ('-', file);
18280 machopic_output_function_base_name (file);
18281 break;
18282 #endif
18283 default:
18284 output_operand_lossage ("invalid UNSPEC as operand");
18285 break;
18287 break;
18289 default:
18290 output_operand_lossage ("invalid expression as operand");
18294 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
18295 We need to emit DTP-relative relocations. */
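/* E.g. for SIZE 4 this emits something like ".long foo@dtpoff"; for SIZE 8
   the upper half is padded with ", 0", since the relocation itself is only
   32 bits wide.  */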
18297 static void ATTRIBUTE_UNUSED
18298 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
18300 fputs (ASM_LONG, file);
18301 output_addr_const (file, x);
18302 fputs ("@dtpoff", file);
18303 switch (size)
18305 case 4:
18306 break;
18307 case 8:
18308 fputs (", 0", file);
18309 break;
18310 default:
18311 gcc_unreachable ();
18315 /* Return true if X is a representation of the PIC register. This copes
18316 with calls from ix86_find_base_term, where the register might have
18317 been replaced by a cselib value. */
18319 static bool
18320 ix86_pic_register_p (rtx x)
18322 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18323 return (pic_offset_table_rtx
18324 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18325 else if (!REG_P (x))
18326 return false;
18327 else if (pic_offset_table_rtx)
18329 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18330 return true;
18331 if (HARD_REGISTER_P (x)
18332 && !HARD_REGISTER_P (pic_offset_table_rtx)
18333 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18334 return true;
18335 return false;
18337 else
18338 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18341 /* Helper function for ix86_delegitimize_address.
18342 Attempt to delegitimize TLS local-exec accesses. */
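/* Such an access typically looks like
   (mem (plus (reg) (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))))
   addressed through the TLS segment register (typically %fs in 64-bit
   mode and %gs in 32-bit mode on GNU/Linux); we hand back the bare symbol
   plus whatever base, index and constant offset were folded into the
   address.  */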
18344 static rtx
18345 ix86_delegitimize_tls_address (rtx orig_x)
18347 rtx x = orig_x, unspec;
18348 struct ix86_address addr;
18350 if (!TARGET_TLS_DIRECT_SEG_REFS)
18351 return orig_x;
18352 if (MEM_P (x))
18353 x = XEXP (x, 0);
18354 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18355 return orig_x;
18356 if (ix86_decompose_address (x, &addr) == 0
18357 || addr.seg != DEFAULT_TLS_SEG_REG
18358 || addr.disp == NULL_RTX
18359 || GET_CODE (addr.disp) != CONST)
18360 return orig_x;
18361 unspec = XEXP (addr.disp, 0);
18362 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18363 unspec = XEXP (unspec, 0);
18364 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18365 return orig_x;
18366 x = XVECEXP (unspec, 0, 0);
18367 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18368 if (unspec != XEXP (addr.disp, 0))
18369 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18370 if (addr.index)
18372 rtx idx = addr.index;
18373 if (addr.scale != 1)
18374 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18375 x = gen_rtx_PLUS (Pmode, idx, x);
18377 if (addr.base)
18378 x = gen_rtx_PLUS (Pmode, addr.base, x);
18379 if (MEM_P (orig_x))
18380 x = replace_equiv_address_nv (orig_x, x);
18381 return x;
18384 /* In the name of slightly smaller debug output, and to cater to
18385 general assembler lossage, recognize PIC+GOTOFF and turn it back
18386 into a direct symbol reference.
18388 On Darwin, this is necessary to avoid a crash, because Darwin
18389 has a different PIC label for each routine but the DWARF debugging
18390 information is not associated with any particular routine, so it's
18391 necessary to remove references to the PIC label from RTL stored by
18392 the DWARF output code.
18394 This helper is used in the normal ix86_delegitimize_address
18395 entrypoint (e.g. used in the target delegitimization hook) and
18396 in ix86_find_base_term. As a compile-time memory optimization, we
18397 avoid allocating rtxes that will not change anything about the outcome
18398 for the callers (find_base_value and find_base_term). */
18400 static inline rtx
18401 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18403 rtx orig_x = delegitimize_mem_from_attrs (x);
18404 /* addend is NULL or some rtx if x is something+GOTOFF where
18405 something doesn't include the PIC register. */
18406 rtx addend = NULL_RTX;
18407 /* reg_addend is NULL or a multiple of some register. */
18408 rtx reg_addend = NULL_RTX;
18409 /* const_addend is NULL or a const_int. */
18410 rtx const_addend = NULL_RTX;
18411 /* This is the result, or NULL. */
18412 rtx result = NULL_RTX;
18414 x = orig_x;
18416 if (MEM_P (x))
18417 x = XEXP (x, 0);
18419 if (TARGET_64BIT)
18421 if (GET_CODE (x) == CONST
18422 && GET_CODE (XEXP (x, 0)) == PLUS
18423 && GET_MODE (XEXP (x, 0)) == Pmode
18424 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18425 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18426 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18428 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18429 base. A CONST can't be arg_pointer_rtx based. */
18430 if (base_term_p && MEM_P (orig_x))
18431 return orig_x;
18432 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18433 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18434 if (MEM_P (orig_x))
18435 x = replace_equiv_address_nv (orig_x, x);
18436 return x;
18439 if (GET_CODE (x) == CONST
18440 && GET_CODE (XEXP (x, 0)) == UNSPEC
18441 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18442 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18443 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18445 x = XVECEXP (XEXP (x, 0), 0, 0);
18446 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18448 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18449 if (x == NULL_RTX)
18450 return orig_x;
18452 return x;
18455 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18456 return ix86_delegitimize_tls_address (orig_x);
18458 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18459 and -mcmodel=medium -fpic. */
18462 if (GET_CODE (x) != PLUS
18463 || GET_CODE (XEXP (x, 1)) != CONST)
18464 return ix86_delegitimize_tls_address (orig_x);
18466 if (ix86_pic_register_p (XEXP (x, 0)))
18467 /* %ebx + GOT/GOTOFF */
18469 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18471 /* %ebx + %reg * scale + GOT/GOTOFF */
18472 reg_addend = XEXP (x, 0);
18473 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18474 reg_addend = XEXP (reg_addend, 1);
18475 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18476 reg_addend = XEXP (reg_addend, 0);
18477 else
18479 reg_addend = NULL_RTX;
18480 addend = XEXP (x, 0);
18483 else
18484 addend = XEXP (x, 0);
18486 x = XEXP (XEXP (x, 1), 0);
18487 if (GET_CODE (x) == PLUS
18488 && CONST_INT_P (XEXP (x, 1)))
18490 const_addend = XEXP (x, 1);
18491 x = XEXP (x, 0);
18494 if (GET_CODE (x) == UNSPEC
18495 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18496 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18497 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18498 && !MEM_P (orig_x) && !addend)))
18499 result = XVECEXP (x, 0, 0);
18501 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18502 && !MEM_P (orig_x))
18503 result = XVECEXP (x, 0, 0);
18505 if (! result)
18506 return ix86_delegitimize_tls_address (orig_x);
18508 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18509 recurse on the first operand. */
18510 if (const_addend && !base_term_p)
18511 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18512 if (reg_addend)
18513 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18514 if (addend)
18516 /* If the rest of original X doesn't involve the PIC register, add
18517 addend and subtract pic_offset_table_rtx. This can happen e.g.
18518 for code like:
18519 leal (%ebx, %ecx, 4), %ecx
18521 movl foo@GOTOFF(%ecx), %edx
18522 in which case we return (%ecx - %ebx) + foo
18523 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18524 and reload has completed. */
18525 if (pic_offset_table_rtx
18526 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18527 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18528 pic_offset_table_rtx),
18529 result);
18530 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18532 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18533 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18534 result = gen_rtx_PLUS (Pmode, tmp, result);
18536 else
18537 return orig_x;
18539 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18541 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18542 if (result == NULL_RTX)
18543 return orig_x;
18545 return result;
18548 /* The normal instantiation of the above template. */
18550 static rtx
18551 ix86_delegitimize_address (rtx x)
18553 return ix86_delegitimize_address_1 (x, false);
18556 /* If X is a machine specific address (i.e. a symbol or label being
18557 referenced as a displacement from the GOT implemented using an
18558 UNSPEC), then return the base term. Otherwise return X. */
18561 ix86_find_base_term (rtx x)
18563 rtx term;
18565 if (TARGET_64BIT)
18567 if (GET_CODE (x) != CONST)
18568 return x;
18569 term = XEXP (x, 0);
18570 if (GET_CODE (term) == PLUS
18571 && CONST_INT_P (XEXP (term, 1)))
18572 term = XEXP (term, 0);
18573 if (GET_CODE (term) != UNSPEC
18574 || (XINT (term, 1) != UNSPEC_GOTPCREL
18575 && XINT (term, 1) != UNSPEC_PCREL))
18576 return x;
18578 return XVECEXP (term, 0, 0);
18581 return ix86_delegitimize_address_1 (x, true);
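/* Output to FILE the condition suffix (e.g. "e", "ne", "g", "be") that
   corresponds to comparison CODE in flags mode MODE.  If REVERSE is true
   the condition is reversed first; FP selects the spelling used by
   fcmov-style consumers where it differs (e.g. "nbe" instead of "a").  */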
18584 static void
18585 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18586 bool fp, FILE *file)
18588 const char *suffix;
18590 if (mode == CCFPmode || mode == CCFPUmode)
18592 code = ix86_fp_compare_code_to_integer (code);
18593 mode = CCmode;
18595 if (reverse)
18596 code = reverse_condition (code);
18598 switch (code)
18600 case EQ:
18601 switch (mode)
18603 case E_CCAmode:
18604 suffix = "a";
18605 break;
18606 case E_CCCmode:
18607 suffix = "c";
18608 break;
18609 case E_CCOmode:
18610 suffix = "o";
18611 break;
18612 case E_CCPmode:
18613 suffix = "p";
18614 break;
18615 case E_CCSmode:
18616 suffix = "s";
18617 break;
18618 default:
18619 suffix = "e";
18620 break;
18622 break;
18623 case NE:
18624 switch (mode)
18626 case E_CCAmode:
18627 suffix = "na";
18628 break;
18629 case E_CCCmode:
18630 suffix = "nc";
18631 break;
18632 case E_CCOmode:
18633 suffix = "no";
18634 break;
18635 case E_CCPmode:
18636 suffix = "np";
18637 break;
18638 case E_CCSmode:
18639 suffix = "ns";
18640 break;
18641 default:
18642 suffix = "ne";
18643 break;
18645 break;
18646 case GT:
18647 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18648 suffix = "g";
18649 break;
18650 case GTU:
18651 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18652 Those same assemblers have the same but opposite lossage on cmov. */
18653 if (mode == CCmode)
18654 suffix = fp ? "nbe" : "a";
18655 else
18656 gcc_unreachable ();
18657 break;
18658 case LT:
18659 switch (mode)
18661 case E_CCNOmode:
18662 case E_CCGOCmode:
18663 suffix = "s";
18664 break;
18666 case E_CCmode:
18667 case E_CCGCmode:
18668 suffix = "l";
18669 break;
18671 default:
18672 gcc_unreachable ();
18674 break;
18675 case LTU:
18676 if (mode == CCmode)
18677 suffix = "b";
18678 else if (mode == CCCmode)
18679 suffix = fp ? "b" : "c";
18680 else
18681 gcc_unreachable ();
18682 break;
18683 case GE:
18684 switch (mode)
18686 case E_CCNOmode:
18687 case E_CCGOCmode:
18688 suffix = "ns";
18689 break;
18691 case E_CCmode:
18692 case E_CCGCmode:
18693 suffix = "ge";
18694 break;
18696 default:
18697 gcc_unreachable ();
18699 break;
18700 case GEU:
18701 if (mode == CCmode)
18702 suffix = "nb";
18703 else if (mode == CCCmode)
18704 suffix = fp ? "nb" : "nc";
18705 else
18706 gcc_unreachable ();
18707 break;
18708 case LE:
18709 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18710 suffix = "le";
18711 break;
18712 case LEU:
18713 if (mode == CCmode)
18714 suffix = "be";
18715 else
18716 gcc_unreachable ();
18717 break;
18718 case UNORDERED:
18719 suffix = fp ? "u" : "p";
18720 break;
18721 case ORDERED:
18722 suffix = fp ? "nu" : "np";
18723 break;
18724 default:
18725 gcc_unreachable ();
18727 fputs (suffix, file);
18730 /* Print the name of register X to FILE based on its machine mode and number.
18731 If CODE is 'w', pretend the mode is HImode.
18732 If CODE is 'b', pretend the mode is QImode.
18733 If CODE is 'k', pretend the mode is SImode.
18734 If CODE is 'q', pretend the mode is DImode.
18735 If CODE is 'x', pretend the mode is V4SFmode.
18736 If CODE is 't', pretend the mode is V8SFmode.
18737 If CODE is 'g', pretend the mode is V16SFmode.
18738 If CODE is 'h', pretend the reg is the 'high' byte register.
18739 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
18740 If CODE is 'd', duplicate the operand for AVX instruction.
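For example, with CODE 'k' the AX register is printed as "eax" ("%eax"
in AT&T syntax), and with CODE 'h' it is printed as "ah".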
18743 void
18744 print_reg (rtx x, int code, FILE *file)
18746 const char *reg;
18747 int msize;
18748 unsigned int regno;
18749 bool duplicated;
18751 if (ASSEMBLER_DIALECT == ASM_ATT)
18752 putc ('%', file);
18754 if (x == pc_rtx)
18756 gcc_assert (TARGET_64BIT);
18757 fputs ("rip", file);
18758 return;
18761 if (code == 'y' && STACK_TOP_P (x))
18763 fputs ("st(0)", file);
18764 return;
18767 if (code == 'w')
18768 msize = 2;
18769 else if (code == 'b')
18770 msize = 1;
18771 else if (code == 'k')
18772 msize = 4;
18773 else if (code == 'q')
18774 msize = 8;
18775 else if (code == 'h')
18776 msize = 0;
18777 else if (code == 'x')
18778 msize = 16;
18779 else if (code == 't')
18780 msize = 32;
18781 else if (code == 'g')
18782 msize = 64;
18783 else
18784 msize = GET_MODE_SIZE (GET_MODE (x));
18786 regno = REGNO (x);
18788 if (regno == ARG_POINTER_REGNUM
18789 || regno == FRAME_POINTER_REGNUM
18790 || regno == FPSR_REG
18791 || regno == FPCR_REG)
18793 output_operand_lossage
18794 ("invalid use of register '%s'", reg_names[regno]);
18795 return;
18797 else if (regno == FLAGS_REG)
18799 output_operand_lossage ("invalid use of asm flag output");
18800 return;
18803 duplicated = code == 'd' && TARGET_AVX;
18805 switch (msize)
18807 case 16:
18808 case 12:
18809 case 8:
18810 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18811 warning (0, "unsupported size for integer register");
18812 /* FALLTHRU */
18813 case 4:
18814 if (LEGACY_INT_REGNO_P (regno))
18815 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18816 /* FALLTHRU */
18817 case 2:
18818 normal:
18819 reg = hi_reg_name[regno];
18820 break;
18821 case 1:
18822 if (regno >= ARRAY_SIZE (qi_reg_name))
18823 goto normal;
18824 if (!ANY_QI_REGNO_P (regno))
18825 error ("unsupported size for integer register");
18826 reg = qi_reg_name[regno];
18827 break;
18828 case 0:
18829 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18830 goto normal;
18831 reg = qi_high_reg_name[regno];
18832 break;
18833 case 32:
18834 case 64:
18835 if (SSE_REGNO_P (regno))
18837 gcc_assert (!duplicated);
18838 putc (msize == 32 ? 'y' : 'z', file);
18839 reg = hi_reg_name[regno] + 1;
18840 break;
18842 goto normal;
18843 default:
18844 gcc_unreachable ();
18847 fputs (reg, file);
18849 /* Irritatingly, AMD extended registers use a
18850 different naming convention: "r%d[bwd]" */
18851 if (REX_INT_REGNO_P (regno))
18853 gcc_assert (TARGET_64BIT);
18854 switch (msize)
18856 case 0:
18857 error ("extended registers have no high halves");
18858 break;
18859 case 1:
18860 putc ('b', file);
18861 break;
18862 case 2:
18863 putc ('w', file);
18864 break;
18865 case 4:
18866 putc ('d', file);
18867 break;
18868 case 8:
18869 /* no suffix */
18870 break;
18871 default:
18872 error ("unsupported operand size for extended register");
18873 break;
18875 return;
18878 if (duplicated)
18880 if (ASSEMBLER_DIALECT == ASM_ATT)
18881 fprintf (file, ", %%%s", reg);
18882 else
18883 fprintf (file, ", %s", reg);
18887 /* Meaning of CODE:
18888 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18889 C -- print opcode suffix for set/cmov insn.
18890 c -- like C, but print reversed condition
18891 F,f -- likewise, but for floating-point.
18892 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18893 otherwise nothing
18894 R -- print embedded rounding and sae.
18895 r -- print only sae.
18896 z -- print the opcode suffix for the size of the current operand.
18897 Z -- likewise, with special suffixes for x87 instructions.
18898 * -- print a star (in certain assembler syntax)
18899 A -- print an absolute memory reference.
18900 E -- print address with DImode register names if TARGET_64BIT.
18901 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18902 s -- print a shift double count, followed by the assembler's argument
18903 delimiter.
18904 b -- print the QImode name of the register for the indicated operand.
18905 %b0 would print %al if operands[0] is reg 0.
18906 w -- likewise, print the HImode name of the register.
18907 k -- likewise, print the SImode name of the register.
18908 q -- likewise, print the DImode name of the register.
18909 x -- likewise, print the V4SFmode name of the register.
18910 t -- likewise, print the V8SFmode name of the register.
18911 g -- likewise, print the V16SFmode name of the register.
18912 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18913 y -- print "st(0)" instead of "st" as a register.
18914 d -- print duplicated register operand for AVX instruction.
18915 D -- print condition for SSE cmp instruction.
18916 P -- if PIC, print an @PLT suffix.
18917 p -- print raw symbol name.
18918 X -- don't print any sort of PIC '@' suffix for a symbol.
18919 & -- print some in-use local-dynamic symbol name.
18920 H -- print a memory address offset by 8; used for sse high-parts
18921 Y -- print condition for XOP pcom* instruction.
18922 + -- print a branch hint as 'cs' or 'ds' prefix
18923 ; -- print a semicolon (after prefixes due to a bug in older gas).
18924 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18925 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18926 ! -- print MPX prefix for jxx/call/ret instructions if required.
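For example, in a template such as "mov%z0\t{%1, %0|%0, %1}" (a sketch,
not an actual pattern), %z0 expands to the size suffix derived from
operand 0's mode ("b", "w", "l" or "q") in AT&T output and to nothing
in Intel output.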
18929 void
18930 ix86_print_operand (FILE *file, rtx x, int code)
18932 if (code)
18934 switch (code)
18936 case 'A':
18937 switch (ASSEMBLER_DIALECT)
18939 case ASM_ATT:
18940 putc ('*', file);
18941 break;
18943 case ASM_INTEL:
18944 /* Intel syntax. For absolute addresses, registers should not
18945 be surrounded by brackets. */
18946 if (!REG_P (x))
18948 putc ('[', file);
18949 ix86_print_operand (file, x, 0);
18950 putc (']', file);
18951 return;
18953 break;
18955 default:
18956 gcc_unreachable ();
18959 ix86_print_operand (file, x, 0);
18960 return;
18962 case 'E':
18963 /* Wrap address in an UNSPEC to declare special handling. */
18964 if (TARGET_64BIT)
18965 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18967 output_address (VOIDmode, x);
18968 return;
18970 case 'L':
18971 if (ASSEMBLER_DIALECT == ASM_ATT)
18972 putc ('l', file);
18973 return;
18975 case 'W':
18976 if (ASSEMBLER_DIALECT == ASM_ATT)
18977 putc ('w', file);
18978 return;
18980 case 'B':
18981 if (ASSEMBLER_DIALECT == ASM_ATT)
18982 putc ('b', file);
18983 return;
18985 case 'Q':
18986 if (ASSEMBLER_DIALECT == ASM_ATT)
18987 putc ('l', file);
18988 return;
18990 case 'S':
18991 if (ASSEMBLER_DIALECT == ASM_ATT)
18992 putc ('s', file);
18993 return;
18995 case 'T':
18996 if (ASSEMBLER_DIALECT == ASM_ATT)
18997 putc ('t', file);
18998 return;
19000 case 'O':
19001 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
19002 if (ASSEMBLER_DIALECT != ASM_ATT)
19003 return;
19005 switch (GET_MODE_SIZE (GET_MODE (x)))
19007 case 2:
19008 putc ('w', file);
19009 break;
19011 case 4:
19012 putc ('l', file);
19013 break;
19015 case 8:
19016 putc ('q', file);
19017 break;
19019 default:
19020 output_operand_lossage ("invalid operand size for operand "
19021 "code 'O'");
19022 return;
19025 putc ('.', file);
19026 #endif
19027 return;
19029 case 'z':
19030 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
19032 /* Opcodes don't get size suffixes when using Intel syntax. */
19033 if (ASSEMBLER_DIALECT == ASM_INTEL)
19034 return;
19036 switch (GET_MODE_SIZE (GET_MODE (x)))
19038 case 1:
19039 putc ('b', file);
19040 return;
19042 case 2:
19043 putc ('w', file);
19044 return;
19046 case 4:
19047 putc ('l', file);
19048 return;
19050 case 8:
19051 putc ('q', file);
19052 return;
19054 default:
19055 output_operand_lossage ("invalid operand size for operand "
19056 "code 'z'");
19057 return;
19061 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
19062 warning (0, "non-integer operand used with operand code 'z'");
19063 /* FALLTHRU */
19065 case 'Z':
19066 /* 387 opcodes don't get size suffixes when using Intel syntax. */
19067 if (ASSEMBLER_DIALECT == ASM_INTEL)
19068 return;
19070 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
19072 switch (GET_MODE_SIZE (GET_MODE (x)))
19074 case 2:
19075 #ifdef HAVE_AS_IX86_FILDS
19076 putc ('s', file);
19077 #endif
19078 return;
19080 case 4:
19081 putc ('l', file);
19082 return;
19084 case 8:
19085 #ifdef HAVE_AS_IX86_FILDQ
19086 putc ('q', file);
19087 #else
19088 fputs ("ll", file);
19089 #endif
19090 return;
19092 default:
19093 break;
19096 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
19098 /* 387 opcodes don't get size suffixes
19099 if the operands are registers. */
19100 if (STACK_REG_P (x))
19101 return;
19103 switch (GET_MODE_SIZE (GET_MODE (x)))
19105 case 4:
19106 putc ('s', file);
19107 return;
19109 case 8:
19110 putc ('l', file);
19111 return;
19113 case 12:
19114 case 16:
19115 putc ('t', file);
19116 return;
19118 default:
19119 break;
19122 else
19124 output_operand_lossage ("invalid operand type used with "
19125 "operand code 'Z'");
19126 return;
19129 output_operand_lossage ("invalid operand size for operand code 'Z'");
19130 return;
19132 case 'd':
19133 case 'b':
19134 case 'w':
19135 case 'k':
19136 case 'q':
19137 case 'h':
19138 case 't':
19139 case 'g':
19140 case 'y':
19141 case 'x':
19142 case 'X':
19143 case 'P':
19144 case 'p':
19145 break;
19147 case 's':
19148 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
19150 ix86_print_operand (file, x, 0);
19151 fputs (", ", file);
19153 return;
19155 case 'Y':
19156 switch (GET_CODE (x))
19158 case NE:
19159 fputs ("neq", file);
19160 break;
19161 case EQ:
19162 fputs ("eq", file);
19163 break;
19164 case GE:
19165 case GEU:
19166 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
19167 break;
19168 case GT:
19169 case GTU:
19170 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
19171 break;
19172 case LE:
19173 case LEU:
19174 fputs ("le", file);
19175 break;
19176 case LT:
19177 case LTU:
19178 fputs ("lt", file);
19179 break;
19180 case UNORDERED:
19181 fputs ("unord", file);
19182 break;
19183 case ORDERED:
19184 fputs ("ord", file);
19185 break;
19186 case UNEQ:
19187 fputs ("ueq", file);
19188 break;
19189 case UNGE:
19190 fputs ("nlt", file);
19191 break;
19192 case UNGT:
19193 fputs ("nle", file);
19194 break;
19195 case UNLE:
19196 fputs ("ule", file);
19197 break;
19198 case UNLT:
19199 fputs ("ult", file);
19200 break;
19201 case LTGT:
19202 fputs ("une", file);
19203 break;
19204 default:
19205 output_operand_lossage ("operand is not a condition code, "
19206 "invalid operand code 'Y'");
19207 return;
19209 return;
19211 case 'D':
19212 /* A little bit of brain damage here: the SSE compare instructions
19213 use completely different names for the comparisons than the
19214 fp conditional moves do. */
19215 switch (GET_CODE (x))
19217 case UNEQ:
19218 if (TARGET_AVX)
19220 fputs ("eq_us", file);
19221 break;
19223 /* FALLTHRU */
19224 case EQ:
19225 fputs ("eq", file);
19226 break;
19227 case UNLT:
19228 if (TARGET_AVX)
19230 fputs ("nge", file);
19231 break;
19233 /* FALLTHRU */
19234 case LT:
19235 fputs ("lt", file);
19236 break;
19237 case UNLE:
19238 if (TARGET_AVX)
19240 fputs ("ngt", file);
19241 break;
19243 /* FALLTHRU */
19244 case LE:
19245 fputs ("le", file);
19246 break;
19247 case UNORDERED:
19248 fputs ("unord", file);
19249 break;
19250 case LTGT:
19251 if (TARGET_AVX)
19253 fputs ("neq_oq", file);
19254 break;
19256 /* FALLTHRU */
19257 case NE:
19258 fputs ("neq", file);
19259 break;
19260 case GE:
19261 if (TARGET_AVX)
19263 fputs ("ge", file);
19264 break;
19266 /* FALLTHRU */
19267 case UNGE:
19268 fputs ("nlt", file);
19269 break;
19270 case GT:
19271 if (TARGET_AVX)
19273 fputs ("gt", file);
19274 break;
19276 /* FALLTHRU */
19277 case UNGT:
19278 fputs ("nle", file);
19279 break;
19280 case ORDERED:
19281 fputs ("ord", file);
19282 break;
19283 default:
19284 output_operand_lossage ("operand is not a condition code, "
19285 "invalid operand code 'D'");
19286 return;
19288 return;
19290 case 'F':
19291 case 'f':
19292 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
19293 if (ASSEMBLER_DIALECT == ASM_ATT)
19294 putc ('.', file);
19295 gcc_fallthrough ();
19296 #endif
19298 case 'C':
19299 case 'c':
19300 if (!COMPARISON_P (x))
19302 output_operand_lossage ("operand is not a condition code, "
19303 "invalid operand code '%c'", code);
19304 return;
19306 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
19307 code == 'c' || code == 'f',
19308 code == 'F' || code == 'f',
19309 file);
19310 return;
19312 case 'H':
19313 if (!offsettable_memref_p (x))
19315 output_operand_lossage ("operand is not an offsettable memory "
19316 "reference, invalid operand code 'H'");
19317 return;
19319 /* It doesn't actually matter what mode we use here, as we're
19320 only going to use this for printing. */
19321 x = adjust_address_nv (x, DImode, 8);
19322 /* Output 'qword ptr' for intel assembler dialect. */
19323 if (ASSEMBLER_DIALECT == ASM_INTEL)
19324 code = 'q';
19325 break;
19327 case 'K':
19328 if (!CONST_INT_P (x))
19330 output_operand_lossage ("operand is not an integer, invalid "
19331 "operand code 'K'");
19332 return;
19335 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19336 #ifdef HAVE_AS_IX86_HLE
19337 fputs ("xacquire ", file);
19338 #else
19339 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19340 #endif
19341 else if (INTVAL (x) & IX86_HLE_RELEASE)
19342 #ifdef HAVE_AS_IX86_HLE
19343 fputs ("xrelease ", file);
19344 #else
19345 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19346 #endif
19347 /* We do not want to print the value of the operand. */
19348 return;
19350 case 'N':
19351 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19352 fputs ("{z}", file);
19353 return;
19355 case 'r':
19356 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19358 output_operand_lossage ("operand is not a specific integer, "
19359 "invalid operand code 'r'");
19360 return;
19363 if (ASSEMBLER_DIALECT == ASM_INTEL)
19364 fputs (", ", file);
19366 fputs ("{sae}", file);
19368 if (ASSEMBLER_DIALECT == ASM_ATT)
19369 fputs (", ", file);
19371 return;
19373 case 'R':
19374 if (!CONST_INT_P (x))
19376 output_operand_lossage ("operand is not an integer, invalid "
19377 "operand code 'R'");
19378 return;
19381 if (ASSEMBLER_DIALECT == ASM_INTEL)
19382 fputs (", ", file);
19384 switch (INTVAL (x))
19386 case ROUND_NEAREST_INT | ROUND_SAE:
19387 fputs ("{rn-sae}", file);
19388 break;
19389 case ROUND_NEG_INF | ROUND_SAE:
19390 fputs ("{rd-sae}", file);
19391 break;
19392 case ROUND_POS_INF | ROUND_SAE:
19393 fputs ("{ru-sae}", file);
19394 break;
19395 case ROUND_ZERO | ROUND_SAE:
19396 fputs ("{rz-sae}", file);
19397 break;
19398 default:
19399 output_operand_lossage ("operand is not a specific integer, "
19400 "invalid operand code 'R'");
19403 if (ASSEMBLER_DIALECT == ASM_ATT)
19404 fputs (", ", file);
19406 return;
19408 case '*':
19409 if (ASSEMBLER_DIALECT == ASM_ATT)
19410 putc ('*', file);
19411 return;
19413 case '&':
19415 const char *name = get_some_local_dynamic_name ();
19416 if (name == NULL)
19417 output_operand_lossage ("'%%&' used without any "
19418 "local dynamic TLS references");
19419 else
19420 assemble_name (file, name);
19421 return;
19424 case '+':
19426 rtx x;
19428 if (!optimize
19429 || optimize_function_for_size_p (cfun)
19430 || !TARGET_BRANCH_PREDICTION_HINTS)
19431 return;
19433 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19434 if (x)
19436 int pred_val = profile_probability::from_reg_br_prob_note
19437 (XINT (x, 0)).to_reg_br_prob_base ();
19439 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19440 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19442 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19443 bool cputaken
19444 = final_forward_branch_p (current_output_insn) == 0;
19446 /* Emit hints only in the case where the default branch prediction
19447 heuristics would fail. */
19448 if (taken != cputaken)
19450 /* We use 3e (DS) prefix for taken branches and
19451 2e (CS) prefix for not taken branches. */
19452 if (taken)
19453 fputs ("ds ; ", file);
19454 else
19455 fputs ("cs ; ", file);
19459 return;
19462 case ';':
19463 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19464 putc (';', file);
19465 #endif
19466 return;
19468 case '~':
19469 putc (TARGET_AVX2 ? 'i' : 'f', file);
19470 return;
19472 case '^':
19473 if (TARGET_64BIT && Pmode != word_mode)
19474 fputs ("addr32 ", file);
19475 return;
19477 case '!':
19478 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19479 fputs ("bnd ", file);
19480 return;
19482 default:
19483 output_operand_lossage ("invalid operand code '%c'", code);
19487 if (REG_P (x))
19488 print_reg (x, code, file);
19490 else if (MEM_P (x))
19492 rtx addr = XEXP (x, 0);
19494 /* No `byte ptr' prefix for call instructions ... */
19495 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19497 machine_mode mode = GET_MODE (x);
19498 const char *size;
19500 /* Check for explicit size override codes. */
19501 if (code == 'b')
19502 size = "BYTE";
19503 else if (code == 'w')
19504 size = "WORD";
19505 else if (code == 'k')
19506 size = "DWORD";
19507 else if (code == 'q')
19508 size = "QWORD";
19509 else if (code == 'x')
19510 size = "XMMWORD";
19511 else if (code == 't')
19512 size = "YMMWORD";
19513 else if (code == 'g')
19514 size = "ZMMWORD";
19515 else if (mode == BLKmode)
19516 /* ... or BLKmode operands, when not overridden. */
19517 size = NULL;
19518 else
19519 switch (GET_MODE_SIZE (mode))
19521 case 1: size = "BYTE"; break;
19522 case 2: size = "WORD"; break;
19523 case 4: size = "DWORD"; break;
19524 case 8: size = "QWORD"; break;
19525 case 12: size = "TBYTE"; break;
19526 case 16:
19527 if (mode == XFmode)
19528 size = "TBYTE";
19529 else
19530 size = "XMMWORD";
19531 break;
19532 case 32: size = "YMMWORD"; break;
19533 case 64: size = "ZMMWORD"; break;
19534 default:
19535 gcc_unreachable ();
19537 if (size)
19539 fputs (size, file);
19540 fputs (" PTR ", file);
19544 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19545 output_operand_lossage ("invalid constraints for operand");
19546 else
19547 ix86_print_operand_address_as
19548 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19551 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19553 long l;
19555 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19557 if (ASSEMBLER_DIALECT == ASM_ATT)
19558 putc ('$', file);
19559 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19560 if (code == 'q')
19561 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19562 (unsigned long long) (int) l);
19563 else
19564 fprintf (file, "0x%08x", (unsigned int) l);
19567 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19569 long l[2];
19571 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19573 if (ASSEMBLER_DIALECT == ASM_ATT)
19574 putc ('$', file);
19575 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19578 /* These float cases don't actually occur as immediate operands. */
19579 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19581 char dstr[30];
19583 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19584 fputs (dstr, file);
19587 else
19589 /* We have patterns that allow zero sets of memory, for instance.
19590 In 64-bit mode, we should probably support all 8-byte vectors,
19591 since we can in fact encode that into an immediate. */
19592 if (GET_CODE (x) == CONST_VECTOR)
19594 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19595 x = const0_rtx;
19598 if (code != 'P' && code != 'p')
19600 if (CONST_INT_P (x))
19602 if (ASSEMBLER_DIALECT == ASM_ATT)
19603 putc ('$', file);
19605 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19606 || GET_CODE (x) == LABEL_REF)
19608 if (ASSEMBLER_DIALECT == ASM_ATT)
19609 putc ('$', file);
19610 else
19611 fputs ("OFFSET FLAT:", file);
19614 if (CONST_INT_P (x))
19615 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19616 else if (flag_pic || MACHOPIC_INDIRECT)
19617 output_pic_addr_const (file, x, code);
19618 else
19619 output_addr_const (file, x);
19623 static bool
19624 ix86_print_operand_punct_valid_p (unsigned char code)
19626 return (code == '*' || code == '+' || code == '&' || code == ';'
19627 || code == '~' || code == '^' || code == '!');
19630 /* Print a memory operand whose address is ADDR. */
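/* For example, a base+index*scale+disp address comes out as
   "disp(%base,%index,scale)" in AT&T syntax and as
   "[base+disp+index*scale]" in Intel syntax.  */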
19632 static void
19633 ix86_print_operand_address_as (FILE *file, rtx addr,
19634 addr_space_t as, bool no_rip)
19636 struct ix86_address parts;
19637 rtx base, index, disp;
19638 int scale;
19639 int ok;
19640 bool vsib = false;
19641 int code = 0;
19643 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19645 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19646 gcc_assert (parts.index == NULL_RTX);
19647 parts.index = XVECEXP (addr, 0, 1);
19648 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19649 addr = XVECEXP (addr, 0, 0);
19650 vsib = true;
19652 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19654 gcc_assert (TARGET_64BIT);
19655 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19656 code = 'q';
19658 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19660 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19661 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19662 if (parts.base != NULL_RTX)
19664 parts.index = parts.base;
19665 parts.scale = 1;
19667 parts.base = XVECEXP (addr, 0, 0);
19668 addr = XVECEXP (addr, 0, 0);
19670 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19672 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19673 gcc_assert (parts.index == NULL_RTX);
19674 parts.index = XVECEXP (addr, 0, 1);
19675 addr = XVECEXP (addr, 0, 0);
19677 else
19678 ok = ix86_decompose_address (addr, &parts);
19680 gcc_assert (ok);
19682 base = parts.base;
19683 index = parts.index;
19684 disp = parts.disp;
19685 scale = parts.scale;
19687 if (ADDR_SPACE_GENERIC_P (as))
19688 as = parts.seg;
19689 else
19690 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19692 if (!ADDR_SPACE_GENERIC_P (as))
19694 const char *string;
19696 if (as == ADDR_SPACE_SEG_FS)
19697 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19698 else if (as == ADDR_SPACE_SEG_GS)
19699 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19700 else
19701 gcc_unreachable ();
19702 fputs (string, file);
19705 /* Use one byte shorter RIP relative addressing for 64bit mode. */
19706 if (TARGET_64BIT && !base && !index && !no_rip)
19708 rtx symbol = disp;
19710 if (GET_CODE (disp) == CONST
19711 && GET_CODE (XEXP (disp, 0)) == PLUS
19712 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19713 symbol = XEXP (XEXP (disp, 0), 0);
19715 if (GET_CODE (symbol) == LABEL_REF
19716 || (GET_CODE (symbol) == SYMBOL_REF
19717 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19718 base = pc_rtx;
19721 if (!base && !index)
19723 /* A displacement-only address requires special attention. */
19724 if (CONST_INT_P (disp))
19726 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
19727 fputs ("ds:", file);
19728 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19730 /* Load the external function address via the GOT slot to avoid PLT. */
19731 else if (GET_CODE (disp) == CONST
19732 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19733 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19734 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19735 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19736 output_pic_addr_const (file, disp, 0);
19737 else if (flag_pic)
19738 output_pic_addr_const (file, disp, 0);
19739 else
19740 output_addr_const (file, disp);
19742 else
19744 /* Print SImode register names to force addr32 prefix. */
19745 if (SImode_address_operand (addr, VOIDmode))
19747 if (flag_checking)
19749 gcc_assert (TARGET_64BIT);
19750 switch (GET_CODE (addr))
19752 case SUBREG:
19753 gcc_assert (GET_MODE (addr) == SImode);
19754 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19755 break;
19756 case ZERO_EXTEND:
19757 case AND:
19758 gcc_assert (GET_MODE (addr) == DImode);
19759 break;
19760 default:
19761 gcc_unreachable ();
19764 gcc_assert (!code);
19765 code = 'k';
19767 else if (code == 0
19768 && TARGET_X32
19769 && disp
19770 && CONST_INT_P (disp)
19771 && INTVAL (disp) < -16*1024*1024)
19773 /* X32 runs in 64-bit mode, where displacement, DISP, in
19774 address DISP(%r64), is encoded as 32-bit immediate sign-
19775 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19776 address is %r64 + 0xffffffffbffffd00. When %r64 <
19777 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19778 which is invalid for x32. The correct address is %r64
19779 - 0x40000300 == 0xf7ffdd64. To properly encode
19780 -0x40000300(%r64) for x32, we zero-extend negative
19781 displacement by forcing addr32 prefix which truncates
19782 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19783 zero-extend all negative displacements, including -1(%rsp).
19784 However, for small negative displacements, sign-extension
19785 won't cause overflow. We only zero-extend negative
19786 displacements if they are < -16*1024*1024, which is also used
19787 to check legitimate address displacements for PIC. */
19788 code = 'k';
19791 if (ASSEMBLER_DIALECT == ASM_ATT)
19793 if (disp)
19795 if (flag_pic)
19796 output_pic_addr_const (file, disp, 0);
19797 else if (GET_CODE (disp) == LABEL_REF)
19798 output_asm_label (disp);
19799 else
19800 output_addr_const (file, disp);
19803 putc ('(', file);
19804 if (base)
19805 print_reg (base, code, file);
19806 if (index)
19808 putc (',', file);
19809 print_reg (index, vsib ? 0 : code, file);
19810 if (scale != 1 || vsib)
19811 fprintf (file, ",%d", scale);
19813 putc (')', file);
19815 else
19817 rtx offset = NULL_RTX;
19819 if (disp)
19821 /* Pull out the offset of a symbol; print any symbol itself. */
19822 if (GET_CODE (disp) == CONST
19823 && GET_CODE (XEXP (disp, 0)) == PLUS
19824 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19826 offset = XEXP (XEXP (disp, 0), 1);
19827 disp = gen_rtx_CONST (VOIDmode,
19828 XEXP (XEXP (disp, 0), 0));
19831 if (flag_pic)
19832 output_pic_addr_const (file, disp, 0);
19833 else if (GET_CODE (disp) == LABEL_REF)
19834 output_asm_label (disp);
19835 else if (CONST_INT_P (disp))
19836 offset = disp;
19837 else
19838 output_addr_const (file, disp);
19841 putc ('[', file);
19842 if (base)
19844 print_reg (base, code, file);
19845 if (offset)
19847 if (INTVAL (offset) >= 0)
19848 putc ('+', file);
19849 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19852 else if (offset)
19853 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19854 else
19855 putc ('0', file);
19857 if (index)
19859 putc ('+', file);
19860 print_reg (index, vsib ? 0 : code, file);
19861 if (scale != 1 || vsib)
19862 fprintf (file, "*%d", scale);
19864 putc (']', file);
19869 static void
19870 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19872 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19875 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
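/* This handles the TLS and Mach-O specific UNSPEC wrappers that can appear
   inside constant addresses; e.g. an UNSPEC_GOTTPOFF operand is printed as
   "symbol@gottpoff".  */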
19877 static bool
19878 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19880 rtx op;
19882 if (GET_CODE (x) != UNSPEC)
19883 return false;
19885 op = XVECEXP (x, 0, 0);
19886 switch (XINT (x, 1))
19888 case UNSPEC_GOTTPOFF:
19889 output_addr_const (file, op);
19890 /* FIXME: This might be @TPOFF in Sun ld. */
19891 fputs ("@gottpoff", file);
19892 break;
19893 case UNSPEC_TPOFF:
19894 output_addr_const (file, op);
19895 fputs ("@tpoff", file);
19896 break;
19897 case UNSPEC_NTPOFF:
19898 output_addr_const (file, op);
19899 if (TARGET_64BIT)
19900 fputs ("@tpoff", file);
19901 else
19902 fputs ("@ntpoff", file);
19903 break;
19904 case UNSPEC_DTPOFF:
19905 output_addr_const (file, op);
19906 fputs ("@dtpoff", file);
19907 break;
19908 case UNSPEC_GOTNTPOFF:
19909 output_addr_const (file, op);
19910 if (TARGET_64BIT)
19911 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19912 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19913 else
19914 fputs ("@gotntpoff", file);
19915 break;
19916 case UNSPEC_INDNTPOFF:
19917 output_addr_const (file, op);
19918 fputs ("@indntpoff", file);
19919 break;
19920 #if TARGET_MACHO
19921 case UNSPEC_MACHOPIC_OFFSET:
19922 output_addr_const (file, op);
19923 putc ('-', file);
19924 machopic_output_function_base_name (file);
19925 break;
19926 #endif
19928 default:
19929 return false;
19932 return true;
19935 /* Split one or more double-mode RTL references into pairs of half-mode
19936 references. The RTL can be REG, offsettable MEM, integer constant, or
19937 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19938 split and "num" is its length. lo_half and hi_half are output arrays
19939 that parallel "operands". */
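/* For example, splitting a DImode register operand on a 32-bit target
   produces two SImode halves: the low word at byte offset 0 and the high
   word at byte offset 4, matching the little-endian layout of the target.  */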
19941 void
19942 split_double_mode (machine_mode mode, rtx operands[],
19943 int num, rtx lo_half[], rtx hi_half[])
19945 machine_mode half_mode;
19946 unsigned int byte;
19948 switch (mode)
19950 case E_TImode:
19951 half_mode = DImode;
19952 break;
19953 case E_DImode:
19954 half_mode = SImode;
19955 break;
19956 default:
19957 gcc_unreachable ();
19960 byte = GET_MODE_SIZE (half_mode);
19962 while (num--)
19964 rtx op = operands[num];
19966 /* simplify_subreg refuses to split volatile memory addresses,
19967 but we still have to handle them. */
19968 if (MEM_P (op))
19970 lo_half[num] = adjust_address (op, half_mode, 0);
19971 hi_half[num] = adjust_address (op, half_mode, byte);
19973 else
19975 lo_half[num] = simplify_gen_subreg (half_mode, op,
19976 GET_MODE (op) == VOIDmode
19977 ? mode : GET_MODE (op), 0);
19978 hi_half[num] = simplify_gen_subreg (half_mode, op,
19979 GET_MODE (op) == VOIDmode
19980 ? mode : GET_MODE (op), byte);
19985 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19986 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19987 is the expression of the binary operation. The output may either be
19988 emitted here, or returned to the caller, like all output_* functions.
19990 There is no guarantee that the operands are the same mode, as they
19991 might be within FLOAT or FLOAT_EXTEND expressions. */
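/* As a concrete example, an SFmode SSE addition returns the template
   "vaddss\t{%2, %1, %0|%0, %1, %2}" when AVX is enabled and
   "addss\t{%2, %0|%0, %2}" otherwise.  */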
19993 #ifndef SYSV386_COMPAT
19994 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19995 wants to fix the assemblers because that causes incompatibility
19996 with gcc. No-one wants to fix gcc because that causes
19997 incompatibility with assemblers... You can use the option of
19998 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19999 #define SYSV386_COMPAT 1
20000 #endif
20002 const char *
20003 output_387_binary_op (rtx_insn *insn, rtx *operands)
20005 static char buf[40];
20006 const char *p;
20007 const char *ssep;
20008 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
20010 /* Even if we do not want to check the inputs, this documents the input
20011 constraints, which helps in understanding the following code. */
20012 if (flag_checking)
20014 if (STACK_REG_P (operands[0])
20015 && ((REG_P (operands[1])
20016 && REGNO (operands[0]) == REGNO (operands[1])
20017 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
20018 || (REG_P (operands[2])
20019 && REGNO (operands[0]) == REGNO (operands[2])
20020 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
20021 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
20022 ; /* ok */
20023 else
20024 gcc_assert (is_sse);
20027 switch (GET_CODE (operands[3]))
20029 case PLUS:
20030 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20031 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20032 p = "fiadd";
20033 else
20034 p = "fadd";
20035 ssep = "vadd";
20036 break;
20038 case MINUS:
20039 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20040 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20041 p = "fisub";
20042 else
20043 p = "fsub";
20044 ssep = "vsub";
20045 break;
20047 case MULT:
20048 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20049 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20050 p = "fimul";
20051 else
20052 p = "fmul";
20053 ssep = "vmul";
20054 break;
20056 case DIV:
20057 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20058 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20059 p = "fidiv";
20060 else
20061 p = "fdiv";
20062 ssep = "vdiv";
20063 break;
20065 default:
20066 gcc_unreachable ();
20069 if (is_sse)
20071 if (TARGET_AVX)
20073 strcpy (buf, ssep);
20074 if (GET_MODE (operands[0]) == SFmode)
20075 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
20076 else
20077 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
20079 else
20081 strcpy (buf, ssep + 1);
20082 if (GET_MODE (operands[0]) == SFmode)
20083 strcat (buf, "ss\t{%2, %0|%0, %2}");
20084 else
20085 strcat (buf, "sd\t{%2, %0|%0, %2}");
20087 return buf;
20089 strcpy (buf, p);
20091 switch (GET_CODE (operands[3]))
20093 case MULT:
20094 case PLUS:
20095 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
20096 std::swap (operands[1], operands[2]);
20098 /* We now know operands[0] == operands[1]. */
20100 if (MEM_P (operands[2]))
20102 p = "%Z2\t%2";
20103 break;
20106 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
20108 if (STACK_TOP_P (operands[0]))
20109 /* How is it that we are storing to a dead operand[2]?
20110 Well, presumably operands[1] is dead too. We can't
20111 store the result to st(0) as st(0) gets popped on this
20112 instruction. Instead store to operands[2] (which I
20113 think has to be st(1)). st(1) will be popped later.
20114 gcc <= 2.8.1 didn't have this check and generated
20115 assembly code that the Unixware assembler rejected. */
20116 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
20117 else
20118 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
20119 break;
20122 if (STACK_TOP_P (operands[0]))
20123 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
20124 else
20125 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
20126 break;
20128 case MINUS:
20129 case DIV:
20130 if (MEM_P (operands[1]))
20132 p = "r%Z1\t%1";
20133 break;
20136 if (MEM_P (operands[2]))
20138 p = "%Z2\t%2";
20139 break;
20142 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
20144 #if SYSV386_COMPAT
20145 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
20146 derived assemblers, confusingly reverse the direction of
20147 the operation for fsub{r} and fdiv{r} when the
20148 destination register is not st(0). The Intel assembler
20149 doesn't have this brain damage. Read !SYSV386_COMPAT to
20150 figure out what the hardware really does. */
20151 if (STACK_TOP_P (operands[0]))
20152 p = "{p\t%0, %2|rp\t%2, %0}";
20153 else
20154 p = "{rp\t%2, %0|p\t%0, %2}";
20155 #else
20156 if (STACK_TOP_P (operands[0]))
20157 /* As above for fmul/fadd, we can't store to st(0). */
20158 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
20159 else
20160 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
20161 #endif
20162 break;
20165 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
20167 #if SYSV386_COMPAT
20168 if (STACK_TOP_P (operands[0]))
20169 p = "{rp\t%0, %1|p\t%1, %0}";
20170 else
20171 p = "{p\t%1, %0|rp\t%0, %1}";
20172 #else
20173 if (STACK_TOP_P (operands[0]))
20174 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
20175 else
20176 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
20177 #endif
20178 break;
20181 if (STACK_TOP_P (operands[0]))
20183 if (STACK_TOP_P (operands[1]))
20184 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
20185 else
20186 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
20187 break;
20189 else if (STACK_TOP_P (operands[1]))
20191 #if SYSV386_COMPAT
20192 p = "{\t%1, %0|r\t%0, %1}";
20193 #else
20194 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
20195 #endif
20197 else
20199 #if SYSV386_COMPAT
20200 p = "{r\t%2, %0|\t%0, %2}";
20201 #else
20202 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
20203 #endif
20205 break;
20207 default:
20208 gcc_unreachable ();
20211 strcat (buf, p);
20212 return buf;
20215 /* Return needed mode for entity in optimize_mode_switching pass. */
20217 static int
20218 ix86_dirflag_mode_needed (rtx_insn *insn)
20220 if (CALL_P (insn))
20222 if (cfun->machine->func_type == TYPE_NORMAL)
20223 return X86_DIRFLAG_ANY;
20224 else
20225 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
20226 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
20229 if (recog_memoized (insn) < 0)
20230 return X86_DIRFLAG_ANY;
20232 if (get_attr_type (insn) == TYPE_STR)
20234 /* Emit cld instruction if stringops are used in the function. */
20235 if (cfun->machine->func_type == TYPE_NORMAL)
20236 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
20237 else
20238 return X86_DIRFLAG_RESET;
20241 return X86_DIRFLAG_ANY;
20244 /* Check if a 256bit AVX register is referenced inside of EXP. */
20246 static bool
20247 ix86_check_avx256_register (const_rtx exp)
20249 if (SUBREG_P (exp))
20250 exp = SUBREG_REG (exp);
20252 return (REG_P (exp)
20253 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
20256 /* Return needed mode for entity in optimize_mode_switching pass. */
20258 static int
20259 ix86_avx_u128_mode_needed (rtx_insn *insn)
20261 if (CALL_P (insn))
20263 rtx link;
20265 /* Needed mode is set to AVX_U128_CLEAN if there are
20266 no 256bit modes used in function arguments. */
20267 for (link = CALL_INSN_FUNCTION_USAGE (insn);
20268 link;
20269 link = XEXP (link, 1))
20271 if (GET_CODE (XEXP (link, 0)) == USE)
20273 rtx arg = XEXP (XEXP (link, 0), 0);
20275 if (ix86_check_avx256_register (arg))
20276 return AVX_U128_DIRTY;
20280 return AVX_U128_CLEAN;
20283 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
20284 changes state only when a 256bit register is written to, but we need
20285 to prevent the compiler from moving the optimal insertion point above
20286 any read from a 256bit register.  */
20287 subrtx_iterator::array_type array;
20288 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
20289 if (ix86_check_avx256_register (*iter))
20290 return AVX_U128_DIRTY;
20292 return AVX_U128_ANY;
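/* A minimal sketch of the AVX_U128 entity: AVX_U128_DIRTY means the upper
   128 bits of some ymm register may be non-zero, AVX_U128_CLEAN means a
   vzeroupper/vzeroall has zeroed them.  On transitions to CLEAN the pass
   emits vzeroupper (see ix86_emit_mode_set below) to avoid the AVX/SSE
   transition penalties on processors that keep the upper halves live.  */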
20295 /* Return mode that i387 must be switched into
20296 prior to the execution of insn. */
20298 static int
20299 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20301 enum attr_i387_cw mode;
20303 /* The mode UNINITIALIZED is used to store the control word after a
20304 function call or ASM pattern.  The mode ANY specifies that the function
20305 has no requirements on the control word and makes no changes in the
20306 bits we are interested in.  */
20308 if (CALL_P (insn)
20309 || (NONJUMP_INSN_P (insn)
20310 && (asm_noperands (PATTERN (insn)) >= 0
20311 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20312 return I387_CW_UNINITIALIZED;
20314 if (recog_memoized (insn) < 0)
20315 return I387_CW_ANY;
20317 mode = get_attr_i387_cw (insn);
20319 switch (entity)
20321 case I387_TRUNC:
20322 if (mode == I387_CW_TRUNC)
20323 return mode;
20324 break;
20326 case I387_FLOOR:
20327 if (mode == I387_CW_FLOOR)
20328 return mode;
20329 break;
20331 case I387_CEIL:
20332 if (mode == I387_CW_CEIL)
20333 return mode;
20334 break;
20336 case I387_MASK_PM:
20337 if (mode == I387_CW_MASK_PM)
20338 return mode;
20339 break;
20341 default:
20342 gcc_unreachable ();
20345 return I387_CW_ANY;
20348 /* Return mode that entity must be switched into
20349 prior to the execution of insn. */
20351 static int
20352 ix86_mode_needed (int entity, rtx_insn *insn)
20354 switch (entity)
20356 case X86_DIRFLAG:
20357 return ix86_dirflag_mode_needed (insn);
20358 case AVX_U128:
20359 return ix86_avx_u128_mode_needed (insn);
20360 case I387_TRUNC:
20361 case I387_FLOOR:
20362 case I387_CEIL:
20363 case I387_MASK_PM:
20364 return ix86_i387_mode_needed (entity, insn);
20365 default:
20366 gcc_unreachable ();
20368 return 0;
20371 /* Check if a 256bit AVX register is referenced in stores. */
20373 static void
20374 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20376 if (ix86_check_avx256_register (dest))
20378 bool *used = (bool *) data;
20379 *used = true;
20383 /* Calculate mode of upper 128bit AVX registers after the insn. */
20385 static int
20386 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20388 rtx pat = PATTERN (insn);
20390 if (vzeroupper_operation (pat, VOIDmode)
20391 || vzeroall_operation (pat, VOIDmode))
20392 return AVX_U128_CLEAN;
20394 /* We know that the state is clean after a CALL insn if no 256bit
20395 register is used for the function return value.  */
20396 if (CALL_P (insn))
20398 bool avx_reg256_found = false;
20399 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20401 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20404 /* Otherwise, return current mode. Remember that if insn
20405 references AVX 256bit registers, the mode was already changed
20406 to DIRTY from MODE_NEEDED. */
20407 return mode;
20410 /* Return the mode that an insn results in. */
20412 static int
20413 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20415 switch (entity)
20417 case X86_DIRFLAG:
20418 return mode;
20419 case AVX_U128:
20420 return ix86_avx_u128_mode_after (mode, insn);
20421 case I387_TRUNC:
20422 case I387_FLOOR:
20423 case I387_CEIL:
20424 case I387_MASK_PM:
20425 return mode;
20426 default:
20427 gcc_unreachable ();
20431 static int
20432 ix86_dirflag_mode_entry (void)
20434 /* For TARGET_CLD or in the interrupt handler we can't assume
20435 direction flag state at function entry. */
20436 if (TARGET_CLD
20437 || cfun->machine->func_type != TYPE_NORMAL)
20438 return X86_DIRFLAG_ANY;
20440 return X86_DIRFLAG_RESET;
20443 static int
20444 ix86_avx_u128_mode_entry (void)
20446 tree arg;
20448 /* Entry mode is set to AVX_U128_DIRTY if there are
20449 256bit modes used in function arguments. */
20450 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20451 arg = TREE_CHAIN (arg))
20453 rtx incoming = DECL_INCOMING_RTL (arg);
20455 if (incoming && ix86_check_avx256_register (incoming))
20456 return AVX_U128_DIRTY;
20459 return AVX_U128_CLEAN;
20462 /* Return a mode that ENTITY is assumed to be
20463 switched to at function entry. */
20465 static int
20466 ix86_mode_entry (int entity)
20468 switch (entity)
20470 case X86_DIRFLAG:
20471 return ix86_dirflag_mode_entry ();
20472 case AVX_U128:
20473 return ix86_avx_u128_mode_entry ();
20474 case I387_TRUNC:
20475 case I387_FLOOR:
20476 case I387_CEIL:
20477 case I387_MASK_PM:
20478 return I387_CW_ANY;
20479 default:
20480 gcc_unreachable ();
20484 static int
20485 ix86_avx_u128_mode_exit (void)
20487 rtx reg = crtl->return_rtx;
20489 /* Exit mode is set to AVX_U128_DIRTY if there are
20490 256bit modes used in the function return register. */
20491 if (reg && ix86_check_avx256_register (reg))
20492 return AVX_U128_DIRTY;
20494 return AVX_U128_CLEAN;
20497 /* Return a mode that ENTITY is assumed to be
20498 switched to at function exit. */
20500 static int
20501 ix86_mode_exit (int entity)
20503 switch (entity)
20505 case X86_DIRFLAG:
20506 return X86_DIRFLAG_ANY;
20507 case AVX_U128:
20508 return ix86_avx_u128_mode_exit ();
20509 case I387_TRUNC:
20510 case I387_FLOOR:
20511 case I387_CEIL:
20512 case I387_MASK_PM:
20513 return I387_CW_ANY;
20514 default:
20515 gcc_unreachable ();
20519 static int
20520 ix86_mode_priority (int, int n)
20522 return n;
20525 /* Output code to initialize control word copies used by trunc?f?i and
20526 rounding patterns.  The current control word is saved in SLOT_CW_STORED,
20527 and a copy modified according to MODE is stored in the stack slot for MODE.  */
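/* For reference, the masks used below correspond to the x87 control word
   layout: bits 11:10 are the rounding control (00 = to nearest, 01 = down,
   10 = up, 11 = truncate), so 0x0c00, 0x0400 and 0x0800 select truncate,
   floor and ceil, and bit 5 (0x0020) is the precision exception mask used
   for nearbyint.  */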
20529 static void
20530 emit_i387_cw_initialization (int mode)
20532 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20533 rtx new_mode;
20535 enum ix86_stack_slot slot;
20537 rtx reg = gen_reg_rtx (HImode);
20539 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20540 emit_move_insn (reg, copy_rtx (stored_mode));
20542 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20543 || optimize_insn_for_size_p ())
20545 switch (mode)
20547 case I387_CW_TRUNC:
20548 /* round toward zero (truncate) */
20549 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20550 slot = SLOT_CW_TRUNC;
20551 break;
20553 case I387_CW_FLOOR:
20554 /* round down toward -oo */
20555 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20556 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20557 slot = SLOT_CW_FLOOR;
20558 break;
20560 case I387_CW_CEIL:
20561 /* round up toward +oo */
20562 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20563 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20564 slot = SLOT_CW_CEIL;
20565 break;
20567 case I387_CW_MASK_PM:
20568 /* mask precision exception for nearbyint() */
20569 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20570 slot = SLOT_CW_MASK_PM;
20571 break;
20573 default:
20574 gcc_unreachable ();
20577 else
20579 switch (mode)
20581 case I387_CW_TRUNC:
20582 /* round toward zero (truncate) */
20583 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20584 slot = SLOT_CW_TRUNC;
20585 break;
20587 case I387_CW_FLOOR:
20588 /* round down toward -oo */
20589 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20590 slot = SLOT_CW_FLOOR;
20591 break;
20593 case I387_CW_CEIL:
20594 /* round up toward +oo */
20595 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20596 slot = SLOT_CW_CEIL;
20597 break;
20599 case I387_CW_MASK_PM:
20600 /* mask precision exception for nearbyint() */
20601 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20602 slot = SLOT_CW_MASK_PM;
20603 break;
20605 default:
20606 gcc_unreachable ();
20610 gcc_assert (slot < MAX_386_STACK_LOCALS);
20612 new_mode = assign_386_stack_local (HImode, slot);
20613 emit_move_insn (new_mode, reg);
20616 /* Emit vzeroupper. */
20618 void
20619 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20621 int i;
20623 /* Cancel automatic vzeroupper insertion if there are
20624 live call-saved SSE registers at the insertion point. */
20626 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20627 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20628 return;
20630 if (TARGET_64BIT)
20631 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20632 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20633 return;
20635 emit_insn (gen_avx_vzeroupper ());
20640 /* Generate one or more insns to set ENTITY to MODE.  REGS_LIVE
20641 is the set of hard registers live at the point where the insn(s)
20642 are to be inserted.  */
20644 static void
20645 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20646 HARD_REG_SET regs_live)
20648 switch (entity)
20650 case X86_DIRFLAG:
20651 if (mode == X86_DIRFLAG_RESET)
20652 emit_insn (gen_cld ());
20653 break;
20654 case AVX_U128:
20655 if (mode == AVX_U128_CLEAN)
20656 ix86_avx_emit_vzeroupper (regs_live);
20657 break;
20658 case I387_TRUNC:
20659 case I387_FLOOR:
20660 case I387_CEIL:
20661 case I387_MASK_PM:
20662 if (mode != I387_CW_ANY
20663 && mode != I387_CW_UNINITIALIZED)
20664 emit_i387_cw_initialization (mode);
20665 break;
20666 default:
20667 gcc_unreachable ();
20671 /* Output code for INSN to convert a float to a signed int. OPERANDS
20672 are the insn operands. The output may be [HSD]Imode and the input
20673 operand may be [SDX]Fmode. */
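/* A minimal sketch of the output for a DImode truncation where the stack
   top survives (AT&T syntax, illustrative operand names):
       fld     %st(0)
       fldcw   trunc_cw
       fistp   dst
       fldcw   saved_cw
   With fisttp (SSE3) the control word dance is avoided entirely.  */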
20675 const char *
20676 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20678 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20679 int dimode_p = GET_MODE (operands[0]) == DImode;
20680 int round_mode = get_attr_i387_cw (insn);
20682 /* Jump through a hoop or two for DImode, since the hardware has no
20683 non-popping instruction. We used to do this a different way, but
20684 that was somewhat fragile and broke with post-reload splitters. */
20685 if ((dimode_p || fisttp) && !stack_top_dies)
20686 output_asm_insn ("fld\t%y1", operands);
20688 gcc_assert (STACK_TOP_P (operands[1]));
20689 gcc_assert (MEM_P (operands[0]));
20690 gcc_assert (GET_MODE (operands[1]) != TFmode);
20692 if (fisttp)
20693 output_asm_insn ("fisttp%Z0\t%0", operands);
20694 else
20696 if (round_mode != I387_CW_ANY)
20697 output_asm_insn ("fldcw\t%3", operands);
20698 if (stack_top_dies || dimode_p)
20699 output_asm_insn ("fistp%Z0\t%0", operands);
20700 else
20701 output_asm_insn ("fist%Z0\t%0", operands);
20702 if (round_mode != I387_CW_ANY)
20703 output_asm_insn ("fldcw\t%2", operands);
20706 return "";
20709 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20710 have the values zero or one, indicates the ffreep insn's operand
20711 from the OPERANDS array. */
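/* Note on the fallback below (a sketch based on the opcode encoding): when
   the assembler does not know the ffreep mnemonic, the two raw bytes
   0xDF 0xC0+i are emitted instead via ASM_SHORT, e.g. "0xc0df" stored
   little-endian is ffreep %st(0).  */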
20713 static const char *
20714 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20716 if (TARGET_USE_FFREEP)
20717 #ifdef HAVE_AS_IX86_FFREEP
20718 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20719 #else
20721 static char retval[32];
20722 int regno = REGNO (operands[opno]);
20724 gcc_assert (STACK_REGNO_P (regno));
20726 regno -= FIRST_STACK_REG;
20728 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20729 return retval;
20731 #endif
20733 return opno ? "fstp\t%y1" : "fstp\t%y0";
20737 /* Output code for INSN to compare OPERANDS.  EFLAGS_P is true when fcomi
20738 should be used.  UNORDERED_P is true when fucom should be used.  */
20740 const char *
20741 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20743 int stack_top_dies;
20744 rtx cmp_op0, cmp_op1;
20745 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20747 if (eflags_p)
20749 cmp_op0 = operands[0];
20750 cmp_op1 = operands[1];
20752 else
20754 cmp_op0 = operands[1];
20755 cmp_op1 = operands[2];
20758 if (is_sse)
20760 if (GET_MODE (operands[0]) == SFmode)
20761 if (unordered_p)
20762 return "%vucomiss\t{%1, %0|%0, %1}";
20763 else
20764 return "%vcomiss\t{%1, %0|%0, %1}";
20765 else
20766 if (unordered_p)
20767 return "%vucomisd\t{%1, %0|%0, %1}";
20768 else
20769 return "%vcomisd\t{%1, %0|%0, %1}";
20772 gcc_assert (STACK_TOP_P (cmp_op0));
20774 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20776 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20778 if (stack_top_dies)
20780 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20781 return output_387_ffreep (operands, 1);
20783 else
20784 return "ftst\n\tfnstsw\t%0";
20787 if (STACK_REG_P (cmp_op1)
20788 && stack_top_dies
20789 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20790 && REGNO (cmp_op1) != FIRST_STACK_REG)
20792 /* If the top of the 387 stack dies, and the other operand
20793 is also a stack register that dies, then this must be a
20794 `fcompp' float compare.  */
20796 if (eflags_p)
20798 /* There is no double popping fcomi variant. Fortunately,
20799 eflags is immune from the fstp's cc clobbering. */
20800 if (unordered_p)
20801 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20802 else
20803 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20804 return output_387_ffreep (operands, 0);
20806 else
20808 if (unordered_p)
20809 return "fucompp\n\tfnstsw\t%0";
20810 else
20811 return "fcompp\n\tfnstsw\t%0";
20814 else
20816 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20818 static const char * const alt[16] =
20820 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20821 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20822 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20823 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20825 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20826 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20827 NULL,
20828 NULL,
20830 "fcomi\t{%y1, %0|%0, %y1}",
20831 "fcomip\t{%y1, %0|%0, %y1}",
20832 "fucomi\t{%y1, %0|%0, %y1}",
20833 "fucomip\t{%y1, %0|%0, %y1}",
20835 NULL,
20836 NULL,
20837 NULL,
20838 NULL
20841 int mask;
20842 const char *ret;
20844 mask = eflags_p << 3;
20845 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20846 mask |= unordered_p << 1;
20847 mask |= stack_top_dies;
20849 gcc_assert (mask < 16);
20850 ret = alt[mask];
20851 gcc_assert (ret);
20853 return ret;
20857 void
20858 ix86_output_addr_vec_elt (FILE *file, int value)
20860 const char *directive = ASM_LONG;
20862 #ifdef ASM_QUAD
20863 if (TARGET_LP64)
20864 directive = ASM_QUAD;
20865 #else
20866 gcc_assert (!TARGET_64BIT);
20867 #endif
20869 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
20872 void
20873 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
20875 const char *directive = ASM_LONG;
20877 #ifdef ASM_QUAD
20878 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
20879 directive = ASM_QUAD;
20880 #else
20881 gcc_assert (!TARGET_64BIT);
20882 #endif
20883 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
20884 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
20885 fprintf (file, "%s%s%d-%s%d\n",
20886 directive, LPREFIX, value, LPREFIX, rel);
20887 else if (HAVE_AS_GOTOFF_IN_DATA)
20888 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
20889 #if TARGET_MACHO
20890 else if (TARGET_MACHO)
20892 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
20893 machopic_output_function_base_name (file);
20894 putc ('\n', file);
20896 #endif
20897 else
20898 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
20899 GOT_SYMBOL_NAME, LPREFIX, value);
20902 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20903 for the target. */
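/* A short rationale (informal): "xor reg, reg" is smaller and breaks
   dependencies but clobbers EFLAGS, so that form is wrapped in a parallel
   with a FLAGS_REG clobber below; "mov $0, reg" leaves the flags alone and
   is used when TARGET_USE_MOV0 asks for it and we are not optimizing for
   size.  */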
20905 void
20906 ix86_expand_clear (rtx dest)
20908 rtx tmp;
20910 /* We play register width games, which are only valid after reload. */
20911 gcc_assert (reload_completed);
20913 /* Avoid HImode and its attendant prefix byte. */
20914 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20915 dest = gen_rtx_REG (SImode, REGNO (dest));
20916 tmp = gen_rtx_SET (dest, const0_rtx);
20918 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20920 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20921 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20924 emit_insn (tmp);
20927 /* X is an unchanging MEM. If it is a constant pool reference, return
20928 the constant pool rtx, else NULL. */
20931 maybe_get_pool_constant (rtx x)
20933 x = ix86_delegitimize_address (XEXP (x, 0));
20935 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
20936 return get_pool_constant (x);
20938 return NULL_RTX;
20941 void
20942 ix86_expand_move (machine_mode mode, rtx operands[])
20944 rtx op0, op1;
20945 rtx tmp, addend = NULL_RTX;
20946 enum tls_model model;
20948 op0 = operands[0];
20949 op1 = operands[1];
20951 switch (GET_CODE (op1))
20953 case CONST:
20954 tmp = XEXP (op1, 0);
20956 if (GET_CODE (tmp) != PLUS
20957 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20958 break;
20960 op1 = XEXP (tmp, 0);
20961 addend = XEXP (tmp, 1);
20962 /* FALLTHRU */
20964 case SYMBOL_REF:
20965 model = SYMBOL_REF_TLS_MODEL (op1);
20967 if (model)
20968 op1 = legitimize_tls_address (op1, model, true);
20969 else if (ix86_force_load_from_GOT_p (op1))
20971 /* Load the external function address via GOT slot to avoid PLT. */
20972 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20973 (TARGET_64BIT
20974 ? UNSPEC_GOTPCREL
20975 : UNSPEC_GOT));
20976 op1 = gen_rtx_CONST (Pmode, op1);
20977 op1 = gen_const_mem (Pmode, op1);
20978 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20980 else
20982 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20983 if (tmp)
20985 op1 = tmp;
20986 if (!addend)
20987 break;
20989 else
20991 op1 = operands[1];
20992 break;
20996 if (addend)
20998 op1 = force_operand (op1, NULL_RTX);
20999 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
21000 op0, 1, OPTAB_DIRECT);
21002 else
21003 op1 = force_operand (op1, op0);
21005 if (op1 == op0)
21006 return;
21008 op1 = convert_to_mode (mode, op1, 1);
21010 default:
21011 break;
21014 if ((flag_pic || MACHOPIC_INDIRECT)
21015 && symbolic_operand (op1, mode))
21017 if (TARGET_MACHO && !TARGET_64BIT)
21019 #if TARGET_MACHO
21020 /* dynamic-no-pic */
21021 if (MACHOPIC_INDIRECT)
21023 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
21024 ? op0 : gen_reg_rtx (Pmode);
21025 op1 = machopic_indirect_data_reference (op1, temp);
21026 if (MACHOPIC_PURE)
21027 op1 = machopic_legitimize_pic_address (op1, mode,
21028 temp == op1 ? 0 : temp);
21030 if (op0 != op1 && GET_CODE (op0) != MEM)
21032 rtx insn = gen_rtx_SET (op0, op1);
21033 emit_insn (insn);
21034 return;
21036 if (GET_CODE (op0) == MEM)
21037 op1 = force_reg (Pmode, op1);
21038 else
21040 rtx temp = op0;
21041 if (GET_CODE (temp) != REG)
21042 temp = gen_reg_rtx (Pmode);
21043 temp = legitimize_pic_address (op1, temp);
21044 if (temp == op0)
21045 return;
21046 op1 = temp;
21048 /* dynamic-no-pic */
21049 #endif
21051 else
21053 if (MEM_P (op0))
21054 op1 = force_reg (mode, op1);
21055 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
21057 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
21058 op1 = legitimize_pic_address (op1, reg);
21059 if (op0 == op1)
21060 return;
21061 op1 = convert_to_mode (mode, op1, 1);
21065 else
21067 if (MEM_P (op0)
21068 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
21069 || !push_operand (op0, mode))
21070 && MEM_P (op1))
21071 op1 = force_reg (mode, op1);
21073 if (push_operand (op0, mode)
21074 && ! general_no_elim_operand (op1, mode))
21075 op1 = copy_to_mode_reg (mode, op1);
21077 /* Force large constants in 64bit compilation into a register
21078 to get them CSEed. */
21079 if (can_create_pseudo_p ()
21080 && (mode == DImode) && TARGET_64BIT
21081 && immediate_operand (op1, mode)
21082 && !x86_64_zext_immediate_operand (op1, VOIDmode)
21083 && !register_operand (op0, mode)
21084 && optimize)
21085 op1 = copy_to_mode_reg (mode, op1);
21087 if (can_create_pseudo_p ()
21088 && CONST_DOUBLE_P (op1))
21090 /* If we are loading a floating point constant to a register,
21091 force the value to memory now, since we'll get better code
21092 out of the back end.  */
21094 op1 = validize_mem (force_const_mem (mode, op1));
21095 if (!register_operand (op0, mode))
21097 rtx temp = gen_reg_rtx (mode);
21098 emit_insn (gen_rtx_SET (temp, op1));
21099 emit_move_insn (op0, temp);
21100 return;
21105 emit_insn (gen_rtx_SET (op0, op1));
21108 void
21109 ix86_expand_vector_move (machine_mode mode, rtx operands[])
21111 rtx op0 = operands[0], op1 = operands[1];
21112 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
21113 psABI since the biggest alignment is 4 bytes for the IA MCU psABI.  */
21114 unsigned int align = (TARGET_IAMCU
21115 ? GET_MODE_BITSIZE (mode)
21116 : GET_MODE_ALIGNMENT (mode));
21118 if (push_operand (op0, VOIDmode))
21119 op0 = emit_move_resolve_push (mode, op0);
21121 /* Force constants other than zero into memory. We do not know how
21122 the instructions used to build constants modify the upper 64 bits
21123 of the register; once we have that information, we may be able
21124 to handle some of them more efficiently. */
21125 if (can_create_pseudo_p ()
21126 && (CONSTANT_P (op1)
21127 || (SUBREG_P (op1)
21128 && CONSTANT_P (SUBREG_REG (op1))))
21129 && ((register_operand (op0, mode)
21130 && !standard_sse_constant_p (op1, mode))
21131 /* ix86_expand_vector_move_misalign() does not like constants. */
21132 || (SSE_REG_MODE_P (mode)
21133 && MEM_P (op0)
21134 && MEM_ALIGN (op0) < align)))
21136 if (SUBREG_P (op1))
21138 machine_mode imode = GET_MODE (SUBREG_REG (op1));
21139 rtx r = force_const_mem (imode, SUBREG_REG (op1));
21140 if (r)
21141 r = validize_mem (r);
21142 else
21143 r = force_reg (imode, SUBREG_REG (op1));
21144 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
21146 else
21147 op1 = validize_mem (force_const_mem (mode, op1));
21150 /* We need to check memory alignment for SSE mode since an attribute
21151 can make operands unaligned.  */
21152 if (can_create_pseudo_p ()
21153 && SSE_REG_MODE_P (mode)
21154 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
21155 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
21157 rtx tmp[2];
21159 /* ix86_expand_vector_move_misalign() does not like both
21160 arguments in memory. */
21161 if (!register_operand (op0, mode)
21162 && !register_operand (op1, mode))
21163 op1 = force_reg (mode, op1);
21165 tmp[0] = op0; tmp[1] = op1;
21166 ix86_expand_vector_move_misalign (mode, tmp);
21167 return;
21170 /* Make operand1 a register if it isn't already. */
21171 if (can_create_pseudo_p ()
21172 && !register_operand (op0, mode)
21173 && !register_operand (op1, mode))
21175 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
21176 return;
21179 emit_insn (gen_rtx_SET (op0, op1));
21182 /* Split 32-byte AVX unaligned load and store if needed. */
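/* A rough picture of the split performed below: an unaligned 32-byte load
   becomes two 16-byte loads combined with a vec_concat (vinsertf128), and
   an unaligned 32-byte store becomes two vextractf128 stores of the low
   and high halves.  */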
21184 static void
21185 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
21187 rtx m;
21188 rtx (*extract) (rtx, rtx, rtx);
21189 machine_mode mode;
21191 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
21192 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
21194 emit_insn (gen_rtx_SET (op0, op1));
21195 return;
21198 rtx orig_op0 = NULL_RTX;
21199 mode = GET_MODE (op0);
21200 switch (GET_MODE_CLASS (mode))
21202 case MODE_VECTOR_INT:
21203 case MODE_INT:
21204 if (mode != V32QImode)
21206 if (!MEM_P (op0))
21208 orig_op0 = op0;
21209 op0 = gen_reg_rtx (V32QImode);
21211 else
21212 op0 = gen_lowpart (V32QImode, op0);
21213 op1 = gen_lowpart (V32QImode, op1);
21214 mode = V32QImode;
21216 break;
21217 case MODE_VECTOR_FLOAT:
21218 break;
21219 default:
21220 gcc_unreachable ();
21223 switch (mode)
21225 default:
21226 gcc_unreachable ();
21227 case E_V32QImode:
21228 extract = gen_avx_vextractf128v32qi;
21229 mode = V16QImode;
21230 break;
21231 case E_V8SFmode:
21232 extract = gen_avx_vextractf128v8sf;
21233 mode = V4SFmode;
21234 break;
21235 case E_V4DFmode:
21236 extract = gen_avx_vextractf128v4df;
21237 mode = V2DFmode;
21238 break;
21241 if (MEM_P (op1))
21243 rtx r = gen_reg_rtx (mode);
21244 m = adjust_address (op1, mode, 0);
21245 emit_move_insn (r, m);
21246 m = adjust_address (op1, mode, 16);
21247 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
21248 emit_move_insn (op0, r);
21250 else if (MEM_P (op0))
21252 m = adjust_address (op0, mode, 0);
21253 emit_insn (extract (m, op1, const0_rtx));
21254 m = adjust_address (op0, mode, 16);
21255 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
21257 else
21258 gcc_unreachable ();
21260 if (orig_op0)
21261 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
21264 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
21265 straight to ix86_expand_vector_move. */
21266 /* Code generation for scalar reg-reg moves of single and double precision data:
21267 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
21268 movaps reg, reg
21269 else
21270 movss reg, reg
21271 if (x86_sse_partial_reg_dependency == true)
21272 movapd reg, reg
21273 else
21274 movsd reg, reg
21276 Code generation for scalar loads of double precision data:
21277 if (x86_sse_split_regs == true)
21278 movlpd mem, reg (gas syntax)
21279 else
21280 movsd mem, reg
21282 Code generation for unaligned packed loads of single precision data
21283 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
21284 if (x86_sse_unaligned_move_optimal)
21285 movups mem, reg
21287 if (x86_sse_partial_reg_dependency == true)
21289 xorps reg, reg
21290 movlps mem, reg
21291 movhps mem+8, reg
21293 else
21295 movlps mem, reg
21296 movhps mem+8, reg
21299 Code generation for unaligned packed loads of double precision data
21300 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21301 if (x86_sse_unaligned_move_optimal)
21302 movupd mem, reg
21304 if (x86_sse_split_regs == true)
21306 movlpd mem, reg
21307 movhpd mem+8, reg
21309 else
21311 movsd mem, reg
21312 movhpd mem+8, reg
21316 void
21317 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21319 rtx op0, op1, m;
21321 op0 = operands[0];
21322 op1 = operands[1];
21324 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21325 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21327 emit_insn (gen_rtx_SET (op0, op1));
21328 return;
21331 if (TARGET_AVX)
21333 if (GET_MODE_SIZE (mode) == 32)
21334 ix86_avx256_split_vector_move_misalign (op0, op1);
21335 else
21336 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21337 emit_insn (gen_rtx_SET (op0, op1));
21338 return;
21341 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21342 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21344 emit_insn (gen_rtx_SET (op0, op1));
21345 return;
21348 /* ??? If we have typed data, then it would appear that using
21349 movdqu is the only way to get unaligned data loaded with
21350 integer type. */
21351 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21353 emit_insn (gen_rtx_SET (op0, op1));
21354 return;
21357 if (MEM_P (op1))
21359 if (TARGET_SSE2 && mode == V2DFmode)
21361 rtx zero;
21363 /* When SSE registers are split into halves, we can avoid
21364 writing to the top half twice. */
21365 if (TARGET_SSE_SPLIT_REGS)
21367 emit_clobber (op0);
21368 zero = op0;
21370 else
21372 /* ??? Not sure about the best option for the Intel chips.
21373 The following would seem to satisfy; the register is
21374 entirely cleared, breaking the dependency chain. We
21375 then store to the upper half, with a dependency depth
21376 of one. A rumor has it that Intel recommends two movsd
21377 followed by an unpacklpd, but this is unconfirmed. And
21378 given that the dependency depth of the unpacklpd would
21379 still be one, I'm not sure why this would be better. */
21380 zero = CONST0_RTX (V2DFmode);
21383 m = adjust_address (op1, DFmode, 0);
21384 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21385 m = adjust_address (op1, DFmode, 8);
21386 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21388 else
21390 rtx t;
21392 if (mode != V4SFmode)
21393 t = gen_reg_rtx (V4SFmode);
21394 else
21395 t = op0;
21397 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21398 emit_move_insn (t, CONST0_RTX (V4SFmode));
21399 else
21400 emit_clobber (t);
21402 m = adjust_address (op1, V2SFmode, 0);
21403 emit_insn (gen_sse_loadlps (t, t, m));
21404 m = adjust_address (op1, V2SFmode, 8);
21405 emit_insn (gen_sse_loadhps (t, t, m));
21406 if (mode != V4SFmode)
21407 emit_move_insn (op0, gen_lowpart (mode, t));
21410 else if (MEM_P (op0))
21412 if (TARGET_SSE2 && mode == V2DFmode)
21414 m = adjust_address (op0, DFmode, 0);
21415 emit_insn (gen_sse2_storelpd (m, op1));
21416 m = adjust_address (op0, DFmode, 8);
21417 emit_insn (gen_sse2_storehpd (m, op1));
21419 else
21421 if (mode != V4SFmode)
21422 op1 = gen_lowpart (V4SFmode, op1);
21424 m = adjust_address (op0, V2SFmode, 0);
21425 emit_insn (gen_sse_storelps (m, op1));
21426 m = adjust_address (op0, V2SFmode, 8);
21427 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21430 else
21431 gcc_unreachable ();
21434 /* Helper function of ix86_fixup_binary_operands to canonicalize
21435 operand order. Returns true if the operands should be swapped. */
21437 static bool
21438 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21439 rtx operands[])
21441 rtx dst = operands[0];
21442 rtx src1 = operands[1];
21443 rtx src2 = operands[2];
21445 /* If the operation is not commutative, we can't do anything. */
21446 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21447 return false;
21449 /* Highest priority is that src1 should match dst. */
21450 if (rtx_equal_p (dst, src1))
21451 return false;
21452 if (rtx_equal_p (dst, src2))
21453 return true;
21455 /* Next highest priority is that immediate constants come second. */
21456 if (immediate_operand (src2, mode))
21457 return false;
21458 if (immediate_operand (src1, mode))
21459 return true;
21461 /* Lowest priority is that memory references should come second. */
21462 if (MEM_P (src2))
21463 return false;
21464 if (MEM_P (src1))
21465 return true;
21467 return false;
21471 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21472 destination to use for the operation. If different from the true
21473 destination in operands[0], a copy operation will be required. */
21476 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21477 rtx operands[])
21479 rtx dst = operands[0];
21480 rtx src1 = operands[1];
21481 rtx src2 = operands[2];
21483 /* Canonicalize operand order. */
21484 if (ix86_swap_binary_operands_p (code, mode, operands))
21486 /* It is invalid to swap operands of different modes. */
21487 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21489 std::swap (src1, src2);
21492 /* Both source operands cannot be in memory. */
21493 if (MEM_P (src1) && MEM_P (src2))
21495 /* Optimization: Only read from memory once. */
21496 if (rtx_equal_p (src1, src2))
21498 src2 = force_reg (mode, src2);
21499 src1 = src2;
21501 else if (rtx_equal_p (dst, src1))
21502 src2 = force_reg (mode, src2);
21503 else
21504 src1 = force_reg (mode, src1);
21507 /* If the destination is memory, and we do not have matching source
21508 operands, do things in registers. */
21509 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21510 dst = gen_reg_rtx (mode);
21512 /* Source 1 cannot be a constant. */
21513 if (CONSTANT_P (src1))
21514 src1 = force_reg (mode, src1);
21516 /* Source 1 cannot be a non-matching memory. */
21517 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21518 src1 = force_reg (mode, src1);
21520 /* Improve address combine. */
21521 if (code == PLUS
21522 && GET_MODE_CLASS (mode) == MODE_INT
21523 && MEM_P (src2))
21524 src2 = force_reg (mode, src2);
21526 operands[1] = src1;
21527 operands[2] = src2;
21528 return dst;
21531 /* Similarly, but assume that the destination has already been
21532 set up properly. */
21534 void
21535 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21536 machine_mode mode, rtx operands[])
21538 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21539 gcc_assert (dst == operands[0]);
21542 /* Attempt to expand a binary operator. Make the expansion closer to the
21543 actual machine than just general_operand, which would allow 3 separate
21544 memory references (one output, two input) in a single insn. */
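/* For illustration, the typical emitted form is a parallel of the operation
   and a flags clobber, e.g. for PLUS roughly
     (parallel [(set (reg:SI 0) (plus:SI (reg:SI 0) (reg:SI 1)))
                (clobber (reg:CC flags))])
   except for the post-reload PLUS case below, which is left clobber-free so
   it can become an lea.  */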
21546 void
21547 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21548 rtx operands[])
21550 rtx src1, src2, dst, op, clob;
21552 dst = ix86_fixup_binary_operands (code, mode, operands);
21553 src1 = operands[1];
21554 src2 = operands[2];
21556 /* Emit the instruction. */
21558 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21560 if (reload_completed
21561 && code == PLUS
21562 && !rtx_equal_p (dst, src1))
21564 /* This is going to be an LEA; avoid splitting it later. */
21565 emit_insn (op);
21567 else
21569 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21570 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21573 /* Fix up the destination if needed. */
21574 if (dst != operands[0])
21575 emit_move_insn (operands[0], dst);
21578 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21579 the given OPERANDS. */
21581 void
21582 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21583 rtx operands[])
21585 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21586 if (SUBREG_P (operands[1]))
21588 op1 = operands[1];
21589 op2 = operands[2];
21591 else if (SUBREG_P (operands[2]))
21593 op1 = operands[2];
21594 op2 = operands[1];
21596 /* Optimize (__m128i) d | (__m128i) e and similar code
21597 when d and e are float vectors into float vector logical
21598 insn. In C/C++ without using intrinsics there is no other way
21599 to express vector logical operation on float vectors than
21600 to cast them temporarily to integer vectors. */
21601 if (op1
21602 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21603 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21604 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21605 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21606 && SUBREG_BYTE (op1) == 0
21607 && (GET_CODE (op2) == CONST_VECTOR
21608 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21609 && SUBREG_BYTE (op2) == 0))
21610 && can_create_pseudo_p ())
21612 rtx dst;
21613 switch (GET_MODE (SUBREG_REG (op1)))
21615 case E_V4SFmode:
21616 case E_V8SFmode:
21617 case E_V16SFmode:
21618 case E_V2DFmode:
21619 case E_V4DFmode:
21620 case E_V8DFmode:
21621 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21622 if (GET_CODE (op2) == CONST_VECTOR)
21624 op2 = gen_lowpart (GET_MODE (dst), op2);
21625 op2 = force_reg (GET_MODE (dst), op2);
21627 else
21629 op1 = operands[1];
21630 op2 = SUBREG_REG (operands[2]);
21631 if (!vector_operand (op2, GET_MODE (dst)))
21632 op2 = force_reg (GET_MODE (dst), op2);
21634 op1 = SUBREG_REG (op1);
21635 if (!vector_operand (op1, GET_MODE (dst)))
21636 op1 = force_reg (GET_MODE (dst), op1);
21637 emit_insn (gen_rtx_SET (dst,
21638 gen_rtx_fmt_ee (code, GET_MODE (dst),
21639 op1, op2)));
21640 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21641 return;
21642 default:
21643 break;
21646 if (!vector_operand (operands[1], mode))
21647 operands[1] = force_reg (mode, operands[1]);
21648 if (!vector_operand (operands[2], mode))
21649 operands[2] = force_reg (mode, operands[2]);
21650 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21651 emit_insn (gen_rtx_SET (operands[0],
21652 gen_rtx_fmt_ee (code, mode, operands[1],
21653 operands[2])));
21656 /* Return TRUE or FALSE depending on whether the binary operator meets the
21657 appropriate constraints. */
21659 bool
21660 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21661 rtx operands[3])
21663 rtx dst = operands[0];
21664 rtx src1 = operands[1];
21665 rtx src2 = operands[2];
21667 /* Both source operands cannot be in memory. */
21668 if (MEM_P (src1) && MEM_P (src2))
21669 return false;
21671 /* Canonicalize operand order for commutative operators. */
21672 if (ix86_swap_binary_operands_p (code, mode, operands))
21673 std::swap (src1, src2);
21675 /* If the destination is memory, we must have a matching source operand. */
21676 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21677 return false;
21679 /* Source 1 cannot be a constant. */
21680 if (CONSTANT_P (src1))
21681 return false;
21683 /* Source 1 cannot be a non-matching memory. */
21684 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21685 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21686 return (code == AND
21687 && (mode == HImode
21688 || mode == SImode
21689 || (TARGET_64BIT && mode == DImode))
21690 && satisfies_constraint_L (src2));
21692 return true;
21695 /* Attempt to expand a unary operator. Make the expansion closer to the
21696 actual machine than just general_operand, which would allow 2 separate
21697 memory references (one output, one input) in a single insn. */
21699 void
21700 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21701 rtx operands[])
21703 bool matching_memory = false;
21704 rtx src, dst, op, clob;
21706 dst = operands[0];
21707 src = operands[1];
21709 /* If the destination is memory, and we do not have matching source
21710 operands, do things in registers. */
21711 if (MEM_P (dst))
21713 if (rtx_equal_p (dst, src))
21714 matching_memory = true;
21715 else
21716 dst = gen_reg_rtx (mode);
21719 /* When source operand is memory, destination must match. */
21720 if (MEM_P (src) && !matching_memory)
21721 src = force_reg (mode, src);
21723 /* Emit the instruction. */
21725 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21727 if (code == NOT)
21728 emit_insn (op);
21729 else
21731 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21732 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21735 /* Fix up the destination if needed. */
21736 if (dst != operands[0])
21737 emit_move_insn (operands[0], dst);
21740 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
21741 divisor are within the range [0-255]. */
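/* An illustrative sketch of the generated control flow (AT&T syntax,
   hypothetical register names):
       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch
       je     .Lqimode          # both operands fit in 8 bits
       <full-width signed/unsigned divmod>
       jmp    .Ldone
     .Lqimode:
       <8-bit unsigned divide; quotient in %al, remainder in %ah>
     .Ldone:  */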
21743 void
21744 ix86_split_idivmod (machine_mode mode, rtx operands[],
21745 bool signed_p)
21747 rtx_code_label *end_label, *qimode_label;
21748 rtx div, mod;
21749 rtx_insn *insn;
21750 rtx scratch, tmp0, tmp1, tmp2;
21751 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21752 rtx (*gen_zero_extend) (rtx, rtx);
21753 rtx (*gen_test_ccno_1) (rtx, rtx);
21755 switch (mode)
21757 case E_SImode:
21758 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21759 gen_test_ccno_1 = gen_testsi_ccno_1;
21760 gen_zero_extend = gen_zero_extendqisi2;
21761 break;
21762 case E_DImode:
21763 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21764 gen_test_ccno_1 = gen_testdi_ccno_1;
21765 gen_zero_extend = gen_zero_extendqidi2;
21766 break;
21767 default:
21768 gcc_unreachable ();
21771 end_label = gen_label_rtx ();
21772 qimode_label = gen_label_rtx ();
21774 scratch = gen_reg_rtx (mode);
21776 /* Use 8bit unsigned divmod if the dividend and divisor are within
21777 the range [0-255]. */
21778 emit_move_insn (scratch, operands[2]);
21779 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21780 scratch, 1, OPTAB_DIRECT);
21781 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
21782 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21783 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21784 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21785 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21786 pc_rtx);
21787 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21788 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21789 JUMP_LABEL (insn) = qimode_label;
21791 /* Generate the original signed/unsigned divmod.  */
21792 div = gen_divmod4_1 (operands[0], operands[1],
21793 operands[2], operands[3]);
21794 emit_insn (div);
21796 /* Branch to the end. */
21797 emit_jump_insn (gen_jump (end_label));
21798 emit_barrier ();
21800 /* Generate 8bit unsigned divide. */
21801 emit_label (qimode_label);
21802 /* Don't use operands[0] for result of 8bit divide since not all
21803 registers support QImode ZERO_EXTRACT. */
21804 tmp0 = lowpart_subreg (HImode, scratch, mode);
21805 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21806 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21807 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21809 if (signed_p)
21811 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21812 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21814 else
21816 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21817 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21820 /* Extract remainder from AH. */
21821 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21822 if (REG_P (operands[1]))
21823 insn = emit_move_insn (operands[1], tmp1);
21824 else
21826 /* Need a new scratch register since the old one has result
21827 of 8bit divide. */
21828 scratch = gen_reg_rtx (mode);
21829 emit_move_insn (scratch, tmp1);
21830 insn = emit_move_insn (operands[1], scratch);
21832 set_unique_reg_note (insn, REG_EQUAL, mod);
21834 /* Zero extend quotient from AL. */
21835 tmp1 = gen_lowpart (QImode, tmp0);
21836 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21837 set_unique_reg_note (insn, REG_EQUAL, div);
21839 emit_label (end_label);
21842 #define LEA_MAX_STALL (3)
21843 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
21845 /* Increase given DISTANCE in half-cycles according to
21846 dependencies between PREV and NEXT instructions.
21847 Add 1 half-cycle if there is no dependency and
21848 go to the next cycle if there is some dependency.  */
21850 static unsigned int
21851 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
21853 df_ref def, use;
21855 if (!prev || !next)
21856 return distance + (distance & 1) + 2;
21858 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
21859 return distance + 1;
21861 FOR_EACH_INSN_USE (use, next)
21862 FOR_EACH_INSN_DEF (def, prev)
21863 if (!DF_REF_IS_ARTIFICIAL (def)
21864 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
21865 return distance + (distance & 1) + 2;
21867 return distance + 1;
21870 /* Return true if instruction INSN defines register number
21871 REGNO1 or REGNO2.  */
21873 static bool
21874 insn_defines_reg (unsigned int regno1, unsigned int regno2,
21875 rtx_insn *insn)
21877 df_ref def;
21879 FOR_EACH_INSN_DEF (def, insn)
21880 if (DF_REF_REG_DEF_P (def)
21881 && !DF_REF_IS_ARTIFICIAL (def)
21882 && (regno1 == DF_REF_REGNO (def)
21883 || regno2 == DF_REF_REGNO (def)))
21884 return true;
21886 return false;
21889 /* Return true if instruction INSN uses register number
21890 REGNO as a part of an address expression.  */
21892 static bool
21893 insn_uses_reg_mem (unsigned int regno, rtx insn)
21895 df_ref use;
21897 FOR_EACH_INSN_USE (use, insn)
21898 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21899 return true;
21901 return false;
21904 /* Search backward for non-agu definition of register number REGNO1
21905 or register number REGNO2 in basic block starting from instruction
21906 START up to head of basic block or instruction INSN.
21908 Put true into *FOUND if a definition was found
21909 and false otherwise.
21911 The distance in half-cycles between START and the found instruction or
21912 the head of the BB is added to DISTANCE and returned.  */
21914 static int
21915 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21916 rtx_insn *insn, int distance,
21917 rtx_insn *start, bool *found)
21919 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21920 rtx_insn *prev = start;
21921 rtx_insn *next = NULL;
21923 *found = false;
21925 while (prev
21926 && prev != insn
21927 && distance < LEA_SEARCH_THRESHOLD)
21929 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21931 distance = increase_distance (prev, next, distance);
21932 if (insn_defines_reg (regno1, regno2, prev))
21934 if (recog_memoized (prev) < 0
21935 || get_attr_type (prev) != TYPE_LEA)
21937 *found = true;
21938 return distance;
21942 next = prev;
21944 if (prev == BB_HEAD (bb))
21945 break;
21947 prev = PREV_INSN (prev);
21950 return distance;
21953 /* Search backward for non-agu definition of register number REGNO1
21954 or register number REGNO2 in INSN's basic block until we
21955 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21956 2. Reach a neighboring BB's boundary, or
21957 3. Reach an AGU definition.
21958 Return the distance between the non-AGU definition point and INSN.
21959 If there is no definition point, return -1.  */
21961 static int
21962 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21963 rtx_insn *insn)
21965 basic_block bb = BLOCK_FOR_INSN (insn);
21966 int distance = 0;
21967 bool found = false;
21969 if (insn != BB_HEAD (bb))
21970 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21971 distance, PREV_INSN (insn),
21972 &found);
21974 if (!found && distance < LEA_SEARCH_THRESHOLD)
21976 edge e;
21977 edge_iterator ei;
21978 bool simple_loop = false;
21980 FOR_EACH_EDGE (e, ei, bb->preds)
21981 if (e->src == bb)
21983 simple_loop = true;
21984 break;
21987 if (simple_loop)
21988 distance = distance_non_agu_define_in_bb (regno1, regno2,
21989 insn, distance,
21990 BB_END (bb), &found);
21991 else
21993 int shortest_dist = -1;
21994 bool found_in_bb = false;
21996 FOR_EACH_EDGE (e, ei, bb->preds)
21998 int bb_dist
21999 = distance_non_agu_define_in_bb (regno1, regno2,
22000 insn, distance,
22001 BB_END (e->src),
22002 &found_in_bb);
22003 if (found_in_bb)
22005 if (shortest_dist < 0)
22006 shortest_dist = bb_dist;
22007 else if (bb_dist > 0)
22008 shortest_dist = MIN (bb_dist, shortest_dist);
22010 found = true;
22014 distance = shortest_dist;
22018 /* get_attr_type may modify recog data. We want to make sure
22019 that recog data is valid for instruction INSN, on which
22020 distance_non_agu_define is called. INSN is unchanged here. */
22021 extract_insn_cached (insn);
22023 if (!found)
22024 return -1;
22026 return distance >> 1;
22029 /* Return the distance in half-cycles between INSN and the next
22030 insn that uses register number REGNO in a memory address, added
22031 to DISTANCE.  Return -1 if REGNO is set.
22033 Put true value into *FOUND if register usage was found and
22034 false otherwise.
22035 Put true value into *REDEFINED if register redefinition was
22036 found and false otherwise. */
22038 static int
22039 distance_agu_use_in_bb (unsigned int regno,
22040 rtx_insn *insn, int distance, rtx_insn *start,
22041 bool *found, bool *redefined)
22043 basic_block bb = NULL;
22044 rtx_insn *next = start;
22045 rtx_insn *prev = NULL;
22047 *found = false;
22048 *redefined = false;
22050 if (start != NULL_RTX)
22052 bb = BLOCK_FOR_INSN (start);
22053 if (start != BB_HEAD (bb))
22054 /* If insn and start belong to the same bb, set prev to insn,
22055 so the call to increase_distance will increase the distance
22056 between insns by 1. */
22057 prev = insn;
22060 while (next
22061 && next != insn
22062 && distance < LEA_SEARCH_THRESHOLD)
22064 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
22066 distance = increase_distance(prev, next, distance);
22067 if (insn_uses_reg_mem (regno, next))
22069 /* Return DISTANCE if OP0 is used in memory
22070 address in NEXT. */
22071 *found = true;
22072 return distance;
22075 if (insn_defines_reg (regno, INVALID_REGNUM, next))
22077 /* Return -1 if OP0 is set in NEXT. */
22078 *redefined = true;
22079 return -1;
22082 prev = next;
22085 if (next == BB_END (bb))
22086 break;
22088 next = NEXT_INSN (next);
22091 return distance;
22094 /* Return the distance between INSN and the next insn that uses
22095 register number REGNO0 in a memory address.  Return -1 if no such
22096 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
22098 static int
22099 distance_agu_use (unsigned int regno0, rtx_insn *insn)
22101 basic_block bb = BLOCK_FOR_INSN (insn);
22102 int distance = 0;
22103 bool found = false;
22104 bool redefined = false;
22106 if (insn != BB_END (bb))
22107 distance = distance_agu_use_in_bb (regno0, insn, distance,
22108 NEXT_INSN (insn),
22109 &found, &redefined);
22111 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
22113 edge e;
22114 edge_iterator ei;
22115 bool simple_loop = false;
22117 FOR_EACH_EDGE (e, ei, bb->succs)
22118 if (e->dest == bb)
22120 simple_loop = true;
22121 break;
22124 if (simple_loop)
22125 distance = distance_agu_use_in_bb (regno0, insn,
22126 distance, BB_HEAD (bb),
22127 &found, &redefined);
22128 else
22130 int shortest_dist = -1;
22131 bool found_in_bb = false;
22132 bool redefined_in_bb = false;
22134 FOR_EACH_EDGE (e, ei, bb->succs)
22136 int bb_dist
22137 = distance_agu_use_in_bb (regno0, insn,
22138 distance, BB_HEAD (e->dest),
22139 &found_in_bb, &redefined_in_bb);
22140 if (found_in_bb)
22142 if (shortest_dist < 0)
22143 shortest_dist = bb_dist;
22144 else if (bb_dist > 0)
22145 shortest_dist = MIN (bb_dist, shortest_dist);
22147 found = true;
22151 distance = shortest_dist;
22155 if (!found || redefined)
22156 return -1;
22158 return distance >> 1;
22161 /* Define this macro to tune LEA priority vs ADD; it takes effect when
22162 there is a dilemma of choosing LEA or ADD.
22163 Negative value: ADD is preferred over LEA
22164 Zero: Neutral
22165 Positive value: LEA is preferred over ADD.  */
22166 #define IX86_LEA_PRIORITY 0
22168 /* Return true if using lea INSN has a performance advantage
22169 over a sequence of instructions.  The instruction sequence has
22170 SPLIT_COST cycles higher latency than the lea latency.  */
22172 static bool
22173 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
22174 unsigned int regno2, int split_cost, bool has_scale)
22176 int dist_define, dist_use;
22178 /* For Silvermont, if a 2-source or 3-source LEA is used for
22179 a non-destructive destination, or because the ability to use
22180 SCALE is wanted, the use of LEA is justified.  */
22181 if (TARGET_SILVERMONT || TARGET_INTEL)
22183 if (has_scale)
22184 return true;
22185 if (split_cost < 1)
22186 return false;
22187 if (regno0 == regno1 || regno0 == regno2)
22188 return false;
22189 return true;
22192 dist_define = distance_non_agu_define (regno1, regno2, insn);
22193 dist_use = distance_agu_use (regno0, insn);
22195 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
22197 /* If there is no non-AGU operand definition, no AGU
22198 operand usage and the split cost is 0, then both the lea
22199 and non-lea variants have the same priority.  Currently
22200 we prefer lea for 64-bit code and non-lea for 32-bit
22201 code.  */
22202 if (dist_use < 0 && split_cost == 0)
22203 return TARGET_64BIT || IX86_LEA_PRIORITY;
22204 else
22205 return true;
22208 /* With a longer definition distance, lea is preferable.
22209 Here we change it to take into account the splitting cost and
22210 lea priority.  */
22211 dist_define += split_cost + IX86_LEA_PRIORITY;
22213 /* If there is no use in a memory address then we just check
22214 that the split cost exceeds the AGU stall.  */
22215 if (dist_use < 0)
22216 return dist_define > LEA_MAX_STALL;
22218 /* If this insn has both a backward non-AGU dependence and a forward
22219 AGU dependence, the one with the shorter distance takes effect.  */
22220 return dist_define >= dist_use;
22223 /* Return true if it is legal to clobber flags by INSN and
22224 false otherwise. */
22226 static bool
22227 ix86_ok_to_clobber_flags (rtx_insn *insn)
22229 basic_block bb = BLOCK_FOR_INSN (insn);
22230 df_ref use;
22231 bitmap live;
22233 while (insn)
22235 if (NONDEBUG_INSN_P (insn))
22237 FOR_EACH_INSN_USE (use, insn)
22238 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
22239 return false;
22241 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
22242 return true;
22245 if (insn == BB_END (bb))
22246 break;
22248 insn = NEXT_INSN (insn);
22251 live = df_get_live_out(bb);
22252 return !REGNO_REG_SET_P (live, FLAGS_REG);
22255 /* Return true if we need to split op0 = op1 + op2 into a sequence of
22256 move and add to avoid AGU stalls. */
22258 bool
22259 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
22261 unsigned int regno0, regno1, regno2;
22263 /* Check if we need to optimize. */
22264 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22265 return false;
22267 /* Check it is correct to split here. */
22268 if (!ix86_ok_to_clobber_flags(insn))
22269 return false;
22271 regno0 = true_regnum (operands[0]);
22272 regno1 = true_regnum (operands[1]);
22273 regno2 = true_regnum (operands[2]);
22275 /* We only need to split adds with a non-destructive
22276 destination operand.  */
22277 if (regno0 == regno1 || regno0 == regno2)
22278 return false;
22279 else
22280 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
22283 /* Return true if we should emit lea instruction instead of mov
22284 instruction. */
22286 bool
22287 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
22289 unsigned int regno0, regno1;
22291 /* Check if we need to optimize. */
22292 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22293 return false;
22295 /* Use lea for reg to reg moves only. */
22296 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22297 return false;
22299 regno0 = true_regnum (operands[0]);
22300 regno1 = true_regnum (operands[1]);
22302 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22305 /* Return true if we need to split lea into a sequence of
22306 instructions to avoid AGU stalls. */
22308 bool
22309 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22311 unsigned int regno0, regno1, regno2;
22312 int split_cost;
22313 struct ix86_address parts;
22314 int ok;
22316 /* Check we need to optimize. */
22317 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22318 return false;
22320 /* The "at least two components" test below might not catch simple
22321 move or zero extension insns if parts.base is non-NULL and parts.disp
22322 is const0_rtx as the only components in the address, e.g. if the
22323 register is %rbp or %r13. As this test is much cheaper and moves or
22324 zero extensions are the common case, do this check first. */
22325 if (REG_P (operands[1])
22326 || (SImode_address_operand (operands[1], VOIDmode)
22327 && REG_P (XEXP (operands[1], 0))))
22328 return false;
22330 /* Check if it is OK to split here. */
22331 if (!ix86_ok_to_clobber_flags (insn))
22332 return false;
22334 ok = ix86_decompose_address (operands[1], &parts);
22335 gcc_assert (ok);
22337 /* There should be at least two components in the address. */
22338 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22339 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22340 return false;
22342 /* We should not split into adds if a non-legitimate PIC
22343 operand is used as the displacement.  */
22344 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22345 return false;
22347 regno0 = true_regnum (operands[0]) ;
22348 regno1 = INVALID_REGNUM;
22349 regno2 = INVALID_REGNUM;
22351 if (parts.base)
22352 regno1 = true_regnum (parts.base);
22353 if (parts.index)
22354 regno2 = true_regnum (parts.index);
22356 split_cost = 0;
22358 /* Compute how many cycles we will add to execution time
22359 if we split the lea into a sequence of instructions.  */
22360 if (parts.base || parts.index)
22362 /* Have to use a mov instruction if the non-destructive
22363 destination form is used.  */
22364 if (regno1 != regno0 && regno2 != regno0)
22365 split_cost += 1;
22367 /* Have to add index to base if both exist. */
22368 if (parts.base && parts.index)
22369 split_cost += 1;
22371 /* Have to use shift and adds if scale is 2 or greater. */
22372 if (parts.scale > 1)
22374 if (regno0 != regno1)
22375 split_cost += 1;
22376 else if (regno2 == regno0)
22377 split_cost += 4;
22378 else
22379 split_cost += parts.scale;
22382 /* Have to use an add instruction with an immediate if
22383 disp is nonzero.  */
22384 if (parts.disp && parts.disp != const0_rtx)
22385 split_cost += 1;
22387 /* Subtract the price of lea. */
22388 split_cost -= 1;
22391 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22392 parts.scale > 1);
22395 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
22396 matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
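/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits roughly
     (parallel [(set dst (plus:SI dst src))
                (clobber (reg:CC flags))]).  */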
22398 static void
22399 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22400 rtx dst, rtx src)
22402 rtx op, clob;
22404 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22405 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22407 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
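/* E.g. ix86_emit_binop (PLUS, SImode, dst, src) emits the single insn
     (parallel [(set dst (plus:SI dst src))
                (clobber (reg:CC flags))])
   which matches the usual two-address arithmetic patterns that clobber
   the flags register.  */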
22410 /* Return true if the definition of regno1 is nearest to the insn. */
22412 static bool
22413 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22415 rtx_insn *prev = insn;
22416 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22418 if (insn == start)
22419 return false;
22420 while (prev && prev != start)
22422 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22424 prev = PREV_INSN (prev);
22425 continue;
22427 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22428 return true;
22429 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22430 return false;
22431 prev = PREV_INSN (prev);
22434 /* None of the regs is defined in the bb. */
22435 return false;
22438 /* Split lea instructions into a sequence of instructions
22439 which are executed on ALU to avoid AGU stalls.
22440 It is assumed that it is allowed to clobber flags register
22441 at lea position. */
22443 void
22444 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22446 unsigned int regno0, regno1, regno2;
22447 struct ix86_address parts;
22448 rtx target, tmp;
22449 int ok, adds;
22451 ok = ix86_decompose_address (operands[1], &parts);
22452 gcc_assert (ok);
22454 target = gen_lowpart (mode, operands[0]);
22456 regno0 = true_regnum (target);
22457 regno1 = INVALID_REGNUM;
22458 regno2 = INVALID_REGNUM;
22460 if (parts.base)
22462 parts.base = gen_lowpart (mode, parts.base);
22463 regno1 = true_regnum (parts.base);
22466 if (parts.index)
22468 parts.index = gen_lowpart (mode, parts.index);
22469 regno2 = true_regnum (parts.index);
22472 if (parts.disp)
22473 parts.disp = gen_lowpart (mode, parts.disp);
22475 if (parts.scale > 1)
22477 /* Case r1 = r1 + ... */
22478 if (regno1 == regno0)
22480 /* If we have a case r1 = r1 + C * r2 then we
22481 would have to use multiplication, which is very
22482 expensive. Assume the cost model is wrong if we
22483 hit such a case here. */
22484 gcc_assert (regno2 != regno0);
22486 for (adds = parts.scale; adds > 0; adds--)
22487 ix86_emit_binop (PLUS, mode, target, parts.index);
22489 else
22491 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22492 if (regno0 != regno2)
22493 emit_insn (gen_rtx_SET (target, parts.index));
22495 /* Use shift for scaling. */
22496 ix86_emit_binop (ASHIFT, mode, target,
22497 GEN_INT (exact_log2 (parts.scale)));
22499 if (parts.base)
22500 ix86_emit_binop (PLUS, mode, target, parts.base);
22502 if (parts.disp && parts.disp != const0_rtx)
22503 ix86_emit_binop (PLUS, mode, target, parts.disp);
22506 else if (!parts.base && !parts.index)
22508 gcc_assert (parts.disp);
22509 emit_insn (gen_rtx_SET (target, parts.disp));
22511 else
22513 if (!parts.base)
22515 if (regno0 != regno2)
22516 emit_insn (gen_rtx_SET (target, parts.index));
22518 else if (!parts.index)
22520 if (regno0 != regno1)
22521 emit_insn (gen_rtx_SET (target, parts.base));
22523 else
22525 if (regno0 == regno1)
22526 tmp = parts.index;
22527 else if (regno0 == regno2)
22528 tmp = parts.base;
22529 else
22531 rtx tmp1;
22533 /* Find better operand for SET instruction, depending
22534 on which definition is farther from the insn. */
22535 if (find_nearest_reg_def (insn, regno1, regno2))
22536 tmp = parts.index, tmp1 = parts.base;
22537 else
22538 tmp = parts.base, tmp1 = parts.index;
22540 emit_insn (gen_rtx_SET (target, tmp));
22542 if (parts.disp && parts.disp != const0_rtx)
22543 ix86_emit_binop (PLUS, mode, target, parts.disp);
22545 ix86_emit_binop (PLUS, mode, target, tmp1);
22546 return;
22549 ix86_emit_binop (PLUS, mode, target, tmp);
22552 if (parts.disp && parts.disp != const0_rtx)
22553 ix86_emit_binop (PLUS, mode, target, parts.disp);
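/* As a sketch of the transformation above, a three-component address such as
     leal 8(%rbx,%rcx,4), %eax
   may end up as roughly
     movl %ecx, %eax
     sall $2, %eax
     addl %ebx, %eax
     addl $8, %eax
   with the exact operand order depending on which input registers overlap
   the destination.  */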
22557 /* Return true if it is ok to optimize an ADD operation to LEA
22558 operation to avoid flag register consumption. For most processors,
22559 ADD is faster than LEA. For processors like BONNELL, if the
22560 destination register of LEA holds an actual address which will be
22561 used soon, LEA is better; otherwise ADD is better. */
22563 bool
22564 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22566 unsigned int regno0 = true_regnum (operands[0]);
22567 unsigned int regno1 = true_regnum (operands[1]);
22568 unsigned int regno2 = true_regnum (operands[2]);
22570 /* If a = b + c, (a!=b && a!=c), we must use the lea form. */
22571 if (regno0 != regno1 && regno0 != regno2)
22572 return true;
22574 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22575 return false;
22577 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
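/* The forced-lea case above covers three-operand adds: "addl %ecx, %eax"
   is destructive, so a = b + c with a destination distinct from both
   sources can only be expressed in one instruction as e.g.
   "leal (%rbx,%rcx), %eax".  */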
22580 /* Return true if destination reg of SET_BODY is shift count of
22581 USE_BODY. */
22583 static bool
22584 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22586 rtx set_dest;
22587 rtx shift_rtx;
22588 int i;
22590 /* Retrieve destination of SET_BODY. */
22591 switch (GET_CODE (set_body))
22593 case SET:
22594 set_dest = SET_DEST (set_body);
22595 if (!set_dest || !REG_P (set_dest))
22596 return false;
22597 break;
22598 case PARALLEL:
22599 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22600 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22601 use_body))
22602 return true;
22603 /* FALLTHROUGH */
22604 default:
22605 return false;
22608 /* Retrieve shift count of USE_BODY. */
22609 switch (GET_CODE (use_body))
22611 case SET:
22612 shift_rtx = XEXP (use_body, 1);
22613 break;
22614 case PARALLEL:
22615 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22616 if (ix86_dep_by_shift_count_body (set_body,
22617 XVECEXP (use_body, 0, i)))
22618 return true;
22619 /* FALLTHROUGH */
22620 default:
22621 return false;
22624 if (shift_rtx
22625 && (GET_CODE (shift_rtx) == ASHIFT
22626 || GET_CODE (shift_rtx) == LSHIFTRT
22627 || GET_CODE (shift_rtx) == ASHIFTRT
22628 || GET_CODE (shift_rtx) == ROTATE
22629 || GET_CODE (shift_rtx) == ROTATERT))
22631 rtx shift_count = XEXP (shift_rtx, 1);
22633 /* Return true if shift count is dest of SET_BODY. */
22634 if (REG_P (shift_count))
22636 /* Add this check since it can be invoked before register
22637 allocation in pre-reload scheduling. */
22638 if (reload_completed
22639 && true_regnum (set_dest) == true_regnum (shift_count))
22640 return true;
22641 else if (REGNO (set_dest) == REGNO (shift_count))
22642 return true;
22646 return false;
22649 /* Return true if destination reg of SET_INSN is shift count of
22650 USE_INSN. */
22652 bool
22653 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22655 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22656 PATTERN (use_insn));
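/* E.g. with
     SET_INSN:  movl %edi, %ecx
     USE_INSN:  sall %cl, %eax
   the destination of SET_INSN is the shift count of USE_INSN, so this
   returns true.  */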
22659 /* Return TRUE or FALSE depending on whether the unary operator meets the
22660 appropriate constraints. */
22662 bool
22663 ix86_unary_operator_ok (enum rtx_code,
22664 machine_mode,
22665 rtx operands[2])
22667 /* If one of operands is memory, source and destination must match. */
22668 if ((MEM_P (operands[0])
22669 || MEM_P (operands[1]))
22670 && ! rtx_equal_p (operands[0], operands[1]))
22671 return false;
22672 return true;
22675 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22676 are ok, keeping in mind the possible movddup alternative. */
22678 bool
22679 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22681 if (MEM_P (operands[0]))
22682 return rtx_equal_p (operands[0], operands[1 + high]);
22683 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22684 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22685 return true;
22688 /* Post-reload splitter for converting an SF or DFmode value in an
22689 SSE register into an unsigned SImode. */
22691 void
22692 ix86_split_convert_uns_si_sse (rtx operands[])
22694 machine_mode vecmode;
22695 rtx value, large, zero_or_two31, input, two31, x;
22697 large = operands[1];
22698 zero_or_two31 = operands[2];
22699 input = operands[3];
22700 two31 = operands[4];
22701 vecmode = GET_MODE (large);
22702 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22704 /* Load up the value into the low element. We must ensure that the other
22705 elements are valid floats -- zero is the easiest such value. */
22706 if (MEM_P (input))
22708 if (vecmode == V4SFmode)
22709 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22710 else
22711 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22713 else
22715 input = gen_rtx_REG (vecmode, REGNO (input));
22716 emit_move_insn (value, CONST0_RTX (vecmode));
22717 if (vecmode == V4SFmode)
22718 emit_insn (gen_sse_movss (value, value, input));
22719 else
22720 emit_insn (gen_sse2_movsd (value, value, input));
22723 emit_move_insn (large, two31);
22724 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22726 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22727 emit_insn (gen_rtx_SET (large, x));
22729 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22730 emit_insn (gen_rtx_SET (zero_or_two31, x));
22732 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22733 emit_insn (gen_rtx_SET (value, x));
22735 large = gen_rtx_REG (V4SImode, REGNO (large));
22736 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22738 x = gen_rtx_REG (V4SImode, REGNO (value));
22739 if (vecmode == V4SFmode)
22740 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22741 else
22742 emit_insn (gen_sse2_cvttpd2dq (x, value));
22743 value = x;
22745 emit_insn (gen_xorv4si3 (value, value, large));
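/* Sketch of the idea above: inputs below 2^31 are converted directly by
   the signed cvtt* instruction; inputs >= 2^31 first have 2^31 subtracted
   (keeping them in signed range) and then have bit 31 xored back in.
   For instance 3000000000.0 becomes 852516352 after the subtraction and
   truncating conversion, and 852516352 ^ 0x80000000 == 3000000000.  */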
22748 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22749 Expects the 64-bit DImode to be supplied in a pair of integral
22750 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22751 -mfpmath=sse, !optimize_size only. */
22753 void
22754 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22756 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22757 rtx int_xmm, fp_xmm;
22758 rtx biases, exponents;
22759 rtx x;
22761 int_xmm = gen_reg_rtx (V4SImode);
22762 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22763 emit_insn (gen_movdi_to_sse (int_xmm, input));
22764 else if (TARGET_SSE_SPLIT_REGS)
22766 emit_clobber (int_xmm);
22767 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22769 else
22771 x = gen_reg_rtx (V2DImode);
22772 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22773 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22776 x = gen_rtx_CONST_VECTOR (V4SImode,
22777 gen_rtvec (4, GEN_INT (0x43300000UL),
22778 GEN_INT (0x45300000UL),
22779 const0_rtx, const0_rtx));
22780 exponents = validize_mem (force_const_mem (V4SImode, x));
22782 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22783 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22785 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
22786 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22787 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22788 (0x1.0p84 + double(fp_value_hi_xmm)).
22789 Note these exponents differ by 32. */
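/* In other words, before the bias subtraction the two lanes hold
   0x1.0p52 + (double) lo and 0x1.0p84 + (double) hi * 0x1.0p32; after
   subtracting the biases and adding the lanes we are left with
   lo + hi * 2^32, i.e. the original unsigned 64-bit value.  */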
22791 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22793 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22794 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22795 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22796 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22797 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22798 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22799 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22800 biases = validize_mem (force_const_mem (V2DFmode, biases));
22801 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22803 /* Add the upper and lower DFmode values together. */
22804 if (TARGET_SSE3)
22805 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22806 else
22808 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22809 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22810 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22813 ix86_expand_vector_extract (false, target, fp_xmm, 0);
22816 /* Not used, but eases macroization of patterns. */
22817 void
22818 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22820 gcc_unreachable ();
22823 /* Convert an unsigned SImode value into a DFmode. Only currently used
22824 for SSE, but applicable anywhere. */
22826 void
22827 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22829 REAL_VALUE_TYPE TWO31r;
22830 rtx x, fp;
22832 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22833 NULL, 1, OPTAB_DIRECT);
22835 fp = gen_reg_rtx (DFmode);
22836 emit_insn (gen_floatsidf2 (fp, x));
22838 real_ldexp (&TWO31r, &dconst1, 31);
22839 x = const_double_from_real_value (TWO31r, DFmode);
22841 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22842 if (x != target)
22843 emit_move_insn (target, x);
22846 /* Convert a signed DImode value into a DFmode. Only used for SSE in
22847 32-bit mode; otherwise we have a direct convert instruction. */
22849 void
22850 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
22852 REAL_VALUE_TYPE TWO32r;
22853 rtx fp_lo, fp_hi, x;
22855 fp_lo = gen_reg_rtx (DFmode);
22856 fp_hi = gen_reg_rtx (DFmode);
22858 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
22860 real_ldexp (&TWO32r, &dconst1, 32);
22861 x = const_double_from_real_value (TWO32r, DFmode);
22862 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
22864 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
22866 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
22867 0, OPTAB_DIRECT);
22868 if (x != target)
22869 emit_move_insn (target, x);
22872 /* Convert an unsigned SImode value into a SFmode, using only SSE.
22873 For x86_32, -mfpmath=sse, !optimize_size only. */
22874 void
22875 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
22877 REAL_VALUE_TYPE ONE16r;
22878 rtx fp_hi, fp_lo, int_hi, int_lo, x;
22880 real_ldexp (&ONE16r, &dconst1, 16);
22881 x = const_double_from_real_value (ONE16r, SFmode);
22882 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
22883 NULL, 0, OPTAB_DIRECT);
22884 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
22885 NULL, 0, OPTAB_DIRECT);
22886 fp_hi = gen_reg_rtx (SFmode);
22887 fp_lo = gen_reg_rtx (SFmode);
22888 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
22889 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
22890 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
22891 0, OPTAB_DIRECT);
22892 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
22893 0, OPTAB_DIRECT);
22894 if (!rtx_equal_p (target, fp_hi))
22895 emit_move_insn (target, fp_hi);
22898 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22899 a vector of unsigned ints VAL to vector of floats TARGET. */
22901 void
22902 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22904 rtx tmp[8];
22905 REAL_VALUE_TYPE TWO16r;
22906 machine_mode intmode = GET_MODE (val);
22907 machine_mode fltmode = GET_MODE (target);
22908 rtx (*cvt) (rtx, rtx);
22910 if (intmode == V4SImode)
22911 cvt = gen_floatv4siv4sf2;
22912 else
22913 cvt = gen_floatv8siv8sf2;
22914 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22915 tmp[0] = force_reg (intmode, tmp[0]);
22916 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22917 OPTAB_DIRECT);
22918 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22919 NULL_RTX, 1, OPTAB_DIRECT);
22920 tmp[3] = gen_reg_rtx (fltmode);
22921 emit_insn (cvt (tmp[3], tmp[1]));
22922 tmp[4] = gen_reg_rtx (fltmode);
22923 emit_insn (cvt (tmp[4], tmp[2]));
22924 real_ldexp (&TWO16r, &dconst1, 16);
22925 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22926 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22927 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22928 OPTAB_DIRECT);
22929 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22930 OPTAB_DIRECT);
22931 if (tmp[7] != target)
22932 emit_move_insn (target, tmp[7]);
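/* The unsigned value is split as val = hi * 2^16 + lo with both halves
   below 2^16, so each half converts to float exactly and the result is
   recombined as (float) hi * 65536.0f + (float) lo, with a single
   rounding in the final addition.  */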
22935 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22936 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22937 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22938 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
22941 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22943 REAL_VALUE_TYPE TWO31r;
22944 rtx two31r, tmp[4];
22945 machine_mode mode = GET_MODE (val);
22946 machine_mode scalarmode = GET_MODE_INNER (mode);
22947 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22948 rtx (*cmp) (rtx, rtx, rtx, rtx);
22949 int i;
22951 for (i = 0; i < 3; i++)
22952 tmp[i] = gen_reg_rtx (mode);
22953 real_ldexp (&TWO31r, &dconst1, 31);
22954 two31r = const_double_from_real_value (TWO31r, scalarmode);
22955 two31r = ix86_build_const_vector (mode, 1, two31r);
22956 two31r = force_reg (mode, two31r);
22957 switch (mode)
22959 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22960 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22961 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22962 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22963 default: gcc_unreachable ();
22965 tmp[3] = gen_rtx_LE (mode, two31r, val);
22966 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22967 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22968 0, OPTAB_DIRECT);
22969 if (intmode == V4SImode || TARGET_AVX2)
22970 *xorp = expand_simple_binop (intmode, ASHIFT,
22971 gen_lowpart (intmode, tmp[0]),
22972 GEN_INT (31), NULL_RTX, 0,
22973 OPTAB_DIRECT);
22974 else
22976 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22977 two31 = ix86_build_const_vector (intmode, 1, two31);
22978 *xorp = expand_simple_binop (intmode, AND,
22979 gen_lowpart (intmode, tmp[0]),
22980 two31, NULL_RTX, 0,
22981 OPTAB_DIRECT);
22983 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22984 0, OPTAB_DIRECT);
22987 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22988 then replicate the value for all elements of the vector
22989 register. */
22992 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22994 int i, n_elt;
22995 rtvec v;
22996 machine_mode scalar_mode;
22998 switch (mode)
23000 case E_V64QImode:
23001 case E_V32QImode:
23002 case E_V16QImode:
23003 case E_V32HImode:
23004 case E_V16HImode:
23005 case E_V8HImode:
23006 case E_V16SImode:
23007 case E_V8SImode:
23008 case E_V4SImode:
23009 case E_V8DImode:
23010 case E_V4DImode:
23011 case E_V2DImode:
23012 gcc_assert (vect);
23013 /* FALLTHRU */
23014 case E_V16SFmode:
23015 case E_V8SFmode:
23016 case E_V4SFmode:
23017 case E_V8DFmode:
23018 case E_V4DFmode:
23019 case E_V2DFmode:
23020 n_elt = GET_MODE_NUNITS (mode);
23021 v = rtvec_alloc (n_elt);
23022 scalar_mode = GET_MODE_INNER (mode);
23024 RTVEC_ELT (v, 0) = value;
23026 for (i = 1; i < n_elt; ++i)
23027 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
23029 return gen_rtx_CONST_VECTOR (mode, v);
23031 default:
23032 gcc_unreachable ();
23036 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
23037 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
23038 for an SSE register. If VECT is true, then replicate the mask for
23039 all elements of the vector register. If INVERT is true, then create
23040 a mask excluding the sign bit. */
23043 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
23045 machine_mode vec_mode, imode;
23046 wide_int w;
23047 rtx mask, v;
23049 switch (mode)
23051 case E_V16SImode:
23052 case E_V16SFmode:
23053 case E_V8SImode:
23054 case E_V4SImode:
23055 case E_V8SFmode:
23056 case E_V4SFmode:
23057 vec_mode = mode;
23058 imode = SImode;
23059 break;
23061 case E_V8DImode:
23062 case E_V4DImode:
23063 case E_V2DImode:
23064 case E_V8DFmode:
23065 case E_V4DFmode:
23066 case E_V2DFmode:
23067 vec_mode = mode;
23068 imode = DImode;
23069 break;
23071 case E_TImode:
23072 case E_TFmode:
23073 vec_mode = VOIDmode;
23074 imode = TImode;
23075 break;
23077 default:
23078 gcc_unreachable ();
23081 machine_mode inner_mode = GET_MODE_INNER (mode);
23082 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
23083 GET_MODE_BITSIZE (inner_mode));
23084 if (invert)
23085 w = wi::bit_not (w);
23087 /* Force this value into the low part of a fp vector constant. */
23088 mask = immed_wide_int_const (w, imode);
23089 mask = gen_lowpart (inner_mode, mask);
23091 if (vec_mode == VOIDmode)
23092 return force_reg (inner_mode, mask);
23094 v = ix86_build_const_vector (vec_mode, vect, mask);
23095 return force_reg (vec_mode, v);
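/* For example, for V2DFmode this builds the constant { -0.0, -0.0 }
   (bit pattern 0x8000000000000000 in each lane); with INVERT it builds
   the complementary 0x7fffffffffffffff mask used to clear the sign
   bit.  */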
23098 /* Generate code for floating point ABS or NEG. */
23100 void
23101 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
23102 rtx operands[])
23104 rtx mask, set, dst, src;
23105 bool use_sse = false;
23106 bool vector_mode = VECTOR_MODE_P (mode);
23107 machine_mode vmode = mode;
23109 if (vector_mode)
23110 use_sse = true;
23111 else if (mode == TFmode)
23112 use_sse = true;
23113 else if (TARGET_SSE_MATH)
23115 use_sse = SSE_FLOAT_MODE_P (mode);
23116 if (mode == SFmode)
23117 vmode = V4SFmode;
23118 else if (mode == DFmode)
23119 vmode = V2DFmode;
23122 /* NEG and ABS performed with SSE use bitwise mask operations.
23123 Create the appropriate mask now. */
23124 if (use_sse)
23125 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
23126 else
23127 mask = NULL_RTX;
23129 dst = operands[0];
23130 src = operands[1];
23132 set = gen_rtx_fmt_e (code, mode, src);
23133 set = gen_rtx_SET (dst, set);
23135 if (mask)
23137 rtx use, clob;
23138 rtvec par;
23140 use = gen_rtx_USE (VOIDmode, mask);
23141 if (vector_mode)
23142 par = gen_rtvec (2, set, use);
23143 else
23145 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
23146 par = gen_rtvec (3, set, use, clob);
23148 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
23150 else
23151 emit_insn (set);
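/* With SSE math this typically splits into a single bitwise instruction,
   e.g. roughly "xorpd mask(%rip), %xmm0" for negation (mask has only the
   sign bits set) and "andpd mask(%rip), %xmm0" for abs (mask has the sign
   bits clear), while the 387 path uses fchs/fabs directly.  */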
23154 /* Expand a copysign operation. Special case operand 0 being a constant. */
23156 void
23157 ix86_expand_copysign (rtx operands[])
23159 machine_mode mode, vmode;
23160 rtx dest, op0, op1, mask, nmask;
23162 dest = operands[0];
23163 op0 = operands[1];
23164 op1 = operands[2];
23166 mode = GET_MODE (dest);
23168 if (mode == SFmode)
23169 vmode = V4SFmode;
23170 else if (mode == DFmode)
23171 vmode = V2DFmode;
23172 else
23173 vmode = mode;
23175 if (CONST_DOUBLE_P (op0))
23177 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
23179 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
23180 op0 = simplify_unary_operation (ABS, mode, op0, mode);
23182 if (mode == SFmode || mode == DFmode)
23184 if (op0 == CONST0_RTX (mode))
23185 op0 = CONST0_RTX (vmode);
23186 else
23188 rtx v = ix86_build_const_vector (vmode, false, op0);
23190 op0 = force_reg (vmode, v);
23193 else if (op0 != CONST0_RTX (mode))
23194 op0 = force_reg (mode, op0);
23196 mask = ix86_build_signbit_mask (vmode, 0, 0);
23198 if (mode == SFmode)
23199 copysign_insn = gen_copysignsf3_const;
23200 else if (mode == DFmode)
23201 copysign_insn = gen_copysigndf3_const;
23202 else
23203 copysign_insn = gen_copysigntf3_const;
23205 emit_insn (copysign_insn (dest, op0, op1, mask));
23207 else
23209 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
23211 nmask = ix86_build_signbit_mask (vmode, 0, 1);
23212 mask = ix86_build_signbit_mask (vmode, 0, 0);
23214 if (mode == SFmode)
23215 copysign_insn = gen_copysignsf3_var;
23216 else if (mode == DFmode)
23217 copysign_insn = gen_copysigndf3_var;
23218 else
23219 copysign_insn = gen_copysigntf3_var;
23221 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
23225 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
23226 be a constant, and so has already been expanded into a vector constant. */
23228 void
23229 ix86_split_copysign_const (rtx operands[])
23231 machine_mode mode, vmode;
23232 rtx dest, op0, mask, x;
23234 dest = operands[0];
23235 op0 = operands[1];
23236 mask = operands[3];
23238 mode = GET_MODE (dest);
23239 vmode = GET_MODE (mask);
23241 dest = lowpart_subreg (vmode, dest, mode);
23242 x = gen_rtx_AND (vmode, dest, mask);
23243 emit_insn (gen_rtx_SET (dest, x));
23245 if (op0 != CONST0_RTX (vmode))
23247 x = gen_rtx_IOR (vmode, dest, op0);
23248 emit_insn (gen_rtx_SET (dest, x));
23252 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
23253 so we have to do two masks. */
23255 void
23256 ix86_split_copysign_var (rtx operands[])
23258 machine_mode mode, vmode;
23259 rtx dest, scratch, op0, op1, mask, nmask, x;
23261 dest = operands[0];
23262 scratch = operands[1];
23263 op0 = operands[2];
23264 op1 = operands[3];
23265 nmask = operands[4];
23266 mask = operands[5];
23268 mode = GET_MODE (dest);
23269 vmode = GET_MODE (mask);
23271 if (rtx_equal_p (op0, op1))
23273 /* Shouldn't happen often (it's useless, obviously), but when it does
23274 we'd generate incorrect code if we continue below. */
23275 emit_move_insn (dest, op0);
23276 return;
23279 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
23281 gcc_assert (REGNO (op1) == REGNO (scratch));
23283 x = gen_rtx_AND (vmode, scratch, mask);
23284 emit_insn (gen_rtx_SET (scratch, x));
23286 dest = mask;
23287 op0 = lowpart_subreg (vmode, op0, mode);
23288 x = gen_rtx_NOT (vmode, dest);
23289 x = gen_rtx_AND (vmode, x, op0);
23290 emit_insn (gen_rtx_SET (dest, x));
23292 else
23294 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23296 x = gen_rtx_AND (vmode, scratch, mask);
23298 else /* alternative 2,4 */
23300 gcc_assert (REGNO (mask) == REGNO (scratch));
23301 op1 = lowpart_subreg (vmode, op1, mode);
23302 x = gen_rtx_AND (vmode, scratch, op1);
23304 emit_insn (gen_rtx_SET (scratch, x));
23306 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23308 dest = lowpart_subreg (vmode, op0, mode);
23309 x = gen_rtx_AND (vmode, dest, nmask);
23311 else /* alternative 3,4 */
23313 gcc_assert (REGNO (nmask) == REGNO (dest));
23314 dest = nmask;
23315 op0 = lowpart_subreg (vmode, op0, mode);
23316 x = gen_rtx_AND (vmode, dest, op0);
23318 emit_insn (gen_rtx_SET (dest, x));
23321 x = gen_rtx_IOR (vmode, dest, scratch);
23322 emit_insn (gen_rtx_SET (dest, x));
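/* Both copysign splitters implement the same bitwise identity,
   copysign (x, y) = (x & ~signmask) | (y & signmask); the difference is
   whether the x & ~signmask half was already folded into a constant by
   ix86_expand_copysign.  */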
23325 /* Return TRUE or FALSE depending on whether the first SET in INSN
23326 has source and destination with matching CC modes, and that the
23327 CC mode is at least as constrained as REQ_MODE. */
23329 bool
23330 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23332 rtx set;
23333 machine_mode set_mode;
23335 set = PATTERN (insn);
23336 if (GET_CODE (set) == PARALLEL)
23337 set = XVECEXP (set, 0, 0);
23338 gcc_assert (GET_CODE (set) == SET);
23339 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23341 set_mode = GET_MODE (SET_DEST (set));
23342 switch (set_mode)
23344 case E_CCNOmode:
23345 if (req_mode != CCNOmode
23346 && (req_mode != CCmode
23347 || XEXP (SET_SRC (set), 1) != const0_rtx))
23348 return false;
23349 break;
23350 case E_CCmode:
23351 if (req_mode == CCGCmode)
23352 return false;
23353 /* FALLTHRU */
23354 case E_CCGCmode:
23355 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23356 return false;
23357 /* FALLTHRU */
23358 case E_CCGOCmode:
23359 if (req_mode == CCZmode)
23360 return false;
23361 /* FALLTHRU */
23362 case E_CCZmode:
23363 break;
23365 case E_CCAmode:
23366 case E_CCCmode:
23367 case E_CCOmode:
23368 case E_CCPmode:
23369 case E_CCSmode:
23370 if (set_mode != req_mode)
23371 return false;
23372 break;
23374 default:
23375 gcc_unreachable ();
23378 return GET_MODE (SET_SRC (set)) == set_mode;
23381 /* Generate insn patterns to do an integer compare of OPERANDS. */
23383 static rtx
23384 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23386 machine_mode cmpmode;
23387 rtx tmp, flags;
23389 cmpmode = SELECT_CC_MODE (code, op0, op1);
23390 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23392 /* This is very simple, but making the interface the same as in the
23393 FP case makes the rest of the code easier. */
23394 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23395 emit_insn (gen_rtx_SET (flags, tmp));
23397 /* Return the test that should be put into the flags user, i.e.
23398 the bcc, scc, or cmov instruction. */
23399 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23402 /* Figure out whether to use ordered or unordered fp comparisons.
23403 Return the appropriate mode to use. */
23405 machine_mode
23406 ix86_fp_compare_mode (enum rtx_code)
23408 /* ??? In order to make all comparisons reversible, we do all comparisons
23409 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23410 all trapping and nontrapping forms of comparisons, we can make inequality
23411 comparisons trapping again, since that results in better code when using
23412 FCOM based compares. */
23413 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23416 machine_mode
23417 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23419 machine_mode mode = GET_MODE (op0);
23421 if (SCALAR_FLOAT_MODE_P (mode))
23423 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23424 return ix86_fp_compare_mode (code);
23427 switch (code)
23429 /* Only zero flag is needed. */
23430 case EQ: /* ZF=0 */
23431 case NE: /* ZF!=0 */
23432 return CCZmode;
23433 /* Codes needing carry flag. */
23434 case GEU: /* CF=0 */
23435 case LTU: /* CF=1 */
23436 /* Detect overflow checks. They need just the carry flag. */
23437 if (GET_CODE (op0) == PLUS
23438 && (rtx_equal_p (op1, XEXP (op0, 0))
23439 || rtx_equal_p (op1, XEXP (op0, 1))))
23440 return CCCmode;
23441 else
23442 return CCmode;
23443 case GTU: /* CF=0 & ZF=0 */
23444 case LEU: /* CF=1 | ZF=1 */
23445 return CCmode;
23446 /* Codes possibly doable only with sign flag when
23447 comparing against zero. */
23448 case GE: /* SF=OF or SF=0 */
23449 case LT: /* SF<>OF or SF=1 */
23450 if (op1 == const0_rtx)
23451 return CCGOCmode;
23452 else
23453 /* For other cases Carry flag is not required. */
23454 return CCGCmode;
23455 /* Codes doable only with the sign flag when comparing
23456 against zero, but we lack a jump instruction for it,
23457 so we need to use relational tests against overflow,
23458 which thus needs to be zero. */
23459 case GT: /* ZF=0 & SF=OF */
23460 case LE: /* ZF=1 | SF<>OF */
23461 if (op1 == const0_rtx)
23462 return CCNOmode;
23463 else
23464 return CCGCmode;
23465 /* The strcmp pattern does (use flags) and combine may ask us for the proper
23466 mode. */
23467 case USE:
23468 return CCmode;
23469 default:
23470 gcc_unreachable ();
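/* For instance, an unsigned overflow check written as (a + b) < a is
   matched by the GEU/LTU case above and gets CCCmode, because only the
   carry flag produced by the addition needs to be inspected.  */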
23474 /* Return the fixed registers used for condition codes. */
23476 static bool
23477 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23479 *p1 = FLAGS_REG;
23480 *p2 = FPSR_REG;
23481 return true;
23484 /* If two condition code modes are compatible, return a condition code
23485 mode which is compatible with both. Otherwise, return
23486 VOIDmode. */
23488 static machine_mode
23489 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23491 if (m1 == m2)
23492 return m1;
23494 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23495 return VOIDmode;
23497 if ((m1 == CCGCmode && m2 == CCGOCmode)
23498 || (m1 == CCGOCmode && m2 == CCGCmode))
23499 return CCGCmode;
23501 if ((m1 == CCNOmode && m2 == CCGOCmode)
23502 || (m1 == CCGOCmode && m2 == CCNOmode))
23503 return CCNOmode;
23505 if (m1 == CCZmode
23506 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23507 return m2;
23508 else if (m2 == CCZmode
23509 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23510 return m1;
23512 switch (m1)
23514 default:
23515 gcc_unreachable ();
23517 case E_CCmode:
23518 case E_CCGCmode:
23519 case E_CCGOCmode:
23520 case E_CCNOmode:
23521 case E_CCAmode:
23522 case E_CCCmode:
23523 case E_CCOmode:
23524 case E_CCPmode:
23525 case E_CCSmode:
23526 case E_CCZmode:
23527 switch (m2)
23529 default:
23530 return VOIDmode;
23532 case E_CCmode:
23533 case E_CCGCmode:
23534 case E_CCGOCmode:
23535 case E_CCNOmode:
23536 case E_CCAmode:
23537 case E_CCCmode:
23538 case E_CCOmode:
23539 case E_CCPmode:
23540 case E_CCSmode:
23541 case E_CCZmode:
23542 return CCmode;
23545 case E_CCFPmode:
23546 case E_CCFPUmode:
23547 /* These are only compatible with themselves, which we already
23548 checked above. */
23549 return VOIDmode;
23554 /* Return a comparison we can do and that it is equivalent to
23555 swap_condition (code) apart possibly from orderedness.
23556 But, never change orderedness if TARGET_IEEE_FP, returning
23557 UNKNOWN in that case if necessary. */
23559 static enum rtx_code
23560 ix86_fp_swap_condition (enum rtx_code code)
23562 switch (code)
23564 case GT: /* GTU - CF=0 & ZF=0 */
23565 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23566 case GE: /* GEU - CF=0 */
23567 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23568 case UNLT: /* LTU - CF=1 */
23569 return TARGET_IEEE_FP ? UNKNOWN : GT;
23570 case UNLE: /* LEU - CF=1 | ZF=1 */
23571 return TARGET_IEEE_FP ? UNKNOWN : GE;
23572 default:
23573 return swap_condition (code);
23577 /* Return the cost of comparison CODE using the best strategy for performance.
23578 All following functions use the number of instructions as a cost metric.
23579 In the future this should be tweaked to compute bytes for optimize_size and
23580 take into account the performance of various instructions on various CPUs. */
23582 static int
23583 ix86_fp_comparison_cost (enum rtx_code code)
23585 int arith_cost;
23587 /* The cost of code using bit-twiddling on %ah. */
23588 switch (code)
23590 case UNLE:
23591 case UNLT:
23592 case LTGT:
23593 case GT:
23594 case GE:
23595 case UNORDERED:
23596 case ORDERED:
23597 case UNEQ:
23598 arith_cost = 4;
23599 break;
23600 case LT:
23601 case NE:
23602 case EQ:
23603 case UNGE:
23604 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23605 break;
23606 case LE:
23607 case UNGT:
23608 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23609 break;
23610 default:
23611 gcc_unreachable ();
23614 switch (ix86_fp_comparison_strategy (code))
23616 case IX86_FPCMP_COMI:
23617 return arith_cost > 4 ? 3 : 2;
23618 case IX86_FPCMP_SAHF:
23619 return arith_cost > 4 ? 4 : 3;
23620 default:
23621 return arith_cost;
23625 /* Return the strategy to use for floating-point. We assume that fcomi is always
23626 preferable where available, since that is also true when looking at size
23627 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23629 enum ix86_fpcmp_strategy
23630 ix86_fp_comparison_strategy (enum rtx_code)
23632 /* Do fcomi/sahf based test when profitable. */
23634 if (TARGET_CMOVE)
23635 return IX86_FPCMP_COMI;
23637 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23638 return IX86_FPCMP_SAHF;
23640 return IX86_FPCMP_ARITH;
23643 /* Swap, force into registers, or otherwise massage the two operands
23644 to a fp comparison. The operands are updated in place; the new
23645 comparison code is returned. */
23647 static enum rtx_code
23648 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23650 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23651 rtx op0 = *pop0, op1 = *pop1;
23652 machine_mode op_mode = GET_MODE (op0);
23653 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23655 /* All of the unordered compare instructions only work on registers.
23656 The same is true of the fcomi compare instructions. The XFmode
23657 compare instructions require registers except when comparing
23658 against zero or when converting operand 1 from fixed point to
23659 floating point. */
23661 if (!is_sse
23662 && (fpcmp_mode == CCFPUmode
23663 || (op_mode == XFmode
23664 && ! (standard_80387_constant_p (op0) == 1
23665 || standard_80387_constant_p (op1) == 1)
23666 && GET_CODE (op1) != FLOAT)
23667 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23669 op0 = force_reg (op_mode, op0);
23670 op1 = force_reg (op_mode, op1);
23672 else
23674 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23675 things around if they appear profitable, otherwise force op0
23676 into a register. */
23678 if (standard_80387_constant_p (op0) == 0
23679 || (MEM_P (op0)
23680 && ! (standard_80387_constant_p (op1) == 0
23681 || MEM_P (op1))))
23683 enum rtx_code new_code = ix86_fp_swap_condition (code);
23684 if (new_code != UNKNOWN)
23686 std::swap (op0, op1);
23687 code = new_code;
23691 if (!REG_P (op0))
23692 op0 = force_reg (op_mode, op0);
23694 if (CONSTANT_P (op1))
23696 int tmp = standard_80387_constant_p (op1);
23697 if (tmp == 0)
23698 op1 = validize_mem (force_const_mem (op_mode, op1));
23699 else if (tmp == 1)
23701 if (TARGET_CMOVE)
23702 op1 = force_reg (op_mode, op1);
23704 else
23705 op1 = force_reg (op_mode, op1);
23709 /* Try to rearrange the comparison to make it cheaper. */
23710 if (ix86_fp_comparison_cost (code)
23711 > ix86_fp_comparison_cost (swap_condition (code))
23712 && (REG_P (op1) || can_create_pseudo_p ()))
23714 std::swap (op0, op1);
23715 code = swap_condition (code);
23716 if (!REG_P (op0))
23717 op0 = force_reg (op_mode, op0);
23720 *pop0 = op0;
23721 *pop1 = op1;
23722 return code;
23725 /* Convert comparison codes we use to represent FP comparison to integer
23726 code that will result in proper branch. Return UNKNOWN if no such code
23727 is available. */
23729 enum rtx_code
23730 ix86_fp_compare_code_to_integer (enum rtx_code code)
23732 switch (code)
23734 case GT:
23735 return GTU;
23736 case GE:
23737 return GEU;
23738 case ORDERED:
23739 case UNORDERED:
23740 return code;
23741 case UNEQ:
23742 return EQ;
23743 case UNLT:
23744 return LTU;
23745 case UNLE:
23746 return LEU;
23747 case LTGT:
23748 return NE;
23749 default:
23750 return UNKNOWN;
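/* The mapping reflects how comi/fcomi deposit the FP result in the integer
   flags: "greater" leaves CF and ZF clear, so GT becomes GTU (ja) and GE
   becomes GEU (jae), while an unordered result sets ZF, PF and CF.  */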
23754 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23756 static rtx
23757 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23759 machine_mode fpcmp_mode, intcmp_mode;
23760 rtx tmp, tmp2;
23762 fpcmp_mode = ix86_fp_compare_mode (code);
23763 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23765 /* Do fcomi/sahf based test when profitable. */
23766 switch (ix86_fp_comparison_strategy (code))
23768 case IX86_FPCMP_COMI:
23769 intcmp_mode = fpcmp_mode;
23770 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23771 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23772 emit_insn (tmp);
23773 break;
23775 case IX86_FPCMP_SAHF:
23776 intcmp_mode = fpcmp_mode;
23777 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23778 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23780 if (!scratch)
23781 scratch = gen_reg_rtx (HImode);
23782 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23783 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23784 break;
23786 case IX86_FPCMP_ARITH:
23787 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23788 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23789 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23790 if (!scratch)
23791 scratch = gen_reg_rtx (HImode);
23792 emit_insn (gen_rtx_SET (scratch, tmp2));
23794 /* In the unordered case, we have to check C2 for NaNs, which
23795 doesn't happen to work out to anything nice combination-wise.
23796 So do some bit twiddling on the value we've got in AH to come
23797 up with an appropriate set of condition codes. */
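/* After fnstsw the relevant status-word bits land in AH as C0 (0x01),
   C2 (0x04) and C3 (0x40); the masks 0x45, 0x05, 0x44 etc. below are
   combinations of those three bits.  */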
23799 intcmp_mode = CCNOmode;
23800 switch (code)
23802 case GT:
23803 case UNGT:
23804 if (code == GT || !TARGET_IEEE_FP)
23806 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23807 code = EQ;
23809 else
23811 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23812 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23813 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23814 intcmp_mode = CCmode;
23815 code = GEU;
23817 break;
23818 case LT:
23819 case UNLT:
23820 if (code == LT && TARGET_IEEE_FP)
23822 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23823 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23824 intcmp_mode = CCmode;
23825 code = EQ;
23827 else
23829 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23830 code = NE;
23832 break;
23833 case GE:
23834 case UNGE:
23835 if (code == GE || !TARGET_IEEE_FP)
23837 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23838 code = EQ;
23840 else
23842 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23843 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23844 code = NE;
23846 break;
23847 case LE:
23848 case UNLE:
23849 if (code == LE && TARGET_IEEE_FP)
23851 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23852 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23853 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23854 intcmp_mode = CCmode;
23855 code = LTU;
23857 else
23859 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23860 code = NE;
23862 break;
23863 case EQ:
23864 case UNEQ:
23865 if (code == EQ && TARGET_IEEE_FP)
23867 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23868 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23869 intcmp_mode = CCmode;
23870 code = EQ;
23872 else
23874 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23875 code = NE;
23877 break;
23878 case NE:
23879 case LTGT:
23880 if (code == NE && TARGET_IEEE_FP)
23882 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23883 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23884 GEN_INT (0x40)));
23885 code = NE;
23887 else
23889 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23890 code = EQ;
23892 break;
23894 case UNORDERED:
23895 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23896 code = NE;
23897 break;
23898 case ORDERED:
23899 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23900 code = EQ;
23901 break;
23903 default:
23904 gcc_unreachable ();
23906 break;
23908 default:
23909 gcc_unreachable();
23912 /* Return the test that should be put into the flags user, i.e.
23913 the bcc, scc, or cmov instruction. */
23914 return gen_rtx_fmt_ee (code, VOIDmode,
23915 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23916 const0_rtx);
23919 static rtx
23920 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23922 rtx ret;
23924 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23925 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23927 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23929 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23930 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23932 else
23933 ret = ix86_expand_int_compare (code, op0, op1);
23935 return ret;
23938 void
23939 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23941 machine_mode mode = GET_MODE (op0);
23942 rtx tmp;
23944 /* Handle the special case of a vector comparison with a boolean result;
23945 transform it using the ptest instruction. */
23946 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23948 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23949 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23951 gcc_assert (code == EQ || code == NE);
23952 /* Generate an XOR since we can't check that one operand is a zero vector. */
23953 tmp = gen_reg_rtx (mode);
23954 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23955 tmp = gen_lowpart (p_mode, tmp);
23956 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23957 gen_rtx_UNSPEC (CCmode,
23958 gen_rtvec (2, tmp, tmp),
23959 UNSPEC_PTEST)));
23960 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23961 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23962 gen_rtx_LABEL_REF (VOIDmode, label),
23963 pc_rtx);
23964 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23965 return;
23968 switch (mode)
23970 case E_SFmode:
23971 case E_DFmode:
23972 case E_XFmode:
23973 case E_QImode:
23974 case E_HImode:
23975 case E_SImode:
23976 simple:
23977 tmp = ix86_expand_compare (code, op0, op1);
23978 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23979 gen_rtx_LABEL_REF (VOIDmode, label),
23980 pc_rtx);
23981 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23982 return;
23984 case E_DImode:
23985 if (TARGET_64BIT)
23986 goto simple;
23987 /* For a 32-bit target, DImode comparison may be performed on
23988 SSE registers. To allow this we should avoid splitting
23989 to SImode, which is achieved by doing the xor in DImode
23990 and then comparing with zero (which is recognized by the
23991 STV pass). We don't compare using xor when optimizing
23992 for size. */
23993 if (!optimize_insn_for_size_p ()
23994 && TARGET_STV
23995 && (code == EQ || code == NE))
23997 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23998 op1 = const0_rtx;
24000 /* FALLTHRU */
24001 case E_TImode:
24002 /* Expand DImode branch into multiple compare+branch. */
24004 rtx lo[2], hi[2];
24005 rtx_code_label *label2;
24006 enum rtx_code code1, code2, code3;
24007 machine_mode submode;
24009 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
24011 std::swap (op0, op1);
24012 code = swap_condition (code);
24015 split_double_mode (mode, &op0, 1, lo+0, hi+0);
24016 split_double_mode (mode, &op1, 1, lo+1, hi+1);
24018 submode = mode == DImode ? SImode : DImode;
24020 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
24021 avoid two branches. This costs one extra insn, so disable when
24022 optimizing for size. */
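/* E.g. a 64-bit a == b on a 32-bit target can be compiled as roughly
     xorl hi(b), hi(a)
     xorl lo(b), lo(a)
     orl  hi(a), lo(a)
     je   label
   instead of two separate compare-and-branch sequences.  */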
24024 if ((code == EQ || code == NE)
24025 && (!optimize_insn_for_size_p ()
24026 || hi[1] == const0_rtx || lo[1] == const0_rtx))
24028 rtx xor0, xor1;
24030 xor1 = hi[0];
24031 if (hi[1] != const0_rtx)
24032 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
24033 NULL_RTX, 0, OPTAB_WIDEN);
24035 xor0 = lo[0];
24036 if (lo[1] != const0_rtx)
24037 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
24038 NULL_RTX, 0, OPTAB_WIDEN);
24040 tmp = expand_binop (submode, ior_optab, xor1, xor0,
24041 NULL_RTX, 0, OPTAB_WIDEN);
24043 ix86_expand_branch (code, tmp, const0_rtx, label);
24044 return;
24047 /* Otherwise, if we are doing less-than or greater-or-equal-than,
24048 op1 is a constant and the low word is zero, then we can just
24049 examine the high word. Similarly for low word -1 and
24050 less-or-equal-than or greater-than. */
24052 if (CONST_INT_P (hi[1]))
24053 switch (code)
24055 case LT: case LTU: case GE: case GEU:
24056 if (lo[1] == const0_rtx)
24058 ix86_expand_branch (code, hi[0], hi[1], label);
24059 return;
24061 break;
24062 case LE: case LEU: case GT: case GTU:
24063 if (lo[1] == constm1_rtx)
24065 ix86_expand_branch (code, hi[0], hi[1], label);
24066 return;
24068 break;
24069 default:
24070 break;
24073 /* Otherwise, we need two or three jumps. */
24075 label2 = gen_label_rtx ();
24077 code1 = code;
24078 code2 = swap_condition (code);
24079 code3 = unsigned_condition (code);
24081 switch (code)
24083 case LT: case GT: case LTU: case GTU:
24084 break;
24086 case LE: code1 = LT; code2 = GT; break;
24087 case GE: code1 = GT; code2 = LT; break;
24088 case LEU: code1 = LTU; code2 = GTU; break;
24089 case GEU: code1 = GTU; code2 = LTU; break;
24091 case EQ: code1 = UNKNOWN; code2 = NE; break;
24092 case NE: code2 = UNKNOWN; break;
24094 default:
24095 gcc_unreachable ();
24099 * a < b =>
24100 * if (hi(a) < hi(b)) goto true;
24101 * if (hi(a) > hi(b)) goto false;
24102 * if (lo(a) < lo(b)) goto true;
24103 * false:
24106 if (code1 != UNKNOWN)
24107 ix86_expand_branch (code1, hi[0], hi[1], label);
24108 if (code2 != UNKNOWN)
24109 ix86_expand_branch (code2, hi[0], hi[1], label2);
24111 ix86_expand_branch (code3, lo[0], lo[1], label);
24113 if (code2 != UNKNOWN)
24114 emit_label (label2);
24115 return;
24118 default:
24119 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
24120 goto simple;
24124 /* Split branch based on floating point condition. */
24125 void
24126 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
24127 rtx target1, rtx target2, rtx tmp)
24129 rtx condition;
24130 rtx_insn *i;
24132 if (target2 != pc_rtx)
24134 std::swap (target1, target2);
24135 code = reverse_condition_maybe_unordered (code);
24138 condition = ix86_expand_fp_compare (code, op1, op2,
24139 tmp);
24141 i = emit_jump_insn (gen_rtx_SET
24142 (pc_rtx,
24143 gen_rtx_IF_THEN_ELSE (VOIDmode,
24144 condition, target1, target2)));
24145 if (split_branch_probability.initialized_p ())
24146 add_reg_br_prob_note (i, split_branch_probability);
24149 void
24150 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
24152 rtx ret;
24154 gcc_assert (GET_MODE (dest) == QImode);
24156 ret = ix86_expand_compare (code, op0, op1);
24157 PUT_MODE (ret, QImode);
24158 emit_insn (gen_rtx_SET (dest, ret));
24161 /* Expand comparison setting or clearing carry flag. Return true when
24162 successful and set pop for the operation. */
24163 static bool
24164 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
24166 machine_mode mode =
24167 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
24169 /* Do not handle double-mode compares that go through the special path. */
24170 if (mode == (TARGET_64BIT ? TImode : DImode))
24171 return false;
24173 if (SCALAR_FLOAT_MODE_P (mode))
24175 rtx compare_op;
24176 rtx_insn *compare_seq;
24178 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
24180 /* Shortcut: the following common codes never translate
24181 into carry flag compares. */
24182 if (code == EQ || code == NE || code == UNEQ || code == LTGT
24183 || code == ORDERED || code == UNORDERED)
24184 return false;
24186 /* These comparisons require the zero flag; swap operands so they won't need it. */
24187 if ((code == GT || code == UNLE || code == LE || code == UNGT)
24188 && !TARGET_IEEE_FP)
24190 std::swap (op0, op1);
24191 code = swap_condition (code);
24194 /* Try to expand the comparison and verify that we end up with
24195 a carry flag based comparison. This fails to be true only when
24196 we decide to expand the comparison using arithmetic, which is not
24197 a common scenario. */
24198 start_sequence ();
24199 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
24200 compare_seq = get_insns ();
24201 end_sequence ();
24203 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
24204 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
24205 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
24206 else
24207 code = GET_CODE (compare_op);
24209 if (code != LTU && code != GEU)
24210 return false;
24212 emit_insn (compare_seq);
24213 *pop = compare_op;
24214 return true;
24217 if (!INTEGRAL_MODE_P (mode))
24218 return false;
24220 switch (code)
24222 case LTU:
24223 case GEU:
24224 break;
24226 /* Convert a==0 into (unsigned)a<1. */
24227 case EQ:
24228 case NE:
24229 if (op1 != const0_rtx)
24230 return false;
24231 op1 = const1_rtx;
24232 code = (code == EQ ? LTU : GEU);
24233 break;
24235 /* Convert a>b into b<a or a>=b-1. */
24236 case GTU:
24237 case LEU:
24238 if (CONST_INT_P (op1))
24240 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
24241 /* Bail out on overflow. We can still swap operands, but that
24242 would force loading of the constant into a register. */
24243 if (op1 == const0_rtx
24244 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
24245 return false;
24246 code = (code == GTU ? GEU : LTU);
24248 else
24250 std::swap (op0, op1);
24251 code = (code == GTU ? LTU : GEU);
24253 break;
24255 /* Convert a>=0 into (unsigned)a<0x80000000. */
24256 case LT:
24257 case GE:
24258 if (mode == DImode || op1 != const0_rtx)
24259 return false;
24260 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24261 code = (code == LT ? GEU : LTU);
24262 break;
24263 case LE:
24264 case GT:
24265 if (mode == DImode || op1 != constm1_rtx)
24266 return false;
24267 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24268 code = (code == LE ? GEU : LTU);
24269 break;
24271 default:
24272 return false;
24274 /* Swapping operands may cause a constant to appear as the first operand. */
24275 if (!nonimmediate_operand (op0, VOIDmode))
24277 if (!can_create_pseudo_p ())
24278 return false;
24279 op0 = force_reg (mode, op0);
24281 *pop = ix86_expand_compare (code, op0, op1);
24282 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
24283 return true;
24286 bool
24287 ix86_expand_int_movcc (rtx operands[])
24289 enum rtx_code code = GET_CODE (operands[1]), compare_code;
24290 rtx_insn *compare_seq;
24291 rtx compare_op;
24292 machine_mode mode = GET_MODE (operands[0]);
24293 bool sign_bit_compare_p = false;
24294 rtx op0 = XEXP (operands[1], 0);
24295 rtx op1 = XEXP (operands[1], 1);
24297 if (GET_MODE (op0) == TImode
24298 || (GET_MODE (op0) == DImode
24299 && !TARGET_64BIT))
24300 return false;
24302 start_sequence ();
24303 compare_op = ix86_expand_compare (code, op0, op1);
24304 compare_seq = get_insns ();
24305 end_sequence ();
24307 compare_code = GET_CODE (compare_op);
24309 if ((op1 == const0_rtx && (code == GE || code == LT))
24310 || (op1 == constm1_rtx && (code == GT || code == LE)))
24311 sign_bit_compare_p = true;
24313 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24314 HImode insns, we'd be swallowed in word prefix ops. */
24316 if ((mode != HImode || TARGET_FAST_PREFIX)
24317 && (mode != (TARGET_64BIT ? TImode : DImode))
24318 && CONST_INT_P (operands[2])
24319 && CONST_INT_P (operands[3]))
24321 rtx out = operands[0];
24322 HOST_WIDE_INT ct = INTVAL (operands[2]);
24323 HOST_WIDE_INT cf = INTVAL (operands[3]);
24324 HOST_WIDE_INT diff;
24326 diff = ct - cf;
24327 /* Sign bit compares are better done using shifts than by using
24328 sbb. */
24329 if (sign_bit_compare_p
24330 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24332 /* Detect overlap between destination and compare sources. */
24333 rtx tmp = out;
24335 if (!sign_bit_compare_p)
24337 rtx flags;
24338 bool fpcmp = false;
24340 compare_code = GET_CODE (compare_op);
24342 flags = XEXP (compare_op, 0);
24344 if (GET_MODE (flags) == CCFPmode
24345 || GET_MODE (flags) == CCFPUmode)
24347 fpcmp = true;
24348 compare_code
24349 = ix86_fp_compare_code_to_integer (compare_code);
24352 /* To simplify rest of code, restrict to the GEU case. */
24353 if (compare_code == LTU)
24355 std::swap (ct, cf);
24356 compare_code = reverse_condition (compare_code);
24357 code = reverse_condition (code);
24359 else
24361 if (fpcmp)
24362 PUT_CODE (compare_op,
24363 reverse_condition_maybe_unordered
24364 (GET_CODE (compare_op)));
24365 else
24366 PUT_CODE (compare_op,
24367 reverse_condition (GET_CODE (compare_op)));
24369 diff = ct - cf;
24371 if (reg_overlap_mentioned_p (out, op0)
24372 || reg_overlap_mentioned_p (out, op1))
24373 tmp = gen_reg_rtx (mode);
24375 if (mode == DImode)
24376 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24377 else
24378 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24379 flags, compare_op));
24381 else
24383 if (code == GT || code == GE)
24384 code = reverse_condition (code);
24385 else
24387 std::swap (ct, cf);
24388 diff = ct - cf;
24390 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
24393 if (diff == 1)
24396 * cmpl op0,op1
24397 * sbbl dest,dest
24398 * [addl dest, ct]
24400 * Size 5 - 8.
24402 if (ct)
24403 tmp = expand_simple_binop (mode, PLUS,
24404 tmp, GEN_INT (ct),
24405 copy_rtx (tmp), 1, OPTAB_DIRECT);
24407 else if (cf == -1)
24410 * cmpl op0,op1
24411 * sbbl dest,dest
24412 * orl $ct, dest
24414 * Size 8.
24416 tmp = expand_simple_binop (mode, IOR,
24417 tmp, GEN_INT (ct),
24418 copy_rtx (tmp), 1, OPTAB_DIRECT);
24420 else if (diff == -1 && ct)
24423 * cmpl op0,op1
24424 * sbbl dest,dest
24425 * notl dest
24426 * [addl dest, cf]
24428 * Size 8 - 11.
24430 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24431 if (cf)
24432 tmp = expand_simple_binop (mode, PLUS,
24433 copy_rtx (tmp), GEN_INT (cf),
24434 copy_rtx (tmp), 1, OPTAB_DIRECT);
24436 else
24439 * cmpl op0,op1
24440 * sbbl dest,dest
24441 * [notl dest]
24442 * andl cf - ct, dest
24443 * [addl dest, ct]
24445 * Size 8 - 11.
24448 if (cf == 0)
24450 cf = ct;
24451 ct = 0;
24452 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24455 tmp = expand_simple_binop (mode, AND,
24456 copy_rtx (tmp),
24457 gen_int_mode (cf - ct, mode),
24458 copy_rtx (tmp), 1, OPTAB_DIRECT);
24459 if (ct)
24460 tmp = expand_simple_binop (mode, PLUS,
24461 copy_rtx (tmp), GEN_INT (ct),
24462 copy_rtx (tmp), 1, OPTAB_DIRECT);
24465 if (!rtx_equal_p (tmp, out))
24466 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24468 return true;
24471 if (diff < 0)
24473 machine_mode cmp_mode = GET_MODE (op0);
24474 enum rtx_code new_code;
24476 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24478 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24480 /* We may be reversing an unordered compare to a normal compare, which
24481 is not valid in general (we may convert a non-trapping condition
24482 to a trapping one); however, on i386 we currently emit all
24483 comparisons unordered. */
24484 new_code = reverse_condition_maybe_unordered (code);
24486 else
24487 new_code = ix86_reverse_condition (code, cmp_mode);
24488 if (new_code != UNKNOWN)
24490 std::swap (ct, cf);
24491 diff = -diff;
24492 code = new_code;
24496 compare_code = UNKNOWN;
24497 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24498 && CONST_INT_P (op1))
24500 if (op1 == const0_rtx
24501 && (code == LT || code == GE))
24502 compare_code = code;
24503 else if (op1 == constm1_rtx)
24505 if (code == LE)
24506 compare_code = LT;
24507 else if (code == GT)
24508 compare_code = GE;
24512 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24513 if (compare_code != UNKNOWN
24514 && GET_MODE (op0) == GET_MODE (out)
24515 && (cf == -1 || ct == -1))
24517 /* If lea code below could be used, only optimize
24518 if it results in a 2 insn sequence. */
24520 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24521 || diff == 3 || diff == 5 || diff == 9)
24522 || (compare_code == LT && ct == -1)
24523 || (compare_code == GE && cf == -1))
24526 * notl op1 (if necessary)
24527 * sarl $31, op1
24528 * orl cf, op1
24530 if (ct != -1)
24532 cf = ct;
24533 ct = -1;
24534 code = reverse_condition (code);
24537 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24539 out = expand_simple_binop (mode, IOR,
24540 out, GEN_INT (cf),
24541 out, 1, OPTAB_DIRECT);
24542 if (out != operands[0])
24543 emit_move_insn (operands[0], out);
24545 return true;
24550 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24551 || diff == 3 || diff == 5 || diff == 9)
24552 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24553 && (mode != DImode
24554 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24557 * xorl dest,dest
24558 * cmpl op1,op2
24559 * setcc dest
24560 * lea cf(dest*(ct-cf)),dest
24562 * Size 14.
24564 * This also catches the degenerate setcc-only case.
24567 rtx tmp;
24568 int nops;
24570 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24572 nops = 0;
24573 /* On x86_64 the lea instruction operates on Pmode, so we need
24574 to do the arithmetic in the proper mode to match. */
24575 if (diff == 1)
24576 tmp = copy_rtx (out);
24577 else
24579 rtx out1;
24580 out1 = copy_rtx (out);
24581 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24582 nops++;
24583 if (diff & 1)
24585 tmp = gen_rtx_PLUS (mode, tmp, out1);
24586 nops++;
24589 if (cf != 0)
24591 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24592 nops++;
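/* TMP is now OUT * DIFF + CF, written so that it matches what a single
   lea can encode: an index scaled by 1, 2, 4 or 8, optionally plus the
   same register again (for diff 3, 5 and 9) and a displacement. */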
24594 if (!rtx_equal_p (tmp, out))
24596 if (nops == 1)
24597 out = force_operand (tmp, copy_rtx (out));
24598 else
24599 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24601 if (!rtx_equal_p (out, operands[0]))
24602 emit_move_insn (operands[0], copy_rtx (out));
24604 return true;
24608 * General case: Jumpful:
24609 * xorl dest,dest cmpl op1, op2
24610 * cmpl op1, op2 movl ct, dest
24611 * setcc dest jcc 1f
24612 * decl dest movl cf, dest
24613 * andl (cf-ct),dest 1:
24614 * addl ct,dest
24616 * Size 20. Size 14.
24618 * This is reasonably steep, but branch mispredict costs are
24619 * high on modern CPUs, so consider failing only if optimizing
24620 * for space.
24623 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24624 && BRANCH_COST (optimize_insn_for_speed_p (),
24625 false) >= 2)
24627 if (cf == 0)
24629 machine_mode cmp_mode = GET_MODE (op0);
24630 enum rtx_code new_code;
24632 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24634 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24636 /* We may be reversing an unordered compare to a normal compare,
24637 which is not valid in general (we may convert a non-trapping
24638 condition to a trapping one); however, on i386 we currently
24639 emit all comparisons unordered. */
24640 new_code = reverse_condition_maybe_unordered (code);
24642 else
24644 new_code = ix86_reverse_condition (code, cmp_mode);
24645 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24646 compare_code = reverse_condition (compare_code);
24649 if (new_code != UNKNOWN)
24651 cf = ct;
24652 ct = 0;
24653 code = new_code;
24657 if (compare_code != UNKNOWN)
24659 /* notl op1 (if needed)
24660 sarl $31, op1
24661 andl (cf-ct), op1
24662 addl ct, op1
24664 For x < 0 (resp. x <= -1) there will be no notl,
24665 so if possible swap the constants to get rid of the
24666 complement.
24667 True/false will be -1/0 while code below (store flag
24668 followed by decrement) is 0/-1, so the constants need
24669 to be exchanged once more. */
24671 if (compare_code == GE || !cf)
24673 code = reverse_condition (code);
24674 compare_code = LT;
24676 else
24677 std::swap (ct, cf);
24679 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24681 else
24683 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24685 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24686 constm1_rtx,
24687 copy_rtx (out), 1, OPTAB_DIRECT);
24690 out = expand_simple_binop (mode, AND, copy_rtx (out),
24691 gen_int_mode (cf - ct, mode),
24692 copy_rtx (out), 1, OPTAB_DIRECT);
24693 if (ct)
24694 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24695 copy_rtx (out), 1, OPTAB_DIRECT);
24696 if (!rtx_equal_p (out, operands[0]))
24697 emit_move_insn (operands[0], copy_rtx (out));
24699 return true;
24703 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24705 /* Try a few more things with specific constants and a variable. */
24707 optab op;
24708 rtx var, orig_out, out, tmp;
24710 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24711 return false;
24713 /* If one of the two operands is an interesting constant, load a
24714 constant with the above and mask it in with a logical operation. */
24716 if (CONST_INT_P (operands[2]))
24718 var = operands[3];
24719 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24720 operands[3] = constm1_rtx, op = and_optab;
24721 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24722 operands[3] = const0_rtx, op = ior_optab;
24723 else
24724 return false;
24726 else if (CONST_INT_P (operands[3]))
24728 var = operands[2];
24729 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24730 operands[2] = constm1_rtx, op = and_optab;
24731 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
24732 operands[2] = const0_rtx, op = ior_optab;
24733 else
24734 return false;
24736 else
24737 return false;
24739 orig_out = operands[0];
24740 tmp = gen_reg_rtx (mode);
24741 operands[0] = tmp;
24743 /* Recurse to get the constant loaded. */
24744 if (!ix86_expand_int_movcc (operands))
24745 return false;
24747 /* Mask in the interesting variable. */
24748 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24749 OPTAB_WIDEN);
24750 if (!rtx_equal_p (out, orig_out))
24751 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24753 return true;
24757 * For comparison with above,
24759 * movl cf,dest
24760 * movl ct,tmp
24761 * cmpl op1,op2
24762 * cmovcc tmp,dest
24764 * Size 15.
24767 if (! nonimmediate_operand (operands[2], mode))
24768 operands[2] = force_reg (mode, operands[2]);
24769 if (! nonimmediate_operand (operands[3], mode))
24770 operands[3] = force_reg (mode, operands[3]);
24772 if (! register_operand (operands[2], VOIDmode)
24773 && (mode == QImode
24774 || ! register_operand (operands[3], VOIDmode)))
24775 operands[2] = force_reg (mode, operands[2]);
24777 if (mode == QImode
24778 && ! register_operand (operands[3], VOIDmode))
24779 operands[3] = force_reg (mode, operands[3]);
24781 emit_insn (compare_seq);
24782 emit_insn (gen_rtx_SET (operands[0],
24783 gen_rtx_IF_THEN_ELSE (mode,
24784 compare_op, operands[2],
24785 operands[3])));
24786 return true;
24789 /* Swap, force into registers, or otherwise massage the two operands
24790 to an sse comparison with a mask result. Thus we differ a bit from
24791 ix86_prepare_fp_compare_args which expects to produce a flags result.
24793 The DEST operand exists to help determine whether to commute commutative
24794 operators. The POP0/POP1 operands are updated in place. The new
24795 comparison code is returned, or UNKNOWN if not implementable. */
24797 static enum rtx_code
24798 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24799 rtx *pop0, rtx *pop1)
24801 switch (code)
24803 case LTGT:
24804 case UNEQ:
24805 /* AVX supports all the needed comparisons. */
24806 if (TARGET_AVX)
24807 break;
24808 /* We have no LTGT as an operator. We could implement it with
24809 NE & ORDERED, but this requires an extra temporary. It's
24810 not clear that it's worth it. */
24811 return UNKNOWN;
24813 case LT:
24814 case LE:
24815 case UNGT:
24816 case UNGE:
24817 /* These are supported directly. */
24818 break;
24820 case EQ:
24821 case NE:
24822 case UNORDERED:
24823 case ORDERED:
24824 /* AVX has 3 operand comparisons, no need to swap anything. */
24825 if (TARGET_AVX)
24826 break;
24827 /* For commutative operators, try to canonicalize the destination
24828 operand to be first in the comparison - this helps reload to
24829 avoid extra moves. */
24830 if (!dest || !rtx_equal_p (dest, *pop1))
24831 break;
24832 /* FALLTHRU */
24834 case GE:
24835 case GT:
24836 case UNLE:
24837 case UNLT:
24838 /* These are not supported directly before AVX, and furthermore
24839 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24840 comparison operands to transform into something that is
24841 supported. */
24842 std::swap (*pop0, *pop1);
24843 code = swap_condition (code);
24844 break;
24846 default:
24847 gcc_unreachable ();
24850 return code;
24853 /* Detect conditional moves that exactly match min/max operational
24854 semantics. Note that this is IEEE safe, as long as we don't
24855 interchange the operands.
24857 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24858 and TRUE if the operation is successful and instructions are emitted. */
24860 static bool
24861 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24862 rtx cmp_op1, rtx if_true, rtx if_false)
24864 machine_mode mode;
24865 bool is_min;
24866 rtx tmp;
24868 if (code == LT)
24870 else if (code == UNGE)
24871 std::swap (if_true, if_false);
24872 else
24873 return false;
24875 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24876 is_min = true;
24877 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24878 is_min = false;
24879 else
24880 return false;
24882 mode = GET_MODE (dest);
24884 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24885 but MODE may be a vector mode and thus not appropriate. */
24886 if (!flag_finite_math_only || flag_signed_zeros)
24888 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24889 rtvec v;
24891 if_true = force_reg (mode, if_true);
24892 v = gen_rtvec (2, if_true, if_false);
24893 tmp = gen_rtx_UNSPEC (mode, v, u);
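/* Wrapping the operands in an UNSPEC keeps their order fixed; the SSE
   min/max instructions are not commutative when NaNs or signed zeros
   are involved, so this must not be rewritten as a plain SMIN/SMAX. */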
24895 else
24897 code = is_min ? SMIN : SMAX;
24898 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24901 emit_insn (gen_rtx_SET (dest, tmp));
24902 return true;
24905 /* Expand an sse vector comparison. Return the register with the result. */
24907 static rtx
24908 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24909 rtx op_true, rtx op_false)
24911 machine_mode mode = GET_MODE (dest);
24912 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24914 /* In the general case the result of a comparison can differ from the operands' type. */
24915 machine_mode cmp_mode;
24917 /* In AVX512F the result of comparison is an integer mask. */
24918 bool maskcmp = false;
24919 rtx x;
24921 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24923 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
24924 cmp_mode = int_mode_for_size (nbits, 0).require ();
24925 maskcmp = true;
24927 else
24928 cmp_mode = cmp_ops_mode;
24931 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24932 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24933 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24935 if (optimize
24936 || (maskcmp && cmp_mode != mode)
24937 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24938 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24939 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24941 /* Compare patterns for int modes are unspec in AVX512F only. */
24942 if (maskcmp && (code == GT || code == EQ))
24944 rtx (*gen)(rtx, rtx, rtx);
24946 switch (cmp_ops_mode)
24948 case E_V64QImode:
24949 gcc_assert (TARGET_AVX512BW);
24950 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24951 break;
24952 case E_V32HImode:
24953 gcc_assert (TARGET_AVX512BW);
24954 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24955 break;
24956 case E_V16SImode:
24957 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24958 break;
24959 case E_V8DImode:
24960 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24961 break;
24962 default:
24963 gen = NULL;
24966 if (gen)
24968 emit_insn (gen (dest, cmp_op0, cmp_op1));
24969 return dest;
24972 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24974 if (cmp_mode != mode && !maskcmp)
24976 x = force_reg (cmp_ops_mode, x);
24977 convert_move (dest, x, false);
24979 else
24980 emit_insn (gen_rtx_SET (dest, x));
24982 return dest;
24985 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24986 operations. This is used for both scalar and vector conditional moves. */
24988 void
24989 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24991 machine_mode mode = GET_MODE (dest);
24992 machine_mode cmpmode = GET_MODE (cmp);
24994 /* In AVX512F the result of comparison is an integer mask. */
24995 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24997 rtx t2, t3, x;
24999 /* If we have an integer mask and FP value then we need
25000 to cast mask to FP mode. */
25001 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
25003 cmp = force_reg (cmpmode, cmp);
25004 cmp = gen_rtx_SUBREG (mode, cmp, 0);
25007 if (vector_all_ones_operand (op_true, mode)
25008 && rtx_equal_p (op_false, CONST0_RTX (mode))
25009 && !maskcmp)
25011 emit_insn (gen_rtx_SET (dest, cmp));
25013 else if (op_false == CONST0_RTX (mode)
25014 && !maskcmp)
25016 op_true = force_reg (mode, op_true);
25017 x = gen_rtx_AND (mode, cmp, op_true);
25018 emit_insn (gen_rtx_SET (dest, x));
25020 else if (op_true == CONST0_RTX (mode)
25021 && !maskcmp)
25023 op_false = force_reg (mode, op_false);
25024 x = gen_rtx_NOT (mode, cmp);
25025 x = gen_rtx_AND (mode, x, op_false);
25026 emit_insn (gen_rtx_SET (dest, x));
25028 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
25029 && !maskcmp)
25031 op_false = force_reg (mode, op_false);
25032 x = gen_rtx_IOR (mode, cmp, op_false);
25033 emit_insn (gen_rtx_SET (dest, x));
25035 else if (TARGET_XOP
25036 && !maskcmp)
25038 op_true = force_reg (mode, op_true);
25040 if (!nonimmediate_operand (op_false, mode))
25041 op_false = force_reg (mode, op_false);
25043 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
25044 op_true,
25045 op_false)));
25047 else
25049 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25050 rtx d = dest;
25052 if (!nonimmediate_operand (op_true, mode))
25053 op_true = force_reg (mode, op_true);
25055 op_false = force_reg (mode, op_false);
25057 switch (mode)
25059 case E_V4SFmode:
25060 if (TARGET_SSE4_1)
25061 gen = gen_sse4_1_blendvps;
25062 break;
25063 case E_V2DFmode:
25064 if (TARGET_SSE4_1)
25065 gen = gen_sse4_1_blendvpd;
25066 break;
25067 case E_V16QImode:
25068 case E_V8HImode:
25069 case E_V4SImode:
25070 case E_V2DImode:
25071 if (TARGET_SSE4_1)
25073 gen = gen_sse4_1_pblendvb;
25074 if (mode != V16QImode)
25075 d = gen_reg_rtx (V16QImode);
25076 op_false = gen_lowpart (V16QImode, op_false);
25077 op_true = gen_lowpart (V16QImode, op_true);
25078 cmp = gen_lowpart (V16QImode, cmp);
25080 break;
25081 case E_V8SFmode:
25082 if (TARGET_AVX)
25083 gen = gen_avx_blendvps256;
25084 break;
25085 case E_V4DFmode:
25086 if (TARGET_AVX)
25087 gen = gen_avx_blendvpd256;
25088 break;
25089 case E_V32QImode:
25090 case E_V16HImode:
25091 case E_V8SImode:
25092 case E_V4DImode:
25093 if (TARGET_AVX2)
25095 gen = gen_avx2_pblendvb;
25096 if (mode != V32QImode)
25097 d = gen_reg_rtx (V32QImode);
25098 op_false = gen_lowpart (V32QImode, op_false);
25099 op_true = gen_lowpart (V32QImode, op_true);
25100 cmp = gen_lowpart (V32QImode, cmp);
25102 break;
25104 case E_V64QImode:
25105 gen = gen_avx512bw_blendmv64qi;
25106 break;
25107 case E_V32HImode:
25108 gen = gen_avx512bw_blendmv32hi;
25109 break;
25110 case E_V16SImode:
25111 gen = gen_avx512f_blendmv16si;
25112 break;
25113 case E_V8DImode:
25114 gen = gen_avx512f_blendmv8di;
25115 break;
25116 case E_V8DFmode:
25117 gen = gen_avx512f_blendmv8df;
25118 break;
25119 case E_V16SFmode:
25120 gen = gen_avx512f_blendmv16sf;
25121 break;
25123 default:
25124 break;
25127 if (gen != NULL)
25129 emit_insn (gen (d, op_false, op_true, cmp));
25130 if (d != dest)
25131 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
25133 else
25135 op_true = force_reg (mode, op_true);
25137 t2 = gen_reg_rtx (mode);
25138 if (optimize)
25139 t3 = gen_reg_rtx (mode);
25140 else
25141 t3 = dest;
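/* No blend instruction is available here, so build
   dest = (op_true & cmp) | (op_false & ~cmp) from AND/ANDNOT/IOR. */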
25143 x = gen_rtx_AND (mode, op_true, cmp);
25144 emit_insn (gen_rtx_SET (t2, x));
25146 x = gen_rtx_NOT (mode, cmp);
25147 x = gen_rtx_AND (mode, x, op_false);
25148 emit_insn (gen_rtx_SET (t3, x));
25150 x = gen_rtx_IOR (mode, t3, t2);
25151 emit_insn (gen_rtx_SET (dest, x));
25156 /* Expand a floating-point conditional move. Return true if successful. */
25158 bool
25159 ix86_expand_fp_movcc (rtx operands[])
25161 machine_mode mode = GET_MODE (operands[0]);
25162 enum rtx_code code = GET_CODE (operands[1]);
25163 rtx tmp, compare_op;
25164 rtx op0 = XEXP (operands[1], 0);
25165 rtx op1 = XEXP (operands[1], 1);
25167 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
25169 machine_mode cmode;
25171 /* Since we've no cmove for sse registers, don't force bad register
25172 allocation just to gain access to it. Deny movcc when the
25173 comparison mode doesn't match the move mode. */
25174 cmode = GET_MODE (op0);
25175 if (cmode == VOIDmode)
25176 cmode = GET_MODE (op1);
25177 if (cmode != mode)
25178 return false;
25180 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
25181 if (code == UNKNOWN)
25182 return false;
25184 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
25185 operands[2], operands[3]))
25186 return true;
25188 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
25189 operands[2], operands[3]);
25190 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
25191 return true;
25194 if (GET_MODE (op0) == TImode
25195 || (GET_MODE (op0) == DImode
25196 && !TARGET_64BIT))
25197 return false;
25199 /* The floating point conditional move instructions don't directly
25200 support conditions resulting from a signed integer comparison. */
25202 compare_op = ix86_expand_compare (code, op0, op1);
25203 if (!fcmov_comparison_operator (compare_op, VOIDmode))
25205 tmp = gen_reg_rtx (QImode);
25206 ix86_expand_setcc (tmp, code, op0, op1);
25208 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
25211 emit_insn (gen_rtx_SET (operands[0],
25212 gen_rtx_IF_THEN_ELSE (mode, compare_op,
25213 operands[2], operands[3])));
25215 return true;
25218 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
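/* The values returned below are the comparison-predicate immediates of the
   AVX-512 VPCMP{B,W,D,Q} and VPCMPU* instructions: 0 = EQ, 1 = LT, 2 = LE,
   4 = NE, 5 = NLT (GE), 6 = NLE (GT). */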
25220 static int
25221 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
25223 switch (code)
25225 case EQ:
25226 return 0;
25227 case LT:
25228 case LTU:
25229 return 1;
25230 case LE:
25231 case LEU:
25232 return 2;
25233 case NE:
25234 return 4;
25235 case GE:
25236 case GEU:
25237 return 5;
25238 case GT:
25239 case GTU:
25240 return 6;
25241 default:
25242 gcc_unreachable ();
25246 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
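/* These are the VCMPPS/VCMPPD predicate immediates, e.g. 0x00 = EQ_OQ,
   0x01 = LT_OS, 0x03 = UNORD_Q, 0x0c = NEQ_OQ (for LTGT) and
   0x18 = EQ_US (for UNEQ). */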
25248 static int
25249 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
25251 switch (code)
25253 case EQ:
25254 return 0x00;
25255 case NE:
25256 return 0x04;
25257 case GT:
25258 return 0x0e;
25259 case LE:
25260 return 0x02;
25261 case GE:
25262 return 0x0d;
25263 case LT:
25264 return 0x01;
25265 case UNLE:
25266 return 0x0a;
25267 case UNLT:
25268 return 0x09;
25269 case UNGE:
25270 return 0x05;
25271 case UNGT:
25272 return 0x06;
25273 case UNEQ:
25274 return 0x18;
25275 case LTGT:
25276 return 0x0c;
25277 case ORDERED:
25278 return 0x07;
25279 case UNORDERED:
25280 return 0x03;
25281 default:
25282 gcc_unreachable ();
25286 /* Return immediate value to be used in UNSPEC_PCMP
25287 for comparison CODE in MODE. */
25289 static int
25290 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25292 if (FLOAT_MODE_P (mode))
25293 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25294 return ix86_int_cmp_code_to_pcmp_immediate (code);
25297 /* Expand AVX-512 vector comparison. */
25299 bool
25300 ix86_expand_mask_vec_cmp (rtx operands[])
25302 machine_mode mask_mode = GET_MODE (operands[0]);
25303 machine_mode cmp_mode = GET_MODE (operands[2]);
25304 enum rtx_code code = GET_CODE (operands[1]);
25305 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25306 int unspec_code;
25307 rtx unspec;
25309 switch (code)
25311 case LEU:
25312 case GTU:
25313 case GEU:
25314 case LTU:
25315 unspec_code = UNSPEC_UNSIGNED_PCMP;
25316 break;
25318 default:
25319 unspec_code = UNSPEC_PCMP;
25322 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25323 operands[3], imm),
25324 unspec_code);
25325 emit_insn (gen_rtx_SET (operands[0], unspec));
25327 return true;
25330 /* Expand fp vector comparison. */
25332 bool
25333 ix86_expand_fp_vec_cmp (rtx operands[])
25335 enum rtx_code code = GET_CODE (operands[1]);
25336 rtx cmp;
25338 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25339 &operands[2], &operands[3]);
25340 if (code == UNKNOWN)
25342 rtx temp;
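/* LTGT and UNEQ have no direct predicate here, so build them as
   ORDERED && NE and UNORDERED || EQ respectively and combine the two
   partial compares below. */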
25343 switch (GET_CODE (operands[1]))
25345 case LTGT:
25346 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25347 operands[3], NULL, NULL);
25348 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25349 operands[3], NULL, NULL);
25350 code = AND;
25351 break;
25352 case UNEQ:
25353 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25354 operands[3], NULL, NULL);
25355 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25356 operands[3], NULL, NULL);
25357 code = IOR;
25358 break;
25359 default:
25360 gcc_unreachable ();
25362 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25363 OPTAB_DIRECT);
25365 else
25366 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25367 operands[1], operands[2]);
25369 if (operands[0] != cmp)
25370 emit_move_insn (operands[0], cmp);
25372 return true;
25375 static rtx
25376 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25377 rtx op_true, rtx op_false, bool *negate)
25379 machine_mode data_mode = GET_MODE (dest);
25380 machine_mode mode = GET_MODE (cop0);
25381 rtx x;
25383 *negate = false;
25385 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25386 if (TARGET_XOP
25387 && (mode == V16QImode || mode == V8HImode
25388 || mode == V4SImode || mode == V2DImode))
25390 else
25392 /* Canonicalize the comparison to EQ, GT, GTU. */
25393 switch (code)
25395 case EQ:
25396 case GT:
25397 case GTU:
25398 break;
25400 case NE:
25401 case LE:
25402 case LEU:
25403 code = reverse_condition (code);
25404 *negate = true;
25405 break;
25407 case GE:
25408 case GEU:
25409 code = reverse_condition (code);
25410 *negate = true;
25411 /* FALLTHRU */
25413 case LT:
25414 case LTU:
25415 std::swap (cop0, cop1);
25416 code = swap_condition (code);
25417 break;
25419 default:
25420 gcc_unreachable ();
25423 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25424 if (mode == V2DImode)
25426 switch (code)
25428 case EQ:
25429 /* SSE4.1 supports EQ. */
25430 if (!TARGET_SSE4_1)
25431 return NULL;
25432 break;
25434 case GT:
25435 case GTU:
25436 /* SSE4.2 supports GT/GTU. */
25437 if (!TARGET_SSE4_2)
25438 return NULL;
25439 break;
25441 default:
25442 gcc_unreachable ();
25446 /* Unsigned parallel compare is not supported by the hardware.
25447 Play some tricks to turn this into a signed comparison
25448 against 0. */
25449 if (code == GTU)
25451 cop0 = force_reg (mode, cop0);
25453 switch (mode)
25455 case E_V16SImode:
25456 case E_V8DImode:
25457 case E_V8SImode:
25458 case E_V4DImode:
25459 case E_V4SImode:
25460 case E_V2DImode:
25462 rtx t1, t2, mask;
25463 rtx (*gen_sub3) (rtx, rtx, rtx);
25465 switch (mode)
25467 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
25468 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
25469 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
25470 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
25471 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
25472 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
25473 default:
25474 gcc_unreachable ();
25476 /* Subtract (-(INT MAX) - 1) from both operands to make
25477 them signed. */
25478 mask = ix86_build_signbit_mask (mode, true, false);
25479 t1 = gen_reg_rtx (mode);
25480 emit_insn (gen_sub3 (t1, cop0, mask));
25482 t2 = gen_reg_rtx (mode);
25483 emit_insn (gen_sub3 (t2, cop1, mask));
25485 cop0 = t1;
25486 cop1 = t2;
25487 code = GT;
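/* Unsigned a > b iff signed (a - 0x80...0) > (b - 0x80...0), so biasing
   both operands by the sign-bit constant lets the signed pcmpgt
   implement the unsigned comparison. */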
25489 break;
25491 case E_V64QImode:
25492 case E_V32HImode:
25493 case E_V32QImode:
25494 case E_V16HImode:
25495 case E_V16QImode:
25496 case E_V8HImode:
25497 /* Perform a parallel unsigned saturating subtraction. */
25498 x = gen_reg_rtx (mode);
25499 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25500 cop1)));
25502 cop0 = x;
25503 cop1 = CONST0_RTX (mode);
25504 code = EQ;
25505 *negate = !*negate;
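/* The saturating subtract is zero exactly when cop0 <= cop1 (unsigned),
   so the EQ test against zero computes LEU; flipping *negate turns it
   back into the GTU that was requested. */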
25506 break;
25508 default:
25509 gcc_unreachable ();
25514 if (*negate)
25515 std::swap (op_true, op_false);
25517 /* Allow the comparison to be done in one mode, but the movcc to
25518 happen in another mode. */
25519 if (data_mode == mode)
25521 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25522 op_true, op_false);
25524 else
25526 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25527 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25528 op_true, op_false);
25529 if (GET_MODE (x) == mode)
25530 x = gen_lowpart (data_mode, x);
25533 return x;
25536 /* Expand integer vector comparison. */
25538 bool
25539 ix86_expand_int_vec_cmp (rtx operands[])
25541 rtx_code code = GET_CODE (operands[1]);
25542 bool negate = false;
25543 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25544 operands[3], NULL, NULL, &negate);
25546 if (!cmp)
25547 return false;
25549 if (negate)
25550 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25551 CONST0_RTX (GET_MODE (cmp)),
25552 NULL, NULL, &negate);
25554 gcc_assert (!negate);
25556 if (operands[0] != cmp)
25557 emit_move_insn (operands[0], cmp);
25559 return true;
25562 /* Expand a floating-point vector conditional move; a vcond operation
25563 rather than a movcc operation. */
25565 bool
25566 ix86_expand_fp_vcond (rtx operands[])
25568 enum rtx_code code = GET_CODE (operands[3]);
25569 rtx cmp;
25571 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25572 &operands[4], &operands[5]);
25573 if (code == UNKNOWN)
25575 rtx temp;
25576 switch (GET_CODE (operands[3]))
25578 case LTGT:
25579 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25580 operands[5], operands[0], operands[0]);
25581 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25582 operands[5], operands[1], operands[2]);
25583 code = AND;
25584 break;
25585 case UNEQ:
25586 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25587 operands[5], operands[0], operands[0]);
25588 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25589 operands[5], operands[1], operands[2]);
25590 code = IOR;
25591 break;
25592 default:
25593 gcc_unreachable ();
25595 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25596 OPTAB_DIRECT);
25597 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25598 return true;
25601 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25602 operands[5], operands[1], operands[2]))
25603 return true;
25605 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25606 operands[1], operands[2]);
25607 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25608 return true;
25611 /* Expand a signed/unsigned integral vector conditional move. */
25613 bool
25614 ix86_expand_int_vcond (rtx operands[])
25616 machine_mode data_mode = GET_MODE (operands[0]);
25617 machine_mode mode = GET_MODE (operands[4]);
25618 enum rtx_code code = GET_CODE (operands[3]);
25619 bool negate = false;
25620 rtx x, cop0, cop1;
25622 cop0 = operands[4];
25623 cop1 = operands[5];
25625 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25626 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
25627 if ((code == LT || code == GE)
25628 && data_mode == mode
25629 && cop1 == CONST0_RTX (mode)
25630 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25631 && GET_MODE_UNIT_SIZE (data_mode) > 1
25632 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25633 && (GET_MODE_SIZE (data_mode) == 16
25634 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25636 rtx negop = operands[2 - (code == LT)];
25637 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25638 if (negop == CONST1_RTX (data_mode))
25640 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25641 operands[0], 1, OPTAB_DIRECT);
25642 if (res != operands[0])
25643 emit_move_insn (operands[0], res);
25644 return true;
25646 else if (GET_MODE_INNER (data_mode) != DImode
25647 && vector_all_ones_operand (negop, data_mode))
25649 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25650 operands[0], 0, OPTAB_DIRECT);
25651 if (res != operands[0])
25652 emit_move_insn (operands[0], res);
25653 return true;
25657 if (!nonimmediate_operand (cop1, mode))
25658 cop1 = force_reg (mode, cop1);
25659 if (!general_operand (operands[1], data_mode))
25660 operands[1] = force_reg (data_mode, operands[1]);
25661 if (!general_operand (operands[2], data_mode))
25662 operands[2] = force_reg (data_mode, operands[2]);
25664 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25665 operands[1], operands[2], &negate);
25667 if (!x)
25668 return false;
25670 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25671 operands[2-negate]);
25672 return true;
25675 /* AVX512F does support 64-byte integer vector operations,
25676 thus the longest vector we are faced with is V64QImode. */
25677 #define MAX_VECT_LEN 64
25679 struct expand_vec_perm_d
25681 rtx target, op0, op1;
25682 unsigned char perm[MAX_VECT_LEN];
25683 machine_mode vmode;
25684 unsigned char nelt;
25685 bool one_operand_p;
25686 bool testing_p;
25689 static bool
25690 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25691 struct expand_vec_perm_d *d)
25693 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25694 expander, so args are either in d, or in op0, op1 etc. */
25695 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25696 machine_mode maskmode = mode;
25697 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25699 switch (mode)
25701 case E_V8HImode:
25702 if (TARGET_AVX512VL && TARGET_AVX512BW)
25703 gen = gen_avx512vl_vpermi2varv8hi3;
25704 break;
25705 case E_V16HImode:
25706 if (TARGET_AVX512VL && TARGET_AVX512BW)
25707 gen = gen_avx512vl_vpermi2varv16hi3;
25708 break;
25709 case E_V64QImode:
25710 if (TARGET_AVX512VBMI)
25711 gen = gen_avx512bw_vpermi2varv64qi3;
25712 break;
25713 case E_V32HImode:
25714 if (TARGET_AVX512BW)
25715 gen = gen_avx512bw_vpermi2varv32hi3;
25716 break;
25717 case E_V4SImode:
25718 if (TARGET_AVX512VL)
25719 gen = gen_avx512vl_vpermi2varv4si3;
25720 break;
25721 case E_V8SImode:
25722 if (TARGET_AVX512VL)
25723 gen = gen_avx512vl_vpermi2varv8si3;
25724 break;
25725 case E_V16SImode:
25726 if (TARGET_AVX512F)
25727 gen = gen_avx512f_vpermi2varv16si3;
25728 break;
25729 case E_V4SFmode:
25730 if (TARGET_AVX512VL)
25732 gen = gen_avx512vl_vpermi2varv4sf3;
25733 maskmode = V4SImode;
25735 break;
25736 case E_V8SFmode:
25737 if (TARGET_AVX512VL)
25739 gen = gen_avx512vl_vpermi2varv8sf3;
25740 maskmode = V8SImode;
25742 break;
25743 case E_V16SFmode:
25744 if (TARGET_AVX512F)
25746 gen = gen_avx512f_vpermi2varv16sf3;
25747 maskmode = V16SImode;
25749 break;
25750 case E_V2DImode:
25751 if (TARGET_AVX512VL)
25752 gen = gen_avx512vl_vpermi2varv2di3;
25753 break;
25754 case E_V4DImode:
25755 if (TARGET_AVX512VL)
25756 gen = gen_avx512vl_vpermi2varv4di3;
25757 break;
25758 case E_V8DImode:
25759 if (TARGET_AVX512F)
25760 gen = gen_avx512f_vpermi2varv8di3;
25761 break;
25762 case E_V2DFmode:
25763 if (TARGET_AVX512VL)
25765 gen = gen_avx512vl_vpermi2varv2df3;
25766 maskmode = V2DImode;
25768 break;
25769 case E_V4DFmode:
25770 if (TARGET_AVX512VL)
25772 gen = gen_avx512vl_vpermi2varv4df3;
25773 maskmode = V4DImode;
25775 break;
25776 case E_V8DFmode:
25777 if (TARGET_AVX512F)
25779 gen = gen_avx512f_vpermi2varv8df3;
25780 maskmode = V8DImode;
25782 break;
25783 default:
25784 break;
25787 if (gen == NULL)
25788 return false;
25790 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25791 expander, so args are either in d, or in op0, op1 etc. */
25792 if (d)
25794 rtx vec[64];
25795 target = d->target;
25796 op0 = d->op0;
25797 op1 = d->op1;
25798 for (int i = 0; i < d->nelt; ++i)
25799 vec[i] = GEN_INT (d->perm[i]);
25800 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25803 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25804 return true;
25807 /* Expand a variable vector permutation. */
25809 void
25810 ix86_expand_vec_perm (rtx operands[])
25812 rtx target = operands[0];
25813 rtx op0 = operands[1];
25814 rtx op1 = operands[2];
25815 rtx mask = operands[3];
25816 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25817 machine_mode mode = GET_MODE (op0);
25818 machine_mode maskmode = GET_MODE (mask);
25819 int w, e, i;
25820 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25822 /* Number of elements in the vector. */
25823 w = GET_MODE_NUNITS (mode);
25824 e = GET_MODE_UNIT_SIZE (mode);
25825 gcc_assert (w <= 64);
25827 if (TARGET_AVX512F && one_operand_shuffle)
25829 rtx (*gen) (rtx, rtx, rtx) = NULL;
25830 switch (mode)
25832 case E_V16SImode:
25833 gen = gen_avx512f_permvarv16si;
25834 break;
25835 case E_V16SFmode:
25836 gen = gen_avx512f_permvarv16sf;
25837 break;
25838 case E_V8DImode:
25839 gen = gen_avx512f_permvarv8di;
25840 break;
25841 case E_V8DFmode:
25842 gen = gen_avx512f_permvarv8df;
25843 break;
25844 default:
25845 break;
25847 if (gen != NULL)
25849 emit_insn (gen (target, op0, mask));
25850 return;
25854 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
25855 return;
25857 if (TARGET_AVX2)
25859 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25861 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25862 a constant shuffle operand. With a tiny bit of effort we can
25863 use VPERMD instead. A re-interpretation stall for V4DFmode is
25864 unfortunate but there's no avoiding it.
25865 Similarly for V16HImode we don't have instructions for variable
25866 shuffling, while for V32QImode we can, after preparing suitable
25867 masks, use vpshufb; vpshufb; vpermq; vpor. */
25869 if (mode == V16HImode)
25871 maskmode = mode = V32QImode;
25872 w = 32;
25873 e = 1;
25875 else
25877 maskmode = mode = V8SImode;
25878 w = 8;
25879 e = 4;
25881 t1 = gen_reg_rtx (maskmode);
25883 /* Replicate the low bits of the V4DImode mask into V8SImode:
25884 mask = { A B C D }
25885 t1 = { A A B B C C D D }. */
25886 for (i = 0; i < w / 2; ++i)
25887 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25888 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25889 vt = force_reg (maskmode, vt);
25890 mask = gen_lowpart (maskmode, mask);
25891 if (maskmode == V8SImode)
25892 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25893 else
25894 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25896 /* Multiply the shuffle indices by two. */
25897 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25898 OPTAB_DIRECT);
25900 /* Add one to the odd shuffle indices:
25901 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25902 for (i = 0; i < w / 2; ++i)
25904 vec[i * 2] = const0_rtx;
25905 vec[i * 2 + 1] = const1_rtx;
25907 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25908 vt = validize_mem (force_const_mem (maskmode, vt));
25909 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25910 OPTAB_DIRECT);
25912 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25913 operands[3] = mask = t1;
25914 target = gen_reg_rtx (mode);
25915 op0 = gen_lowpart (mode, op0);
25916 op1 = gen_lowpart (mode, op1);
25919 switch (mode)
25921 case E_V8SImode:
25922 /* The VPERMD and VPERMPS instructions already properly ignore
25923 the high bits of the shuffle elements. No need for us to
25924 perform an AND ourselves. */
25925 if (one_operand_shuffle)
25927 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25928 if (target != operands[0])
25929 emit_move_insn (operands[0],
25930 gen_lowpart (GET_MODE (operands[0]), target));
25932 else
25934 t1 = gen_reg_rtx (V8SImode);
25935 t2 = gen_reg_rtx (V8SImode);
25936 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25937 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25938 goto merge_two;
25940 return;
25942 case E_V8SFmode:
25943 mask = gen_lowpart (V8SImode, mask);
25944 if (one_operand_shuffle)
25945 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25946 else
25948 t1 = gen_reg_rtx (V8SFmode);
25949 t2 = gen_reg_rtx (V8SFmode);
25950 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25951 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25952 goto merge_two;
25954 return;
25956 case E_V4SImode:
25957 /* By combining the two 128-bit input vectors into one 256-bit
25958 input vector, we can use VPERMD and VPERMPS for the full
25959 two-operand shuffle. */
25960 t1 = gen_reg_rtx (V8SImode);
25961 t2 = gen_reg_rtx (V8SImode);
25962 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25963 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25964 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25965 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25966 return;
25968 case E_V4SFmode:
25969 t1 = gen_reg_rtx (V8SFmode);
25970 t2 = gen_reg_rtx (V8SImode);
25971 mask = gen_lowpart (V4SImode, mask);
25972 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25973 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25974 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25975 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25976 return;
25978 case E_V32QImode:
25979 t1 = gen_reg_rtx (V32QImode);
25980 t2 = gen_reg_rtx (V32QImode);
25981 t3 = gen_reg_rtx (V32QImode);
25982 vt2 = GEN_INT (-128);
25983 for (i = 0; i < 32; i++)
25984 vec[i] = vt2;
25985 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25986 vt = force_reg (V32QImode, vt);
25987 for (i = 0; i < 32; i++)
25988 vec[i] = i < 16 ? vt2 : const0_rtx;
25989 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25990 vt2 = force_reg (V32QImode, vt2);
25991 /* From mask create two adjusted masks, which contain the same
25992 bits as mask in the low 7 bits of each vector element.
25993 The first mask will have the most significant bit clear
25994 if it requests element from the same 128-bit lane
25995 and MSB set if it requests element from the other 128-bit lane.
25996 The second mask will have the opposite values of the MSB,
25997 and additionally will have its 128-bit lanes swapped.
25998 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25999 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
26000 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
26001 stands for the other 12 bytes. */
26002 /* The bit that tells whether an element is from the same lane or the
26003 other lane is bit 4, so shift it up by 3 to the MSB position. */
26004 t5 = gen_reg_rtx (V4DImode);
26005 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
26006 GEN_INT (3)));
26007 /* Clear MSB bits from the mask just in case it had them set. */
26008 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
26009 /* After this t1 will have MSB set for elements from other lane. */
26010 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
26011 /* Clear bits other than MSB. */
26012 emit_insn (gen_andv32qi3 (t1, t1, vt));
26013 /* Or in the lower bits from mask into t3. */
26014 emit_insn (gen_iorv32qi3 (t3, t1, t2));
26015 /* And invert MSB bits in t1, so MSB is set for elements from the same
26016 lane. */
26017 emit_insn (gen_xorv32qi3 (t1, t1, vt));
26018 /* Swap 128-bit lanes in t3. */
26019 t6 = gen_reg_rtx (V4DImode);
26020 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
26021 const2_rtx, GEN_INT (3),
26022 const0_rtx, const1_rtx));
26023 /* And or in the lower bits from mask into t1. */
26024 emit_insn (gen_iorv32qi3 (t1, t1, t2));
26025 if (one_operand_shuffle)
26027 /* Each of these shuffles will put 0s in places where an
26028 element from the other 128-bit lane is needed; otherwise
26029 it will shuffle in the requested value. */
26030 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
26031 gen_lowpart (V32QImode, t6)));
26032 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
26033 /* For t3 the 128-bit lanes are swapped again. */
26034 t7 = gen_reg_rtx (V4DImode);
26035 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
26036 const2_rtx, GEN_INT (3),
26037 const0_rtx, const1_rtx));
26038 /* And oring both together leads to the result. */
26039 emit_insn (gen_iorv32qi3 (target, t1,
26040 gen_lowpart (V32QImode, t7)));
26041 if (target != operands[0])
26042 emit_move_insn (operands[0],
26043 gen_lowpart (GET_MODE (operands[0]), target));
26044 return;
26047 t4 = gen_reg_rtx (V32QImode);
26048 /* Similar to the one_operand_shuffle code above, just
26049 repeated twice, once for each operand. The merge_two:
26050 code below will merge the two results together. */
26051 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
26052 gen_lowpart (V32QImode, t6)));
26053 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
26054 gen_lowpart (V32QImode, t6)));
26055 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
26056 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
26057 t7 = gen_reg_rtx (V4DImode);
26058 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
26059 const2_rtx, GEN_INT (3),
26060 const0_rtx, const1_rtx));
26061 t8 = gen_reg_rtx (V4DImode);
26062 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
26063 const2_rtx, GEN_INT (3),
26064 const0_rtx, const1_rtx));
26065 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
26066 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
26067 t1 = t4;
26068 t2 = t3;
26069 goto merge_two;
26071 default:
26072 gcc_assert (GET_MODE_SIZE (mode) <= 16);
26073 break;
26077 if (TARGET_XOP)
26079 /* The XOP VPPERM insn supports three inputs. By ignoring the
26080 one_operand_shuffle special case, we avoid creating another
26081 set of constant vectors in memory. */
26082 one_operand_shuffle = false;
26084 /* mask = mask & {2*w-1, ...} */
26085 vt = GEN_INT (2*w - 1);
26087 else
26089 /* mask = mask & {w-1, ...} */
26090 vt = GEN_INT (w - 1);
26093 for (i = 0; i < w; i++)
26094 vec[i] = vt;
26095 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26096 mask = expand_simple_binop (maskmode, AND, mask, vt,
26097 NULL_RTX, 0, OPTAB_DIRECT);
26099 /* For non-QImode operations, convert the word permutation control
26100 into a byte permutation control. */
26101 if (mode != V16QImode)
26103 mask = expand_simple_binop (maskmode, ASHIFT, mask,
26104 GEN_INT (exact_log2 (e)),
26105 NULL_RTX, 0, OPTAB_DIRECT);
26107 /* Convert mask to vector of chars. */
26108 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
26110 /* Replicate each of the input bytes into byte positions:
26111 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
26112 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
26113 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
26114 for (i = 0; i < 16; ++i)
26115 vec[i] = GEN_INT (i/e * e);
26116 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
26117 vt = validize_mem (force_const_mem (V16QImode, vt));
26118 if (TARGET_XOP)
26119 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
26120 else
26121 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
26123 /* Convert it into the byte positions by doing
26124 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
26125 for (i = 0; i < 16; ++i)
26126 vec[i] = GEN_INT (i % e);
26127 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
26128 vt = validize_mem (force_const_mem (V16QImode, vt));
26129 emit_insn (gen_addv16qi3 (mask, mask, vt));
26132 /* The actual shuffle operations all operate on V16QImode. */
26133 op0 = gen_lowpart (V16QImode, op0);
26134 op1 = gen_lowpart (V16QImode, op1);
26136 if (TARGET_XOP)
26138 if (GET_MODE (target) != V16QImode)
26139 target = gen_reg_rtx (V16QImode);
26140 emit_insn (gen_xop_pperm (target, op0, op1, mask));
26141 if (target != operands[0])
26142 emit_move_insn (operands[0],
26143 gen_lowpart (GET_MODE (operands[0]), target));
26145 else if (one_operand_shuffle)
26147 if (GET_MODE (target) != V16QImode)
26148 target = gen_reg_rtx (V16QImode);
26149 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
26150 if (target != operands[0])
26151 emit_move_insn (operands[0],
26152 gen_lowpart (GET_MODE (operands[0]), target));
26154 else
26156 rtx xops[6];
26157 bool ok;
26159 /* Shuffle the two input vectors independently. */
26160 t1 = gen_reg_rtx (V16QImode);
26161 t2 = gen_reg_rtx (V16QImode);
26162 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
26163 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
26165 merge_two:
26166 /* Then merge them together. The key is whether any given control
26167 element contained a bit set that indicates the second word. */
26168 mask = operands[3];
26169 vt = GEN_INT (w);
26170 if (maskmode == V2DImode && !TARGET_SSE4_1)
26172 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
26173 more shuffle to convert the V2DI input mask into a V4SI
26174 input mask, at which point the masking that expand_int_vcond
26175 performs will work as desired. */
26176 rtx t3 = gen_reg_rtx (V4SImode);
26177 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
26178 const0_rtx, const0_rtx,
26179 const2_rtx, const2_rtx));
26180 mask = t3;
26181 maskmode = V4SImode;
26182 e = w = 4;
26185 for (i = 0; i < w; i++)
26186 vec[i] = vt;
26187 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26188 vt = force_reg (maskmode, vt);
26189 mask = expand_simple_binop (maskmode, AND, mask, vt,
26190 NULL_RTX, 0, OPTAB_DIRECT);
26192 if (GET_MODE (target) != mode)
26193 target = gen_reg_rtx (mode);
26194 xops[0] = target;
26195 xops[1] = gen_lowpart (mode, t2);
26196 xops[2] = gen_lowpart (mode, t1);
26197 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
26198 xops[4] = mask;
26199 xops[5] = vt;
26200 ok = ix86_expand_int_vcond (xops);
26201 gcc_assert (ok);
26202 if (target != operands[0])
26203 emit_move_insn (operands[0],
26204 gen_lowpart (GET_MODE (operands[0]), target));
26208 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
26209 true if we should do zero extension, else sign extension. HIGH_P is
26210 true if we want the N/2 high elements, else the low elements. */
26212 void
26213 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
26215 machine_mode imode = GET_MODE (src);
26216 rtx tmp;
26218 if (TARGET_SSE4_1)
26220 rtx (*unpack)(rtx, rtx);
26221 rtx (*extract)(rtx, rtx) = NULL;
26222 machine_mode halfmode = BLKmode;
26224 switch (imode)
26226 case E_V64QImode:
26227 if (unsigned_p)
26228 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
26229 else
26230 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
26231 halfmode = V32QImode;
26232 extract
26233 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
26234 break;
26235 case E_V32QImode:
26236 if (unsigned_p)
26237 unpack = gen_avx2_zero_extendv16qiv16hi2;
26238 else
26239 unpack = gen_avx2_sign_extendv16qiv16hi2;
26240 halfmode = V16QImode;
26241 extract
26242 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
26243 break;
26244 case E_V32HImode:
26245 if (unsigned_p)
26246 unpack = gen_avx512f_zero_extendv16hiv16si2;
26247 else
26248 unpack = gen_avx512f_sign_extendv16hiv16si2;
26249 halfmode = V16HImode;
26250 extract
26251 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
26252 break;
26253 case E_V16HImode:
26254 if (unsigned_p)
26255 unpack = gen_avx2_zero_extendv8hiv8si2;
26256 else
26257 unpack = gen_avx2_sign_extendv8hiv8si2;
26258 halfmode = V8HImode;
26259 extract
26260 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
26261 break;
26262 case E_V16SImode:
26263 if (unsigned_p)
26264 unpack = gen_avx512f_zero_extendv8siv8di2;
26265 else
26266 unpack = gen_avx512f_sign_extendv8siv8di2;
26267 halfmode = V8SImode;
26268 extract
26269 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
26270 break;
26271 case E_V8SImode:
26272 if (unsigned_p)
26273 unpack = gen_avx2_zero_extendv4siv4di2;
26274 else
26275 unpack = gen_avx2_sign_extendv4siv4di2;
26276 halfmode = V4SImode;
26277 extract
26278 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
26279 break;
26280 case E_V16QImode:
26281 if (unsigned_p)
26282 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
26283 else
26284 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
26285 break;
26286 case E_V8HImode:
26287 if (unsigned_p)
26288 unpack = gen_sse4_1_zero_extendv4hiv4si2;
26289 else
26290 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26291 break;
26292 case E_V4SImode:
26293 if (unsigned_p)
26294 unpack = gen_sse4_1_zero_extendv2siv2di2;
26295 else
26296 unpack = gen_sse4_1_sign_extendv2siv2di2;
26297 break;
26298 default:
26299 gcc_unreachable ();
26302 if (GET_MODE_SIZE (imode) >= 32)
26304 tmp = gen_reg_rtx (halfmode);
26305 emit_insn (extract (tmp, src));
26307 else if (high_p)
26309 /* Shift higher 8 bytes to lower 8 bytes. */
26310 tmp = gen_reg_rtx (V1TImode);
26311 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26312 GEN_INT (64)));
26313 tmp = gen_lowpart (imode, tmp);
26315 else
26316 tmp = src;
26318 emit_insn (unpack (dest, tmp));
26320 else
26322 rtx (*unpack)(rtx, rtx, rtx);
26324 switch (imode)
26326 case E_V16QImode:
26327 if (high_p)
26328 unpack = gen_vec_interleave_highv16qi;
26329 else
26330 unpack = gen_vec_interleave_lowv16qi;
26331 break;
26332 case E_V8HImode:
26333 if (high_p)
26334 unpack = gen_vec_interleave_highv8hi;
26335 else
26336 unpack = gen_vec_interleave_lowv8hi;
26337 break;
26338 case E_V4SImode:
26339 if (high_p)
26340 unpack = gen_vec_interleave_highv4si;
26341 else
26342 unpack = gen_vec_interleave_lowv4si;
26343 break;
26344 default:
26345 gcc_unreachable ();
26348 if (unsigned_p)
26349 tmp = force_reg (imode, CONST0_RTX (imode));
26350 else
26351 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26352 src, pc_rtx, pc_rtx);
26354 rtx tmp2 = gen_reg_rtx (imode);
26355 emit_insn (unpack (tmp2, src, tmp));
26356 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
26360 /* Expand conditional increment or decrement using adc/sbb instructions.
26361 The default case using setcc followed by the conditional move can be
26362 done by generic code. */
26363 bool
26364 ix86_expand_int_addcc (rtx operands[])
26366 enum rtx_code code = GET_CODE (operands[1]);
26367 rtx flags;
26368 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26369 rtx compare_op;
26370 rtx val = const0_rtx;
26371 bool fpcmp = false;
26372 machine_mode mode;
26373 rtx op0 = XEXP (operands[1], 0);
26374 rtx op1 = XEXP (operands[1], 1);
26376 if (operands[3] != const1_rtx
26377 && operands[3] != constm1_rtx)
26378 return false;
26379 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26380 return false;
26381 code = GET_CODE (compare_op);
26383 flags = XEXP (compare_op, 0);
26385 if (GET_MODE (flags) == CCFPmode
26386 || GET_MODE (flags) == CCFPUmode)
26388 fpcmp = true;
26389 code = ix86_fp_compare_code_to_integer (code);
26392 if (code != LTU)
26394 val = constm1_rtx;
26395 if (fpcmp)
26396 PUT_CODE (compare_op,
26397 reverse_condition_maybe_unordered
26398 (GET_CODE (compare_op)));
26399 else
26400 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26403 mode = GET_MODE (operands[0]);
26405 /* Construct either adc or sbb insn. */
26406 if ((code == LTU) == (operands[3] == constm1_rtx))
26408 switch (mode)
26410 case E_QImode:
26411 insn = gen_subqi3_carry;
26412 break;
26413 case E_HImode:
26414 insn = gen_subhi3_carry;
26415 break;
26416 case E_SImode:
26417 insn = gen_subsi3_carry;
26418 break;
26419 case E_DImode:
26420 insn = gen_subdi3_carry;
26421 break;
26422 default:
26423 gcc_unreachable ();
26426 else
26428 switch (mode)
26430 case E_QImode:
26431 insn = gen_addqi3_carry;
26432 break;
26433 case E_HImode:
26434 insn = gen_addhi3_carry;
26435 break;
26436 case E_SImode:
26437 insn = gen_addsi3_carry;
26438 break;
26439 case E_DImode:
26440 insn = gen_adddi3_carry;
26441 break;
26442 default:
26443 gcc_unreachable ();
26446 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
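/* The emitted insn is a single adc or sbb that folds the conditional
   increment/decrement into the carry produced by the comparison, with
   no setcc and no branch. */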
26448 return true;
26452 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
26453 but works for floating point parameters and non-offsettable memories.
26454 For pushes, it returns just stack offsets; the values will be saved
26455 in the right order. At most four parts are generated. */
26457 static int
26458 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26460 int size;
26462 if (!TARGET_64BIT)
26463 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26464 else
26465 size = (GET_MODE_SIZE (mode) + 4) / 8;
26467 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26468 gcc_assert (size >= 2 && size <= 4);
26470 /* Optimize constant pool reference to immediates. This is used by fp
26471 moves, which force all constants to memory to allow combining. */
26472 if (MEM_P (operand) && MEM_READONLY_P (operand))
26474 rtx tmp = maybe_get_pool_constant (operand);
26475 if (tmp)
26476 operand = tmp;
26479 if (MEM_P (operand) && !offsettable_memref_p (operand))
26481 /* The only non-offsettable memories we handle are pushes. */
26482 int ok = push_operand (operand, VOIDmode);
26484 gcc_assert (ok);
26486 operand = copy_rtx (operand);
26487 PUT_MODE (operand, word_mode);
26488 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26489 return size;
26492 if (GET_CODE (operand) == CONST_VECTOR)
26494 scalar_int_mode imode = int_mode_for_mode (mode).require ();
26495 /* Caution: if we looked through a constant pool memory above,
26496 the operand may actually have a different mode now. That's
26497 ok, since we want to pun this all the way back to an integer. */
26498 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26499 gcc_assert (operand != NULL);
26500 mode = imode;
26503 if (!TARGET_64BIT)
26505 if (mode == DImode)
26506 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26507 else
26509 int i;
26511 if (REG_P (operand))
26513 gcc_assert (reload_completed);
26514 for (i = 0; i < size; i++)
26515 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26517 else if (offsettable_memref_p (operand))
26519 operand = adjust_address (operand, SImode, 0);
26520 parts[0] = operand;
26521 for (i = 1; i < size; i++)
26522 parts[i] = adjust_address (operand, SImode, 4 * i);
26524 else if (CONST_DOUBLE_P (operand))
26526 const REAL_VALUE_TYPE *r;
26527 long l[4];
26529 r = CONST_DOUBLE_REAL_VALUE (operand);
26530 switch (mode)
26532 case E_TFmode:
26533 real_to_target (l, r, mode);
26534 parts[3] = gen_int_mode (l[3], SImode);
26535 parts[2] = gen_int_mode (l[2], SImode);
26536 break;
26537 case E_XFmode:
26538 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26539 long double may not be 80-bit. */
26540 real_to_target (l, r, mode);
26541 parts[2] = gen_int_mode (l[2], SImode);
26542 break;
26543 case E_DFmode:
26544 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26545 break;
26546 default:
26547 gcc_unreachable ();
26549 parts[1] = gen_int_mode (l[1], SImode);
26550 parts[0] = gen_int_mode (l[0], SImode);
26552 else
26553 gcc_unreachable ();
26556 else
26558 if (mode == TImode)
26559 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26560 if (mode == XFmode || mode == TFmode)
26562 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26563 if (REG_P (operand))
26565 gcc_assert (reload_completed);
26566 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26567 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26569 else if (offsettable_memref_p (operand))
26571 operand = adjust_address (operand, DImode, 0);
26572 parts[0] = operand;
26573 parts[1] = adjust_address (operand, upper_mode, 8);
26575 else if (CONST_DOUBLE_P (operand))
26577 long l[4];
26579 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26581 /* real_to_target puts 32-bit pieces in each long. */
26582 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26583 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26584 << 32), DImode);
26586 if (upper_mode == SImode)
26587 parts[1] = gen_int_mode (l[2], SImode);
26588 else
26589 parts[1]
26590 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26591 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26592 << 32), DImode);
26594 else
26595 gcc_unreachable ();
26599 return size;
26602 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26603 All required insns are emitted directly; nothing is returned. The
26604 operands array is used as scratch: elements 2 up receive the destination
26605 parts and elements 6 up the source parts, in the order they are emitted. */
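/* As an illustrative example: pushing a DFmode value on a 32-bit target
   expands into two word-mode pushes, most significant half first, so the
   value ends up in memory in the usual little-endian layout.  */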
26607 void
26608 ix86_split_long_move (rtx operands[])
26610 rtx part[2][4];
26611 int nparts, i, j;
26612 int push = 0;
26613 int collisions = 0;
26614 machine_mode mode = GET_MODE (operands[0]);
26615 bool collisionparts[4];
26617 /* The DFmode expanders may ask us to move double.
26618 For 64bit target this is single move. By hiding the fact
26619 here we simplify i386.md splitters. */
26620 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26622 /* Optimize constant pool reference to immediates. This is used by
26623 fp moves, that force all constants to memory to allow combining. */
26625 if (MEM_P (operands[1])
26626 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26627 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26628 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26629 if (push_operand (operands[0], VOIDmode))
26631 operands[0] = copy_rtx (operands[0]);
26632 PUT_MODE (operands[0], word_mode);
26634 else
26635 operands[0] = gen_lowpart (DImode, operands[0]);
26636 operands[1] = gen_lowpart (DImode, operands[1]);
26637 emit_move_insn (operands[0], operands[1]);
26638 return;
26641 /* The only non-offsettable memory we handle is push. */
26642 if (push_operand (operands[0], VOIDmode))
26643 push = 1;
26644 else
26645 gcc_assert (!MEM_P (operands[0])
26646 || offsettable_memref_p (operands[0]));
26648 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26649 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26651 /* When emitting push, take care for source operands on the stack. */
26652 if (push && MEM_P (operands[1])
26653 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26655 rtx src_base = XEXP (part[1][nparts - 1], 0);
26657 /* Compensate for the stack decrement by 4. */
26658 if (!TARGET_64BIT && nparts == 3
26659 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26660 src_base = plus_constant (Pmode, src_base, 4);
26662 /* src_base refers to the stack pointer and is
26663 automatically decreased by emitted push. */
26664 for (i = 0; i < nparts; i++)
26665 part[1][i] = change_address (part[1][i],
26666 GET_MODE (part[1][i]), src_base);
26669 /* We need to do copy in the right order in case an address register
26670 of the source overlaps the destination. */
26671 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26673 rtx tmp;
26675 for (i = 0; i < nparts; i++)
26677 collisionparts[i]
26678 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26679 if (collisionparts[i])
26680 collisions++;
26683 /* Collision in the middle part can be handled by reordering. */
26684 if (collisions == 1 && nparts == 3 && collisionparts [1])
26686 std::swap (part[0][1], part[0][2]);
26687 std::swap (part[1][1], part[1][2]);
26689 else if (collisions == 1
26690 && nparts == 4
26691 && (collisionparts [1] || collisionparts [2]))
26693 if (collisionparts [1])
26695 std::swap (part[0][1], part[0][2]);
26696 std::swap (part[1][1], part[1][2]);
26698 else
26700 std::swap (part[0][2], part[0][3]);
26701 std::swap (part[1][2], part[1][3]);
26705 /* If there are more collisions, we can't handle it by reordering.
26706 Do an lea to the last part and use only one colliding move. */
26707 else if (collisions > 1)
26709 rtx base, addr, tls_base = NULL_RTX;
26711 collisions = 1;
26713 base = part[0][nparts - 1];
26715 /* Handle the case when the last part isn't valid for lea.
26716 Happens in 64-bit mode storing the 12-byte XFmode. */
26717 if (GET_MODE (base) != Pmode)
26718 base = gen_rtx_REG (Pmode, REGNO (base));
26720 addr = XEXP (part[1][0], 0);
26721 if (TARGET_TLS_DIRECT_SEG_REFS)
26723 struct ix86_address parts;
26724 int ok = ix86_decompose_address (addr, &parts);
26725 gcc_assert (ok);
26726 if (parts.seg == DEFAULT_TLS_SEG_REG)
26728 /* It is not valid to use %gs: or %fs: in
26729 lea though, so we need to remove it from the
26730 address used for lea and add it to each individual
26731 memory load instead. */
26732 addr = copy_rtx (addr);
26733 rtx *x = &addr;
26734 while (GET_CODE (*x) == PLUS)
26736 for (i = 0; i < 2; i++)
26738 rtx u = XEXP (*x, i);
26739 if (GET_CODE (u) == ZERO_EXTEND)
26740 u = XEXP (u, 0);
26741 if (GET_CODE (u) == UNSPEC
26742 && XINT (u, 1) == UNSPEC_TP)
26744 tls_base = XEXP (*x, i);
26745 *x = XEXP (*x, 1 - i);
26746 break;
26749 if (tls_base)
26750 break;
26751 x = &XEXP (*x, 0);
26753 gcc_assert (tls_base);
26756 emit_insn (gen_rtx_SET (base, addr));
26757 if (tls_base)
26758 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
26759 part[1][0] = replace_equiv_address (part[1][0], base);
26760 for (i = 1; i < nparts; i++)
26762 if (tls_base)
26763 base = copy_rtx (base);
26764 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26765 part[1][i] = replace_equiv_address (part[1][i], tmp);
26770 if (push)
26772 if (!TARGET_64BIT)
26774 if (nparts == 3)
26776 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26777 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26778 stack_pointer_rtx, GEN_INT (-4)));
26779 emit_move_insn (part[0][2], part[1][2]);
26781 else if (nparts == 4)
26783 emit_move_insn (part[0][3], part[1][3]);
26784 emit_move_insn (part[0][2], part[1][2]);
26787 else
26789 /* In 64bit mode we don't have 32bit push available. In case this is
26790 a register, it is OK - we will just use the larger counterpart. We also
26791 retype memory - this comes from an attempt to avoid the REX prefix on
26792 moving the second half of a TFmode value. */
26793 if (GET_MODE (part[1][1]) == SImode)
26795 switch (GET_CODE (part[1][1]))
26797 case MEM:
26798 part[1][1] = adjust_address (part[1][1], DImode, 0);
26799 break;
26801 case REG:
26802 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26803 break;
26805 default:
26806 gcc_unreachable ();
26809 if (GET_MODE (part[1][0]) == SImode)
26810 part[1][0] = part[1][1];
26813 emit_move_insn (part[0][1], part[1][1]);
26814 emit_move_insn (part[0][0], part[1][0]);
26815 return;
26818 /* Choose correct order to not overwrite the source before it is copied. */
26819 if ((REG_P (part[0][0])
26820 && REG_P (part[1][1])
26821 && (REGNO (part[0][0]) == REGNO (part[1][1])
26822 || (nparts == 3
26823 && REGNO (part[0][0]) == REGNO (part[1][2]))
26824 || (nparts == 4
26825 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26826 || (collisions > 0
26827 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26829 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26831 operands[2 + i] = part[0][j];
26832 operands[6 + i] = part[1][j];
26835 else
26837 for (i = 0; i < nparts; i++)
26839 operands[2 + i] = part[0][i];
26840 operands[6 + i] = part[1][i];
26844 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26845 if (optimize_insn_for_size_p ())
26847 for (j = 0; j < nparts - 1; j++)
26848 if (CONST_INT_P (operands[6 + j])
26849 && operands[6 + j] != const0_rtx
26850 && REG_P (operands[2 + j]))
26851 for (i = j; i < nparts - 1; i++)
26852 if (CONST_INT_P (operands[7 + i])
26853 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26854 operands[7 + i] = operands[2 + j];
26857 for (i = 0; i < nparts; i++)
26858 emit_move_insn (operands[2 + i], operands[6 + i]);
26860 return;
26863 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26864 left shift by a constant, either using a single shift or
26865 a sequence of add instructions. */
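/* A rough example of the choice made below: a left shift of a half-word part
   by 1 is emitted as an add of the register to itself; larger constant counts
   normally use a single shift instruction unless a short run of additions is
   cheaper according to the active cost table and we are not optimizing for
   size.  */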
26867 static void
26868 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26870 rtx (*insn)(rtx, rtx, rtx);
26872 if (count == 1
26873 || (count * ix86_cost->add <= ix86_cost->shift_const
26874 && !optimize_insn_for_size_p ()))
26876 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26877 while (count-- > 0)
26878 emit_insn (insn (operand, operand, operand));
26880 else
26882 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26883 emit_insn (insn (operand, operand, GEN_INT (count)));
26887 void
26888 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26890 rtx (*gen_ashl3)(rtx, rtx, rtx);
26891 rtx (*gen_shld)(rtx, rtx, rtx);
26892 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26894 rtx low[2], high[2];
26895 int count;
26897 if (CONST_INT_P (operands[2]))
26899 split_double_mode (mode, operands, 2, low, high);
26900 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26902 if (count >= half_width)
26904 emit_move_insn (high[0], low[1]);
26905 emit_move_insn (low[0], const0_rtx);
26907 if (count > half_width)
26908 ix86_expand_ashl_const (high[0], count - half_width, mode);
26910 else
26912 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26914 if (!rtx_equal_p (operands[0], operands[1]))
26915 emit_move_insn (operands[0], operands[1]);
26917 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26918 ix86_expand_ashl_const (low[0], count, mode);
26920 return;
26923 split_double_mode (mode, operands, 1, low, high);
26925 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26927 if (operands[1] == const1_rtx)
26929 /* Assuming we've chosen QImode-capable registers, then 1 << N
26930 can be done with two 32/64-bit shifts, no branches, no cmoves. */
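/* A rough sketch of the DImode "1 << N" sequence emitted below on a 32-bit
   target (register choice is illustrative only):
     xor   %eax, %eax        # low  = 0
     xor   %edx, %edx        # high = 0
     test  $32, %cl          # is bit 5 of the shift count set?
     sete  %al               # low  = ((count & 32) == 0)
     setne %dl               # high = ((count & 32) != 0)
     shl   %cl, %eax         # the hardware masks the count to 5 bits
     shl   %cl, %edx  */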
26931 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26933 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26935 ix86_expand_clear (low[0]);
26936 ix86_expand_clear (high[0]);
26937 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26939 d = gen_lowpart (QImode, low[0]);
26940 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26941 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26942 emit_insn (gen_rtx_SET (d, s));
26944 d = gen_lowpart (QImode, high[0]);
26945 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26946 s = gen_rtx_NE (QImode, flags, const0_rtx);
26947 emit_insn (gen_rtx_SET (d, s));
26950 /* Otherwise, we can get the same results by manually performing
26951 a bit extract operation on bit 5/6, and then performing the two
26952 shifts. The two methods of getting 0/1 into low/high are exactly
26953 the same size. Avoiding the shift in the bit extract case helps
26954 pentium4 a bit; no one else seems to care much either way. */
26955 else
26957 machine_mode half_mode;
26958 rtx (*gen_lshr3)(rtx, rtx, rtx);
26959 rtx (*gen_and3)(rtx, rtx, rtx);
26960 rtx (*gen_xor3)(rtx, rtx, rtx);
26961 HOST_WIDE_INT bits;
26962 rtx x;
26964 if (mode == DImode)
26966 half_mode = SImode;
26967 gen_lshr3 = gen_lshrsi3;
26968 gen_and3 = gen_andsi3;
26969 gen_xor3 = gen_xorsi3;
26970 bits = 5;
26972 else
26974 half_mode = DImode;
26975 gen_lshr3 = gen_lshrdi3;
26976 gen_and3 = gen_anddi3;
26977 gen_xor3 = gen_xordi3;
26978 bits = 6;
26981 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26982 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26983 else
26984 x = gen_lowpart (half_mode, operands[2]);
26985 emit_insn (gen_rtx_SET (high[0], x));
26987 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26988 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26989 emit_move_insn (low[0], high[0]);
26990 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26993 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26994 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26995 return;
26998 if (operands[1] == constm1_rtx)
27000 /* For -1 << N, we can avoid the shld instruction, because we
27001 know that we're shifting 0...31/63 ones into a -1. */
27002 emit_move_insn (low[0], constm1_rtx);
27003 if (optimize_insn_for_size_p ())
27004 emit_move_insn (high[0], low[0]);
27005 else
27006 emit_move_insn (high[0], constm1_rtx);
27008 else
27010 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
27012 if (!rtx_equal_p (operands[0], operands[1]))
27013 emit_move_insn (operands[0], operands[1]);
27015 split_double_mode (mode, operands, 1, low, high);
27016 emit_insn (gen_shld (high[0], low[0], operands[2]));
27019 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
27021 if (TARGET_CMOVE && scratch)
27023 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27024 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27026 ix86_expand_clear (scratch);
27027 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
27029 else
27031 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
27032 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
27034 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
27038 void
27039 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
27041 rtx (*gen_ashr3)(rtx, rtx, rtx)
27042 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
27043 rtx (*gen_shrd)(rtx, rtx, rtx);
27044 int half_width = GET_MODE_BITSIZE (mode) >> 1;
27046 rtx low[2], high[2];
27047 int count;
27049 if (CONST_INT_P (operands[2]))
27051 split_double_mode (mode, operands, 2, low, high);
27052 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
27054 if (count == GET_MODE_BITSIZE (mode) - 1)
27056 emit_move_insn (high[0], high[1]);
27057 emit_insn (gen_ashr3 (high[0], high[0],
27058 GEN_INT (half_width - 1)));
27059 emit_move_insn (low[0], high[0]);
27062 else if (count >= half_width)
27064 emit_move_insn (low[0], high[1]);
27065 emit_move_insn (high[0], low[0]);
27066 emit_insn (gen_ashr3 (high[0], high[0],
27067 GEN_INT (half_width - 1)));
27069 if (count > half_width)
27070 emit_insn (gen_ashr3 (low[0], low[0],
27071 GEN_INT (count - half_width)));
27073 else
27075 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27077 if (!rtx_equal_p (operands[0], operands[1]))
27078 emit_move_insn (operands[0], operands[1]);
27080 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
27081 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
27084 else
27086 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27088 if (!rtx_equal_p (operands[0], operands[1]))
27089 emit_move_insn (operands[0], operands[1]);
27091 split_double_mode (mode, operands, 1, low, high);
27093 emit_insn (gen_shrd (low[0], high[0], operands[2]));
27094 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
27096 if (TARGET_CMOVE && scratch)
27098 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27099 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27101 emit_move_insn (scratch, high[0]);
27102 emit_insn (gen_ashr3 (scratch, scratch,
27103 GEN_INT (half_width - 1)));
27104 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
27105 scratch));
27107 else
27109 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
27110 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
27112 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
27117 void
27118 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
27120 rtx (*gen_lshr3)(rtx, rtx, rtx)
27121 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
27122 rtx (*gen_shrd)(rtx, rtx, rtx);
27123 int half_width = GET_MODE_BITSIZE (mode) >> 1;
27125 rtx low[2], high[2];
27126 int count;
27128 if (CONST_INT_P (operands[2]))
27130 split_double_mode (mode, operands, 2, low, high);
27131 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
27133 if (count >= half_width)
27135 emit_move_insn (low[0], high[1]);
27136 ix86_expand_clear (high[0]);
27138 if (count > half_width)
27139 emit_insn (gen_lshr3 (low[0], low[0],
27140 GEN_INT (count - half_width)));
27142 else
27144 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27146 if (!rtx_equal_p (operands[0], operands[1]))
27147 emit_move_insn (operands[0], operands[1]);
27149 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
27150 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
27153 else
27155 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27157 if (!rtx_equal_p (operands[0], operands[1]))
27158 emit_move_insn (operands[0], operands[1]);
27160 split_double_mode (mode, operands, 1, low, high);
27162 emit_insn (gen_shrd (low[0], high[0], operands[2]));
27163 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
27165 if (TARGET_CMOVE && scratch)
27167 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27168 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27170 ix86_expand_clear (scratch);
27171 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
27172 scratch));
27174 else
27176 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
27177 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
27179 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
27184 /* Predict just emitted jump instruction to be taken with probability PROB. */
27185 static void
27186 predict_jump (int prob)
27188 rtx_insn *insn = get_last_insn ();
27189 gcc_assert (JUMP_P (insn));
27190 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
27193 /* Helper function for the string operations below. Test whether VARIABLE
27194 is aligned to VALUE bytes; if so, jump to the returned label. */
27195 static rtx_code_label *
27196 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
27198 rtx_code_label *label = gen_label_rtx ();
27199 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
27200 if (GET_MODE (variable) == DImode)
27201 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
27202 else
27203 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
27204 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
27205 1, label);
27206 if (epilogue)
27207 predict_jump (REG_BR_PROB_BASE * 50 / 100);
27208 else
27209 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27210 return label;
27213 /* Decrease COUNTREG by VALUE. */
27214 static void
27215 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
27217 rtx (*gen_add)(rtx, rtx, rtx)
27218 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
27220 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
27223 /* Zero extend possibly SImode EXP to Pmode register. */
27224 rtx
27225 ix86_zero_extend_to_Pmode (rtx exp)
27227 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
27230 /* Divide COUNTREG by SCALE. */
27231 static rtx
27232 scale_counter (rtx countreg, int scale)
27234 rtx sc;
27236 if (scale == 1)
27237 return countreg;
27238 if (CONST_INT_P (countreg))
27239 return GEN_INT (INTVAL (countreg) / scale);
27240 gcc_assert (REG_P (countreg));
27242 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
27243 GEN_INT (exact_log2 (scale)),
27244 NULL, 1, OPTAB_DIRECT);
27245 return sc;
27248 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
27249 DImode for constant loop counts. */
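/* For instance (illustrative): a constant count of 0x100000000 on a 64-bit
   target needs DImode here, while a constant that fits in 32 bits keeps the
   cheaper SImode counter.  */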
27251 static machine_mode
27252 counter_mode (rtx count_exp)
27254 if (GET_MODE (count_exp) != VOIDmode)
27255 return GET_MODE (count_exp);
27256 if (!CONST_INT_P (count_exp))
27257 return Pmode;
27258 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
27259 return DImode;
27260 return SImode;
27263 /* Copy the address to a Pmode register. This is used for x32 to
27264 truncate DImode TLS address to a SImode register. */
27266 static rtx
27267 ix86_copy_addr_to_reg (rtx addr)
27269 rtx reg;
27270 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
27272 reg = copy_addr_to_reg (addr);
27273 REG_POINTER (reg) = 1;
27274 return reg;
27276 else
27278 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
27279 reg = copy_to_mode_reg (DImode, addr);
27280 REG_POINTER (reg) = 1;
27281 return gen_rtx_SUBREG (SImode, reg, 0);
27285 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by SRCPTR
27286 to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is COUNT,
27287 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
27288 memory by VALUE (supposed to be in MODE).
27290 The size is rounded down to a whole number of chunks moved at once.
27291 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
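/* A worked example (illustrative): for a copy of COUNT = 1000 bytes with
   MODE = SImode and UNROLL = 4, each iteration moves 16 bytes, the loop
   handles 1000 & ~15 = 992 bytes, and the remaining 8 bytes are left for
   the caller's epilogue code.  */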
27294 static void
27295 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27296 rtx destptr, rtx srcptr, rtx value,
27297 rtx count, machine_mode mode, int unroll,
27298 int expected_size, bool issetmem)
27300 rtx_code_label *out_label, *top_label;
27301 rtx iter, tmp;
27302 machine_mode iter_mode = counter_mode (count);
27303 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27304 rtx piece_size = GEN_INT (piece_size_n);
27305 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27306 rtx size;
27307 int i;
27309 top_label = gen_label_rtx ();
27310 out_label = gen_label_rtx ();
27311 iter = gen_reg_rtx (iter_mode);
27313 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27314 NULL, 1, OPTAB_DIRECT);
27315 /* Those two should combine. */
27316 if (piece_size == const1_rtx)
27318 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27319 true, out_label);
27320 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27322 emit_move_insn (iter, const0_rtx);
27324 emit_label (top_label);
27326 tmp = convert_modes (Pmode, iter_mode, iter, true);
27328 /* This assert could be relaxed - in this case we'll need to compute
27329 the smallest power of two containing PIECE_SIZE_N and pass it to
27330 offset_address. */
27331 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27332 destmem = offset_address (destmem, tmp, piece_size_n);
27333 destmem = adjust_address (destmem, mode, 0);
27335 if (!issetmem)
27337 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27338 srcmem = adjust_address (srcmem, mode, 0);
27340 /* When unrolling for chips that reorder memory reads and writes,
27341 we can save registers by using single temporary.
27342 Also using 4 temporaries is overkill in 32bit mode. */
27343 if (!TARGET_64BIT && 0)
27345 for (i = 0; i < unroll; i++)
27347 if (i)
27349 destmem =
27350 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27351 srcmem =
27352 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27354 emit_move_insn (destmem, srcmem);
27357 else
27359 rtx tmpreg[4];
27360 gcc_assert (unroll <= 4);
27361 for (i = 0; i < unroll; i++)
27363 tmpreg[i] = gen_reg_rtx (mode);
27364 if (i)
27366 srcmem =
27367 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27369 emit_move_insn (tmpreg[i], srcmem);
27371 for (i = 0; i < unroll; i++)
27373 if (i)
27375 destmem =
27376 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27378 emit_move_insn (destmem, tmpreg[i]);
27382 else
27383 for (i = 0; i < unroll; i++)
27385 if (i)
27386 destmem =
27387 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27388 emit_move_insn (destmem, value);
27391 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27392 true, OPTAB_LIB_WIDEN);
27393 if (tmp != iter)
27394 emit_move_insn (iter, tmp);
27396 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27397 true, top_label);
27398 if (expected_size != -1)
27400 expected_size /= GET_MODE_SIZE (mode) * unroll;
27401 if (expected_size == 0)
27402 predict_jump (0);
27403 else if (expected_size > REG_BR_PROB_BASE)
27404 predict_jump (REG_BR_PROB_BASE - 1);
27405 else
27406 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27408 else
27409 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27410 iter = ix86_zero_extend_to_Pmode (iter);
27411 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27412 true, OPTAB_LIB_WIDEN);
27413 if (tmp != destptr)
27414 emit_move_insn (destptr, tmp);
27415 if (!issetmem)
27417 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27418 true, OPTAB_LIB_WIDEN);
27419 if (tmp != srcptr)
27420 emit_move_insn (srcptr, tmp);
27422 emit_label (out_label);
27425 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
27426 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27427 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27428 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27429 ORIG_VALUE is the original value passed to memset to fill the memory with.
27430 Other arguments have the same meaning as for the previous function. */
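/* For example (illustrative): a memcpy with a constant COUNT divisible by 4
   widens MODE to SImode, loads COUNT / 4 into the count register and emits a
   single "rep movs"-style insn; DESTEXP/SRCEXP describe the final pointer
   values so the dataflow of the rep insn is visible to the optimizers.  */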
27432 static void
27433 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27434 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27435 rtx count,
27436 machine_mode mode, bool issetmem)
27438 rtx destexp;
27439 rtx srcexp;
27440 rtx countreg;
27441 HOST_WIDE_INT rounded_count;
27443 /* If possible, it is shorter to use rep movs.
27444 TODO: Maybe it is better to move this logic to decide_alg. */
27445 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27446 && (!issetmem || orig_value == const0_rtx))
27447 mode = SImode;
27449 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27450 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27452 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27453 GET_MODE_SIZE (mode)));
27454 if (mode != QImode)
27456 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27457 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27458 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27460 else
27461 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
27462 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27464 rounded_count
27465 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27466 destmem = shallow_copy_rtx (destmem);
27467 set_mem_size (destmem, rounded_count);
27469 else if (MEM_SIZE_KNOWN_P (destmem))
27470 clear_mem_size (destmem);
27472 if (issetmem)
27474 value = force_reg (mode, gen_lowpart (mode, value));
27475 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27477 else
27479 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27480 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27481 if (mode != QImode)
27483 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27484 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27485 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27487 else
27488 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27489 if (CONST_INT_P (count))
27491 rounded_count
27492 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27493 srcmem = shallow_copy_rtx (srcmem);
27494 set_mem_size (srcmem, rounded_count);
27496 else
27498 if (MEM_SIZE_KNOWN_P (srcmem))
27499 clear_mem_size (srcmem);
27501 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27502 destexp, srcexp));
27506 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27507 DESTMEM.
27508 SRCMEM is passed by pointer so it can be updated on return.
27509 The return value is the updated DESTMEM. */
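/* An illustrative case: on a 64-bit SSE target, moving 16 bytes picks the
   V2DImode vector mode, so the block becomes one vector load into a fresh
   temporary, one vector store, and both pointers are advanced by 16.  */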
27510 static rtx
27511 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27512 HOST_WIDE_INT size_to_move)
27514 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27515 enum insn_code code;
27516 machine_mode move_mode;
27517 int piece_size, i;
27519 /* Find the widest mode in which we could perform moves.
27520 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
27521 it until a move of that size is supported. */
27522 piece_size = 1 << floor_log2 (size_to_move);
27523 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
27524 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
27526 gcc_assert (piece_size > 1);
27527 piece_size >>= 1;
27530 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27531 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27532 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27534 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27535 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27536 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
27538 move_mode = word_mode;
27539 piece_size = GET_MODE_SIZE (move_mode);
27540 code = optab_handler (mov_optab, move_mode);
27543 gcc_assert (code != CODE_FOR_nothing);
27545 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27546 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27548 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27549 gcc_assert (size_to_move % piece_size == 0);
27550 adjust = GEN_INT (piece_size);
27551 for (i = 0; i < size_to_move; i += piece_size)
27553 /* We move from memory to memory, so we'll need to do it via
27554 a temporary register. */
27555 tempreg = gen_reg_rtx (move_mode);
27556 emit_insn (GEN_FCN (code) (tempreg, src));
27557 emit_insn (GEN_FCN (code) (dst, tempreg));
27559 emit_move_insn (destptr,
27560 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27561 emit_move_insn (srcptr,
27562 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27564 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27565 piece_size);
27566 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27567 piece_size);
27570 /* Update DST and SRC rtx. */
27571 *srcmem = src;
27572 return dst;
27575 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
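/* A worked example (illustrative): with MAX_SIZE = 16 and a constant COUNT of
   25, the epilogue size is 25 % 16 = 9, which the loop below decomposes into
   one 8-byte move followed by one 1-byte move.  */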
27576 static void
27577 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27578 rtx destptr, rtx srcptr, rtx count, int max_size)
27580 rtx src, dest;
27581 if (CONST_INT_P (count))
27583 HOST_WIDE_INT countval = INTVAL (count);
27584 HOST_WIDE_INT epilogue_size = countval % max_size;
27585 int i;
27587 /* For now MAX_SIZE should be a power of 2. This assert could be
27588 relaxed, but it'll require a bit more complicated epilogue
27589 expanding. */
27590 gcc_assert ((max_size & (max_size - 1)) == 0);
27591 for (i = max_size; i >= 1; i >>= 1)
27593 if (epilogue_size & i)
27594 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27596 return;
27598 if (max_size > 8)
27600 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27601 count, 1, OPTAB_DIRECT);
27602 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27603 count, QImode, 1, 4, false);
27604 return;
27607 /* When there are stringops, we can cheaply increase dest and src pointers.
27608 Otherwise we save code size by maintaining offset (zero is readily
27609 available from preceding rep operation) and using x86 addressing modes.
27610 */
27611 if (TARGET_SINGLE_STRINGOP)
27613 if (max_size > 4)
27615 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27616 src = change_address (srcmem, SImode, srcptr);
27617 dest = change_address (destmem, SImode, destptr);
27618 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27619 emit_label (label);
27620 LABEL_NUSES (label) = 1;
27622 if (max_size > 2)
27624 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27625 src = change_address (srcmem, HImode, srcptr);
27626 dest = change_address (destmem, HImode, destptr);
27627 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27628 emit_label (label);
27629 LABEL_NUSES (label) = 1;
27631 if (max_size > 1)
27633 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27634 src = change_address (srcmem, QImode, srcptr);
27635 dest = change_address (destmem, QImode, destptr);
27636 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27637 emit_label (label);
27638 LABEL_NUSES (label) = 1;
27641 else
27643 rtx offset = force_reg (Pmode, const0_rtx);
27644 rtx tmp;
27646 if (max_size > 4)
27648 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27649 src = change_address (srcmem, SImode, srcptr);
27650 dest = change_address (destmem, SImode, destptr);
27651 emit_move_insn (dest, src);
27652 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27653 true, OPTAB_LIB_WIDEN);
27654 if (tmp != offset)
27655 emit_move_insn (offset, tmp);
27656 emit_label (label);
27657 LABEL_NUSES (label) = 1;
27659 if (max_size > 2)
27661 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27662 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27663 src = change_address (srcmem, HImode, tmp);
27664 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27665 dest = change_address (destmem, HImode, tmp);
27666 emit_move_insn (dest, src);
27667 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27668 true, OPTAB_LIB_WIDEN);
27669 if (tmp != offset)
27670 emit_move_insn (offset, tmp);
27671 emit_label (label);
27672 LABEL_NUSES (label) = 1;
27674 if (max_size > 1)
27676 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27677 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27678 src = change_address (srcmem, QImode, tmp);
27679 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27680 dest = change_address (destmem, QImode, tmp);
27681 emit_move_insn (dest, src);
27682 emit_label (label);
27683 LABEL_NUSES (label) = 1;
27688 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
27689 with the value PROMOTED_VAL.
27690 The return value is the updated DESTMEM. */
27692 static rtx
27693 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27694 HOST_WIDE_INT size_to_move)
27696 rtx dst = destmem, adjust;
27697 enum insn_code code;
27698 machine_mode move_mode;
27699 int piece_size, i;
27701 /* Find the widest mode in which we can perform moves: start from the
27702 mode of PROMOTED_VAL and narrow it if SIZE_TO_MOVE is smaller than
27703 that mode's size. */
27704 move_mode = GET_MODE (promoted_val);
27705 if (move_mode == VOIDmode)
27706 move_mode = QImode;
27707 if (size_to_move < GET_MODE_SIZE (move_mode))
27709 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
27710 move_mode = int_mode_for_size (move_bits, 0).require ();
27711 promoted_val = gen_lowpart (move_mode, promoted_val);
27713 piece_size = GET_MODE_SIZE (move_mode);
27714 code = optab_handler (mov_optab, move_mode);
27715 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27717 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27719 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27720 gcc_assert (size_to_move % piece_size == 0);
27721 adjust = GEN_INT (piece_size);
27722 for (i = 0; i < size_to_move; i += piece_size)
27724 if (piece_size <= GET_MODE_SIZE (word_mode))
27726 emit_insn (gen_strset (destptr, dst, promoted_val));
27727 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27728 piece_size);
27729 continue;
27732 emit_insn (GEN_FCN (code) (dst, promoted_val));
27734 emit_move_insn (destptr,
27735 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27737 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27738 piece_size);
27741 /* Update DST rtx. */
27742 return dst;
27744 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27745 static void
27746 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27747 rtx count, int max_size)
27749 count =
27750 expand_simple_binop (counter_mode (count), AND, count,
27751 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27752 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27753 gen_lowpart (QImode, value), count, QImode,
27754 1, max_size / 2, true);
27757 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27758 static void
27759 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27760 rtx count, int max_size)
27762 rtx dest;
27764 if (CONST_INT_P (count))
27766 HOST_WIDE_INT countval = INTVAL (count);
27767 HOST_WIDE_INT epilogue_size = countval % max_size;
27768 int i;
27770 /* For now MAX_SIZE should be a power of 2. This assert could be
27771 relaxed, but it'll require a bit more complicated epilogue
27772 expanding. */
27773 gcc_assert ((max_size & (max_size - 1)) == 0);
27774 for (i = max_size; i >= 1; i >>= 1)
27776 if (epilogue_size & i)
27778 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27779 destmem = emit_memset (destmem, destptr, vec_value, i);
27780 else
27781 destmem = emit_memset (destmem, destptr, value, i);
27784 return;
27786 if (max_size > 32)
27788 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27789 return;
27791 if (max_size > 16)
27793 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27794 if (TARGET_64BIT)
27796 dest = change_address (destmem, DImode, destptr);
27797 emit_insn (gen_strset (destptr, dest, value));
27798 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27799 emit_insn (gen_strset (destptr, dest, value));
27801 else
27803 dest = change_address (destmem, SImode, destptr);
27804 emit_insn (gen_strset (destptr, dest, value));
27805 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27806 emit_insn (gen_strset (destptr, dest, value));
27807 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27808 emit_insn (gen_strset (destptr, dest, value));
27809 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27810 emit_insn (gen_strset (destptr, dest, value));
27812 emit_label (label);
27813 LABEL_NUSES (label) = 1;
27815 if (max_size > 8)
27817 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27818 if (TARGET_64BIT)
27820 dest = change_address (destmem, DImode, destptr);
27821 emit_insn (gen_strset (destptr, dest, value));
27823 else
27825 dest = change_address (destmem, SImode, destptr);
27826 emit_insn (gen_strset (destptr, dest, value));
27827 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27828 emit_insn (gen_strset (destptr, dest, value));
27830 emit_label (label);
27831 LABEL_NUSES (label) = 1;
27833 if (max_size > 4)
27835 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27836 dest = change_address (destmem, SImode, destptr);
27837 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27838 emit_label (label);
27839 LABEL_NUSES (label) = 1;
27841 if (max_size > 2)
27843 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27844 dest = change_address (destmem, HImode, destptr);
27845 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27846 emit_label (label);
27847 LABEL_NUSES (label) = 1;
27849 if (max_size > 1)
27851 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27852 dest = change_address (destmem, QImode, destptr);
27853 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27854 emit_label (label);
27855 LABEL_NUSES (label) = 1;
27859 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
27860 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
27861 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
27862 ignored.
27863 Return value is updated DESTMEM. */
27864 static rtx
27865 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27866 rtx destptr, rtx srcptr, rtx value,
27867 rtx vec_value, rtx count, int align,
27868 int desired_alignment, bool issetmem)
27870 int i;
27871 for (i = 1; i < desired_alignment; i <<= 1)
27873 if (align <= i)
27875 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27876 if (issetmem)
27878 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27879 destmem = emit_memset (destmem, destptr, vec_value, i);
27880 else
27881 destmem = emit_memset (destmem, destptr, value, i);
27883 else
27884 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27885 ix86_adjust_counter (count, i);
27886 emit_label (label);
27887 LABEL_NUSES (label) = 1;
27888 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27891 return destmem;
27894 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
27895 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27896 and jump to DONE_LABEL. */
27897 static void
27898 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27899 rtx destptr, rtx srcptr,
27900 rtx value, rtx vec_value,
27901 rtx count, int size,
27902 rtx done_label, bool issetmem)
27904 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27905 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
27906 rtx modesize;
27907 int n;
27909 /* If we do not have vector value to copy, we must reduce size. */
27910 if (issetmem)
27912 if (!vec_value)
27914 if (GET_MODE (value) == VOIDmode && size > 8)
27915 mode = Pmode;
27916 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27917 mode = GET_MODE (value);
27919 else
27920 mode = GET_MODE (vec_value), value = vec_value;
27922 else
27924 /* Choose appropriate vector mode. */
27925 if (size >= 32)
27926 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27927 else if (size >= 16)
27928 mode = TARGET_SSE ? V16QImode : DImode;
27929 srcmem = change_address (srcmem, mode, srcptr);
27931 destmem = change_address (destmem, mode, destptr);
27932 modesize = GEN_INT (GET_MODE_SIZE (mode));
27933 gcc_assert (GET_MODE_SIZE (mode) <= size);
27934 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27936 if (issetmem)
27937 emit_move_insn (destmem, gen_lowpart (mode, value));
27938 else
27940 emit_move_insn (destmem, srcmem);
27941 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27943 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27946 destmem = offset_address (destmem, count, 1);
27947 destmem = offset_address (destmem, GEN_INT (-2 * size),
27948 GET_MODE_SIZE (mode));
27949 if (!issetmem)
27951 srcmem = offset_address (srcmem, count, 1);
27952 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27953 GET_MODE_SIZE (mode));
27955 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27957 if (issetmem)
27958 emit_move_insn (destmem, gen_lowpart (mode, value));
27959 else
27961 emit_move_insn (destmem, srcmem);
27962 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27964 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27966 emit_jump_insn (gen_jump (done_label));
27967 emit_barrier ();
27969 emit_label (label);
27970 LABEL_NUSES (label) = 1;
27973 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27974 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27975 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
27976 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27977 DONE_LABEL is a label after the whole copying sequence. The label is created
27978 on demand if *DONE_LABEL is NULL.
27979 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
27980 bounds after the initial copies.
27982 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27983 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27984 we will dispatch to a library call for large blocks.
27986 In pseudocode we do:
27988 if (COUNT < SIZE)
27990 Assume that SIZE is 4. Bigger sizes are handled analogously
27991 if (COUNT & 4)
27993 copy 4 bytes from SRCPTR to DESTPTR
27994 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27995 goto done_label
27997 if (!COUNT)
27998 goto done_label;
27999 copy 1 byte from SRCPTR to DESTPTR
28000 if (COUNT & 2)
28002 copy 2 bytes from SRCPTR to DESTPTR
28003 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
28006 else
28008 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
28009 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
28011 OLD_DESTPTR = DESTPTR;
28012 Align DESTPTR up to DESIRED_ALIGN
28013 SRCPTR += DESTPTR - OLD_DESTPTR
28014 COUNT -= DESTPTR - OLD_DESTPTR
28015 if (DYNAMIC_CHECK)
28016 Round COUNT down to multiple of SIZE
28017 << optional caller supplied zero size guard is here >>
28018 << optional caller supplied dynamic check is here >>
28019 << caller supplied main copy loop is here >>
28021 done_label:
28022 */
28023 static void
28024 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
28025 rtx *destptr, rtx *srcptr,
28026 machine_mode mode,
28027 rtx value, rtx vec_value,
28028 rtx *count,
28029 rtx_code_label **done_label,
28030 int size,
28031 int desired_align,
28032 int align,
28033 unsigned HOST_WIDE_INT *min_size,
28034 bool dynamic_check,
28035 bool issetmem)
28037 rtx_code_label *loop_label = NULL, *label;
28038 int n;
28039 rtx modesize;
28040 int prolog_size = 0;
28041 rtx mode_value;
28043 /* Choose the proper value to copy. */
28044 if (issetmem && VECTOR_MODE_P (mode))
28045 mode_value = vec_value;
28046 else
28047 mode_value = value;
28048 gcc_assert (GET_MODE_SIZE (mode) <= size);
28050 /* See if block is big or small, handle small blocks. */
28051 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
28053 int size2 = size;
28054 loop_label = gen_label_rtx ();
28056 if (!*done_label)
28057 *done_label = gen_label_rtx ();
28059 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
28060 1, loop_label);
28061 size2 >>= 1;
28063 /* Handle sizes > 3. */
28064 for (;size2 > 2; size2 >>= 1)
28065 expand_small_movmem_or_setmem (destmem, srcmem,
28066 *destptr, *srcptr,
28067 value, vec_value,
28068 *count,
28069 size2, *done_label, issetmem);
28070 /* Nothing to copy? Jump to DONE_LABEL if so */
28071 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
28072 1, *done_label);
28074 /* Do a byte copy. */
28075 destmem = change_address (destmem, QImode, *destptr);
28076 if (issetmem)
28077 emit_move_insn (destmem, gen_lowpart (QImode, value));
28078 else
28080 srcmem = change_address (srcmem, QImode, *srcptr);
28081 emit_move_insn (destmem, srcmem);
28084 /* Handle sizes 2 and 3. */
28085 label = ix86_expand_aligntest (*count, 2, false);
28086 destmem = change_address (destmem, HImode, *destptr);
28087 destmem = offset_address (destmem, *count, 1);
28088 destmem = offset_address (destmem, GEN_INT (-2), 2);
28089 if (issetmem)
28090 emit_move_insn (destmem, gen_lowpart (HImode, value));
28091 else
28093 srcmem = change_address (srcmem, HImode, *srcptr);
28094 srcmem = offset_address (srcmem, *count, 1);
28095 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
28096 emit_move_insn (destmem, srcmem);
28099 emit_label (label);
28100 LABEL_NUSES (label) = 1;
28101 emit_jump_insn (gen_jump (*done_label));
28102 emit_barrier ();
28104 else
28105 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
28106 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
28108 /* Start memcpy for COUNT >= SIZE. */
28109 if (loop_label)
28111 emit_label (loop_label);
28112 LABEL_NUSES (loop_label) = 1;
28115 /* Copy first desired_align bytes. */
28116 if (!issetmem)
28117 srcmem = change_address (srcmem, mode, *srcptr);
28118 destmem = change_address (destmem, mode, *destptr);
28119 modesize = GEN_INT (GET_MODE_SIZE (mode));
28120 for (n = 0; prolog_size < desired_align - align; n++)
28122 if (issetmem)
28123 emit_move_insn (destmem, mode_value);
28124 else
28126 emit_move_insn (destmem, srcmem);
28127 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
28129 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
28130 prolog_size += GET_MODE_SIZE (mode);
28134 /* Copy last SIZE bytes. */
28135 destmem = offset_address (destmem, *count, 1);
28136 destmem = offset_address (destmem,
28137 GEN_INT (-size - prolog_size),
28139 if (issetmem)
28140 emit_move_insn (destmem, mode_value);
28141 else
28143 srcmem = offset_address (srcmem, *count, 1);
28144 srcmem = offset_address (srcmem,
28145 GEN_INT (-size - prolog_size),
28147 emit_move_insn (destmem, srcmem);
28149 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
28151 destmem = offset_address (destmem, modesize, 1);
28152 if (issetmem)
28153 emit_move_insn (destmem, mode_value);
28154 else
28156 srcmem = offset_address (srcmem, modesize, 1);
28157 emit_move_insn (destmem, srcmem);
28161 /* Align destination. */
28162 if (desired_align > 1 && desired_align > align)
28164 rtx saveddest = *destptr;
28166 gcc_assert (desired_align <= size);
28167 /* Align destptr up, place it to new register. */
28168 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
28169 GEN_INT (prolog_size),
28170 NULL_RTX, 1, OPTAB_DIRECT);
28171 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
28172 REG_POINTER (*destptr) = 1;
28173 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
28174 GEN_INT (-desired_align),
28175 *destptr, 1, OPTAB_DIRECT);
28176 /* See how many bytes we skipped. */
28177 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
28178 *destptr,
28179 saveddest, 1, OPTAB_DIRECT);
28180 /* Adjust srcptr and count. */
28181 if (!issetmem)
28182 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
28183 saveddest, *srcptr, 1, OPTAB_DIRECT);
28184 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28185 saveddest, *count, 1, OPTAB_DIRECT);
28186 /* We copied at most size + prolog_size. */
28187 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
28188 *min_size
28189 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
28190 else
28191 *min_size = 0;
28193 /* Our loops always round down the block size, but for dispatch to
28194 library we need precise value. */
28195 if (dynamic_check)
28196 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
28197 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
28199 else
28201 gcc_assert (prolog_size == 0);
28202 /* Decrease count, so we won't end up copying last word twice. */
28203 if (!CONST_INT_P (*count))
28204 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28205 constm1_rtx, *count, 1, OPTAB_DIRECT);
28206 else
28207 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
28208 (unsigned HOST_WIDE_INT)size));
28209 if (*min_size)
28210 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
28215 /* This function is like the previous one, except here we know how many bytes
28216 need to be copied. That allows us to update alignment not only of DST, which
28217 is returned, but also of SRC, which is passed as a pointer for that
28218 reason. */
28219 static rtx
28220 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
28221 rtx srcreg, rtx value, rtx vec_value,
28222 int desired_align, int align_bytes,
28223 bool issetmem)
28225 rtx src = NULL;
28226 rtx orig_dst = dst;
28227 rtx orig_src = NULL;
28228 int piece_size = 1;
28229 int copied_bytes = 0;
28231 if (!issetmem)
28233 gcc_assert (srcp != NULL);
28234 src = *srcp;
28235 orig_src = src;
28238 for (piece_size = 1;
28239 piece_size <= desired_align && copied_bytes < align_bytes;
28240 piece_size <<= 1)
28242 if (align_bytes & piece_size)
28244 if (issetmem)
28246 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
28247 dst = emit_memset (dst, destreg, vec_value, piece_size);
28248 else
28249 dst = emit_memset (dst, destreg, value, piece_size);
28251 else
28252 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
28253 copied_bytes += piece_size;
28256 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
28257 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28258 if (MEM_SIZE_KNOWN_P (orig_dst))
28259 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
28261 if (!issetmem)
28263 int src_align_bytes = get_mem_align_offset (src, desired_align
28264 * BITS_PER_UNIT);
28265 if (src_align_bytes >= 0)
28266 src_align_bytes = desired_align - src_align_bytes;
28267 if (src_align_bytes >= 0)
28269 unsigned int src_align;
28270 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
28272 if ((src_align_bytes & (src_align - 1))
28273 == (align_bytes & (src_align - 1)))
28274 break;
28276 if (src_align > (unsigned int) desired_align)
28277 src_align = desired_align;
28278 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
28279 set_mem_align (src, src_align * BITS_PER_UNIT);
28281 if (MEM_SIZE_KNOWN_P (orig_src))
28282 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
28283 *srcp = src;
28286 return dst;
28289 /* Return true if ALG can be used in current context.
28290 Assume we expand memset if MEMSET is true. */
28291 static bool
28292 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28294 if (alg == no_stringop)
28295 return false;
28296 if (alg == vector_loop)
28297 return TARGET_SSE || TARGET_AVX;
28298 /* Algorithms using the rep prefix want at least edi and ecx;
28299 additionally, memset wants eax and memcpy wants esi. Don't
28300 consider such algorithms if the user has appropriated those
28301 registers for their own purposes, or if we have a non-default
28302 address space, since some string insns cannot override the segment. */
28303 if (alg == rep_prefix_1_byte
28304 || alg == rep_prefix_4_byte
28305 || alg == rep_prefix_8_byte)
28307 if (have_as)
28308 return false;
28309 if (fixed_regs[CX_REG]
28310 || fixed_regs[DI_REG]
28311 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28312 return false;
28314 return true;
28317 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
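/* For instance (illustrative): when optimizing for size, a copy whose count
   is unknown or not a multiple of 4 is dispatched as rep_prefix_1_byte (or
   loop_1_byte if the rep registers are unavailable), since that is the
   smallest encoding; a multiple-of-4 copy uses rep_prefix_4_byte instead.  */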
28318 static enum stringop_alg
28319 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28320 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28321 bool memset, bool zero_memset, bool have_as,
28322 int *dynamic_check, bool *noalign, bool recur)
28324 const struct stringop_algs *algs;
28325 bool optimize_for_speed;
28326 int max = 0;
28327 const struct processor_costs *cost;
28328 int i;
28329 bool any_alg_usable_p = false;
28331 *noalign = false;
28332 *dynamic_check = -1;
28334 /* Even if the string operation call is cold, we still might spend a lot
28335 of time processing large blocks. */
28336 if (optimize_function_for_size_p (cfun)
28337 || (optimize_insn_for_size_p ()
28338 && (max_size < 256
28339 || (expected_size != -1 && expected_size < 256))))
28340 optimize_for_speed = false;
28341 else
28342 optimize_for_speed = true;
28344 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28345 if (memset)
28346 algs = &cost->memset[TARGET_64BIT != 0];
28347 else
28348 algs = &cost->memcpy[TARGET_64BIT != 0];
28350 /* See maximal size for user defined algorithm. */
28351 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28353 enum stringop_alg candidate = algs->size[i].alg;
28354 bool usable = alg_usable_p (candidate, memset, have_as);
28355 any_alg_usable_p |= usable;
28357 if (candidate != libcall && candidate && usable)
28358 max = algs->size[i].max;
28361 /* If expected size is not known but max size is small enough
28362 so inline version is a win, set expected size into
28363 the range. */
28364 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28365 && expected_size == -1)
28366 expected_size = min_size / 2 + max_size / 2;
28368 /* If user specified the algorithm, honor it if possible. */
28369 if (ix86_stringop_alg != no_stringop
28370 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28371 return ix86_stringop_alg;
28372 /* rep; movq or rep; movl is the smallest variant. */
28373 else if (!optimize_for_speed)
28375 *noalign = true;
28376 if (!count || (count & 3) || (memset && !zero_memset))
28377 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28378 ? rep_prefix_1_byte : loop_1_byte;
28379 else
28380 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28381 ? rep_prefix_4_byte : loop;
28383 /* Very tiny blocks are best handled via the loop; REP is expensive to
28384 set up. */
28385 else if (expected_size != -1 && expected_size < 4)
28386 return loop_1_byte;
28387 else if (expected_size != -1)
28389 enum stringop_alg alg = libcall;
28390 bool alg_noalign = false;
28391 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28393 /* We get here if the algorithms that were not libcall-based
28394 were rep-prefix based and we are unable to use rep prefixes
28395 based on global register usage. Break out of the loop and
28396 use the heuristic below. */
28397 if (algs->size[i].max == 0)
28398 break;
28399 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28401 enum stringop_alg candidate = algs->size[i].alg;
28403 if (candidate != libcall
28404 && alg_usable_p (candidate, memset, have_as))
28406 alg = candidate;
28407 alg_noalign = algs->size[i].noalign;
28409 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28410 last non-libcall inline algorithm. */
28411 if (TARGET_INLINE_ALL_STRINGOPS)
28413 /* When the current size is best copied by a libcall,
28414 but we are still forced to inline, run the heuristic below
28415 that will pick code for medium-sized blocks. */
28416 if (alg != libcall)
28418 *noalign = alg_noalign;
28419 return alg;
28421 else if (!any_alg_usable_p)
28422 break;
28424 else if (alg_usable_p (candidate, memset, have_as))
28426 *noalign = algs->size[i].noalign;
28427 return candidate;
28432 /* When asked to inline the call anyway, try to pick a meaningful choice.
28433 We look for the maximal size of block that is faster to copy by hand and
28434 take blocks of at most that size, guessing that the average size will
28435 be roughly half of the block.
28437 If this turns out to be bad, we might simply specify the preferred
28438 choice in ix86_costs. */
28439 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28440 && (algs->unknown_size == libcall
28441 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28443 enum stringop_alg alg;
28444 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28446 /* If there aren't any usable algorithms or if recursing already,
28447 then recursing on smaller sizes or same size isn't going to
28448 find anything. Just return the simple byte-at-a-time copy loop. */
28449 if (!any_alg_usable_p || recur)
28451 /* Pick something reasonable. */
28452 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28453 *dynamic_check = 128;
28454 return loop_1_byte;
28456 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28457 zero_memset, have_as, dynamic_check, noalign, true);
28458 gcc_assert (*dynamic_check == -1);
28459 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28460 *dynamic_check = max;
28461 else
28462 gcc_assert (alg != libcall);
28463 return alg;
28465 return (alg_usable_p (algs->unknown_size, memset, have_as)
28466 ? algs->unknown_size : libcall);
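/* Rough worked example of the selection above (editorial note; the exact
   outcome depends on the -mtune cost tables): for a memcpy with
   expected_size == 24 and no -mstringop-strategy override, the size table is
   scanned for the first usable entry whose max covers 24 bytes, typically a
   loop or rep-prefixed variant.  An unknown size compiled for size instead
   takes the !optimize_for_speed branch and prefers rep_prefix_4_byte, or
   rep_prefix_1_byte when the count is not a multiple of 4 or the memset value
   is non-zero.  */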
28469 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28470 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
28471 static int
28472 decide_alignment (int align,
28473 enum stringop_alg alg,
28474 int expected_size,
28475 machine_mode move_mode)
28477 int desired_align = 0;
28479 gcc_assert (alg != no_stringop);
28481 if (alg == libcall)
28482 return 0;
28483 if (move_mode == VOIDmode)
28484 return 0;
28486 desired_align = GET_MODE_SIZE (move_mode);
28487 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
28488 copying a whole cacheline at once. */
28489 if (TARGET_PENTIUMPRO
28490 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28491 desired_align = 8;
28493 if (optimize_size)
28494 desired_align = 1;
28495 if (desired_align < align)
28496 desired_align = align;
28497 if (expected_size != -1 && expected_size < 4)
28498 desired_align = align;
28500 return desired_align;
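/* Editorial example of the decision above: with -mtune=pentiumpro, an SImode
   rep_prefix_4_byte copy asks for 8-byte destination alignment, while the
   same copy compiled for size keeps desired_align at 1 unless the caller
   already guarantees more via ALIGN.  */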
28504 /* Helper function for memset. For QImode value 0xXY produce
28505 0xXYXYXYXY of the width specified by MODE. This is essentially
28506 VAL * 0x01010101, but we can do slightly better than
28507 synth_mult by unwinding the sequence by hand on CPUs with
28508 slow multiply. */
28509 static rtx
28510 promote_duplicated_reg (machine_mode mode, rtx val)
28512 machine_mode valmode = GET_MODE (val);
28513 rtx tmp;
28514 int nops = mode == DImode ? 3 : 2;
28516 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28517 if (val == const0_rtx)
28518 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28519 if (CONST_INT_P (val))
28521 HOST_WIDE_INT v = INTVAL (val) & 255;
28523 v |= v << 8;
28524 v |= v << 16;
28525 if (mode == DImode)
28526 v |= (v << 16) << 16;
28527 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28530 if (valmode == VOIDmode)
28531 valmode = QImode;
28532 if (valmode != QImode)
28533 val = gen_lowpart (QImode, val);
28534 if (mode == QImode)
28535 return val;
28536 if (!TARGET_PARTIAL_REG_STALL)
28537 nops--;
28538 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28539 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28540 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28541 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28543 rtx reg = convert_modes (mode, QImode, val, true);
28544 tmp = promote_duplicated_reg (mode, const1_rtx);
28545 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28546 OPTAB_DIRECT);
28548 else
28550 rtx reg = convert_modes (mode, QImode, val, true);
28552 if (!TARGET_PARTIAL_REG_STALL)
28553 if (mode == SImode)
28554 emit_insn (gen_insvsi_1 (reg, reg));
28555 else
28556 emit_insn (gen_insvdi_1 (reg, reg));
28557 else
28559 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28560 NULL, 1, OPTAB_DIRECT);
28561 reg =
28562 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28564 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28565 NULL, 1, OPTAB_DIRECT);
28566 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28567 if (mode == SImode)
28568 return reg;
28569 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28570 NULL, 1, OPTAB_DIRECT);
28571 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28572 return reg;
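/* Worked example of the promotion above (editorial note): for VAL = 0xAB and
   SImode, the constant path computes

     v = 0xAB;   v |= v << 8;    =>  0xABAB
     v |= v << 16;               =>  0xABABABAB

   and the non-constant shift/IOR path produces the same value at run time
   (DImode adds one more shift/IOR pair for the upper 32 bits).  */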
28576 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
28577 will be needed by the main loop copying SIZE_NEEDED chunks and by the
28578 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
28579 static rtx
28580 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28581 int align)
28583 rtx promoted_val;
28585 if (TARGET_64BIT
28586 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28587 promoted_val = promote_duplicated_reg (DImode, val);
28588 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28589 promoted_val = promote_duplicated_reg (SImode, val);
28590 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28591 promoted_val = promote_duplicated_reg (HImode, val);
28592 else
28593 promoted_val = val;
28595 return promoted_val;
28598 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
28599 operations when profitable. The code depends upon architecture, block size
28600 and alignment, but always has one of the following overall structures:
28602 Aligned move sequence:
28604 1) Prologue guard: Conditional that jumps up to epilogues for small
28605 blocks that can be handled by epilogue alone. This is faster
28606 but also needed for correctness, since the prologue assumes the block
28607 is larger than the desired alignment.
28609 Optional dynamic check for size and libcall for large
28610 blocks is emitted here too, with -minline-stringops-dynamically.
28612 2) Prologue: copy first few bytes in order to get destination
28613 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28614 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28615 copied. We emit either a jump tree on power of two sized
28616 blocks, or a byte loop.
28618 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28619 with specified algorithm.
28621 4) Epilogue: code copying tail of the block that is too small to be
28622 handled by main body (or up to size guarded by prologue guard).
28624 Misaligned move sequence
28626 1) misaligned move prologue/epilogue containing:
28627 a) Prologue handling small memory blocks and jumping to done_label
28628 (skipped if blocks are known to be large enough)
28629 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes, if alignment
28630 is needed, by a single possibly misaligned move
28631 (skipped if alignment is not needed)
28632 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
28634 2) Zero size guard dispatching to done_label, if needed
28636 3) dispatch to library call, if needed,
28638 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28639 with specified algorithm. */
28640 bool
28641 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28642 rtx align_exp, rtx expected_align_exp,
28643 rtx expected_size_exp, rtx min_size_exp,
28644 rtx max_size_exp, rtx probable_max_size_exp,
28645 bool issetmem)
28647 rtx destreg;
28648 rtx srcreg = NULL;
28649 rtx_code_label *label = NULL;
28650 rtx tmp;
28651 rtx_code_label *jump_around_label = NULL;
28652 HOST_WIDE_INT align = 1;
28653 unsigned HOST_WIDE_INT count = 0;
28654 HOST_WIDE_INT expected_size = -1;
28655 int size_needed = 0, epilogue_size_needed;
28656 int desired_align = 0, align_bytes = 0;
28657 enum stringop_alg alg;
28658 rtx promoted_val = NULL;
28659 rtx vec_promoted_val = NULL;
28660 bool force_loopy_epilogue = false;
28661 int dynamic_check;
28662 bool need_zero_guard = false;
28663 bool noalign;
28664 machine_mode move_mode = VOIDmode;
28665 machine_mode wider_mode;
28666 int unroll_factor = 1;
28667 /* TODO: Once value ranges are available, fill in proper data. */
28668 unsigned HOST_WIDE_INT min_size = 0;
28669 unsigned HOST_WIDE_INT max_size = -1;
28670 unsigned HOST_WIDE_INT probable_max_size = -1;
28671 bool misaligned_prologue_used = false;
28672 bool have_as;
28674 if (CONST_INT_P (align_exp))
28675 align = INTVAL (align_exp);
28676 /* i386 can do misaligned access at reasonably increased cost. */
28677 if (CONST_INT_P (expected_align_exp)
28678 && INTVAL (expected_align_exp) > align)
28679 align = INTVAL (expected_align_exp);
28680 /* ALIGN is the minimum of destination and source alignment, but we care here
28681 just about destination alignment. */
28682 else if (!issetmem
28683 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28684 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28686 if (CONST_INT_P (count_exp))
28688 min_size = max_size = probable_max_size = count = expected_size
28689 = INTVAL (count_exp);
28690 /* When COUNT is 0, there is nothing to do. */
28691 if (!count)
28692 return true;
28694 else
28696 if (min_size_exp)
28697 min_size = INTVAL (min_size_exp);
28698 if (max_size_exp)
28699 max_size = INTVAL (max_size_exp);
28700 if (probable_max_size_exp)
28701 probable_max_size = INTVAL (probable_max_size_exp);
28702 if (CONST_INT_P (expected_size_exp))
28703 expected_size = INTVAL (expected_size_exp);
28706 /* Make sure we don't need to care about overflow later on. */
28707 if (count > (HOST_WIDE_INT_1U << 30))
28708 return false;
28710 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28711 if (!issetmem)
28712 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28714 /* Step 0: Decide on preferred algorithm, desired alignment and
28715 size of chunks to be copied by main loop. */
28716 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28717 issetmem,
28718 issetmem && val_exp == const0_rtx, have_as,
28719 &dynamic_check, &noalign, false);
28720 if (alg == libcall)
28721 return false;
28722 gcc_assert (alg != no_stringop);
28724 /* For now the vector version of memset is generated only for memory zeroing,
28725 as creating the promoted vector value is very cheap in this case. */
28726 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28727 alg = unrolled_loop;
28729 if (!count)
28730 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28731 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28732 if (!issetmem)
28733 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28735 unroll_factor = 1;
28736 move_mode = word_mode;
28737 switch (alg)
28739 case libcall:
28740 case no_stringop:
28741 case last_alg:
28742 gcc_unreachable ();
28743 case loop_1_byte:
28744 need_zero_guard = true;
28745 move_mode = QImode;
28746 break;
28747 case loop:
28748 need_zero_guard = true;
28749 break;
28750 case unrolled_loop:
28751 need_zero_guard = true;
28752 unroll_factor = (TARGET_64BIT ? 4 : 2);
28753 break;
28754 case vector_loop:
28755 need_zero_guard = true;
28756 unroll_factor = 4;
28757 /* Find the widest supported mode. */
28758 move_mode = word_mode;
28759 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
28760 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
28761 move_mode = wider_mode;
28763 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28764 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28765 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28767 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28768 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
28769 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28770 move_mode = word_mode;
28772 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28773 break;
28774 case rep_prefix_8_byte:
28775 move_mode = DImode;
28776 break;
28777 case rep_prefix_4_byte:
28778 move_mode = SImode;
28779 break;
28780 case rep_prefix_1_byte:
28781 move_mode = QImode;
28782 break;
28784 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
28785 epilogue_size_needed = size_needed;
28787 /* If we are going to emit any library calls conditionally, make sure any
28788 pending stack adjustments happen before the first conditional branch;
28789 otherwise they will be emitted only before the library call and won't
28790 happen on the other branches. */
28791 if (dynamic_check != -1)
28792 do_pending_stack_adjust ();
28794 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28795 if (!TARGET_ALIGN_STRINGOPS || noalign)
28796 align = desired_align;
28798 /* Step 1: Prologue guard. */
28800 /* Alignment code needs count to be in register. */
28801 if (CONST_INT_P (count_exp) && desired_align > align)
28803 if (INTVAL (count_exp) > desired_align
28804 && INTVAL (count_exp) > size_needed)
28806 align_bytes
28807 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28808 if (align_bytes <= 0)
28809 align_bytes = 0;
28810 else
28811 align_bytes = desired_align - align_bytes;
28813 if (align_bytes == 0)
28814 count_exp = force_reg (counter_mode (count_exp), count_exp);
28816 gcc_assert (desired_align >= 1 && align >= 1);
28818 /* Misaligned move sequences handle both prologue and epilogue at once.
28819 Default code generation results in smaller code for large alignments
28820 and also avoids redundant work when sizes are known precisely. */
28821 misaligned_prologue_used
28822 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28823 && MAX (desired_align, epilogue_size_needed) <= 32
28824 && desired_align <= epilogue_size_needed
28825 && ((desired_align > align && !align_bytes)
28826 || (!count && epilogue_size_needed > 1)));
28828 /* Do the cheap promotion to allow better CSE across the
28829 main loop and epilogue (i.e. one load of the big constant in
28830 front of all the code).
28831 For now the misaligned move sequences do not have a fast path
28832 without broadcasting. */
28833 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28835 if (alg == vector_loop)
28837 gcc_assert (val_exp == const0_rtx);
28838 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28839 promoted_val = promote_duplicated_reg_to_size (val_exp,
28840 GET_MODE_SIZE (word_mode),
28841 desired_align, align);
28843 else
28845 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28846 desired_align, align);
28849 /* Misaligned move sequences handle both prologues and epilogues at once.
28850 Default code generation results in smaller code for large alignments and
28851 also avoids redundant work when sizes are known precisely. */
28852 if (misaligned_prologue_used)
28854 /* The misaligned move prologue handles small blocks by itself. */
28855 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28856 (dst, src, &destreg, &srcreg,
28857 move_mode, promoted_val, vec_promoted_val,
28858 &count_exp,
28859 &jump_around_label,
28860 desired_align < align
28861 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28862 desired_align, align, &min_size, dynamic_check, issetmem);
28863 if (!issetmem)
28864 src = change_address (src, BLKmode, srcreg);
28865 dst = change_address (dst, BLKmode, destreg);
28866 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28867 epilogue_size_needed = 0;
28868 if (need_zero_guard
28869 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28871 /* It is possible that we copied enough so the main loop will not
28872 execute. */
28873 gcc_assert (size_needed > 1);
28874 if (jump_around_label == NULL_RTX)
28875 jump_around_label = gen_label_rtx ();
28876 emit_cmp_and_jump_insns (count_exp,
28877 GEN_INT (size_needed),
28878 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
28879 if (expected_size == -1
28880 || expected_size < (desired_align - align) / 2 + size_needed)
28881 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28882 else
28883 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28886 /* Ensure that alignment prologue won't copy past end of block. */
28887 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28889 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28890 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
28891 Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
28892 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
28894 /* To improve performance of small blocks, we jump around the code that
28895 promotes VAL. This means that if the promoted VAL is not constant,
28896 we might not use it in the epilogue and have to use the byte
28897 loop variant. */
28898 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28899 force_loopy_epilogue = true;
28900 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28901 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28903 /* If main algorithm works on QImode, no epilogue is needed.
28904 For small sizes just don't align anything. */
28905 if (size_needed == 1)
28906 desired_align = align;
28907 else
28908 goto epilogue;
28910 else if (!count
28911 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28913 label = gen_label_rtx ();
28914 emit_cmp_and_jump_insns (count_exp,
28915 GEN_INT (epilogue_size_needed),
28916 LTU, 0, counter_mode (count_exp), 1, label);
28917 if (expected_size == -1 || expected_size < epilogue_size_needed)
28918 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28919 else
28920 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28924 /* Emit code to decide at runtime whether a library call or inline code
28925 should be used. */
28926 if (dynamic_check != -1)
28928 if (!issetmem && CONST_INT_P (count_exp))
28930 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28932 emit_block_copy_via_libcall (dst, src, count_exp);
28933 count_exp = const0_rtx;
28934 goto epilogue;
28937 else
28939 rtx_code_label *hot_label = gen_label_rtx ();
28940 if (jump_around_label == NULL_RTX)
28941 jump_around_label = gen_label_rtx ();
28942 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28943 LEU, 0, counter_mode (count_exp),
28944 1, hot_label);
28945 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28946 if (issetmem)
28947 set_storage_via_libcall (dst, count_exp, val_exp);
28948 else
28949 emit_block_copy_via_libcall (dst, src, count_exp);
28950 emit_jump (jump_around_label);
28951 emit_label (hot_label);
28955 /* Step 2: Alignment prologue. */
28956 /* Do the expensive promotion once we branched off the small blocks. */
28957 if (issetmem && !promoted_val)
28958 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28959 desired_align, align);
28961 if (desired_align > align && !misaligned_prologue_used)
28963 if (align_bytes == 0)
28965 /* Except for the first move in the prologue, we no longer know
28966 the constant offset in the aliasing info. It doesn't seem worth
28967 the pain to maintain it for the first move, so throw away
28968 the info early. */
28969 dst = change_address (dst, BLKmode, destreg);
28970 if (!issetmem)
28971 src = change_address (src, BLKmode, srcreg);
28972 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28973 promoted_val, vec_promoted_val,
28974 count_exp, align, desired_align,
28975 issetmem);
28976 /* At most desired_align - align bytes are copied. */
28977 if (min_size < (unsigned)(desired_align - align))
28978 min_size = 0;
28979 else
28980 min_size -= desired_align - align;
28982 else
28984 /* If we know how many bytes need to be stored before dst is
28985 sufficiently aligned, maintain aliasing info accurately. */
28986 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28987 srcreg,
28988 promoted_val,
28989 vec_promoted_val,
28990 desired_align,
28991 align_bytes,
28992 issetmem);
28994 count_exp = plus_constant (counter_mode (count_exp),
28995 count_exp, -align_bytes);
28996 count -= align_bytes;
28997 min_size -= align_bytes;
28998 max_size -= align_bytes;
29000 if (need_zero_guard
29001 && min_size < (unsigned HOST_WIDE_INT) size_needed
29002 && (count < (unsigned HOST_WIDE_INT) size_needed
29003 || (align_bytes == 0
29004 && count < ((unsigned HOST_WIDE_INT) size_needed
29005 + desired_align - align))))
29007 /* It is possible that we copied enough so the main loop will not
29008 execute. */
29009 gcc_assert (size_needed > 1);
29010 if (label == NULL_RTX)
29011 label = gen_label_rtx ();
29012 emit_cmp_and_jump_insns (count_exp,
29013 GEN_INT (size_needed),
29014 LTU, 0, counter_mode (count_exp), 1, label);
29015 if (expected_size == -1
29016 || expected_size < (desired_align - align) / 2 + size_needed)
29017 predict_jump (REG_BR_PROB_BASE * 20 / 100);
29018 else
29019 predict_jump (REG_BR_PROB_BASE * 60 / 100);
29022 if (label && size_needed == 1)
29024 emit_label (label);
29025 LABEL_NUSES (label) = 1;
29026 label = NULL;
29027 epilogue_size_needed = 1;
29028 if (issetmem)
29029 promoted_val = val_exp;
29031 else if (label == NULL_RTX && !misaligned_prologue_used)
29032 epilogue_size_needed = size_needed;
29034 /* Step 3: Main loop. */
29036 switch (alg)
29038 case libcall:
29039 case no_stringop:
29040 case last_alg:
29041 gcc_unreachable ();
29042 case loop_1_byte:
29043 case loop:
29044 case unrolled_loop:
29045 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
29046 count_exp, move_mode, unroll_factor,
29047 expected_size, issetmem);
29048 break;
29049 case vector_loop:
29050 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
29051 vec_promoted_val, count_exp, move_mode,
29052 unroll_factor, expected_size, issetmem);
29053 break;
29054 case rep_prefix_8_byte:
29055 case rep_prefix_4_byte:
29056 case rep_prefix_1_byte:
29057 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
29058 val_exp, count_exp, move_mode, issetmem);
29059 break;
29061 /* Properly adjust the offsets of src and dest memory for aliasing. */
29062 if (CONST_INT_P (count_exp))
29064 if (!issetmem)
29065 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
29066 (count / size_needed) * size_needed);
29067 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
29068 (count / size_needed) * size_needed);
29070 else
29072 if (!issetmem)
29073 src = change_address (src, BLKmode, srcreg);
29074 dst = change_address (dst, BLKmode, destreg);
29077 /* Step 4: Epilogue to copy the remaining bytes. */
29078 epilogue:
29079 if (label)
29081 /* When the main loop is done, COUNT_EXP might hold the original count,
29082 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
29083 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
29084 bytes. Compensate if needed. */
29086 if (size_needed < epilogue_size_needed)
29088 tmp =
29089 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
29090 GEN_INT (size_needed - 1), count_exp, 1,
29091 OPTAB_DIRECT);
29092 if (tmp != count_exp)
29093 emit_move_insn (count_exp, tmp);
29095 emit_label (label);
29096 LABEL_NUSES (label) = 1;
29099 if (count_exp != const0_rtx && epilogue_size_needed > 1)
29101 if (force_loopy_epilogue)
29102 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
29103 epilogue_size_needed);
29104 else
29106 if (issetmem)
29107 expand_setmem_epilogue (dst, destreg, promoted_val,
29108 vec_promoted_val, count_exp,
29109 epilogue_size_needed);
29110 else
29111 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
29112 epilogue_size_needed);
29115 if (jump_around_label)
29116 emit_label (jump_around_label);
29117 return true;
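/* As a rough illustration of the aligned sequence described in the comment
   before this function (editorial sketch, not generated literally), a memset
   expanded with an unrolled loop ends up shaped roughly like:

     if (count < epilogue_size_needed) goto epilogue;  // prologue guard
     ... copy/store bytes until dst is desired_align'd  // prologue
     ... store size_needed bytes per iteration ...      // main loop
   epilogue:
     ... handle count & (epilogue_size_needed - 1) ...  // epilogue

   with the misaligned prologue/epilogue variant replacing the first two
   steps when TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES allows it, and the
   dynamic_check path adding a size test that dispatches to a libcall.  */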
29121 /* Expand the appropriate insns for doing strlen if not just doing
29122 repnz; scasb
29124 out = result, initialized with the start address
29125 align_rtx = alignment of the address.
29126 scratch = scratch register, initialized with the start address when
29127 not aligned, otherwise undefined
29129 This is just the body. It needs the initializations mentioned above and
29130 some address computation at the end. These things are done in i386.md. */
29132 static void
29133 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
29135 int align;
29136 rtx tmp;
29137 rtx_code_label *align_2_label = NULL;
29138 rtx_code_label *align_3_label = NULL;
29139 rtx_code_label *align_4_label = gen_label_rtx ();
29140 rtx_code_label *end_0_label = gen_label_rtx ();
29141 rtx mem;
29142 rtx tmpreg = gen_reg_rtx (SImode);
29143 rtx scratch = gen_reg_rtx (SImode);
29144 rtx cmp;
29146 align = 0;
29147 if (CONST_INT_P (align_rtx))
29148 align = INTVAL (align_rtx);
29150 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
29152 /* Is there a known alignment and is it less than 4? */
29153 if (align < 4)
29155 rtx scratch1 = gen_reg_rtx (Pmode);
29156 emit_move_insn (scratch1, out);
29157 /* Is there a known alignment and is it not 2? */
29158 if (align != 2)
29160 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
29161 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
29163 /* Leave just the 3 lower bits. */
29164 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
29165 NULL_RTX, 0, OPTAB_WIDEN);
29167 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29168 Pmode, 1, align_4_label);
29169 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
29170 Pmode, 1, align_2_label);
29171 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
29172 Pmode, 1, align_3_label);
29174 else
29176 /* Since the alignment is 2, we have to check 2 or 0 bytes;
29177 check if it is aligned to 4 bytes. */
29179 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
29180 NULL_RTX, 0, OPTAB_WIDEN);
29182 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29183 Pmode, 1, align_4_label);
29186 mem = change_address (src, QImode, out);
29188 /* Now compare the bytes. */
29190 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
29191 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
29192 QImode, 1, end_0_label);
29194 /* Increment the address. */
29195 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29197 /* Not needed with an alignment of 2 */
29198 if (align != 2)
29200 emit_label (align_2_label);
29202 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29203 end_0_label);
29205 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29207 emit_label (align_3_label);
29210 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29211 end_0_label);
29213 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29216 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
29217 align this loop: it only bloats the program and does not help to
29218 speed it up. */
29219 emit_label (align_4_label);
29221 mem = change_address (src, SImode, out);
29222 emit_move_insn (scratch, mem);
29223 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
29225 /* This formula yields a nonzero result iff one of the bytes is zero.
29226 This saves three branches inside loop and many cycles. */
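/* Illustrative check of the formula (editorial example): with
   scratch = 0x41004242 (byte 2 is zero),
     tmpreg  = scratch + 0xfefefeff   =  0x3fff4141
     scratch = ~0x41004242            =  0xbeffbdbd
     tmpreg &= scratch                =  0x3eff0101
     tmpreg &= 0x80808080             =  0x00800000  (nonzero -> a zero byte)
   whereas a value with no zero bytes, e.g. 0x41424344, yields 0 after the
   final mask.  */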
29228 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
29229 emit_insn (gen_one_cmplsi2 (scratch, scratch));
29230 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
29231 emit_insn (gen_andsi3 (tmpreg, tmpreg,
29232 gen_int_mode (0x80808080, SImode)));
29233 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
29234 align_4_label);
29236 if (TARGET_CMOVE)
29238 rtx reg = gen_reg_rtx (SImode);
29239 rtx reg2 = gen_reg_rtx (Pmode);
29240 emit_move_insn (reg, tmpreg);
29241 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
29243 /* If zero is not in the first two bytes, move two bytes forward. */
29244 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29245 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29246 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29247 emit_insn (gen_rtx_SET (tmpreg,
29248 gen_rtx_IF_THEN_ELSE (SImode, tmp,
29249 reg,
29250 tmpreg)));
29251 /* Emit lea manually to avoid clobbering of flags. */
29252 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
29254 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29255 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29256 emit_insn (gen_rtx_SET (out,
29257 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
29258 reg2,
29259 out)));
29261 else
29263 rtx_code_label *end_2_label = gen_label_rtx ();
29264 /* Is zero in the first two bytes? */
29266 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29267 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29268 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
29269 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
29270 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
29271 pc_rtx);
29272 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
29273 JUMP_LABEL (tmp) = end_2_label;
29275 /* Not in the first two. Move two bytes forward. */
29276 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
29277 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
29279 emit_label (end_2_label);
29283 /* Avoid a branch when fixing up the byte position. */
29284 tmpreg = gen_lowpart (QImode, tmpreg);
29285 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
29286 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
29287 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
29288 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29290 emit_label (end_0_label);
29293 /* Expand strlen. */
29295 bool
29296 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29298 rtx addr, scratch1, scratch2, scratch3, scratch4;
29300 /* The generic case of the strlen expander is long. Avoid expanding
29301 it unless TARGET_INLINE_ALL_STRINGOPS. */
29303 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29304 && !TARGET_INLINE_ALL_STRINGOPS
29305 && !optimize_insn_for_size_p ()
29306 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29307 return false;
29309 addr = force_reg (Pmode, XEXP (src, 0));
29310 scratch1 = gen_reg_rtx (Pmode);
29312 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29313 && !optimize_insn_for_size_p ())
29315 /* Well it seems that some optimizer does not combine a call like
29316 foo(strlen(bar), strlen(bar));
29317 when the move and the subtraction are done here. It does calculate
29318 the length just once when these instructions are done inside of
29319 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
29320 often used and I use one fewer register for the lifetime of
29321 output_strlen_unroll() this is better. */
29323 emit_move_insn (out, addr);
29325 ix86_expand_strlensi_unroll_1 (out, src, align);
29327 /* strlensi_unroll_1 returns the address of the zero at the end of
29328 the string, like memchr(), so compute the length by subtracting
29329 the start address. */
29330 emit_insn (ix86_gen_sub3 (out, out, addr));
29332 else
29334 rtx unspec;
29336 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29337 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29338 return false;
29339 /* Can't use this for non-default address spaces. */
29340 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29341 return false;
29343 scratch2 = gen_reg_rtx (Pmode);
29344 scratch3 = gen_reg_rtx (Pmode);
29345 scratch4 = force_reg (Pmode, constm1_rtx);
29347 emit_move_insn (scratch3, addr);
29348 eoschar = force_reg (QImode, eoschar);
29350 src = replace_equiv_address_nv (src, scratch3);
29352 /* If .md starts supporting :P, this can be done in .md. */
29353 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29354 scratch4), UNSPEC_SCAS);
29355 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29356 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29357 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29359 return true;
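/* Illustrative arithmetic for the rep-scasb path above (editorial note):
   the count register starts at -1 and "repnz scasb" decrements it once per
   scanned byte including the terminator, leaving -(len + 2) in SCRATCH1.
   Then
     scratch2 = ~scratch1      = len + 1
     out      = scratch2 + -1  = len
   which is what the one's-complement and add emitted above compute.  */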
29362 /* For a given symbol (function) construct code to compute the address of its
29363 PLT entry in the large x86-64 PIC model. */
29364 static rtx
29365 construct_plt_address (rtx symbol)
29367 rtx tmp, unspec;
29369 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29370 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29371 gcc_assert (Pmode == DImode);
29373 tmp = gen_reg_rtx (Pmode);
29374 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29376 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29377 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29378 return tmp;
29382 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29383 rtx callarg2,
29384 rtx pop, bool sibcall)
29386 rtx vec[3];
29387 rtx use = NULL, call;
29388 unsigned int vec_len = 0;
29389 tree fndecl;
29391 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29393 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29394 if (fndecl
29395 && (lookup_attribute ("interrupt",
29396 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29397 error ("interrupt service routine can't be called directly");
29399 else
29400 fndecl = NULL_TREE;
29402 if (pop == const0_rtx)
29403 pop = NULL;
29404 gcc_assert (!TARGET_64BIT || !pop);
29406 if (TARGET_MACHO && !TARGET_64BIT)
29408 #if TARGET_MACHO
29409 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29410 fnaddr = machopic_indirect_call_target (fnaddr);
29411 #endif
29413 else
29415 /* Static functions and indirect calls don't need the pic register. Also,
29416 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
29417 it an indirect call. */
29418 rtx addr = XEXP (fnaddr, 0);
29419 if (flag_pic
29420 && GET_CODE (addr) == SYMBOL_REF
29421 && !SYMBOL_REF_LOCAL_P (addr))
29423 if (flag_plt
29424 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29425 || !lookup_attribute ("noplt",
29426 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29428 if (!TARGET_64BIT
29429 || (ix86_cmodel == CM_LARGE_PIC
29430 && DEFAULT_ABI != MS_ABI))
29432 use_reg (&use, gen_rtx_REG (Pmode,
29433 REAL_PIC_OFFSET_TABLE_REGNUM));
29434 if (ix86_use_pseudo_pic_reg ())
29435 emit_move_insn (gen_rtx_REG (Pmode,
29436 REAL_PIC_OFFSET_TABLE_REGNUM),
29437 pic_offset_table_rtx);
29440 else if (!TARGET_PECOFF && !TARGET_MACHO)
29442 if (TARGET_64BIT)
29444 fnaddr = gen_rtx_UNSPEC (Pmode,
29445 gen_rtvec (1, addr),
29446 UNSPEC_GOTPCREL);
29447 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29449 else
29451 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29452 UNSPEC_GOT);
29453 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29454 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29455 fnaddr);
29457 fnaddr = gen_const_mem (Pmode, fnaddr);
29458 /* Pmode may not be the same as word_mode for x32, which
29459 doesn't support indirect branch via 32-bit memory slot.
29460 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29461 indirect branch via x32 GOT slot is OK. */
29462 if (GET_MODE (fnaddr) != word_mode)
29463 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29464 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29469 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29470 parameters passed in vector registers. */
29471 if (TARGET_64BIT
29472 && (INTVAL (callarg2) > 0
29473 || (INTVAL (callarg2) == 0
29474 && (TARGET_SSE || !flag_skip_rax_setup))))
29476 rtx al = gen_rtx_REG (QImode, AX_REG);
29477 emit_move_insn (al, callarg2);
29478 use_reg (&use, al);
29481 if (ix86_cmodel == CM_LARGE_PIC
29482 && !TARGET_PECOFF
29483 && MEM_P (fnaddr)
29484 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29485 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29486 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29487 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29488 branch via x32 GOT slot is OK. */
29489 else if (!(TARGET_X32
29490 && MEM_P (fnaddr)
29491 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29492 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29493 && (sibcall
29494 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29495 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29497 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29498 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29501 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29503 if (retval)
29505 /* We should add bounds as destination register in case
29506 pointer with bounds may be returned. */
29507 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29509 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29510 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29511 if (GET_CODE (retval) == PARALLEL)
29513 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29514 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29515 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29516 retval = chkp_join_splitted_slot (retval, par);
29518 else
29520 retval = gen_rtx_PARALLEL (VOIDmode,
29521 gen_rtvec (3, retval, b0, b1));
29522 chkp_put_regs_to_expr_list (retval);
29526 call = gen_rtx_SET (retval, call);
29528 vec[vec_len++] = call;
29530 if (pop)
29532 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29533 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29534 vec[vec_len++] = pop;
29537 if (cfun->machine->no_caller_saved_registers
29538 && (!fndecl
29539 || (!TREE_THIS_VOLATILE (fndecl)
29540 && !lookup_attribute ("no_caller_saved_registers",
29541 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29543 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29544 bool is_64bit_ms_abi = (TARGET_64BIT
29545 && ix86_function_abi (fndecl) == MS_ABI);
29546 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29548 /* If there are no caller-saved registers, add all registers
29549 that are clobbered by the call which returns. */
29550 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29551 if (!fixed_regs[i]
29552 && (ix86_call_used_regs[i] == 1
29553 || (ix86_call_used_regs[i] & c_mask))
29554 && !STACK_REGNO_P (i)
29555 && !MMX_REGNO_P (i))
29556 clobber_reg (&use,
29557 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29559 else if (TARGET_64BIT_MS_ABI
29560 && (!callarg2 || INTVAL (callarg2) != -2))
29562 unsigned i;
29564 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29566 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29567 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29569 clobber_reg (&use, gen_rtx_REG (mode, regno));
29572 /* Set here, but it may get cleared later. */
29573 if (TARGET_CALL_MS2SYSV_XLOGUES)
29575 if (!TARGET_SSE)
29578 /* Don't break hot-patched functions. */
29579 else if (ix86_function_ms_hook_prologue (current_function_decl))
29582 /* TODO: Cases not yet examined. */
29583 else if (flag_split_stack)
29584 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29586 else
29588 gcc_assert (!reload_completed);
29589 cfun->machine->call_ms2sysv = true;
29594 if (vec_len > 1)
29595 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29596 call = emit_call_insn (call);
29597 if (use)
29598 CALL_INSN_FUNCTION_USAGE (call) = use;
29600 return call;
29603 /* Return true if the function being called was marked with attribute
29604 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
29605 to handle the non-PIC case in the backend because there is no easy
29606 interface for the front-end to force non-PLT calls to use the GOT.
29607 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29608 to call the function marked "noplt" indirectly. */
29610 static bool
29611 ix86_nopic_noplt_attribute_p (rtx call_op)
29613 if (flag_pic || ix86_cmodel == CM_LARGE
29614 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29615 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29616 || SYMBOL_REF_LOCAL_P (call_op))
29617 return false;
29619 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29621 if (!flag_plt
29622 || (symbol_decl != NULL_TREE
29623 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29624 return true;
29626 return false;
29629 /* Output the assembly for a call instruction. */
29631 const char *
29632 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29634 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29635 bool seh_nop_p = false;
29636 const char *xasm;
29638 if (SIBLING_CALL_P (insn))
29640 if (direct_p)
29642 if (ix86_nopic_noplt_attribute_p (call_op))
29644 if (TARGET_64BIT)
29645 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29646 else
29647 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29649 else
29650 xasm = "%!jmp\t%P0";
29652 /* SEH epilogue detection requires the indirect branch case
29653 to include REX.W. */
29654 else if (TARGET_SEH)
29655 xasm = "%!rex.W jmp\t%A0";
29656 else
29657 xasm = "%!jmp\t%A0";
29659 output_asm_insn (xasm, &call_op);
29660 return "";
29663 /* SEH unwinding can require an extra nop to be emitted in several
29664 circumstances. Determine if we have one of those. */
29665 if (TARGET_SEH)
29667 rtx_insn *i;
29669 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29671 /* If we get to another real insn, we don't need the nop. */
29672 if (INSN_P (i))
29673 break;
29675 /* If we get to the epilogue note, prevent a catch region from
29676 being adjacent to the standard epilogue sequence. If non-
29677 call-exceptions, we'll have done this during epilogue emission. */
29678 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29679 && !flag_non_call_exceptions
29680 && !can_throw_internal (insn))
29682 seh_nop_p = true;
29683 break;
29687 /* If we didn't find a real insn following the call, prevent the
29688 unwinder from looking into the next function. */
29689 if (i == NULL)
29690 seh_nop_p = true;
29693 if (direct_p)
29695 if (ix86_nopic_noplt_attribute_p (call_op))
29697 if (TARGET_64BIT)
29698 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29699 else
29700 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29702 else
29703 xasm = "%!call\t%P0";
29705 else
29706 xasm = "%!call\t%A0";
29708 output_asm_insn (xasm, &call_op);
29710 if (seh_nop_p)
29711 return "nop";
29713 return "";
29716 /* Clear stack slot assignments remembered from previous functions.
29717 This is called from INIT_EXPANDERS once before RTL is emitted for each
29718 function. */
29720 static struct machine_function *
29721 ix86_init_machine_status (void)
29723 struct machine_function *f;
29725 f = ggc_cleared_alloc<machine_function> ();
29726 f->call_abi = ix86_abi;
29728 return f;
29731 /* Return a MEM corresponding to a stack slot with mode MODE.
29732 Allocate a new slot if necessary.
29734 The RTL for a function can have several slots available: N is
29735 which slot to use. */
29738 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29740 struct stack_local_entry *s;
29742 gcc_assert (n < MAX_386_STACK_LOCALS);
29744 for (s = ix86_stack_locals; s; s = s->next)
29745 if (s->mode == mode && s->n == n)
29746 return validize_mem (copy_rtx (s->rtl));
29748 s = ggc_alloc<stack_local_entry> ();
29749 s->n = n;
29750 s->mode = mode;
29751 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29753 s->next = ix86_stack_locals;
29754 ix86_stack_locals = s;
29755 return validize_mem (copy_rtx (s->rtl));
29758 static void
29759 ix86_instantiate_decls (void)
29761 struct stack_local_entry *s;
29763 for (s = ix86_stack_locals; s; s = s->next)
29764 if (s->rtl != NULL_RTX)
29765 instantiate_decl_rtl (s->rtl);
29768 /* Return the number used for encoding REG, in the range 0..7. */
29770 static int
29771 reg_encoded_number (rtx reg)
29773 unsigned regno = REGNO (reg);
29774 switch (regno)
29776 case AX_REG:
29777 return 0;
29778 case CX_REG:
29779 return 1;
29780 case DX_REG:
29781 return 2;
29782 case BX_REG:
29783 return 3;
29784 case SP_REG:
29785 return 4;
29786 case BP_REG:
29787 return 5;
29788 case SI_REG:
29789 return 6;
29790 case DI_REG:
29791 return 7;
29792 default:
29793 break;
29795 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29796 return regno - FIRST_STACK_REG;
29797 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29798 return regno - FIRST_SSE_REG;
29799 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29800 return regno - FIRST_MMX_REG;
29801 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29802 return regno - FIRST_REX_SSE_REG;
29803 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29804 return regno - FIRST_REX_INT_REG;
29805 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29806 return regno - FIRST_MASK_REG;
29807 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29808 return regno - FIRST_BND_REG;
29809 return -1;
29812 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29813 in its encoding if it could be relevant for ROP mitigation, otherwise
29814 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29815 used for calculating it into them. */
29817 static int
29818 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29819 int *popno0 = 0, int *popno1 = 0)
29821 if (asm_noperands (PATTERN (insn)) >= 0)
29822 return -1;
29823 int has_modrm = get_attr_modrm (insn);
29824 if (!has_modrm)
29825 return -1;
29826 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29827 rtx op0, op1;
29828 switch (cls)
29830 case MODRM_CLASS_OP02:
29831 gcc_assert (noperands >= 3);
29832 if (popno0)
29834 *popno0 = 0;
29835 *popno1 = 2;
29837 op0 = operands[0];
29838 op1 = operands[2];
29839 break;
29840 case MODRM_CLASS_OP01:
29841 gcc_assert (noperands >= 2);
29842 if (popno0)
29844 *popno0 = 0;
29845 *popno1 = 1;
29847 op0 = operands[0];
29848 op1 = operands[1];
29849 break;
29850 default:
29851 return -1;
29853 if (REG_P (op0) && REG_P (op1))
29855 int enc0 = reg_encoded_number (op0);
29856 int enc1 = reg_encoded_number (op1);
29857 return 0xc0 + (enc1 << 3) + enc0;
29859 return -1;
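/* Editorial example of the computation above: if OP0 is %ecx (encoding 1)
   and OP1 is %edx (encoding 2), the reported modr/m byte is
   0xc0 + (2 << 3) + 1 = 0xd1, i.e. mod = 3 (register direct), reg = edx,
   r/m = ecx.  */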
29862 /* Check whether x86 address PARTS is a pc-relative address. */
29864 static bool
29865 rip_relative_addr_p (struct ix86_address *parts)
29867 rtx base, index, disp;
29869 base = parts->base;
29870 index = parts->index;
29871 disp = parts->disp;
29873 if (disp && !base && !index)
29875 if (TARGET_64BIT)
29877 rtx symbol = disp;
29879 if (GET_CODE (disp) == CONST)
29880 symbol = XEXP (disp, 0);
29881 if (GET_CODE (symbol) == PLUS
29882 && CONST_INT_P (XEXP (symbol, 1)))
29883 symbol = XEXP (symbol, 0);
29885 if (GET_CODE (symbol) == LABEL_REF
29886 || (GET_CODE (symbol) == SYMBOL_REF
29887 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29888 || (GET_CODE (symbol) == UNSPEC
29889 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29890 || XINT (symbol, 1) == UNSPEC_PCREL
29891 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29892 return true;
29895 return false;
29898 /* Calculate the length of the memory address in the instruction encoding.
29899 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29900 or other prefixes. We never generate addr32 prefix for LEA insn. */
29903 memory_address_length (rtx addr, bool lea)
29905 struct ix86_address parts;
29906 rtx base, index, disp;
29907 int len;
29908 int ok;
29910 if (GET_CODE (addr) == PRE_DEC
29911 || GET_CODE (addr) == POST_INC
29912 || GET_CODE (addr) == PRE_MODIFY
29913 || GET_CODE (addr) == POST_MODIFY)
29914 return 0;
29916 ok = ix86_decompose_address (addr, &parts);
29917 gcc_assert (ok);
29919 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29921 /* If this is not LEA instruction, add the length of addr32 prefix. */
29922 if (TARGET_64BIT && !lea
29923 && (SImode_address_operand (addr, VOIDmode)
29924 || (parts.base && GET_MODE (parts.base) == SImode)
29925 || (parts.index && GET_MODE (parts.index) == SImode)))
29926 len++;
29928 base = parts.base;
29929 index = parts.index;
29930 disp = parts.disp;
29932 if (base && SUBREG_P (base))
29933 base = SUBREG_REG (base);
29934 if (index && SUBREG_P (index))
29935 index = SUBREG_REG (index);
29937 gcc_assert (base == NULL_RTX || REG_P (base));
29938 gcc_assert (index == NULL_RTX || REG_P (index));
29940 /* Rule of thumb:
29941 - esp as the base always wants an index,
29942 - ebp as the base always wants a displacement,
29943 - r12 as the base always wants an index,
29944 - r13 as the base always wants a displacement. */
29946 /* Register Indirect. */
29947 if (base && !index && !disp)
29949 /* esp (for its index) and ebp (for its displacement) need
29950 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29951 code. */
29952 if (base == arg_pointer_rtx
29953 || base == frame_pointer_rtx
29954 || REGNO (base) == SP_REG
29955 || REGNO (base) == BP_REG
29956 || REGNO (base) == R12_REG
29957 || REGNO (base) == R13_REG)
29958 len++;
29961 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29962 is not disp32, but disp32(%rip), so for disp32
29963 SIB byte is needed, unless print_operand_address
29964 optimizes it into disp32(%rip) or (%rip) is implied
29965 by UNSPEC. */
29966 else if (disp && !base && !index)
29968 len += 4;
29969 if (!rip_relative_addr_p (&parts))
29970 len++;
29972 else
29974 /* Find the length of the displacement constant. */
29975 if (disp)
29977 if (base && satisfies_constraint_K (disp))
29978 len += 1;
29979 else
29980 len += 4;
29982 /* ebp always wants a displacement. Similarly r13. */
29983 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29984 len++;
29986 /* An index requires the two-byte modrm form.... */
29987 if (index
29988 /* ...like esp (or r12), which always wants an index. */
29989 || base == arg_pointer_rtx
29990 || base == frame_pointer_rtx
29991 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29992 len++;
29995 return len;
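/* Illustrative values for the length computation above (editorial note; the
   modrm byte and opcode are excluded, no segment override assumed): in
   64-bit code a plain (%rax) adds 0 bytes, (%rsp) or (%rbp) adds 1 (SIB or
   disp8), 16(%rax) adds 1 (disp8), 4096(%rax) adds 4 (disp32), and a
   rip-relative symbol(%rip) adds 4.  */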
29998 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29999 is set, expect that the insn has an 8-bit immediate alternative. */
30001 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
30003 int len = 0;
30004 int i;
30005 extract_insn_cached (insn);
30006 for (i = recog_data.n_operands - 1; i >= 0; --i)
30007 if (CONSTANT_P (recog_data.operand[i]))
30009 enum attr_mode mode = get_attr_mode (insn);
30011 gcc_assert (!len);
30012 if (shortform && CONST_INT_P (recog_data.operand[i]))
30014 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
30015 switch (mode)
30017 case MODE_QI:
30018 len = 1;
30019 continue;
30020 case MODE_HI:
30021 ival = trunc_int_for_mode (ival, HImode);
30022 break;
30023 case MODE_SI:
30024 ival = trunc_int_for_mode (ival, SImode);
30025 break;
30026 default:
30027 break;
30029 if (IN_RANGE (ival, -128, 127))
30031 len = 1;
30032 continue;
30035 switch (mode)
30037 case MODE_QI:
30038 len = 1;
30039 break;
30040 case MODE_HI:
30041 len = 2;
30042 break;
30043 case MODE_SI:
30044 len = 4;
30045 break;
30046 /* Immediates for DImode instructions are encoded
30047 as 32-bit sign-extended values. */
30048 case MODE_DI:
30049 len = 4;
30050 break;
30051 default:
30052 fatal_insn ("unknown insn mode", insn);
30055 return len;
30058 /* Compute default value for "length_address" attribute. */
30060 ix86_attr_length_address_default (rtx_insn *insn)
30062 int i;
30064 if (get_attr_type (insn) == TYPE_LEA)
30066 rtx set = PATTERN (insn), addr;
30068 if (GET_CODE (set) == PARALLEL)
30069 set = XVECEXP (set, 0, 0);
30071 gcc_assert (GET_CODE (set) == SET);
30073 addr = SET_SRC (set);
30075 return memory_address_length (addr, true);
30078 extract_insn_cached (insn);
30079 for (i = recog_data.n_operands - 1; i >= 0; --i)
30081 rtx op = recog_data.operand[i];
30082 if (MEM_P (op))
30084 constrain_operands_cached (insn, reload_completed);
30085 if (which_alternative != -1)
30087 const char *constraints = recog_data.constraints[i];
30088 int alt = which_alternative;
30090 while (*constraints == '=' || *constraints == '+')
30091 constraints++;
30092 while (alt-- > 0)
30093 while (*constraints++ != ',')
30095 /* Skip ignored operands. */
30096 if (*constraints == 'X')
30097 continue;
30100 int len = memory_address_length (XEXP (op, 0), false);
30102 /* Account for segment prefix for non-default addr spaces. */
30103 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
30104 len++;
30106 return len;
30109 return 0;
30112 /* Compute default value for "length_vex" attribute. It includes
30113 2 or 3 byte VEX prefix and 1 opcode byte. */
30116 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
30117 bool has_vex_w)
30119 int i;
30121 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX W bit
30122 requires the 3-byte VEX prefix. */
30123 if (!has_0f_opcode || has_vex_w)
30124 return 3 + 1;
30126 /* We can always use 2 byte VEX prefix in 32bit. */
30127 if (!TARGET_64BIT)
30128 return 2 + 1;
30130 extract_insn_cached (insn);
30132 for (i = recog_data.n_operands - 1; i >= 0; --i)
30133 if (REG_P (recog_data.operand[i]))
30135 /* REX.W bit uses 3 byte VEX prefix. */
30136 if (GET_MODE (recog_data.operand[i]) == DImode
30137 && GENERAL_REG_P (recog_data.operand[i]))
30138 return 3 + 1;
30140 else
30142 /* REX.X or REX.B bits use 3 byte VEX prefix. */
30143 if (MEM_P (recog_data.operand[i])
30144 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
30145 return 3 + 1;
30148 return 2 + 1;
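/* Editorial example of the default computed above: in 64-bit mode an AVX
   insn such as vaddps %xmm2, %xmm1, %xmm0 is counted with the 2-byte VEX
   prefix (2 + 1), while VEX.W, a DImode general-register operand, or a
   memory operand whose address mentions %r8..%r15 forces the 3-byte form
   (3 + 1).  */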
30151 /* Return the maximum number of instructions a cpu can issue. */
30153 static int
30154 ix86_issue_rate (void)
30156 switch (ix86_tune)
30158 case PROCESSOR_PENTIUM:
30159 case PROCESSOR_LAKEMONT:
30160 case PROCESSOR_BONNELL:
30161 case PROCESSOR_SILVERMONT:
30162 case PROCESSOR_KNL:
30163 case PROCESSOR_INTEL:
30164 case PROCESSOR_K6:
30165 case PROCESSOR_BTVER2:
30166 case PROCESSOR_PENTIUM4:
30167 case PROCESSOR_NOCONA:
30168 return 2;
30170 case PROCESSOR_PENTIUMPRO:
30171 case PROCESSOR_ATHLON:
30172 case PROCESSOR_K8:
30173 case PROCESSOR_AMDFAM10:
30174 case PROCESSOR_GENERIC:
30175 case PROCESSOR_BTVER1:
30176 return 3;
30178 case PROCESSOR_BDVER1:
30179 case PROCESSOR_BDVER2:
30180 case PROCESSOR_BDVER3:
30181 case PROCESSOR_BDVER4:
30182 case PROCESSOR_ZNVER1:
30183 case PROCESSOR_CORE2:
30184 case PROCESSOR_NEHALEM:
30185 case PROCESSOR_SANDYBRIDGE:
30186 case PROCESSOR_HASWELL:
30187 return 4;
30189 default:
30190 return 1;
30194 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
30195 set by DEP_INSN and nothing else set by DEP_INSN. */
30197 static bool
30198 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
30200 rtx set, set2;
30202 /* Simplify the test for uninteresting insns. */
30203 if (insn_type != TYPE_SETCC
30204 && insn_type != TYPE_ICMOV
30205 && insn_type != TYPE_FCMOV
30206 && insn_type != TYPE_IBR)
30207 return false;
30209 if ((set = single_set (dep_insn)) != 0)
30211 set = SET_DEST (set);
30212 set2 = NULL_RTX;
30214 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
30215 && XVECLEN (PATTERN (dep_insn), 0) == 2
30216 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
30217 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
30219 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
30220 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
30222 else
30223 return false;
30225 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
30226 return false;
30228 /* This test is true if the dependent insn reads the flags but
30229 not any other potentially set register. */
30230 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
30231 return false;
30233 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
30234 return false;
30236 return true;
30239 /* Return true iff USE_INSN has a memory address with operands set by
30240 SET_INSN. */
30242 bool
30243 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
30245 int i;
30246 extract_insn_cached (use_insn);
30247 for (i = recog_data.n_operands - 1; i >= 0; --i)
30248 if (MEM_P (recog_data.operand[i]))
30250 rtx addr = XEXP (recog_data.operand[i], 0);
30251 if (modified_in_p (addr, set_insn) != 0)
30253 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
30254 has SP based memory (unless index reg is modified in a pop). */
30255 rtx set = single_set (set_insn);
30256 if (set
30257 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
30258 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
30260 struct ix86_address parts;
30261 if (ix86_decompose_address (addr, &parts)
30262 && parts.base == stack_pointer_rtx
30263 && (parts.index == NULL_RTX
30264 || MEM_P (SET_DEST (set))
30265 || !modified_in_p (parts.index, set_insn)))
30266 return false;
30268 return true;
30270 return false;
30272 return false;
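/* Illustrative example (hypothetical insns): if SET_INSN is
     mov %ecx, %eax
   and USE_INSN loads from (%eax), the use's address depends on SET_INSN and
   this returns true (an address-generation dependence).  A push or pop
   followed by an %esp-based access is exempted by the code above.  */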
30275 /* Helper function for exact_store_load_dependency.
30276 Return true if addr is found in insn. */
30277 static bool
30278 exact_dependency_1 (rtx addr, rtx insn)
30280 enum rtx_code code;
30281 const char *format_ptr;
30282 int i, j;
30284 code = GET_CODE (insn);
30285 switch (code)
30287 case MEM:
30288 if (rtx_equal_p (addr, insn))
30289 return true;
30290 break;
30291 case REG:
30292 CASE_CONST_ANY:
30293 case SYMBOL_REF:
30294 case CODE_LABEL:
30295 case PC:
30296 case CC0:
30297 case EXPR_LIST:
30298 return false;
30299 default:
30300 break;
30303 format_ptr = GET_RTX_FORMAT (code);
30304 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30306 switch (*format_ptr++)
30308 case 'e':
30309 if (exact_dependency_1 (addr, XEXP (insn, i)))
30310 return true;
30311 break;
30312 case 'E':
30313 for (j = 0; j < XVECLEN (insn, i); j++)
30314 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30315 return true;
30316 break;
30319 return false;
30322 /* Return true if there is an exact dependency between a store and a load, i.e.
30323 the same memory address is used in both. */
30324 static bool
30325 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30327 rtx set1, set2;
30329 set1 = single_set (store);
30330 if (!set1)
30331 return false;
30332 if (!MEM_P (SET_DEST (set1)))
30333 return false;
30334 set2 = single_set (load);
30335 if (!set2)
30336 return false;
30337 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30338 return true;
30339 return false;
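/* Illustrative example (hypothetical insns): a store "mov %eax, 4(%esp)"
   followed by a load "mov 4(%esp), %ebx" uses the same memory rtx, so
   exact_dependency_1 finds the store destination inside the load source and
   the pair counts as an exact store/load dependency.  */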
30342 static int
30343 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30344 unsigned int)
30346 enum attr_type insn_type, dep_insn_type;
30347 enum attr_memory memory;
30348 rtx set, set2;
30349 int dep_insn_code_number;
30351 /* Anti and output dependencies have zero cost on all CPUs. */
30352 if (dep_type != 0)
30353 return 0;
30355 dep_insn_code_number = recog_memoized (dep_insn);
30357 /* If we can't recognize the insns, we can't really do anything. */
30358 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30359 return cost;
30361 insn_type = get_attr_type (insn);
30362 dep_insn_type = get_attr_type (dep_insn);
30364 switch (ix86_tune)
30366 case PROCESSOR_PENTIUM:
30367 case PROCESSOR_LAKEMONT:
30368 /* Address Generation Interlock adds a cycle of latency. */
30369 if (insn_type == TYPE_LEA)
30371 rtx addr = PATTERN (insn);
30373 if (GET_CODE (addr) == PARALLEL)
30374 addr = XVECEXP (addr, 0, 0);
30376 gcc_assert (GET_CODE (addr) == SET);
30378 addr = SET_SRC (addr);
30379 if (modified_in_p (addr, dep_insn))
30380 cost += 1;
30382 else if (ix86_agi_dependent (dep_insn, insn))
30383 cost += 1;
30385 /* ??? Compares pair with jump/setcc. */
30386 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30387 cost = 0;
30389 /* Floating point stores require the value to be ready one cycle earlier. */
30390 if (insn_type == TYPE_FMOV
30391 && get_attr_memory (insn) == MEMORY_STORE
30392 && !ix86_agi_dependent (dep_insn, insn))
30393 cost += 1;
30394 break;
30396 case PROCESSOR_PENTIUMPRO:
30397 /* INT->FP conversion is expensive. */
30398 if (get_attr_fp_int_src (dep_insn))
30399 cost += 5;
30401 /* There is one cycle extra latency between an FP op and a store. */
30402 if (insn_type == TYPE_FMOV
30403 && (set = single_set (dep_insn)) != NULL_RTX
30404 && (set2 = single_set (insn)) != NULL_RTX
30405 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30406 && MEM_P (SET_DEST (set2)))
30407 cost += 1;
30409 memory = get_attr_memory (insn);
30411 /* Model the reorder buffer's ability to hide the latency of a load by
30412 executing it in parallel with the previous instruction, in case the
30413 previous instruction is not needed to compute the address. */
30414 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30415 && !ix86_agi_dependent (dep_insn, insn))
30417 /* Claim moves take one cycle, as the core can issue one load
30418 at a time and the next load can start a cycle later. */
30419 if (dep_insn_type == TYPE_IMOV
30420 || dep_insn_type == TYPE_FMOV)
30421 cost = 1;
30422 else if (cost > 1)
30423 cost--;
30425 break;
30427 case PROCESSOR_K6:
30428 /* The esp dependency is resolved before
30429 the instruction is really finished. */
30430 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30431 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30432 return 1;
30434 /* INT->FP conversion is expensive. */
30435 if (get_attr_fp_int_src (dep_insn))
30436 cost += 5;
30438 memory = get_attr_memory (insn);
30440 /* Model the reorder buffer's ability to hide the latency of a load by
30441 executing it in parallel with the previous instruction, in case the
30442 previous instruction is not needed to compute the address. */
30443 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30444 && !ix86_agi_dependent (dep_insn, insn))
30447 /* Claim moves take one cycle, as the core can issue one load
30448 at a time and the next load can start a cycle later. */
30448 if (dep_insn_type == TYPE_IMOV
30449 || dep_insn_type == TYPE_FMOV)
30450 cost = 1;
30451 else if (cost > 2)
30452 cost -= 2;
30453 else
30454 cost = 1;
30456 break;
30458 case PROCESSOR_AMDFAM10:
30459 case PROCESSOR_BDVER1:
30460 case PROCESSOR_BDVER2:
30461 case PROCESSOR_BDVER3:
30462 case PROCESSOR_BDVER4:
30463 case PROCESSOR_ZNVER1:
30464 case PROCESSOR_BTVER1:
30465 case PROCESSOR_BTVER2:
30466 case PROCESSOR_GENERIC:
30467 /* The stack engine allows push and pop instructions to execute in parallel. */
30468 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30469 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30470 return 0;
30471 /* FALLTHRU */
30473 case PROCESSOR_ATHLON:
30474 case PROCESSOR_K8:
30475 memory = get_attr_memory (insn);
30477 /* Model the reorder buffer's ability to hide the latency of a load by
30478 executing it in parallel with the previous instruction, in case the
30479 previous instruction is not needed to compute the address. */
30480 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30481 && !ix86_agi_dependent (dep_insn, insn))
30483 enum attr_unit unit = get_attr_unit (insn);
30484 int loadcost = 3;
30486 /* Because of the difference between the length of integer and
30487 floating unit pipeline preparation stages, the memory operands
30488 for floating point are cheaper.
30490 ??? For Athlon the difference is most probably 2. */
30491 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30492 loadcost = 3;
30493 else
30494 loadcost = TARGET_ATHLON ? 2 : 0;
30496 if (cost >= loadcost)
30497 cost -= loadcost;
30498 else
30499 cost = 0;
30501 break;
30503 case PROCESSOR_CORE2:
30504 case PROCESSOR_NEHALEM:
30505 case PROCESSOR_SANDYBRIDGE:
30506 case PROCESSOR_HASWELL:
30507 /* The stack engine allows push and pop instructions to execute in parallel. */
30508 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30509 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30510 return 0;
30512 memory = get_attr_memory (insn);
30514 /* Model the reorder buffer's ability to hide the latency of a load by
30515 executing it in parallel with the previous instruction, in case the
30516 previous instruction is not needed to compute the address. */
30517 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30518 && !ix86_agi_dependent (dep_insn, insn))
30520 if (cost >= 4)
30521 cost -= 4;
30522 else
30523 cost = 0;
30525 break;
30527 case PROCESSOR_SILVERMONT:
30528 case PROCESSOR_KNL:
30529 case PROCESSOR_INTEL:
30530 if (!reload_completed)
30531 return cost;
30533 /* Increase cost of integer loads. */
30534 memory = get_attr_memory (dep_insn);
30535 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30537 enum attr_unit unit = get_attr_unit (dep_insn);
30538 if (unit == UNIT_INTEGER && cost == 1)
30540 if (memory == MEMORY_LOAD)
30541 cost = 3;
30542 else
30544 /* Increase cost of ld/st for short int types only
30545 because of a store-forwarding issue. */
30546 rtx set = single_set (dep_insn);
30547 if (set && (GET_MODE (SET_DEST (set)) == QImode
30548 || GET_MODE (SET_DEST (set)) == HImode))
30550 /* Increase the cost of the store/load insns if an exact
30551 dependence exists and INSN is a load. */
30552 enum attr_memory insn_memory = get_attr_memory (insn);
30553 if (insn_memory == MEMORY_LOAD
30554 && exact_store_load_dependency (dep_insn, insn))
30555 cost = 3;
30561 default:
30562 break;
30565 return cost;
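/* For illustration: under Core2..Haswell tuning, when INSN contains a load
   and DEP_INSN is not needed to compute the load address, up to 4 cycles of
   the dependence latency are subtracted above, modelling the out-of-order
   core starting the load while DEP_INSN is still in flight.  */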
30568 /* How many alternative schedules to try. This should be as wide as the
30569 scheduling freedom in the DFA, but no wider. Making this value too
30570 large results in extra work for the scheduler. */
30572 static int
30573 ia32_multipass_dfa_lookahead (void)
30575 switch (ix86_tune)
30577 case PROCESSOR_PENTIUM:
30578 case PROCESSOR_LAKEMONT:
30579 return 2;
30581 case PROCESSOR_PENTIUMPRO:
30582 case PROCESSOR_K6:
30583 return 1;
30585 case PROCESSOR_BDVER1:
30586 case PROCESSOR_BDVER2:
30587 case PROCESSOR_BDVER3:
30588 case PROCESSOR_BDVER4:
30589 /* We use lookahead value 4 for BD both before and after reload
30590 schedules. The plan is to include value 8 for -O3. */
30591 return 4;
30593 case PROCESSOR_CORE2:
30594 case PROCESSOR_NEHALEM:
30595 case PROCESSOR_SANDYBRIDGE:
30596 case PROCESSOR_HASWELL:
30597 case PROCESSOR_BONNELL:
30598 case PROCESSOR_SILVERMONT:
30599 case PROCESSOR_KNL:
30600 case PROCESSOR_INTEL:
30601 /* Generally, we want haifa-sched:max_issue() to look ahead as far
30602 as the number of instructions that can be executed in one cycle,
30603 i.e. issue_rate. It is unclear why the tuning for many CPUs does not do this. */
30604 if (reload_completed)
30605 return ix86_issue_rate ();
30606 /* Don't use lookahead for pre-reload schedule to save compile time. */
30607 return 0;
30609 default:
30610 return 0;
30614 /* Return true if target platform supports macro-fusion. */
30616 static bool
30617 ix86_macro_fusion_p ()
30619 return TARGET_FUSE_CMP_AND_BRANCH;
30622 /* Check whether the current microarchitecture supports macro fusion
30623 for insn pair "CONDGEN + CONDJMP". Refer to
30624 "Intel Architectures Optimization Reference Manual". */
30626 static bool
30627 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30629 rtx src, dest;
30630 enum rtx_code ccode;
30631 rtx compare_set = NULL_RTX, test_if, cond;
30632 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30634 if (!any_condjump_p (condjmp))
30635 return false;
30637 unsigned int condreg1, condreg2;
30638 rtx cc_reg_1;
30639 ix86_fixed_condition_code_regs (&condreg1, &condreg2);
30640 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
30641 if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
30642 || !condgen
30643 || !modified_in_p (cc_reg_1, condgen))
30644 return false;
30646 if (get_attr_type (condgen) != TYPE_TEST
30647 && get_attr_type (condgen) != TYPE_ICMP
30648 && get_attr_type (condgen) != TYPE_INCDEC
30649 && get_attr_type (condgen) != TYPE_ALU)
30650 return false;
30652 compare_set = single_set (condgen);
30653 if (compare_set == NULL_RTX
30654 && !TARGET_FUSE_ALU_AND_BRANCH)
30655 return false;
30657 if (compare_set == NULL_RTX)
30659 int i;
30660 rtx pat = PATTERN (condgen);
30661 for (i = 0; i < XVECLEN (pat, 0); i++)
30662 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30664 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30665 if (GET_CODE (set_src) == COMPARE)
30666 compare_set = XVECEXP (pat, 0, i);
30667 else
30668 alu_set = XVECEXP (pat, 0, i);
30671 if (compare_set == NULL_RTX)
30672 return false;
30673 src = SET_SRC (compare_set);
30674 if (GET_CODE (src) != COMPARE)
30675 return false;
30677 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30678 supported. */
30679 if ((MEM_P (XEXP (src, 0))
30680 && CONST_INT_P (XEXP (src, 1)))
30681 || (MEM_P (XEXP (src, 1))
30682 && CONST_INT_P (XEXP (src, 0))))
30683 return false;
30685 /* No fusion for RIP-relative address. */
30686 if (MEM_P (XEXP (src, 0)))
30687 addr = XEXP (XEXP (src, 0), 0);
30688 else if (MEM_P (XEXP (src, 1)))
30689 addr = XEXP (XEXP (src, 1), 0);
30691 if (addr) {
30692 ix86_address parts;
30693 int ok = ix86_decompose_address (addr, &parts);
30694 gcc_assert (ok);
30696 if (rip_relative_addr_p (&parts))
30697 return false;
30700 test_if = SET_SRC (pc_set (condjmp));
30701 cond = XEXP (test_if, 0);
30702 ccode = GET_CODE (cond);
30703 /* Check whether the conditional jump uses the Sign or Overflow flags. */
30704 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30705 && (ccode == GE
30706 || ccode == GT
30707 || ccode == LE
30708 || ccode == LT))
30709 return false;
30711 /* Return true for TYPE_TEST and TYPE_ICMP. */
30712 if (get_attr_type (condgen) == TYPE_TEST
30713 || get_attr_type (condgen) == TYPE_ICMP)
30714 return true;
30716 /* What follows handles the macro-fusion case for an ALU op + jmp. */
30717 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30718 return false;
30720 /* No fusion for alu op with memory destination operand. */
30721 dest = SET_DEST (alu_set);
30722 if (MEM_P (dest))
30723 return false;
30725 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30726 supported. */
30727 if (get_attr_type (condgen) == TYPE_INCDEC
30728 && (ccode == GEU
30729 || ccode == GTU
30730 || ccode == LEU
30731 || ccode == LTU))
30732 return false;
30734 return true;
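/* Illustrative examples (hypothetical insns): "test %eax, %eax" immediately
   followed by "jne .L2" is accepted as a fusible pair, while a compare of a
   memory operand against an immediate ("cmpl $1, (%rax)" + jcc) is rejected
   above, as is inc/dec paired with an unsigned branch condition such as jae.  */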
30737 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
30738 execution. It is applied if
30739 (1) an IMUL instruction is at the top of the list;
30740 (2) the ready list contains the sole producer of an independent IMUL
30741 instruction.
30742 Return the index of the IMUL producer if it was found, and -1 otherwise. */
30743 static int
30744 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30746 rtx_insn *insn;
30747 rtx set, insn1, insn2;
30748 sd_iterator_def sd_it;
30749 dep_t dep;
30750 int index = -1;
30751 int i;
30753 if (!TARGET_BONNELL)
30754 return index;
30756 /* Check that an IMUL instruction is at the top of the ready list. */
30757 insn = ready[n_ready - 1];
30758 set = single_set (insn);
30759 if (!set)
30760 return index;
30761 if (!(GET_CODE (SET_SRC (set)) == MULT
30762 && GET_MODE (SET_SRC (set)) == SImode))
30763 return index;
30765 /* Search for producer of independent IMUL instruction. */
30766 for (i = n_ready - 2; i >= 0; i--)
30768 insn = ready[i];
30769 if (!NONDEBUG_INSN_P (insn))
30770 continue;
30771 /* Skip IMUL instruction. */
30772 insn2 = PATTERN (insn);
30773 if (GET_CODE (insn2) == PARALLEL)
30774 insn2 = XVECEXP (insn2, 0, 0);
30775 if (GET_CODE (insn2) == SET
30776 && GET_CODE (SET_SRC (insn2)) == MULT
30777 && GET_MODE (SET_SRC (insn2)) == SImode)
30778 continue;
30780 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30782 rtx con;
30783 con = DEP_CON (dep);
30784 if (!NONDEBUG_INSN_P (con))
30785 continue;
30786 insn1 = PATTERN (con);
30787 if (GET_CODE (insn1) == PARALLEL)
30788 insn1 = XVECEXP (insn1, 0, 0);
30790 if (GET_CODE (insn1) == SET
30791 && GET_CODE (SET_SRC (insn1)) == MULT
30792 && GET_MODE (SET_SRC (insn1)) == SImode)
30794 sd_iterator_def sd_it1;
30795 dep_t dep1;
30796 /* Check if there is no other dependee for IMUL. */
30797 index = i;
30798 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30800 rtx pro;
30801 pro = DEP_PRO (dep1);
30802 if (!NONDEBUG_INSN_P (pro))
30803 continue;
30804 if (pro != insn)
30805 index = -1;
30807 if (index >= 0)
30808 break;
30811 if (index >= 0)
30812 break;
30814 return index;
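/* For illustration: on Bonnell, if the insn at the top of the ready list is a
   32-bit IMUL and some other ready insn is the sole producer feeding a second,
   independent IMUL, that producer's index is returned so ix86_sched_reorder
   can move it to the top and keep the pipelined multiplier busy.  */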
30817 /* Try to find the best candidate at the top of the ready list if two insns
30818 have the same priority - the best candidate is the one whose dependees were
30819 scheduled earlier. Applied for Silvermont (and Intel) tuning only.
30820 Return true if the top 2 insns must be interchanged. */
30821 static bool
30822 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30824 rtx_insn *top = ready[n_ready - 1];
30825 rtx_insn *next = ready[n_ready - 2];
30826 rtx set;
30827 sd_iterator_def sd_it;
30828 dep_t dep;
30829 int clock1 = -1;
30830 int clock2 = -1;
30831 #define INSN_TICK(INSN) (HID (INSN)->tick)
30833 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30834 return false;
30836 if (!NONDEBUG_INSN_P (top))
30837 return false;
30838 if (!NONJUMP_INSN_P (top))
30839 return false;
30840 if (!NONDEBUG_INSN_P (next))
30841 return false;
30842 if (!NONJUMP_INSN_P (next))
30843 return false;
30844 set = single_set (top);
30845 if (!set)
30846 return false;
30847 set = single_set (next);
30848 if (!set)
30849 return false;
30851 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30853 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30854 return false;
30855 /* Determine the winner more precisely. */
30856 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30858 rtx pro;
30859 pro = DEP_PRO (dep);
30860 if (!NONDEBUG_INSN_P (pro))
30861 continue;
30862 if (INSN_TICK (pro) > clock1)
30863 clock1 = INSN_TICK (pro);
30865 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30867 rtx pro;
30868 pro = DEP_PRO (dep);
30869 if (!NONDEBUG_INSN_P (pro))
30870 continue;
30871 if (INSN_TICK (pro) > clock2)
30872 clock2 = INSN_TICK (pro);
30875 if (clock1 == clock2)
30877 /* Determine winner - load must win. */
30878 enum attr_memory memory1, memory2;
30879 memory1 = get_attr_memory (top);
30880 memory2 = get_attr_memory (next);
30881 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
30882 return true;
30884 return (bool) (clock2 < clock1);
30886 return false;
30887 #undef INSN_TICK
30890 /* Perform possible reordering of the ready list for Atom/Silvermont only.
30891 Return issue rate. */
30892 static int
30893 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
30894 int *pn_ready, int clock_var)
30896 int issue_rate = -1;
30897 int n_ready = *pn_ready;
30898 int i;
30899 rtx_insn *insn;
30900 int index = -1;
30902 /* Set up issue rate. */
30903 issue_rate = ix86_issue_rate ();
30905 /* Do reordering for BONNELL/SILVERMONT only. */
30906 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
30907 return issue_rate;
30909 /* Nothing to do if ready list contains only 1 instruction. */
30910 if (n_ready <= 1)
30911 return issue_rate;
30913 /* Do reordering for the post-reload scheduler only. */
30914 if (!reload_completed)
30915 return issue_rate;
30917 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
30919 if (sched_verbose > 1)
30920 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
30921 INSN_UID (ready[index]));
30923 /* Put IMUL producer (ready[index]) at the top of ready list. */
30924 insn = ready[index];
30925 for (i = index; i < n_ready - 1; i++)
30926 ready[i] = ready[i + 1];
30927 ready[n_ready - 1] = insn;
30928 return issue_rate;
30931 /* Skip selective scheduling since HID is not populated in it. */
30932 if (clock_var != 0
30933 && !sel_sched_p ()
30934 && swap_top_of_ready_list (ready, n_ready))
30936 if (sched_verbose > 1)
30937 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
30938 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
30939 /* Swap 2 top elements of ready list. */
30940 insn = ready[n_ready - 1];
30941 ready[n_ready - 1] = ready[n_ready - 2];
30942 ready[n_ready - 2] = insn;
30944 return issue_rate;
30947 static bool
30948 ix86_class_likely_spilled_p (reg_class_t);
30950 /* Return true if the lhs of INSN is a HW function argument register; set
30951 *is_spilled to true if it is a likely-spilled HW register. */
30952 static bool
30953 insn_is_function_arg (rtx insn, bool* is_spilled)
30955 rtx dst;
30957 if (!NONDEBUG_INSN_P (insn))
30958 return false;
30959 /* Call instructions are not movable; ignore them. */
30960 if (CALL_P (insn))
30961 return false;
30962 insn = PATTERN (insn);
30963 if (GET_CODE (insn) == PARALLEL)
30964 insn = XVECEXP (insn, 0, 0);
30965 if (GET_CODE (insn) != SET)
30966 return false;
30967 dst = SET_DEST (insn);
30968 if (REG_P (dst) && HARD_REGISTER_P (dst)
30969 && ix86_function_arg_regno_p (REGNO (dst)))
30971 /* Is it likely spilled HW register? */
30972 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
30973 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
30974 *is_spilled = true;
30975 return true;
30977 return false;
30980 /* Add output dependencies for a chain of adjacent function arguments, but only
30981 if there is a move to a likely-spilled HW register. Return the first argument
30982 if at least one dependence was added, or NULL otherwise. */
30983 static rtx_insn *
30984 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
30986 rtx_insn *insn;
30987 rtx_insn *last = call;
30988 rtx_insn *first_arg = NULL;
30989 bool is_spilled = false;
30991 head = PREV_INSN (head);
30993 /* Find the argument-passing instruction nearest to the call. */
30994 while (true)
30996 last = PREV_INSN (last);
30997 if (last == head)
30998 return NULL;
30999 if (!NONDEBUG_INSN_P (last))
31000 continue;
31001 if (insn_is_function_arg (last, &is_spilled))
31002 break;
31003 return NULL;
31006 first_arg = last;
31007 while (true)
31009 insn = PREV_INSN (last);
31010 if (!INSN_P (insn))
31011 break;
31012 if (insn == head)
31013 break;
31014 if (!NONDEBUG_INSN_P (insn))
31016 last = insn;
31017 continue;
31019 if (insn_is_function_arg (insn, &is_spilled))
31021 /* Add an output dependence between two function arguments if the chain
31022 of output arguments contains likely-spilled HW registers. */
31023 if (is_spilled)
31024 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
31025 first_arg = last = insn;
31027 else
31028 break;
31030 if (!is_spilled)
31031 return NULL;
31032 return first_arg;
31035 /* Add output or anti dependency from insn to first_arg to restrict its code
31036 motion. */
31037 static void
31038 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
31040 rtx set;
31041 rtx tmp;
31043 /* Add anti dependencies for bounds stores. */
31044 if (INSN_P (insn)
31045 && GET_CODE (PATTERN (insn)) == PARALLEL
31046 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
31047 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
31049 add_dependence (first_arg, insn, REG_DEP_ANTI);
31050 return;
31053 set = single_set (insn);
31054 if (!set)
31055 return;
31056 tmp = SET_DEST (set);
31057 if (REG_P (tmp))
31059 /* Add output dependency to the first function argument. */
31060 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
31061 return;
31063 /* Add anti dependency. */
31064 add_dependence (first_arg, insn, REG_DEP_ANTI);
31067 /* Avoid cross-block motion of a function argument by adding a dependency
31068 from the first non-jump instruction in bb. */
31069 static void
31070 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
31072 rtx_insn *insn = BB_END (bb);
31074 while (insn)
31076 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
31078 rtx set = single_set (insn);
31079 if (set)
31081 avoid_func_arg_motion (arg, insn);
31082 return;
31085 if (insn == BB_HEAD (bb))
31086 return;
31087 insn = PREV_INSN (insn);
31091 /* Hook for pre-reload schedule - avoid motion of function arguments
31092 passed in likely spilled HW registers. */
31093 static void
31094 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
31096 rtx_insn *insn;
31097 rtx_insn *first_arg = NULL;
31098 if (reload_completed)
31099 return;
31100 while (head != tail && DEBUG_INSN_P (head))
31101 head = NEXT_INSN (head);
31102 for (insn = tail; insn != head; insn = PREV_INSN (insn))
31103 if (INSN_P (insn) && CALL_P (insn))
31105 first_arg = add_parameter_dependencies (insn, head);
31106 if (first_arg)
31108 /* Add a dependee for the first argument to predecessors, but only if the
31109 region contains more than one block. */
31110 basic_block bb = BLOCK_FOR_INSN (insn);
31111 int rgn = CONTAINING_RGN (bb->index);
31112 int nr_blks = RGN_NR_BLOCKS (rgn);
31113 /* Skip trivial regions and region head blocks that can have
31114 predecessors outside of region. */
31115 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
31117 edge e;
31118 edge_iterator ei;
31120 /* Regions are SCCs with the exception of selective
31121 scheduling with pipelining of outer blocks enabled.
31122 So also check that immediate predecessors of a non-head
31123 block are in the same region. */
31124 FOR_EACH_EDGE (e, ei, bb->preds)
31126 /* Avoid creating loop-carried dependencies by
31127 using the topological ordering in the region. */
31128 if (rgn == CONTAINING_RGN (e->src->index)
31129 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
31130 add_dependee_for_func_arg (first_arg, e->src);
31133 insn = first_arg;
31134 if (insn == head)
31135 break;
31138 else if (first_arg)
31139 avoid_func_arg_motion (first_arg, insn);
31142 /* Hook for pre-reload schedule - set priority of moves from likely spilled
31143 HW registers to the maximum, to schedule them as soon as possible. These are
31144 moves from function argument registers at the top of the function entry
31145 and moves from function return value registers after a call. */
31146 static int
31147 ix86_adjust_priority (rtx_insn *insn, int priority)
31149 rtx set;
31151 if (reload_completed)
31152 return priority;
31154 if (!NONDEBUG_INSN_P (insn))
31155 return priority;
31157 set = single_set (insn);
31158 if (set)
31160 rtx tmp = SET_SRC (set);
31161 if (REG_P (tmp)
31162 && HARD_REGISTER_P (tmp)
31163 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
31164 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
31165 return current_sched_info->sched_max_insns_priority;
31168 return priority;
31171 /* Model the decoder of Core 2/i7.
31172 The hooks below, for multipass scheduling (see haifa-sched.c:max_issue),
31173 track the instruction fetch block boundaries and make sure that long
31174 (9+ bytes) instructions are assigned to D0. */
31176 /* Maximum length of an insn that can be handled by
31177 a secondary decoder unit. '8' for Core 2/i7. */
31178 static int core2i7_secondary_decoder_max_insn_size;
31180 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
31181 '16' for Core 2/i7. */
31182 static int core2i7_ifetch_block_size;
31184 /* Maximum number of instructions decoder can handle per cycle.
31185 '6' for Core 2/i7. */
31186 static int core2i7_ifetch_block_max_insns;
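/* ix86_sched_init_global below sets these three parameters to 8, 16 and 6
   respectively, i.e. the model assumes a 16-byte fetch block, at most 6 insns
   decoded per cycle, and secondary decoders limited to insns of at most
   8 bytes; longer insns must be handled by the first (complex) decoder.  */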
31188 typedef struct ix86_first_cycle_multipass_data_ *
31189 ix86_first_cycle_multipass_data_t;
31190 typedef const struct ix86_first_cycle_multipass_data_ *
31191 const_ix86_first_cycle_multipass_data_t;
31193 /* A variable to store target state across calls to max_issue within
31194 one cycle. */
31195 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
31196 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
31198 /* Initialize DATA. */
31199 static void
31200 core2i7_first_cycle_multipass_init (void *_data)
31202 ix86_first_cycle_multipass_data_t data
31203 = (ix86_first_cycle_multipass_data_t) _data;
31205 data->ifetch_block_len = 0;
31206 data->ifetch_block_n_insns = 0;
31207 data->ready_try_change = NULL;
31208 data->ready_try_change_size = 0;
31211 /* Advancing the cycle; reset ifetch block counts. */
31212 static void
31213 core2i7_dfa_post_advance_cycle (void)
31215 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
31217 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31219 data->ifetch_block_len = 0;
31220 data->ifetch_block_n_insns = 0;
31223 static int min_insn_size (rtx_insn *);
31225 /* Filter out insns from ready_try that the core will not be able to issue
31226 on the current cycle due to decoder restrictions. */
31227 static void
31228 core2i7_first_cycle_multipass_filter_ready_try
31229 (const_ix86_first_cycle_multipass_data_t data,
31230 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
31232 while (n_ready--)
31234 rtx_insn *insn;
31235 int insn_size;
31237 if (ready_try[n_ready])
31238 continue;
31240 insn = get_ready_element (n_ready);
31241 insn_size = min_insn_size (insn);
31243 if (/* If this is too long an insn for a secondary decoder ... */
31244 (!first_cycle_insn_p
31245 && insn_size > core2i7_secondary_decoder_max_insn_size)
31246 /* ... or it would not fit into the ifetch block ... */
31247 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
31248 /* ... or the decoder is full already ... */
31249 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
31250 /* ... mask the insn out. */
31252 ready_try[n_ready] = 1;
31254 if (data->ready_try_change)
31255 bitmap_set_bit (data->ready_try_change, n_ready);
31260 /* Prepare for a new round of multipass lookahead scheduling. */
31261 static void
31262 core2i7_first_cycle_multipass_begin (void *_data,
31263 signed char *ready_try, int n_ready,
31264 bool first_cycle_insn_p)
31266 ix86_first_cycle_multipass_data_t data
31267 = (ix86_first_cycle_multipass_data_t) _data;
31268 const_ix86_first_cycle_multipass_data_t prev_data
31269 = ix86_first_cycle_multipass_data;
31271 /* Restore the state from the end of the previous round. */
31272 data->ifetch_block_len = prev_data->ifetch_block_len;
31273 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
31275 /* Filter instructions that cannot be issued on current cycle due to
31276 decoder restrictions. */
31277 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31278 first_cycle_insn_p);
31281 /* INSN is being issued in current solution. Account for its impact on
31282 the decoder model. */
31283 static void
31284 core2i7_first_cycle_multipass_issue (void *_data,
31285 signed char *ready_try, int n_ready,
31286 rtx_insn *insn, const void *_prev_data)
31288 ix86_first_cycle_multipass_data_t data
31289 = (ix86_first_cycle_multipass_data_t) _data;
31290 const_ix86_first_cycle_multipass_data_t prev_data
31291 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
31293 int insn_size = min_insn_size (insn);
31295 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
31296 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
31297 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31298 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31300 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31301 if (!data->ready_try_change)
31303 data->ready_try_change = sbitmap_alloc (n_ready);
31304 data->ready_try_change_size = n_ready;
31306 else if (data->ready_try_change_size < n_ready)
31308 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31309 n_ready, 0);
31310 data->ready_try_change_size = n_ready;
31312 bitmap_clear (data->ready_try_change);
31314 /* Filter out insns from ready_try that the core will not be able to issue
31315 on the current cycle due to decoder restrictions. */
31316 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31317 false);
31320 /* Revert the effect on ready_try. */
31321 static void
31322 core2i7_first_cycle_multipass_backtrack (const void *_data,
31323 signed char *ready_try,
31324 int n_ready ATTRIBUTE_UNUSED)
31326 const_ix86_first_cycle_multipass_data_t data
31327 = (const_ix86_first_cycle_multipass_data_t) _data;
31328 unsigned int i = 0;
31329 sbitmap_iterator sbi;
31331 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31332 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31334 ready_try[i] = 0;
31338 /* Save the result of multipass lookahead scheduling for the next round. */
31339 static void
31340 core2i7_first_cycle_multipass_end (const void *_data)
31342 const_ix86_first_cycle_multipass_data_t data
31343 = (const_ix86_first_cycle_multipass_data_t) _data;
31344 ix86_first_cycle_multipass_data_t next_data
31345 = ix86_first_cycle_multipass_data;
31347 if (data != NULL)
31349 next_data->ifetch_block_len = data->ifetch_block_len;
31350 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31354 /* Deallocate target data. */
31355 static void
31356 core2i7_first_cycle_multipass_fini (void *_data)
31358 ix86_first_cycle_multipass_data_t data
31359 = (ix86_first_cycle_multipass_data_t) _data;
31361 if (data->ready_try_change)
31363 sbitmap_free (data->ready_try_change);
31364 data->ready_try_change = NULL;
31365 data->ready_try_change_size = 0;
31369 /* Prepare for scheduling pass. */
31370 static void
31371 ix86_sched_init_global (FILE *, int, int)
31373 /* Install scheduling hooks for current CPU. Some of these hooks are used
31374 in time-critical parts of the scheduler, so we only set them up when
31375 they are actually used. */
31376 switch (ix86_tune)
31378 case PROCESSOR_CORE2:
31379 case PROCESSOR_NEHALEM:
31380 case PROCESSOR_SANDYBRIDGE:
31381 case PROCESSOR_HASWELL:
31382 /* Do not perform multipass scheduling for pre-reload schedule
31383 to save compile time. */
31384 if (reload_completed)
31386 targetm.sched.dfa_post_advance_cycle
31387 = core2i7_dfa_post_advance_cycle;
31388 targetm.sched.first_cycle_multipass_init
31389 = core2i7_first_cycle_multipass_init;
31390 targetm.sched.first_cycle_multipass_begin
31391 = core2i7_first_cycle_multipass_begin;
31392 targetm.sched.first_cycle_multipass_issue
31393 = core2i7_first_cycle_multipass_issue;
31394 targetm.sched.first_cycle_multipass_backtrack
31395 = core2i7_first_cycle_multipass_backtrack;
31396 targetm.sched.first_cycle_multipass_end
31397 = core2i7_first_cycle_multipass_end;
31398 targetm.sched.first_cycle_multipass_fini
31399 = core2i7_first_cycle_multipass_fini;
31401 /* Set decoder parameters. */
31402 core2i7_secondary_decoder_max_insn_size = 8;
31403 core2i7_ifetch_block_size = 16;
31404 core2i7_ifetch_block_max_insns = 6;
31405 break;
31407 /* Fall through. */
31408 default:
31409 targetm.sched.dfa_post_advance_cycle = NULL;
31410 targetm.sched.first_cycle_multipass_init = NULL;
31411 targetm.sched.first_cycle_multipass_begin = NULL;
31412 targetm.sched.first_cycle_multipass_issue = NULL;
31413 targetm.sched.first_cycle_multipass_backtrack = NULL;
31414 targetm.sched.first_cycle_multipass_end = NULL;
31415 targetm.sched.first_cycle_multipass_fini = NULL;
31416 break;
31421 /* Compute the alignment given to a constant that is being placed in memory.
31422 EXP is the constant and ALIGN is the alignment that the object would
31423 ordinarily have.
31424 The value of this function is used instead of that alignment to align
31425 the object. */
31428 ix86_constant_alignment (tree exp, int align)
31430 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31431 || TREE_CODE (exp) == INTEGER_CST)
31433 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31434 return 64;
31435 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31436 return 128;
31438 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31439 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31440 return BITS_PER_WORD;
31442 return align;
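/* For illustration: a DFmode (double) constant placed in memory is given
   64-bit alignment even when the type requires less, 128-bit modes get
   128-bit alignment, and (unless optimizing for size) string constants of
   31 bytes or more are aligned to a word boundary.  */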
31445 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31446 the data type, and ALIGN is the alignment that the object would
31447 ordinarily have. */
31449 static int
31450 iamcu_alignment (tree type, int align)
31452 machine_mode mode;
31454 if (align < 32 || TYPE_USER_ALIGN (type))
31455 return align;
31457 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
31458 bytes. */
31459 mode = TYPE_MODE (strip_array_types (type));
31460 switch (GET_MODE_CLASS (mode))
31462 case MODE_INT:
31463 case MODE_COMPLEX_INT:
31464 case MODE_COMPLEX_FLOAT:
31465 case MODE_FLOAT:
31466 case MODE_DECIMAL_FLOAT:
31467 return 32;
31468 default:
31469 return align;
31473 /* Compute the alignment for a static variable.
31474 TYPE is the data type, and ALIGN is the alignment that
31475 the object would ordinarily have. The value of this function is used
31476 instead of that alignment to align the object. */
31479 ix86_data_alignment (tree type, int align, bool opt)
31481 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31482 for symbols from other compilation units or symbols that don't need
31483 to bind locally. In order to preserve some ABI compatibility with
31484 those compilers, ensure we don't decrease alignment from what we
31485 used to assume. */
31487 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31489 /* A data structure equal to or greater than the size of a cache line
31490 (64 bytes in the Pentium 4 and other recent Intel processors, including
31491 processors based on Intel Core microarchitecture) should be aligned
31492 so that its base address is a multiple of a cache line size. */
31494 int max_align
31495 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31497 if (max_align < BITS_PER_WORD)
31498 max_align = BITS_PER_WORD;
31500 switch (ix86_align_data_type)
31502 case ix86_align_data_type_abi: opt = false; break;
31503 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31504 case ix86_align_data_type_cacheline: break;
31507 if (TARGET_IAMCU)
31508 align = iamcu_alignment (type, align);
31510 if (opt
31511 && AGGREGATE_TYPE_P (type)
31512 && TYPE_SIZE (type)
31513 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31515 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31516 && align < max_align_compat)
31517 align = max_align_compat;
31518 if (wi::geu_p (TYPE_SIZE (type), max_align)
31519 && align < max_align)
31520 align = max_align;
31523 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
31524 to a 16-byte boundary. */
31525 if (TARGET_64BIT)
31527 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31528 && TYPE_SIZE (type)
31529 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31530 && wi::geu_p (TYPE_SIZE (type), 128)
31531 && align < 128)
31532 return 128;
31535 if (!opt)
31536 return align;
31538 if (TREE_CODE (type) == ARRAY_TYPE)
31540 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31541 return 64;
31542 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31543 return 128;
31545 else if (TREE_CODE (type) == COMPLEX_TYPE)
31548 if (TYPE_MODE (type) == DCmode && align < 64)
31549 return 64;
31550 if ((TYPE_MODE (type) == XCmode
31551 || TYPE_MODE (type) == TCmode) && align < 128)
31552 return 128;
31554 else if ((TREE_CODE (type) == RECORD_TYPE
31555 || TREE_CODE (type) == UNION_TYPE
31556 || TREE_CODE (type) == QUAL_UNION_TYPE)
31557 && TYPE_FIELDS (type))
31559 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31560 return 64;
31561 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31562 return 128;
31564 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31565 || TREE_CODE (type) == INTEGER_TYPE)
31567 if (TYPE_MODE (type) == DFmode && align < 64)
31568 return 64;
31569 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31570 return 128;
31573 return align;
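/* For illustration: with the default cache-line policy a sufficiently large
   aggregate (say a 4 KiB array) may be aligned to
   ix86_tune_cost->prefetch_block * 8 bits (typically 512, i.e. a 64-byte
   cache line), and on x86-64 any array of 16 bytes or more gets at least
   16-byte alignment as required by the psABI.  */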
31576 /* Compute the alignment for a local variable or a stack slot. EXP is
31577 the data type or decl itself, MODE is the widest mode available and
31578 ALIGN is the alignment that the object would ordinarily have. The
31579 value of this macro is used instead of that alignment to align the
31580 object. */
31582 unsigned int
31583 ix86_local_alignment (tree exp, machine_mode mode,
31584 unsigned int align)
31586 tree type, decl;
31588 if (exp && DECL_P (exp))
31590 type = TREE_TYPE (exp);
31591 decl = exp;
31593 else
31595 type = exp;
31596 decl = NULL;
31599 /* Don't do dynamic stack realignment for long long objects with
31600 -mpreferred-stack-boundary=2. */
31601 if (!TARGET_64BIT
31602 && align == 64
31603 && ix86_preferred_stack_boundary < 64
31604 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31605 && (!type || !TYPE_USER_ALIGN (type))
31606 && (!decl || !DECL_USER_ALIGN (decl)))
31607 align = 32;
31609 /* If TYPE is NULL, we are allocating a stack slot for caller-save
31610 register in MODE. We will return the largest alignment of XF
31611 and DF. */
31612 if (!type)
31614 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31615 align = GET_MODE_ALIGNMENT (DFmode);
31616 return align;
31619 /* Don't increase alignment for Intel MCU psABI. */
31620 if (TARGET_IAMCU)
31621 return align;
31623 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
31624 to a 16-byte boundary. The exact wording is:
31626 An array uses the same alignment as its elements, except that a local or
31627 global array variable of length at least 16 bytes or
31628 a C99 variable-length array variable always has alignment of at least 16 bytes.
31630 This was added to allow the use of aligned SSE instructions on arrays. The
31631 rule is meant for static storage (where the compiler cannot do the analysis
31632 by itself). We follow it for automatic variables only when convenient.
31633 We fully control everything in the function being compiled, and functions
31634 from other units cannot rely on the alignment.
31636 Exclude the va_list type. It is the common case of a local array where
31637 we cannot benefit from the alignment.
31639 TODO: Probably one should optimize for size only when the variable does not escape. */
31640 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31641 && TARGET_SSE)
31643 if (AGGREGATE_TYPE_P (type)
31644 && (va_list_type_node == NULL_TREE
31645 || (TYPE_MAIN_VARIANT (type)
31646 != TYPE_MAIN_VARIANT (va_list_type_node)))
31647 && TYPE_SIZE (type)
31648 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31649 && wi::geu_p (TYPE_SIZE (type), 128)
31650 && align < 128)
31651 return 128;
31653 if (TREE_CODE (type) == ARRAY_TYPE)
31655 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31656 return 64;
31657 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31658 return 128;
31660 else if (TREE_CODE (type) == COMPLEX_TYPE)
31662 if (TYPE_MODE (type) == DCmode && align < 64)
31663 return 64;
31664 if ((TYPE_MODE (type) == XCmode
31665 || TYPE_MODE (type) == TCmode) && align < 128)
31666 return 128;
31668 else if ((TREE_CODE (type) == RECORD_TYPE
31669 || TREE_CODE (type) == UNION_TYPE
31670 || TREE_CODE (type) == QUAL_UNION_TYPE)
31671 && TYPE_FIELDS (type))
31673 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31674 return 64;
31675 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31676 return 128;
31678 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31679 || TREE_CODE (type) == INTEGER_TYPE)
31682 if (TYPE_MODE (type) == DFmode && align < 64)
31683 return 64;
31684 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31685 return 128;
31687 return align;
31690 /* Compute the minimum required alignment for dynamic stack realignment
31691 purposes for a local variable, parameter or a stack slot. EXP is
31692 the data type or decl itself, MODE is its mode and ALIGN is the
31693 alignment that the object would ordinarily have. */
31695 unsigned int
31696 ix86_minimum_alignment (tree exp, machine_mode mode,
31697 unsigned int align)
31699 tree type, decl;
31701 if (exp && DECL_P (exp))
31703 type = TREE_TYPE (exp);
31704 decl = exp;
31706 else
31708 type = exp;
31709 decl = NULL;
31712 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31713 return align;
31715 /* Don't do dynamic stack realignment for long long objects with
31716 -mpreferred-stack-boundary=2. */
31717 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31718 && (!type || !TYPE_USER_ALIGN (type))
31719 && (!decl || !DECL_USER_ALIGN (decl)))
31721 gcc_checking_assert (!TARGET_STV);
31722 return 32;
31725 return align;
31728 /* Find a location for the static chain incoming to a nested function.
31729 This is a register, unless all free registers are used by arguments. */
31731 static rtx
31732 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31734 unsigned regno;
31736 /* While this function won't be called by the middle-end when a static
31737 chain isn't needed, it's also used throughout the backend so it's
31738 easiest to keep this check centralized. */
31739 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31740 return NULL;
31742 if (TARGET_64BIT)
31744 /* We always use R10 in 64-bit mode. */
31745 regno = R10_REG;
31747 else
31749 const_tree fntype, fndecl;
31750 unsigned int ccvt;
31752 /* By default in 32-bit mode we use ECX to pass the static chain. */
31753 regno = CX_REG;
31755 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31757 fntype = TREE_TYPE (fndecl_or_type);
31758 fndecl = fndecl_or_type;
31760 else
31762 fntype = fndecl_or_type;
31763 fndecl = NULL;
31766 ccvt = ix86_get_callcvt (fntype);
31767 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31769 /* Fastcall functions use ecx/edx for arguments, which leaves
31770 us with EAX for the static chain.
31771 Thiscall functions use ecx for arguments, which also
31772 leaves us with EAX for the static chain. */
31773 regno = AX_REG;
31775 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31777 /* Thiscall functions use ecx for arguments, which leaves
31778 us with EAX and EDX for the static chain.
31779 For ABI compatibility we use EAX. */
31780 regno = AX_REG;
31782 else if (ix86_function_regparm (fntype, fndecl) == 3)
31784 /* For regparm 3, we have no free call-clobbered registers in
31785 which to store the static chain. In order to implement this,
31786 we have the trampoline push the static chain to the stack.
31787 However, we can't push a value below the return address when
31788 we call the nested function directly, so we have to use an
31789 alternate entry point. For this we use ESI, and have the
31790 alternate entry point push ESI, so that things appear the
31791 same once we're executing the nested function. */
31792 if (incoming_p)
31794 if (fndecl == current_function_decl
31795 && !ix86_static_chain_on_stack)
31797 gcc_assert (!reload_completed);
31798 ix86_static_chain_on_stack = true;
31800 return gen_frame_mem (SImode,
31801 plus_constant (Pmode,
31802 arg_pointer_rtx, -8));
31804 regno = SI_REG;
31808 return gen_rtx_REG (Pmode, regno);
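/* For illustration: a 64-bit nested function receives its static chain in
   %r10; a 32-bit one normally uses %ecx, fastcall/thiscall functions fall
   back to %eax, and with regparm(3) the chain is passed on the stack via the
   trampoline as described above.  */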
31811 /* Emit RTL insns to initialize the variable parts of a trampoline.
31812 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31813 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31814 to be passed to the target function. */
31816 static void
31817 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31819 rtx mem, fnaddr;
31820 int opcode;
31821 int offset = 0;
31823 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31825 if (TARGET_64BIT)
31827 int size;
31829 /* Load the function address to r11. Try to load address using
31830 the shorter movl instead of movabs. We may want to support
31831 movq for kernel mode, but kernel does not use trampolines at
31832 the moment. FNADDR is a 32bit address and may not be in
31833 DImode when ptr_mode == SImode. Always use movl in this
31834 case. */
31835 if (ptr_mode == SImode
31836 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31838 fnaddr = copy_addr_to_reg (fnaddr);
31840 mem = adjust_address (m_tramp, HImode, offset);
31841 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31843 mem = adjust_address (m_tramp, SImode, offset + 2);
31844 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31845 offset += 6;
31847 else
31849 mem = adjust_address (m_tramp, HImode, offset);
31850 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31852 mem = adjust_address (m_tramp, DImode, offset + 2);
31853 emit_move_insn (mem, fnaddr);
31854 offset += 10;
31857 /* Load static chain using movabs to r10. Use the shorter movl
31858 instead of movabs when ptr_mode == SImode. */
31859 if (ptr_mode == SImode)
31861 opcode = 0xba41;
31862 size = 6;
31864 else
31866 opcode = 0xba49;
31867 size = 10;
31870 mem = adjust_address (m_tramp, HImode, offset);
31871 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31873 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31874 emit_move_insn (mem, chain_value);
31875 offset += size;
31877 /* Jump to r11; the last (unused) byte is a nop, only there to
31878 pad the write out to a single 32-bit store. */
31879 mem = adjust_address (m_tramp, SImode, offset);
31880 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
31881 offset += 4;
31883 else
31885 rtx disp, chain;
31887 /* Depending on the static chain location, either load a register
31888 with a constant, or push the constant to the stack. All of the
31889 instructions are the same size. */
31890 chain = ix86_static_chain (fndecl, true);
31891 if (REG_P (chain))
31893 switch (REGNO (chain))
31895 case AX_REG:
31896 opcode = 0xb8; break;
31897 case CX_REG:
31898 opcode = 0xb9; break;
31899 default:
31900 gcc_unreachable ();
31903 else
31904 opcode = 0x68;
31906 mem = adjust_address (m_tramp, QImode, offset);
31907 emit_move_insn (mem, gen_int_mode (opcode, QImode));
31909 mem = adjust_address (m_tramp, SImode, offset + 1);
31910 emit_move_insn (mem, chain_value);
31911 offset += 5;
31913 mem = adjust_address (m_tramp, QImode, offset);
31914 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
31916 mem = adjust_address (m_tramp, SImode, offset + 1);
31918 /* Compute offset from the end of the jmp to the target function.
31919 In the case in which the trampoline stores the static chain on
31920 the stack, we need to skip the first insn which pushes the
31921 (call-saved) register static chain; this push is 1 byte. */
31922 offset += 5;
31923 disp = expand_binop (SImode, sub_optab, fnaddr,
31924 plus_constant (Pmode, XEXP (m_tramp, 0),
31925 offset - (MEM_P (chain) ? 1 : 0)),
31926 NULL_RTX, 1, OPTAB_DIRECT);
31927 emit_move_insn (mem, disp);
31930 gcc_assert (offset <= TRAMPOLINE_SIZE);
31932 #ifdef HAVE_ENABLE_EXECUTE_STACK
31933 #ifdef CHECK_EXECUTE_STACK_ENABLED
31934 if (CHECK_EXECUTE_STACK_ENABLED)
31935 #endif
31936 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
31937 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
31938 #endif
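/* For illustration, the common 64-bit trampoline written above decodes
   roughly as (with an 8-byte target address and static chain):
     49 bb <fnaddr>   movabs $fnaddr, %r11
     49 ba <chain>    movabs $chain,  %r10
     49 ff e3         jmp    *%r11
     90               nop (padding to a full 32-bit store)
   matching the 0xbb49, 0xba49 and 0x90e3ff49 constants emitted above.  */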
31941 static bool
31942 ix86_allocate_stack_slots_for_args (void)
31944 /* Naked functions should not allocate stack slots for arguments. */
31945 return !ix86_function_naked (current_function_decl);
31948 static bool
31949 ix86_warn_func_return (tree decl)
31951 /* Naked functions are implemented entirely in assembly, including the
31952 return sequence, so suppress warnings about this. */
31953 return !ix86_function_naked (decl);
31956 /* The following file contains several enumerations and data structures
31957 built from the definitions in i386-builtin-types.def. */
31959 #include "i386-builtin-types.inc"
31961 /* Table for the ix86 builtin non-function types. */
31962 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
31964 /* Retrieve an element from the above table, building some of
31965 the types lazily. */
31967 static tree
31968 ix86_get_builtin_type (enum ix86_builtin_type tcode)
31970 unsigned int index;
31971 tree type, itype;
31973 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
31975 type = ix86_builtin_type_tab[(int) tcode];
31976 if (type != NULL)
31977 return type;
31979 gcc_assert (tcode > IX86_BT_LAST_PRIM);
31980 if (tcode <= IX86_BT_LAST_VECT)
31982 machine_mode mode;
31984 index = tcode - IX86_BT_LAST_PRIM - 1;
31985 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
31986 mode = ix86_builtin_type_vect_mode[index];
31988 type = build_vector_type_for_mode (itype, mode);
31990 else
31992 int quals;
31994 index = tcode - IX86_BT_LAST_VECT - 1;
31995 if (tcode <= IX86_BT_LAST_PTR)
31996 quals = TYPE_UNQUALIFIED;
31997 else
31998 quals = TYPE_QUAL_CONST;
32000 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
32001 if (quals != TYPE_UNQUALIFIED)
32002 itype = build_qualified_type (itype, quals);
32004 type = build_pointer_type (itype);
32007 ix86_builtin_type_tab[(int) tcode] = type;
32008 return type;
32011 /* Table for the ix86 builtin function types. */
32012 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
32014 /* Retrieve an element from the above table, building some of
32015 the types lazily. */
32017 static tree
32018 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
32020 tree type;
32022 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
32024 type = ix86_builtin_func_type_tab[(int) tcode];
32025 if (type != NULL)
32026 return type;
32028 if (tcode <= IX86_BT_LAST_FUNC)
32030 unsigned start = ix86_builtin_func_start[(int) tcode];
32031 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
32032 tree rtype, atype, args = void_list_node;
32033 unsigned i;
32035 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
32036 for (i = after - 1; i > start; --i)
32038 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
32039 args = tree_cons (NULL, atype, args);
32042 type = build_function_type (rtype, args);
32044 else
32046 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
32047 enum ix86_builtin_func_type icode;
32049 icode = ix86_builtin_func_alias_base[index];
32050 type = ix86_get_builtin_func_type (icode);
32053 ix86_builtin_func_type_tab[(int) tcode] = type;
32054 return type;
32058 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
32059 bdesc_* arrays below should come first, then builtins for each bdesc_*
32060 array in ascending order, so that we can use direct array accesses. */
32061 enum ix86_builtins
32063 IX86_BUILTIN_MASKMOVQ,
32064 IX86_BUILTIN_LDMXCSR,
32065 IX86_BUILTIN_STMXCSR,
32066 IX86_BUILTIN_MASKMOVDQU,
32067 IX86_BUILTIN_PSLLDQ128,
32068 IX86_BUILTIN_CLFLUSH,
32069 IX86_BUILTIN_MONITOR,
32070 IX86_BUILTIN_MWAIT,
32071 IX86_BUILTIN_CLZERO,
32072 IX86_BUILTIN_VEC_INIT_V2SI,
32073 IX86_BUILTIN_VEC_INIT_V4HI,
32074 IX86_BUILTIN_VEC_INIT_V8QI,
32075 IX86_BUILTIN_VEC_EXT_V2DF,
32076 IX86_BUILTIN_VEC_EXT_V2DI,
32077 IX86_BUILTIN_VEC_EXT_V4SF,
32078 IX86_BUILTIN_VEC_EXT_V4SI,
32079 IX86_BUILTIN_VEC_EXT_V8HI,
32080 IX86_BUILTIN_VEC_EXT_V2SI,
32081 IX86_BUILTIN_VEC_EXT_V4HI,
32082 IX86_BUILTIN_VEC_EXT_V16QI,
32083 IX86_BUILTIN_VEC_SET_V2DI,
32084 IX86_BUILTIN_VEC_SET_V4SF,
32085 IX86_BUILTIN_VEC_SET_V4SI,
32086 IX86_BUILTIN_VEC_SET_V8HI,
32087 IX86_BUILTIN_VEC_SET_V4HI,
32088 IX86_BUILTIN_VEC_SET_V16QI,
32089 IX86_BUILTIN_GATHERSIV2DF,
32090 IX86_BUILTIN_GATHERSIV4DF,
32091 IX86_BUILTIN_GATHERDIV2DF,
32092 IX86_BUILTIN_GATHERDIV4DF,
32093 IX86_BUILTIN_GATHERSIV4SF,
32094 IX86_BUILTIN_GATHERSIV8SF,
32095 IX86_BUILTIN_GATHERDIV4SF,
32096 IX86_BUILTIN_GATHERDIV8SF,
32097 IX86_BUILTIN_GATHERSIV2DI,
32098 IX86_BUILTIN_GATHERSIV4DI,
32099 IX86_BUILTIN_GATHERDIV2DI,
32100 IX86_BUILTIN_GATHERDIV4DI,
32101 IX86_BUILTIN_GATHERSIV4SI,
32102 IX86_BUILTIN_GATHERSIV8SI,
32103 IX86_BUILTIN_GATHERDIV4SI,
32104 IX86_BUILTIN_GATHERDIV8SI,
32105 IX86_BUILTIN_VFMSUBSD3_MASK3,
32106 IX86_BUILTIN_VFMSUBSS3_MASK3,
32107 IX86_BUILTIN_GATHER3SIV8SF,
32108 IX86_BUILTIN_GATHER3SIV4SF,
32109 IX86_BUILTIN_GATHER3SIV4DF,
32110 IX86_BUILTIN_GATHER3SIV2DF,
32111 IX86_BUILTIN_GATHER3DIV8SF,
32112 IX86_BUILTIN_GATHER3DIV4SF,
32113 IX86_BUILTIN_GATHER3DIV4DF,
32114 IX86_BUILTIN_GATHER3DIV2DF,
32115 IX86_BUILTIN_GATHER3SIV8SI,
32116 IX86_BUILTIN_GATHER3SIV4SI,
32117 IX86_BUILTIN_GATHER3SIV4DI,
32118 IX86_BUILTIN_GATHER3SIV2DI,
32119 IX86_BUILTIN_GATHER3DIV8SI,
32120 IX86_BUILTIN_GATHER3DIV4SI,
32121 IX86_BUILTIN_GATHER3DIV4DI,
32122 IX86_BUILTIN_GATHER3DIV2DI,
32123 IX86_BUILTIN_SCATTERSIV8SF,
32124 IX86_BUILTIN_SCATTERSIV4SF,
32125 IX86_BUILTIN_SCATTERSIV4DF,
32126 IX86_BUILTIN_SCATTERSIV2DF,
32127 IX86_BUILTIN_SCATTERDIV8SF,
32128 IX86_BUILTIN_SCATTERDIV4SF,
32129 IX86_BUILTIN_SCATTERDIV4DF,
32130 IX86_BUILTIN_SCATTERDIV2DF,
32131 IX86_BUILTIN_SCATTERSIV8SI,
32132 IX86_BUILTIN_SCATTERSIV4SI,
32133 IX86_BUILTIN_SCATTERSIV4DI,
32134 IX86_BUILTIN_SCATTERSIV2DI,
32135 IX86_BUILTIN_SCATTERDIV8SI,
32136 IX86_BUILTIN_SCATTERDIV4SI,
32137 IX86_BUILTIN_SCATTERDIV4DI,
32138 IX86_BUILTIN_SCATTERDIV2DI,
32139 /* Alternate 4 and 8 element gather/scatter for the vectorizer
32140 where all operands are 32-byte or 64-byte wide respectively. */
32141 IX86_BUILTIN_GATHERALTSIV4DF,
32142 IX86_BUILTIN_GATHERALTDIV8SF,
32143 IX86_BUILTIN_GATHERALTSIV4DI,
32144 IX86_BUILTIN_GATHERALTDIV8SI,
32145 IX86_BUILTIN_GATHER3ALTDIV16SF,
32146 IX86_BUILTIN_GATHER3ALTDIV16SI,
32147 IX86_BUILTIN_GATHER3ALTSIV4DF,
32148 IX86_BUILTIN_GATHER3ALTDIV8SF,
32149 IX86_BUILTIN_GATHER3ALTSIV4DI,
32150 IX86_BUILTIN_GATHER3ALTDIV8SI,
32151 IX86_BUILTIN_GATHER3ALTSIV8DF,
32152 IX86_BUILTIN_GATHER3ALTSIV8DI,
32153 IX86_BUILTIN_GATHER3DIV16SF,
32154 IX86_BUILTIN_GATHER3DIV16SI,
32155 IX86_BUILTIN_GATHER3DIV8DF,
32156 IX86_BUILTIN_GATHER3DIV8DI,
32157 IX86_BUILTIN_GATHER3SIV16SF,
32158 IX86_BUILTIN_GATHER3SIV16SI,
32159 IX86_BUILTIN_GATHER3SIV8DF,
32160 IX86_BUILTIN_GATHER3SIV8DI,
32161 IX86_BUILTIN_SCATTERALTSIV8DF,
32162 IX86_BUILTIN_SCATTERALTDIV16SF,
32163 IX86_BUILTIN_SCATTERALTSIV8DI,
32164 IX86_BUILTIN_SCATTERALTDIV16SI,
32165 IX86_BUILTIN_SCATTERDIV16SF,
32166 IX86_BUILTIN_SCATTERDIV16SI,
32167 IX86_BUILTIN_SCATTERDIV8DF,
32168 IX86_BUILTIN_SCATTERDIV8DI,
32169 IX86_BUILTIN_SCATTERSIV16SF,
32170 IX86_BUILTIN_SCATTERSIV16SI,
32171 IX86_BUILTIN_SCATTERSIV8DF,
32172 IX86_BUILTIN_SCATTERSIV8DI,
32173 IX86_BUILTIN_GATHERPFQPD,
32174 IX86_BUILTIN_GATHERPFDPS,
32175 IX86_BUILTIN_GATHERPFDPD,
32176 IX86_BUILTIN_GATHERPFQPS,
32177 IX86_BUILTIN_SCATTERPFDPD,
32178 IX86_BUILTIN_SCATTERPFDPS,
32179 IX86_BUILTIN_SCATTERPFQPD,
32180 IX86_BUILTIN_SCATTERPFQPS,
32181 IX86_BUILTIN_CLWB,
32182 IX86_BUILTIN_CLFLUSHOPT,
32183 IX86_BUILTIN_INFQ,
32184 IX86_BUILTIN_HUGE_VALQ,
32185 IX86_BUILTIN_NANQ,
32186 IX86_BUILTIN_NANSQ,
32187 IX86_BUILTIN_XABORT,
32188 IX86_BUILTIN_ADDCARRYX32,
32189 IX86_BUILTIN_ADDCARRYX64,
32190 IX86_BUILTIN_SBB32,
32191 IX86_BUILTIN_SBB64,
32192 IX86_BUILTIN_RDRAND16_STEP,
32193 IX86_BUILTIN_RDRAND32_STEP,
32194 IX86_BUILTIN_RDRAND64_STEP,
32195 IX86_BUILTIN_RDSEED16_STEP,
32196 IX86_BUILTIN_RDSEED32_STEP,
32197 IX86_BUILTIN_RDSEED64_STEP,
32198 IX86_BUILTIN_MONITORX,
32199 IX86_BUILTIN_MWAITX,
32200 IX86_BUILTIN_CFSTRING,
32201 IX86_BUILTIN_CPU_INIT,
32202 IX86_BUILTIN_CPU_IS,
32203 IX86_BUILTIN_CPU_SUPPORTS,
32204 IX86_BUILTIN_READ_FLAGS,
32205 IX86_BUILTIN_WRITE_FLAGS,
32207 /* All the remaining builtins are tracked in bdesc_* arrays in
32208 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
32209 this point. */
32210 #define BDESC(mask, icode, name, code, comparison, flag) \
32211 code,
32212 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32213 code, \
32214 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
32215 #define BDESC_END(kind, next_kind)
32217 #include "i386-builtin.def"
32219 #undef BDESC
32220 #undef BDESC_FIRST
32221 #undef BDESC_END
32223 IX86_BUILTIN_MAX,
32225 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
32227 /* Now just the aliases for bdesc_* start/end. */
32228 #define BDESC(mask, icode, name, code, comparison, flag)
32229 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
32230 #define BDESC_END(kind, next_kind) \
32231 IX86_BUILTIN__BDESC_##kind##_LAST \
32232 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
32234 #include "i386-builtin.def"
32236 #undef BDESC
32237 #undef BDESC_FIRST
32238 #undef BDESC_END
32240 /* Just to make sure there is no comma after the last enumerator. */
32241 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
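/* For illustration only (the entry is hypothetical, not taken from
   i386-builtin.def): a line such as

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_example",
            IX86_BUILTIN_EXAMPLE, UNKNOWN, 0)

   contributes just "IX86_BUILTIN_EXAMPLE," to this enum via the first
   #include above; a BDESC_FIRST line additionally defines the matching
   IX86_BUILTIN__BDESC_<KIND>_FIRST alias, and the second #include, where
   only BDESC_END expands to anything, emits the _LAST aliases.  */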
32244 /* Table for the ix86 builtin decls. */
32245 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
32247 /* Table of all of the builtin functions that are possible with different ISA's
32248 but are waiting to be built until a function is declared to use that
32249 ISA. */
32250 struct builtin_isa {
32251 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
32252 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
32253 const char *name; /* function name */
32254 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
32255 unsigned char const_p:1; /* true if the declaration is constant */
32256 unsigned char pure_p:1; /* true if the declaration has pure attribute */
32257 bool leaf_p; /* true if the declaration has leaf attribute */
32258 bool nothrow_p; /* true if the declaration has nothrow attribute */
32259 bool set_and_not_built_p;
32262 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
32264 /* Bits that can still trigger the inclusion of a deferred builtin. */
32265 static HOST_WIDE_INT deferred_isa_values = 0;
32266 static HOST_WIDE_INT deferred_isa_values2 = 0;
32268 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
32269 of which isa_flags to use in the ix86_builtins_isa array. Stores the
32270 function decl in the ix86_builtins array. Returns the function decl or
32271 NULL_TREE if the builtin was not added.
32273 If the front end has a special hook for builtin functions, delay adding
32274 builtin functions that aren't in the current ISA until the ISA is changed
32275 with function specific optimization. Doing so can save about 300K for the
32276 default compiler. When the builtin is expanded, check at that time whether
32277 it is valid.
32279 If the front end doesn't have a special hook, record all builtins, even
32280 those that aren't enabled in the current ISA, in case the user uses
32281 function specific options for a different ISA, so that we don't get scope
32282 errors if a builtin is added in the middle of a function scope. */
32284 static inline tree
32285 def_builtin (HOST_WIDE_INT mask, const char *name,
32286 enum ix86_builtin_func_type tcode,
32287 enum ix86_builtins code)
32289 tree decl = NULL_TREE;
32291 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
32293 ix86_builtins_isa[(int) code].isa = mask;
32295 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
32296 where any set bit means the built-in is enabled, this bit must be *and-ed*
32297 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
32298 means that *both* cpuid bits must be set for the built-in to be available.
32299 Handle this here. */
32300 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32301 mask &= ~OPTION_MASK_ISA_AVX512VL;
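/* A worked example of the AVX512VL special case above (the combination is
   only illustrative):

     mask           = OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
     ix86_isa_flags = ... | OPTION_MASK_ISA_AVX512VL

   AVX512VL is set in both, so it is cleared from MASK and the availability
   test below keys on the remaining bit (OPTION_MASK_ISA_AVX512DQ).  */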
32303 mask &= ~OPTION_MASK_ISA_64BIT;
32304 if (mask == 0
32305 || (mask & ix86_isa_flags) != 0
32306 || (lang_hooks.builtin_function
32307 == lang_hooks.builtin_function_ext_scope))
32310 tree type = ix86_get_builtin_func_type (tcode);
32311 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32312 NULL, NULL_TREE);
32313 ix86_builtins[(int) code] = decl;
32314 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32316 else
32318 /* Only a MASK recorded with set_and_not_built_p == true can still
32319 cause a builtin to be included later. */
32320 deferred_isa_values |= mask;
32321 ix86_builtins[(int) code] = NULL_TREE;
32322 ix86_builtins_isa[(int) code].tcode = tcode;
32323 ix86_builtins_isa[(int) code].name = name;
32324 ix86_builtins_isa[(int) code].leaf_p = false;
32325 ix86_builtins_isa[(int) code].nothrow_p = false;
32326 ix86_builtins_isa[(int) code].const_p = false;
32327 ix86_builtins_isa[(int) code].pure_p = false;
32328 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32332 return decl;
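/* A minimal usage sketch (the builtin name and enumerator are hypothetical,
   chosen only to illustrate the deferral logic above):

     tree d = def_builtin (OPTION_MASK_ISA_SSE4_2, "__builtin_ia32_example",
                           INT_FTYPE_V16QI_V16QI_INT, IX86_BUILTIN_EXAMPLE);

   If SSE4.2 is enabled on the command line, or the front end registers
   builtins at extern scope, D is the newly built decl; otherwise D is
   NULL_TREE, MASK is accumulated into deferred_isa_values, and the decl is
   built later by ix86_add_new_builtins once the ISA becomes active.  */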
32335 /* Like def_builtin, but also marks the function decl "const". */
32337 static inline tree
32338 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32339 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32341 tree decl = def_builtin (mask, name, tcode, code);
32342 if (decl)
32343 TREE_READONLY (decl) = 1;
32344 else
32345 ix86_builtins_isa[(int) code].const_p = true;
32347 return decl;
32350 /* Like def_builtin, but also marks the function decl "pure". */
32352 static inline tree
32353 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
32354 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32356 tree decl = def_builtin (mask, name, tcode, code);
32357 if (decl)
32358 DECL_PURE_P (decl) = 1;
32359 else
32360 ix86_builtins_isa[(int) code].pure_p = true;
32362 return decl;
32365 /* Like def_builtin, but for additional isa2 flags. */
32367 static inline tree
32368 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32369 enum ix86_builtin_func_type tcode,
32370 enum ix86_builtins code)
32372 tree decl = NULL_TREE;
32374 ix86_builtins_isa[(int) code].isa2 = mask;
32376 if (mask == 0
32377 || (mask & ix86_isa_flags2) != 0
32378 || (lang_hooks.builtin_function
32379 == lang_hooks.builtin_function_ext_scope))
32382 tree type = ix86_get_builtin_func_type (tcode);
32383 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32384 NULL, NULL_TREE);
32385 ix86_builtins[(int) code] = decl;
32386 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32388 else
32390 /* Only a MASK recorded with set_and_not_built_p == true can still
32391 cause a builtin to be included later. */
32392 deferred_isa_values2 |= mask;
32393 ix86_builtins[(int) code] = NULL_TREE;
32394 ix86_builtins_isa[(int) code].tcode = tcode;
32395 ix86_builtins_isa[(int) code].name = name;
32396 ix86_builtins_isa[(int) code].leaf_p = false;
32397 ix86_builtins_isa[(int) code].nothrow_p = false;
32398 ix86_builtins_isa[(int) code].const_p = false;
32399 ix86_builtins_isa[(int) code].pure_p = false;
32400 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32403 return decl;
32406 /* Like def_builtin, but also marks the function decl "const". */
32408 static inline tree
32409 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32410 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32412 tree decl = def_builtin2 (mask, name, tcode, code);
32413 if (decl)
32414 TREE_READONLY (decl) = 1;
32415 else
32416 ix86_builtins_isa[(int) code].const_p = true;
32418 return decl;
32421 /* Like def_builtin, but also marks the function decl "pure". */
32423 static inline tree
32424 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
32425 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32427 tree decl = def_builtin2 (mask, name, tcode, code);
32428 if (decl)
32429 DECL_PURE_P (decl) = 1;
32430 else
32431 ix86_builtins_isa[(int) code].pure_p = true;
32433 return decl;
32436 /* Add any new builtin functions for a given ISA that may not have been
32437 declared. This saves a bit of space compared to adding all of the
32438 declarations to the tree, even if they are never used. */
32440 static void
32441 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32443 if ((isa & deferred_isa_values) == 0
32444 && (isa2 & deferred_isa_values2) == 0)
32445 return;
32447 /* The ISA bits being handled now can be removed from the deferred isa values. */
32448 deferred_isa_values &= ~isa;
32449 deferred_isa_values2 &= ~isa2;
32451 int i;
32452 tree saved_current_target_pragma = current_target_pragma;
32453 current_target_pragma = NULL_TREE;
32455 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32457 if (((ix86_builtins_isa[i].isa & isa) != 0
32458 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32459 && ix86_builtins_isa[i].set_and_not_built_p)
32461 tree decl, type;
32463 /* Don't define the builtin again. */
32464 ix86_builtins_isa[i].set_and_not_built_p = false;
32466 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32467 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32468 type, i, BUILT_IN_MD, NULL,
32469 NULL_TREE);
32471 ix86_builtins[i] = decl;
32472 if (ix86_builtins_isa[i].const_p)
32473 TREE_READONLY (decl) = 1;
32474 if (ix86_builtins_isa[i].pure_p)
32475 DECL_PURE_P (decl) = 1;
32476 if (ix86_builtins_isa[i].leaf_p)
32477 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32478 NULL_TREE);
32479 if (ix86_builtins_isa[i].nothrow_p)
32480 TREE_NOTHROW (decl) = 1;
32484 current_target_pragma = saved_current_target_pragma;
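/* A sketch of the deferral lifecycle handled above (the function below is
   hypothetical):

     __attribute__((target ("avx512f")))
     void example_fn (void) { ... }

   When a target attribute or pragma enables additional ISA bits, this
   function is called with those bits; every builtin still recorded as
   set_and_not_built_p whose mask intersects them is then created with
   add_builtin_function_ext_scope, and its deferred const/pure/leaf/nothrow
   flags are applied to the fresh decl.  */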
32487 /* Bits for builtin_description.flag. */
32489 /* Set when we don't support the comparison natively, and should
32490 swap_comparison in order to support it. */
32491 #define BUILTIN_DESC_SWAP_OPERANDS 1
32493 struct builtin_description
32495 const HOST_WIDE_INT mask;
32496 const enum insn_code icode;
32497 const char *const name;
32498 const enum ix86_builtins code;
32499 const enum rtx_code comparison;
32500 const int flag;
32503 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32504 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32505 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32506 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32507 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32508 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32509 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32510 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32511 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32512 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32513 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32514 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32515 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32516 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32517 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32518 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32519 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32520 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32521 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32522 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32523 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32524 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32525 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32526 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32527 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32528 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32529 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32530 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32531 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32532 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32533 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32534 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32535 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32536 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32537 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32538 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32539 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32540 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32541 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32542 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32543 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32544 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32545 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32546 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32547 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32548 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32549 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32550 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32551 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32552 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32553 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32554 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32556 #define BDESC(mask, icode, name, code, comparison, flag) \
32557 { mask, icode, name, code, comparison, flag },
32558 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32559 static const struct builtin_description bdesc_##kind[] = \
32561 BDESC (mask, icode, name, code, comparison, flag)
32562 #define BDESC_END(kind, next_kind) \
32565 #include "i386-builtin.def"
32567 #undef BDESC
32568 #undef BDESC_FIRST
32569 #undef BDESC_END
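/* For illustration (hypothetical entry): with the definitions above, a
   BDESC line in i386-builtin.def such as

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_example",
            IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) INT_FTYPE_V4SF_V4SF)

   now expands to one initializer

     { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_example",
       IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) INT_FTYPE_V4SF_V4SF },

   inside the bdesc_<kind> array opened by BDESC_FIRST and closed by
   BDESC_END, so the same .def file that produced the enum above also
   produces the descriptor arrays.  */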
32571 /* TM vector builtins. */
32573 /* Reuse the existing x86-specific `struct builtin_description' because
32574 we're lazy. Add casts to make them fit. */
32575 static const struct builtin_description bdesc_tm[] =
32577 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32578 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32579 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32580 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32581 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32582 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32583 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32585 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32586 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32587 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32588 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32589 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32590 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32591 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32593 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32594 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32595 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32596 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32597 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32598 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32599 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32601 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32602 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32603 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
32606 /* Initialize the transactional memory vector load/store builtins. */
32608 static void
32609 ix86_init_tm_builtins (void)
32611 enum ix86_builtin_func_type ftype;
32612 const struct builtin_description *d;
32613 size_t i;
32614 tree decl;
32615 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32616 tree attrs_log, attrs_type_log;
32618 if (!flag_tm)
32619 return;
32621 /* If there are no builtins defined, we must be compiling in a
32622 language without trans-mem support. */
32623 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32624 return;
32626 /* Use whatever attributes a normal TM load has. */
32627 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32628 attrs_load = DECL_ATTRIBUTES (decl);
32629 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32630 /* Use whatever attributes a normal TM store has. */
32631 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32632 attrs_store = DECL_ATTRIBUTES (decl);
32633 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32634 /* Use whatever attributes a normal TM log has. */
32635 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32636 attrs_log = DECL_ATTRIBUTES (decl);
32637 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32639 for (i = 0, d = bdesc_tm;
32640 i < ARRAY_SIZE (bdesc_tm);
32641 i++, d++)
32643 if ((d->mask & ix86_isa_flags) != 0
32644 || (lang_hooks.builtin_function
32645 == lang_hooks.builtin_function_ext_scope))
32647 tree type, attrs, attrs_type;
32648 enum built_in_function code = (enum built_in_function) d->code;
32650 ftype = (enum ix86_builtin_func_type) d->flag;
32651 type = ix86_get_builtin_func_type (ftype);
32653 if (BUILTIN_TM_LOAD_P (code))
32655 attrs = attrs_load;
32656 attrs_type = attrs_type_load;
32658 else if (BUILTIN_TM_STORE_P (code))
32660 attrs = attrs_store;
32661 attrs_type = attrs_type_store;
32663 else
32665 attrs = attrs_log;
32666 attrs_type = attrs_type_log;
32668 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32669 /* The name without the "__builtin_" prefix,
32670 so it can be called directly. */
32671 d->name + strlen ("__builtin_"),
32672 attrs);
32673 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
32674 set the TYPE_ATTRIBUTES. */
32675 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32677 set_builtin_decl (code, decl, false);
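/* Worked example of the loop above: for the bdesc_tm entry named
   "__builtin__ITM_WM128", the decl is registered under that name and, via
   the second name argument (d->name + strlen ("__builtin_")), is also
   callable directly as "_ITM_WM128"; since its code satisfies
   BUILTIN_TM_STORE_P, it inherits the attributes of the generic
   BUILT_IN_TM_STORE_1 builtin.  */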
32682 /* Macros for verification of enum ix86_builtins order. */
32683 #define BDESC_VERIFY(x, y, z) \
32684 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32685 #define BDESC_VERIFYS(x, y, z) \
32686 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
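/* For example, the first check below expands to

     STATIC_ASSERT (IX86_BUILTIN__BDESC_PCMPESTR_FIRST
                    == (enum ix86_builtins) (IX86_BUILTIN__BDESC_COMI_LAST + 1));

   i.e. each bdesc_* block must begin exactly one enumerator past the end
   of the previous block, so the descriptor tables stay in sync with the
   enum generated from i386-builtin.def.  */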
32688 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32689 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32690 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32691 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32692 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32693 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32694 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32695 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32696 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32697 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32698 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32699 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32700 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32701 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32702 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32703 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32704 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32705 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32706 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32707 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
32709 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
32710 in the current target ISA, to allow the user to compile particular modules
32711 with target specific options that differ from the command line
32712 options. */
32713 static void
32714 ix86_init_mmx_sse_builtins (void)
32716 const struct builtin_description * d;
32717 enum ix86_builtin_func_type ftype;
32718 size_t i;
32720 /* Add all special builtins with variable number of operands. */
32721 for (i = 0, d = bdesc_special_args;
32722 i < ARRAY_SIZE (bdesc_special_args);
32723 i++, d++)
32725 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32726 if (d->name == 0)
32727 continue;
32729 ftype = (enum ix86_builtin_func_type) d->flag;
32730 def_builtin (d->mask, d->name, ftype, d->code);
32732 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32733 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32734 ARRAY_SIZE (bdesc_special_args) - 1);
32736 /* Add all builtins with variable number of operands. */
32737 for (i = 0, d = bdesc_args;
32738 i < ARRAY_SIZE (bdesc_args);
32739 i++, d++)
32741 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32742 if (d->name == 0)
32743 continue;
32745 ftype = (enum ix86_builtin_func_type) d->flag;
32746 def_builtin_const (d->mask, d->name, ftype, d->code);
32748 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32749 IX86_BUILTIN__BDESC_ARGS_FIRST,
32750 ARRAY_SIZE (bdesc_args) - 1);
32752 /* Add all builtins with variable number of operands. */
32753 for (i = 0, d = bdesc_args2;
32754 i < ARRAY_SIZE (bdesc_args2);
32755 i++, d++)
32757 if (d->name == 0)
32758 continue;
32760 ftype = (enum ix86_builtin_func_type) d->flag;
32761 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32764 /* Add all builtins with rounding. */
32765 for (i = 0, d = bdesc_round_args;
32766 i < ARRAY_SIZE (bdesc_round_args);
32767 i++, d++)
32769 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32770 if (d->name == 0)
32771 continue;
32773 ftype = (enum ix86_builtin_func_type) d->flag;
32774 def_builtin_const (d->mask, d->name, ftype, d->code);
32776 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32777 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32778 ARRAY_SIZE (bdesc_round_args) - 1);
32780 /* pcmpestr[im] insns. */
32781 for (i = 0, d = bdesc_pcmpestr;
32782 i < ARRAY_SIZE (bdesc_pcmpestr);
32783 i++, d++)
32785 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32786 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32787 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32788 else
32789 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32790 def_builtin_const (d->mask, d->name, ftype, d->code);
32792 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32793 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32794 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32796 /* pcmpistr[im] insns. */
32797 for (i = 0, d = bdesc_pcmpistr;
32798 i < ARRAY_SIZE (bdesc_pcmpistr);
32799 i++, d++)
32801 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32802 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32803 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32804 else
32805 ftype = INT_FTYPE_V16QI_V16QI_INT;
32806 def_builtin_const (d->mask, d->name, ftype, d->code);
32808 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32809 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32810 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32812 /* comi/ucomi insns. */
32813 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32815 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32816 if (d->mask == OPTION_MASK_ISA_SSE2)
32817 ftype = INT_FTYPE_V2DF_V2DF;
32818 else
32819 ftype = INT_FTYPE_V4SF_V4SF;
32820 def_builtin_const (d->mask, d->name, ftype, d->code);
32822 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32823 IX86_BUILTIN__BDESC_COMI_FIRST,
32824 ARRAY_SIZE (bdesc_comi) - 1);
32826 /* SSE */
32827 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32828 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32829 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32830 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32832 /* SSE or 3DNow!A */
32833 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32834 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32835 IX86_BUILTIN_MASKMOVQ);
32837 /* SSE2 */
32838 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32839 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32841 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32842 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32843 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32844 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32846 /* SSE3. */
32847 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32848 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32849 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32850 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32852 /* AES */
32853 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32854 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32855 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32856 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32857 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32858 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32859 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32860 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32861 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32862 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32863 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32864 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
32866 /* PCLMUL */
32867 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32868 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32870 /* RDRND */
32871 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32872 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32873 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32874 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32875 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32876 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32877 IX86_BUILTIN_RDRAND64_STEP);
32879 /* AVX2 */
32880 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
32881 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
32882 IX86_BUILTIN_GATHERSIV2DF);
32884 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
32885 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
32886 IX86_BUILTIN_GATHERSIV4DF);
32888 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
32889 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
32890 IX86_BUILTIN_GATHERDIV2DF);
32892 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
32893 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
32894 IX86_BUILTIN_GATHERDIV4DF);
32896 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
32897 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
32898 IX86_BUILTIN_GATHERSIV4SF);
32900 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
32901 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
32902 IX86_BUILTIN_GATHERSIV8SF);
32904 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
32905 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
32906 IX86_BUILTIN_GATHERDIV4SF);
32908 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
32909 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
32910 IX86_BUILTIN_GATHERDIV8SF);
32912 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
32913 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
32914 IX86_BUILTIN_GATHERSIV2DI);
32916 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
32917 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
32918 IX86_BUILTIN_GATHERSIV4DI);
32920 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
32921 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
32922 IX86_BUILTIN_GATHERDIV2DI);
32924 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
32925 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
32926 IX86_BUILTIN_GATHERDIV4DI);
32928 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
32929 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
32930 IX86_BUILTIN_GATHERSIV4SI);
32932 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
32933 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
32934 IX86_BUILTIN_GATHERSIV8SI);
32936 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
32937 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
32938 IX86_BUILTIN_GATHERDIV4SI);
32940 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
32941 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
32942 IX86_BUILTIN_GATHERDIV8SI);
32944 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
32945 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
32946 IX86_BUILTIN_GATHERALTSIV4DF);
32948 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
32949 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
32950 IX86_BUILTIN_GATHERALTDIV8SF);
32952 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
32953 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
32954 IX86_BUILTIN_GATHERALTSIV4DI);
32956 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
32957 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
32958 IX86_BUILTIN_GATHERALTDIV8SI);
32960 /* AVX512F */
32961 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
32962 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
32963 IX86_BUILTIN_GATHER3SIV16SF);
32965 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
32966 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
32967 IX86_BUILTIN_GATHER3SIV8DF);
32969 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
32970 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
32971 IX86_BUILTIN_GATHER3DIV16SF);
32973 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
32974 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
32975 IX86_BUILTIN_GATHER3DIV8DF);
32977 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
32978 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
32979 IX86_BUILTIN_GATHER3SIV16SI);
32981 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
32982 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
32983 IX86_BUILTIN_GATHER3SIV8DI);
32985 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
32986 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
32987 IX86_BUILTIN_GATHER3DIV16SI);
32989 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
32990 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
32991 IX86_BUILTIN_GATHER3DIV8DI);
32993 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
32994 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
32995 IX86_BUILTIN_GATHER3ALTSIV8DF);
32997 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
32998 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
32999 IX86_BUILTIN_GATHER3ALTDIV16SF);
33001 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
33002 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
33003 IX86_BUILTIN_GATHER3ALTSIV8DI);
33005 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
33006 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
33007 IX86_BUILTIN_GATHER3ALTDIV16SI);
33009 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
33010 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
33011 IX86_BUILTIN_SCATTERSIV16SF);
33013 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
33014 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
33015 IX86_BUILTIN_SCATTERSIV8DF);
33017 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
33018 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
33019 IX86_BUILTIN_SCATTERDIV16SF);
33021 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
33022 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
33023 IX86_BUILTIN_SCATTERDIV8DF);
33025 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
33026 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
33027 IX86_BUILTIN_SCATTERSIV16SI);
33029 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
33030 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
33031 IX86_BUILTIN_SCATTERSIV8DI);
33033 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
33034 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
33035 IX86_BUILTIN_SCATTERDIV16SI);
33037 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
33038 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
33039 IX86_BUILTIN_SCATTERDIV8DI);
33041 /* AVX512VL */
33042 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
33043 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
33044 IX86_BUILTIN_GATHER3SIV2DF);
33046 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
33047 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
33048 IX86_BUILTIN_GATHER3SIV4DF);
33050 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
33051 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
33052 IX86_BUILTIN_GATHER3DIV2DF);
33054 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
33055 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
33056 IX86_BUILTIN_GATHER3DIV4DF);
33058 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
33059 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
33060 IX86_BUILTIN_GATHER3SIV4SF);
33062 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
33063 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
33064 IX86_BUILTIN_GATHER3SIV8SF);
33066 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
33067 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
33068 IX86_BUILTIN_GATHER3DIV4SF);
33070 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
33071 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
33072 IX86_BUILTIN_GATHER3DIV8SF);
33074 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
33075 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
33076 IX86_BUILTIN_GATHER3SIV2DI);
33078 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
33079 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
33080 IX86_BUILTIN_GATHER3SIV4DI);
33082 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
33083 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
33084 IX86_BUILTIN_GATHER3DIV2DI);
33086 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
33087 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
33088 IX86_BUILTIN_GATHER3DIV4DI);
33090 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
33091 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
33092 IX86_BUILTIN_GATHER3SIV4SI);
33094 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
33095 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
33096 IX86_BUILTIN_GATHER3SIV8SI);
33098 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
33099 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
33100 IX86_BUILTIN_GATHER3DIV4SI);
33102 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
33103 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
33104 IX86_BUILTIN_GATHER3DIV8SI);
33106 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
33107 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
33108 IX86_BUILTIN_GATHER3ALTSIV4DF);
33110 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
33111 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
33112 IX86_BUILTIN_GATHER3ALTDIV8SF);
33114 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
33115 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
33116 IX86_BUILTIN_GATHER3ALTSIV4DI);
33118 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
33119 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
33120 IX86_BUILTIN_GATHER3ALTDIV8SI);
33122 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
33123 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
33124 IX86_BUILTIN_SCATTERSIV8SF);
33126 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
33127 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
33128 IX86_BUILTIN_SCATTERSIV4SF);
33130 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
33131 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
33132 IX86_BUILTIN_SCATTERSIV4DF);
33134 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
33135 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
33136 IX86_BUILTIN_SCATTERSIV2DF);
33138 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
33139 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
33140 IX86_BUILTIN_SCATTERDIV8SF);
33142 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
33143 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
33144 IX86_BUILTIN_SCATTERDIV4SF);
33146 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
33147 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
33148 IX86_BUILTIN_SCATTERDIV4DF);
33150 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
33151 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
33152 IX86_BUILTIN_SCATTERDIV2DF);
33154 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
33155 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
33156 IX86_BUILTIN_SCATTERSIV8SI);
33158 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
33159 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
33160 IX86_BUILTIN_SCATTERSIV4SI);
33162 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
33163 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
33164 IX86_BUILTIN_SCATTERSIV4DI);
33166 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
33167 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
33168 IX86_BUILTIN_SCATTERSIV2DI);
33170 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
33171 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
33172 IX86_BUILTIN_SCATTERDIV8SI);
33174 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
33175 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
33176 IX86_BUILTIN_SCATTERDIV4SI);
33178 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
33179 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
33180 IX86_BUILTIN_SCATTERDIV4DI);
33182 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
33183 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
33184 IX86_BUILTIN_SCATTERDIV2DI);
33185 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
33186 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
33187 IX86_BUILTIN_SCATTERALTSIV8DF);
33189 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
33190 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
33191 IX86_BUILTIN_SCATTERALTDIV16SF);
33193 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
33194 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
33195 IX86_BUILTIN_SCATTERALTSIV8DI);
33197 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
33198 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
33199 IX86_BUILTIN_SCATTERALTDIV16SI);
33201 /* AVX512PF */
33202 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
33203 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33204 IX86_BUILTIN_GATHERPFDPD);
33205 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
33206 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33207 IX86_BUILTIN_GATHERPFDPS);
33208 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
33209 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33210 IX86_BUILTIN_GATHERPFQPD);
33211 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
33212 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33213 IX86_BUILTIN_GATHERPFQPS);
33214 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
33215 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33216 IX86_BUILTIN_SCATTERPFDPD);
33217 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
33218 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33219 IX86_BUILTIN_SCATTERPFDPS);
33220 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
33221 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33222 IX86_BUILTIN_SCATTERPFQPD);
33223 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
33224 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33225 IX86_BUILTIN_SCATTERPFQPS);
33227 /* SHA */
33228 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
33229 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
33230 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
33231 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
33232 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
33233 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
33234 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
33235 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
33236 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
33237 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
33238 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
33239 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
33240 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
33241 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
33243 /* RTM. */
33244 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
33245 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
33247 /* MMX access to the vec_init patterns. */
33248 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
33249 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
33251 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
33252 V4HI_FTYPE_HI_HI_HI_HI,
33253 IX86_BUILTIN_VEC_INIT_V4HI);
33255 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
33256 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
33257 IX86_BUILTIN_VEC_INIT_V8QI);
33259 /* Access to the vec_extract patterns. */
33260 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
33261 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
33262 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
33263 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
33264 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
33265 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
33266 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
33267 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
33268 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
33269 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
33271 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33272 "__builtin_ia32_vec_ext_v4hi",
33273 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
33275 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
33276 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
33278 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
33279 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
33281 /* Access to the vec_set patterns. */
33282 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
33283 "__builtin_ia32_vec_set_v2di",
33284 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
33286 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
33287 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
33289 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
33290 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
33292 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
33293 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
33295 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33296 "__builtin_ia32_vec_set_v4hi",
33297 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
33299 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
33300 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
33302 /* RDSEED */
33303 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
33304 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
33305 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
33306 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
33307 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
33308 "__builtin_ia32_rdseed_di_step",
33309 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
33311 /* ADCX */
33312 def_builtin (0, "__builtin_ia32_addcarryx_u32",
33313 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
33314 def_builtin (OPTION_MASK_ISA_64BIT,
33315 "__builtin_ia32_addcarryx_u64",
33316 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33317 IX86_BUILTIN_ADDCARRYX64);
33319 /* SBB */
33320 def_builtin (0, "__builtin_ia32_sbb_u32",
33321 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
33322 def_builtin (OPTION_MASK_ISA_64BIT,
33323 "__builtin_ia32_sbb_u64",
33324 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33325 IX86_BUILTIN_SBB64);
33327 /* Read/write FLAGS. */
33328 def_builtin (0, "__builtin_ia32_readeflags_u32",
33329 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33330 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
33331 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33332 def_builtin (0, "__builtin_ia32_writeeflags_u32",
33333 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
33334 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
33335 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
33337 /* CLFLUSHOPT. */
33338 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
33339 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
33341 /* CLWB. */
33342 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
33343 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
33345 /* MONITORX and MWAITX. */
33346 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
33347 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33348 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33349 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33351 /* CLZERO. */
33352 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33353 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33355 /* Add FMA4 multi-arg argument instructions */
33356 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33358 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33359 if (d->name == 0)
33360 continue;
33362 ftype = (enum ix86_builtin_func_type) d->flag;
33363 def_builtin_const (d->mask, d->name, ftype, d->code);
33365 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33366 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33367 ARRAY_SIZE (bdesc_multi_arg) - 1);
33370 static void
33371 ix86_init_mpx_builtins ()
33373 const struct builtin_description * d;
33374 enum ix86_builtin_func_type ftype;
33375 tree decl;
33376 size_t i;
33378 for (i = 0, d = bdesc_mpx;
33379 i < ARRAY_SIZE (bdesc_mpx);
33380 i++, d++)
33382 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33383 if (d->name == 0)
33384 continue;
33386 ftype = (enum ix86_builtin_func_type) d->flag;
33387 decl = def_builtin (d->mask, d->name, ftype, d->code);
32389 /* Without the leaf and nothrow flags, calls to MPX builtins
32390 may be followed by abnormal edges when setjmp is
32391 present in the function. Since there may be a lot
32392 of MPX builtin calls, this causes lots of useless
32393 edges and enormous PHI nodes. To avoid this we mark
32394 MPX builtins as leaf and nothrow. */
33395 if (decl)
33397 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33398 NULL_TREE);
33399 TREE_NOTHROW (decl) = 1;
33401 else
33403 ix86_builtins_isa[(int)d->code].leaf_p = true;
33404 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33407 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33408 IX86_BUILTIN__BDESC_MPX_FIRST,
33409 ARRAY_SIZE (bdesc_mpx) - 1);
33411 for (i = 0, d = bdesc_mpx_const;
33412 i < ARRAY_SIZE (bdesc_mpx_const);
33413 i++, d++)
33415 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33416 if (d->name == 0)
33417 continue;
33419 ftype = (enum ix86_builtin_func_type) d->flag;
33420 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33422 if (decl)
33424 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33425 NULL_TREE);
33426 TREE_NOTHROW (decl) = 1;
33428 else
33430 ix86_builtins_isa[(int)d->code].leaf_p = true;
33431 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33434 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33435 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33436 ARRAY_SIZE (bdesc_mpx_const) - 1);
33438 #undef BDESC_VERIFY
33439 #undef BDESC_VERIFYS
33441 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33442 to return a pointer to VERSION_DECL if the outcome of the expression
33443 formed by PREDICATE_CHAIN is true. This function will be called during
33444 version dispatch to decide which function version to execute. It returns
33445 the basic block at the end, to which more conditions can be added. */
33447 static basic_block
33448 add_condition_to_bb (tree function_decl, tree version_decl,
33449 tree predicate_chain, basic_block new_bb)
33451 gimple *return_stmt;
33452 tree convert_expr, result_var;
33453 gimple *convert_stmt;
33454 gimple *call_cond_stmt;
33455 gimple *if_else_stmt;
33457 basic_block bb1, bb2, bb3;
33458 edge e12, e23;
33460 tree cond_var, and_expr_var = NULL_TREE;
33461 gimple_seq gseq;
33463 tree predicate_decl, predicate_arg;
33465 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33467 gcc_assert (new_bb != NULL);
33468 gseq = bb_seq (new_bb);
33471 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33472 build_fold_addr_expr (version_decl));
33473 result_var = create_tmp_var (ptr_type_node);
33474 convert_stmt = gimple_build_assign (result_var, convert_expr);
33475 return_stmt = gimple_build_return (result_var);
33477 if (predicate_chain == NULL_TREE)
33479 gimple_seq_add_stmt (&gseq, convert_stmt);
33480 gimple_seq_add_stmt (&gseq, return_stmt);
33481 set_bb_seq (new_bb, gseq);
33482 gimple_set_bb (convert_stmt, new_bb);
33483 gimple_set_bb (return_stmt, new_bb);
33484 pop_cfun ();
33485 return new_bb;
33488 while (predicate_chain != NULL)
33490 cond_var = create_tmp_var (integer_type_node);
33491 predicate_decl = TREE_PURPOSE (predicate_chain);
33492 predicate_arg = TREE_VALUE (predicate_chain);
33493 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33494 gimple_call_set_lhs (call_cond_stmt, cond_var);
33496 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33497 gimple_set_bb (call_cond_stmt, new_bb);
33498 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33500 predicate_chain = TREE_CHAIN (predicate_chain);
33502 if (and_expr_var == NULL)
33503 and_expr_var = cond_var;
33504 else
33506 gimple *assign_stmt;
33507 /* Use MIN_EXPR as a logical AND of the predicate results:
33508 and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
33509 assign_stmt = gimple_build_assign (and_expr_var,
33510 build2 (MIN_EXPR, integer_type_node,
33511 cond_var, and_expr_var));
33513 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33514 gimple_set_bb (assign_stmt, new_bb);
33515 gimple_seq_add_stmt (&gseq, assign_stmt);
33519 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33520 integer_zero_node,
33521 NULL_TREE, NULL_TREE);
33522 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33523 gimple_set_bb (if_else_stmt, new_bb);
33524 gimple_seq_add_stmt (&gseq, if_else_stmt);
33526 gimple_seq_add_stmt (&gseq, convert_stmt);
33527 gimple_seq_add_stmt (&gseq, return_stmt);
33528 set_bb_seq (new_bb, gseq);
33530 bb1 = new_bb;
33531 e12 = split_block (bb1, if_else_stmt);
33532 bb2 = e12->dest;
33533 e12->flags &= ~EDGE_FALLTHRU;
33534 e12->flags |= EDGE_TRUE_VALUE;
33536 e23 = split_block (bb2, return_stmt);
33538 gimple_set_bb (convert_stmt, bb2);
33539 gimple_set_bb (return_stmt, bb2);
33541 bb3 = e23->dest;
33542 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33544 remove_edge (e23);
33545 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33547 pop_cfun ();
33549 return bb3;
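/* A loose sketch of the GIMPLE built above for one version (the feature
   names and the version decl "example_fn.avx2_bmi2" are only an example):

     cond_1 = __builtin_cpu_supports ("avx2");
     cond_2 = __builtin_cpu_supports ("bmi2");
     and_1  = MIN_EXPR <cond_2, cond_1>;
     if (and_1 > 0)
       return (void *) &example_fn.avx2_bmi2;

   The calls come from PREDICATE_CHAIN, MIN_EXPR acts as a logical AND of
   the non-negative predicate results, and control falls through to the
   returned basic block, where the condition for the next version is
   appended.  */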
33552 /* This parses the attribute arguments to target in DECL and determines
33553 the right builtin to use to match the platform specification.
33554 It returns the priority value for this version decl. If PREDICATE_LIST
33555 is not NULL, it stores the list of cpu features that need to be checked
33556 before dispatching this function. */
33558 static unsigned int
33559 get_builtin_code_for_version (tree decl, tree *predicate_list)
33561 tree attrs;
33562 struct cl_target_option cur_target;
33563 tree target_node;
33564 struct cl_target_option *new_target;
33565 const char *arg_str = NULL;
33566 const char *attrs_str = NULL;
33567 char *tok_str = NULL;
33568 char *token;
33570 /* Priority of i386 features, greater value is higher priority. This is
33571 used to decide the order in which function dispatch must happen. For
33572 instance, a version specialized for SSE4.2 should be checked for dispatch
33573 before a version for SSE3, as SSE4.2 implies SSE3. */
33574 enum feature_priority
33576 P_ZERO = 0,
33577 P_MMX,
33578 P_SSE,
33579 P_SSE2,
33580 P_SSE3,
33581 P_SSSE3,
33582 P_PROC_SSSE3,
33583 P_SSE4_A,
33584 P_PROC_SSE4_A,
33585 P_SSE4_1,
33586 P_SSE4_2,
33587 P_PROC_SSE4_2,
33588 P_POPCNT,
33589 P_AES,
33590 P_PCLMUL,
33591 P_AVX,
33592 P_PROC_AVX,
33593 P_BMI,
33594 P_PROC_BMI,
33595 P_FMA4,
33596 P_XOP,
33597 P_PROC_XOP,
33598 P_FMA,
33599 P_PROC_FMA,
33600 P_BMI2,
33601 P_AVX2,
33602 P_PROC_AVX2,
33603 P_AVX512F,
33604 P_PROC_AVX512F
33607 enum feature_priority priority = P_ZERO;
33609 /* These are the target attribute strings for which a dispatcher is
33610 available, from fold_builtin_cpu. */
33612 static struct _feature_list
33614 const char *const name;
33615 const enum feature_priority priority;
33617 const feature_list[] =
33619 {"mmx", P_MMX},
33620 {"sse", P_SSE},
33621 {"sse2", P_SSE2},
33622 {"sse3", P_SSE3},
33623 {"sse4a", P_SSE4_A},
33624 {"ssse3", P_SSSE3},
33625 {"sse4.1", P_SSE4_1},
33626 {"sse4.2", P_SSE4_2},
33627 {"popcnt", P_POPCNT},
33628 {"aes", P_AES},
33629 {"pclmul", P_PCLMUL},
33630 {"avx", P_AVX},
33631 {"bmi", P_BMI},
33632 {"fma4", P_FMA4},
33633 {"xop", P_XOP},
33634 {"fma", P_FMA},
33635 {"bmi2", P_BMI2},
33636 {"avx2", P_AVX2},
33637 {"avx512f", P_AVX512F}
33641 static unsigned int NUM_FEATURES
33642 = sizeof (feature_list) / sizeof (struct _feature_list);
33644 unsigned int i;
33646 tree predicate_chain = NULL_TREE;
33647 tree predicate_decl, predicate_arg;
33649 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33650 gcc_assert (attrs != NULL);
33652 attrs = TREE_VALUE (TREE_VALUE (attrs));
33654 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33655 attrs_str = TREE_STRING_POINTER (attrs);
33657 /* Return priority zero for default function. */
33658 if (strcmp (attrs_str, "default") == 0)
33659 return 0;
33661 /* Handle arch= if specified. For priority, set it to be 1 more than
33662 the best instruction set the processor can handle. For instance, if
33663 there is a version for atom and a version for ssse3 (the highest ISA
33664 priority for atom), the atom version must be checked for dispatch
33665 before the ssse3 version. */
33666 if (strstr (attrs_str, "arch=") != NULL)
33668 cl_target_option_save (&cur_target, &global_options);
33669 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33670 &global_options_set);
33672 gcc_assert (target_node);
33673 new_target = TREE_TARGET_OPTION (target_node);
33674 gcc_assert (new_target);
33676 if (new_target->arch_specified && new_target->arch > 0)
33678 switch (new_target->arch)
33680 case PROCESSOR_CORE2:
33681 arg_str = "core2";
33682 priority = P_PROC_SSSE3;
33683 break;
33684 case PROCESSOR_NEHALEM:
33685 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33687 arg_str = "westmere";
33688 priority = P_AES;
33690 else
33692 /* We translate "arch=corei7" and "arch=nehalem" to
33693 "corei7" so that it will be mapped to M_INTEL_COREI7
33694 as cpu type to cover all M_INTEL_COREI7_XXXs. */
33695 arg_str = "corei7";
33696 priority = P_PROC_SSE4_2;
33698 break;
33699 case PROCESSOR_SANDYBRIDGE:
33700 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33701 arg_str = "ivybridge";
33702 else
33703 arg_str = "sandybridge";
33704 priority = P_PROC_AVX;
33705 break;
33706 case PROCESSOR_HASWELL:
33707 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33708 arg_str = "skylake-avx512";
33709 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33710 arg_str = "skylake";
33711 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33712 arg_str = "broadwell";
33713 else
33714 arg_str = "haswell";
33715 priority = P_PROC_AVX2;
33716 break;
33717 case PROCESSOR_BONNELL:
33718 arg_str = "bonnell";
33719 priority = P_PROC_SSSE3;
33720 break;
33721 case PROCESSOR_KNL:
33722 arg_str = "knl";
33723 priority = P_PROC_AVX512F;
33724 break;
33725 case PROCESSOR_SILVERMONT:
33726 arg_str = "silvermont";
33727 priority = P_PROC_SSE4_2;
33728 break;
33729 case PROCESSOR_AMDFAM10:
33730 arg_str = "amdfam10h";
33731 priority = P_PROC_SSE4_A;
33732 break;
33733 case PROCESSOR_BTVER1:
33734 arg_str = "btver1";
33735 priority = P_PROC_SSE4_A;
33736 break;
33737 case PROCESSOR_BTVER2:
33738 arg_str = "btver2";
33739 priority = P_PROC_BMI;
33740 break;
33741 case PROCESSOR_BDVER1:
33742 arg_str = "bdver1";
33743 priority = P_PROC_XOP;
33744 break;
33745 case PROCESSOR_BDVER2:
33746 arg_str = "bdver2";
33747 priority = P_PROC_FMA;
33748 break;
33749 case PROCESSOR_BDVER3:
33750 arg_str = "bdver3";
33751 priority = P_PROC_FMA;
33752 break;
33753 case PROCESSOR_BDVER4:
33754 arg_str = "bdver4";
33755 priority = P_PROC_AVX2;
33756 break;
33757 case PROCESSOR_ZNVER1:
33758 arg_str = "znver1";
33759 priority = P_PROC_AVX2;
33760 break;
33764 cl_target_option_restore (&global_options, &cur_target);
33766 if (predicate_list && arg_str == NULL)
33768 error_at (DECL_SOURCE_LOCATION (decl),
33769 "No dispatcher found for the versioning attributes");
33770 return 0;
33773 if (predicate_list)
33775 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33776 /* For a C string literal the length includes the trailing NULL. */
33777 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33778 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33779 predicate_chain);
33783 /* Process feature name. */
33784 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33785 strcpy (tok_str, attrs_str);
33786 token = strtok (tok_str, ",");
33787 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33789 while (token != NULL)
33791 /* Do not process "arch=" */
33792 if (strncmp (token, "arch=", 5) == 0)
33794 token = strtok (NULL, ",");
33795 continue;
33797 for (i = 0; i < NUM_FEATURES; ++i)
33799 if (strcmp (token, feature_list[i].name) == 0)
33801 if (predicate_list)
33803 predicate_arg = build_string_literal (
33804 strlen (feature_list[i].name) + 1,
33805 feature_list[i].name);
33806 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33807 predicate_chain);
33809 /* Find the maximum priority feature. */
33810 if (feature_list[i].priority > priority)
33811 priority = feature_list[i].priority;
33813 break;
33816 if (predicate_list && i == NUM_FEATURES)
33818 error_at (DECL_SOURCE_LOCATION (decl),
33819 "No dispatcher found for %s", token);
33820 return 0;
33822 token = strtok (NULL, ",");
33824 free (tok_str);
33826 if (predicate_list && predicate_chain == NULL_TREE)
33828 error_at (DECL_SOURCE_LOCATION (decl),
33829 "No dispatcher found for the versioning attributes : %s",
33830 attrs_str);
33831 return 0;
33833 else if (predicate_list)
33835 predicate_chain = nreverse (predicate_chain);
33836 *predicate_list = predicate_chain;
33839 return priority;
33842 /* This compares the priority of target features in function DECL1
33843 and DECL2. It returns positive value if DECL1 is higher priority,
33844 negative value if DECL2 is higher priority and 0 if they are the
33845 same. */
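/* For instance, given the priorities above, a version targeting "avx2"
   compares greater than one targeting "sse4.2" (P_AVX2 > P_SSE4_2), so
   the avx2 version is tested first by the dispatcher.  */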
33847 static int
33848 ix86_compare_version_priority (tree decl1, tree decl2)
33850 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33851 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33853 return (int)priority1 - (int)priority2;
33856 /* V1 and V2 point to function versions with different priorities
33857 based on the target ISA. This function compares their priorities. */
33859 static int
33860 feature_compare (const void *v1, const void *v2)
33862 typedef struct _function_version_info
33864 tree version_decl;
33865 tree predicate_chain;
33866 unsigned int dispatch_priority;
33867 } function_version_info;
33869 const function_version_info c1 = *(const function_version_info *)v1;
33870 const function_version_info c2 = *(const function_version_info *)v2;
33871 return (c2.dispatch_priority - c1.dispatch_priority);
33874 /* This function generates the dispatch function for
33875 multi-versioned functions. DISPATCH_DECL is the function which will
33876 contain the dispatch logic. FNDECLS are the function choices for
33877 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
33878 in DISPATCH_DECL in which the dispatch code is generated. */
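/* A rough sketch (illustrative C, not the GIMPLE actually emitted) of
   the resolver body built here for one "avx2" version plus the default:

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       return &foo_avx2;
     return &foo_default;

   foo_avx2 and foo_default stand in for the version decls; the real
   conditions come from each version's predicate chain.  */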
33880 static int
33881 dispatch_function_versions (tree dispatch_decl,
33882 void *fndecls_p,
33883 basic_block *empty_bb)
33885 tree default_decl;
33886 gimple *ifunc_cpu_init_stmt;
33887 gimple_seq gseq;
33888 int ix;
33889 tree ele;
33890 vec<tree> *fndecls;
33891 unsigned int num_versions = 0;
33892 unsigned int actual_versions = 0;
33893 unsigned int i;
33895 struct _function_version_info
33897 tree version_decl;
33898 tree predicate_chain;
33899 unsigned int dispatch_priority;
33900 }*function_version_info;
33902 gcc_assert (dispatch_decl != NULL
33903 && fndecls_p != NULL
33904 && empty_bb != NULL);
33906 /* fndecls_p is actually a vector.  */
33907 fndecls = static_cast<vec<tree> *> (fndecls_p);
33909 /* There must be at least one version other than the default. */
33910 num_versions = fndecls->length ();
33911 gcc_assert (num_versions >= 2);
33913 function_version_info = (struct _function_version_info *)
33914 XNEWVEC (struct _function_version_info, (num_versions - 1));
33916 /* The first version in the vector is the default decl. */
33917 default_decl = (*fndecls)[0];
33919 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
33921 gseq = bb_seq (*empty_bb);
33922 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
33923 constructors, so explicitly call __builtin_cpu_init here. */
33924 ifunc_cpu_init_stmt = gimple_build_call_vec (
33925 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
33926 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
33927 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
33928 set_bb_seq (*empty_bb, gseq);
33930 pop_cfun ();
33933 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
33935 tree version_decl = ele;
33936 tree predicate_chain = NULL_TREE;
33937 unsigned int priority;
33938 /* Get attribute string, parse it and find the right predicate decl.
33939 The predicate function could be a lengthy combination of many
33940 features, like arch-type and various isa-variants. */
33941 priority = get_builtin_code_for_version (version_decl,
33942 &predicate_chain);
33944 if (predicate_chain == NULL_TREE)
33945 continue;
33947 function_version_info [actual_versions].version_decl = version_decl;
33948 function_version_info [actual_versions].predicate_chain
33949 = predicate_chain;
33950 function_version_info [actual_versions].dispatch_priority = priority;
33951 actual_versions++;
33954 /* Sort the versions according to descending order of dispatch priority. The
33955 priority is based on the ISA. This is not a perfect solution. There
33956 could still be ambiguity. If more than one function version is suitable
33957 to execute, which one should be dispatched? In future, allow the user
33958 to specify a dispatch priority next to the version. */
33959 qsort (function_version_info, actual_versions,
33960 sizeof (struct _function_version_info), feature_compare);
33962 for (i = 0; i < actual_versions; ++i)
33963 *empty_bb = add_condition_to_bb (dispatch_decl,
33964 function_version_info[i].version_decl,
33965 function_version_info[i].predicate_chain,
33966 *empty_bb);
33968 /* Dispatch the default version at the end. */
33969 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
33970 NULL, *empty_bb);
33972 free (function_version_info);
33973 return 0;
33976 /* This function changes the assembler name for functions that are
33977 versions. If DECL is a function version and has a "target"
33978 attribute, it appends the attribute string to its assembler name. */
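/* For example, a non-default version of foo declared with
   __attribute__ ((target ("avx2"))) will typically get the assembler
   name "foo.avx2"; the exact suffix is whatever sorted_attr_string
   builds from the attribute arguments.  */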
33980 static tree
33981 ix86_mangle_function_version_assembler_name (tree decl, tree id)
33983 tree version_attr;
33984 const char *orig_name, *version_string;
33985 char *attr_str, *assembler_name;
33987 if (DECL_DECLARED_INLINE_P (decl)
33988 && lookup_attribute ("gnu_inline",
33989 DECL_ATTRIBUTES (decl)))
33990 error_at (DECL_SOURCE_LOCATION (decl),
33991 "Function versions cannot be marked as gnu_inline,"
33992 " bodies have to be generated");
33994 if (DECL_VIRTUAL_P (decl)
33995 || DECL_VINDEX (decl))
33996 sorry ("Virtual function multiversioning not supported");
33998 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
34000 /* target attribute string cannot be NULL. */
34001 gcc_assert (version_attr != NULL_TREE);
34003 orig_name = IDENTIFIER_POINTER (id);
34004 version_string
34005 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
34007 if (strcmp (version_string, "default") == 0)
34008 return id;
34010 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
34011 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
34013 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
34015 /* Allow assembler name to be modified if already set. */
34016 if (DECL_ASSEMBLER_NAME_SET_P (decl))
34017 SET_DECL_RTL (decl, NULL);
34019 tree ret = get_identifier (assembler_name);
34020 XDELETEVEC (attr_str);
34021 XDELETEVEC (assembler_name);
34022 return ret;
34026 static tree
34027 ix86_mangle_decl_assembler_name (tree decl, tree id)
34029 /* For function version, add the target suffix to the assembler name. */
34030 if (TREE_CODE (decl) == FUNCTION_DECL
34031 && DECL_FUNCTION_VERSIONED (decl))
34032 id = ix86_mangle_function_version_assembler_name (decl, id);
34033 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
34034 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
34035 #endif
34037 return id;
34040 /* Make a dispatcher declaration for the multi-versioned function DECL.
34041 Calls to DECL function will be replaced with calls to the dispatcher
34042 by the front-end. Returns the decl of the dispatcher function. */
34044 static tree
34045 ix86_get_function_versions_dispatcher (void *decl)
34047 tree fn = (tree) decl;
34048 struct cgraph_node *node = NULL;
34049 struct cgraph_node *default_node = NULL;
34050 struct cgraph_function_version_info *node_v = NULL;
34051 struct cgraph_function_version_info *first_v = NULL;
34053 tree dispatch_decl = NULL;
34055 struct cgraph_function_version_info *default_version_info = NULL;
34057 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
34059 node = cgraph_node::get (fn);
34060 gcc_assert (node != NULL);
34062 node_v = node->function_version ();
34063 gcc_assert (node_v != NULL);
34065 if (node_v->dispatcher_resolver != NULL)
34066 return node_v->dispatcher_resolver;
34068 /* Find the default version and make it the first node. */
34069 first_v = node_v;
34070 /* Go to the beginning of the chain. */
34071 while (first_v->prev != NULL)
34072 first_v = first_v->prev;
34073 default_version_info = first_v;
34074 while (default_version_info != NULL)
34076 if (is_function_default_version
34077 (default_version_info->this_node->decl))
34078 break;
34079 default_version_info = default_version_info->next;
34082 /* If there is no default node, just return NULL. */
34083 if (default_version_info == NULL)
34084 return NULL;
34086 /* Make default info the first node. */
34087 if (first_v != default_version_info)
34089 default_version_info->prev->next = default_version_info->next;
34090 if (default_version_info->next)
34091 default_version_info->next->prev = default_version_info->prev;
34092 first_v->prev = default_version_info;
34093 default_version_info->next = first_v;
34094 default_version_info->prev = NULL;
34097 default_node = default_version_info->this_node;
34099 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
34100 if (targetm.has_ifunc_p ())
34102 struct cgraph_function_version_info *it_v = NULL;
34103 struct cgraph_node *dispatcher_node = NULL;
34104 struct cgraph_function_version_info *dispatcher_version_info = NULL;
34106 /* Right now, the dispatching is done via ifunc. */
34107 dispatch_decl = make_dispatcher_decl (default_node->decl);
34109 dispatcher_node = cgraph_node::get_create (dispatch_decl);
34110 gcc_assert (dispatcher_node != NULL);
34111 dispatcher_node->dispatcher_function = 1;
34112 dispatcher_version_info
34113 = dispatcher_node->insert_new_function_version ();
34114 dispatcher_version_info->next = default_version_info;
34115 dispatcher_node->definition = 1;
34117 /* Set the dispatcher for all the versions. */
34118 it_v = default_version_info;
34119 while (it_v != NULL)
34121 it_v->dispatcher_resolver = dispatch_decl;
34122 it_v = it_v->next;
34125 else
34126 #endif
34128 error_at (DECL_SOURCE_LOCATION (default_node->decl),
34129 "multiversioning needs ifunc which is not supported "
34130 "on this target");
34133 return dispatch_decl;
34136 /* Make the resolver function decl to dispatch the versions of
34137 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
34138 ifunc alias that will point to the created resolver. Create an
34139 empty basic block in the resolver and store the pointer in
34140 EMPTY_BB. Return the decl of the resolver function. */
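/* In rough user-level terms (hypothetical identifiers, not the actual
   trees built), the effect is comparable to:

     static void *foo_resolver (void);
     int foo (void) __attribute__ ((ifunc ("foo_resolver")));

   with the resolver body filled in afterwards via
   dispatch_function_versions.  */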
34142 static tree
34143 make_resolver_func (const tree default_decl,
34144 const tree ifunc_alias_decl,
34145 basic_block *empty_bb)
34147 char *resolver_name;
34148 tree decl, type, decl_name, t;
34150 /* IFUNCs have to be globally visible. So, if the default_decl is
34151 not, then the name of the IFUNC should be made unique. */
34152 if (TREE_PUBLIC (default_decl) == 0)
34154 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
34155 symtab->change_decl_assembler_name (ifunc_alias_decl,
34156 get_identifier (ifunc_name));
34157 XDELETEVEC (ifunc_name);
34160 resolver_name = make_unique_name (default_decl, "resolver", false);
34162 /* The resolver function should return a (void *). */
34163 type = build_function_type_list (ptr_type_node, NULL_TREE);
34165 decl = build_fn_decl (resolver_name, type);
34166 decl_name = get_identifier (resolver_name);
34167 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
34169 DECL_NAME (decl) = decl_name;
34170 TREE_USED (decl) = 1;
34171 DECL_ARTIFICIAL (decl) = 1;
34172 DECL_IGNORED_P (decl) = 1;
34173 TREE_PUBLIC (decl) = 0;
34174 DECL_UNINLINABLE (decl) = 1;
34176 /* Resolver is not external, body is generated. */
34177 DECL_EXTERNAL (decl) = 0;
34178 DECL_EXTERNAL (ifunc_alias_decl) = 0;
34180 DECL_CONTEXT (decl) = NULL_TREE;
34181 DECL_INITIAL (decl) = make_node (BLOCK);
34182 DECL_STATIC_CONSTRUCTOR (decl) = 0;
34184 if (DECL_COMDAT_GROUP (default_decl)
34185 || TREE_PUBLIC (default_decl))
34187 /* In this case, each translation unit with a call to this
34188 versioned function will put out a resolver. Ensure it
34189 is comdat to keep just one copy. */
34190 DECL_COMDAT (decl) = 1;
34191 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
34193 /* Build result decl and add to function_decl. */
34194 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
34195 DECL_ARTIFICIAL (t) = 1;
34196 DECL_IGNORED_P (t) = 1;
34197 DECL_RESULT (decl) = t;
34199 gimplify_function_tree (decl);
34200 push_cfun (DECL_STRUCT_FUNCTION (decl));
34201 *empty_bb = init_lowered_empty_function (decl, false,
34202 profile_count::uninitialized ());
34204 cgraph_node::add_new_function (decl, true);
34205 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
34207 pop_cfun ();
34209 gcc_assert (ifunc_alias_decl != NULL);
34210 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
34211 DECL_ATTRIBUTES (ifunc_alias_decl)
34212 = make_attribute ("ifunc", resolver_name,
34213 DECL_ATTRIBUTES (ifunc_alias_decl));
34215 /* Create the alias for dispatch to resolver here. */
34216 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
34217 XDELETEVEC (resolver_name);
34218 return decl;
34221 /* Generate the dispatching code body to dispatch multi-versioned function
34222 DECL. The target hook is called to process the "target" attributes and
34223 provide the code to dispatch the right function at run-time. NODE points
34224 to the dispatcher decl whose body will be created. */
34226 static tree
34227 ix86_generate_version_dispatcher_body (void *node_p)
34229 tree resolver_decl;
34230 basic_block empty_bb;
34231 tree default_ver_decl;
34232 struct cgraph_node *versn;
34233 struct cgraph_node *node;
34235 struct cgraph_function_version_info *node_version_info = NULL;
34236 struct cgraph_function_version_info *versn_info = NULL;
34238 node = (cgraph_node *)node_p;
34240 node_version_info = node->function_version ();
34241 gcc_assert (node->dispatcher_function
34242 && node_version_info != NULL);
34244 if (node_version_info->dispatcher_resolver)
34245 return node_version_info->dispatcher_resolver;
34247 /* The first version in the chain corresponds to the default version. */
34248 default_ver_decl = node_version_info->next->this_node->decl;
34250 /* node is going to be an alias, so remove the finalized bit. */
34251 node->definition = false;
34253 resolver_decl = make_resolver_func (default_ver_decl,
34254 node->decl, &empty_bb);
34256 node_version_info->dispatcher_resolver = resolver_decl;
34258 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
34260 auto_vec<tree, 2> fn_ver_vec;
34262 for (versn_info = node_version_info->next; versn_info;
34263 versn_info = versn_info->next)
34265 versn = versn_info->this_node;
34266 /* Check for virtual functions here again, as by this time it should
34267 have been determined if this function needs a vtable index or
34268 not. This happens for methods in derived classes that override
34269 virtual methods in base classes but are not explicitly marked as
34270 virtual. */
34271 if (DECL_VINDEX (versn->decl))
34272 sorry ("Virtual function multiversioning not supported");
34274 fn_ver_vec.safe_push (versn->decl);
34277 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
34278 cgraph_edge::rebuild_edges ();
34279 pop_cfun ();
34280 return resolver_decl;
34282 /* This builds the processor_model struct type defined in
34283 libgcc/config/i386/cpuinfo.c */
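/* For reference, the libgcc counterpart is essentially:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   and the RECORD_TYPE built below mirrors that layout field by field.  */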
34285 static tree
34286 build_processor_model_struct (void)
34288 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
34289 "__cpu_features"};
34290 tree field = NULL_TREE, field_chain = NULL_TREE;
34291 int i;
34292 tree type = make_node (RECORD_TYPE);
34294 /* The first 3 fields are unsigned int. */
34295 for (i = 0; i < 3; ++i)
34297 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34298 get_identifier (field_name[i]), unsigned_type_node);
34299 if (field_chain != NULL_TREE)
34300 DECL_CHAIN (field) = field_chain;
34301 field_chain = field;
34304 /* The last field is an array of unsigned integers of size one. */
34305 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34306 get_identifier (field_name[3]),
34307 build_array_type (unsigned_type_node,
34308 build_index_type (size_one_node)));
34309 if (field_chain != NULL_TREE)
34310 DECL_CHAIN (field) = field_chain;
34311 field_chain = field;
34313 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
34314 return type;
34317 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
34319 static tree
34320 make_var_decl (tree type, const char *name)
34322 tree new_decl;
34324 new_decl = build_decl (UNKNOWN_LOCATION,
34325 VAR_DECL,
34326 get_identifier(name),
34327 type);
34329 DECL_EXTERNAL (new_decl) = 1;
34330 TREE_STATIC (new_decl) = 1;
34331 TREE_PUBLIC (new_decl) = 1;
34332 DECL_INITIAL (new_decl) = 0;
34333 DECL_ARTIFICIAL (new_decl) = 0;
34334 DECL_PRESERVE_P (new_decl) = 1;
34336 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
34337 assemble_variable (new_decl, 0, 0, 0);
34339 return new_decl;
34342 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
34343 into an integer defined in libgcc/config/i386/cpuinfo.c */
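/* A sketch of the folds performed below (using the names defined in
   this function):

     __builtin_cpu_is ("amd")        ->  __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("avx2") ->  __cpu_model.__cpu_features[0]
                                         & (1 << F_AVX2)

   with the result converted to int in both cases.  */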
34345 static tree
34346 fold_builtin_cpu (tree fndecl, tree *args)
34348 unsigned int i;
34349 enum ix86_builtins fn_code = (enum ix86_builtins)
34350 DECL_FUNCTION_CODE (fndecl);
34351 tree param_string_cst = NULL;
34353 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
34354 enum processor_features
34356 F_CMOV = 0,
34357 F_MMX,
34358 F_POPCNT,
34359 F_SSE,
34360 F_SSE2,
34361 F_SSE3,
34362 F_SSSE3,
34363 F_SSE4_1,
34364 F_SSE4_2,
34365 F_AVX,
34366 F_AVX2,
34367 F_SSE4_A,
34368 F_FMA4,
34369 F_XOP,
34370 F_FMA,
34371 F_AVX512F,
34372 F_BMI,
34373 F_BMI2,
34374 F_AES,
34375 F_PCLMUL,
34376 F_AVX512VL,
34377 F_AVX512BW,
34378 F_AVX512DQ,
34379 F_AVX512CD,
34380 F_AVX512ER,
34381 F_AVX512PF,
34382 F_AVX512VBMI,
34383 F_AVX512IFMA,
34384 F_AVX5124VNNIW,
34385 F_AVX5124FMAPS,
34386 F_AVX512VPOPCNTDQ,
34387 F_MAX
34390 /* These are the values for vendor types and cpu types and subtypes
34391 in cpuinfo.c. Cpu types and subtypes should have the corresponding
34392 start value subtracted before they are used. */
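/* For example, __builtin_cpu_is ("silvermont") ends up comparing
   __cpu_model.__cpu_type against M_INTEL_SILVERMONT - M_CPU_TYPE_START,
   matching the value cpuinfo.c stores for that CPU type.  */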
34393 enum processor_model
34395 M_INTEL = 1,
34396 M_AMD,
34397 M_CPU_TYPE_START,
34398 M_INTEL_BONNELL,
34399 M_INTEL_CORE2,
34400 M_INTEL_COREI7,
34401 M_AMDFAM10H,
34402 M_AMDFAM15H,
34403 M_INTEL_SILVERMONT,
34404 M_INTEL_KNL,
34405 M_AMD_BTVER1,
34406 M_AMD_BTVER2,
34407 M_CPU_SUBTYPE_START,
34408 M_INTEL_COREI7_NEHALEM,
34409 M_INTEL_COREI7_WESTMERE,
34410 M_INTEL_COREI7_SANDYBRIDGE,
34411 M_AMDFAM10H_BARCELONA,
34412 M_AMDFAM10H_SHANGHAI,
34413 M_AMDFAM10H_ISTANBUL,
34414 M_AMDFAM15H_BDVER1,
34415 M_AMDFAM15H_BDVER2,
34416 M_AMDFAM15H_BDVER3,
34417 M_AMDFAM15H_BDVER4,
34418 M_AMDFAM17H_ZNVER1,
34419 M_INTEL_COREI7_IVYBRIDGE,
34420 M_INTEL_COREI7_HASWELL,
34421 M_INTEL_COREI7_BROADWELL,
34422 M_INTEL_COREI7_SKYLAKE,
34423 M_INTEL_COREI7_SKYLAKE_AVX512
34426 static struct _arch_names_table
34428 const char *const name;
34429 const enum processor_model model;
34431 const arch_names_table[] =
34433 {"amd", M_AMD},
34434 {"intel", M_INTEL},
34435 {"atom", M_INTEL_BONNELL},
34436 {"slm", M_INTEL_SILVERMONT},
34437 {"core2", M_INTEL_CORE2},
34438 {"corei7", M_INTEL_COREI7},
34439 {"nehalem", M_INTEL_COREI7_NEHALEM},
34440 {"westmere", M_INTEL_COREI7_WESTMERE},
34441 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34442 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34443 {"haswell", M_INTEL_COREI7_HASWELL},
34444 {"broadwell", M_INTEL_COREI7_BROADWELL},
34445 {"skylake", M_INTEL_COREI7_SKYLAKE},
34446 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34447 {"bonnell", M_INTEL_BONNELL},
34448 {"silvermont", M_INTEL_SILVERMONT},
34449 {"knl", M_INTEL_KNL},
34450 {"amdfam10h", M_AMDFAM10H},
34451 {"barcelona", M_AMDFAM10H_BARCELONA},
34452 {"shanghai", M_AMDFAM10H_SHANGHAI},
34453 {"istanbul", M_AMDFAM10H_ISTANBUL},
34454 {"btver1", M_AMD_BTVER1},
34455 {"amdfam15h", M_AMDFAM15H},
34456 {"bdver1", M_AMDFAM15H_BDVER1},
34457 {"bdver2", M_AMDFAM15H_BDVER2},
34458 {"bdver3", M_AMDFAM15H_BDVER3},
34459 {"bdver4", M_AMDFAM15H_BDVER4},
34460 {"btver2", M_AMD_BTVER2},
34461 {"znver1", M_AMDFAM17H_ZNVER1},
34464 static struct _isa_names_table
34466 const char *const name;
34467 const enum processor_features feature;
34469 const isa_names_table[] =
34471 {"cmov", F_CMOV},
34472 {"mmx", F_MMX},
34473 {"popcnt", F_POPCNT},
34474 {"sse", F_SSE},
34475 {"sse2", F_SSE2},
34476 {"sse3", F_SSE3},
34477 {"ssse3", F_SSSE3},
34478 {"sse4a", F_SSE4_A},
34479 {"sse4.1", F_SSE4_1},
34480 {"sse4.2", F_SSE4_2},
34481 {"avx", F_AVX},
34482 {"fma4", F_FMA4},
34483 {"xop", F_XOP},
34484 {"fma", F_FMA},
34485 {"avx2", F_AVX2},
34486 {"avx512f", F_AVX512F},
34487 {"bmi", F_BMI},
34488 {"bmi2", F_BMI2},
34489 {"aes", F_AES},
34490 {"pclmul", F_PCLMUL},
34491 {"avx512vl",F_AVX512VL},
34492 {"avx512bw",F_AVX512BW},
34493 {"avx512dq",F_AVX512DQ},
34494 {"avx512cd",F_AVX512CD},
34495 {"avx512er",F_AVX512ER},
34496 {"avx512pf",F_AVX512PF},
34497 {"avx512vbmi",F_AVX512VBMI},
34498 {"avx512ifma",F_AVX512IFMA},
34499 {"avx5124vnniw",F_AVX5124VNNIW},
34500 {"avx5124fmaps",F_AVX5124FMAPS},
34501 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
34504 tree __processor_model_type = build_processor_model_struct ();
34505 tree __cpu_model_var = make_var_decl (__processor_model_type,
34506 "__cpu_model");
34509 varpool_node::add (__cpu_model_var);
34511 gcc_assert ((args != NULL) && (*args != NULL));
34513 param_string_cst = *args;
34514 while (param_string_cst
34515 && TREE_CODE (param_string_cst) != STRING_CST)
34517 /* *args must be an expr that can contain other EXPRS leading to a
34518 STRING_CST. */
34519 if (!EXPR_P (param_string_cst))
34521 error ("Parameter to builtin must be a string constant or literal");
34522 return integer_zero_node;
34524 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34527 gcc_assert (param_string_cst);
34529 if (fn_code == IX86_BUILTIN_CPU_IS)
34531 tree ref;
34532 tree field;
34533 tree final;
34535 unsigned int field_val = 0;
34536 unsigned int NUM_ARCH_NAMES
34537 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34539 for (i = 0; i < NUM_ARCH_NAMES; i++)
34540 if (strcmp (arch_names_table[i].name,
34541 TREE_STRING_POINTER (param_string_cst)) == 0)
34542 break;
34544 if (i == NUM_ARCH_NAMES)
34546 error ("Parameter to builtin not valid: %s",
34547 TREE_STRING_POINTER (param_string_cst));
34548 return integer_zero_node;
34551 field = TYPE_FIELDS (__processor_model_type);
34552 field_val = arch_names_table[i].model;
34554 /* CPU types are stored in the next field. */
34555 if (field_val > M_CPU_TYPE_START
34556 && field_val < M_CPU_SUBTYPE_START)
34558 field = DECL_CHAIN (field);
34559 field_val -= M_CPU_TYPE_START;
34562 /* CPU subtypes are stored in the next field. */
34563 if (field_val > M_CPU_SUBTYPE_START)
34565 field = DECL_CHAIN ( DECL_CHAIN (field));
34566 field_val -= M_CPU_SUBTYPE_START;
34569 /* Get the appropriate field in __cpu_model. */
34570 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34571 field, NULL_TREE);
34573 /* Check the value. */
34574 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34575 build_int_cstu (unsigned_type_node, field_val));
34576 return build1 (CONVERT_EXPR, integer_type_node, final);
34578 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34580 tree ref;
34581 tree array_elt;
34582 tree field;
34583 tree final;
34585 unsigned int field_val = 0;
34586 unsigned int NUM_ISA_NAMES
34587 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34589 for (i = 0; i < NUM_ISA_NAMES; i++)
34590 if (strcmp (isa_names_table[i].name,
34591 TREE_STRING_POINTER (param_string_cst)) == 0)
34592 break;
34594 if (i == NUM_ISA_NAMES)
34596 error ("Parameter to builtin not valid: %s",
34597 TREE_STRING_POINTER (param_string_cst));
34598 return integer_zero_node;
34601 field = TYPE_FIELDS (__processor_model_type);
34602 /* Get the last field, which is __cpu_features. */
34603 while (DECL_CHAIN (field))
34604 field = DECL_CHAIN (field);
34606 /* Get the appropriate field: __cpu_model.__cpu_features */
34607 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34608 field, NULL_TREE);
34610 /* Access the 0th element of __cpu_features array. */
34611 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34612 integer_zero_node, NULL_TREE, NULL_TREE);
34614 field_val = (1 << isa_names_table[i].feature);
34615 /* Return __cpu_model.__cpu_features[0] & field_val */
34616 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34617 build_int_cstu (unsigned_type_node, field_val));
34618 return build1 (CONVERT_EXPR, integer_type_node, final);
34620 gcc_unreachable ();
34623 static tree
34624 ix86_fold_builtin (tree fndecl, int n_args,
34625 tree *args, bool ignore ATTRIBUTE_UNUSED)
34627 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34629 enum ix86_builtins fn_code = (enum ix86_builtins)
34630 DECL_FUNCTION_CODE (fndecl);
34631 switch (fn_code)
34633 case IX86_BUILTIN_CPU_IS:
34634 case IX86_BUILTIN_CPU_SUPPORTS:
34635 gcc_assert (n_args == 1);
34636 return fold_builtin_cpu (fndecl, args);
34638 case IX86_BUILTIN_NANQ:
34639 case IX86_BUILTIN_NANSQ:
34641 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34642 const char *str = c_getstr (*args);
34643 int quiet = fn_code == IX86_BUILTIN_NANQ;
34644 REAL_VALUE_TYPE real;
34646 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34647 return build_real (type, real);
34648 return NULL_TREE;
34651 case IX86_BUILTIN_INFQ:
34652 case IX86_BUILTIN_HUGE_VALQ:
34654 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34655 REAL_VALUE_TYPE inf;
34656 real_inf (&inf);
34657 return build_real (type, inf);
34660 case IX86_BUILTIN_TZCNT16:
34661 case IX86_BUILTIN_CTZS:
34662 case IX86_BUILTIN_TZCNT32:
34663 case IX86_BUILTIN_TZCNT64:
34664 gcc_assert (n_args == 1);
34665 if (TREE_CODE (args[0]) == INTEGER_CST)
34667 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34668 tree arg = args[0];
34669 if (fn_code == IX86_BUILTIN_TZCNT16
34670 || fn_code == IX86_BUILTIN_CTZS)
34671 arg = fold_convert (short_unsigned_type_node, arg);
34672 if (integer_zerop (arg))
34673 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34674 else
34675 return fold_const_call (CFN_CTZ, type, arg);
34677 break;
34679 case IX86_BUILTIN_LZCNT16:
34680 case IX86_BUILTIN_CLZS:
34681 case IX86_BUILTIN_LZCNT32:
34682 case IX86_BUILTIN_LZCNT64:
34683 gcc_assert (n_args == 1);
34684 if (TREE_CODE (args[0]) == INTEGER_CST)
34686 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34687 tree arg = args[0];
34688 if (fn_code == IX86_BUILTIN_LZCNT16
34689 || fn_code == IX86_BUILTIN_CLZS)
34690 arg = fold_convert (short_unsigned_type_node, arg);
34691 if (integer_zerop (arg))
34692 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34693 else
34694 return fold_const_call (CFN_CLZ, type, arg);
34696 break;
34698 case IX86_BUILTIN_BEXTR32:
34699 case IX86_BUILTIN_BEXTR64:
34700 case IX86_BUILTIN_BEXTRI32:
34701 case IX86_BUILTIN_BEXTRI64:
34702 gcc_assert (n_args == 2);
34703 if (tree_fits_uhwi_p (args[1]))
34705 unsigned HOST_WIDE_INT res = 0;
34706 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34707 unsigned int start = tree_to_uhwi (args[1]);
34708 unsigned int len = (start & 0xff00) >> 8;
34709 start &= 0xff;
34710 if (start >= prec || len == 0)
34711 res = 0;
34712 else if (!tree_fits_uhwi_p (args[0]))
34713 break;
34714 else
34715 res = tree_to_uhwi (args[0]) >> start;
34716 if (len > prec)
34717 len = prec;
34718 if (len < HOST_BITS_PER_WIDE_INT)
34719 res &= (HOST_WIDE_INT_1U << len) - 1;
34720 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34722 break;
34724 case IX86_BUILTIN_BZHI32:
34725 case IX86_BUILTIN_BZHI64:
34726 gcc_assert (n_args == 2);
34727 if (tree_fits_uhwi_p (args[1]))
34729 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34730 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34731 return args[0];
34732 if (!tree_fits_uhwi_p (args[0]))
34733 break;
34734 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34735 res &= ~(HOST_WIDE_INT_M1U << idx);
34736 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34738 break;
34740 case IX86_BUILTIN_PDEP32:
34741 case IX86_BUILTIN_PDEP64:
34742 gcc_assert (n_args == 2);
34743 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34745 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34746 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34747 unsigned HOST_WIDE_INT res = 0;
34748 unsigned HOST_WIDE_INT m, k = 1;
34749 for (m = 1; m; m <<= 1)
34750 if ((mask & m) != 0)
34752 if ((src & k) != 0)
34753 res |= m;
34754 k <<= 1;
34756 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34758 break;
34760 case IX86_BUILTIN_PEXT32:
34761 case IX86_BUILTIN_PEXT64:
34762 gcc_assert (n_args == 2);
34763 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34765 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34766 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34767 unsigned HOST_WIDE_INT res = 0;
34768 unsigned HOST_WIDE_INT m, k = 1;
34769 for (m = 1; m; m <<= 1)
34770 if ((mask & m) != 0)
34772 if ((src & m) != 0)
34773 res |= k;
34774 k <<= 1;
34776 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34778 break;
34780 default:
34781 break;
34785 #ifdef SUBTARGET_FOLD_BUILTIN
34786 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34787 #endif
34789 return NULL_TREE;
34792 /* Fold a MD builtin (use ix86_fold_builtin for folding into
34793 constant) in GIMPLE. */
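/* As one illustrative case of the folds below: a call such as

     unsigned int n = __builtin_ia32_tzcnt_u32 (x);

   where X can be proven non-zero is rewritten into the generic
   __builtin_ctz (x), which the middle-end optimizes better; the other
   cases are similar identity and constant-argument simplifications.  */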
34795 bool
34796 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34798 gimple *stmt = gsi_stmt (*gsi);
34799 tree fndecl = gimple_call_fndecl (stmt);
34800 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34801 int n_args = gimple_call_num_args (stmt);
34802 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34803 tree decl = NULL_TREE;
34804 tree arg0, arg1;
34806 switch (fn_code)
34808 case IX86_BUILTIN_TZCNT32:
34809 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34810 goto fold_tzcnt_lzcnt;
34812 case IX86_BUILTIN_TZCNT64:
34813 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34814 goto fold_tzcnt_lzcnt;
34816 case IX86_BUILTIN_LZCNT32:
34817 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34818 goto fold_tzcnt_lzcnt;
34820 case IX86_BUILTIN_LZCNT64:
34821 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34822 goto fold_tzcnt_lzcnt;
34824 fold_tzcnt_lzcnt:
34825 gcc_assert (n_args == 1);
34826 arg0 = gimple_call_arg (stmt, 0);
34827 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34829 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34830 /* If arg0 is provably non-zero, optimize into the generic
34831 __builtin_c[tl]z{,ll} functions, which the middle-end handles
34832 better. */
34833 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34834 return false;
34836 location_t loc = gimple_location (stmt);
34837 gimple *g = gimple_build_call (decl, 1, arg0);
34838 gimple_set_location (g, loc);
34839 tree lhs = make_ssa_name (integer_type_node);
34840 gimple_call_set_lhs (g, lhs);
34841 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34842 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34843 gimple_set_location (g, loc);
34844 gsi_replace (gsi, g, false);
34845 return true;
34847 break;
34849 case IX86_BUILTIN_BZHI32:
34850 case IX86_BUILTIN_BZHI64:
34851 gcc_assert (n_args == 2);
34852 arg1 = gimple_call_arg (stmt, 1);
34853 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34855 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34856 arg0 = gimple_call_arg (stmt, 0);
34857 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34858 break;
34859 location_t loc = gimple_location (stmt);
34860 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34861 gimple_set_location (g, loc);
34862 gsi_replace (gsi, g, false);
34863 return true;
34865 break;
34867 case IX86_BUILTIN_PDEP32:
34868 case IX86_BUILTIN_PDEP64:
34869 case IX86_BUILTIN_PEXT32:
34870 case IX86_BUILTIN_PEXT64:
34871 gcc_assert (n_args == 2);
34872 arg1 = gimple_call_arg (stmt, 1);
34873 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34875 location_t loc = gimple_location (stmt);
34876 arg0 = gimple_call_arg (stmt, 0);
34877 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34878 gimple_set_location (g, loc);
34879 gsi_replace (gsi, g, false);
34880 return true;
34882 break;
34884 default:
34885 break;
34888 return false;
34891 /* Make builtins to detect cpu type and features supported. NAME is
34892 the builtin name, CODE is the builtin code, and FTYPE is the function
34893 type of the builtin. */
34895 static void
34896 make_cpu_type_builtin (const char* name, int code,
34897 enum ix86_builtin_func_type ftype, bool is_const)
34899 tree decl;
34900 tree type;
34902 type = ix86_get_builtin_func_type (ftype);
34903 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
34904 NULL, NULL_TREE);
34905 gcc_assert (decl != NULL_TREE);
34906 ix86_builtins[(int) code] = decl;
34907 TREE_READONLY (decl) = is_const;
34910 /* Make builtins to get CPU type and features supported. The created
34911 builtins are:
34913 __builtin_cpu_init (), to detect cpu type and features,
34914 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
34915 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
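/* A typical use from user code might look like this (illustrative only):

     int
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("haswell") || __builtin_cpu_supports ("avx2"))
         return 1;
       return 0;
     }

   Both __builtin_cpu_is and __builtin_cpu_supports return an int.  */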
34918 static void
34919 ix86_init_platform_type_builtins (void)
34921 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
34922 INT_FTYPE_VOID, false);
34923 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
34924 INT_FTYPE_PCCHAR, true);
34925 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
34926 INT_FTYPE_PCCHAR, true);
34929 /* Internal method for ix86_init_builtins. */
34931 static void
34932 ix86_init_builtins_va_builtins_abi (void)
34934 tree ms_va_ref, sysv_va_ref;
34935 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
34936 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
34937 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
34938 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
34940 if (!TARGET_64BIT)
34941 return;
34942 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
34943 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
34944 ms_va_ref = build_reference_type (ms_va_list_type_node);
34945 sysv_va_ref =
34946 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
34948 fnvoid_va_end_ms =
34949 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34950 fnvoid_va_start_ms =
34951 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34952 fnvoid_va_end_sysv =
34953 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
34954 fnvoid_va_start_sysv =
34955 build_varargs_function_type_list (void_type_node, sysv_va_ref,
34956 NULL_TREE);
34957 fnvoid_va_copy_ms =
34958 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
34959 NULL_TREE);
34960 fnvoid_va_copy_sysv =
34961 build_function_type_list (void_type_node, sysv_va_ref,
34962 sysv_va_ref, NULL_TREE);
34964 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
34965 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
34966 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
34967 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
34968 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
34969 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
34970 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
34971 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34972 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
34973 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34974 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34975 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34978 static void
34979 ix86_init_builtin_types (void)
34981 tree float80_type_node, const_string_type_node;
34983 /* The __float80 type. */
34984 float80_type_node = long_double_type_node;
34985 if (TYPE_MODE (float80_type_node) != XFmode)
34987 if (float64x_type_node != NULL_TREE
34988 && TYPE_MODE (float64x_type_node) == XFmode)
34989 float80_type_node = float64x_type_node;
34990 else
34992 /* The __float80 type. */
34993 float80_type_node = make_node (REAL_TYPE);
34995 TYPE_PRECISION (float80_type_node) = 80;
34996 layout_type (float80_type_node);
34999 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
35001 /* The __float128 type. The node has already been created as
35002 _Float128, so we only need to register the __float128 name for
35003 it. */
35004 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
35006 const_string_type_node
35007 = build_pointer_type (build_qualified_type
35008 (char_type_node, TYPE_QUAL_CONST));
35010 /* This macro is built by i386-builtin-types.awk. */
35011 DEFINE_BUILTIN_PRIMITIVE_TYPES;
35014 static void
35015 ix86_init_builtins (void)
35017 tree ftype, decl;
35019 ix86_init_builtin_types ();
35021 /* Builtins to get CPU type and features. */
35022 ix86_init_platform_type_builtins ();
35024 /* TFmode support builtins. */
35025 def_builtin_const (0, "__builtin_infq",
35026 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
35027 def_builtin_const (0, "__builtin_huge_valq",
35028 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
35030 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
35031 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
35032 BUILT_IN_MD, "nanq", NULL_TREE);
35033 TREE_READONLY (decl) = 1;
35034 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
35036 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
35037 BUILT_IN_MD, "nansq", NULL_TREE);
35038 TREE_READONLY (decl) = 1;
35039 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
35041 /* We will expand them to a normal call if SSE isn't available since
35042 they are used by libgcc. */
35043 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
35044 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
35045 BUILT_IN_MD, "__fabstf2", NULL_TREE);
35046 TREE_READONLY (decl) = 1;
35047 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
35049 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
35050 decl = add_builtin_function ("__builtin_copysignq", ftype,
35051 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
35052 "__copysigntf3", NULL_TREE);
35053 TREE_READONLY (decl) = 1;
35054 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
35056 ix86_init_tm_builtins ();
35057 ix86_init_mmx_sse_builtins ();
35058 ix86_init_mpx_builtins ();
35060 if (TARGET_LP64)
35061 ix86_init_builtins_va_builtins_abi ();
35063 #ifdef SUBTARGET_INIT_BUILTINS
35064 SUBTARGET_INIT_BUILTINS;
35065 #endif
35068 /* Return the ix86 builtin for CODE. */
35070 static tree
35071 ix86_builtin_decl (unsigned code, bool)
35073 if (code >= IX86_BUILTIN_MAX)
35074 return error_mark_node;
35076 return ix86_builtins[code];
35079 /* Errors in the source file can cause expand_expr to return const0_rtx
35080 where we expect a vector. To avoid crashing, use one of the vector
35081 clear instructions. */
35082 static rtx
35083 safe_vector_operand (rtx x, machine_mode mode)
35085 if (x == const0_rtx)
35086 x = CONST0_RTX (mode);
35087 return x;
35090 /* Fixup modeless constants to fit required mode. */
35091 static rtx
35092 fixup_modeless_constant (rtx x, machine_mode mode)
35094 if (GET_MODE (x) == VOIDmode)
35095 x = convert_to_mode (mode, x, 1);
35096 return x;
35099 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
35101 static rtx
35102 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
35104 rtx pat;
35105 tree arg0 = CALL_EXPR_ARG (exp, 0);
35106 tree arg1 = CALL_EXPR_ARG (exp, 1);
35107 rtx op0 = expand_normal (arg0);
35108 rtx op1 = expand_normal (arg1);
35109 machine_mode tmode = insn_data[icode].operand[0].mode;
35110 machine_mode mode0 = insn_data[icode].operand[1].mode;
35111 machine_mode mode1 = insn_data[icode].operand[2].mode;
35113 if (VECTOR_MODE_P (mode0))
35114 op0 = safe_vector_operand (op0, mode0);
35115 if (VECTOR_MODE_P (mode1))
35116 op1 = safe_vector_operand (op1, mode1);
35118 if (optimize || !target
35119 || GET_MODE (target) != tmode
35120 || !insn_data[icode].operand[0].predicate (target, tmode))
35121 target = gen_reg_rtx (tmode);
35123 if (GET_MODE (op1) == SImode && mode1 == TImode)
35125 rtx x = gen_reg_rtx (V4SImode);
35126 emit_insn (gen_sse2_loadd (x, op1));
35127 op1 = gen_lowpart (TImode, x);
35130 if (!insn_data[icode].operand[1].predicate (op0, mode0))
35131 op0 = copy_to_mode_reg (mode0, op0);
35132 if (!insn_data[icode].operand[2].predicate (op1, mode1))
35133 op1 = copy_to_mode_reg (mode1, op1);
35135 pat = GEN_FCN (icode) (target, op0, op1);
35136 if (! pat)
35137 return 0;
35139 emit_insn (pat);
35141 return target;
35144 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
35146 static rtx
35147 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
35148 enum ix86_builtin_func_type m_type,
35149 enum rtx_code sub_code)
35151 rtx pat;
35152 int i;
35153 int nargs;
35154 bool comparison_p = false;
35155 bool tf_p = false;
35156 bool last_arg_constant = false;
35157 int num_memory = 0;
35158 struct {
35159 rtx op;
35160 machine_mode mode;
35161 } args[4];
35163 machine_mode tmode = insn_data[icode].operand[0].mode;
35165 switch (m_type)
35167 case MULTI_ARG_4_DF2_DI_I:
35168 case MULTI_ARG_4_DF2_DI_I1:
35169 case MULTI_ARG_4_SF2_SI_I:
35170 case MULTI_ARG_4_SF2_SI_I1:
35171 nargs = 4;
35172 last_arg_constant = true;
35173 break;
35175 case MULTI_ARG_3_SF:
35176 case MULTI_ARG_3_DF:
35177 case MULTI_ARG_3_SF2:
35178 case MULTI_ARG_3_DF2:
35179 case MULTI_ARG_3_DI:
35180 case MULTI_ARG_3_SI:
35181 case MULTI_ARG_3_SI_DI:
35182 case MULTI_ARG_3_HI:
35183 case MULTI_ARG_3_HI_SI:
35184 case MULTI_ARG_3_QI:
35185 case MULTI_ARG_3_DI2:
35186 case MULTI_ARG_3_SI2:
35187 case MULTI_ARG_3_HI2:
35188 case MULTI_ARG_3_QI2:
35189 nargs = 3;
35190 break;
35192 case MULTI_ARG_2_SF:
35193 case MULTI_ARG_2_DF:
35194 case MULTI_ARG_2_DI:
35195 case MULTI_ARG_2_SI:
35196 case MULTI_ARG_2_HI:
35197 case MULTI_ARG_2_QI:
35198 nargs = 2;
35199 break;
35201 case MULTI_ARG_2_DI_IMM:
35202 case MULTI_ARG_2_SI_IMM:
35203 case MULTI_ARG_2_HI_IMM:
35204 case MULTI_ARG_2_QI_IMM:
35205 nargs = 2;
35206 last_arg_constant = true;
35207 break;
35209 case MULTI_ARG_1_SF:
35210 case MULTI_ARG_1_DF:
35211 case MULTI_ARG_1_SF2:
35212 case MULTI_ARG_1_DF2:
35213 case MULTI_ARG_1_DI:
35214 case MULTI_ARG_1_SI:
35215 case MULTI_ARG_1_HI:
35216 case MULTI_ARG_1_QI:
35217 case MULTI_ARG_1_SI_DI:
35218 case MULTI_ARG_1_HI_DI:
35219 case MULTI_ARG_1_HI_SI:
35220 case MULTI_ARG_1_QI_DI:
35221 case MULTI_ARG_1_QI_SI:
35222 case MULTI_ARG_1_QI_HI:
35223 nargs = 1;
35224 break;
35226 case MULTI_ARG_2_DI_CMP:
35227 case MULTI_ARG_2_SI_CMP:
35228 case MULTI_ARG_2_HI_CMP:
35229 case MULTI_ARG_2_QI_CMP:
35230 nargs = 2;
35231 comparison_p = true;
35232 break;
35234 case MULTI_ARG_2_SF_TF:
35235 case MULTI_ARG_2_DF_TF:
35236 case MULTI_ARG_2_DI_TF:
35237 case MULTI_ARG_2_SI_TF:
35238 case MULTI_ARG_2_HI_TF:
35239 case MULTI_ARG_2_QI_TF:
35240 nargs = 2;
35241 tf_p = true;
35242 break;
35244 default:
35245 gcc_unreachable ();
35248 if (optimize || !target
35249 || GET_MODE (target) != tmode
35250 || !insn_data[icode].operand[0].predicate (target, tmode))
35251 target = gen_reg_rtx (tmode);
35252 else if (memory_operand (target, tmode))
35253 num_memory++;
35255 gcc_assert (nargs <= 4);
35257 for (i = 0; i < nargs; i++)
35259 tree arg = CALL_EXPR_ARG (exp, i);
35260 rtx op = expand_normal (arg);
35261 int adjust = (comparison_p) ? 1 : 0;
35262 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
35264 if (last_arg_constant && i == nargs - 1)
35266 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
35268 enum insn_code new_icode = icode;
35269 switch (icode)
35271 case CODE_FOR_xop_vpermil2v2df3:
35272 case CODE_FOR_xop_vpermil2v4sf3:
35273 case CODE_FOR_xop_vpermil2v4df3:
35274 case CODE_FOR_xop_vpermil2v8sf3:
35275 error ("the last argument must be a 2-bit immediate");
35276 return gen_reg_rtx (tmode);
35277 case CODE_FOR_xop_rotlv2di3:
35278 new_icode = CODE_FOR_rotlv2di3;
35279 goto xop_rotl;
35280 case CODE_FOR_xop_rotlv4si3:
35281 new_icode = CODE_FOR_rotlv4si3;
35282 goto xop_rotl;
35283 case CODE_FOR_xop_rotlv8hi3:
35284 new_icode = CODE_FOR_rotlv8hi3;
35285 goto xop_rotl;
35286 case CODE_FOR_xop_rotlv16qi3:
35287 new_icode = CODE_FOR_rotlv16qi3;
35288 xop_rotl:
35289 if (CONST_INT_P (op))
35291 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
35292 op = GEN_INT (INTVAL (op) & mask);
35293 gcc_checking_assert
35294 (insn_data[icode].operand[i + 1].predicate (op, mode));
35296 else
35298 gcc_checking_assert
35299 (nargs == 2
35300 && insn_data[new_icode].operand[0].mode == tmode
35301 && insn_data[new_icode].operand[1].mode == tmode
35302 && insn_data[new_icode].operand[2].mode == mode
35303 && insn_data[new_icode].operand[0].predicate
35304 == insn_data[icode].operand[0].predicate
35305 && insn_data[new_icode].operand[1].predicate
35306 == insn_data[icode].operand[1].predicate);
35307 icode = new_icode;
35308 goto non_constant;
35310 break;
35311 default:
35312 gcc_unreachable ();
35316 else
35318 non_constant:
35319 if (VECTOR_MODE_P (mode))
35320 op = safe_vector_operand (op, mode);
35322 /* If we aren't optimizing, only allow one memory operand to be
35323 generated. */
35324 if (memory_operand (op, mode))
35325 num_memory++;
35327 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
35329 if (optimize
35330 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
35331 || num_memory > 1)
35332 op = force_reg (mode, op);
35335 args[i].op = op;
35336 args[i].mode = mode;
35339 switch (nargs)
35341 case 1:
35342 pat = GEN_FCN (icode) (target, args[0].op);
35343 break;
35345 case 2:
35346 if (tf_p)
35347 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35348 GEN_INT ((int)sub_code));
35349 else if (! comparison_p)
35350 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35351 else
35353 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35354 args[0].op,
35355 args[1].op);
35357 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35359 break;
35361 case 3:
35362 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35363 break;
35365 case 4:
35366 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35367 break;
35369 default:
35370 gcc_unreachable ();
35373 if (! pat)
35374 return 0;
35376 emit_insn (pat);
35377 return target;
35380 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35381 insns with vec_merge. */
35383 static rtx
35384 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35385 rtx target)
35387 rtx pat;
35388 tree arg0 = CALL_EXPR_ARG (exp, 0);
35389 rtx op1, op0 = expand_normal (arg0);
35390 machine_mode tmode = insn_data[icode].operand[0].mode;
35391 machine_mode mode0 = insn_data[icode].operand[1].mode;
35393 if (optimize || !target
35394 || GET_MODE (target) != tmode
35395 || !insn_data[icode].operand[0].predicate (target, tmode))
35396 target = gen_reg_rtx (tmode);
35398 if (VECTOR_MODE_P (mode0))
35399 op0 = safe_vector_operand (op0, mode0);
35401 if ((optimize && !register_operand (op0, mode0))
35402 || !insn_data[icode].operand[1].predicate (op0, mode0))
35403 op0 = copy_to_mode_reg (mode0, op0);
35405 op1 = op0;
35406 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35407 op1 = copy_to_mode_reg (mode0, op1);
35409 pat = GEN_FCN (icode) (target, op0, op1);
35410 if (! pat)
35411 return 0;
35412 emit_insn (pat);
35413 return target;
35416 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35418 static rtx
35419 ix86_expand_sse_compare (const struct builtin_description *d,
35420 tree exp, rtx target, bool swap)
35422 rtx pat;
35423 tree arg0 = CALL_EXPR_ARG (exp, 0);
35424 tree arg1 = CALL_EXPR_ARG (exp, 1);
35425 rtx op0 = expand_normal (arg0);
35426 rtx op1 = expand_normal (arg1);
35427 rtx op2;
35428 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35429 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35430 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35431 enum rtx_code comparison = d->comparison;
35433 if (VECTOR_MODE_P (mode0))
35434 op0 = safe_vector_operand (op0, mode0);
35435 if (VECTOR_MODE_P (mode1))
35436 op1 = safe_vector_operand (op1, mode1);
35438 /* Swap operands if we have a comparison that isn't available in
35439 hardware. */
35440 if (swap)
35441 std::swap (op0, op1);
35443 if (optimize || !target
35444 || GET_MODE (target) != tmode
35445 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35446 target = gen_reg_rtx (tmode);
35448 if ((optimize && !register_operand (op0, mode0))
35449 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35450 op0 = copy_to_mode_reg (mode0, op0);
35451 if ((optimize && !register_operand (op1, mode1))
35452 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35453 op1 = copy_to_mode_reg (mode1, op1);
35455 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35456 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35457 if (! pat)
35458 return 0;
35459 emit_insn (pat);
35460 return target;
35463 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35465 static rtx
35466 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35467 rtx target)
35469 rtx pat;
35470 tree arg0 = CALL_EXPR_ARG (exp, 0);
35471 tree arg1 = CALL_EXPR_ARG (exp, 1);
35472 rtx op0 = expand_normal (arg0);
35473 rtx op1 = expand_normal (arg1);
35474 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35475 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35476 enum rtx_code comparison = d->comparison;
35478 if (VECTOR_MODE_P (mode0))
35479 op0 = safe_vector_operand (op0, mode0);
35480 if (VECTOR_MODE_P (mode1))
35481 op1 = safe_vector_operand (op1, mode1);
35483 /* Swap operands if we have a comparison that isn't available in
35484 hardware. */
35485 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35486 std::swap (op0, op1);
35488 target = gen_reg_rtx (SImode);
35489 emit_move_insn (target, const0_rtx);
35490 target = gen_rtx_SUBREG (QImode, target, 0);
35492 if ((optimize && !register_operand (op0, mode0))
35493 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35494 op0 = copy_to_mode_reg (mode0, op0);
35495 if ((optimize && !register_operand (op1, mode1))
35496 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35497 op1 = copy_to_mode_reg (mode1, op1);
35499 pat = GEN_FCN (d->icode) (op0, op1);
35500 if (! pat)
35501 return 0;
35502 emit_insn (pat);
35503 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35504 gen_rtx_fmt_ee (comparison, QImode,
35505 SET_DEST (pat),
35506 const0_rtx)));
35508 return SUBREG_REG (target);
35511 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35513 static rtx
35514 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35515 rtx target)
35517 rtx pat;
35518 tree arg0 = CALL_EXPR_ARG (exp, 0);
35519 rtx op1, op0 = expand_normal (arg0);
35520 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35521 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35523 if (optimize || target == 0
35524 || GET_MODE (target) != tmode
35525 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35526 target = gen_reg_rtx (tmode);
35528 if (VECTOR_MODE_P (mode0))
35529 op0 = safe_vector_operand (op0, mode0);
35531 if ((optimize && !register_operand (op0, mode0))
35532 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35533 op0 = copy_to_mode_reg (mode0, op0);
35535 op1 = GEN_INT (d->comparison);
35537 pat = GEN_FCN (d->icode) (target, op0, op1);
35538 if (! pat)
35539 return 0;
35540 emit_insn (pat);
35541 return target;
35544 static rtx
35545 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35546 tree exp, rtx target)
35548 rtx pat;
35549 tree arg0 = CALL_EXPR_ARG (exp, 0);
35550 tree arg1 = CALL_EXPR_ARG (exp, 1);
35551 rtx op0 = expand_normal (arg0);
35552 rtx op1 = expand_normal (arg1);
35553 rtx op2;
35554 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35555 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35556 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35558 if (optimize || target == 0
35559 || GET_MODE (target) != tmode
35560 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35561 target = gen_reg_rtx (tmode);
35563 op0 = safe_vector_operand (op0, mode0);
35564 op1 = safe_vector_operand (op1, mode1);
35566 if ((optimize && !register_operand (op0, mode0))
35567 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35568 op0 = copy_to_mode_reg (mode0, op0);
35569 if ((optimize && !register_operand (op1, mode1))
35570 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35571 op1 = copy_to_mode_reg (mode1, op1);
35573 op2 = GEN_INT (d->comparison);
35575 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35576 if (! pat)
35577 return 0;
35578 emit_insn (pat);
35579 return target;
35582 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35584 static rtx
35585 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35586 rtx target)
35588 rtx pat;
35589 tree arg0 = CALL_EXPR_ARG (exp, 0);
35590 tree arg1 = CALL_EXPR_ARG (exp, 1);
35591 rtx op0 = expand_normal (arg0);
35592 rtx op1 = expand_normal (arg1);
35593 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35594 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35595 enum rtx_code comparison = d->comparison;
35597 if (VECTOR_MODE_P (mode0))
35598 op0 = safe_vector_operand (op0, mode0);
35599 if (VECTOR_MODE_P (mode1))
35600 op1 = safe_vector_operand (op1, mode1);
35602 target = gen_reg_rtx (SImode);
35603 emit_move_insn (target, const0_rtx);
35604 target = gen_rtx_SUBREG (QImode, target, 0);
35606 if ((optimize && !register_operand (op0, mode0))
35607 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35608 op0 = copy_to_mode_reg (mode0, op0);
35609 if ((optimize && !register_operand (op1, mode1))
35610 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35611 op1 = copy_to_mode_reg (mode1, op1);
35613 pat = GEN_FCN (d->icode) (op0, op1);
35614 if (! pat)
35615 return 0;
35616 emit_insn (pat);
35617 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35618 gen_rtx_fmt_ee (comparison, QImode,
35619 SET_DEST (pat),
35620 const0_rtx)));
35622 return SUBREG_REG (target);
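/* Editorial sketch (illustrative only): the ptest expander serves
   intrinsics such as _mm_testz_si128 from <smmintrin.h>; the CC flags
   set by PTEST are turned into an int result through the same
   STRICT_LOW_PART idiom as the comi case.

     #include <smmintrin.h>

     int
     all_zero (__m128i a, __m128i mask)
     {
       return _mm_testz_si128 (a, mask);
     }
*/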
35625 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35627 static rtx
35628 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35629 tree exp, rtx target)
35631 rtx pat;
35632 tree arg0 = CALL_EXPR_ARG (exp, 0);
35633 tree arg1 = CALL_EXPR_ARG (exp, 1);
35634 tree arg2 = CALL_EXPR_ARG (exp, 2);
35635 tree arg3 = CALL_EXPR_ARG (exp, 3);
35636 tree arg4 = CALL_EXPR_ARG (exp, 4);
35637 rtx scratch0, scratch1;
35638 rtx op0 = expand_normal (arg0);
35639 rtx op1 = expand_normal (arg1);
35640 rtx op2 = expand_normal (arg2);
35641 rtx op3 = expand_normal (arg3);
35642 rtx op4 = expand_normal (arg4);
35643 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35645 tmode0 = insn_data[d->icode].operand[0].mode;
35646 tmode1 = insn_data[d->icode].operand[1].mode;
35647 modev2 = insn_data[d->icode].operand[2].mode;
35648 modei3 = insn_data[d->icode].operand[3].mode;
35649 modev4 = insn_data[d->icode].operand[4].mode;
35650 modei5 = insn_data[d->icode].operand[5].mode;
35651 modeimm = insn_data[d->icode].operand[6].mode;
35653 if (VECTOR_MODE_P (modev2))
35654 op0 = safe_vector_operand (op0, modev2);
35655 if (VECTOR_MODE_P (modev4))
35656 op2 = safe_vector_operand (op2, modev4);
35658 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35659 op0 = copy_to_mode_reg (modev2, op0);
35660 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35661 op1 = copy_to_mode_reg (modei3, op1);
35662 if ((optimize && !register_operand (op2, modev4))
35663 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35664 op2 = copy_to_mode_reg (modev4, op2);
35665 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35666 op3 = copy_to_mode_reg (modei5, op3);
35668 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35670 error ("the fifth argument must be an 8-bit immediate");
35671 return const0_rtx;
35674 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35676 if (optimize || !target
35677 || GET_MODE (target) != tmode0
35678 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35679 target = gen_reg_rtx (tmode0);
35681 scratch1 = gen_reg_rtx (tmode1);
35683 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35685 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35687 if (optimize || !target
35688 || GET_MODE (target) != tmode1
35689 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35690 target = gen_reg_rtx (tmode1);
35692 scratch0 = gen_reg_rtx (tmode0);
35694 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35696 else
35698 gcc_assert (d->flag);
35700 scratch0 = gen_reg_rtx (tmode0);
35701 scratch1 = gen_reg_rtx (tmode1);
35703 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35706 if (! pat)
35707 return 0;
35709 emit_insn (pat);
35711 if (d->flag)
35713 target = gen_reg_rtx (SImode);
35714 emit_move_insn (target, const0_rtx);
35715 target = gen_rtx_SUBREG (QImode, target, 0);
35717 emit_insn
35718 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35719 gen_rtx_fmt_ee (EQ, QImode,
35720 gen_rtx_REG ((machine_mode) d->flag,
35721 FLAGS_REG),
35722 const0_rtx)));
35723 return SUBREG_REG (target);
35725 else
35726 return target;
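/* Editorial sketch (illustrative only): __builtin_ia32_pcmpestri128,
   used by _mm_cmpestri from <nmmintrin.h>, is expanded here; the
   control word is the fifth argument and must be a compile-time
   constant, matching the diagnostic above.

     #include <nmmintrin.h>

     int
     find (__m128i needle, int ln, __m128i hay, int lh)
     {
       return _mm_cmpestri (needle, ln, hay, lh,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED);
     }
*/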
35730 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
35732 static rtx
35733 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35734 tree exp, rtx target)
35736 rtx pat;
35737 tree arg0 = CALL_EXPR_ARG (exp, 0);
35738 tree arg1 = CALL_EXPR_ARG (exp, 1);
35739 tree arg2 = CALL_EXPR_ARG (exp, 2);
35740 rtx scratch0, scratch1;
35741 rtx op0 = expand_normal (arg0);
35742 rtx op1 = expand_normal (arg1);
35743 rtx op2 = expand_normal (arg2);
35744 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35746 tmode0 = insn_data[d->icode].operand[0].mode;
35747 tmode1 = insn_data[d->icode].operand[1].mode;
35748 modev2 = insn_data[d->icode].operand[2].mode;
35749 modev3 = insn_data[d->icode].operand[3].mode;
35750 modeimm = insn_data[d->icode].operand[4].mode;
35752 if (VECTOR_MODE_P (modev2))
35753 op0 = safe_vector_operand (op0, modev2);
35754 if (VECTOR_MODE_P (modev3))
35755 op1 = safe_vector_operand (op1, modev3);
35757 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35758 op0 = copy_to_mode_reg (modev2, op0);
35759 if ((optimize && !register_operand (op1, modev3))
35760 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35761 op1 = copy_to_mode_reg (modev3, op1);
35763 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35765 error ("the third argument must be an 8-bit immediate");
35766 return const0_rtx;
35769 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35771 if (optimize || !target
35772 || GET_MODE (target) != tmode0
35773 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35774 target = gen_reg_rtx (tmode0);
35776 scratch1 = gen_reg_rtx (tmode1);
35778 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35780 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35782 if (optimize || !target
35783 || GET_MODE (target) != tmode1
35784 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35785 target = gen_reg_rtx (tmode1);
35787 scratch0 = gen_reg_rtx (tmode0);
35789 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35791 else
35793 gcc_assert (d->flag);
35795 scratch0 = gen_reg_rtx (tmode0);
35796 scratch1 = gen_reg_rtx (tmode1);
35798 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35801 if (! pat)
35802 return 0;
35804 emit_insn (pat);
35806 if (d->flag)
35808 target = gen_reg_rtx (SImode);
35809 emit_move_insn (target, const0_rtx);
35810 target = gen_rtx_SUBREG (QImode, target, 0);
35812 emit_insn
35813 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35814 gen_rtx_fmt_ee (EQ, QImode,
35815 gen_rtx_REG ((machine_mode) d->flag,
35816 FLAGS_REG),
35817 const0_rtx)));
35818 return SUBREG_REG (target);
35820 else
35821 return target;
35824 /* Subroutine of ix86_expand_builtin to take care of insns with
35825 a variable number of operands. */
35827 static rtx
35828 ix86_expand_args_builtin (const struct builtin_description *d,
35829 tree exp, rtx target)
35831 rtx pat, real_target;
35832 unsigned int i, nargs;
35833 unsigned int nargs_constant = 0;
35834 unsigned int mask_pos = 0;
35835 int num_memory = 0;
35836 struct
35838 rtx op;
35839 machine_mode mode;
35840 } args[6];
35841 bool second_arg_count = false;
35842 enum insn_code icode = d->icode;
35843 const struct insn_data_d *insn_p = &insn_data[icode];
35844 machine_mode tmode = insn_p->operand[0].mode;
35845 machine_mode rmode = VOIDmode;
35846 bool swap = false;
35847 enum rtx_code comparison = d->comparison;
35849 switch ((enum ix86_builtin_func_type) d->flag)
35851 case V2DF_FTYPE_V2DF_ROUND:
35852 case V4DF_FTYPE_V4DF_ROUND:
35853 case V8DF_FTYPE_V8DF_ROUND:
35854 case V4SF_FTYPE_V4SF_ROUND:
35855 case V8SF_FTYPE_V8SF_ROUND:
35856 case V16SF_FTYPE_V16SF_ROUND:
35857 case V4SI_FTYPE_V4SF_ROUND:
35858 case V8SI_FTYPE_V8SF_ROUND:
35859 case V16SI_FTYPE_V16SF_ROUND:
35860 return ix86_expand_sse_round (d, exp, target);
35861 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35862 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35863 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35864 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35865 case INT_FTYPE_V8SF_V8SF_PTEST:
35866 case INT_FTYPE_V4DI_V4DI_PTEST:
35867 case INT_FTYPE_V4DF_V4DF_PTEST:
35868 case INT_FTYPE_V4SF_V4SF_PTEST:
35869 case INT_FTYPE_V2DI_V2DI_PTEST:
35870 case INT_FTYPE_V2DF_V2DF_PTEST:
35871 return ix86_expand_sse_ptest (d, exp, target);
35872 case FLOAT128_FTYPE_FLOAT128:
35873 case FLOAT_FTYPE_FLOAT:
35874 case INT_FTYPE_INT:
35875 case UINT_FTYPE_UINT:
35876 case UINT16_FTYPE_UINT16:
35877 case UINT64_FTYPE_INT:
35878 case UINT64_FTYPE_UINT64:
35879 case INT64_FTYPE_INT64:
35880 case INT64_FTYPE_V4SF:
35881 case INT64_FTYPE_V2DF:
35882 case INT_FTYPE_V16QI:
35883 case INT_FTYPE_V8QI:
35884 case INT_FTYPE_V8SF:
35885 case INT_FTYPE_V4DF:
35886 case INT_FTYPE_V4SF:
35887 case INT_FTYPE_V2DF:
35888 case INT_FTYPE_V32QI:
35889 case V16QI_FTYPE_V16QI:
35890 case V8SI_FTYPE_V8SF:
35891 case V8SI_FTYPE_V4SI:
35892 case V8HI_FTYPE_V8HI:
35893 case V8HI_FTYPE_V16QI:
35894 case V8QI_FTYPE_V8QI:
35895 case V8SF_FTYPE_V8SF:
35896 case V8SF_FTYPE_V8SI:
35897 case V8SF_FTYPE_V4SF:
35898 case V8SF_FTYPE_V8HI:
35899 case V4SI_FTYPE_V4SI:
35900 case V4SI_FTYPE_V16QI:
35901 case V4SI_FTYPE_V4SF:
35902 case V4SI_FTYPE_V8SI:
35903 case V4SI_FTYPE_V8HI:
35904 case V4SI_FTYPE_V4DF:
35905 case V4SI_FTYPE_V2DF:
35906 case V4HI_FTYPE_V4HI:
35907 case V4DF_FTYPE_V4DF:
35908 case V4DF_FTYPE_V4SI:
35909 case V4DF_FTYPE_V4SF:
35910 case V4DF_FTYPE_V2DF:
35911 case V4SF_FTYPE_V4SF:
35912 case V4SF_FTYPE_V4SI:
35913 case V4SF_FTYPE_V8SF:
35914 case V4SF_FTYPE_V4DF:
35915 case V4SF_FTYPE_V8HI:
35916 case V4SF_FTYPE_V2DF:
35917 case V2DI_FTYPE_V2DI:
35918 case V2DI_FTYPE_V16QI:
35919 case V2DI_FTYPE_V8HI:
35920 case V2DI_FTYPE_V4SI:
35921 case V2DF_FTYPE_V2DF:
35922 case V2DF_FTYPE_V4SI:
35923 case V2DF_FTYPE_V4DF:
35924 case V2DF_FTYPE_V4SF:
35925 case V2DF_FTYPE_V2SI:
35926 case V2SI_FTYPE_V2SI:
35927 case V2SI_FTYPE_V4SF:
35928 case V2SI_FTYPE_V2SF:
35929 case V2SI_FTYPE_V2DF:
35930 case V2SF_FTYPE_V2SF:
35931 case V2SF_FTYPE_V2SI:
35932 case V32QI_FTYPE_V32QI:
35933 case V32QI_FTYPE_V16QI:
35934 case V16HI_FTYPE_V16HI:
35935 case V16HI_FTYPE_V8HI:
35936 case V8SI_FTYPE_V8SI:
35937 case V16HI_FTYPE_V16QI:
35938 case V8SI_FTYPE_V16QI:
35939 case V4DI_FTYPE_V16QI:
35940 case V8SI_FTYPE_V8HI:
35941 case V4DI_FTYPE_V8HI:
35942 case V4DI_FTYPE_V4SI:
35943 case V4DI_FTYPE_V2DI:
35944 case UQI_FTYPE_UQI:
35945 case UHI_FTYPE_UHI:
35946 case USI_FTYPE_USI:
35947 case USI_FTYPE_UQI:
35948 case USI_FTYPE_UHI:
35949 case UDI_FTYPE_UDI:
35950 case UHI_FTYPE_V16QI:
35951 case USI_FTYPE_V32QI:
35952 case UDI_FTYPE_V64QI:
35953 case V16QI_FTYPE_UHI:
35954 case V32QI_FTYPE_USI:
35955 case V64QI_FTYPE_UDI:
35956 case V8HI_FTYPE_UQI:
35957 case V16HI_FTYPE_UHI:
35958 case V32HI_FTYPE_USI:
35959 case V4SI_FTYPE_UQI:
35960 case V8SI_FTYPE_UQI:
35961 case V4SI_FTYPE_UHI:
35962 case V8SI_FTYPE_UHI:
35963 case UQI_FTYPE_V8HI:
35964 case UHI_FTYPE_V16HI:
35965 case USI_FTYPE_V32HI:
35966 case UQI_FTYPE_V4SI:
35967 case UQI_FTYPE_V8SI:
35968 case UHI_FTYPE_V16SI:
35969 case UQI_FTYPE_V2DI:
35970 case UQI_FTYPE_V4DI:
35971 case UQI_FTYPE_V8DI:
35972 case V16SI_FTYPE_UHI:
35973 case V2DI_FTYPE_UQI:
35974 case V4DI_FTYPE_UQI:
35975 case V16SI_FTYPE_INT:
35976 case V16SF_FTYPE_V8SF:
35977 case V16SI_FTYPE_V8SI:
35978 case V16SF_FTYPE_V4SF:
35979 case V16SI_FTYPE_V4SI:
35980 case V16SI_FTYPE_V16SF:
35981 case V16SI_FTYPE_V16SI:
35982 case V16SF_FTYPE_V16SF:
35983 case V8DI_FTYPE_UQI:
35984 case V8DI_FTYPE_V8DI:
35985 case V8DF_FTYPE_V4DF:
35986 case V8DF_FTYPE_V2DF:
35987 case V8DF_FTYPE_V8DF:
35988 nargs = 1;
35989 break;
35990 case V4SF_FTYPE_V4SF_VEC_MERGE:
35991 case V2DF_FTYPE_V2DF_VEC_MERGE:
35992 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35993 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35994 case V16QI_FTYPE_V16QI_V16QI:
35995 case V16QI_FTYPE_V8HI_V8HI:
35996 case V16SF_FTYPE_V16SF_V16SF:
35997 case V8QI_FTYPE_V8QI_V8QI:
35998 case V8QI_FTYPE_V4HI_V4HI:
35999 case V8HI_FTYPE_V8HI_V8HI:
36000 case V8HI_FTYPE_V16QI_V16QI:
36001 case V8HI_FTYPE_V4SI_V4SI:
36002 case V8SF_FTYPE_V8SF_V8SF:
36003 case V8SF_FTYPE_V8SF_V8SI:
36004 case V8DF_FTYPE_V8DF_V8DF:
36005 case V4SI_FTYPE_V4SI_V4SI:
36006 case V4SI_FTYPE_V8HI_V8HI:
36007 case V4SI_FTYPE_V2DF_V2DF:
36008 case V4HI_FTYPE_V4HI_V4HI:
36009 case V4HI_FTYPE_V8QI_V8QI:
36010 case V4HI_FTYPE_V2SI_V2SI:
36011 case V4DF_FTYPE_V4DF_V4DF:
36012 case V4DF_FTYPE_V4DF_V4DI:
36013 case V4SF_FTYPE_V4SF_V4SF:
36014 case V4SF_FTYPE_V4SF_V4SI:
36015 case V4SF_FTYPE_V4SF_V2SI:
36016 case V4SF_FTYPE_V4SF_V2DF:
36017 case V4SF_FTYPE_V4SF_UINT:
36018 case V4SF_FTYPE_V4SF_DI:
36019 case V4SF_FTYPE_V4SF_SI:
36020 case V2DI_FTYPE_V2DI_V2DI:
36021 case V2DI_FTYPE_V16QI_V16QI:
36022 case V2DI_FTYPE_V4SI_V4SI:
36023 case V2DI_FTYPE_V2DI_V16QI:
36024 case V2SI_FTYPE_V2SI_V2SI:
36025 case V2SI_FTYPE_V4HI_V4HI:
36026 case V2SI_FTYPE_V2SF_V2SF:
36027 case V2DF_FTYPE_V2DF_V2DF:
36028 case V2DF_FTYPE_V2DF_V4SF:
36029 case V2DF_FTYPE_V2DF_V2DI:
36030 case V2DF_FTYPE_V2DF_DI:
36031 case V2DF_FTYPE_V2DF_SI:
36032 case V2DF_FTYPE_V2DF_UINT:
36033 case V2SF_FTYPE_V2SF_V2SF:
36034 case V1DI_FTYPE_V1DI_V1DI:
36035 case V1DI_FTYPE_V8QI_V8QI:
36036 case V1DI_FTYPE_V2SI_V2SI:
36037 case V32QI_FTYPE_V16HI_V16HI:
36038 case V16HI_FTYPE_V8SI_V8SI:
36039 case V32QI_FTYPE_V32QI_V32QI:
36040 case V16HI_FTYPE_V32QI_V32QI:
36041 case V16HI_FTYPE_V16HI_V16HI:
36042 case V8SI_FTYPE_V4DF_V4DF:
36043 case V8SI_FTYPE_V8SI_V8SI:
36044 case V8SI_FTYPE_V16HI_V16HI:
36045 case V4DI_FTYPE_V4DI_V4DI:
36046 case V4DI_FTYPE_V8SI_V8SI:
36047 case V8DI_FTYPE_V64QI_V64QI:
36048 if (comparison == UNKNOWN)
36049 return ix86_expand_binop_builtin (icode, exp, target);
36050 nargs = 2;
36051 break;
36052 case V4SF_FTYPE_V4SF_V4SF_SWAP:
36053 case V2DF_FTYPE_V2DF_V2DF_SWAP:
36054 gcc_assert (comparison != UNKNOWN);
36055 nargs = 2;
36056 swap = true;
36057 break;
36058 case V16HI_FTYPE_V16HI_V8HI_COUNT:
36059 case V16HI_FTYPE_V16HI_SI_COUNT:
36060 case V8SI_FTYPE_V8SI_V4SI_COUNT:
36061 case V8SI_FTYPE_V8SI_SI_COUNT:
36062 case V4DI_FTYPE_V4DI_V2DI_COUNT:
36063 case V4DI_FTYPE_V4DI_INT_COUNT:
36064 case V8HI_FTYPE_V8HI_V8HI_COUNT:
36065 case V8HI_FTYPE_V8HI_SI_COUNT:
36066 case V4SI_FTYPE_V4SI_V4SI_COUNT:
36067 case V4SI_FTYPE_V4SI_SI_COUNT:
36068 case V4HI_FTYPE_V4HI_V4HI_COUNT:
36069 case V4HI_FTYPE_V4HI_SI_COUNT:
36070 case V2DI_FTYPE_V2DI_V2DI_COUNT:
36071 case V2DI_FTYPE_V2DI_SI_COUNT:
36072 case V2SI_FTYPE_V2SI_V2SI_COUNT:
36073 case V2SI_FTYPE_V2SI_SI_COUNT:
36074 case V1DI_FTYPE_V1DI_V1DI_COUNT:
36075 case V1DI_FTYPE_V1DI_SI_COUNT:
36076 nargs = 2;
36077 second_arg_count = true;
36078 break;
36079 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
36080 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
36081 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
36082 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
36083 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
36084 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
36085 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
36086 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
36087 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
36088 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
36089 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
36090 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
36091 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
36092 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
36093 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
36094 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
36095 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
36096 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
36097 nargs = 4;
36098 second_arg_count = true;
36099 break;
36100 case UINT64_FTYPE_UINT64_UINT64:
36101 case UINT_FTYPE_UINT_UINT:
36102 case UINT_FTYPE_UINT_USHORT:
36103 case UINT_FTYPE_UINT_UCHAR:
36104 case UINT16_FTYPE_UINT16_INT:
36105 case UINT8_FTYPE_UINT8_INT:
36106 case UQI_FTYPE_UQI_UQI:
36107 case UHI_FTYPE_UHI_UHI:
36108 case USI_FTYPE_USI_USI:
36109 case UDI_FTYPE_UDI_UDI:
36110 case V16SI_FTYPE_V8DF_V8DF:
36111 nargs = 2;
36112 break;
36113 case V2DI_FTYPE_V2DI_INT_CONVERT:
36114 nargs = 2;
36115 rmode = V1TImode;
36116 nargs_constant = 1;
36117 break;
36118 case V4DI_FTYPE_V4DI_INT_CONVERT:
36119 nargs = 2;
36120 rmode = V2TImode;
36121 nargs_constant = 1;
36122 break;
36123 case V8DI_FTYPE_V8DI_INT_CONVERT:
36124 nargs = 2;
36125 rmode = V4TImode;
36126 nargs_constant = 1;
36127 break;
36128 case V8HI_FTYPE_V8HI_INT:
36129 case V8HI_FTYPE_V8SF_INT:
36130 case V16HI_FTYPE_V16SF_INT:
36131 case V8HI_FTYPE_V4SF_INT:
36132 case V8SF_FTYPE_V8SF_INT:
36133 case V4SF_FTYPE_V16SF_INT:
36134 case V16SF_FTYPE_V16SF_INT:
36135 case V4SI_FTYPE_V4SI_INT:
36136 case V4SI_FTYPE_V8SI_INT:
36137 case V4HI_FTYPE_V4HI_INT:
36138 case V4DF_FTYPE_V4DF_INT:
36139 case V4DF_FTYPE_V8DF_INT:
36140 case V4SF_FTYPE_V4SF_INT:
36141 case V4SF_FTYPE_V8SF_INT:
36142 case V2DI_FTYPE_V2DI_INT:
36143 case V2DF_FTYPE_V2DF_INT:
36144 case V2DF_FTYPE_V4DF_INT:
36145 case V16HI_FTYPE_V16HI_INT:
36146 case V8SI_FTYPE_V8SI_INT:
36147 case V16SI_FTYPE_V16SI_INT:
36148 case V4SI_FTYPE_V16SI_INT:
36149 case V4DI_FTYPE_V4DI_INT:
36150 case V2DI_FTYPE_V4DI_INT:
36151 case V4DI_FTYPE_V8DI_INT:
36152 case QI_FTYPE_V4SF_INT:
36153 case QI_FTYPE_V2DF_INT:
36154 case UQI_FTYPE_UQI_UQI_CONST:
36155 case UHI_FTYPE_UHI_UQI:
36156 case USI_FTYPE_USI_UQI:
36157 case UDI_FTYPE_UDI_UQI:
36158 nargs = 2;
36159 nargs_constant = 1;
36160 break;
36161 case V16QI_FTYPE_V16QI_V16QI_V16QI:
36162 case V8SF_FTYPE_V8SF_V8SF_V8SF:
36163 case V4DF_FTYPE_V4DF_V4DF_V4DF:
36164 case V4SF_FTYPE_V4SF_V4SF_V4SF:
36165 case V2DF_FTYPE_V2DF_V2DF_V2DF:
36166 case V32QI_FTYPE_V32QI_V32QI_V32QI:
36167 case UHI_FTYPE_V16SI_V16SI_UHI:
36168 case UQI_FTYPE_V8DI_V8DI_UQI:
36169 case V16HI_FTYPE_V16SI_V16HI_UHI:
36170 case V16QI_FTYPE_V16SI_V16QI_UHI:
36171 case V16QI_FTYPE_V8DI_V16QI_UQI:
36172 case V16SF_FTYPE_V16SF_V16SF_UHI:
36173 case V16SF_FTYPE_V4SF_V16SF_UHI:
36174 case V16SI_FTYPE_SI_V16SI_UHI:
36175 case V16SI_FTYPE_V16HI_V16SI_UHI:
36176 case V16SI_FTYPE_V16QI_V16SI_UHI:
36177 case V8SF_FTYPE_V4SF_V8SF_UQI:
36178 case V4DF_FTYPE_V2DF_V4DF_UQI:
36179 case V8SI_FTYPE_V4SI_V8SI_UQI:
36180 case V8SI_FTYPE_SI_V8SI_UQI:
36181 case V4SI_FTYPE_V4SI_V4SI_UQI:
36182 case V4SI_FTYPE_SI_V4SI_UQI:
36183 case V4DI_FTYPE_V2DI_V4DI_UQI:
36184 case V4DI_FTYPE_DI_V4DI_UQI:
36185 case V2DI_FTYPE_V2DI_V2DI_UQI:
36186 case V2DI_FTYPE_DI_V2DI_UQI:
36187 case V64QI_FTYPE_V64QI_V64QI_UDI:
36188 case V64QI_FTYPE_V16QI_V64QI_UDI:
36189 case V64QI_FTYPE_QI_V64QI_UDI:
36190 case V32QI_FTYPE_V32QI_V32QI_USI:
36191 case V32QI_FTYPE_V16QI_V32QI_USI:
36192 case V32QI_FTYPE_QI_V32QI_USI:
36193 case V16QI_FTYPE_V16QI_V16QI_UHI:
36194 case V16QI_FTYPE_QI_V16QI_UHI:
36195 case V32HI_FTYPE_V8HI_V32HI_USI:
36196 case V32HI_FTYPE_HI_V32HI_USI:
36197 case V16HI_FTYPE_V8HI_V16HI_UHI:
36198 case V16HI_FTYPE_HI_V16HI_UHI:
36199 case V8HI_FTYPE_V8HI_V8HI_UQI:
36200 case V8HI_FTYPE_HI_V8HI_UQI:
36201 case V8SF_FTYPE_V8HI_V8SF_UQI:
36202 case V4SF_FTYPE_V8HI_V4SF_UQI:
36203 case V8SI_FTYPE_V8SF_V8SI_UQI:
36204 case V4SI_FTYPE_V4SF_V4SI_UQI:
36205 case V4DI_FTYPE_V4SF_V4DI_UQI:
36206 case V2DI_FTYPE_V4SF_V2DI_UQI:
36207 case V4SF_FTYPE_V4DI_V4SF_UQI:
36208 case V4SF_FTYPE_V2DI_V4SF_UQI:
36209 case V4DF_FTYPE_V4DI_V4DF_UQI:
36210 case V2DF_FTYPE_V2DI_V2DF_UQI:
36211 case V16QI_FTYPE_V8HI_V16QI_UQI:
36212 case V16QI_FTYPE_V16HI_V16QI_UHI:
36213 case V16QI_FTYPE_V4SI_V16QI_UQI:
36214 case V16QI_FTYPE_V8SI_V16QI_UQI:
36215 case V8HI_FTYPE_V4SI_V8HI_UQI:
36216 case V8HI_FTYPE_V8SI_V8HI_UQI:
36217 case V16QI_FTYPE_V2DI_V16QI_UQI:
36218 case V16QI_FTYPE_V4DI_V16QI_UQI:
36219 case V8HI_FTYPE_V2DI_V8HI_UQI:
36220 case V8HI_FTYPE_V4DI_V8HI_UQI:
36221 case V4SI_FTYPE_V2DI_V4SI_UQI:
36222 case V4SI_FTYPE_V4DI_V4SI_UQI:
36223 case V32QI_FTYPE_V32HI_V32QI_USI:
36224 case UHI_FTYPE_V16QI_V16QI_UHI:
36225 case USI_FTYPE_V32QI_V32QI_USI:
36226 case UDI_FTYPE_V64QI_V64QI_UDI:
36227 case UQI_FTYPE_V8HI_V8HI_UQI:
36228 case UHI_FTYPE_V16HI_V16HI_UHI:
36229 case USI_FTYPE_V32HI_V32HI_USI:
36230 case UQI_FTYPE_V4SI_V4SI_UQI:
36231 case UQI_FTYPE_V8SI_V8SI_UQI:
36232 case UQI_FTYPE_V2DI_V2DI_UQI:
36233 case UQI_FTYPE_V4DI_V4DI_UQI:
36234 case V4SF_FTYPE_V2DF_V4SF_UQI:
36235 case V4SF_FTYPE_V4DF_V4SF_UQI:
36236 case V16SI_FTYPE_V16SI_V16SI_UHI:
36237 case V16SI_FTYPE_V4SI_V16SI_UHI:
36238 case V2DI_FTYPE_V4SI_V2DI_UQI:
36239 case V2DI_FTYPE_V8HI_V2DI_UQI:
36240 case V2DI_FTYPE_V16QI_V2DI_UQI:
36241 case V4DI_FTYPE_V4DI_V4DI_UQI:
36242 case V4DI_FTYPE_V4SI_V4DI_UQI:
36243 case V4DI_FTYPE_V8HI_V4DI_UQI:
36244 case V4DI_FTYPE_V16QI_V4DI_UQI:
36245 case V4DI_FTYPE_V4DF_V4DI_UQI:
36246 case V2DI_FTYPE_V2DF_V2DI_UQI:
36247 case V4SI_FTYPE_V4DF_V4SI_UQI:
36248 case V4SI_FTYPE_V2DF_V4SI_UQI:
36249 case V4SI_FTYPE_V8HI_V4SI_UQI:
36250 case V4SI_FTYPE_V16QI_V4SI_UQI:
36251 case V4DI_FTYPE_V4DI_V4DI_V4DI:
36252 case V8DF_FTYPE_V2DF_V8DF_UQI:
36253 case V8DF_FTYPE_V4DF_V8DF_UQI:
36254 case V8DF_FTYPE_V8DF_V8DF_UQI:
36255 case V8SF_FTYPE_V8SF_V8SF_UQI:
36256 case V8SF_FTYPE_V8SI_V8SF_UQI:
36257 case V4DF_FTYPE_V4DF_V4DF_UQI:
36258 case V4SF_FTYPE_V4SF_V4SF_UQI:
36259 case V2DF_FTYPE_V2DF_V2DF_UQI:
36260 case V2DF_FTYPE_V4SF_V2DF_UQI:
36261 case V2DF_FTYPE_V4SI_V2DF_UQI:
36262 case V4SF_FTYPE_V4SI_V4SF_UQI:
36263 case V4DF_FTYPE_V4SF_V4DF_UQI:
36264 case V4DF_FTYPE_V4SI_V4DF_UQI:
36265 case V8SI_FTYPE_V8SI_V8SI_UQI:
36266 case V8SI_FTYPE_V8HI_V8SI_UQI:
36267 case V8SI_FTYPE_V16QI_V8SI_UQI:
36268 case V8DF_FTYPE_V8SI_V8DF_UQI:
36269 case V8DI_FTYPE_DI_V8DI_UQI:
36270 case V16SF_FTYPE_V8SF_V16SF_UHI:
36271 case V16SI_FTYPE_V8SI_V16SI_UHI:
36272 case V16HI_FTYPE_V16HI_V16HI_UHI:
36273 case V8HI_FTYPE_V16QI_V8HI_UQI:
36274 case V16HI_FTYPE_V16QI_V16HI_UHI:
36275 case V32HI_FTYPE_V32HI_V32HI_USI:
36276 case V32HI_FTYPE_V32QI_V32HI_USI:
36277 case V8DI_FTYPE_V16QI_V8DI_UQI:
36278 case V8DI_FTYPE_V2DI_V8DI_UQI:
36279 case V8DI_FTYPE_V4DI_V8DI_UQI:
36280 case V8DI_FTYPE_V8DI_V8DI_UQI:
36281 case V8DI_FTYPE_V8HI_V8DI_UQI:
36282 case V8DI_FTYPE_V8SI_V8DI_UQI:
36283 case V8HI_FTYPE_V8DI_V8HI_UQI:
36284 case V8SI_FTYPE_V8DI_V8SI_UQI:
36285 case V4SI_FTYPE_V4SI_V4SI_V4SI:
36286 nargs = 3;
36287 break;
36288 case V32QI_FTYPE_V32QI_V32QI_INT:
36289 case V16HI_FTYPE_V16HI_V16HI_INT:
36290 case V16QI_FTYPE_V16QI_V16QI_INT:
36291 case V4DI_FTYPE_V4DI_V4DI_INT:
36292 case V8HI_FTYPE_V8HI_V8HI_INT:
36293 case V8SI_FTYPE_V8SI_V8SI_INT:
36294 case V8SI_FTYPE_V8SI_V4SI_INT:
36295 case V8SF_FTYPE_V8SF_V8SF_INT:
36296 case V8SF_FTYPE_V8SF_V4SF_INT:
36297 case V4SI_FTYPE_V4SI_V4SI_INT:
36298 case V4DF_FTYPE_V4DF_V4DF_INT:
36299 case V16SF_FTYPE_V16SF_V16SF_INT:
36300 case V16SF_FTYPE_V16SF_V4SF_INT:
36301 case V16SI_FTYPE_V16SI_V4SI_INT:
36302 case V4DF_FTYPE_V4DF_V2DF_INT:
36303 case V4SF_FTYPE_V4SF_V4SF_INT:
36304 case V2DI_FTYPE_V2DI_V2DI_INT:
36305 case V4DI_FTYPE_V4DI_V2DI_INT:
36306 case V2DF_FTYPE_V2DF_V2DF_INT:
36307 case UQI_FTYPE_V8DI_V8UDI_INT:
36308 case UQI_FTYPE_V8DF_V8DF_INT:
36309 case UQI_FTYPE_V2DF_V2DF_INT:
36310 case UQI_FTYPE_V4SF_V4SF_INT:
36311 case UHI_FTYPE_V16SI_V16SI_INT:
36312 case UHI_FTYPE_V16SF_V16SF_INT:
36313 nargs = 3;
36314 nargs_constant = 1;
36315 break;
36316 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
36317 nargs = 3;
36318 rmode = V4DImode;
36319 nargs_constant = 1;
36320 break;
36321 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
36322 nargs = 3;
36323 rmode = V2DImode;
36324 nargs_constant = 1;
36325 break;
36326 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
36327 nargs = 3;
36328 rmode = DImode;
36329 nargs_constant = 1;
36330 break;
36331 case V2DI_FTYPE_V2DI_UINT_UINT:
36332 nargs = 3;
36333 nargs_constant = 2;
36334 break;
36335 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
36336 nargs = 3;
36337 rmode = V8DImode;
36338 nargs_constant = 1;
36339 break;
36340 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
36341 nargs = 5;
36342 rmode = V8DImode;
36343 mask_pos = 2;
36344 nargs_constant = 1;
36345 break;
36346 case QI_FTYPE_V8DF_INT_UQI:
36347 case QI_FTYPE_V4DF_INT_UQI:
36348 case QI_FTYPE_V2DF_INT_UQI:
36349 case HI_FTYPE_V16SF_INT_UHI:
36350 case QI_FTYPE_V8SF_INT_UQI:
36351 case QI_FTYPE_V4SF_INT_UQI:
36352 nargs = 3;
36353 mask_pos = 1;
36354 nargs_constant = 1;
36355 break;
36356 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36357 nargs = 5;
36358 rmode = V4DImode;
36359 mask_pos = 2;
36360 nargs_constant = 1;
36361 break;
36362 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36363 nargs = 5;
36364 rmode = V2DImode;
36365 mask_pos = 2;
36366 nargs_constant = 1;
36367 break;
36368 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36369 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36370 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36371 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36372 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36373 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36374 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36375 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36376 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36377 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36378 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36379 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36380 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36381 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36382 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36383 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36384 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36385 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36386 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36387 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36388 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36389 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36390 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36391 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36392 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36393 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36394 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36395 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36396 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36397 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36398 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36399 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36400 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36401 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36402 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36403 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36404 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36405 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36406 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36407 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36408 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36409 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36410 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36411 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36412 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36413 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36414 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36415 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36416 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36417 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36418 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36419 nargs = 4;
36420 break;
36421 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36422 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36423 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36424 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36425 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36426 nargs = 4;
36427 nargs_constant = 1;
36428 break;
36429 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36430 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36431 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36432 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36433 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36434 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36435 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36436 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36437 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36438 case USI_FTYPE_V32QI_V32QI_INT_USI:
36439 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36440 case USI_FTYPE_V32HI_V32HI_INT_USI:
36441 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36442 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36443 nargs = 4;
36444 mask_pos = 1;
36445 nargs_constant = 1;
36446 break;
36447 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36448 nargs = 4;
36449 nargs_constant = 2;
36450 break;
36451 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36452 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36453 nargs = 4;
36454 break;
36455 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36456 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36457 mask_pos = 1;
36458 nargs = 4;
36459 nargs_constant = 1;
36460 break;
36461 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36462 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36463 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36464 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36465 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36466 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36467 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36468 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36469 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36470 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36471 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36472 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36473 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36474 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36475 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36476 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36477 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36478 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36479 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36480 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36481 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36482 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36483 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36484 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36485 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36486 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36487 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36488 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36489 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36490 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36491 nargs = 4;
36492 mask_pos = 2;
36493 nargs_constant = 1;
36494 break;
36495 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36496 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36497 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36498 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36499 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36500 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36501 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36502 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36503 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36504 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36505 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36506 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36507 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36508 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36509 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36510 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36511 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36512 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36513 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36514 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36515 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36516 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36517 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36518 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36519 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36520 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36521 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36522 nargs = 5;
36523 mask_pos = 2;
36524 nargs_constant = 1;
36525 break;
36526 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36527 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36528 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36529 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36530 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36531 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36532 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36533 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36534 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36535 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36536 nargs = 5;
36537 mask_pos = 1;
36538 nargs_constant = 1;
36539 break;
36541 default:
36542 gcc_unreachable ();
36545 gcc_assert (nargs <= ARRAY_SIZE (args));
36547 if (comparison != UNKNOWN)
36549 gcc_assert (nargs == 2);
36550 return ix86_expand_sse_compare (d, exp, target, swap);
36553 if (rmode == VOIDmode || rmode == tmode)
36555 if (optimize
36556 || target == 0
36557 || GET_MODE (target) != tmode
36558 || !insn_p->operand[0].predicate (target, tmode))
36559 target = gen_reg_rtx (tmode);
36560 else if (memory_operand (target, tmode))
36561 num_memory++;
36562 real_target = target;
36564 else
36566 real_target = gen_reg_rtx (tmode);
36567 target = lowpart_subreg (rmode, real_target, tmode);
36570 for (i = 0; i < nargs; i++)
36572 tree arg = CALL_EXPR_ARG (exp, i);
36573 rtx op = expand_normal (arg);
36574 machine_mode mode = insn_p->operand[i + 1].mode;
36575 bool match = insn_p->operand[i + 1].predicate (op, mode);
36577 if (second_arg_count && i == 1)
36579 /* SIMD shift insns take either an 8-bit immediate or a
36580 register as the count, but the builtin functions take an
36581 int; if the count does not match, put it in a register.
36582 The instructions use a 64-bit count: if op is only
36583 32-bit, zero-extend it, since negative shift counts
36584 are undefined behavior and zero extension is more
36585 efficient. */
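/* Editorial illustration of the conversion below (helper name made up):

     #include <emmintrin.h>

     __m128i
     shl (__m128i x, int n)
     {
       return _mm_slli_epi32 (x, n);
     }

   With a non-constant n, the int count fails the predicate and is
   zero-extended into a register of the mode the shift pattern wants.  */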
36586 if (!match)
36588 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36589 op = convert_modes (mode, GET_MODE (op), op, 1);
36590 else
36591 op = lowpart_subreg (mode, op, GET_MODE (op));
36592 if (!insn_p->operand[i + 1].predicate (op, mode))
36593 op = copy_to_reg (op);
36596 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36597 || (!mask_pos && (nargs - i) <= nargs_constant))
36599 if (!match)
36600 switch (icode)
36602 case CODE_FOR_avx_vinsertf128v4di:
36603 case CODE_FOR_avx_vextractf128v4di:
36604 error ("the last argument must be a 1-bit immediate");
36605 return const0_rtx;
36607 case CODE_FOR_avx512f_cmpv8di3_mask:
36608 case CODE_FOR_avx512f_cmpv16si3_mask:
36609 case CODE_FOR_avx512f_ucmpv8di3_mask:
36610 case CODE_FOR_avx512f_ucmpv16si3_mask:
36611 case CODE_FOR_avx512vl_cmpv4di3_mask:
36612 case CODE_FOR_avx512vl_cmpv8si3_mask:
36613 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36614 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36615 case CODE_FOR_avx512vl_cmpv2di3_mask:
36616 case CODE_FOR_avx512vl_cmpv4si3_mask:
36617 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36618 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36619 error ("the last argument must be a 3-bit immediate");
36620 return const0_rtx;
36622 case CODE_FOR_sse4_1_roundsd:
36623 case CODE_FOR_sse4_1_roundss:
36625 case CODE_FOR_sse4_1_roundpd:
36626 case CODE_FOR_sse4_1_roundps:
36627 case CODE_FOR_avx_roundpd256:
36628 case CODE_FOR_avx_roundps256:
36630 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36631 case CODE_FOR_sse4_1_roundps_sfix:
36632 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36633 case CODE_FOR_avx_roundps_sfix256:
36635 case CODE_FOR_sse4_1_blendps:
36636 case CODE_FOR_avx_blendpd256:
36637 case CODE_FOR_avx_vpermilv4df:
36638 case CODE_FOR_avx_vpermilv4df_mask:
36639 case CODE_FOR_avx512f_getmantv8df_mask:
36640 case CODE_FOR_avx512f_getmantv16sf_mask:
36641 case CODE_FOR_avx512vl_getmantv8sf_mask:
36642 case CODE_FOR_avx512vl_getmantv4df_mask:
36643 case CODE_FOR_avx512vl_getmantv4sf_mask:
36644 case CODE_FOR_avx512vl_getmantv2df_mask:
36645 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36646 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36647 case CODE_FOR_avx512dq_rangepv4df_mask:
36648 case CODE_FOR_avx512dq_rangepv8sf_mask:
36649 case CODE_FOR_avx512dq_rangepv2df_mask:
36650 case CODE_FOR_avx512dq_rangepv4sf_mask:
36651 case CODE_FOR_avx_shufpd256_mask:
36652 error ("the last argument must be a 4-bit immediate");
36653 return const0_rtx;
36655 case CODE_FOR_sha1rnds4:
36656 case CODE_FOR_sse4_1_blendpd:
36657 case CODE_FOR_avx_vpermilv2df:
36658 case CODE_FOR_avx_vpermilv2df_mask:
36659 case CODE_FOR_xop_vpermil2v2df3:
36660 case CODE_FOR_xop_vpermil2v4sf3:
36661 case CODE_FOR_xop_vpermil2v4df3:
36662 case CODE_FOR_xop_vpermil2v8sf3:
36663 case CODE_FOR_avx512f_vinsertf32x4_mask:
36664 case CODE_FOR_avx512f_vinserti32x4_mask:
36665 case CODE_FOR_avx512f_vextractf32x4_mask:
36666 case CODE_FOR_avx512f_vextracti32x4_mask:
36667 case CODE_FOR_sse2_shufpd:
36668 case CODE_FOR_sse2_shufpd_mask:
36669 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36670 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36671 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36672 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36673 error ("the last argument must be a 2-bit immediate");
36674 return const0_rtx;
36676 case CODE_FOR_avx_vextractf128v4df:
36677 case CODE_FOR_avx_vextractf128v8sf:
36678 case CODE_FOR_avx_vextractf128v8si:
36679 case CODE_FOR_avx_vinsertf128v4df:
36680 case CODE_FOR_avx_vinsertf128v8sf:
36681 case CODE_FOR_avx_vinsertf128v8si:
36682 case CODE_FOR_avx512f_vinsertf64x4_mask:
36683 case CODE_FOR_avx512f_vinserti64x4_mask:
36684 case CODE_FOR_avx512f_vextractf64x4_mask:
36685 case CODE_FOR_avx512f_vextracti64x4_mask:
36686 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36687 case CODE_FOR_avx512dq_vinserti32x8_mask:
36688 case CODE_FOR_avx512vl_vinsertv4df:
36689 case CODE_FOR_avx512vl_vinsertv4di:
36690 case CODE_FOR_avx512vl_vinsertv8sf:
36691 case CODE_FOR_avx512vl_vinsertv8si:
36692 error ("the last argument must be a 1-bit immediate");
36693 return const0_rtx;
36695 case CODE_FOR_avx_vmcmpv2df3:
36696 case CODE_FOR_avx_vmcmpv4sf3:
36697 case CODE_FOR_avx_cmpv2df3:
36698 case CODE_FOR_avx_cmpv4sf3:
36699 case CODE_FOR_avx_cmpv4df3:
36700 case CODE_FOR_avx_cmpv8sf3:
36701 case CODE_FOR_avx512f_cmpv8df3_mask:
36702 case CODE_FOR_avx512f_cmpv16sf3_mask:
36703 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36704 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36705 error ("the last argument must be a 5-bit immediate");
36706 return const0_rtx;
36708 default:
36709 switch (nargs_constant)
36711 case 2:
36712 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36713 || (!mask_pos && (nargs - i) == nargs_constant))
36715 error ("the next to last argument must be an 8-bit immediate");
36716 break;
36718 /* FALLTHRU */
36719 case 1:
36720 error ("the last argument must be an 8-bit immediate");
36721 break;
36722 default:
36723 gcc_unreachable ();
36725 return const0_rtx;
36728 else
36730 if (VECTOR_MODE_P (mode))
36731 op = safe_vector_operand (op, mode);
36733 /* If we aren't optimizing, only allow one memory operand to
36734 be generated. */
36735 if (memory_operand (op, mode))
36736 num_memory++;
36738 op = fixup_modeless_constant (op, mode);
36740 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36742 if (optimize || !match || num_memory > 1)
36743 op = copy_to_mode_reg (mode, op);
36745 else
36747 op = copy_to_reg (op);
36748 op = lowpart_subreg (mode, op, GET_MODE (op));
36752 args[i].op = op;
36753 args[i].mode = mode;
36756 switch (nargs)
36758 case 1:
36759 pat = GEN_FCN (icode) (real_target, args[0].op);
36760 break;
36761 case 2:
36762 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36763 break;
36764 case 3:
36765 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36766 args[2].op);
36767 break;
36768 case 4:
36769 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36770 args[2].op, args[3].op);
36771 break;
36772 case 5:
36773 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36774 args[2].op, args[3].op, args[4].op);
36775 break;
36776 case 6:
36777 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36778 args[2].op, args[3].op, args[4].op,
36779 args[5].op);
36780 break;
36781 default:
36782 gcc_unreachable ();
36785 if (! pat)
36786 return 0;
36788 emit_insn (pat);
36789 return target;
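/* Editorial sketch (illustrative only): a typical two-operand builtin
   with a constant operand, __builtin_ia32_pshufd behind
   _mm_shuffle_epi32 in <emmintrin.h>, takes the V4SI_FTYPE_V4SI_INT
   path above; the selector must be a compile-time 8-bit immediate or
   the default diagnostic in the switch is emitted.

     #include <emmintrin.h>

     __m128i
     reverse_dwords (__m128i x)
     {
       return _mm_shuffle_epi32 (x, 0x1b);
     }
*/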
36792 /* Transform a pattern of the following layout:
36793 (set A
36794 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
36796 into:
36797 (set A B) */
36799 static rtx
36800 ix86_erase_embedded_rounding (rtx pat)
36802 if (GET_CODE (pat) == INSN)
36803 pat = PATTERN (pat);
36805 gcc_assert (GET_CODE (pat) == SET);
36806 rtx src = SET_SRC (pat);
36807 gcc_assert (XVECLEN (src, 0) == 2);
36808 rtx p0 = XVECEXP (src, 0, 0);
36809 gcc_assert (GET_CODE (src) == UNSPEC
36810 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
36811 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
36812 return res;
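/* Editorial illustration (schematic RTL, not taken from a dump):

     (set (reg:V2DF 100)
          (unspec:V2DF [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                        (const_int NO_ROUND)] UNSPEC_EMBEDDED_ROUNDING))

   becomes

     (set (reg:V2DF 100)
          (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))

   i.e. only the first element of the unspec vector is kept.  */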
36815 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36816 with rounding. */
36817 static rtx
36818 ix86_expand_sse_comi_round (const struct builtin_description *d,
36819 tree exp, rtx target)
36821 rtx pat, set_dst;
36822 tree arg0 = CALL_EXPR_ARG (exp, 0);
36823 tree arg1 = CALL_EXPR_ARG (exp, 1);
36824 tree arg2 = CALL_EXPR_ARG (exp, 2);
36825 tree arg3 = CALL_EXPR_ARG (exp, 3);
36826 rtx op0 = expand_normal (arg0);
36827 rtx op1 = expand_normal (arg1);
36828 rtx op2 = expand_normal (arg2);
36829 rtx op3 = expand_normal (arg3);
36830 enum insn_code icode = d->icode;
36831 const struct insn_data_d *insn_p = &insn_data[icode];
36832 machine_mode mode0 = insn_p->operand[0].mode;
36833 machine_mode mode1 = insn_p->operand[1].mode;
36834 enum rtx_code comparison = UNEQ;
36835 bool need_ucomi = false;
36837 /* See avxintrin.h for values. */
36838 enum rtx_code comi_comparisons[32] =
36840 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36841 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36842 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36844 bool need_ucomi_values[32] =
36846 true, false, false, true, true, false, false, true,
36847 true, false, false, true, true, false, false, true,
36848 false, true, true, false, false, true, true, false,
36849 false, true, true, false, false, true, true, false
36852 if (!CONST_INT_P (op2))
36854 error ("the third argument must be a comparison constant");
36855 return const0_rtx;
36857 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36859 error ("incorrect comparison mode");
36860 return const0_rtx;
36863 if (!insn_p->operand[2].predicate (op3, SImode))
36865 error ("incorrect rounding operand");
36866 return const0_rtx;
36869 comparison = comi_comparisons[INTVAL (op2)];
36870 need_ucomi = need_ucomi_values[INTVAL (op2)];
36872 if (VECTOR_MODE_P (mode0))
36873 op0 = safe_vector_operand (op0, mode0);
36874 if (VECTOR_MODE_P (mode1))
36875 op1 = safe_vector_operand (op1, mode1);
36877 target = gen_reg_rtx (SImode);
36878 emit_move_insn (target, const0_rtx);
36879 target = gen_rtx_SUBREG (QImode, target, 0);
36881 if ((optimize && !register_operand (op0, mode0))
36882 || !insn_p->operand[0].predicate (op0, mode0))
36883 op0 = copy_to_mode_reg (mode0, op0);
36884 if ((optimize && !register_operand (op1, mode1))
36885 || !insn_p->operand[1].predicate (op1, mode1))
36886 op1 = copy_to_mode_reg (mode1, op1);
36888 if (need_ucomi)
36889 icode = icode == CODE_FOR_sse_comi_round
36890 ? CODE_FOR_sse_ucomi_round
36891 : CODE_FOR_sse2_ucomi_round;
36893 pat = GEN_FCN (icode) (op0, op1, op3);
36894 if (! pat)
36895 return 0;
36897 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
36898 if (INTVAL (op3) == NO_ROUND)
36900 pat = ix86_erase_embedded_rounding (pat);
36901 if (! pat)
36902 return 0;
36904 set_dst = SET_DEST (pat);
36906 else
36908 gcc_assert (GET_CODE (pat) == SET);
36909 set_dst = SET_DEST (pat);
36912 emit_insn (pat);
36913 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
36914 gen_rtx_fmt_ee (comparison, QImode,
36915 set_dst,
36916 const0_rtx)));
36918 return SUBREG_REG (target);
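/* Editorial sketch (illustrative assumption): this expander serves the
   _mm_comi_round_ss/_mm_comi_round_sd intrinsics from <immintrin.h>;
   the third argument picks one of the 32 predicates tabulated above
   and the fourth selects the SAE behaviour.

     #include <immintrin.h>

     int
     ge_noexc (__m128d a, __m128d b)
     {
       return _mm_comi_round_sd (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);
     }
*/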
36921 static rtx
36922 ix86_expand_round_builtin (const struct builtin_description *d,
36923 tree exp, rtx target)
36925 rtx pat;
36926 unsigned int i, nargs;
36927 struct
36929 rtx op;
36930 machine_mode mode;
36931 } args[6];
36932 enum insn_code icode = d->icode;
36933 const struct insn_data_d *insn_p = &insn_data[icode];
36934 machine_mode tmode = insn_p->operand[0].mode;
36935 unsigned int nargs_constant = 0;
36936 unsigned int redundant_embed_rnd = 0;
36938 switch ((enum ix86_builtin_func_type) d->flag)
36940 case UINT64_FTYPE_V2DF_INT:
36941 case UINT64_FTYPE_V4SF_INT:
36942 case UINT_FTYPE_V2DF_INT:
36943 case UINT_FTYPE_V4SF_INT:
36944 case INT64_FTYPE_V2DF_INT:
36945 case INT64_FTYPE_V4SF_INT:
36946 case INT_FTYPE_V2DF_INT:
36947 case INT_FTYPE_V4SF_INT:
36948 nargs = 2;
36949 break;
36950 case V4SF_FTYPE_V4SF_UINT_INT:
36951 case V4SF_FTYPE_V4SF_UINT64_INT:
36952 case V2DF_FTYPE_V2DF_UINT64_INT:
36953 case V4SF_FTYPE_V4SF_INT_INT:
36954 case V4SF_FTYPE_V4SF_INT64_INT:
36955 case V2DF_FTYPE_V2DF_INT64_INT:
36956 case V4SF_FTYPE_V4SF_V4SF_INT:
36957 case V2DF_FTYPE_V2DF_V2DF_INT:
36958 case V4SF_FTYPE_V4SF_V2DF_INT:
36959 case V2DF_FTYPE_V2DF_V4SF_INT:
36960 nargs = 3;
36961 break;
36962 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36963 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36964 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36965 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36966 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36967 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36968 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36969 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36970 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36971 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36972 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36973 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36974 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36975 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36976 nargs = 4;
36977 break;
36978 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36979 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36980 nargs_constant = 2;
36981 nargs = 4;
36982 break;
36983 case INT_FTYPE_V4SF_V4SF_INT_INT:
36984 case INT_FTYPE_V2DF_V2DF_INT_INT:
36985 return ix86_expand_sse_comi_round (d, exp, target);
36986 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36987 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36988 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36989 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36990 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36991 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36992 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36993 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36994 nargs = 5;
36995 break;
36996 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36997 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36998 nargs_constant = 4;
36999 nargs = 5;
37000 break;
37001 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
37002 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
37003 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
37004 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
37005 nargs_constant = 3;
37006 nargs = 5;
37007 break;
37008 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
37009 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
37010 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
37011 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
37012 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
37013 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
37014 nargs = 6;
37015 nargs_constant = 4;
37016 break;
37017 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
37018 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
37019 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
37020 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
37021 nargs = 6;
37022 nargs_constant = 3;
37023 break;
37024 default:
37025 gcc_unreachable ();
37027 gcc_assert (nargs <= ARRAY_SIZE (args));
37029 if (optimize
37030 || target == 0
37031 || GET_MODE (target) != tmode
37032 || !insn_p->operand[0].predicate (target, tmode))
37033 target = gen_reg_rtx (tmode);
37035 for (i = 0; i < nargs; i++)
37037 tree arg = CALL_EXPR_ARG (exp, i);
37038 rtx op = expand_normal (arg);
37039 machine_mode mode = insn_p->operand[i + 1].mode;
37040 bool match = insn_p->operand[i + 1].predicate (op, mode);
37042 if (i == nargs - nargs_constant)
37044 if (!match)
37046 switch (icode)
37048 case CODE_FOR_avx512f_getmantv8df_mask_round:
37049 case CODE_FOR_avx512f_getmantv16sf_mask_round:
37050 case CODE_FOR_avx512f_vgetmantv2df_round:
37051 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
37052 case CODE_FOR_avx512f_vgetmantv4sf_round:
37053 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
37054 error ("the immediate argument must be a 4-bit immediate");
37055 return const0_rtx;
37056 case CODE_FOR_avx512f_cmpv8df3_mask_round:
37057 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
37058 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
37059 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
37060 error ("the immediate argument must be a 5-bit immediate");
37061 return const0_rtx;
37062 default:
37063 error ("the immediate argument must be an 8-bit immediate");
37064 return const0_rtx;
37068 else if (i == nargs - 1)
37070 if (!insn_p->operand[nargs].predicate (op, SImode))
37072 error ("incorrect rounding operand");
37073 return const0_rtx;
37076 /* If there is no rounding, use the normal version of the pattern. */
37077 if (INTVAL (op) == NO_ROUND)
37078 redundant_embed_rnd = 1;
37080 else
37082 if (VECTOR_MODE_P (mode))
37083 op = safe_vector_operand (op, mode);
37085 op = fixup_modeless_constant (op, mode);
37087 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37089 if (optimize || !match)
37090 op = copy_to_mode_reg (mode, op);
37092 else
37094 op = copy_to_reg (op);
37095 op = lowpart_subreg (mode, op, GET_MODE (op));
37099 args[i].op = op;
37100 args[i].mode = mode;
37103 switch (nargs)
37105 case 1:
37106 pat = GEN_FCN (icode) (target, args[0].op);
37107 break;
37108 case 2:
37109 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37110 break;
37111 case 3:
37112 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37113 args[2].op);
37114 break;
37115 case 4:
37116 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37117 args[2].op, args[3].op);
37118 break;
37119 case 5:
37120 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37121 args[2].op, args[3].op, args[4].op);
37122 break;
37123 case 6:
37124 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37125 args[2].op, args[3].op, args[4].op,
37126 args[5].op);
37127 break;
37128 default:
37129 gcc_unreachable ();
37132 if (!pat)
37133 return 0;
37135 if (redundant_embed_rnd)
37136 pat = ix86_erase_embedded_rounding (pat);
37138 emit_insn (pat);
37139 return target;
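/* Editorial sketch (illustrative only): the AVX-512 *_round_*
   intrinsics funnel through this expander, e.g. _mm512_add_round_ps
   from <immintrin.h>; the trailing argument is the rounding-control
   immediate validated above, and when it requests no explicit rounding
   the embedded-rounding wrapper is erased again before emission.

     #include <immintrin.h>

     __m512
     add_rz (__m512 a, __m512 b)
     {
       return _mm512_add_round_ps (a, b,
                                   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
     }
*/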
37142 /* Subroutine of ix86_expand_builtin to take care of special insns
37143 with a variable number of operands. */
37145 static rtx
37146 ix86_expand_special_args_builtin (const struct builtin_description *d,
37147 tree exp, rtx target)
37149 tree arg;
37150 rtx pat, op;
37151 unsigned int i, nargs, arg_adjust, memory;
37152 bool aligned_mem = false;
37153 struct
37155 rtx op;
37156 machine_mode mode;
37157 } args[3];
37158 enum insn_code icode = d->icode;
37159 bool last_arg_constant = false;
37160 const struct insn_data_d *insn_p = &insn_data[icode];
37161 machine_mode tmode = insn_p->operand[0].mode;
37162 enum { load, store } klass;
37164 switch ((enum ix86_builtin_func_type) d->flag)
37166 case VOID_FTYPE_VOID:
37167 emit_insn (GEN_FCN (icode) (target));
37168 return 0;
37169 case VOID_FTYPE_UINT64:
37170 case VOID_FTYPE_UNSIGNED:
37171 nargs = 0;
37172 klass = store;
37173 memory = 0;
37174 break;
37176 case INT_FTYPE_VOID:
37177 case USHORT_FTYPE_VOID:
37178 case UINT64_FTYPE_VOID:
37179 case UNSIGNED_FTYPE_VOID:
37180 nargs = 0;
37181 klass = load;
37182 memory = 0;
37183 break;
37184 case UINT64_FTYPE_PUNSIGNED:
37185 case V2DI_FTYPE_PV2DI:
37186 case V4DI_FTYPE_PV4DI:
37187 case V32QI_FTYPE_PCCHAR:
37188 case V16QI_FTYPE_PCCHAR:
37189 case V8SF_FTYPE_PCV4SF:
37190 case V8SF_FTYPE_PCFLOAT:
37191 case V4SF_FTYPE_PCFLOAT:
37192 case V4DF_FTYPE_PCV2DF:
37193 case V4DF_FTYPE_PCDOUBLE:
37194 case V2DF_FTYPE_PCDOUBLE:
37195 case VOID_FTYPE_PVOID:
37196 case V8DI_FTYPE_PV8DI:
37197 nargs = 1;
37198 klass = load;
37199 memory = 0;
37200 switch (icode)
37202 case CODE_FOR_sse4_1_movntdqa:
37203 case CODE_FOR_avx2_movntdqa:
37204 case CODE_FOR_avx512f_movntdqa:
37205 aligned_mem = true;
37206 break;
37207 default:
37208 break;
37210 break;
37211 case VOID_FTYPE_PV2SF_V4SF:
37212 case VOID_FTYPE_PV8DI_V8DI:
37213 case VOID_FTYPE_PV4DI_V4DI:
37214 case VOID_FTYPE_PV2DI_V2DI:
37215 case VOID_FTYPE_PCHAR_V32QI:
37216 case VOID_FTYPE_PCHAR_V16QI:
37217 case VOID_FTYPE_PFLOAT_V16SF:
37218 case VOID_FTYPE_PFLOAT_V8SF:
37219 case VOID_FTYPE_PFLOAT_V4SF:
37220 case VOID_FTYPE_PDOUBLE_V8DF:
37221 case VOID_FTYPE_PDOUBLE_V4DF:
37222 case VOID_FTYPE_PDOUBLE_V2DF:
37223 case VOID_FTYPE_PLONGLONG_LONGLONG:
37224 case VOID_FTYPE_PULONGLONG_ULONGLONG:
37225 case VOID_FTYPE_PINT_INT:
37226 nargs = 1;
37227 klass = store;
37228 /* Reserve memory operand for target. */
37229 memory = ARRAY_SIZE (args);
37230 switch (icode)
37232 /* These builtins and instructions require the memory
37233 to be properly aligned. */
37234 case CODE_FOR_avx_movntv4di:
37235 case CODE_FOR_sse2_movntv2di:
37236 case CODE_FOR_avx_movntv8sf:
37237 case CODE_FOR_sse_movntv4sf:
37238 case CODE_FOR_sse4a_vmmovntv4sf:
37239 case CODE_FOR_avx_movntv4df:
37240 case CODE_FOR_sse2_movntv2df:
37241 case CODE_FOR_sse4a_vmmovntv2df:
37242 case CODE_FOR_sse2_movntidi:
37243 case CODE_FOR_sse_movntq:
37244 case CODE_FOR_sse2_movntisi:
37245 case CODE_FOR_avx512f_movntv16sf:
37246 case CODE_FOR_avx512f_movntv8df:
37247 case CODE_FOR_avx512f_movntv8di:
37248 aligned_mem = true;
37249 break;
37250 default:
37251 break;
37253 break;
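/* Editorial note (illustrative only): _mm_stream_ps from <xmmintrin.h>
   maps to CODE_FOR_sse_movntv4sf above, so the pointer passed to

     #include <xmmintrin.h>

     void
     spill (float *p, __m128 v)
     {
       _mm_stream_ps (p, v);
     }

   must be 16-byte aligned; marking aligned_mem lets the expander give
   the generated MEM its full GET_MODE_ALIGNMENT later on.  */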
37254 case V4SF_FTYPE_V4SF_PCV2SF:
37255 case V2DF_FTYPE_V2DF_PCDOUBLE:
37256 nargs = 2;
37257 klass = load;
37258 memory = 1;
37259 break;
37260 case V8SF_FTYPE_PCV8SF_V8SI:
37261 case V4DF_FTYPE_PCV4DF_V4DI:
37262 case V4SF_FTYPE_PCV4SF_V4SI:
37263 case V2DF_FTYPE_PCV2DF_V2DI:
37264 case V8SI_FTYPE_PCV8SI_V8SI:
37265 case V4DI_FTYPE_PCV4DI_V4DI:
37266 case V4SI_FTYPE_PCV4SI_V4SI:
37267 case V2DI_FTYPE_PCV2DI_V2DI:
37268 case VOID_FTYPE_INT_INT64:
37269 nargs = 2;
37270 klass = load;
37271 memory = 0;
37272 break;
37273 case VOID_FTYPE_PV8DF_V8DF_UQI:
37274 case VOID_FTYPE_PV4DF_V4DF_UQI:
37275 case VOID_FTYPE_PV2DF_V2DF_UQI:
37276 case VOID_FTYPE_PV16SF_V16SF_UHI:
37277 case VOID_FTYPE_PV8SF_V8SF_UQI:
37278 case VOID_FTYPE_PV4SF_V4SF_UQI:
37279 case VOID_FTYPE_PV8DI_V8DI_UQI:
37280 case VOID_FTYPE_PV4DI_V4DI_UQI:
37281 case VOID_FTYPE_PV2DI_V2DI_UQI:
37282 case VOID_FTYPE_PV16SI_V16SI_UHI:
37283 case VOID_FTYPE_PV8SI_V8SI_UQI:
37284 case VOID_FTYPE_PV4SI_V4SI_UQI:
37285 switch (icode)
37287 /* These builtins and instructions require the memory
37288 to be properly aligned. */
37289 case CODE_FOR_avx512f_storev16sf_mask:
37290 case CODE_FOR_avx512f_storev16si_mask:
37291 case CODE_FOR_avx512f_storev8df_mask:
37292 case CODE_FOR_avx512f_storev8di_mask:
37293 case CODE_FOR_avx512vl_storev8sf_mask:
37294 case CODE_FOR_avx512vl_storev8si_mask:
37295 case CODE_FOR_avx512vl_storev4df_mask:
37296 case CODE_FOR_avx512vl_storev4di_mask:
37297 case CODE_FOR_avx512vl_storev4sf_mask:
37298 case CODE_FOR_avx512vl_storev4si_mask:
37299 case CODE_FOR_avx512vl_storev2df_mask:
37300 case CODE_FOR_avx512vl_storev2di_mask:
37301 aligned_mem = true;
37302 break;
37303 default:
37304 break;
37306 /* FALLTHRU */
37307 case VOID_FTYPE_PV8SF_V8SI_V8SF:
37308 case VOID_FTYPE_PV4DF_V4DI_V4DF:
37309 case VOID_FTYPE_PV4SF_V4SI_V4SF:
37310 case VOID_FTYPE_PV2DF_V2DI_V2DF:
37311 case VOID_FTYPE_PV8SI_V8SI_V8SI:
37312 case VOID_FTYPE_PV4DI_V4DI_V4DI:
37313 case VOID_FTYPE_PV4SI_V4SI_V4SI:
37314 case VOID_FTYPE_PV2DI_V2DI_V2DI:
37315 case VOID_FTYPE_PV8SI_V8DI_UQI:
37316 case VOID_FTYPE_PV8HI_V8DI_UQI:
37317 case VOID_FTYPE_PV16HI_V16SI_UHI:
37318 case VOID_FTYPE_PV16QI_V8DI_UQI:
37319 case VOID_FTYPE_PV16QI_V16SI_UHI:
37320 case VOID_FTYPE_PV4SI_V4DI_UQI:
37321 case VOID_FTYPE_PV4SI_V2DI_UQI:
37322 case VOID_FTYPE_PV8HI_V4DI_UQI:
37323 case VOID_FTYPE_PV8HI_V2DI_UQI:
37324 case VOID_FTYPE_PV8HI_V8SI_UQI:
37325 case VOID_FTYPE_PV8HI_V4SI_UQI:
37326 case VOID_FTYPE_PV16QI_V4DI_UQI:
37327 case VOID_FTYPE_PV16QI_V2DI_UQI:
37328 case VOID_FTYPE_PV16QI_V8SI_UQI:
37329 case VOID_FTYPE_PV16QI_V4SI_UQI:
37330 case VOID_FTYPE_PCHAR_V64QI_UDI:
37331 case VOID_FTYPE_PCHAR_V32QI_USI:
37332 case VOID_FTYPE_PCHAR_V16QI_UHI:
37333 case VOID_FTYPE_PSHORT_V32HI_USI:
37334 case VOID_FTYPE_PSHORT_V16HI_UHI:
37335 case VOID_FTYPE_PSHORT_V8HI_UQI:
37336 case VOID_FTYPE_PINT_V16SI_UHI:
37337 case VOID_FTYPE_PINT_V8SI_UQI:
37338 case VOID_FTYPE_PINT_V4SI_UQI:
37339 case VOID_FTYPE_PINT64_V8DI_UQI:
37340 case VOID_FTYPE_PINT64_V4DI_UQI:
37341 case VOID_FTYPE_PINT64_V2DI_UQI:
37342 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37343 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37344 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37345 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37346 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37347 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37348 case VOID_FTYPE_PV32QI_V32HI_USI:
37349 case VOID_FTYPE_PV16QI_V16HI_UHI:
37350 case VOID_FTYPE_PV8QI_V8HI_UQI:
37351 nargs = 2;
37352 klass = store;
37353 /* Reserve memory operand for target. */
37354 memory = ARRAY_SIZE (args);
37355 break;
37356 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37357 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37358 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37359 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37360 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37361 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37362 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37363 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37364 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37365 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37366 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37367 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37368 switch (icode)
37370 /* These builtins and instructions require the memory
37371 to be properly aligned. */
37372 case CODE_FOR_avx512f_loadv16sf_mask:
37373 case CODE_FOR_avx512f_loadv16si_mask:
37374 case CODE_FOR_avx512f_loadv8df_mask:
37375 case CODE_FOR_avx512f_loadv8di_mask:
37376 case CODE_FOR_avx512vl_loadv8sf_mask:
37377 case CODE_FOR_avx512vl_loadv8si_mask:
37378 case CODE_FOR_avx512vl_loadv4df_mask:
37379 case CODE_FOR_avx512vl_loadv4di_mask:
37380 case CODE_FOR_avx512vl_loadv4sf_mask:
37381 case CODE_FOR_avx512vl_loadv4si_mask:
37382 case CODE_FOR_avx512vl_loadv2df_mask:
37383 case CODE_FOR_avx512vl_loadv2di_mask:
37384 case CODE_FOR_avx512bw_loadv64qi_mask:
37385 case CODE_FOR_avx512vl_loadv32qi_mask:
37386 case CODE_FOR_avx512vl_loadv16qi_mask:
37387 case CODE_FOR_avx512bw_loadv32hi_mask:
37388 case CODE_FOR_avx512vl_loadv16hi_mask:
37389 case CODE_FOR_avx512vl_loadv8hi_mask:
37390 aligned_mem = true;
37391 break;
37392 default:
37393 break;
37395 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37396 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37397 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37398 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37399 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37400 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37401 case V16SI_FTYPE_PCINT_V16SI_UHI:
37402 case V8SI_FTYPE_PCINT_V8SI_UQI:
37403 case V4SI_FTYPE_PCINT_V4SI_UQI:
37404 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37405 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37406 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37407 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37408 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37409 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37410 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37411 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37412 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37413 nargs = 3;
37414 klass = load;
37415 memory = 0;
37416 break;
37417 case VOID_FTYPE_UINT_UINT_UINT:
37418 case VOID_FTYPE_UINT64_UINT_UINT:
37419 case UCHAR_FTYPE_UINT_UINT_UINT:
37420 case UCHAR_FTYPE_UINT64_UINT_UINT:
37421 nargs = 3;
37422 klass = load;
37423 memory = ARRAY_SIZE (args);
37424 last_arg_constant = true;
37425 break;
37426 default:
37427 gcc_unreachable ();
37430 gcc_assert (nargs <= ARRAY_SIZE (args));
37432 if (klass == store)
37434 arg = CALL_EXPR_ARG (exp, 0);
37435 op = expand_normal (arg);
37436 gcc_assert (target == 0);
37437 if (memory)
37439 op = ix86_zero_extend_to_Pmode (op);
37440 target = gen_rtx_MEM (tmode, op);
37441 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37442 on it. Try to improve it using get_pointer_alignment,
37443 and if the special builtin is one that requires strict
 37444 			 mode alignment, also from its GET_MODE_ALIGNMENT.
37445 Failure to do so could lead to ix86_legitimate_combined_insn
37446 rejecting all changes to such insns. */
37447 unsigned int align = get_pointer_alignment (arg);
37448 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37449 align = GET_MODE_ALIGNMENT (tmode);
37450 if (MEM_ALIGN (target) < align)
37451 set_mem_align (target, align);
37453 else
37454 target = force_reg (tmode, op);
37455 arg_adjust = 1;
37457 else
37459 arg_adjust = 0;
37460 if (optimize
37461 || target == 0
37462 || !register_operand (target, tmode)
37463 || GET_MODE (target) != tmode)
37464 target = gen_reg_rtx (tmode);
37467 for (i = 0; i < nargs; i++)
37469 machine_mode mode = insn_p->operand[i + 1].mode;
37470 bool match;
37472 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37473 op = expand_normal (arg);
37474 match = insn_p->operand[i + 1].predicate (op, mode);
37476 if (last_arg_constant && (i + 1) == nargs)
37478 if (!match)
37480 if (icode == CODE_FOR_lwp_lwpvalsi3
37481 || icode == CODE_FOR_lwp_lwpinssi3
37482 || icode == CODE_FOR_lwp_lwpvaldi3
37483 || icode == CODE_FOR_lwp_lwpinsdi3)
37484 error ("the last argument must be a 32-bit immediate");
37485 else
37486 error ("the last argument must be an 8-bit immediate");
37487 return const0_rtx;
37490 else
37492 if (i == memory)
37494 /* This must be the memory operand. */
37495 op = ix86_zero_extend_to_Pmode (op);
37496 op = gen_rtx_MEM (mode, op);
37497 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37498 on it. Try to improve it using get_pointer_alignment,
37499 and if the special builtin is one that requires strict
 37500 		     mode alignment, also from its GET_MODE_ALIGNMENT.
37501 Failure to do so could lead to ix86_legitimate_combined_insn
37502 rejecting all changes to such insns. */
37503 unsigned int align = get_pointer_alignment (arg);
37504 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37505 align = GET_MODE_ALIGNMENT (mode);
37506 if (MEM_ALIGN (op) < align)
37507 set_mem_align (op, align);
37509 else
 37511 		  /* This must be a register.  */
37512 if (VECTOR_MODE_P (mode))
37513 op = safe_vector_operand (op, mode);
37515 op = fixup_modeless_constant (op, mode);
37517 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37518 op = copy_to_mode_reg (mode, op);
37519 else
37521 op = copy_to_reg (op);
37522 op = lowpart_subreg (mode, op, GET_MODE (op));
37527 args[i].op = op;
37528 args[i].mode = mode;
37531 switch (nargs)
37533 case 0:
37534 pat = GEN_FCN (icode) (target);
37535 break;
37536 case 1:
37537 pat = GEN_FCN (icode) (target, args[0].op);
37538 break;
37539 case 2:
37540 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37541 break;
37542 case 3:
37543 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37544 break;
37545 default:
37546 gcc_unreachable ();
37549 if (! pat)
37550 return 0;
37551 emit_insn (pat);
37552 return klass == store ? 0 : target;
37555 /* Return the integer constant in ARG. Constrain it to be in the range
37556 of the subparts of VEC_TYPE; issue an error if not. */
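       /* For example, with a V4SF vector type the selector must lie in the
	  range 0..3; out-of-range values are diagnosed and 0 is returned
	  instead.  */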
37558 static int
37559 get_element_number (tree vec_type, tree arg)
37561 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37563 if (!tree_fits_uhwi_p (arg)
37564 || (elt = tree_to_uhwi (arg), elt > max))
37566 error ("selector must be an integer constant in the range 0..%wi", max);
37567 return 0;
37570 return elt;
37573 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37574 ix86_expand_vector_init. We DO have language-level syntax for this, in
37575 the form of (type){ init-list }. Except that since we can't place emms
37576 instructions from inside the compiler, we can't allow the use of MMX
37577 registers unless the user explicitly asks for it. So we do *not* define
37578 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
 37579    we have builtins invoked by mmintrin.h that give us license to emit
37580 these sorts of instructions. */
37582 static rtx
37583 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37585 machine_mode tmode = TYPE_MODE (type);
37586 machine_mode inner_mode = GET_MODE_INNER (tmode);
37587 int i, n_elt = GET_MODE_NUNITS (tmode);
37588 rtvec v = rtvec_alloc (n_elt);
37590 gcc_assert (VECTOR_MODE_P (tmode));
37591 gcc_assert (call_expr_nargs (exp) == n_elt);
37593 for (i = 0; i < n_elt; ++i)
37595 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37596 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37599 if (!target || !register_operand (target, tmode))
37600 target = gen_reg_rtx (tmode);
37602 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37603 return target;
37606 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37607 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37608 had a language-level syntax for referencing vector elements. */
37610 static rtx
37611 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37613 machine_mode tmode, mode0;
37614 tree arg0, arg1;
37615 int elt;
37616 rtx op0;
37618 arg0 = CALL_EXPR_ARG (exp, 0);
37619 arg1 = CALL_EXPR_ARG (exp, 1);
37621 op0 = expand_normal (arg0);
37622 elt = get_element_number (TREE_TYPE (arg0), arg1);
37624 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37625 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37626 gcc_assert (VECTOR_MODE_P (mode0));
37628 op0 = force_reg (mode0, op0);
37630 if (optimize || !target || !register_operand (target, tmode))
37631 target = gen_reg_rtx (tmode);
37633 ix86_expand_vector_extract (true, target, op0, elt);
37635 return target;
37638 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37639 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37640 a language-level syntax for referencing vector elements. */
37642 static rtx
37643 ix86_expand_vec_set_builtin (tree exp)
37645 machine_mode tmode, mode1;
37646 tree arg0, arg1, arg2;
37647 int elt;
37648 rtx op0, op1, target;
37650 arg0 = CALL_EXPR_ARG (exp, 0);
37651 arg1 = CALL_EXPR_ARG (exp, 1);
37652 arg2 = CALL_EXPR_ARG (exp, 2);
37654 tmode = TYPE_MODE (TREE_TYPE (arg0));
37655 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37656 gcc_assert (VECTOR_MODE_P (tmode));
37658 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37659 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37660 elt = get_element_number (TREE_TYPE (arg0), arg2);
37662 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37663 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37665 op0 = force_reg (tmode, op0);
37666 op1 = force_reg (mode1, op1);
37668 /* OP0 is the source of these builtin functions and shouldn't be
37669 modified. Create a copy, use it and return it as target. */
37670 target = gen_reg_rtx (tmode);
37671 emit_move_insn (target, op0);
37672 ix86_expand_vector_set (true, target, op1, elt);
37674 return target;
37677 /* Emit conditional move of SRC to DST with condition
37678 OP1 CODE OP2. */
37679 static void
37680 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37682 rtx t;
37684 if (TARGET_CMOVE)
37686 t = ix86_expand_compare (code, op1, op2);
37687 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37688 src, dst)));
37690 else
37692 rtx_code_label *nomove = gen_label_rtx ();
37693 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37694 const0_rtx, GET_MODE (op1), 1, nomove);
37695 emit_move_insn (dst, src);
37696 emit_label (nomove);
 37700 /* Choose the max of DST and SRC and put it in DST.  */
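/* This is an unsigned max: the LTU conditional move replaces DST with SRC
   whenever DST compares below SRC as an unsigned value.  */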
37701 static void
37702 ix86_emit_move_max (rtx dst, rtx src)
37704 ix86_emit_cmove (dst, src, LTU, dst, src);
37707 /* Expand an expression EXP that calls a built-in function,
37708 with result going to TARGET if that's convenient
37709 (and in mode MODE if that's convenient).
37710 SUBTARGET may be used as the target for computing one of EXP's operands.
37711 IGNORE is nonzero if the value is to be ignored. */
37713 static rtx
37714 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37715 machine_mode mode, int ignore)
37717 size_t i;
37718 enum insn_code icode;
37719 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37720 tree arg0, arg1, arg2, arg3, arg4;
37721 rtx op0, op1, op2, op3, op4, pat, insn;
37722 machine_mode mode0, mode1, mode2, mode3, mode4;
37723 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37725 /* For CPU builtins that can be folded, fold first and expand the fold. */
37726 switch (fcode)
37728 case IX86_BUILTIN_CPU_INIT:
37730 /* Make it call __cpu_indicator_init in libgcc. */
37731 tree call_expr, fndecl, type;
37732 type = build_function_type_list (integer_type_node, NULL_TREE);
37733 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37734 call_expr = build_call_expr (fndecl, 0);
37735 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37737 case IX86_BUILTIN_CPU_IS:
37738 case IX86_BUILTIN_CPU_SUPPORTS:
37740 tree arg0 = CALL_EXPR_ARG (exp, 0);
37741 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37742 gcc_assert (fold_expr != NULL_TREE);
37743 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37747 /* Determine whether the builtin function is available under the current ISA.
37748 Originally the builtin was not created if it wasn't applicable to the
37749 current ISA based on the command line switches. With function specific
37750 options, we need to check in the context of the function making the call
37751 whether it is supported. Treat AVX512VL specially. For other flags,
 37752    if isa includes more than one ISA bit, treat those as requiring any
37753 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
 37754    ISAs.  Similarly for 64BIT, but we shouldn't be building such
 37755    builtins at all, as -m64 is a whole TU option.  */
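   /* As an illustrative example: a builtin whose isa mask combines
      AVX512VL with another AVX512 bit (say AVX512BW) is only expanded
      directly when both corresponding -m options are enabled, whereas a
      builtin carrying two non-AVX512VL ISA bits is usable when either one
      of them is enabled.  */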
37756 if (((ix86_builtins_isa[fcode].isa
37757 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37758 && !(ix86_builtins_isa[fcode].isa
37759 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37760 & ix86_isa_flags))
37761 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37762 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37763 || (ix86_builtins_isa[fcode].isa2
37764 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37766 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37767 ix86_builtins_isa[fcode].isa2, 0, 0,
37768 NULL, NULL, (enum fpmath_unit) 0,
37769 false);
37770 if (!opts)
37771 error ("%qE needs unknown isa option", fndecl);
37772 else
37774 gcc_assert (opts != NULL);
37775 error ("%qE needs isa option %s", fndecl, opts);
37776 free (opts);
37778 return expand_call (exp, target, ignore);
37781 switch (fcode)
37783 case IX86_BUILTIN_BNDMK:
37784 if (!target
37785 || GET_MODE (target) != BNDmode
37786 || !register_operand (target, BNDmode))
37787 target = gen_reg_rtx (BNDmode);
37789 arg0 = CALL_EXPR_ARG (exp, 0);
37790 arg1 = CALL_EXPR_ARG (exp, 1);
37792 op0 = expand_normal (arg0);
37793 op1 = expand_normal (arg1);
37795 if (!register_operand (op0, Pmode))
37796 op0 = ix86_zero_extend_to_Pmode (op0);
37797 if (!register_operand (op1, Pmode))
37798 op1 = ix86_zero_extend_to_Pmode (op1);
 37800       /* Builtin arg1 is the size of the block, but instruction op1
 37801 	 should be (size - 1).  */
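      /* E.g. a 16-byte block yields op1 == 15.  */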
37802 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37803 NULL_RTX, 1, OPTAB_DIRECT);
37805 emit_insn (BNDmode == BND64mode
37806 ? gen_bnd64_mk (target, op0, op1)
37807 : gen_bnd32_mk (target, op0, op1));
37808 return target;
37810 case IX86_BUILTIN_BNDSTX:
37811 arg0 = CALL_EXPR_ARG (exp, 0);
37812 arg1 = CALL_EXPR_ARG (exp, 1);
37813 arg2 = CALL_EXPR_ARG (exp, 2);
37815 op0 = expand_normal (arg0);
37816 op1 = expand_normal (arg1);
37817 op2 = expand_normal (arg2);
37819 if (!register_operand (op0, Pmode))
37820 op0 = ix86_zero_extend_to_Pmode (op0);
37821 if (!register_operand (op1, BNDmode))
37822 op1 = copy_to_mode_reg (BNDmode, op1);
37823 if (!register_operand (op2, Pmode))
37824 op2 = ix86_zero_extend_to_Pmode (op2);
37826 emit_insn (BNDmode == BND64mode
37827 ? gen_bnd64_stx (op2, op0, op1)
37828 : gen_bnd32_stx (op2, op0, op1));
37829 return 0;
37831 case IX86_BUILTIN_BNDLDX:
37832 if (!target
37833 || GET_MODE (target) != BNDmode
37834 || !register_operand (target, BNDmode))
37835 target = gen_reg_rtx (BNDmode);
37837 arg0 = CALL_EXPR_ARG (exp, 0);
37838 arg1 = CALL_EXPR_ARG (exp, 1);
37840 op0 = expand_normal (arg0);
37841 op1 = expand_normal (arg1);
37843 if (!register_operand (op0, Pmode))
37844 op0 = ix86_zero_extend_to_Pmode (op0);
37845 if (!register_operand (op1, Pmode))
37846 op1 = ix86_zero_extend_to_Pmode (op1);
37848 emit_insn (BNDmode == BND64mode
37849 ? gen_bnd64_ldx (target, op0, op1)
37850 : gen_bnd32_ldx (target, op0, op1));
37851 return target;
37853 case IX86_BUILTIN_BNDCL:
37854 arg0 = CALL_EXPR_ARG (exp, 0);
37855 arg1 = CALL_EXPR_ARG (exp, 1);
37857 op0 = expand_normal (arg0);
37858 op1 = expand_normal (arg1);
37860 if (!register_operand (op0, Pmode))
37861 op0 = ix86_zero_extend_to_Pmode (op0);
37862 if (!register_operand (op1, BNDmode))
37863 op1 = copy_to_mode_reg (BNDmode, op1);
37865 emit_insn (BNDmode == BND64mode
37866 ? gen_bnd64_cl (op1, op0)
37867 : gen_bnd32_cl (op1, op0));
37868 return 0;
37870 case IX86_BUILTIN_BNDCU:
37871 arg0 = CALL_EXPR_ARG (exp, 0);
37872 arg1 = CALL_EXPR_ARG (exp, 1);
37874 op0 = expand_normal (arg0);
37875 op1 = expand_normal (arg1);
37877 if (!register_operand (op0, Pmode))
37878 op0 = ix86_zero_extend_to_Pmode (op0);
37879 if (!register_operand (op1, BNDmode))
37880 op1 = copy_to_mode_reg (BNDmode, op1);
37882 emit_insn (BNDmode == BND64mode
37883 ? gen_bnd64_cu (op1, op0)
37884 : gen_bnd32_cu (op1, op0));
37885 return 0;
37887 case IX86_BUILTIN_BNDRET:
37888 arg0 = CALL_EXPR_ARG (exp, 0);
37889 target = chkp_get_rtl_bounds (arg0);
 37891 	 /* If no bounds were specified for the returned value,
 37892 	    then use INIT bounds.  This usually happens when
 37893 	    some built-in function is expanded.  */
37894 if (!target)
37896 rtx t1 = gen_reg_rtx (Pmode);
37897 rtx t2 = gen_reg_rtx (Pmode);
37898 target = gen_reg_rtx (BNDmode);
37899 emit_move_insn (t1, const0_rtx);
37900 emit_move_insn (t2, constm1_rtx);
37901 emit_insn (BNDmode == BND64mode
37902 ? gen_bnd64_mk (target, t1, t2)
37903 : gen_bnd32_mk (target, t1, t2));
37906 gcc_assert (target && REG_P (target));
37907 return target;
37909 case IX86_BUILTIN_BNDNARROW:
37911 rtx m1, m1h1, m1h2, lb, ub, t1;
37913 /* Return value and lb. */
37914 arg0 = CALL_EXPR_ARG (exp, 0);
37915 /* Bounds. */
37916 arg1 = CALL_EXPR_ARG (exp, 1);
37917 /* Size. */
37918 arg2 = CALL_EXPR_ARG (exp, 2);
37920 lb = expand_normal (arg0);
37921 op1 = expand_normal (arg1);
37922 op2 = expand_normal (arg2);
37924 /* Size was passed but we need to use (size - 1) as for bndmk. */
37925 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
37926 NULL_RTX, 1, OPTAB_DIRECT);
 37928       /* Add LB to the size and invert to get UB.  */
37929 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
37930 op2, 1, OPTAB_DIRECT);
37931 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
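      /* ub now holds ~(lb + size - 1), the 1's complement form in which
	 the upper bound is kept (see the UB comments below).  */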
37933 if (!register_operand (lb, Pmode))
37934 lb = ix86_zero_extend_to_Pmode (lb);
37935 if (!register_operand (ub, Pmode))
37936 ub = ix86_zero_extend_to_Pmode (ub);
37938 /* We need to move bounds to memory before any computations. */
37939 if (MEM_P (op1))
37940 m1 = op1;
37941 else
37943 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
37944 emit_move_insn (m1, op1);
37947 /* Generate mem expression to be used for access to LB and UB. */
37948 m1h1 = adjust_address (m1, Pmode, 0);
37949 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
37951 t1 = gen_reg_rtx (Pmode);
37953 /* Compute LB. */
37954 emit_move_insn (t1, m1h1);
37955 ix86_emit_move_max (t1, lb);
37956 emit_move_insn (m1h1, t1);
37958 /* Compute UB. UB is stored in 1's complement form. Therefore
37959 we also use max here. */
37960 emit_move_insn (t1, m1h2);
37961 ix86_emit_move_max (t1, ub);
37962 emit_move_insn (m1h2, t1);
37964 op2 = gen_reg_rtx (BNDmode);
37965 emit_move_insn (op2, m1);
37967 return chkp_join_splitted_slot (lb, op2);
37970 case IX86_BUILTIN_BNDINT:
37972 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
37974 if (!target
37975 || GET_MODE (target) != BNDmode
37976 || !register_operand (target, BNDmode))
37977 target = gen_reg_rtx (BNDmode);
37979 arg0 = CALL_EXPR_ARG (exp, 0);
37980 arg1 = CALL_EXPR_ARG (exp, 1);
37982 op0 = expand_normal (arg0);
37983 op1 = expand_normal (arg1);
37985 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37986 rh1 = adjust_address (res, Pmode, 0);
37987 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
37989 /* Put first bounds to temporaries. */
37990 lb1 = gen_reg_rtx (Pmode);
37991 ub1 = gen_reg_rtx (Pmode);
37992 if (MEM_P (op0))
37994 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37995 emit_move_insn (ub1, adjust_address (op0, Pmode,
37996 GET_MODE_SIZE (Pmode)));
37998 else
38000 emit_move_insn (res, op0);
38001 emit_move_insn (lb1, rh1);
38002 emit_move_insn (ub1, rh2);
38005 /* Put second bounds to temporaries. */
38006 lb2 = gen_reg_rtx (Pmode);
38007 ub2 = gen_reg_rtx (Pmode);
38008 if (MEM_P (op1))
38010 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
38011 emit_move_insn (ub2, adjust_address (op1, Pmode,
38012 GET_MODE_SIZE (Pmode)));
38014 else
38016 emit_move_insn (res, op1);
38017 emit_move_insn (lb2, rh1);
38018 emit_move_insn (ub2, rh2);
38021 /* Compute LB. */
38022 ix86_emit_move_max (lb1, lb2);
38023 emit_move_insn (rh1, lb1);
38025 /* Compute UB. UB is stored in 1's complement form. Therefore
38026 we also use max here. */
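	/* (The max of the complemented values corresponds to the minimum
	   of the real upper bounds, which is what the intersection
	   requires.)  */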
38027 ix86_emit_move_max (ub1, ub2);
38028 emit_move_insn (rh2, ub1);
38030 emit_move_insn (target, res);
38032 return target;
38035 case IX86_BUILTIN_SIZEOF:
38037 tree name;
38038 rtx symbol;
38040 if (!target
38041 || GET_MODE (target) != Pmode
38042 || !register_operand (target, Pmode))
38043 target = gen_reg_rtx (Pmode);
38045 arg0 = CALL_EXPR_ARG (exp, 0);
38046 gcc_assert (VAR_P (arg0));
38048 name = DECL_ASSEMBLER_NAME (arg0);
38049 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
38051 emit_insn (Pmode == SImode
38052 ? gen_move_size_reloc_si (target, symbol)
38053 : gen_move_size_reloc_di (target, symbol));
38055 return target;
38058 case IX86_BUILTIN_BNDLOWER:
38060 rtx mem, hmem;
38062 if (!target
38063 || GET_MODE (target) != Pmode
38064 || !register_operand (target, Pmode))
38065 target = gen_reg_rtx (Pmode);
38067 arg0 = CALL_EXPR_ARG (exp, 0);
38068 op0 = expand_normal (arg0);
38070 /* We need to move bounds to memory first. */
38071 if (MEM_P (op0))
38072 mem = op0;
38073 else
38075 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
38076 emit_move_insn (mem, op0);
38079 /* Generate mem expression to access LB and load it. */
38080 hmem = adjust_address (mem, Pmode, 0);
38081 emit_move_insn (target, hmem);
38083 return target;
38086 case IX86_BUILTIN_BNDUPPER:
38088 rtx mem, hmem, res;
38090 if (!target
38091 || GET_MODE (target) != Pmode
38092 || !register_operand (target, Pmode))
38093 target = gen_reg_rtx (Pmode);
38095 arg0 = CALL_EXPR_ARG (exp, 0);
38096 op0 = expand_normal (arg0);
38098 /* We need to move bounds to memory first. */
38099 if (MEM_P (op0))
38100 mem = op0;
38101 else
38103 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
38104 emit_move_insn (mem, op0);
38107 /* Generate mem expression to access UB. */
38108 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
 38110 	/* We need to invert all bits of UB.  */
38111 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
38113 if (res != target)
38114 emit_move_insn (target, res);
38116 return target;
38119 case IX86_BUILTIN_MASKMOVQ:
38120 case IX86_BUILTIN_MASKMOVDQU:
38121 icode = (fcode == IX86_BUILTIN_MASKMOVQ
38122 ? CODE_FOR_mmx_maskmovq
38123 : CODE_FOR_sse2_maskmovdqu);
38124 /* Note the arg order is different from the operand order. */
38125 arg1 = CALL_EXPR_ARG (exp, 0);
38126 arg2 = CALL_EXPR_ARG (exp, 1);
38127 arg0 = CALL_EXPR_ARG (exp, 2);
38128 op0 = expand_normal (arg0);
38129 op1 = expand_normal (arg1);
38130 op2 = expand_normal (arg2);
38131 mode0 = insn_data[icode].operand[0].mode;
38132 mode1 = insn_data[icode].operand[1].mode;
38133 mode2 = insn_data[icode].operand[2].mode;
38135 op0 = ix86_zero_extend_to_Pmode (op0);
38136 op0 = gen_rtx_MEM (mode1, op0);
38138 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38139 op0 = copy_to_mode_reg (mode0, op0);
38140 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38141 op1 = copy_to_mode_reg (mode1, op1);
38142 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38143 op2 = copy_to_mode_reg (mode2, op2);
38144 pat = GEN_FCN (icode) (op0, op1, op2);
38145 if (! pat)
38146 return 0;
38147 emit_insn (pat);
38148 return 0;
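    /* The ldmxcsr and stmxcsr instructions only take a 32-bit memory
       operand, so the next two cases stage the MXCSR value through a
       stack temporary (SLOT_TEMP).  */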
38150 case IX86_BUILTIN_LDMXCSR:
38151 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
38152 target = assign_386_stack_local (SImode, SLOT_TEMP);
38153 emit_move_insn (target, op0);
38154 emit_insn (gen_sse_ldmxcsr (target));
38155 return 0;
38157 case IX86_BUILTIN_STMXCSR:
38158 target = assign_386_stack_local (SImode, SLOT_TEMP);
38159 emit_insn (gen_sse_stmxcsr (target));
38160 return copy_to_mode_reg (SImode, target);
38162 case IX86_BUILTIN_CLFLUSH:
38163 arg0 = CALL_EXPR_ARG (exp, 0);
38164 op0 = expand_normal (arg0);
38165 icode = CODE_FOR_sse2_clflush;
38166 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38167 op0 = ix86_zero_extend_to_Pmode (op0);
38169 emit_insn (gen_sse2_clflush (op0));
38170 return 0;
38172 case IX86_BUILTIN_CLWB:
38173 arg0 = CALL_EXPR_ARG (exp, 0);
38174 op0 = expand_normal (arg0);
38175 icode = CODE_FOR_clwb;
38176 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38177 op0 = ix86_zero_extend_to_Pmode (op0);
38179 emit_insn (gen_clwb (op0));
38180 return 0;
38182 case IX86_BUILTIN_CLFLUSHOPT:
38183 arg0 = CALL_EXPR_ARG (exp, 0);
38184 op0 = expand_normal (arg0);
38185 icode = CODE_FOR_clflushopt;
38186 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38187 op0 = ix86_zero_extend_to_Pmode (op0);
38189 emit_insn (gen_clflushopt (op0));
38190 return 0;
38192 case IX86_BUILTIN_MONITOR:
38193 case IX86_BUILTIN_MONITORX:
38194 arg0 = CALL_EXPR_ARG (exp, 0);
38195 arg1 = CALL_EXPR_ARG (exp, 1);
38196 arg2 = CALL_EXPR_ARG (exp, 2);
38197 op0 = expand_normal (arg0);
38198 op1 = expand_normal (arg1);
38199 op2 = expand_normal (arg2);
38200 if (!REG_P (op0))
38201 op0 = ix86_zero_extend_to_Pmode (op0);
38202 if (!REG_P (op1))
38203 op1 = copy_to_mode_reg (SImode, op1);
38204 if (!REG_P (op2))
38205 op2 = copy_to_mode_reg (SImode, op2);
38207 emit_insn (fcode == IX86_BUILTIN_MONITOR
38208 ? ix86_gen_monitor (op0, op1, op2)
38209 : ix86_gen_monitorx (op0, op1, op2));
38210 return 0;
38212 case IX86_BUILTIN_MWAIT:
38213 arg0 = CALL_EXPR_ARG (exp, 0);
38214 arg1 = CALL_EXPR_ARG (exp, 1);
38215 op0 = expand_normal (arg0);
38216 op1 = expand_normal (arg1);
38217 if (!REG_P (op0))
38218 op0 = copy_to_mode_reg (SImode, op0);
38219 if (!REG_P (op1))
38220 op1 = copy_to_mode_reg (SImode, op1);
38221 emit_insn (gen_sse3_mwait (op0, op1));
38222 return 0;
38224 case IX86_BUILTIN_MWAITX:
38225 arg0 = CALL_EXPR_ARG (exp, 0);
38226 arg1 = CALL_EXPR_ARG (exp, 1);
38227 arg2 = CALL_EXPR_ARG (exp, 2);
38228 op0 = expand_normal (arg0);
38229 op1 = expand_normal (arg1);
38230 op2 = expand_normal (arg2);
38231 if (!REG_P (op0))
38232 op0 = copy_to_mode_reg (SImode, op0);
38233 if (!REG_P (op1))
38234 op1 = copy_to_mode_reg (SImode, op1);
38235 if (!REG_P (op2))
38236 op2 = copy_to_mode_reg (SImode, op2);
38237 emit_insn (gen_mwaitx (op0, op1, op2));
38238 return 0;
38240 case IX86_BUILTIN_CLZERO:
38241 arg0 = CALL_EXPR_ARG (exp, 0);
38242 op0 = expand_normal (arg0);
38243 if (!REG_P (op0))
38244 op0 = ix86_zero_extend_to_Pmode (op0);
38245 emit_insn (ix86_gen_clzero (op0));
38246 return 0;
38248 case IX86_BUILTIN_VEC_INIT_V2SI:
38249 case IX86_BUILTIN_VEC_INIT_V4HI:
38250 case IX86_BUILTIN_VEC_INIT_V8QI:
38251 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
38253 case IX86_BUILTIN_VEC_EXT_V2DF:
38254 case IX86_BUILTIN_VEC_EXT_V2DI:
38255 case IX86_BUILTIN_VEC_EXT_V4SF:
38256 case IX86_BUILTIN_VEC_EXT_V4SI:
38257 case IX86_BUILTIN_VEC_EXT_V8HI:
38258 case IX86_BUILTIN_VEC_EXT_V2SI:
38259 case IX86_BUILTIN_VEC_EXT_V4HI:
38260 case IX86_BUILTIN_VEC_EXT_V16QI:
38261 return ix86_expand_vec_ext_builtin (exp, target);
38263 case IX86_BUILTIN_VEC_SET_V2DI:
38264 case IX86_BUILTIN_VEC_SET_V4SF:
38265 case IX86_BUILTIN_VEC_SET_V4SI:
38266 case IX86_BUILTIN_VEC_SET_V8HI:
38267 case IX86_BUILTIN_VEC_SET_V4HI:
38268 case IX86_BUILTIN_VEC_SET_V16QI:
38269 return ix86_expand_vec_set_builtin (exp);
38271 case IX86_BUILTIN_NANQ:
38272 case IX86_BUILTIN_NANSQ:
38273 return expand_call (exp, target, ignore);
38275 case IX86_BUILTIN_RDPMC:
38276 case IX86_BUILTIN_RDTSC:
38277 case IX86_BUILTIN_RDTSCP:
38278 case IX86_BUILTIN_XGETBV:
38280 op0 = gen_reg_rtx (DImode);
38281 op1 = gen_reg_rtx (DImode);
38283 if (fcode == IX86_BUILTIN_RDPMC)
38285 arg0 = CALL_EXPR_ARG (exp, 0);
38286 op2 = expand_normal (arg0);
38287 if (!register_operand (op2, SImode))
38288 op2 = copy_to_mode_reg (SImode, op2);
38290 insn = (TARGET_64BIT
38291 ? gen_rdpmc_rex64 (op0, op1, op2)
38292 : gen_rdpmc (op0, op2));
38293 emit_insn (insn);
38295 else if (fcode == IX86_BUILTIN_XGETBV)
38297 arg0 = CALL_EXPR_ARG (exp, 0);
38298 op2 = expand_normal (arg0);
38299 if (!register_operand (op2, SImode))
38300 op2 = copy_to_mode_reg (SImode, op2);
38302 insn = (TARGET_64BIT
38303 ? gen_xgetbv_rex64 (op0, op1, op2)
38304 : gen_xgetbv (op0, op2));
38305 emit_insn (insn);
38307 else if (fcode == IX86_BUILTIN_RDTSC)
38309 insn = (TARGET_64BIT
38310 ? gen_rdtsc_rex64 (op0, op1)
38311 : gen_rdtsc (op0));
38312 emit_insn (insn);
38314 else
38316 op2 = gen_reg_rtx (SImode);
38318 insn = (TARGET_64BIT
38319 ? gen_rdtscp_rex64 (op0, op1, op2)
38320 : gen_rdtscp (op0, op2));
38321 emit_insn (insn);
38323 arg0 = CALL_EXPR_ARG (exp, 0);
38324 op4 = expand_normal (arg0);
38325 if (!address_operand (op4, VOIDmode))
38327 op4 = convert_memory_address (Pmode, op4);
38328 op4 = copy_addr_to_reg (op4);
38330 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38333 if (target == 0)
38335 /* mode is VOIDmode if __builtin_rd* has been called
38336 without lhs. */
38337 if (mode == VOIDmode)
38338 return target;
38339 target = gen_reg_rtx (mode);
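      /* These instructions return their result in the EDX:EAX register
	 pair; on 64-bit targets combine the two 32-bit halves into a
	 single DImode value.  */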
38342 if (TARGET_64BIT)
38344 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38345 op1, 1, OPTAB_DIRECT);
38346 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38347 op0, 1, OPTAB_DIRECT);
38350 emit_move_insn (target, op0);
38351 return target;
38353 case IX86_BUILTIN_FXSAVE:
38354 case IX86_BUILTIN_FXRSTOR:
38355 case IX86_BUILTIN_FXSAVE64:
38356 case IX86_BUILTIN_FXRSTOR64:
38357 case IX86_BUILTIN_FNSTENV:
38358 case IX86_BUILTIN_FLDENV:
38359 mode0 = BLKmode;
38360 switch (fcode)
38362 case IX86_BUILTIN_FXSAVE:
38363 icode = CODE_FOR_fxsave;
38364 break;
38365 case IX86_BUILTIN_FXRSTOR:
38366 icode = CODE_FOR_fxrstor;
38367 break;
38368 case IX86_BUILTIN_FXSAVE64:
38369 icode = CODE_FOR_fxsave64;
38370 break;
38371 case IX86_BUILTIN_FXRSTOR64:
38372 icode = CODE_FOR_fxrstor64;
38373 break;
38374 case IX86_BUILTIN_FNSTENV:
38375 icode = CODE_FOR_fnstenv;
38376 break;
38377 case IX86_BUILTIN_FLDENV:
38378 icode = CODE_FOR_fldenv;
38379 break;
38380 default:
38381 gcc_unreachable ();
38384 arg0 = CALL_EXPR_ARG (exp, 0);
38385 op0 = expand_normal (arg0);
38387 if (!address_operand (op0, VOIDmode))
38389 op0 = convert_memory_address (Pmode, op0);
38390 op0 = copy_addr_to_reg (op0);
38392 op0 = gen_rtx_MEM (mode0, op0);
38394 pat = GEN_FCN (icode) (op0);
38395 if (pat)
38396 emit_insn (pat);
38397 return 0;
38399 case IX86_BUILTIN_XSETBV:
38400 arg0 = CALL_EXPR_ARG (exp, 0);
38401 arg1 = CALL_EXPR_ARG (exp, 1);
38402 op0 = expand_normal (arg0);
38403 op1 = expand_normal (arg1);
38405 if (!REG_P (op0))
38406 op0 = copy_to_mode_reg (SImode, op0);
38408 if (TARGET_64BIT)
38410 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38411 NULL, 1, OPTAB_DIRECT);
38413 op2 = gen_lowpart (SImode, op2);
38414 op1 = gen_lowpart (SImode, op1);
38415 if (!REG_P (op1))
38416 op1 = copy_to_mode_reg (SImode, op1);
38417 if (!REG_P (op2))
38418 op2 = copy_to_mode_reg (SImode, op2);
38419 icode = CODE_FOR_xsetbv_rex64;
38420 pat = GEN_FCN (icode) (op0, op1, op2);
38422 else
38424 if (!REG_P (op1))
38425 op1 = copy_to_mode_reg (DImode, op1);
38426 icode = CODE_FOR_xsetbv;
38427 pat = GEN_FCN (icode) (op0, op1);
38429 if (pat)
38430 emit_insn (pat);
38431 return 0;
38433 case IX86_BUILTIN_XSAVE:
38434 case IX86_BUILTIN_XRSTOR:
38435 case IX86_BUILTIN_XSAVE64:
38436 case IX86_BUILTIN_XRSTOR64:
38437 case IX86_BUILTIN_XSAVEOPT:
38438 case IX86_BUILTIN_XSAVEOPT64:
38439 case IX86_BUILTIN_XSAVES:
38440 case IX86_BUILTIN_XRSTORS:
38441 case IX86_BUILTIN_XSAVES64:
38442 case IX86_BUILTIN_XRSTORS64:
38443 case IX86_BUILTIN_XSAVEC:
38444 case IX86_BUILTIN_XSAVEC64:
38445 arg0 = CALL_EXPR_ARG (exp, 0);
38446 arg1 = CALL_EXPR_ARG (exp, 1);
38447 op0 = expand_normal (arg0);
38448 op1 = expand_normal (arg1);
38450 if (!address_operand (op0, VOIDmode))
38452 op0 = convert_memory_address (Pmode, op0);
38453 op0 = copy_addr_to_reg (op0);
38455 op0 = gen_rtx_MEM (BLKmode, op0);
38457 op1 = force_reg (DImode, op1);
38459 if (TARGET_64BIT)
38461 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38462 NULL, 1, OPTAB_DIRECT);
38463 switch (fcode)
38465 case IX86_BUILTIN_XSAVE:
38466 icode = CODE_FOR_xsave_rex64;
38467 break;
38468 case IX86_BUILTIN_XRSTOR:
38469 icode = CODE_FOR_xrstor_rex64;
38470 break;
38471 case IX86_BUILTIN_XSAVE64:
38472 icode = CODE_FOR_xsave64;
38473 break;
38474 case IX86_BUILTIN_XRSTOR64:
38475 icode = CODE_FOR_xrstor64;
38476 break;
38477 case IX86_BUILTIN_XSAVEOPT:
38478 icode = CODE_FOR_xsaveopt_rex64;
38479 break;
38480 case IX86_BUILTIN_XSAVEOPT64:
38481 icode = CODE_FOR_xsaveopt64;
38482 break;
38483 case IX86_BUILTIN_XSAVES:
38484 icode = CODE_FOR_xsaves_rex64;
38485 break;
38486 case IX86_BUILTIN_XRSTORS:
38487 icode = CODE_FOR_xrstors_rex64;
38488 break;
38489 case IX86_BUILTIN_XSAVES64:
38490 icode = CODE_FOR_xsaves64;
38491 break;
38492 case IX86_BUILTIN_XRSTORS64:
38493 icode = CODE_FOR_xrstors64;
38494 break;
38495 case IX86_BUILTIN_XSAVEC:
38496 icode = CODE_FOR_xsavec_rex64;
38497 break;
38498 case IX86_BUILTIN_XSAVEC64:
38499 icode = CODE_FOR_xsavec64;
38500 break;
38501 default:
38502 gcc_unreachable ();
38505 op2 = gen_lowpart (SImode, op2);
38506 op1 = gen_lowpart (SImode, op1);
38507 pat = GEN_FCN (icode) (op0, op1, op2);
38509 else
38511 switch (fcode)
38513 case IX86_BUILTIN_XSAVE:
38514 icode = CODE_FOR_xsave;
38515 break;
38516 case IX86_BUILTIN_XRSTOR:
38517 icode = CODE_FOR_xrstor;
38518 break;
38519 case IX86_BUILTIN_XSAVEOPT:
38520 icode = CODE_FOR_xsaveopt;
38521 break;
38522 case IX86_BUILTIN_XSAVES:
38523 icode = CODE_FOR_xsaves;
38524 break;
38525 case IX86_BUILTIN_XRSTORS:
38526 icode = CODE_FOR_xrstors;
38527 break;
38528 case IX86_BUILTIN_XSAVEC:
38529 icode = CODE_FOR_xsavec;
38530 break;
38531 default:
38532 gcc_unreachable ();
38534 pat = GEN_FCN (icode) (op0, op1);
38537 if (pat)
38538 emit_insn (pat);
38539 return 0;
38541 case IX86_BUILTIN_LLWPCB:
38542 arg0 = CALL_EXPR_ARG (exp, 0);
38543 op0 = expand_normal (arg0);
38544 icode = CODE_FOR_lwp_llwpcb;
38545 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38546 op0 = ix86_zero_extend_to_Pmode (op0);
38547 emit_insn (gen_lwp_llwpcb (op0));
38548 return 0;
38550 case IX86_BUILTIN_SLWPCB:
38551 icode = CODE_FOR_lwp_slwpcb;
38552 if (!target
38553 || !insn_data[icode].operand[0].predicate (target, Pmode))
38554 target = gen_reg_rtx (Pmode);
38555 emit_insn (gen_lwp_slwpcb (target));
38556 return target;
38558 case IX86_BUILTIN_BEXTRI32:
38559 case IX86_BUILTIN_BEXTRI64:
38560 arg0 = CALL_EXPR_ARG (exp, 0);
38561 arg1 = CALL_EXPR_ARG (exp, 1);
38562 op0 = expand_normal (arg0);
38563 op1 = expand_normal (arg1);
38564 icode = (fcode == IX86_BUILTIN_BEXTRI32
38565 ? CODE_FOR_tbm_bextri_si
38566 : CODE_FOR_tbm_bextri_di);
38567 if (!CONST_INT_P (op1))
38569 error ("last argument must be an immediate");
38570 return const0_rtx;
38572 else
38574 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38575 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38576 op1 = GEN_INT (length);
38577 op2 = GEN_INT (lsb_index);
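	  /* E.g. an immediate of 0x0804 extracts an 8-bit field starting
	     at bit 4.  */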
38578 pat = GEN_FCN (icode) (target, op0, op1, op2);
38579 if (pat)
38580 emit_insn (pat);
38581 return target;
38584 case IX86_BUILTIN_RDRAND16_STEP:
38585 icode = CODE_FOR_rdrandhi_1;
38586 mode0 = HImode;
38587 goto rdrand_step;
38589 case IX86_BUILTIN_RDRAND32_STEP:
38590 icode = CODE_FOR_rdrandsi_1;
38591 mode0 = SImode;
38592 goto rdrand_step;
38594 case IX86_BUILTIN_RDRAND64_STEP:
38595 icode = CODE_FOR_rdranddi_1;
38596 mode0 = DImode;
38598 rdrand_step:
38599 arg0 = CALL_EXPR_ARG (exp, 0);
38600 op1 = expand_normal (arg0);
38601 if (!address_operand (op1, VOIDmode))
38603 op1 = convert_memory_address (Pmode, op1);
38604 op1 = copy_addr_to_reg (op1);
38607 op0 = gen_reg_rtx (mode0);
38608 emit_insn (GEN_FCN (icode) (op0));
38610 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38612 op1 = gen_reg_rtx (SImode);
38613 emit_move_insn (op1, CONST1_RTX (SImode));
38615 /* Emit SImode conditional move. */
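      /* op1 now holds the constant 1 and op2 will hold the (possibly
	 zero-extended) value the instruction left in op0.  The GEU test on
	 the carry flag selects op1 (1) when the instruction succeeded
	 (CF set) and op2 otherwise; since rdrand is documented to leave
	 zero in its destination when no random value was available, the
	 *_step builtin returns 1 on success and 0 on failure without
	 needing a separate setcc.  */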
38616 if (mode0 == HImode)
38618 if (TARGET_ZERO_EXTEND_WITH_AND
38619 && optimize_function_for_speed_p (cfun))
38621 op2 = force_reg (SImode, const0_rtx);
38623 emit_insn (gen_movstricthi
38624 (gen_lowpart (HImode, op2), op0));
38626 else
38628 op2 = gen_reg_rtx (SImode);
38630 emit_insn (gen_zero_extendhisi2 (op2, op0));
38633 else if (mode0 == SImode)
38634 op2 = op0;
38635 else
38636 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38638 if (target == 0
38639 || !register_operand (target, SImode))
38640 target = gen_reg_rtx (SImode);
38642 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38643 const0_rtx);
38644 emit_insn (gen_rtx_SET (target,
38645 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38646 return target;
38648 case IX86_BUILTIN_RDSEED16_STEP:
38649 icode = CODE_FOR_rdseedhi_1;
38650 mode0 = HImode;
38651 goto rdseed_step;
38653 case IX86_BUILTIN_RDSEED32_STEP:
38654 icode = CODE_FOR_rdseedsi_1;
38655 mode0 = SImode;
38656 goto rdseed_step;
38658 case IX86_BUILTIN_RDSEED64_STEP:
38659 icode = CODE_FOR_rdseeddi_1;
38660 mode0 = DImode;
38662 rdseed_step:
38663 arg0 = CALL_EXPR_ARG (exp, 0);
38664 op1 = expand_normal (arg0);
38665 if (!address_operand (op1, VOIDmode))
38667 op1 = convert_memory_address (Pmode, op1);
38668 op1 = copy_addr_to_reg (op1);
38671 op0 = gen_reg_rtx (mode0);
38672 emit_insn (GEN_FCN (icode) (op0));
38674 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38676 op2 = gen_reg_rtx (QImode);
38678 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38679 const0_rtx);
38680 emit_insn (gen_rtx_SET (op2, pat));
38682 if (target == 0
38683 || !register_operand (target, SImode))
38684 target = gen_reg_rtx (SImode);
38686 emit_insn (gen_zero_extendqisi2 (target, op2));
38687 return target;
38689 case IX86_BUILTIN_SBB32:
38690 icode = CODE_FOR_subborrowsi;
38691 mode0 = SImode;
38692 goto handlecarry;
38694 case IX86_BUILTIN_SBB64:
38695 icode = CODE_FOR_subborrowdi;
38696 mode0 = DImode;
38697 goto handlecarry;
38699 case IX86_BUILTIN_ADDCARRYX32:
38700 icode = CODE_FOR_addcarrysi;
38701 mode0 = SImode;
38702 goto handlecarry;
38704 case IX86_BUILTIN_ADDCARRYX64:
38705 icode = CODE_FOR_addcarrydi;
38706 mode0 = DImode;
38708 handlecarry:
38709 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38710 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38711 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38712 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38714 op1 = expand_normal (arg0);
38715 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38717 op2 = expand_normal (arg1);
38718 if (!register_operand (op2, mode0))
38719 op2 = copy_to_mode_reg (mode0, op2);
38721 op3 = expand_normal (arg2);
38722 if (!register_operand (op3, mode0))
38723 op3 = copy_to_mode_reg (mode0, op3);
38725 op4 = expand_normal (arg3);
38726 if (!address_operand (op4, VOIDmode))
38728 op4 = convert_memory_address (Pmode, op4);
38729 op4 = copy_addr_to_reg (op4);
38732 /* Generate CF from input operand. */
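      /* Adding constm1 (0xff) to the QImode carry-in byte produces a carry
	 out exactly when the byte is nonzero, re-creating the incoming CF
	 in the flags register.  */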
38733 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38735 /* Generate instruction that consumes CF. */
38736 op0 = gen_reg_rtx (mode0);
38738 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38739 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38740 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38742 /* Return current CF value. */
38743 if (target == 0)
38744 target = gen_reg_rtx (QImode);
38746 PUT_MODE (pat, QImode);
38747 emit_insn (gen_rtx_SET (target, pat));
38749 /* Store the result. */
38750 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38752 return target;
38754 case IX86_BUILTIN_READ_FLAGS:
38755 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38757 if (optimize
38758 || target == NULL_RTX
38759 || !nonimmediate_operand (target, word_mode)
38760 || GET_MODE (target) != word_mode)
38761 target = gen_reg_rtx (word_mode);
38763 emit_insn (gen_pop (target));
38764 return target;
38766 case IX86_BUILTIN_WRITE_FLAGS:
38768 arg0 = CALL_EXPR_ARG (exp, 0);
38769 op0 = expand_normal (arg0);
38770 if (!general_no_elim_operand (op0, word_mode))
38771 op0 = copy_to_mode_reg (word_mode, op0);
38773 emit_insn (gen_push (op0));
38774 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38775 return 0;
38777 case IX86_BUILTIN_KTESTC8:
38778 icode = CODE_FOR_ktestqi;
38779 mode3 = CCCmode;
38780 goto kortest;
38782 case IX86_BUILTIN_KTESTZ8:
38783 icode = CODE_FOR_ktestqi;
38784 mode3 = CCZmode;
38785 goto kortest;
38787 case IX86_BUILTIN_KTESTC16:
38788 icode = CODE_FOR_ktesthi;
38789 mode3 = CCCmode;
38790 goto kortest;
38792 case IX86_BUILTIN_KTESTZ16:
38793 icode = CODE_FOR_ktesthi;
38794 mode3 = CCZmode;
38795 goto kortest;
38797 case IX86_BUILTIN_KTESTC32:
38798 icode = CODE_FOR_ktestsi;
38799 mode3 = CCCmode;
38800 goto kortest;
38802 case IX86_BUILTIN_KTESTZ32:
38803 icode = CODE_FOR_ktestsi;
38804 mode3 = CCZmode;
38805 goto kortest;
38807 case IX86_BUILTIN_KTESTC64:
38808 icode = CODE_FOR_ktestdi;
38809 mode3 = CCCmode;
38810 goto kortest;
38812 case IX86_BUILTIN_KTESTZ64:
38813 icode = CODE_FOR_ktestdi;
38814 mode3 = CCZmode;
38815 goto kortest;
38817 case IX86_BUILTIN_KORTESTC8:
38818 icode = CODE_FOR_kortestqi;
38819 mode3 = CCCmode;
38820 goto kortest;
38822 case IX86_BUILTIN_KORTESTZ8:
38823 icode = CODE_FOR_kortestqi;
38824 mode3 = CCZmode;
38825 goto kortest;
38827 case IX86_BUILTIN_KORTESTC16:
38828 icode = CODE_FOR_kortesthi;
38829 mode3 = CCCmode;
38830 goto kortest;
38832 case IX86_BUILTIN_KORTESTZ16:
38833 icode = CODE_FOR_kortesthi;
38834 mode3 = CCZmode;
38835 goto kortest;
38837 case IX86_BUILTIN_KORTESTC32:
38838 icode = CODE_FOR_kortestsi;
38839 mode3 = CCCmode;
38840 goto kortest;
38842 case IX86_BUILTIN_KORTESTZ32:
38843 icode = CODE_FOR_kortestsi;
38844 mode3 = CCZmode;
38845 goto kortest;
38847 case IX86_BUILTIN_KORTESTC64:
38848 icode = CODE_FOR_kortestdi;
38849 mode3 = CCCmode;
38850 goto kortest;
38852 case IX86_BUILTIN_KORTESTZ64:
38853 icode = CODE_FOR_kortestdi;
38854 mode3 = CCZmode;
38856 kortest:
38857 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38858 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38859 op0 = expand_normal (arg0);
38860 op1 = expand_normal (arg1);
38862 mode0 = insn_data[icode].operand[0].mode;
38863 mode1 = insn_data[icode].operand[1].mode;
38865 if (GET_MODE (op0) != VOIDmode)
38866 op0 = force_reg (GET_MODE (op0), op0);
38868 op0 = gen_lowpart (mode0, op0);
38870 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38871 op0 = copy_to_mode_reg (mode0, op0);
38873 if (GET_MODE (op1) != VOIDmode)
38874 op1 = force_reg (GET_MODE (op1), op1);
38876 op1 = gen_lowpart (mode1, op1);
38878 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38879 op1 = copy_to_mode_reg (mode1, op1);
38881 target = gen_reg_rtx (QImode);
38883 /* Emit kortest. */
38884 emit_insn (GEN_FCN (icode) (op0, op1));
38885 /* And use setcc to return result from flags. */
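      /* mode3 selects which flag is examined: CCCmode (carry) for the *C
	 variants and CCZmode (zero) for the *Z variants above.  */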
38886 ix86_expand_setcc (target, EQ,
38887 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
38888 return target;
38890 case IX86_BUILTIN_GATHERSIV2DF:
38891 icode = CODE_FOR_avx2_gathersiv2df;
38892 goto gather_gen;
38893 case IX86_BUILTIN_GATHERSIV4DF:
38894 icode = CODE_FOR_avx2_gathersiv4df;
38895 goto gather_gen;
38896 case IX86_BUILTIN_GATHERDIV2DF:
38897 icode = CODE_FOR_avx2_gatherdiv2df;
38898 goto gather_gen;
38899 case IX86_BUILTIN_GATHERDIV4DF:
38900 icode = CODE_FOR_avx2_gatherdiv4df;
38901 goto gather_gen;
38902 case IX86_BUILTIN_GATHERSIV4SF:
38903 icode = CODE_FOR_avx2_gathersiv4sf;
38904 goto gather_gen;
38905 case IX86_BUILTIN_GATHERSIV8SF:
38906 icode = CODE_FOR_avx2_gathersiv8sf;
38907 goto gather_gen;
38908 case IX86_BUILTIN_GATHERDIV4SF:
38909 icode = CODE_FOR_avx2_gatherdiv4sf;
38910 goto gather_gen;
38911 case IX86_BUILTIN_GATHERDIV8SF:
38912 icode = CODE_FOR_avx2_gatherdiv8sf;
38913 goto gather_gen;
38914 case IX86_BUILTIN_GATHERSIV2DI:
38915 icode = CODE_FOR_avx2_gathersiv2di;
38916 goto gather_gen;
38917 case IX86_BUILTIN_GATHERSIV4DI:
38918 icode = CODE_FOR_avx2_gathersiv4di;
38919 goto gather_gen;
38920 case IX86_BUILTIN_GATHERDIV2DI:
38921 icode = CODE_FOR_avx2_gatherdiv2di;
38922 goto gather_gen;
38923 case IX86_BUILTIN_GATHERDIV4DI:
38924 icode = CODE_FOR_avx2_gatherdiv4di;
38925 goto gather_gen;
38926 case IX86_BUILTIN_GATHERSIV4SI:
38927 icode = CODE_FOR_avx2_gathersiv4si;
38928 goto gather_gen;
38929 case IX86_BUILTIN_GATHERSIV8SI:
38930 icode = CODE_FOR_avx2_gathersiv8si;
38931 goto gather_gen;
38932 case IX86_BUILTIN_GATHERDIV4SI:
38933 icode = CODE_FOR_avx2_gatherdiv4si;
38934 goto gather_gen;
38935 case IX86_BUILTIN_GATHERDIV8SI:
38936 icode = CODE_FOR_avx2_gatherdiv8si;
38937 goto gather_gen;
38938 case IX86_BUILTIN_GATHERALTSIV4DF:
38939 icode = CODE_FOR_avx2_gathersiv4df;
38940 goto gather_gen;
38941 case IX86_BUILTIN_GATHERALTDIV8SF:
38942 icode = CODE_FOR_avx2_gatherdiv8sf;
38943 goto gather_gen;
38944 case IX86_BUILTIN_GATHERALTSIV4DI:
38945 icode = CODE_FOR_avx2_gathersiv4di;
38946 goto gather_gen;
38947 case IX86_BUILTIN_GATHERALTDIV8SI:
38948 icode = CODE_FOR_avx2_gatherdiv8si;
38949 goto gather_gen;
38950 case IX86_BUILTIN_GATHER3SIV16SF:
38951 icode = CODE_FOR_avx512f_gathersiv16sf;
38952 goto gather_gen;
38953 case IX86_BUILTIN_GATHER3SIV8DF:
38954 icode = CODE_FOR_avx512f_gathersiv8df;
38955 goto gather_gen;
38956 case IX86_BUILTIN_GATHER3DIV16SF:
38957 icode = CODE_FOR_avx512f_gatherdiv16sf;
38958 goto gather_gen;
38959 case IX86_BUILTIN_GATHER3DIV8DF:
38960 icode = CODE_FOR_avx512f_gatherdiv8df;
38961 goto gather_gen;
38962 case IX86_BUILTIN_GATHER3SIV16SI:
38963 icode = CODE_FOR_avx512f_gathersiv16si;
38964 goto gather_gen;
38965 case IX86_BUILTIN_GATHER3SIV8DI:
38966 icode = CODE_FOR_avx512f_gathersiv8di;
38967 goto gather_gen;
38968 case IX86_BUILTIN_GATHER3DIV16SI:
38969 icode = CODE_FOR_avx512f_gatherdiv16si;
38970 goto gather_gen;
38971 case IX86_BUILTIN_GATHER3DIV8DI:
38972 icode = CODE_FOR_avx512f_gatherdiv8di;
38973 goto gather_gen;
38974 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38975 icode = CODE_FOR_avx512f_gathersiv8df;
38976 goto gather_gen;
38977 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38978 icode = CODE_FOR_avx512f_gatherdiv16sf;
38979 goto gather_gen;
38980 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38981 icode = CODE_FOR_avx512f_gathersiv8di;
38982 goto gather_gen;
38983 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38984 icode = CODE_FOR_avx512f_gatherdiv16si;
38985 goto gather_gen;
38986 case IX86_BUILTIN_GATHER3SIV2DF:
38987 icode = CODE_FOR_avx512vl_gathersiv2df;
38988 goto gather_gen;
38989 case IX86_BUILTIN_GATHER3SIV4DF:
38990 icode = CODE_FOR_avx512vl_gathersiv4df;
38991 goto gather_gen;
38992 case IX86_BUILTIN_GATHER3DIV2DF:
38993 icode = CODE_FOR_avx512vl_gatherdiv2df;
38994 goto gather_gen;
38995 case IX86_BUILTIN_GATHER3DIV4DF:
38996 icode = CODE_FOR_avx512vl_gatherdiv4df;
38997 goto gather_gen;
38998 case IX86_BUILTIN_GATHER3SIV4SF:
38999 icode = CODE_FOR_avx512vl_gathersiv4sf;
39000 goto gather_gen;
39001 case IX86_BUILTIN_GATHER3SIV8SF:
39002 icode = CODE_FOR_avx512vl_gathersiv8sf;
39003 goto gather_gen;
39004 case IX86_BUILTIN_GATHER3DIV4SF:
39005 icode = CODE_FOR_avx512vl_gatherdiv4sf;
39006 goto gather_gen;
39007 case IX86_BUILTIN_GATHER3DIV8SF:
39008 icode = CODE_FOR_avx512vl_gatherdiv8sf;
39009 goto gather_gen;
39010 case IX86_BUILTIN_GATHER3SIV2DI:
39011 icode = CODE_FOR_avx512vl_gathersiv2di;
39012 goto gather_gen;
39013 case IX86_BUILTIN_GATHER3SIV4DI:
39014 icode = CODE_FOR_avx512vl_gathersiv4di;
39015 goto gather_gen;
39016 case IX86_BUILTIN_GATHER3DIV2DI:
39017 icode = CODE_FOR_avx512vl_gatherdiv2di;
39018 goto gather_gen;
39019 case IX86_BUILTIN_GATHER3DIV4DI:
39020 icode = CODE_FOR_avx512vl_gatherdiv4di;
39021 goto gather_gen;
39022 case IX86_BUILTIN_GATHER3SIV4SI:
39023 icode = CODE_FOR_avx512vl_gathersiv4si;
39024 goto gather_gen;
39025 case IX86_BUILTIN_GATHER3SIV8SI:
39026 icode = CODE_FOR_avx512vl_gathersiv8si;
39027 goto gather_gen;
39028 case IX86_BUILTIN_GATHER3DIV4SI:
39029 icode = CODE_FOR_avx512vl_gatherdiv4si;
39030 goto gather_gen;
39031 case IX86_BUILTIN_GATHER3DIV8SI:
39032 icode = CODE_FOR_avx512vl_gatherdiv8si;
39033 goto gather_gen;
39034 case IX86_BUILTIN_GATHER3ALTSIV4DF:
39035 icode = CODE_FOR_avx512vl_gathersiv4df;
39036 goto gather_gen;
39037 case IX86_BUILTIN_GATHER3ALTDIV8SF:
39038 icode = CODE_FOR_avx512vl_gatherdiv8sf;
39039 goto gather_gen;
39040 case IX86_BUILTIN_GATHER3ALTSIV4DI:
39041 icode = CODE_FOR_avx512vl_gathersiv4di;
39042 goto gather_gen;
39043 case IX86_BUILTIN_GATHER3ALTDIV8SI:
39044 icode = CODE_FOR_avx512vl_gatherdiv8si;
39045 goto gather_gen;
39046 case IX86_BUILTIN_SCATTERSIV16SF:
39047 icode = CODE_FOR_avx512f_scattersiv16sf;
39048 goto scatter_gen;
39049 case IX86_BUILTIN_SCATTERSIV8DF:
39050 icode = CODE_FOR_avx512f_scattersiv8df;
39051 goto scatter_gen;
39052 case IX86_BUILTIN_SCATTERDIV16SF:
39053 icode = CODE_FOR_avx512f_scatterdiv16sf;
39054 goto scatter_gen;
39055 case IX86_BUILTIN_SCATTERDIV8DF:
39056 icode = CODE_FOR_avx512f_scatterdiv8df;
39057 goto scatter_gen;
39058 case IX86_BUILTIN_SCATTERSIV16SI:
39059 icode = CODE_FOR_avx512f_scattersiv16si;
39060 goto scatter_gen;
39061 case IX86_BUILTIN_SCATTERSIV8DI:
39062 icode = CODE_FOR_avx512f_scattersiv8di;
39063 goto scatter_gen;
39064 case IX86_BUILTIN_SCATTERDIV16SI:
39065 icode = CODE_FOR_avx512f_scatterdiv16si;
39066 goto scatter_gen;
39067 case IX86_BUILTIN_SCATTERDIV8DI:
39068 icode = CODE_FOR_avx512f_scatterdiv8di;
39069 goto scatter_gen;
39070 case IX86_BUILTIN_SCATTERSIV8SF:
39071 icode = CODE_FOR_avx512vl_scattersiv8sf;
39072 goto scatter_gen;
39073 case IX86_BUILTIN_SCATTERSIV4SF:
39074 icode = CODE_FOR_avx512vl_scattersiv4sf;
39075 goto scatter_gen;
39076 case IX86_BUILTIN_SCATTERSIV4DF:
39077 icode = CODE_FOR_avx512vl_scattersiv4df;
39078 goto scatter_gen;
39079 case IX86_BUILTIN_SCATTERSIV2DF:
39080 icode = CODE_FOR_avx512vl_scattersiv2df;
39081 goto scatter_gen;
39082 case IX86_BUILTIN_SCATTERDIV8SF:
39083 icode = CODE_FOR_avx512vl_scatterdiv8sf;
39084 goto scatter_gen;
39085 case IX86_BUILTIN_SCATTERDIV4SF:
39086 icode = CODE_FOR_avx512vl_scatterdiv4sf;
39087 goto scatter_gen;
39088 case IX86_BUILTIN_SCATTERDIV4DF:
39089 icode = CODE_FOR_avx512vl_scatterdiv4df;
39090 goto scatter_gen;
39091 case IX86_BUILTIN_SCATTERDIV2DF:
39092 icode = CODE_FOR_avx512vl_scatterdiv2df;
39093 goto scatter_gen;
39094 case IX86_BUILTIN_SCATTERSIV8SI:
39095 icode = CODE_FOR_avx512vl_scattersiv8si;
39096 goto scatter_gen;
39097 case IX86_BUILTIN_SCATTERSIV4SI:
39098 icode = CODE_FOR_avx512vl_scattersiv4si;
39099 goto scatter_gen;
39100 case IX86_BUILTIN_SCATTERSIV4DI:
39101 icode = CODE_FOR_avx512vl_scattersiv4di;
39102 goto scatter_gen;
39103 case IX86_BUILTIN_SCATTERSIV2DI:
39104 icode = CODE_FOR_avx512vl_scattersiv2di;
39105 goto scatter_gen;
39106 case IX86_BUILTIN_SCATTERDIV8SI:
39107 icode = CODE_FOR_avx512vl_scatterdiv8si;
39108 goto scatter_gen;
39109 case IX86_BUILTIN_SCATTERDIV4SI:
39110 icode = CODE_FOR_avx512vl_scatterdiv4si;
39111 goto scatter_gen;
39112 case IX86_BUILTIN_SCATTERDIV4DI:
39113 icode = CODE_FOR_avx512vl_scatterdiv4di;
39114 goto scatter_gen;
39115 case IX86_BUILTIN_SCATTERDIV2DI:
39116 icode = CODE_FOR_avx512vl_scatterdiv2di;
39117 goto scatter_gen;
39118 case IX86_BUILTIN_GATHERPFDPD:
39119 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
39120 goto vec_prefetch_gen;
39121 case IX86_BUILTIN_SCATTERALTSIV8DF:
39122 icode = CODE_FOR_avx512f_scattersiv8df;
39123 goto scatter_gen;
39124 case IX86_BUILTIN_SCATTERALTDIV16SF:
39125 icode = CODE_FOR_avx512f_scatterdiv16sf;
39126 goto scatter_gen;
39127 case IX86_BUILTIN_SCATTERALTSIV8DI:
39128 icode = CODE_FOR_avx512f_scattersiv8di;
39129 goto scatter_gen;
39130 case IX86_BUILTIN_SCATTERALTDIV16SI:
39131 icode = CODE_FOR_avx512f_scatterdiv16si;
39132 goto scatter_gen;
39133 case IX86_BUILTIN_GATHERPFDPS:
39134 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
39135 goto vec_prefetch_gen;
39136 case IX86_BUILTIN_GATHERPFQPD:
39137 icode = CODE_FOR_avx512pf_gatherpfv8didf;
39138 goto vec_prefetch_gen;
39139 case IX86_BUILTIN_GATHERPFQPS:
39140 icode = CODE_FOR_avx512pf_gatherpfv8disf;
39141 goto vec_prefetch_gen;
39142 case IX86_BUILTIN_SCATTERPFDPD:
39143 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
39144 goto vec_prefetch_gen;
39145 case IX86_BUILTIN_SCATTERPFDPS:
39146 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
39147 goto vec_prefetch_gen;
39148 case IX86_BUILTIN_SCATTERPFQPD:
39149 icode = CODE_FOR_avx512pf_scatterpfv8didf;
39150 goto vec_prefetch_gen;
39151 case IX86_BUILTIN_SCATTERPFQPS:
39152 icode = CODE_FOR_avx512pf_scatterpfv8disf;
39153 goto vec_prefetch_gen;
39155 gather_gen:
39156 rtx half;
39157 rtx (*gen) (rtx, rtx);
39159 arg0 = CALL_EXPR_ARG (exp, 0);
39160 arg1 = CALL_EXPR_ARG (exp, 1);
39161 arg2 = CALL_EXPR_ARG (exp, 2);
39162 arg3 = CALL_EXPR_ARG (exp, 3);
39163 arg4 = CALL_EXPR_ARG (exp, 4);
39164 op0 = expand_normal (arg0);
39165 op1 = expand_normal (arg1);
39166 op2 = expand_normal (arg2);
39167 op3 = expand_normal (arg3);
39168 op4 = expand_normal (arg4);
39169 /* Note the arg order is different from the operand order. */
39170 mode0 = insn_data[icode].operand[1].mode;
39171 mode2 = insn_data[icode].operand[3].mode;
39172 mode3 = insn_data[icode].operand[4].mode;
39173 mode4 = insn_data[icode].operand[5].mode;
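      /* Operand 0 is the destination; op0 (arg 0) is the source/merge
	 vector, op1 (arg 1) the base address, op2 (arg 2) the index
	 vector, op3 (arg 3) the mask and op4 (arg 4) the scale.  */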
39175 if (target == NULL_RTX
39176 || GET_MODE (target) != insn_data[icode].operand[0].mode
39177 || !insn_data[icode].operand[0].predicate (target,
39178 GET_MODE (target)))
39179 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
39180 else
39181 subtarget = target;
39183 switch (fcode)
39185 case IX86_BUILTIN_GATHER3ALTSIV8DF:
39186 case IX86_BUILTIN_GATHER3ALTSIV8DI:
39187 half = gen_reg_rtx (V8SImode);
39188 if (!nonimmediate_operand (op2, V16SImode))
39189 op2 = copy_to_mode_reg (V16SImode, op2);
39190 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39191 op2 = half;
39192 break;
39193 case IX86_BUILTIN_GATHER3ALTSIV4DF:
39194 case IX86_BUILTIN_GATHER3ALTSIV4DI:
39195 case IX86_BUILTIN_GATHERALTSIV4DF:
39196 case IX86_BUILTIN_GATHERALTSIV4DI:
39197 half = gen_reg_rtx (V4SImode);
39198 if (!nonimmediate_operand (op2, V8SImode))
39199 op2 = copy_to_mode_reg (V8SImode, op2);
39200 emit_insn (gen_vec_extract_lo_v8si (half, op2));
39201 op2 = half;
39202 break;
39203 case IX86_BUILTIN_GATHER3ALTDIV16SF:
39204 case IX86_BUILTIN_GATHER3ALTDIV16SI:
39205 half = gen_reg_rtx (mode0);
39206 if (mode0 == V8SFmode)
39207 gen = gen_vec_extract_lo_v16sf;
39208 else
39209 gen = gen_vec_extract_lo_v16si;
39210 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39211 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39212 emit_insn (gen (half, op0));
39213 op0 = half;
39214 if (GET_MODE (op3) != VOIDmode)
39216 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39217 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39218 emit_insn (gen (half, op3));
39219 op3 = half;
39221 break;
39222 case IX86_BUILTIN_GATHER3ALTDIV8SF:
39223 case IX86_BUILTIN_GATHER3ALTDIV8SI:
39224 case IX86_BUILTIN_GATHERALTDIV8SF:
39225 case IX86_BUILTIN_GATHERALTDIV8SI:
39226 half = gen_reg_rtx (mode0);
39227 if (mode0 == V4SFmode)
39228 gen = gen_vec_extract_lo_v8sf;
39229 else
39230 gen = gen_vec_extract_lo_v8si;
39231 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39232 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39233 emit_insn (gen (half, op0));
39234 op0 = half;
39235 if (GET_MODE (op3) != VOIDmode)
39237 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39238 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39239 emit_insn (gen (half, op3));
39240 op3 = half;
39242 break;
39243 default:
39244 break;
 39247       /* Force the memory operand to contain only a base register here.
 39248 	 We don't want to do this to the memory operands of other
 39249 	 builtin functions.  */
39250 op1 = ix86_zero_extend_to_Pmode (op1);
39252 if (!insn_data[icode].operand[1].predicate (op0, mode0))
39253 op0 = copy_to_mode_reg (mode0, op0);
39254 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
39255 op1 = copy_to_mode_reg (Pmode, op1);
39256 if (!insn_data[icode].operand[3].predicate (op2, mode2))
39257 op2 = copy_to_mode_reg (mode2, op2);
39259 op3 = fixup_modeless_constant (op3, mode3);
39261 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
39263 if (!insn_data[icode].operand[4].predicate (op3, mode3))
39264 op3 = copy_to_mode_reg (mode3, op3);
39266 else
39268 op3 = copy_to_reg (op3);
39269 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
39271 if (!insn_data[icode].operand[5].predicate (op4, mode4))
39273 error ("the last argument must be scale 1, 2, 4, 8");
39274 return const0_rtx;
39277 /* Optimize. If mask is known to have all high bits set,
39278 replace op0 with pc_rtx to signal that the instruction
39279 overwrites the whole destination and doesn't use its
39280 previous contents. */
39281 if (optimize)
39283 if (TREE_CODE (arg3) == INTEGER_CST)
39285 if (integer_all_onesp (arg3))
39286 op0 = pc_rtx;
39288 else if (TREE_CODE (arg3) == VECTOR_CST)
39290 unsigned int negative = 0;
39291 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
39293 tree cst = VECTOR_CST_ELT (arg3, i);
39294 if (TREE_CODE (cst) == INTEGER_CST
39295 && tree_int_cst_sign_bit (cst))
39296 negative++;
39297 else if (TREE_CODE (cst) == REAL_CST
39298 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
39299 negative++;
39301 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
39302 op0 = pc_rtx;
39304 else if (TREE_CODE (arg3) == SSA_NAME
39305 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
39307 /* Recognize also when mask is like:
39308 __v2df src = _mm_setzero_pd ();
39309 __v2df mask = _mm_cmpeq_pd (src, src);
39311 __v8sf src = _mm256_setzero_ps ();
39312 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
39313 as that is a cheaper way to load all ones into
39314 a register than having to load a constant from
39315 memory. */
39316 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
39317 if (is_gimple_call (def_stmt))
39319 tree fndecl = gimple_call_fndecl (def_stmt);
39320 if (fndecl
39321 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
39322 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
39324 case IX86_BUILTIN_CMPPD:
39325 case IX86_BUILTIN_CMPPS:
39326 case IX86_BUILTIN_CMPPD256:
39327 case IX86_BUILTIN_CMPPS256:
39328 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39329 break;
39330 /* FALLTHRU */
39331 case IX86_BUILTIN_CMPEQPD:
39332 case IX86_BUILTIN_CMPEQPS:
39333 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39334 && initializer_zerop (gimple_call_arg (def_stmt,
39335 1)))
39336 op0 = pc_rtx;
39337 break;
39338 default:
39339 break;
39345 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39346 if (! pat)
39347 return const0_rtx;
39348 emit_insn (pat);
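/* For gathers whose insn pattern produces a wider vector than the
builtin returns, copy only the low half of SUBTARGET into TARGET. */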
39350 switch (fcode)
39352 case IX86_BUILTIN_GATHER3DIV16SF:
39353 if (target == NULL_RTX)
39354 target = gen_reg_rtx (V8SFmode);
39355 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39356 break;
39357 case IX86_BUILTIN_GATHER3DIV16SI:
39358 if (target == NULL_RTX)
39359 target = gen_reg_rtx (V8SImode);
39360 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39361 break;
39362 case IX86_BUILTIN_GATHER3DIV8SF:
39363 case IX86_BUILTIN_GATHERDIV8SF:
39364 if (target == NULL_RTX)
39365 target = gen_reg_rtx (V4SFmode);
39366 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39367 break;
39368 case IX86_BUILTIN_GATHER3DIV8SI:
39369 case IX86_BUILTIN_GATHERDIV8SI:
39370 if (target == NULL_RTX)
39371 target = gen_reg_rtx (V4SImode);
39372 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39373 break;
39374 default:
39375 target = subtarget;
39376 break;
39378 return target;
39380 scatter_gen:
39381 arg0 = CALL_EXPR_ARG (exp, 0);
39382 arg1 = CALL_EXPR_ARG (exp, 1);
39383 arg2 = CALL_EXPR_ARG (exp, 2);
39384 arg3 = CALL_EXPR_ARG (exp, 3);
39385 arg4 = CALL_EXPR_ARG (exp, 4);
39386 op0 = expand_normal (arg0);
39387 op1 = expand_normal (arg1);
39388 op2 = expand_normal (arg2);
39389 op3 = expand_normal (arg3);
39390 op4 = expand_normal (arg4);
39391 mode1 = insn_data[icode].operand[1].mode;
39392 mode2 = insn_data[icode].operand[2].mode;
39393 mode3 = insn_data[icode].operand[3].mode;
39394 mode4 = insn_data[icode].operand[4].mode;
39396 /* Scatter instruction stores operand op3 to memory with
39397 indices from op2 and scale from op4 under writemask op1.
39398 If index operand op2 has more elements than source operand
39399 op3, only its low half needs to be used, and vice versa. */
39400 switch (fcode)
39402 case IX86_BUILTIN_SCATTERALTSIV8DF:
39403 case IX86_BUILTIN_SCATTERALTSIV8DI:
39404 half = gen_reg_rtx (V8SImode);
39405 if (!nonimmediate_operand (op2, V16SImode))
39406 op2 = copy_to_mode_reg (V16SImode, op2);
39407 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39408 op2 = half;
39409 break;
39410 case IX86_BUILTIN_SCATTERALTDIV16SF:
39411 case IX86_BUILTIN_SCATTERALTDIV16SI:
39412 half = gen_reg_rtx (mode3);
39413 if (mode3 == V8SFmode)
39414 gen = gen_vec_extract_lo_v16sf;
39415 else
39416 gen = gen_vec_extract_lo_v16si;
39417 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39418 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39419 emit_insn (gen (half, op3));
39420 op3 = half;
39421 break;
39422 default:
39423 break;
39426 /* Force the memory operand to use only a base register here. We
39427 don't want to do this for the memory operands of other builtin
39428 functions. */
39429 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39431 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39432 op0 = copy_to_mode_reg (Pmode, op0);
39434 op1 = fixup_modeless_constant (op1, mode1);
39436 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39438 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39439 op1 = copy_to_mode_reg (mode1, op1);
39441 else
39443 op1 = copy_to_reg (op1);
39444 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39447 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39448 op2 = copy_to_mode_reg (mode2, op2);
39450 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39451 op3 = copy_to_mode_reg (mode3, op3);
39453 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39455 error ("the last argument must be scale 1, 2, 4, 8");
39456 return const0_rtx;
39459 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39460 if (! pat)
39461 return const0_rtx;
39463 emit_insn (pat);
39464 return 0;
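/* Common expansion for the gather/scatter prefetch builtins. The
arguments are the mask, the index vector, the base pointer, the scale
and the locality hint. */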
39466 vec_prefetch_gen:
39467 arg0 = CALL_EXPR_ARG (exp, 0);
39468 arg1 = CALL_EXPR_ARG (exp, 1);
39469 arg2 = CALL_EXPR_ARG (exp, 2);
39470 arg3 = CALL_EXPR_ARG (exp, 3);
39471 arg4 = CALL_EXPR_ARG (exp, 4);
39472 op0 = expand_normal (arg0);
39473 op1 = expand_normal (arg1);
39474 op2 = expand_normal (arg2);
39475 op3 = expand_normal (arg3);
39476 op4 = expand_normal (arg4);
39477 mode0 = insn_data[icode].operand[0].mode;
39478 mode1 = insn_data[icode].operand[1].mode;
39479 mode3 = insn_data[icode].operand[3].mode;
39480 mode4 = insn_data[icode].operand[4].mode;
39482 op0 = fixup_modeless_constant (op0, mode0);
39484 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39486 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39487 op0 = copy_to_mode_reg (mode0, op0);
39489 else
39491 op0 = copy_to_reg (op0);
39492 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39495 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39496 op1 = copy_to_mode_reg (mode1, op1);
39498 /* Force the memory operand to use only a base register here. We
39499 don't want to do this for the memory operands of other builtin
39500 functions. */
39501 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39503 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39504 op2 = copy_to_mode_reg (Pmode, op2);
39506 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39508 error ("the fourth argument must be scale 1, 2, 4, 8");
39509 return const0_rtx;
39512 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39514 error ("incorrect hint operand");
39515 return const0_rtx;
39518 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39519 if (! pat)
39520 return const0_rtx;
39522 emit_insn (pat);
39524 return 0;
39526 case IX86_BUILTIN_XABORT:
39527 icode = CODE_FOR_xabort;
39528 arg0 = CALL_EXPR_ARG (exp, 0);
39529 op0 = expand_normal (arg0);
39530 mode0 = insn_data[icode].operand[0].mode;
39531 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39533 error ("the xabort's argument must be an 8-bit immediate");
39534 return const0_rtx;
39536 emit_insn (gen_xabort (op0));
39537 return 0;
39539 default:
39540 break;
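/* Builtins not handled explicitly above are dispatched through the
descriptor tables, selected by the range their code falls into. */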
39543 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39544 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39546 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39547 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39548 target);
39551 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39552 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39554 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39555 switch (fcode)
39557 case IX86_BUILTIN_FABSQ:
39558 case IX86_BUILTIN_COPYSIGNQ:
39559 if (!TARGET_SSE)
39560 /* Emit a normal call if SSE isn't available. */
39561 return expand_call (exp, target, ignore);
39562 /* FALLTHRU */
39563 default:
39564 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39568 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39569 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39571 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39572 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39573 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39574 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39575 int masked = 1;
39576 machine_mode mode, wide_mode, nar_mode;
39578 nar_mode = V4SFmode;
39579 mode = V16SFmode;
39580 wide_mode = V64SFmode;
39581 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39582 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39584 switch (fcode)
39586 case IX86_BUILTIN_4FMAPS:
39587 fcn = gen_avx5124fmaddps_4fmaddps;
39588 masked = 0;
39589 goto v4fma_expand;
39591 case IX86_BUILTIN_4DPWSSD:
39592 nar_mode = V4SImode;
39593 mode = V16SImode;
39594 wide_mode = V64SImode;
39595 fcn = gen_avx5124vnniw_vp4dpwssd;
39596 masked = 0;
39597 goto v4fma_expand;
39599 case IX86_BUILTIN_4DPWSSDS:
39600 nar_mode = V4SImode;
39601 mode = V16SImode;
39602 wide_mode = V64SImode;
39603 fcn = gen_avx5124vnniw_vp4dpwssds;
39604 masked = 0;
39605 goto v4fma_expand;
39607 case IX86_BUILTIN_4FNMAPS:
39608 fcn = gen_avx5124fmaddps_4fnmaddps;
39609 masked = 0;
39610 goto v4fma_expand;
39612 case IX86_BUILTIN_4FNMAPS_MASK:
39613 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39614 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39615 goto v4fma_expand;
39617 case IX86_BUILTIN_4DPWSSD_MASK:
39618 nar_mode = V4SImode;
39619 mode = V16SImode;
39620 wide_mode = V64SImode;
39621 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39622 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39623 goto v4fma_expand;
39625 case IX86_BUILTIN_4DPWSSDS_MASK:
39626 nar_mode = V4SImode;
39627 mode = V16SImode;
39628 wide_mode = V64SImode;
39629 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39630 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39631 goto v4fma_expand;
39633 case IX86_BUILTIN_4FMAPS_MASK:
39635 tree args[4];
39636 rtx ops[4];
39637 rtx wide_reg;
39638 rtx accum;
39639 rtx addr;
39640 rtx mem;
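/* Shared expansion for the AVX512 4FMAPS/4VNNIW builtins: the first
four vector arguments are packed into one wide (4x) register operand,
the accumulator (argument 4) is copied into the target and the narrow
memory operand at argument 5 supplies the shared operand. The masked
forms additionally read a merge value and a mask as arguments 6 and 7. */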
39642 v4fma_expand:
39643 wide_reg = gen_reg_rtx (wide_mode);
39644 for (i = 0; i < 4; i++)
39646 args[i] = CALL_EXPR_ARG (exp, i);
39647 ops[i] = expand_normal (args[i]);
39649 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39650 ops[i]);
39653 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39654 accum = force_reg (mode, accum);
39656 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39657 addr = force_reg (Pmode, addr);
39659 mem = gen_rtx_MEM (nar_mode, addr);
39661 target = gen_reg_rtx (mode);
39663 emit_move_insn (target, accum);
39665 if (! masked)
39666 emit_insn (fcn (target, accum, wide_reg, mem));
39667 else
39669 rtx merge, mask;
39670 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39672 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39674 if (CONST_INT_P (mask))
39675 mask = fixup_modeless_constant (mask, HImode);
39677 mask = force_reg (HImode, mask);
39679 if (GET_MODE (mask) != HImode)
39680 mask = gen_rtx_SUBREG (HImode, mask, 0);
39682 /* If merge is 0 then we're about to emit z-masked variant. */
39683 if (const0_operand (merge, mode))
39684 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39685 /* If merge is the same as accum then emit merge-masked variant. */
39686 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39688 merge = force_reg (mode, merge);
39689 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39691 /* Merging with an unknown value can happen if we z-mask with -O0. */
39692 else
39694 target = gen_reg_rtx (mode);
39695 emit_move_insn (target, merge);
39696 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39699 return target;
39702 case IX86_BUILTIN_4FNMASS:
39703 fcn = gen_avx5124fmaddps_4fnmaddss;
39704 masked = 0;
39705 goto s4fma_expand;
39707 case IX86_BUILTIN_4FMASS:
39708 fcn = gen_avx5124fmaddps_4fmaddss;
39709 masked = 0;
39710 goto s4fma_expand;
39712 case IX86_BUILTIN_4FNMASS_MASK:
39713 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39714 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39715 goto s4fma_expand;
39717 case IX86_BUILTIN_4FMASS_MASK:
39719 tree args[4];
39720 rtx ops[4];
39721 rtx wide_reg;
39722 rtx accum;
39723 rtx addr;
39724 rtx mem;
39726 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39727 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
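/* Scalar variant of the expansion above: only the low SFmode element
of each of the four sources is meaningful, so it is copied into the
corresponding slice of the wide register before the insn is emitted. */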
39729 s4fma_expand:
39730 mode = V4SFmode;
39731 wide_reg = gen_reg_rtx (V64SFmode);
39732 for (i = 0; i < 4; i++)
39734 rtx tmp;
39735 args[i] = CALL_EXPR_ARG (exp, i);
39736 ops[i] = expand_normal (args[i]);
39738 tmp = gen_reg_rtx (SFmode);
39739 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39741 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39742 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39745 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39746 accum = force_reg (V4SFmode, accum);
39748 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39749 addr = force_reg (Pmode, addr);
39751 mem = gen_rtx_MEM (V4SFmode, addr);
39753 target = gen_reg_rtx (V4SFmode);
39755 emit_move_insn (target, accum);
39757 if (! masked)
39758 emit_insn (fcn (target, accum, wide_reg, mem));
39759 else
39761 rtx merge, mask;
39762 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39764 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39766 if (CONST_INT_P (mask))
39767 mask = fixup_modeless_constant (mask, QImode);
39769 mask = force_reg (QImode, mask);
39771 if (GET_MODE (mask) != QImode)
39772 mask = gen_rtx_SUBREG (QImode, mask, 0);
39774 /* If merge is 0 then we're about to emit z-masked variant. */
39775 if (const0_operand (merge, mode))
39776 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39777 /* If merge is the same as accum then emit merge-masked
39778 variant. */
39779 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39781 merge = force_reg (mode, merge);
39782 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39784 /* Merging with an unknown value can happen if we z-mask
39785 with -O0. */
39786 else
39788 target = gen_reg_rtx (mode);
39789 emit_move_insn (target, merge);
39790 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39793 return target;
39795 case IX86_BUILTIN_RDPID:
39796 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39797 target);
39798 default:
39799 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39803 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39804 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39806 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39807 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39810 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39811 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39813 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39814 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39817 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39818 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39820 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39821 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39824 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39825 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39827 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39828 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39831 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39832 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39834 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39835 const struct builtin_description *d = bdesc_multi_arg + i;
39836 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39837 (enum ix86_builtin_func_type)
39838 d->flag, d->comparison);
39841 gcc_unreachable ();
39844 /* This returns the target-specific builtin with code CODE if
39845 current_function_decl has visibility on this builtin, which is checked
39846 using isa flags. Returns NULL_TREE otherwise. */
39848 static tree ix86_get_builtin (enum ix86_builtins code)
39850 struct cl_target_option *opts;
39851 tree target_tree = NULL_TREE;
39853 /* Determine the isa flags of current_function_decl. */
39855 if (current_function_decl)
39856 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39858 if (target_tree == NULL)
39859 target_tree = target_option_default_node;
39861 opts = TREE_TARGET_OPTION (target_tree);
39863 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39864 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39865 return ix86_builtin_decl (code, true);
39866 else
39867 return NULL_TREE;
39870 /* Return function decl for target specific builtin
39871 for the given MPX builtin passed in FCODE. */
39872 static tree
39873 ix86_builtin_mpx_function (unsigned fcode)
39875 switch (fcode)
39877 case BUILT_IN_CHKP_BNDMK:
39878 return ix86_builtins[IX86_BUILTIN_BNDMK];
39880 case BUILT_IN_CHKP_BNDSTX:
39881 return ix86_builtins[IX86_BUILTIN_BNDSTX];
39883 case BUILT_IN_CHKP_BNDLDX:
39884 return ix86_builtins[IX86_BUILTIN_BNDLDX];
39886 case BUILT_IN_CHKP_BNDCL:
39887 return ix86_builtins[IX86_BUILTIN_BNDCL];
39889 case BUILT_IN_CHKP_BNDCU:
39890 return ix86_builtins[IX86_BUILTIN_BNDCU];
39892 case BUILT_IN_CHKP_BNDRET:
39893 return ix86_builtins[IX86_BUILTIN_BNDRET];
39895 case BUILT_IN_CHKP_INTERSECT:
39896 return ix86_builtins[IX86_BUILTIN_BNDINT];
39898 case BUILT_IN_CHKP_NARROW:
39899 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
39901 case BUILT_IN_CHKP_SIZEOF:
39902 return ix86_builtins[IX86_BUILTIN_SIZEOF];
39904 case BUILT_IN_CHKP_EXTRACT_LOWER:
39905 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
39907 case BUILT_IN_CHKP_EXTRACT_UPPER:
39908 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
39910 default:
39911 return NULL_TREE;
39914 gcc_unreachable ();
39917 /* Helper function for ix86_load_bounds and ix86_store_bounds.
39919 Return an address to be used to load/store bounds for pointer
39920 passed in SLOT.
39922 SLOT_NO is an integer constant holding number of a target
39923 dependent special slot to be used in case SLOT is not a memory.
39925 SPECIAL_BASE is a pointer to be used as a base of fake address
39926 to access special slots in Bounds Table. SPECIAL_BASE[-1],
39927 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
39929 static rtx
39930 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
39932 rtx addr = NULL;
39934 /* NULL slot means we pass bounds for pointer not passed to the
39935 function at all. Register slot means we pass pointer in a
39936 register. In both these cases bounds are passed via Bounds
39937 Table. Since we do not have actual pointer stored in memory,
39938 we have to use fake addresses to access Bounds Table. We
39939 start with (special_base - sizeof (void*)) and decrease this
39940 address by pointer size to get addresses for other slots. */
39941 if (!slot || REG_P (slot))
39943 gcc_assert (CONST_INT_P (slot_no));
39944 addr = plus_constant (Pmode, special_base,
39945 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
39947 /* If pointer is passed in a memory then its address is used to
39948 access Bounds Table. */
39949 else if (MEM_P (slot))
39951 addr = XEXP (slot, 0);
39952 if (!register_operand (addr, Pmode))
39953 addr = copy_addr_to_reg (addr);
39955 else
39956 gcc_unreachable ();
39958 return addr;
39961 /* Expand pass uses this hook to load bounds for function parameter
39962 PTR passed in SLOT in case its bounds are not passed in a register.
39964 If SLOT is a memory, then bounds are loaded as for a regular pointer
39965 loaded from memory. PTR may be NULL in case SLOT is a memory.
39966 In that case the value of PTR (if required) may be loaded from SLOT.
39968 If SLOT is NULL or a register then SLOT_NO is an integer constant
39969 holding number of the target dependent special slot which should be
39970 used to obtain bounds.
39972 Return loaded bounds. */
39974 static rtx
39975 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
39977 rtx reg = gen_reg_rtx (BNDmode);
39978 rtx addr;
39980 /* Get address to be used to access Bounds Table. Special slots start
39981 at the location of return address of the current function. */
39982 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
39984 /* Load pointer value from a memory if we don't have it. */
39985 if (!ptr)
39987 gcc_assert (MEM_P (slot));
39988 ptr = copy_addr_to_reg (slot);
39991 if (!register_operand (ptr, Pmode))
39992 ptr = ix86_zero_extend_to_Pmode (ptr);
39994 emit_insn (BNDmode == BND64mode
39995 ? gen_bnd64_ldx (reg, addr, ptr)
39996 : gen_bnd32_ldx (reg, addr, ptr));
39998 return reg;
40001 /* Expand pass uses this hook to store BOUNDS for call argument PTR
40002 passed in SLOT in case BOUNDS are not passed in a register.
40004 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
40005 stored in memory. PTR may be NULL in case SLOT is a memory.
40006 In that case the value of PTR (if required) may be loaded from SLOT.
40008 If SLOT is NULL or a register then SLOT_NO is an integer constant
40009 holding number of the target dependent special slot which should be
40010 used to store BOUNDS. */
40012 static void
40013 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
40015 rtx addr;
40017 /* Get address to be used to access Bounds Table. Special slots start
40018 at the location of return address of a called function. */
40019 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
40021 /* Load pointer value from a memory if we don't have it. */
40022 if (!ptr)
40024 gcc_assert (MEM_P (slot));
40025 ptr = copy_addr_to_reg (slot);
40028 if (!register_operand (ptr, Pmode))
40029 ptr = ix86_zero_extend_to_Pmode (ptr);
40031 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
40032 if (!register_operand (bounds, BNDmode))
40033 bounds = copy_to_mode_reg (BNDmode, bounds);
40035 emit_insn (BNDmode == BND64mode
40036 ? gen_bnd64_stx (addr, ptr, bounds)
40037 : gen_bnd32_stx (addr, ptr, bounds));
40040 /* Load and return bounds returned by function in SLOT. */
40042 static rtx
40043 ix86_load_returned_bounds (rtx slot)
40045 rtx res;
40047 gcc_assert (REG_P (slot));
40048 res = gen_reg_rtx (BNDmode);
40049 emit_move_insn (res, slot);
40051 return res;
40054 /* Store BOUNDS returned by function into SLOT. */
40056 static void
40057 ix86_store_returned_bounds (rtx slot, rtx bounds)
40059 gcc_assert (REG_P (slot));
40060 emit_move_insn (slot, bounds);
40063 /* Returns a function decl for a vectorized version of the combined function
40064 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
40065 if it is not available. */
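/* For example, a V2DFmode floor is vectorized via IX86_BUILTIN_FLOORPD
when SSE4.1 is enabled and trapping math is disabled. */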
40067 static tree
40068 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
40069 tree type_in)
40071 machine_mode in_mode, out_mode;
40072 int in_n, out_n;
40074 if (TREE_CODE (type_out) != VECTOR_TYPE
40075 || TREE_CODE (type_in) != VECTOR_TYPE)
40076 return NULL_TREE;
40078 out_mode = TYPE_MODE (TREE_TYPE (type_out));
40079 out_n = TYPE_VECTOR_SUBPARTS (type_out);
40080 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40081 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40083 switch (fn)
40085 CASE_CFN_EXP2:
40086 if (out_mode == SFmode && in_mode == SFmode)
40088 if (out_n == 16 && in_n == 16)
40089 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
40091 break;
40093 CASE_CFN_IFLOOR:
40094 CASE_CFN_LFLOOR:
40095 CASE_CFN_LLFLOOR:
40096 /* The round insn does not trap on denormals. */
40097 if (flag_trapping_math || !TARGET_SSE4_1)
40098 break;
40100 if (out_mode == SImode && in_mode == DFmode)
40102 if (out_n == 4 && in_n == 2)
40103 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
40104 else if (out_n == 8 && in_n == 4)
40105 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
40106 else if (out_n == 16 && in_n == 8)
40107 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
40109 if (out_mode == SImode && in_mode == SFmode)
40111 if (out_n == 4 && in_n == 4)
40112 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
40113 else if (out_n == 8 && in_n == 8)
40114 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
40115 else if (out_n == 16 && in_n == 16)
40116 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
40118 break;
40120 CASE_CFN_ICEIL:
40121 CASE_CFN_LCEIL:
40122 CASE_CFN_LLCEIL:
40123 /* The round insn does not trap on denormals. */
40124 if (flag_trapping_math || !TARGET_SSE4_1)
40125 break;
40127 if (out_mode == SImode && in_mode == DFmode)
40129 if (out_n == 4 && in_n == 2)
40130 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
40131 else if (out_n == 8 && in_n == 4)
40132 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
40133 else if (out_n == 16 && in_n == 8)
40134 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
40136 if (out_mode == SImode && in_mode == SFmode)
40138 if (out_n == 4 && in_n == 4)
40139 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
40140 else if (out_n == 8 && in_n == 8)
40141 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
40142 else if (out_n == 16 && in_n == 16)
40143 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
40145 break;
40147 CASE_CFN_IRINT:
40148 CASE_CFN_LRINT:
40149 CASE_CFN_LLRINT:
40150 if (out_mode == SImode && in_mode == DFmode)
40152 if (out_n == 4 && in_n == 2)
40153 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
40154 else if (out_n == 8 && in_n == 4)
40155 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
40156 else if (out_n == 16 && in_n == 8)
40157 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
40159 if (out_mode == SImode && in_mode == SFmode)
40161 if (out_n == 4 && in_n == 4)
40162 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
40163 else if (out_n == 8 && in_n == 8)
40164 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
40165 else if (out_n == 16 && in_n == 16)
40166 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
40168 break;
40170 CASE_CFN_IROUND:
40171 CASE_CFN_LROUND:
40172 CASE_CFN_LLROUND:
40173 /* The round insn does not trap on denormals. */
40174 if (flag_trapping_math || !TARGET_SSE4_1)
40175 break;
40177 if (out_mode == SImode && in_mode == DFmode)
40179 if (out_n == 4 && in_n == 2)
40180 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
40181 else if (out_n == 8 && in_n == 4)
40182 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
40183 else if (out_n == 16 && in_n == 8)
40184 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
40186 if (out_mode == SImode && in_mode == SFmode)
40188 if (out_n == 4 && in_n == 4)
40189 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
40190 else if (out_n == 8 && in_n == 8)
40191 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
40192 else if (out_n == 16 && in_n == 16)
40193 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
40195 break;
40197 CASE_CFN_FLOOR:
40198 /* The round insn does not trap on denormals. */
40199 if (flag_trapping_math || !TARGET_SSE4_1)
40200 break;
40202 if (out_mode == DFmode && in_mode == DFmode)
40204 if (out_n == 2 && in_n == 2)
40205 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
40206 else if (out_n == 4 && in_n == 4)
40207 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
40208 else if (out_n == 8 && in_n == 8)
40209 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
40211 if (out_mode == SFmode && in_mode == SFmode)
40213 if (out_n == 4 && in_n == 4)
40214 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
40215 else if (out_n == 8 && in_n == 8)
40216 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
40217 else if (out_n == 16 && in_n == 16)
40218 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
40220 break;
40222 CASE_CFN_CEIL:
40223 /* The round insn does not trap on denormals. */
40224 if (flag_trapping_math || !TARGET_SSE4_1)
40225 break;
40227 if (out_mode == DFmode && in_mode == DFmode)
40229 if (out_n == 2 && in_n == 2)
40230 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
40231 else if (out_n == 4 && in_n == 4)
40232 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
40233 else if (out_n == 8 && in_n == 8)
40234 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
40236 if (out_mode == SFmode && in_mode == SFmode)
40238 if (out_n == 4 && in_n == 4)
40239 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
40240 else if (out_n == 8 && in_n == 8)
40241 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
40242 else if (out_n == 16 && in_n == 16)
40243 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
40245 break;
40247 CASE_CFN_TRUNC:
40248 /* The round insn does not trap on denormals. */
40249 if (flag_trapping_math || !TARGET_SSE4_1)
40250 break;
40252 if (out_mode == DFmode && in_mode == DFmode)
40254 if (out_n == 2 && in_n == 2)
40255 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
40256 else if (out_n == 4 && in_n == 4)
40257 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
40258 else if (out_n == 8 && in_n == 8)
40259 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
40261 if (out_mode == SFmode && in_mode == SFmode)
40263 if (out_n == 4 && in_n == 4)
40264 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
40265 else if (out_n == 8 && in_n == 8)
40266 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
40267 else if (out_n == 16 && in_n == 16)
40268 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
40270 break;
40272 CASE_CFN_RINT:
40273 /* The round insn does not trap on denormals. */
40274 if (flag_trapping_math || !TARGET_SSE4_1)
40275 break;
40277 if (out_mode == DFmode && in_mode == DFmode)
40279 if (out_n == 2 && in_n == 2)
40280 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
40281 else if (out_n == 4 && in_n == 4)
40282 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
40284 if (out_mode == SFmode && in_mode == SFmode)
40286 if (out_n == 4 && in_n == 4)
40287 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
40288 else if (out_n == 8 && in_n == 8)
40289 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
40291 break;
40293 CASE_CFN_FMA:
40294 if (out_mode == DFmode && in_mode == DFmode)
40296 if (out_n == 2 && in_n == 2)
40297 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
40298 if (out_n == 4 && in_n == 4)
40299 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
40301 if (out_mode == SFmode && in_mode == SFmode)
40303 if (out_n == 4 && in_n == 4)
40304 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
40305 if (out_n == 8 && in_n == 8)
40306 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
40308 break;
40310 default:
40311 break;
40314 /* Dispatch to a handler for a vectorization library. */
40315 if (ix86_veclib_handler)
40316 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
40318 return NULL_TREE;
40321 /* Handler for an SVML-style interface to
40322 a library with vectorized intrinsics. */
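/* The scalar builtin name is mangled into the SVML name below, e.g.
sinf on 4 floats becomes vmlsSin4 and sin on 2 doubles becomes
vmldSin2; logf and log map to the special names vmlsLn4 and vmldLn2. */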
40324 static tree
40325 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40327 char name[20];
40328 tree fntype, new_fndecl, args;
40329 unsigned arity;
40330 const char *bname;
40331 machine_mode el_mode, in_mode;
40332 int n, in_n;
40334 /* The SVML is suitable for unsafe math only. */
40335 if (!flag_unsafe_math_optimizations)
40336 return NULL_TREE;
40338 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40339 n = TYPE_VECTOR_SUBPARTS (type_out);
40340 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40341 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40342 if (el_mode != in_mode
40343 || n != in_n)
40344 return NULL_TREE;
40346 switch (fn)
40348 CASE_CFN_EXP:
40349 CASE_CFN_LOG:
40350 CASE_CFN_LOG10:
40351 CASE_CFN_POW:
40352 CASE_CFN_TANH:
40353 CASE_CFN_TAN:
40354 CASE_CFN_ATAN:
40355 CASE_CFN_ATAN2:
40356 CASE_CFN_ATANH:
40357 CASE_CFN_CBRT:
40358 CASE_CFN_SINH:
40359 CASE_CFN_SIN:
40360 CASE_CFN_ASINH:
40361 CASE_CFN_ASIN:
40362 CASE_CFN_COSH:
40363 CASE_CFN_COS:
40364 CASE_CFN_ACOSH:
40365 CASE_CFN_ACOS:
40366 if ((el_mode != DFmode || n != 2)
40367 && (el_mode != SFmode || n != 4))
40368 return NULL_TREE;
40369 break;
40371 default:
40372 return NULL_TREE;
40375 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40376 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40378 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40379 strcpy (name, "vmlsLn4");
40380 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40381 strcpy (name, "vmldLn2");
40382 else if (n == 4)
40384 sprintf (name, "vmls%s", bname+10);
40385 name[strlen (name)-1] = '4';
40387 else
40388 sprintf (name, "vmld%s2", bname+10);
40390 /* Convert to uppercase. */
40391 name[4] &= ~0x20;
40393 arity = 0;
40394 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40395 arity++;
40397 if (arity == 1)
40398 fntype = build_function_type_list (type_out, type_in, NULL);
40399 else
40400 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40402 /* Build a function declaration for the vectorized function. */
40403 new_fndecl = build_decl (BUILTINS_LOCATION,
40404 FUNCTION_DECL, get_identifier (name), fntype);
40405 TREE_PUBLIC (new_fndecl) = 1;
40406 DECL_EXTERNAL (new_fndecl) = 1;
40407 DECL_IS_NOVOPS (new_fndecl) = 1;
40408 TREE_READONLY (new_fndecl) = 1;
40410 return new_fndecl;
40413 /* Handler for an ACML-style interface to
40414 a library with vectorized intrinsics. */
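/* The generated names follow the __vr<type><width>_<func> scheme, e.g.
sin on 2 doubles becomes __vrd2_sin and sinf on 4 floats becomes
__vrs4_sinf. */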
40416 static tree
40417 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40419 char name[20] = "__vr.._";
40420 tree fntype, new_fndecl, args;
40421 unsigned arity;
40422 const char *bname;
40423 machine_mode el_mode, in_mode;
40424 int n, in_n;
40426 /* The ACML is 64-bit only and suitable for unsafe math only, as
40427 it does not correctly support parts of IEEE with the required
40428 precision such as denormals. */
40429 if (!TARGET_64BIT
40430 || !flag_unsafe_math_optimizations)
40431 return NULL_TREE;
40433 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40434 n = TYPE_VECTOR_SUBPARTS (type_out);
40435 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40436 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40437 if (el_mode != in_mode
40438 || n != in_n)
40439 return NULL_TREE;
40441 switch (fn)
40443 CASE_CFN_SIN:
40444 CASE_CFN_COS:
40445 CASE_CFN_EXP:
40446 CASE_CFN_LOG:
40447 CASE_CFN_LOG2:
40448 CASE_CFN_LOG10:
40449 if (el_mode == DFmode && n == 2)
40451 name[4] = 'd';
40452 name[5] = '2';
40454 else if (el_mode == SFmode && n == 4)
40456 name[4] = 's';
40457 name[5] = '4';
40459 else
40460 return NULL_TREE;
40461 break;
40463 default:
40464 return NULL_TREE;
40467 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40468 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40469 sprintf (name + 7, "%s", bname+10);
40471 arity = 0;
40472 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40473 arity++;
40475 if (arity == 1)
40476 fntype = build_function_type_list (type_out, type_in, NULL);
40477 else
40478 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40480 /* Build a function declaration for the vectorized function. */
40481 new_fndecl = build_decl (BUILTINS_LOCATION,
40482 FUNCTION_DECL, get_identifier (name), fntype);
40483 TREE_PUBLIC (new_fndecl) = 1;
40484 DECL_EXTERNAL (new_fndecl) = 1;
40485 DECL_IS_NOVOPS (new_fndecl) = 1;
40486 TREE_READONLY (new_fndecl) = 1;
40488 return new_fndecl;
40491 /* Returns a decl of a function that implements gather load with
40492 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
40493 Return NULL_TREE if it is not available. */
40495 static tree
40496 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40497 const_tree index_type, int scale)
40499 bool si;
40500 enum ix86_builtins code;
40502 if (! TARGET_AVX2)
40503 return NULL_TREE;
40505 if ((TREE_CODE (index_type) != INTEGER_TYPE
40506 && !POINTER_TYPE_P (index_type))
40507 || (TYPE_MODE (index_type) != SImode
40508 && TYPE_MODE (index_type) != DImode))
40509 return NULL_TREE;
40511 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40512 return NULL_TREE;
40514 /* v*gather* insn sign extends index to pointer mode. */
40515 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40516 && TYPE_UNSIGNED (index_type))
40517 return NULL_TREE;
40519 if (scale <= 0
40520 || scale > 8
40521 || (scale & (scale - 1)) != 0)
40522 return NULL_TREE;
40524 si = TYPE_MODE (index_type) == SImode;
40525 switch (TYPE_MODE (mem_vectype))
40527 case E_V2DFmode:
40528 if (TARGET_AVX512VL)
40529 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40530 else
40531 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40532 break;
40533 case E_V4DFmode:
40534 if (TARGET_AVX512VL)
40535 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40536 else
40537 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40538 break;
40539 case E_V2DImode:
40540 if (TARGET_AVX512VL)
40541 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40542 else
40543 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40544 break;
40545 case E_V4DImode:
40546 if (TARGET_AVX512VL)
40547 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40548 else
40549 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40550 break;
40551 case E_V4SFmode:
40552 if (TARGET_AVX512VL)
40553 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40554 else
40555 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40556 break;
40557 case E_V8SFmode:
40558 if (TARGET_AVX512VL)
40559 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40560 else
40561 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40562 break;
40563 case E_V4SImode:
40564 if (TARGET_AVX512VL)
40565 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40566 else
40567 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40568 break;
40569 case E_V8SImode:
40570 if (TARGET_AVX512VL)
40571 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40572 else
40573 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40574 break;
40575 case E_V8DFmode:
40576 if (TARGET_AVX512F)
40577 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40578 else
40579 return NULL_TREE;
40580 break;
40581 case E_V8DImode:
40582 if (TARGET_AVX512F)
40583 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40584 else
40585 return NULL_TREE;
40586 break;
40587 case E_V16SFmode:
40588 if (TARGET_AVX512F)
40589 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40590 else
40591 return NULL_TREE;
40592 break;
40593 case E_V16SImode:
40594 if (TARGET_AVX512F)
40595 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40596 else
40597 return NULL_TREE;
40598 break;
40599 default:
40600 return NULL_TREE;
40603 return ix86_get_builtin (code);
40606 /* Returns a decl of a function that implements scatter store with
40607 register type VECTYPE and index type INDEX_TYPE and SCALE.
40608 Return NULL_TREE if it is not available. */
40610 static tree
40611 ix86_vectorize_builtin_scatter (const_tree vectype,
40612 const_tree index_type, int scale)
40614 bool si;
40615 enum ix86_builtins code;
40617 if (!TARGET_AVX512F)
40618 return NULL_TREE;
40620 if ((TREE_CODE (index_type) != INTEGER_TYPE
40621 && !POINTER_TYPE_P (index_type))
40622 || (TYPE_MODE (index_type) != SImode
40623 && TYPE_MODE (index_type) != DImode))
40624 return NULL_TREE;
40626 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40627 return NULL_TREE;
40629 /* v*scatter* insn sign extends index to pointer mode. */
40630 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40631 && TYPE_UNSIGNED (index_type))
40632 return NULL_TREE;
40634 /* Scale can be 1, 2, 4 or 8. */
40635 if (scale <= 0
40636 || scale > 8
40637 || (scale & (scale - 1)) != 0)
40638 return NULL_TREE;
40640 si = TYPE_MODE (index_type) == SImode;
40641 switch (TYPE_MODE (vectype))
40643 case E_V8DFmode:
40644 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40645 break;
40646 case E_V8DImode:
40647 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40648 break;
40649 case E_V16SFmode:
40650 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40651 break;
40652 case E_V16SImode:
40653 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40654 break;
40655 default:
40656 return NULL_TREE;
40659 return ix86_builtins[code];
40662 /* Return true if it is safe to use the rsqrt optabs to optimize
40663 1.0/sqrt. */
40665 static bool
40666 use_rsqrt_p ()
40668 return (TARGET_SSE_MATH
40669 && flag_finite_math_only
40670 && !flag_trapping_math
40671 && flag_unsafe_math_optimizations);
40674 /* Returns a code for a target-specific builtin that implements
40675 reciprocal of the function, or NULL_TREE if not available. */
40677 static tree
40678 ix86_builtin_reciprocal (tree fndecl)
40680 switch (DECL_FUNCTION_CODE (fndecl))
40682 /* Vectorized version of sqrt to rsqrt conversion. */
40683 case IX86_BUILTIN_SQRTPS_NR:
40684 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40686 case IX86_BUILTIN_SQRTPS_NR256:
40687 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40689 default:
40690 return NULL_TREE;
40694 /* Helper for avx_vpermilps256_operand et al. This is also used by
40695 the expansion functions to turn the parallel back into a mask.
40696 The return value is 0 for no match and the imm8+1 for a match. */
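/* For example, with V4SFmode the identity parallel (0 1 2 3) yields
the imm8 0xe4 and the function returns 0xe5. */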
40699 avx_vpermilp_parallel (rtx par, machine_mode mode)
40701 unsigned i, nelt = GET_MODE_NUNITS (mode);
40702 unsigned mask = 0;
40703 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40705 if (XVECLEN (par, 0) != (int) nelt)
40706 return 0;
40708 /* Validate that all of the elements are constants, and not totally
40709 out of range. Copy the data into an integral array to make the
40710 subsequent checks easier. */
40711 for (i = 0; i < nelt; ++i)
40713 rtx er = XVECEXP (par, 0, i);
40714 unsigned HOST_WIDE_INT ei;
40716 if (!CONST_INT_P (er))
40717 return 0;
40718 ei = INTVAL (er);
40719 if (ei >= nelt)
40720 return 0;
40721 ipar[i] = ei;
40724 switch (mode)
40726 case E_V8DFmode:
40727 /* In the 512-bit DFmode case, we can only move elements within
40728 a 128-bit lane. First fill the second part of the mask,
40729 then fallthru. */
40730 for (i = 4; i < 6; ++i)
40732 if (ipar[i] < 4 || ipar[i] >= 6)
40733 return 0;
40734 mask |= (ipar[i] - 4) << i;
40736 for (i = 6; i < 8; ++i)
40738 if (ipar[i] < 6)
40739 return 0;
40740 mask |= (ipar[i] - 6) << i;
40742 /* FALLTHRU */
40744 case E_V4DFmode:
40745 /* In the 256-bit DFmode case, we can only move elements within
40746 a 128-bit lane. */
40747 for (i = 0; i < 2; ++i)
40749 if (ipar[i] >= 2)
40750 return 0;
40751 mask |= ipar[i] << i;
40753 for (i = 2; i < 4; ++i)
40755 if (ipar[i] < 2)
40756 return 0;
40757 mask |= (ipar[i] - 2) << i;
40759 break;
40761 case E_V16SFmode:
40762 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40763 must mirror the permutation in the lower 256 bits. */
40764 for (i = 0; i < 8; ++i)
40765 if (ipar[i] + 8 != ipar[i + 8])
40766 return 0;
40767 /* FALLTHRU */
40769 case E_V8SFmode:
40770 /* In the 256-bit SFmode case, we have full freedom of
40771 movement within the low 128-bit lane, but the high 128-bit
40772 lane must mirror the exact same pattern. */
40773 for (i = 0; i < 4; ++i)
40774 if (ipar[i] + 4 != ipar[i + 4])
40775 return 0;
40776 nelt = 4;
40777 /* FALLTHRU */
40779 case E_V2DFmode:
40780 case E_V4SFmode:
40781 /* In the 128-bit case, we have full freedom in the placement of
40782 the elements from the source operand. */
40783 for (i = 0; i < nelt; ++i)
40784 mask |= ipar[i] << (i * (nelt / 2));
40785 break;
40787 default:
40788 gcc_unreachable ();
40791 /* Make sure success has a non-zero value by adding one. */
40792 return mask + 1;
40795 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40796 the expansion functions to turn the parallel back into a mask.
40797 The return value is 0 for no match and the imm8+1 for a match. */
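/* For example, with V4DFmode the parallel (0 1 4 5), which selects the
low lane of each source, yields the imm8 0x20 and the function returns
0x21. */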
40800 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40802 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40803 unsigned mask = 0;
40804 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40806 if (XVECLEN (par, 0) != (int) nelt)
40807 return 0;
40809 /* Validate that all of the elements are constants, and not totally
40810 out of range. Copy the data into an integral array to make the
40811 subsequent checks easier. */
40812 for (i = 0; i < nelt; ++i)
40814 rtx er = XVECEXP (par, 0, i);
40815 unsigned HOST_WIDE_INT ei;
40817 if (!CONST_INT_P (er))
40818 return 0;
40819 ei = INTVAL (er);
40820 if (ei >= 2 * nelt)
40821 return 0;
40822 ipar[i] = ei;
40825 /* Validate that each half of the permute consists of consecutive elements. */
40826 for (i = 0; i < nelt2 - 1; ++i)
40827 if (ipar[i] + 1 != ipar[i + 1])
40828 return 0;
40829 for (i = nelt2; i < nelt - 1; ++i)
40830 if (ipar[i] + 1 != ipar[i + 1])
40831 return 0;
40833 /* Reconstruct the mask. */
40834 for (i = 0; i < 2; ++i)
40836 unsigned e = ipar[i * nelt2];
40837 if (e % nelt2)
40838 return 0;
40839 e /= nelt2;
40840 mask |= e << (i * 4);
40843 /* Make sure success has a non-zero value by adding one. */
40844 return mask + 1;
40847 /* Return a register priority for hard reg REGNO. */
40848 static int
40849 ix86_register_priority (int hard_regno)
40851 /* ebp and r13 as the base always want a displacement, r12 as the
40852 base always wants an index. So discourage their use in an
40853 address. */
40854 if (hard_regno == R12_REG || hard_regno == R13_REG)
40855 return 0;
40856 if (hard_regno == BP_REG)
40857 return 1;
40858 /* New x86-64 int registers result in bigger code size. Discourage
40859 them. */
40860 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40861 return 2;
40862 /* New x86-64 SSE registers result in bigger code size. Discourage
40863 them. */
40864 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40865 return 2;
40866 /* Usage of AX register results in smaller code. Prefer it. */
40867 if (hard_regno == AX_REG)
40868 return 4;
40869 return 3;
40872 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40874 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40875 QImode must go into class Q_REGS.
40876 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
40877 movdf to do mem-to-mem moves through integer regs. */
40879 static reg_class_t
40880 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
40882 machine_mode mode = GET_MODE (x);
40884 /* We're only allowed to return a subclass of CLASS. Many of the
40885 following checks fail for NO_REGS, so eliminate that early. */
40886 if (regclass == NO_REGS)
40887 return NO_REGS;
40889 /* All classes can load zeros. */
40890 if (x == CONST0_RTX (mode))
40891 return regclass;
40893 /* Force constants into memory if we are loading a (nonzero) constant into
40894 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
40895 instructions to load from a constant. */
40896 if (CONSTANT_P (x)
40897 && (MAYBE_MMX_CLASS_P (regclass)
40898 || MAYBE_SSE_CLASS_P (regclass)
40899 || MAYBE_MASK_CLASS_P (regclass)))
40900 return NO_REGS;
40902 /* Floating-point constants need more complex checks. */
40903 if (CONST_DOUBLE_P (x))
40905 /* General regs can load everything. */
40906 if (INTEGER_CLASS_P (regclass))
40907 return regclass;
40909 /* Floats can load 0 and 1 plus some others. Note that we eliminated
40910 zero above. We only want to wind up preferring 80387 registers if
40911 we plan on doing computation with them. */
40912 if (IS_STACK_MODE (mode)
40913 && standard_80387_constant_p (x) > 0)
40915 /* Limit class to FP regs. */
40916 if (FLOAT_CLASS_P (regclass))
40917 return FLOAT_REGS;
40918 else if (regclass == FP_TOP_SSE_REGS)
40919 return FP_TOP_REG;
40920 else if (regclass == FP_SECOND_SSE_REGS)
40921 return FP_SECOND_REG;
40924 return NO_REGS;
40927 /* Prefer SSE regs only, if we can use them for math. */
40928 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40929 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
40931 /* Generally when we see PLUS here, it's the function invariant
40932 (plus soft-fp const_int), which can only be computed into general
40933 regs. */
40934 if (GET_CODE (x) == PLUS)
40935 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
40937 /* QImode constants are easy to load, but non-constant QImode data
40938 must go into Q_REGS. */
40939 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
40941 if (Q_CLASS_P (regclass))
40942 return regclass;
40943 else if (reg_class_subset_p (Q_REGS, regclass))
40944 return Q_REGS;
40945 else
40946 return NO_REGS;
40949 return regclass;
40952 /* Discourage putting floating-point values in SSE registers unless
40953 SSE math is being used, and likewise for the 387 registers. */
40954 static reg_class_t
40955 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
40957 machine_mode mode = GET_MODE (x);
40959 /* Restrict the output reload class to the register bank that we are doing
40960 math on. If we would like not to return a subset of CLASS, reject this
40961 alternative: if reload cannot do this, it will still use its choice. */
40962 mode = GET_MODE (x);
40963 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40964 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
40966 if (IS_STACK_MODE (mode))
40968 if (regclass == FP_TOP_SSE_REGS)
40969 return FP_TOP_REG;
40970 else if (regclass == FP_SECOND_SSE_REGS)
40971 return FP_SECOND_REG;
40972 else
40973 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
40976 return regclass;
40979 static reg_class_t
40980 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
40981 machine_mode mode, secondary_reload_info *sri)
40983 /* Double-word spills from general registers to non-offsettable memory
40984 references (zero-extended addresses) require special handling. */
40985 if (TARGET_64BIT
40986 && MEM_P (x)
40987 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
40988 && INTEGER_CLASS_P (rclass)
40989 && !offsettable_memref_p (x))
40991 sri->icode = (in_p
40992 ? CODE_FOR_reload_noff_load
40993 : CODE_FOR_reload_noff_store);
40994 /* Add the cost of moving address to a temporary. */
40995 sri->extra_cost = 1;
40997 return NO_REGS;
41000 /* QImode spills from non-QI registers require an
41001 intermediate register on 32-bit targets. */
41002 if (mode == QImode
41003 && ((!TARGET_64BIT && !in_p
41004 && INTEGER_CLASS_P (rclass)
41005 && MAYBE_NON_Q_CLASS_P (rclass))
41006 || (!TARGET_AVX512DQ
41007 && MAYBE_MASK_CLASS_P (rclass))))
41009 int regno = true_regnum (x);
41011 /* Return Q_REGS if the operand is in memory. */
41012 if (regno == -1)
41013 return Q_REGS;
41015 return NO_REGS;
41018 /* This condition handles corner case where an expression involving
41019 pointers gets vectorized. We're trying to use the address of a
41020 stack slot as a vector initializer.
41022 (set (reg:V2DI 74 [ vect_cst_.2 ])
41023 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
41025 Eventually frame gets turned into sp+offset like this:
41027 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41028 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
41029 (const_int 392 [0x188]))))
41031 That later gets turned into:
41033 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41034 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
41035 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
41037 We'll have the following reload recorded:
41039 Reload 0: reload_in (DI) =
41040 (plus:DI (reg/f:DI 7 sp)
41041 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
41042 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41043 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
41044 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
41045 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41046 reload_reg_rtx: (reg:V2DI 22 xmm1)
41048 Which isn't going to work since SSE instructions can't handle scalar
41049 additions. Returning GENERAL_REGS forces the addition into integer
41050 register and reload can handle subsequent reloads without problems. */
41052 if (in_p && GET_CODE (x) == PLUS
41053 && SSE_CLASS_P (rclass)
41054 && SCALAR_INT_MODE_P (mode))
41055 return GENERAL_REGS;
41057 return NO_REGS;
41060 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
41062 static bool
41063 ix86_class_likely_spilled_p (reg_class_t rclass)
41065 switch (rclass)
41067 case AREG:
41068 case DREG:
41069 case CREG:
41070 case BREG:
41071 case AD_REGS:
41072 case SIREG:
41073 case DIREG:
41074 case SSE_FIRST_REG:
41075 case FP_TOP_REG:
41076 case FP_SECOND_REG:
41077 case BND_REGS:
41078 return true;
41080 default:
41081 break;
41084 return false;
41087 /* If we are copying between registers from different register sets
41088 (e.g. FP and integer), we may need a memory location.
41090 The function can't work reliably when one of the CLASSES is a class
41091 containing registers from multiple sets. We avoid this by never combining
41092 different sets in a single alternative in the machine description.
41093 Ensure that this constraint holds to avoid unexpected surprises.
41095 When STRICT is false, we are being called from REGISTER_MOVE_COST,
41096 so do not enforce these sanity checks.
41098 To optimize register_move_cost performance, define inline variant. */
41100 static inline bool
41101 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
41102 reg_class_t class2, int strict)
41104 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
41105 return false;
41107 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
41108 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
41109 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
41110 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
41111 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
41112 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
41113 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
41114 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
41116 gcc_assert (!strict || lra_in_progress);
41117 return true;
41120 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
41121 return true;
41123 /* Between mask and general, we have moves no larger than word size. */
41124 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
41125 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
41126 return true;
41128 /* ??? This is a lie. We do have moves between mmx/general, and for
41129 mmx/sse2. But by saying we need secondary memory we discourage the
41130 register allocator from using the mmx registers unless needed. */
41131 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
41132 return true;
41134 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41136 /* SSE1 doesn't have any direct moves from other classes. */
41137 if (!TARGET_SSE2)
41138 return true;
41140 /* If the target says that inter-unit moves are more expensive
41141 than moving through memory, then don't generate them. */
41142 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
41143 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
41144 return true;
41146 /* Between SSE and general, we have moves no larger than word size. */
41147 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41148 return true;
41151 return false;
41154 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
41156 static bool
41157 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
41158 reg_class_t class2)
41160 return inline_secondary_memory_needed (mode, class1, class2, true);
41163 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
41165 get_secondary_mem widens integral modes to BITS_PER_WORD.
41166 There is no need to emit a full 64-bit move on 64-bit targets
41167 for integral modes that can be moved using a 32-bit move.
41169 static machine_mode
41170 ix86_secondary_memory_needed_mode (machine_mode mode)
41172 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
41173 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
41174 return mode;
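/* Editorial illustration, not part of the original sources:
     ix86_secondary_memory_needed_mode (QImode)   -> SImode
     ix86_secondary_memory_needed_mode (V4SImode) -> V4SImode (unchanged)
   i.e. sub-32-bit integral values going through a secondary-memory slot
   are widened so the slot is accessed with a full 32-bit load/store.  */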
41177 /* Implement the TARGET_CLASS_MAX_NREGS hook.
41179 On the 80386, this is the size of MODE in words,
41180 except in the FP regs, where a single reg is always enough. */
41182 static unsigned char
41183 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
41185 if (MAYBE_INTEGER_CLASS_P (rclass))
41187 if (mode == XFmode)
41188 return (TARGET_64BIT ? 2 : 3);
41189 else if (mode == XCmode)
41190 return (TARGET_64BIT ? 4 : 6);
41191 else
41192 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
41194 else
41196 if (COMPLEX_MODE_P (mode))
41197 return 2;
41198 else
41199 return 1;
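/* Editorial illustration, not part of the original sources: on a
   32-bit target,
     ix86_class_max_nregs (GENERAL_REGS, DFmode) -> 2  (8 bytes / 4)
     ix86_class_max_nregs (GENERAL_REGS, XFmode) -> 3  (special-cased above)
     ix86_class_max_nregs (FLOAT_REGS,   XFmode) -> 1  (one x87 register)
     ix86_class_max_nregs (FLOAT_REGS,   XCmode) -> 2  (complex mode)  */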
41203 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
41205 static bool
41206 ix86_can_change_mode_class (machine_mode from, machine_mode to,
41207 reg_class_t regclass)
41209 if (from == to)
41210 return true;
41212 /* x87 registers can't do subreg at all, as all values are reformatted
41213 to extended precision. */
41214 if (MAYBE_FLOAT_CLASS_P (regclass))
41215 return false;
41217 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
41219 /* Vector registers do not support QI or HImode loads. If we don't
41220 disallow a change to these modes, reload will assume it's ok to
41221 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
41222 the vec_dupv4hi pattern. */
41223 if (GET_MODE_SIZE (from) < 4)
41224 return false;
41227 return true;
41230 /* Return the cost of moving data of mode M between a
41231 register and memory. A value of 2 is the default; this cost is
41232 relative to those in `REGISTER_MOVE_COST'.
41234 This function is used extensively by register_move_cost, which is used to
41235 build tables at startup, so make it inline in that case.
41236 When IN is 2, return the maximum of the in and out move costs.
41238 If moving between registers and memory is more expensive than
41239 between two registers, you should define this macro to express the
41240 relative cost.
41242 Also model the increased cost of moving QImode registers in
41243 non-Q_REGS classes.
41245 static inline int
41246 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
41247 int in)
41249 int cost;
41250 if (FLOAT_CLASS_P (regclass))
41252 int index;
41253 switch (mode)
41255 case E_SFmode:
41256 index = 0;
41257 break;
41258 case E_DFmode:
41259 index = 1;
41260 break;
41261 case E_XFmode:
41262 index = 2;
41263 break;
41264 default:
41265 return 100;
41267 if (in == 2)
41268 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
41269 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
41271 if (SSE_CLASS_P (regclass))
41273 int index;
41274 switch (GET_MODE_SIZE (mode))
41276 case 4:
41277 index = 0;
41278 break;
41279 case 8:
41280 index = 1;
41281 break;
41282 case 16:
41283 index = 2;
41284 break;
41285 default:
41286 return 100;
41288 if (in == 2)
41289 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
41290 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
41292 if (MMX_CLASS_P (regclass))
41294 int index;
41295 switch (GET_MODE_SIZE (mode))
41297 case 4:
41298 index = 0;
41299 break;
41300 case 8:
41301 index = 1;
41302 break;
41303 default:
41304 return 100;
41306 if (in == 2)
41307 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
41308 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
41310 switch (GET_MODE_SIZE (mode))
41312 case 1:
41313 if (Q_CLASS_P (regclass) || TARGET_64BIT)
41315 if (!in)
41316 return ix86_cost->int_store[0];
41317 if (TARGET_PARTIAL_REG_DEPENDENCY
41318 && optimize_function_for_speed_p (cfun))
41319 cost = ix86_cost->movzbl_load;
41320 else
41321 cost = ix86_cost->int_load[0];
41322 if (in == 2)
41323 return MAX (cost, ix86_cost->int_store[0]);
41324 return cost;
41326 else
41328 if (in == 2)
41329 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
41330 if (in)
41331 return ix86_cost->movzbl_load;
41332 else
41333 return ix86_cost->int_store[0] + 4;
41335 break;
41336 case 2:
41337 if (in == 2)
41338 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
41339 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
41340 default:
41341 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
41342 if (mode == TFmode)
41343 mode = XFmode;
41344 if (in == 2)
41345 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41346 else if (in)
41347 cost = ix86_cost->int_load[2];
41348 else
41349 cost = ix86_cost->int_store[2];
41350 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
41354 static int
41355 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41356 bool in)
41358 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
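/* Editorial illustration, not part of the original sources: on a
   64-bit target a TImode (16-byte) value in the general registers hits
   the default case of inline_memory_move_cost, so a load is charged
   ix86_cost->int_load[2] * CEIL (16, 8), i.e. twice the word-sized load
   cost; with IN == 2 the maximum of the load and store costs is used
   before the multiplication.  */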
41362 /* Return the cost of moving data from a register in class CLASS1 to
41363 one in class CLASS2.
41365 It is not required that the cost always equal 2 when FROM is the same as TO;
41366 on some machines it is expensive to move between registers if they are not
41367 general registers. */
41369 static int
41370 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41371 reg_class_t class2_i)
41373 enum reg_class class1 = (enum reg_class) class1_i;
41374 enum reg_class class2 = (enum reg_class) class2_i;
41376 /* In case we require secondary memory, compute cost of the store followed
41377 by load. In order to avoid bad register allocation choices, we need
41378 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41380 if (inline_secondary_memory_needed (mode, class1, class2, false))
41382 int cost = 1;
41384 cost += inline_memory_move_cost (mode, class1, 2);
41385 cost += inline_memory_move_cost (mode, class2, 2);
41387 /* When copying from a general-purpose register we may emit multiple
41388 stores followed by a single load, causing a memory size mismatch stall.
41389 Count this as an arbitrarily high cost of 20.
41390 if (targetm.class_max_nregs (class1, mode)
41391 > targetm.class_max_nregs (class2, mode))
41392 cost += 20;
41394 /* In the case of FP/MMX moves, the registers actually overlap, and we
41395 have to switch modes in order to treat them differently. */
41396 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41397 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41398 cost += 20;
41400 return cost;
41403 /* Moves between SSE/MMX and integer unit are expensive. */
41404 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41405 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41407 /* ??? By keeping the returned value relatively high, we limit the number
41408 of moves between integer and MMX/SSE registers for all targets.
41409 Additionally, a high value prevents a problem with x86_modes_tieable_p (),
41410 where integer modes in MMX/SSE registers are not tieable
41411 because of missing QImode and HImode moves to, from or between
41412 MMX/SSE registers. */
41413 return MAX (8, ix86_cost->mmxsse_to_integer);
41415 if (MAYBE_FLOAT_CLASS_P (class1))
41416 return ix86_cost->fp_move;
41417 if (MAYBE_SSE_CLASS_P (class1))
41418 return ix86_cost->sse_move;
41419 if (MAYBE_MMX_CLASS_P (class1))
41420 return ix86_cost->mmx_move;
41421 return 2;
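/* Editorial illustration, not part of the original sources: a
   GENERAL_REGS to GENERAL_REGS copy falls through to the final
   "return 2" above.  An SImode copy between SSE_REGS and GENERAL_REGS
   that needs no secondary memory is charged
   MAX (8, ix86_cost->mmxsse_to_integer); when secondary memory is
   needed, the cost is instead built from the store and load costs of
   both classes plus the penalties described above.  */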
41424 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
41425 words of a value of mode MODE but can be less for certain modes in
41426 special long registers.
41428 Actually there are no two word move instructions for consecutive
41429 registers. And only registers 0-3 may have mov byte instructions
41430 applied to them. */
41432 static unsigned int
41433 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
41435 if (GENERAL_REGNO_P (regno))
41437 if (mode == XFmode)
41438 return TARGET_64BIT ? 2 : 3;
41439 if (mode == XCmode)
41440 return TARGET_64BIT ? 4 : 6;
41441 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
41443 if (COMPLEX_MODE_P (mode))
41444 return 2;
41445 if (mode == V64SFmode || mode == V64SImode)
41446 return 4;
41447 return 1;
41450 /* Implement TARGET_HARD_REGNO_MODE_OK. */
41452 static bool
41453 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
41455 /* The flags registers, and only the flags registers, can hold CCmode values. */
41456 if (CC_REGNO_P (regno))
41457 return GET_MODE_CLASS (mode) == MODE_CC;
41458 if (GET_MODE_CLASS (mode) == MODE_CC
41459 || GET_MODE_CLASS (mode) == MODE_RANDOM
41460 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41461 return false;
41462 if (STACK_REGNO_P (regno))
41463 return VALID_FP_MODE_P (mode);
41464 if (MASK_REGNO_P (regno))
41465 return (VALID_MASK_REG_MODE (mode)
41466 || (TARGET_AVX512BW
41467 && VALID_MASK_AVX512BW_MODE (mode)));
41468 if (BND_REGNO_P (regno))
41469 return VALID_BND_REG_MODE (mode);
41470 if (SSE_REGNO_P (regno))
41472 /* We implement the move patterns for all vector modes into and
41473 out of SSE registers, even when no operation instructions
41474 are available. */
41476 /* For AVX-512 we allow, regardless of regno:
41477 - XI mode
41478 - any 512-bit wide vector mode
41479 - any scalar mode. */
41480 if (TARGET_AVX512F
41481 && (mode == XImode
41482 || VALID_AVX512F_REG_MODE (mode)
41483 || VALID_AVX512F_SCALAR_MODE (mode)))
41484 return true;
41486 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41487 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41488 && MOD4_SSE_REGNO_P (regno)
41489 && mode == V64SFmode)
41490 return true;
41492 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41493 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41494 && MOD4_SSE_REGNO_P (regno)
41495 && mode == V64SImode)
41496 return true;
41498 /* TODO check for QI/HI scalars. */
41499 /* AVX-512VL allows SSE registers 16+ to be used for 128/256-bit modes. */
41500 if (TARGET_AVX512VL
41501 && (mode == OImode
41502 || mode == TImode
41503 || VALID_AVX256_REG_MODE (mode)
41504 || VALID_AVX512VL_128_REG_MODE (mode)))
41505 return true;
41507 /* xmm16-xmm31 are only available for AVX-512. */
41508 if (EXT_REX_SSE_REGNO_P (regno))
41509 return false;
41511 /* OImode and AVX modes are available only when AVX is enabled. */
41512 return ((TARGET_AVX
41513 && VALID_AVX256_REG_OR_OI_MODE (mode))
41514 || VALID_SSE_REG_MODE (mode)
41515 || VALID_SSE2_REG_MODE (mode)
41516 || VALID_MMX_REG_MODE (mode)
41517 || VALID_MMX_REG_MODE_3DNOW (mode));
41519 if (MMX_REGNO_P (regno))
41521 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41522 so if the register is available at all, then we can move data of
41523 the given mode into or out of it. */
41524 return (VALID_MMX_REG_MODE (mode)
41525 || VALID_MMX_REG_MODE_3DNOW (mode));
41528 if (mode == QImode)
41530 /* Take care with QImode values - they can live in non-QI regs,
41531 but then they can cause partial register stalls. */
41532 if (ANY_QI_REGNO_P (regno))
41533 return true;
41534 if (!TARGET_PARTIAL_REG_STALL)
41535 return true;
41536 /* LRA checks if the hard register is OK for the given mode.
41537 QImode values can live in non-QI regs, so we allow all
41538 registers here. */
41539 if (lra_in_progress)
41540 return true;
41541 return !can_create_pseudo_p ();
41543 /* We handle both integers and floats in the general purpose registers. */
41544 else if (VALID_INT_MODE_P (mode))
41545 return true;
41546 else if (VALID_FP_MODE_P (mode))
41547 return true;
41548 else if (VALID_DFP_MODE_P (mode))
41549 return true;
41550 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41551 on to use that value in smaller contexts, this can easily force a
41552 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41553 supporting DImode, allow it. */
41554 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41555 return true;
41557 return false;
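/* Editorial illustration, not part of the original sources: a 256-bit
   mode such as V8SImode is allowed in xmm0-xmm15 only when AVX is
   enabled, while placing it in xmm16-xmm31 additionally requires
   AVX-512VL; CCmode values are accepted only by the flags registers.  */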
41560 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
41561 saves SSE registers across calls is Win64 (thus no need to check the
41562 current ABI here), and with AVX enabled Win64 only guarantees that
41563 the low 16 bytes are saved. */
41565 static bool
41566 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
41568 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
41571 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41572 tieable integer mode. */
41574 static bool
41575 ix86_tieable_integer_mode_p (machine_mode mode)
41577 switch (mode)
41579 case E_HImode:
41580 case E_SImode:
41581 return true;
41583 case E_QImode:
41584 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41586 case E_DImode:
41587 return TARGET_64BIT;
41589 default:
41590 return false;
41594 /* Implement TARGET_MODES_TIEABLE_P.
41596 Return true if MODE1 is accessible in a register that can hold MODE2
41597 without copying. That is, all register classes that can hold MODE2
41598 can also hold MODE1. */
41600 static bool
41601 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41603 if (mode1 == mode2)
41604 return true;
41606 if (ix86_tieable_integer_mode_p (mode1)
41607 && ix86_tieable_integer_mode_p (mode2))
41608 return true;
41610 /* MODE2 being XFmode implies fp stack or general regs, which means we
41611 can tie any smaller floating point modes to it. Note that we do not
41612 tie this with TFmode. */
41613 if (mode2 == XFmode)
41614 return mode1 == SFmode || mode1 == DFmode;
41616 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41617 that we can tie it with SFmode. */
41618 if (mode2 == DFmode)
41619 return mode1 == SFmode;
41621 /* If MODE2 is only appropriate for an SSE register, then tie with
41622 any other mode acceptable to SSE registers. */
41623 if (GET_MODE_SIZE (mode2) == 32
41624 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41625 return (GET_MODE_SIZE (mode1) == 32
41626 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41627 if (GET_MODE_SIZE (mode2) == 16
41628 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41629 return (GET_MODE_SIZE (mode1) == 16
41630 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41632 /* If MODE2 is appropriate for an MMX register, then tie
41633 with any other mode acceptable to MMX registers. */
41634 if (GET_MODE_SIZE (mode2) == 8
41635 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41636 return (GET_MODE_SIZE (mode1) == 8
41637 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41639 return false;
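/* Editorial illustration, not part of the original sources:
   ix86_modes_tieable_p (SFmode, DFmode) is true, since every register
   class that can hold DFmode can also hold SFmode, and two 16-byte
   vector modes such as V4SFmode and V2DImode tie with each other once
   SSE2 is available.  V8SFmode and V4SFmode do not tie, because their
   sizes differ.  */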
41642 /* Return the cost of moving between two registers of mode MODE. */
41644 static int
41645 ix86_set_reg_reg_cost (machine_mode mode)
41647 unsigned int units = UNITS_PER_WORD;
41649 switch (GET_MODE_CLASS (mode))
41651 default:
41652 break;
41654 case MODE_CC:
41655 units = GET_MODE_SIZE (CCmode);
41656 break;
41658 case MODE_FLOAT:
41659 if ((TARGET_SSE && mode == TFmode)
41660 || (TARGET_80387 && mode == XFmode)
41661 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41662 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41663 units = GET_MODE_SIZE (mode);
41664 break;
41666 case MODE_COMPLEX_FLOAT:
41667 if ((TARGET_SSE && mode == TCmode)
41668 || (TARGET_80387 && mode == XCmode)
41669 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41670 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41671 units = GET_MODE_SIZE (mode);
41672 break;
41674 case MODE_VECTOR_INT:
41675 case MODE_VECTOR_FLOAT:
41676 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41677 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41678 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41679 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41680 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41681 units = GET_MODE_SIZE (mode);
41684 /* Return the cost of moving between two registers of mode MODE,
41685 assuming that the move will be in pieces of at most UNITS bytes. */
41686 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
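/* Editorial illustration, not part of the original sources: on a
   32-bit target a DImode register-to-register set is costed as
   COSTS_N_INSNS (CEIL (8, 4)) = COSTS_N_INSNS (2), i.e. two word-sized
   moves, while an SImode set costs COSTS_N_INSNS (1).  */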
41689 /* Compute a (partial) cost for rtx X. Return true if the complete
41690 cost has been computed, and false if subexpressions should be
41691 scanned. In either case, *TOTAL contains the cost result. */
41693 static bool
41694 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41695 int *total, bool speed)
41697 rtx mask;
41698 enum rtx_code code = GET_CODE (x);
41699 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41700 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
41701 int src_cost;
41703 switch (code)
41705 case SET:
41706 if (register_operand (SET_DEST (x), VOIDmode)
41707 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41709 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41710 return true;
41713 if (register_operand (SET_SRC (x), VOIDmode))
41714 /* Avoid potentially incorrect high cost from rtx_costs
41715 for non-tieable SUBREGs. */
41716 src_cost = 0;
41717 else
41719 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41721 if (CONSTANT_P (SET_SRC (x)))
41722 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41723 a small value, possibly zero for cheap constants. */
41724 src_cost += COSTS_N_INSNS (1);
41727 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41728 return true;
41730 case CONST_INT:
41731 case CONST:
41732 case LABEL_REF:
41733 case SYMBOL_REF:
41734 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41735 *total = 3;
41736 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41737 *total = 2;
41738 else if (flag_pic && SYMBOLIC_CONST (x)
41739 && !(TARGET_64BIT
41740 && (GET_CODE (x) == LABEL_REF
41741 || (GET_CODE (x) == SYMBOL_REF
41742 && SYMBOL_REF_LOCAL_P (x))))
41743 /* Use 0 cost for CONST to improve its propagation. */
41744 && (TARGET_64BIT || GET_CODE (x) != CONST))
41745 *total = 1;
41746 else
41747 *total = 0;
41748 return true;
41750 case CONST_DOUBLE:
41751 if (IS_STACK_MODE (mode))
41752 switch (standard_80387_constant_p (x))
41754 case -1:
41755 case 0:
41756 break;
41757 case 1: /* 0.0 */
41758 *total = 1;
41759 return true;
41760 default: /* Other constants */
41761 *total = 2;
41762 return true;
41764 /* FALLTHRU */
41766 case CONST_VECTOR:
41767 switch (standard_sse_constant_p (x, mode))
41769 case 0:
41770 break;
41771 case 1: /* 0: xor eliminates false dependency */
41772 *total = 0;
41773 return true;
41774 default: /* -1: cmp contains false dependency */
41775 *total = 1;
41776 return true;
41778 /* FALLTHRU */
41780 case CONST_WIDE_INT:
41781 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41782 it'll probably end up. Add a penalty for size. */
41783 *total = (COSTS_N_INSNS (1)
41784 + (!TARGET_64BIT && flag_pic)
41785 + (GET_MODE_SIZE (mode) <= 4
41786 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41787 return true;
41789 case ZERO_EXTEND:
41790 /* The zero extension is often completely free on x86_64, so make
41791 it as cheap as possible. */
41792 if (TARGET_64BIT && mode == DImode
41793 && GET_MODE (XEXP (x, 0)) == SImode)
41794 *total = 1;
41795 else if (TARGET_ZERO_EXTEND_WITH_AND)
41796 *total = cost->add;
41797 else
41798 *total = cost->movzx;
41799 return false;
41801 case SIGN_EXTEND:
41802 *total = cost->movsx;
41803 return false;
41805 case ASHIFT:
41806 if (SCALAR_INT_MODE_P (mode)
41807 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41808 && CONST_INT_P (XEXP (x, 1)))
41810 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41811 if (value == 1)
41813 *total = cost->add;
41814 return false;
41816 if ((value == 2 || value == 3)
41817 && cost->lea <= cost->shift_const)
41819 *total = cost->lea;
41820 return false;
41823 /* FALLTHRU */
41825 case ROTATE:
41826 case ASHIFTRT:
41827 case LSHIFTRT:
41828 case ROTATERT:
41829 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41831 /* ??? Should be SSE vector operation cost. */
41832 /* At least for published AMD latencies, this really is the same
41833 as the latency for a simple fpu operation like fabs. */
41834 /* V*QImode is emulated with 1-11 insns. */
41835 if (mode == V16QImode || mode == V32QImode)
41837 int count = 11;
41838 if (TARGET_XOP && mode == V16QImode)
41840 /* For XOP we use vpshab, which requires a broadcast of the
41841 value to the variable shift insn. For constants this
41842 means a V16QImode constant in memory; even when we can perform the
41843 shift with one insn, set the cost to prefer paddb. */
41844 if (CONSTANT_P (XEXP (x, 1)))
41846 *total = (cost->fabs
41847 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41848 + (speed ? 2 : COSTS_N_BYTES (16)));
41849 return true;
41851 count = 3;
41853 else if (TARGET_SSSE3)
41854 count = 7;
41855 *total = cost->fabs * count;
41857 else
41858 *total = cost->fabs;
41860 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41862 if (CONST_INT_P (XEXP (x, 1)))
41864 if (INTVAL (XEXP (x, 1)) > 32)
41865 *total = cost->shift_const + COSTS_N_INSNS (2);
41866 else
41867 *total = cost->shift_const * 2;
41869 else
41871 if (GET_CODE (XEXP (x, 1)) == AND)
41872 *total = cost->shift_var * 2;
41873 else
41874 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
41877 else
41879 if (CONST_INT_P (XEXP (x, 1)))
41880 *total = cost->shift_const;
41881 else if (SUBREG_P (XEXP (x, 1))
41882 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
41884 /* Return the cost after shift-and truncation. */
41885 *total = cost->shift_var;
41886 return true;
41888 else
41889 *total = cost->shift_var;
41891 return false;
41893 case FMA:
41895 rtx sub;
41897 gcc_assert (FLOAT_MODE_P (mode));
41898 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
41900 /* ??? SSE scalar/vector cost should be used here. */
41901 /* ??? Bald assumption that fma has the same cost as fmul. */
41902 *total = cost->fmul;
41903 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
41905 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
41906 sub = XEXP (x, 0);
41907 if (GET_CODE (sub) == NEG)
41908 sub = XEXP (sub, 0);
41909 *total += rtx_cost (sub, mode, FMA, 0, speed);
41911 sub = XEXP (x, 2);
41912 if (GET_CODE (sub) == NEG)
41913 sub = XEXP (sub, 0);
41914 *total += rtx_cost (sub, mode, FMA, 2, speed);
41915 return true;
41918 case MULT:
41919 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41921 /* ??? SSE scalar cost should be used here. */
41922 *total = cost->fmul;
41923 return false;
41925 else if (X87_FLOAT_MODE_P (mode))
41927 *total = cost->fmul;
41928 return false;
41930 else if (FLOAT_MODE_P (mode))
41932 /* ??? SSE vector cost should be used here. */
41933 *total = cost->fmul;
41934 return false;
41936 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41938 /* V*QImode is emulated with 7-13 insns. */
41939 if (mode == V16QImode || mode == V32QImode)
41941 int extra = 11;
41942 if (TARGET_XOP && mode == V16QImode)
41943 extra = 5;
41944 else if (TARGET_SSSE3)
41945 extra = 6;
41946 *total = cost->fmul * 2 + cost->fabs * extra;
41948 /* V*DImode is emulated with 5-8 insns. */
41949 else if (mode == V2DImode || mode == V4DImode)
41951 if (TARGET_XOP && mode == V2DImode)
41952 *total = cost->fmul * 2 + cost->fabs * 3;
41953 else
41954 *total = cost->fmul * 3 + cost->fabs * 5;
41956 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
41957 insns, including two PMULUDQ. */
41958 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
41959 *total = cost->fmul * 2 + cost->fabs * 5;
41960 else
41961 *total = cost->fmul;
41962 return false;
41964 else
41966 rtx op0 = XEXP (x, 0);
41967 rtx op1 = XEXP (x, 1);
41968 int nbits;
41969 if (CONST_INT_P (XEXP (x, 1)))
41971 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41972 for (nbits = 0; value != 0; value &= value - 1)
41973 nbits++;
41975 else
41976 /* This is arbitrary. */
41977 nbits = 7;
41979 /* Compute costs correctly for widening multiplication. */
41980 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41981 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41982 == GET_MODE_SIZE (mode))
41984 int is_mulwiden = 0;
41985 machine_mode inner_mode = GET_MODE (op0);
41987 if (GET_CODE (op0) == GET_CODE (op1))
41988 is_mulwiden = 1, op1 = XEXP (op1, 0);
41989 else if (CONST_INT_P (op1))
41991 if (GET_CODE (op0) == SIGN_EXTEND)
41992 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41993 == INTVAL (op1);
41994 else
41995 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41998 if (is_mulwiden)
41999 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
42002 *total = (cost->mult_init[MODE_INDEX (mode)]
42003 + nbits * cost->mult_bit
42004 + rtx_cost (op0, mode, outer_code, opno, speed)
42005 + rtx_cost (op1, mode, outer_code, opno, speed));
42007 return true;
42010 case DIV:
42011 case UDIV:
42012 case MOD:
42013 case UMOD:
42014 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42015 /* ??? SSE cost should be used here. */
42016 *total = cost->fdiv;
42017 else if (X87_FLOAT_MODE_P (mode))
42018 *total = cost->fdiv;
42019 else if (FLOAT_MODE_P (mode))
42020 /* ??? SSE vector cost should be used here. */
42021 *total = cost->fdiv;
42022 else
42023 *total = cost->divide[MODE_INDEX (mode)];
42024 return false;
42026 case PLUS:
42027 if (GET_MODE_CLASS (mode) == MODE_INT
42028 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
42030 if (GET_CODE (XEXP (x, 0)) == PLUS
42031 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
42032 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
42033 && CONSTANT_P (XEXP (x, 1)))
42035 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
42036 if (val == 2 || val == 4 || val == 8)
42038 *total = cost->lea;
42039 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
42040 outer_code, opno, speed);
42041 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
42042 outer_code, opno, speed);
42043 *total += rtx_cost (XEXP (x, 1), mode,
42044 outer_code, opno, speed);
42045 return true;
42048 else if (GET_CODE (XEXP (x, 0)) == MULT
42049 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
42051 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
42052 if (val == 2 || val == 4 || val == 8)
42054 *total = cost->lea;
42055 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
42056 outer_code, opno, speed);
42057 *total += rtx_cost (XEXP (x, 1), mode,
42058 outer_code, opno, speed);
42059 return true;
42062 else if (GET_CODE (XEXP (x, 0)) == PLUS)
42064 /* Add with carry, ignore the cost of adding a carry flag. */
42065 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
42066 *total = cost->add;
42067 else
42069 *total = cost->lea;
42070 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
42071 outer_code, opno, speed);
42074 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
42075 outer_code, opno, speed);
42076 *total += rtx_cost (XEXP (x, 1), mode,
42077 outer_code, opno, speed);
42078 return true;
42081 /* FALLTHRU */
42083 case MINUS:
42084 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
42085 if (GET_MODE_CLASS (mode) == MODE_INT
42086 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
42087 && GET_CODE (XEXP (x, 0)) == MINUS
42088 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
42090 *total = cost->add;
42091 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
42092 outer_code, opno, speed);
42093 *total += rtx_cost (XEXP (x, 1), mode,
42094 outer_code, opno, speed);
42095 return true;
42098 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42100 /* ??? SSE cost should be used here. */
42101 *total = cost->fadd;
42102 return false;
42104 else if (X87_FLOAT_MODE_P (mode))
42106 *total = cost->fadd;
42107 return false;
42109 else if (FLOAT_MODE_P (mode))
42111 /* ??? SSE vector cost should be used here. */
42112 *total = cost->fadd;
42113 return false;
42115 /* FALLTHRU */
42117 case AND:
42118 case IOR:
42119 case XOR:
42120 if (GET_MODE_CLASS (mode) == MODE_INT
42121 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
42123 *total = (cost->add * 2
42124 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
42125 << (GET_MODE (XEXP (x, 0)) != DImode))
42126 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
42127 << (GET_MODE (XEXP (x, 1)) != DImode)));
42128 return true;
42130 /* FALLTHRU */
42132 case NEG:
42133 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42135 /* ??? SSE cost should be used here. */
42136 *total = cost->fchs;
42137 return false;
42139 else if (X87_FLOAT_MODE_P (mode))
42141 *total = cost->fchs;
42142 return false;
42144 else if (FLOAT_MODE_P (mode))
42146 /* ??? SSE vector cost should be used here. */
42147 *total = cost->fchs;
42148 return false;
42150 /* FALLTHRU */
42152 case NOT:
42153 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
42155 /* ??? Should be SSE vector operation cost. */
42156 /* At least for published AMD latencies, this really is the same
42157 as the latency for a simple fpu operation like fabs. */
42158 *total = cost->fabs;
42160 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
42161 *total = cost->add * 2;
42162 else
42163 *total = cost->add;
42164 return false;
42166 case COMPARE:
42167 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
42168 && XEXP (XEXP (x, 0), 1) == const1_rtx
42169 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
42170 && XEXP (x, 1) == const0_rtx)
42172 /* This kind of construct is implemented using test[bwl].
42173 Treat it as if we had an AND. */
42174 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
42175 *total = (cost->add
42176 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
42177 opno, speed)
42178 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
42179 return true;
42182 /* The embedded comparison operand is completely free. */
42183 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
42184 && XEXP (x, 1) == const0_rtx)
42185 *total = 0;
42187 return false;
42189 case FLOAT_EXTEND:
42190 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
42191 *total = 0;
42192 return false;
42194 case ABS:
42195 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42196 /* ??? SSE cost should be used here. */
42197 *total = cost->fabs;
42198 else if (X87_FLOAT_MODE_P (mode))
42199 *total = cost->fabs;
42200 else if (FLOAT_MODE_P (mode))
42201 /* ??? SSE vector cost should be used here. */
42202 *total = cost->fabs;
42203 return false;
42205 case SQRT:
42206 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42207 /* ??? SSE cost should be used here. */
42208 *total = cost->fsqrt;
42209 else if (X87_FLOAT_MODE_P (mode))
42210 *total = cost->fsqrt;
42211 else if (FLOAT_MODE_P (mode))
42212 /* ??? SSE vector cost should be used here. */
42213 *total = cost->fsqrt;
42214 return false;
42216 case UNSPEC:
42217 if (XINT (x, 1) == UNSPEC_TP)
42218 *total = 0;
42219 return false;
42221 case VEC_SELECT:
42222 case VEC_CONCAT:
42223 case VEC_DUPLICATE:
42224 /* ??? Assume all of these vector manipulation patterns are
42225 recognizable. In which case they all pretty much have the
42226 same cost. */
42227 *total = cost->fabs;
42228 return true;
42229 case VEC_MERGE:
42230 mask = XEXP (x, 2);
42231 /* This is a masked instruction; assume the same cost
42232 as the non-masked variant. */
42233 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
42234 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
42235 else
42236 *total = cost->fabs;
42237 return true;
42239 default:
42240 return false;
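/* Editorial illustration, not part of the original sources: for a
   scale-and-add address computation such as
     (plus:SI (mult:SI (reg:SI a) (const_int 4)) (reg:SI b))
   the PLUS case above recognizes the scale of 2, 4 or 8 and charges
   cost->lea plus the operand costs, rather than a full multiply
   followed by an add.  */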
42244 #if TARGET_MACHO
42246 static int current_machopic_label_num;
42248 /* Given a symbol name and its associated stub, write out the
42249 definition of the stub. */
42251 void
42252 machopic_output_stub (FILE *file, const char *symb, const char *stub)
42254 unsigned int length;
42255 char *binder_name, *symbol_name, lazy_ptr_name[32];
42256 int label = ++current_machopic_label_num;
42258 /* For 64-bit we shouldn't get here. */
42259 gcc_assert (!TARGET_64BIT);
42261 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
42262 symb = targetm.strip_name_encoding (symb);
42264 length = strlen (stub);
42265 binder_name = XALLOCAVEC (char, length + 32);
42266 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
42268 length = strlen (symb);
42269 symbol_name = XALLOCAVEC (char, length + 32);
42270 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
42272 sprintf (lazy_ptr_name, "L%d$lz", label);
42274 if (MACHOPIC_ATT_STUB)
42275 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
42276 else if (MACHOPIC_PURE)
42277 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
42278 else
42279 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
42281 fprintf (file, "%s:\n", stub);
42282 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42284 if (MACHOPIC_ATT_STUB)
42286 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
42288 else if (MACHOPIC_PURE)
42290 /* PIC stub. */
42291 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42292 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
42293 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
42294 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
42295 label, lazy_ptr_name, label);
42296 fprintf (file, "\tjmp\t*%%ecx\n");
42298 else
42299 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
42301 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
42302 it needs no stub-binding-helper. */
42303 if (MACHOPIC_ATT_STUB)
42304 return;
42306 fprintf (file, "%s:\n", binder_name);
42308 if (MACHOPIC_PURE)
42310 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
42311 fprintf (file, "\tpushl\t%%ecx\n");
42313 else
42314 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
42316 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
42318 /* N.B. Keep the correspondence of these
42319 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
42320 old-pic/new-pic/non-pic stubs; altering this will break
42321 compatibility with existing dylibs. */
42322 if (MACHOPIC_PURE)
42324 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42325 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
42327 else
42328 /* 16-byte -mdynamic-no-pic stub. */
42329 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
42331 fprintf (file, "%s:\n", lazy_ptr_name);
42332 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42333 fprintf (file, ASM_LONG "%s\n", binder_name);
42335 #endif /* TARGET_MACHO */
42337 /* Order the registers for register allocator. */
42339 void
42340 x86_order_regs_for_local_alloc (void)
42342 int pos = 0;
42343 int i;
42345 /* First allocate the local general purpose registers. */
42346 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42347 if (GENERAL_REGNO_P (i) && call_used_regs[i])
42348 reg_alloc_order [pos++] = i;
42350 /* Global general purpose registers. */
42351 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42352 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
42353 reg_alloc_order [pos++] = i;
42355 /* x87 registers come first in case we are doing FP math
42356 using them. */
42357 if (!TARGET_SSE_MATH)
42358 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42359 reg_alloc_order [pos++] = i;
42361 /* SSE registers. */
42362 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
42363 reg_alloc_order [pos++] = i;
42364 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
42365 reg_alloc_order [pos++] = i;
42367 /* Extended REX SSE registers. */
42368 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
42369 reg_alloc_order [pos++] = i;
42371 /* Mask registers. */
42372 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
42373 reg_alloc_order [pos++] = i;
42375 /* MPX bound registers. */
42376 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
42377 reg_alloc_order [pos++] = i;
42379 /* x87 registers. */
42380 if (TARGET_SSE_MATH)
42381 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42382 reg_alloc_order [pos++] = i;
42384 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42385 reg_alloc_order [pos++] = i;
42387 /* Initialize the rest of the array, as we do not allocate some registers
42388 at all. */
42389 while (pos < FIRST_PSEUDO_REGISTER)
42390 reg_alloc_order [pos++] = 0;
42393 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42394 in struct attribute_spec handler. */
42395 static tree
42396 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42397 tree args,
42398 int,
42399 bool *no_add_attrs)
42401 if (TREE_CODE (*node) != FUNCTION_TYPE
42402 && TREE_CODE (*node) != METHOD_TYPE
42403 && TREE_CODE (*node) != FIELD_DECL
42404 && TREE_CODE (*node) != TYPE_DECL)
42406 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42407 name);
42408 *no_add_attrs = true;
42409 return NULL_TREE;
42411 if (TARGET_64BIT)
42413 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42414 name);
42415 *no_add_attrs = true;
42416 return NULL_TREE;
42418 if (is_attribute_p ("callee_pop_aggregate_return", name))
42420 tree cst;
42422 cst = TREE_VALUE (args);
42423 if (TREE_CODE (cst) != INTEGER_CST)
42425 warning (OPT_Wattributes,
42426 "%qE attribute requires an integer constant argument",
42427 name);
42428 *no_add_attrs = true;
42430 else if (compare_tree_int (cst, 0) != 0
42431 && compare_tree_int (cst, 1) != 0)
42433 warning (OPT_Wattributes,
42434 "argument to %qE attribute is neither zero, nor one",
42435 name);
42436 *no_add_attrs = true;
42439 return NULL_TREE;
42442 return NULL_TREE;
42445 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
42446 struct attribute_spec.handler. */
42447 static tree
42448 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42449 bool *no_add_attrs)
42451 if (TREE_CODE (*node) != FUNCTION_TYPE
42452 && TREE_CODE (*node) != METHOD_TYPE
42453 && TREE_CODE (*node) != FIELD_DECL
42454 && TREE_CODE (*node) != TYPE_DECL)
42456 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42457 name);
42458 *no_add_attrs = true;
42459 return NULL_TREE;
42462 /* Can combine regparm with all attributes but fastcall. */
42463 if (is_attribute_p ("ms_abi", name))
42465 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42467 error ("ms_abi and sysv_abi attributes are not compatible");
42470 return NULL_TREE;
42472 else if (is_attribute_p ("sysv_abi", name))
42474 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42476 error ("ms_abi and sysv_abi attributes are not compatible");
42479 return NULL_TREE;
42482 return NULL_TREE;
42485 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42486 struct attribute_spec.handler. */
42487 static tree
42488 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42489 bool *no_add_attrs)
42491 tree *type = NULL;
42492 if (DECL_P (*node))
42494 if (TREE_CODE (*node) == TYPE_DECL)
42495 type = &TREE_TYPE (*node);
42497 else
42498 type = node;
42500 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42502 warning (OPT_Wattributes, "%qE attribute ignored",
42503 name);
42504 *no_add_attrs = true;
42507 else if ((is_attribute_p ("ms_struct", name)
42508 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42509 || ((is_attribute_p ("gcc_struct", name)
42510 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42512 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42513 name);
42514 *no_add_attrs = true;
42517 return NULL_TREE;
42520 static tree
42521 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42522 bool *no_add_attrs)
42524 if (TREE_CODE (*node) != FUNCTION_DECL)
42526 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42527 name);
42528 *no_add_attrs = true;
42530 return NULL_TREE;
42533 static tree
42534 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42535 int, bool *)
42537 return NULL_TREE;
42540 static tree
42541 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42543 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
42544 but the function type contains args and return type data. */
42545 tree func_type = *node;
42546 tree return_type = TREE_TYPE (func_type);
42548 int nargs = 0;
42549 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42550 while (current_arg_type
42551 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42553 if (nargs == 0)
42555 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42556 error ("interrupt service routine should have a pointer "
42557 "as the first argument");
42559 else if (nargs == 1)
42561 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42562 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42563 error ("interrupt service routine should have unsigned %s"
42564 "int as the second argument",
42565 TARGET_64BIT
42566 ? (TARGET_X32 ? "long long " : "long ")
42567 : "");
42569 nargs++;
42570 current_arg_type = TREE_CHAIN (current_arg_type);
42572 if (!nargs || nargs > 2)
42573 error ("interrupt service routine can only have a pointer argument "
42574 "and an optional integer argument");
42575 if (! VOID_TYPE_P (return_type))
42576 error ("interrupt service routine can't have non-void return value");
42578 return NULL_TREE;
42581 static bool
42582 ix86_ms_bitfield_layout_p (const_tree record_type)
42584 return ((TARGET_MS_BITFIELD_LAYOUT
42585 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42586 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42589 /* Returns an expression indicating where the this parameter is
42590 located on entry to the FUNCTION. */
42592 static rtx
42593 x86_this_parameter (tree function)
42595 tree type = TREE_TYPE (function);
42596 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42597 int nregs;
42599 if (TARGET_64BIT)
42601 const int *parm_regs;
42603 if (ix86_function_type_abi (type) == MS_ABI)
42604 parm_regs = x86_64_ms_abi_int_parameter_registers;
42605 else
42606 parm_regs = x86_64_int_parameter_registers;
42607 return gen_rtx_REG (Pmode, parm_regs[aggr]);
42610 nregs = ix86_function_regparm (type, function);
42612 if (nregs > 0 && !stdarg_p (type))
42614 int regno;
42615 unsigned int ccvt = ix86_get_callcvt (type);
42617 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42618 regno = aggr ? DX_REG : CX_REG;
42619 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42621 regno = CX_REG;
42622 if (aggr)
42623 return gen_rtx_MEM (SImode,
42624 plus_constant (Pmode, stack_pointer_rtx, 4));
42626 else
42628 regno = AX_REG;
42629 if (aggr)
42631 regno = DX_REG;
42632 if (nregs == 1)
42633 return gen_rtx_MEM (SImode,
42634 plus_constant (Pmode,
42635 stack_pointer_rtx, 4));
42638 return gen_rtx_REG (SImode, regno);
42641 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42642 aggr ? 8 : 4));
42645 /* Determine whether x86_output_mi_thunk can succeed. */
42647 static bool
42648 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42649 const_tree function)
42651 /* 64-bit can handle anything. */
42652 if (TARGET_64BIT)
42653 return true;
42655 /* For 32-bit, everything's fine if we have one free register. */
42656 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42657 return true;
42659 /* Need a free register for vcall_offset. */
42660 if (vcall_offset)
42661 return false;
42663 /* Need a free register for GOT references. */
42664 if (flag_pic && !targetm.binds_local_p (function))
42665 return false;
42667 /* Otherwise ok. */
42668 return true;
42671 /* Output the assembler code for a thunk function. THUNK_DECL is the
42672 declaration for the thunk function itself, FUNCTION is the decl for
42673 the target function. DELTA is an immediate constant offset to be
42674 added to THIS. If VCALL_OFFSET is nonzero, the word at
42675 *(*this + vcall_offset) should be added to THIS. */
42677 static void
42678 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42679 HOST_WIDE_INT vcall_offset, tree function)
42681 rtx this_param = x86_this_parameter (function);
42682 rtx this_reg, tmp, fnaddr;
42683 unsigned int tmp_regno;
42684 rtx_insn *insn;
42686 if (TARGET_64BIT)
42687 tmp_regno = R10_REG;
42688 else
42690 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42691 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42692 tmp_regno = AX_REG;
42693 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42694 tmp_regno = DX_REG;
42695 else
42696 tmp_regno = CX_REG;
42699 emit_note (NOTE_INSN_PROLOGUE_END);
42701 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42702 pull it in now and let DELTA benefit. */
42703 if (REG_P (this_param))
42704 this_reg = this_param;
42705 else if (vcall_offset)
42707 /* Put the this parameter into %eax. */
42708 this_reg = gen_rtx_REG (Pmode, AX_REG);
42709 emit_move_insn (this_reg, this_param);
42711 else
42712 this_reg = NULL_RTX;
42714 /* Adjust the this parameter by a fixed constant. */
42715 if (delta)
42717 rtx delta_rtx = GEN_INT (delta);
42718 rtx delta_dst = this_reg ? this_reg : this_param;
42720 if (TARGET_64BIT)
42722 if (!x86_64_general_operand (delta_rtx, Pmode))
42724 tmp = gen_rtx_REG (Pmode, tmp_regno);
42725 emit_move_insn (tmp, delta_rtx);
42726 delta_rtx = tmp;
42730 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42733 /* Adjust the this parameter by a value stored in the vtable. */
42734 if (vcall_offset)
42736 rtx vcall_addr, vcall_mem, this_mem;
42738 tmp = gen_rtx_REG (Pmode, tmp_regno);
42740 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42741 if (Pmode != ptr_mode)
42742 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42743 emit_move_insn (tmp, this_mem);
42745 /* Adjust the this parameter. */
42746 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42747 if (TARGET_64BIT
42748 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42750 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42751 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42752 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42755 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42756 if (Pmode != ptr_mode)
42757 emit_insn (gen_addsi_1_zext (this_reg,
42758 gen_rtx_REG (ptr_mode,
42759 REGNO (this_reg)),
42760 vcall_mem));
42761 else
42762 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42765 /* If necessary, drop THIS back to its stack slot. */
42766 if (this_reg && this_reg != this_param)
42767 emit_move_insn (this_param, this_reg);
42769 fnaddr = XEXP (DECL_RTL (function), 0);
42770 if (TARGET_64BIT)
42772 if (!flag_pic || targetm.binds_local_p (function)
42773 || TARGET_PECOFF)
42775 else
42777 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42778 tmp = gen_rtx_CONST (Pmode, tmp);
42779 fnaddr = gen_const_mem (Pmode, tmp);
42782 else
42784 if (!flag_pic || targetm.binds_local_p (function))
42786 #if TARGET_MACHO
42787 else if (TARGET_MACHO)
42789 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42790 fnaddr = XEXP (fnaddr, 0);
42792 #endif /* TARGET_MACHO */
42793 else
42795 tmp = gen_rtx_REG (Pmode, CX_REG);
42796 output_set_got (tmp, NULL_RTX);
42798 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42799 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42800 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42801 fnaddr = gen_const_mem (Pmode, fnaddr);
42805 /* Our sibling call patterns do not allow memories, because we have no
42806 predicate that can distinguish between frame and non-frame memory.
42807 For our purposes here, we can get away with (ab)using a jump pattern,
42808 because we're going to do no optimization. */
42809 if (MEM_P (fnaddr))
42811 if (sibcall_insn_operand (fnaddr, word_mode))
42813 fnaddr = XEXP (DECL_RTL (function), 0);
42814 tmp = gen_rtx_MEM (QImode, fnaddr);
42815 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42816 tmp = emit_call_insn (tmp);
42817 SIBLING_CALL_P (tmp) = 1;
42819 else
42820 emit_jump_insn (gen_indirect_jump (fnaddr));
42822 else
42824 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42826 // CM_LARGE_PIC always uses pseudo PIC register which is
42827 // uninitialized. Since FUNCTION is local and calling it
42828 // doesn't go through PLT, we use scratch register %r11 as
42829 // PIC register and initialize it here.
42830 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42831 ix86_init_large_pic_reg (tmp_regno);
42832 fnaddr = legitimize_pic_address (fnaddr,
42833 gen_rtx_REG (Pmode, tmp_regno));
42836 if (!sibcall_insn_operand (fnaddr, word_mode))
42838 tmp = gen_rtx_REG (word_mode, tmp_regno);
42839 if (GET_MODE (fnaddr) != word_mode)
42840 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42841 emit_move_insn (tmp, fnaddr);
42842 fnaddr = tmp;
42845 tmp = gen_rtx_MEM (QImode, fnaddr);
42846 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42847 tmp = emit_call_insn (tmp);
42848 SIBLING_CALL_P (tmp) = 1;
42850 emit_barrier ();
42852 /* Emit just enough of rest_of_compilation to get the insns emitted.
42853 Note that use_thunk calls assemble_start_function et al. */
42854 insn = get_insns ();
42855 shorten_branches (insn);
42856 final_start_function (insn, file, 1);
42857 final (insn, file, 1);
42858 final_end_function ();
42861 static void
42862 x86_file_start (void)
42864 default_file_start ();
42865 if (TARGET_16BIT)
42866 fputs ("\t.code16gcc\n", asm_out_file);
42867 #if TARGET_MACHO
42868 darwin_file_start ();
42869 #endif
42870 if (X86_FILE_START_VERSION_DIRECTIVE)
42871 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42872 if (X86_FILE_START_FLTUSED)
42873 fputs ("\t.global\t__fltused\n", asm_out_file);
42874 if (ix86_asm_dialect == ASM_INTEL)
42875 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
42878 int
42879 x86_field_alignment (tree type, int computed)
42881 machine_mode mode;
42883 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
42884 return computed;
42885 if (TARGET_IAMCU)
42886 return iamcu_alignment (type, computed);
42887 mode = TYPE_MODE (strip_array_types (type));
42888 if (mode == DFmode || mode == DCmode
42889 || GET_MODE_CLASS (mode) == MODE_INT
42890 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
42891 return MIN (32, computed);
42892 return computed;
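/* Editorial illustration, not part of the original sources: without
   -m64 or -malign-double, a double or long long structure field is
   capped at 32-bit alignment here, matching the traditional ia32 ABI
   layout of such fields.  */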
42895 /* Print call to TARGET to FILE. */
42897 static void
42898 x86_print_call_or_nop (FILE *file, const char *target)
42900 if (flag_nop_mcount)
42901 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
42902 else
42903 fprintf (file, "1:\tcall\t%s\n", target);
42906 /* Output assembler code to FILE to increment profiler label # LABELNO
42907 for profiling a function entry. */
42908 void
42909 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
42911 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
42912 : MCOUNT_NAME);
42913 if (TARGET_64BIT)
42915 #ifndef NO_PROFILE_COUNTERS
42916 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
42917 #endif
42919 if (!TARGET_PECOFF && flag_pic)
42920 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
42921 else
42922 x86_print_call_or_nop (file, mcount_name);
42924 else if (flag_pic)
42926 #ifndef NO_PROFILE_COUNTERS
42927 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
42928 LPREFIX, labelno);
42929 #endif
42930 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42932 else
42934 #ifndef NO_PROFILE_COUNTERS
42935 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42936 LPREFIX, labelno);
42937 #endif
42938 x86_print_call_or_nop (file, mcount_name);
42941 if (flag_record_mcount)
42943 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42944 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42945 fprintf (file, "\t.previous\n");
42949 /* We don't have exact information about the insn sizes, but we may assume
42950 quite safely that we are informed about all 1 byte insns and memory
42951 address sizes. This is enough to eliminate unnecessary padding in
42952 99% of cases. */
42954 static int
42955 min_insn_size (rtx_insn *insn)
42957 int l = 0, len;
42959 if (!INSN_P (insn) || !active_insn_p (insn))
42960 return 0;
42962 /* Discard alignments we've emitted, as well as jump instructions. */
42963 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42964 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42965 return 0;
42967 /* Important case - calls are always 5 bytes.
42968 It is common to have many calls in a row. */
42969 if (CALL_P (insn)
42970 && symbolic_reference_mentioned_p (PATTERN (insn))
42971 && !SIBLING_CALL_P (insn))
42972 return 5;
42973 len = get_attr_length (insn);
42974 if (len <= 1)
42975 return 1;
42977 /* For normal instructions we rely on get_attr_length being exact,
42978 with a few exceptions. */
42979 if (!JUMP_P (insn))
42981 enum attr_type type = get_attr_type (insn);
42983 switch (type)
42985 case TYPE_MULTI:
42986 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42987 || asm_noperands (PATTERN (insn)) >= 0)
42988 return 0;
42989 break;
42990 case TYPE_OTHER:
42991 case TYPE_FCMP:
42992 break;
42993 default:
42994 /* Otherwise trust get_attr_length. */
42995 return len;
42998 l = get_attr_length_address (insn);
42999 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
43000 l = 4;
43002 if (l)
43003 return 1+l;
43004 else
43005 return 2;
43008 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43010 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
43011 window. */
43013 static void
43014 ix86_avoid_jump_mispredicts (void)
43016 rtx_insn *insn, *start = get_insns ();
43017 int nbytes = 0, njumps = 0;
43018 bool isjump = false;
43020 /* Look for all minimal intervals of instructions containing 4 jumps.
43021 The intervals are bounded by START and INSN. NBYTES is the total
43022 size of instructions in the interval including INSN and not including
43023 START. When the NBYTES is smaller than 16 bytes, it is possible
43024 that the end of START and INSN ends up in the same 16byte page.
43026 The smallest offset in the page INSN can start is the case where START
43027 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
43028 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
43030 Don't consider asm goto as jump, while it can contain a jump, it doesn't
43031 have to, control transfer to label(s) can be performed through other
43032 means, and also we estimate minimum length of all asm stmts as 0. */
43033 for (insn = start; insn; insn = NEXT_INSN (insn))
43035 int min_size;
43037 if (LABEL_P (insn))
43039 int align = label_to_alignment (insn);
43040 int max_skip = label_to_max_skip (insn);
43042 if (max_skip > 15)
43043 max_skip = 15;
43044 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
43045 already in the current 16 byte page, because otherwise
43046 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
43047 bytes to reach 16 byte boundary. */
43048 if (align <= 0
43049 || (align <= 3 && max_skip != (1 << align) - 1))
43050 max_skip = 0;
43051 if (dump_file)
43052 fprintf (dump_file, "Label %i with max_skip %i\n",
43053 INSN_UID (insn), max_skip);
43054 if (max_skip)
43056 while (nbytes + max_skip >= 16)
43058 start = NEXT_INSN (start);
43059 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
43060 || CALL_P (start))
43061 njumps--, isjump = true;
43062 else
43063 isjump = false;
43064 nbytes -= min_insn_size (start);
43067 continue;
43070 min_size = min_insn_size (insn);
43071 nbytes += min_size;
43072 if (dump_file)
43073 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
43074 INSN_UID (insn), min_size);
43075 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
43076 || CALL_P (insn))
43077 njumps++;
43078 else
43079 continue;
43081 while (njumps > 3)
43083 start = NEXT_INSN (start);
43084 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
43085 || CALL_P (start))
43086 njumps--, isjump = true;
43087 else
43088 isjump = false;
43089 nbytes -= min_insn_size (start);
43091 gcc_assert (njumps >= 0);
43092 if (dump_file)
43093 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
43094 INSN_UID (start), INSN_UID (insn), nbytes);
43096 if (njumps == 3 && isjump && nbytes < 16)
43098 int padsize = 15 - nbytes + min_insn_size (insn);
43100 if (dump_file)
43101 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
43102 INSN_UID (insn), padsize);
43103 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
43107 #endif
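/* Editorial illustration, not part of the original sources: roughly,
   if after trimming the window to three jumps its size NBYTES is 12
   and the current jump instruction is 2 bytes long, a pad of
   15 - 12 + 2 = 5 bytes is emitted in front of it, so that four jumps
   can no longer land in a single aligned 16-byte block.  */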
43109 /* AMD Athlon works faster
43110 when RET is not the destination of a conditional jump or directly preceded
43111 by another jump instruction. We avoid the penalty by inserting a NOP just
43112 before the RET instruction in such cases. */
43113 static void
43114 ix86_pad_returns (void)
43116 edge e;
43117 edge_iterator ei;
43119 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43121 basic_block bb = e->src;
43122 rtx_insn *ret = BB_END (bb);
43123 rtx_insn *prev;
43124 bool replace = false;
43126 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
43127 || optimize_bb_for_size_p (bb))
43128 continue;
43129 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
43130 if (active_insn_p (prev) || LABEL_P (prev))
43131 break;
43132 if (prev && LABEL_P (prev))
43134 edge e;
43135 edge_iterator ei;
43137 FOR_EACH_EDGE (e, ei, bb->preds)
43138 if (EDGE_FREQUENCY (e) && e->src->index >= 0
43139 && !(e->flags & EDGE_FALLTHRU))
43141 replace = true;
43142 break;
43145 if (!replace)
43147 prev = prev_active_insn (ret);
43148 if (prev
43149 && ((JUMP_P (prev) && any_condjump_p (prev))
43150 || CALL_P (prev)))
43151 replace = true;
43152 /* Empty functions get a branch mispredict even when
43153 the jump destination is not visible to us. */
43154 if (!prev && !optimize_function_for_size_p (cfun))
43155 replace = true;
43157 if (replace)
43159 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
43160 delete_insn (ret);
43165 /* Count the minimum number of instructions in BB. Return 4 if the
43166 number of instructions >= 4. */
43168 static int
43169 ix86_count_insn_bb (basic_block bb)
43171 rtx_insn *insn;
43172 int insn_count = 0;
43174 /* Count number of instructions in this block. Return 4 if the number
43175 of instructions >= 4. */
43176 FOR_BB_INSNS (bb, insn)
43178 /* This only happens in exit blocks. */
43179 if (JUMP_P (insn)
43180 && ANY_RETURN_P (PATTERN (insn)))
43181 break;
43183 if (NONDEBUG_INSN_P (insn)
43184 && GET_CODE (PATTERN (insn)) != USE
43185 && GET_CODE (PATTERN (insn)) != CLOBBER)
43187 insn_count++;
43188 if (insn_count >= 4)
43189 return insn_count;
43193 return insn_count;
43197 /* Count the minimum number of instructions in code path in BB.
43198 Return 4 if the number of instructions >= 4. */
43200 static int
43201 ix86_count_insn (basic_block bb)
43203 edge e;
43204 edge_iterator ei;
43205 int min_prev_count;
43207 /* Only bother counting instructions along paths with no
43208 more than 2 basic blocks between entry and exit. Given
43209 that BB has an edge to exit, determine if a predecessor
43210 of BB has an edge from entry. If so, compute the number
43211 of instructions in the predecessor block. If there
43212 happen to be multiple such blocks, compute the minimum. */
43213 min_prev_count = 4;
43214 FOR_EACH_EDGE (e, ei, bb->preds)
43216 edge prev_e;
43217 edge_iterator prev_ei;
43219 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43221 min_prev_count = 0;
43222 break;
43224 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
43226 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43228 int count = ix86_count_insn_bb (e->src);
43229 if (count < min_prev_count)
43230 min_prev_count = count;
43231 break;
43236 if (min_prev_count < 4)
43237 min_prev_count += ix86_count_insn_bb (bb);
43239 return min_prev_count;
43242 /* Pad short function to 4 instructions. */
43244 static void
43245 ix86_pad_short_function (void)
43247 edge e;
43248 edge_iterator ei;
43250 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43252 rtx_insn *ret = BB_END (e->src);
43253 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
43255 int insn_count = ix86_count_insn (e->src);
43257 /* Pad short function. */
43258 if (insn_count < 4)
43260 rtx_insn *insn = ret;
43262 /* Find epilogue. */
43263 while (insn
43264 && (!NOTE_P (insn)
43265 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
43266 insn = PREV_INSN (insn);
43268 if (!insn)
43269 insn = ret;
43271 /* Two NOPs count as one instruction. */
43272 insn_count = 2 * (4 - insn_count);
43273 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
43279 /* Fix up a Windows system unwinder issue. If an EH region falls through into
43280 the epilogue, the Windows system unwinder will apply epilogue logic and
43281 produce incorrect offsets. This can be avoided by adding a nop between
43282 the last insn that can throw and the first insn of the epilogue. */
43284 static void
43285 ix86_seh_fixup_eh_fallthru (void)
43287 edge e;
43288 edge_iterator ei;
43290 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43292 rtx_insn *insn, *next;
43294 /* Find the beginning of the epilogue. */
43295 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
43296 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
43297 break;
43298 if (insn == NULL)
43299 continue;
43301 /* We only care about preceding insns that can throw. */
43302 insn = prev_active_insn (insn);
43303 if (insn == NULL || !can_throw_internal (insn))
43304 continue;
43306 /* Do not separate calls from their debug information. */
43307 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
43308 if (NOTE_P (next)
43309 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
43310 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
43311 insn = next;
43312 else
43313 break;
43315 emit_insn_after (gen_nops (const1_rtx), insn);
43319 /* Given a register number BASE, the lowest of a group of registers, update
43320 regsets IN and OUT with the registers that should be avoided in input
43321 and output operands respectively when trying to avoid generating a modr/m
43322 byte for -fmitigate-rop. */
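/* For example, assuming the usual consecutive numbering of the REX
   integer registers, set_rop_modrm_reg_bits (FIRST_REX_INT_REG, in, out)
   marks r8 and r9 as risky output operands and r10 and r11 as risky
   input operands.  */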
43324 static void
43325 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
43327 SET_HARD_REG_BIT (out, base);
43328 SET_HARD_REG_BIT (out, base + 1);
43329 SET_HARD_REG_BIT (in, base + 2);
43330 SET_HARD_REG_BIT (in, base + 3);
43333 /* Called if -fmitigate_rop is in effect. Try to rewrite instructions so
43334 that certain encodings of modr/m bytes do not occur. */
43335 static void
43336 ix86_mitigate_rop (void)
43338 HARD_REG_SET input_risky;
43339 HARD_REG_SET output_risky;
43340 HARD_REG_SET inout_risky;
43342 CLEAR_HARD_REG_SET (output_risky);
43343 CLEAR_HARD_REG_SET (input_risky);
43344 SET_HARD_REG_BIT (output_risky, AX_REG);
43345 SET_HARD_REG_BIT (output_risky, CX_REG);
43346 SET_HARD_REG_BIT (input_risky, BX_REG);
43347 SET_HARD_REG_BIT (input_risky, DX_REG);
43348 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
43349 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
43350 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
43351 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
43352 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
43353 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
43354 COPY_HARD_REG_SET (inout_risky, input_risky);
43355 IOR_HARD_REG_SET (inout_risky, output_risky);
43357 df_note_add_problem ();
43358 /* Fix up what stack-regs did. */
43359 df_insn_rescan_all ();
43360 df_analyze ();
43362 regrename_init (true);
43363 regrename_analyze (NULL);
43365 auto_vec<du_head_p> cands;
43367 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
43369 if (!NONDEBUG_INSN_P (insn))
43370 continue;
43372 if (GET_CODE (PATTERN (insn)) == USE
43373 || GET_CODE (PATTERN (insn)) == CLOBBER)
43374 continue;
43376 extract_insn (insn);
43378 int opno0, opno1;
43379 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43380 recog_data.n_operands, &opno0,
43381 &opno1);
43383 if (!ix86_rop_should_change_byte_p (modrm))
43384 continue;
43386 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43388 /* This happens when regrename has to fail a block. */
43389 if (!info->op_info)
43390 continue;
43392 if (info->op_info[opno0].n_chains != 0)
43394 gcc_assert (info->op_info[opno0].n_chains == 1);
43395 du_head_p op0c;
43396 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43397 if (op0c->target_data_1 + op0c->target_data_2 == 0
43398 && !op0c->cannot_rename)
43399 cands.safe_push (op0c);
43401 op0c->target_data_1++;
43403 if (info->op_info[opno1].n_chains != 0)
43405 gcc_assert (info->op_info[opno1].n_chains == 1);
43406 du_head_p op1c;
43407 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43408 if (op1c->target_data_1 + op1c->target_data_2 == 0
43409 && !op1c->cannot_rename)
43410 cands.safe_push (op1c);
43412 op1c->target_data_2++;
43416 int i;
43417 du_head_p head;
43418 FOR_EACH_VEC_ELT (cands, i, head)
43420 int old_reg, best_reg;
43421 HARD_REG_SET unavailable;
43423 CLEAR_HARD_REG_SET (unavailable);
43424 if (head->target_data_1)
43425 IOR_HARD_REG_SET (unavailable, output_risky);
43426 if (head->target_data_2)
43427 IOR_HARD_REG_SET (unavailable, input_risky);
43429 int n_uses;
43430 reg_class superclass = regrename_find_superclass (head, &n_uses,
43431 &unavailable);
43432 old_reg = head->regno;
43433 best_reg = find_rename_reg (head, superclass, &unavailable,
43434 old_reg, false);
43435 bool ok = regrename_do_replace (head, best_reg);
43436 gcc_assert (ok);
43437 if (dump_file)
43438 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43439 reg_names[best_reg], reg_class_names[superclass]);
43443 regrename_finish ();
43445 df_analyze ();
43447 basic_block bb;
43448 regset_head live;
43450 INIT_REG_SET (&live);
43452 FOR_EACH_BB_FN (bb, cfun)
43454 rtx_insn *insn;
43456 COPY_REG_SET (&live, DF_LR_OUT (bb));
43457 df_simulate_initialize_backwards (bb, &live);
43459 FOR_BB_INSNS_REVERSE (bb, insn)
43461 if (!NONDEBUG_INSN_P (insn))
43462 continue;
43464 df_simulate_one_insn_backwards (bb, insn, &live);
43466 if (GET_CODE (PATTERN (insn)) == USE
43467 || GET_CODE (PATTERN (insn)) == CLOBBER)
43468 continue;
43470 extract_insn (insn);
43471 constrain_operands_cached (insn, reload_completed);
43472 int opno0, opno1;
43473 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43474 recog_data.n_operands, &opno0,
43475 &opno1);
43476 if (modrm < 0
43477 || !ix86_rop_should_change_byte_p (modrm)
43478 || opno0 == opno1)
43479 continue;
43481 rtx oldreg = recog_data.operand[opno1];
43482 preprocess_constraints (insn);
43483 const operand_alternative *alt = which_op_alt ();
43485 int i;
43486 for (i = 0; i < recog_data.n_operands; i++)
43487 if (i != opno1
43488 && alt[i].earlyclobber
43489 && reg_overlap_mentioned_p (recog_data.operand[i],
43490 oldreg))
43491 break;
43493 if (i < recog_data.n_operands)
43494 continue;
43496 if (dump_file)
43497 fprintf (dump_file,
43498 "attempting to fix modrm byte in insn %d:"
43499 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43500 reg_class_names[alt[opno1].cl]);
43502 HARD_REG_SET unavailable;
43503 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43504 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43505 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43506 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43507 IOR_HARD_REG_SET (unavailable, output_risky);
43508 IOR_COMPL_HARD_REG_SET (unavailable,
43509 reg_class_contents[alt[opno1].cl]);
43511 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43512 if (!TEST_HARD_REG_BIT (unavailable, i))
43513 break;
43514 if (i == FIRST_PSEUDO_REGISTER)
43516 if (dump_file)
43517 fprintf (dump_file, ", none available\n");
43518 continue;
43520 if (dump_file)
43521 fprintf (dump_file, " -> %d\n", i);
43522 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43523 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43524 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
43529 /* Implement machine specific optimizations. We implement padding of returns
43530 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
43531 static void
43532 ix86_reorg (void)
43534 /* We are freeing block_for_insn in the toplev to keep compatibility
43535 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43536 compute_bb_for_insn ();
43538 if (flag_mitigate_rop)
43539 ix86_mitigate_rop ();
43541 if (TARGET_SEH && current_function_has_exception_handlers ())
43542 ix86_seh_fixup_eh_fallthru ();
43544 if (optimize && optimize_function_for_speed_p (cfun))
43546 if (TARGET_PAD_SHORT_FUNCTION)
43547 ix86_pad_short_function ();
43548 else if (TARGET_PAD_RETURNS)
43549 ix86_pad_returns ();
43550 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43551 if (TARGET_FOUR_JUMP_LIMIT)
43552 ix86_avoid_jump_mispredicts ();
43553 #endif
43557 /* Return nonzero when a QImode register that must be represented via a REX
43558 prefix is used. */
43559 bool
43560 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43562 int i;
43563 extract_insn_cached (insn);
43564 for (i = 0; i < recog_data.n_operands; i++)
43565 if (GENERAL_REG_P (recog_data.operand[i])
43566 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43567 return true;
43568 return false;
43571 /* Return true when INSN mentions a register that must be encoded using a REX
43572 prefix. */
43573 bool
43574 x86_extended_reg_mentioned_p (rtx insn)
43576 subrtx_iterator::array_type array;
43577 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43579 const_rtx x = *iter;
43580 if (REG_P (x)
43581 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43582 return true;
43584 return false;
43587 /* If profitable, negate (without causing overflow) integer constant
43588 of mode MODE at location LOC. Return true in this case. */
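/* Illustrative case for the immediate-size rationale below: with
   val == 128, "addl $128, %eax" needs a 32-bit immediate because imm8
   is sign-extended (range -128..127), whereas the negated form
   "subl $-128, %eax" fits in a single immediate byte; conversely -128
   is never negated, since 128 would not fit in imm8.  */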
43589 bool
43590 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43592 HOST_WIDE_INT val;
43594 if (!CONST_INT_P (*loc))
43595 return false;
43597 switch (mode)
43599 case E_DImode:
43600 /* DImode x86_64 constants must fit in 32 bits. */
43601 gcc_assert (x86_64_immediate_operand (*loc, mode));
43603 mode = SImode;
43604 break;
43606 case E_SImode:
43607 case E_HImode:
43608 case E_QImode:
43609 break;
43611 default:
43612 gcc_unreachable ();
43615 /* Avoid overflows. */
43616 if (mode_signbit_p (mode, *loc))
43617 return false;
43619 val = INTVAL (*loc);
43621 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
43622 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
43623 if ((val < 0 && val != -128)
43624 || val == 128)
43626 *loc = GEN_INT (-val);
43627 return true;
43630 return false;
43633 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43634 optabs would emit if we didn't have TFmode patterns. */
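/* A scalar model of the emitted sequence, assuming a 64-bit unsigned
   input (a sketch of the semantics, not code generated here):

     double floatuns64 (unsigned long long x)
     {
       if ((long long) x >= 0)
         return (double) (long long) x;               -- sign bit clear
       unsigned long long half = (x >> 1) | (x & 1);  -- halve, keep LSB
       double f = (double) (long long) half;
       return f + f;                                  -- scale back up
     }

   The low bit is kept when halving so that the halved value rounds the
   same way the full value would.  */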
43636 void
43637 x86_emit_floatuns (rtx operands[2])
43639 rtx_code_label *neglab, *donelab;
43640 rtx i0, i1, f0, in, out;
43641 machine_mode mode, inmode;
43643 inmode = GET_MODE (operands[1]);
43644 gcc_assert (inmode == SImode || inmode == DImode);
43646 out = operands[0];
43647 in = force_reg (inmode, operands[1]);
43648 mode = GET_MODE (out);
43649 neglab = gen_label_rtx ();
43650 donelab = gen_label_rtx ();
43651 f0 = gen_reg_rtx (mode);
43653 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43655 expand_float (out, in, 0);
43657 emit_jump_insn (gen_jump (donelab));
43658 emit_barrier ();
43660 emit_label (neglab);
43662 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43663 1, OPTAB_DIRECT);
43664 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43665 1, OPTAB_DIRECT);
43666 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43668 expand_float (f0, i0, 0);
43670 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43672 emit_label (donelab);
43675 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43676 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43677 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43678 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43680 /* Get a vector mode of the same size as the original but with elements
43681 twice as wide. This is only guaranteed to apply to integral vectors. */
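/* For example, V16QImode maps to V8HImode: the same 16-byte vector
   size, half as many elements, each element twice as wide.  */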
43683 static inline machine_mode
43684 get_mode_wider_vector (machine_mode o)
43686 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43687 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
43688 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43689 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43690 return n;
43693 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43694 fill target with val via vec_duplicate. */
43696 static bool
43697 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43699 bool ok;
43700 rtx_insn *insn;
43701 rtx dup;
43703 /* First attempt to recognize VAL as-is. */
43704 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43705 insn = emit_insn (gen_rtx_SET (target, dup));
43706 if (recog_memoized (insn) < 0)
43708 rtx_insn *seq;
43709 machine_mode innermode = GET_MODE_INNER (mode);
43710 rtx reg;
43712 /* If that fails, force VAL into a register. */
43714 start_sequence ();
43715 reg = force_reg (innermode, val);
43716 if (GET_MODE (reg) != innermode)
43717 reg = gen_lowpart (innermode, reg);
43718 XEXP (dup, 0) = reg;
43719 seq = get_insns ();
43720 end_sequence ();
43721 if (seq)
43722 emit_insn_before (seq, insn);
43724 ok = recog_memoized (insn) >= 0;
43725 gcc_assert (ok);
43727 return true;
43730 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43731 with all elements equal to VAR. Return true if successful. */
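/* An illustrative walk-through of the "widen" strategy used for narrow
   modes below (hypothetical value): to fill a V8QImode vector with the
   byte 0xAB, the byte is first replicated into an HImode word,
   (0xAB << 8) | 0xAB == 0xABAB, and the function recurses on V4HImode,
   repeating until a mode with a usable duplicate or permute pattern is
   reached.  */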
43733 static bool
43734 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43735 rtx target, rtx val)
43737 bool ok;
43739 switch (mode)
43741 case E_V2SImode:
43742 case E_V2SFmode:
43743 if (!mmx_ok)
43744 return false;
43745 /* FALLTHRU */
43747 case E_V4DFmode:
43748 case E_V4DImode:
43749 case E_V8SFmode:
43750 case E_V8SImode:
43751 case E_V2DFmode:
43752 case E_V2DImode:
43753 case E_V4SFmode:
43754 case E_V4SImode:
43755 case E_V16SImode:
43756 case E_V8DImode:
43757 case E_V16SFmode:
43758 case E_V8DFmode:
43759 return ix86_vector_duplicate_value (mode, target, val);
43761 case E_V4HImode:
43762 if (!mmx_ok)
43763 return false;
43764 if (TARGET_SSE || TARGET_3DNOW_A)
43766 rtx x;
43768 val = gen_lowpart (SImode, val);
43769 x = gen_rtx_TRUNCATE (HImode, val);
43770 x = gen_rtx_VEC_DUPLICATE (mode, x);
43771 emit_insn (gen_rtx_SET (target, x));
43772 return true;
43774 goto widen;
43776 case E_V8QImode:
43777 if (!mmx_ok)
43778 return false;
43779 goto widen;
43781 case E_V8HImode:
43782 if (TARGET_AVX2)
43783 return ix86_vector_duplicate_value (mode, target, val);
43785 if (TARGET_SSE2)
43787 struct expand_vec_perm_d dperm;
43788 rtx tmp1, tmp2;
43790 permute:
43791 memset (&dperm, 0, sizeof (dperm));
43792 dperm.target = target;
43793 dperm.vmode = mode;
43794 dperm.nelt = GET_MODE_NUNITS (mode);
43795 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43796 dperm.one_operand_p = true;
43798 /* Extend to SImode using a paradoxical SUBREG. */
43799 tmp1 = gen_reg_rtx (SImode);
43800 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43802 /* Insert the SImode value as low element of a V4SImode vector. */
43803 tmp2 = gen_reg_rtx (V4SImode);
43804 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43805 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43807 ok = (expand_vec_perm_1 (&dperm)
43808 || expand_vec_perm_broadcast_1 (&dperm));
43809 gcc_assert (ok);
43810 return ok;
43812 goto widen;
43814 case E_V16QImode:
43815 if (TARGET_AVX2)
43816 return ix86_vector_duplicate_value (mode, target, val);
43818 if (TARGET_SSE2)
43819 goto permute;
43820 goto widen;
43822 widen:
43823 /* Replicate the value once into the next wider mode and recurse. */
43825 machine_mode smode, wsmode, wvmode;
43826 rtx x;
43828 smode = GET_MODE_INNER (mode);
43829 wvmode = get_mode_wider_vector (mode);
43830 wsmode = GET_MODE_INNER (wvmode);
43832 val = convert_modes (wsmode, smode, val, true);
43833 x = expand_simple_binop (wsmode, ASHIFT, val,
43834 GEN_INT (GET_MODE_BITSIZE (smode)),
43835 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43836 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43838 x = gen_reg_rtx (wvmode);
43839 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43840 gcc_assert (ok);
43841 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43842 return ok;
43845 case E_V16HImode:
43846 case E_V32QImode:
43847 if (TARGET_AVX2)
43848 return ix86_vector_duplicate_value (mode, target, val);
43849 else
43851 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43852 rtx x = gen_reg_rtx (hvmode);
43854 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43855 gcc_assert (ok);
43857 x = gen_rtx_VEC_CONCAT (mode, x, x);
43858 emit_insn (gen_rtx_SET (target, x));
43860 return true;
43862 case E_V64QImode:
43863 case E_V32HImode:
43864 if (TARGET_AVX512BW)
43865 return ix86_vector_duplicate_value (mode, target, val);
43866 else
43868 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43869 rtx x = gen_reg_rtx (hvmode);
43871 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43872 gcc_assert (ok);
43874 x = gen_rtx_VEC_CONCAT (mode, x, x);
43875 emit_insn (gen_rtx_SET (target, x));
43877 return true;
43879 default:
43880 return false;
43884 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43885 whose ONE_VAR element is VAR, and other elements are zero. Return true
43886 if successful. */
43888 static bool
43889 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
43890 rtx target, rtx var, int one_var)
43892 machine_mode vsimode;
43893 rtx new_target;
43894 rtx x, tmp;
43895 bool use_vector_set = false;
43897 switch (mode)
43899 case E_V2DImode:
43900 /* For SSE4.1, we normally use vector set. But if the second
43901 element is zero and inter-unit moves are OK, we use movq
43902 instead. */
43903 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
43904 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
43905 && one_var == 0));
43906 break;
43907 case E_V16QImode:
43908 case E_V4SImode:
43909 case E_V4SFmode:
43910 use_vector_set = TARGET_SSE4_1;
43911 break;
43912 case E_V8HImode:
43913 use_vector_set = TARGET_SSE2;
43914 break;
43915 case E_V4HImode:
43916 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
43917 break;
43918 case E_V32QImode:
43919 case E_V16HImode:
43920 case E_V8SImode:
43921 case E_V8SFmode:
43922 case E_V4DFmode:
43923 use_vector_set = TARGET_AVX;
43924 break;
43925 case E_V4DImode:
43926 /* Use ix86_expand_vector_set in 64bit mode only. */
43927 use_vector_set = TARGET_AVX && TARGET_64BIT;
43928 break;
43929 default:
43930 break;
43933 if (use_vector_set)
43935 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43936 var = force_reg (GET_MODE_INNER (mode), var);
43937 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43938 return true;
43941 switch (mode)
43943 case E_V2SFmode:
43944 case E_V2SImode:
43945 if (!mmx_ok)
43946 return false;
43947 /* FALLTHRU */
43949 case E_V2DFmode:
43950 case E_V2DImode:
43951 if (one_var != 0)
43952 return false;
43953 var = force_reg (GET_MODE_INNER (mode), var);
43954 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43955 emit_insn (gen_rtx_SET (target, x));
43956 return true;
43958 case E_V4SFmode:
43959 case E_V4SImode:
43960 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43961 new_target = gen_reg_rtx (mode);
43962 else
43963 new_target = target;
43964 var = force_reg (GET_MODE_INNER (mode), var);
43965 x = gen_rtx_VEC_DUPLICATE (mode, var);
43966 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43967 emit_insn (gen_rtx_SET (new_target, x));
43968 if (one_var != 0)
43970 /* We need to shuffle the value to the correct position, so
43971 create a new pseudo to store the intermediate result. */
43973 /* With SSE2, we can use the integer shuffle insns. */
43974 if (mode != V4SFmode && TARGET_SSE2)
43976 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43977 const1_rtx,
43978 GEN_INT (one_var == 1 ? 0 : 1),
43979 GEN_INT (one_var == 2 ? 0 : 1),
43980 GEN_INT (one_var == 3 ? 0 : 1)));
43981 if (target != new_target)
43982 emit_move_insn (target, new_target);
43983 return true;
43986 /* Otherwise convert the intermediate result to V4SFmode and
43987 use the SSE1 shuffle instructions. */
43988 if (mode != V4SFmode)
43990 tmp = gen_reg_rtx (V4SFmode);
43991 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43993 else
43994 tmp = new_target;
43996 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43997 const1_rtx,
43998 GEN_INT (one_var == 1 ? 0 : 1),
43999 GEN_INT (one_var == 2 ? 0+4 : 1+4),
44000 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
44002 if (mode != V4SFmode)
44003 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
44004 else if (tmp != target)
44005 emit_move_insn (target, tmp);
44007 else if (target != new_target)
44008 emit_move_insn (target, new_target);
44009 return true;
44011 case E_V8HImode:
44012 case E_V16QImode:
44013 vsimode = V4SImode;
44014 goto widen;
44015 case E_V4HImode:
44016 case E_V8QImode:
44017 if (!mmx_ok)
44018 return false;
44019 vsimode = V2SImode;
44020 goto widen;
44021 widen:
44022 if (one_var != 0)
44023 return false;
44025 /* Zero extend the variable element to SImode and recurse. */
44026 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
44028 x = gen_reg_rtx (vsimode);
44029 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
44030 var, one_var))
44031 gcc_unreachable ();
44033 emit_move_insn (target, gen_lowpart (mode, x));
44034 return true;
44036 default:
44037 return false;
44041 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
44042 consisting of the values in VALS. It is known that all elements
44043 except ONE_VAR are constants. Return true if successful. */
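/* An illustrative case of the QImode "widen" path below: for a
   V16QImode vector whose only variable element is at index 5, the
   variable byte is combined with the constant byte at index 4 into a
   single HImode value (the variable byte in the high half, since 5 is
   odd), and the problem reduces to setting element 5 >> 1 == 2 of a
   V8HImode vector.  */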
44045 static bool
44046 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
44047 rtx target, rtx vals, int one_var)
44049 rtx var = XVECEXP (vals, 0, one_var);
44050 machine_mode wmode;
44051 rtx const_vec, x;
44053 const_vec = copy_rtx (vals);
44054 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
44055 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
44057 switch (mode)
44059 case E_V2DFmode:
44060 case E_V2DImode:
44061 case E_V2SFmode:
44062 case E_V2SImode:
44063 /* For the two element vectors, it's just as easy to use
44064 the general case. */
44065 return false;
44067 case E_V4DImode:
44068 /* Use ix86_expand_vector_set in 64bit mode only. */
44069 if (!TARGET_64BIT)
44070 return false;
44071 /* FALLTHRU */
44072 case E_V4DFmode:
44073 case E_V8SFmode:
44074 case E_V8SImode:
44075 case E_V16HImode:
44076 case E_V32QImode:
44077 case E_V4SFmode:
44078 case E_V4SImode:
44079 case E_V8HImode:
44080 case E_V4HImode:
44081 break;
44083 case E_V16QImode:
44084 if (TARGET_SSE4_1)
44085 break;
44086 wmode = V8HImode;
44087 goto widen;
44088 case E_V8QImode:
44089 wmode = V4HImode;
44090 goto widen;
44091 widen:
44092 /* There's no way to set one QImode entry easily. Combine
44093 the variable value with its adjacent constant value, and
44094 promote to an HImode set. */
44095 x = XVECEXP (vals, 0, one_var ^ 1);
44096 if (one_var & 1)
44098 var = convert_modes (HImode, QImode, var, true);
44099 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
44100 NULL_RTX, 1, OPTAB_LIB_WIDEN);
44101 x = GEN_INT (INTVAL (x) & 0xff);
44103 else
44105 var = convert_modes (HImode, QImode, var, true);
44106 x = gen_int_mode (INTVAL (x) << 8, HImode);
44108 if (x != const0_rtx)
44109 var = expand_simple_binop (HImode, IOR, var, x, var,
44110 1, OPTAB_LIB_WIDEN);
44112 x = gen_reg_rtx (wmode);
44113 emit_move_insn (x, gen_lowpart (wmode, const_vec));
44114 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
44116 emit_move_insn (target, gen_lowpart (mode, x));
44117 return true;
44119 default:
44120 return false;
44123 emit_move_insn (target, const_vec);
44124 ix86_expand_vector_set (mmx_ok, target, var, one_var);
44125 return true;
44128 /* A subroutine of ix86_expand_vector_init_general. Use vector
44129 concatenate to handle the most general case: all values variable,
44130 and none identical. */
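/* An illustrative recursion for n == 8 and V8SFmode: the eight scalar
   operands are first paired into four V2SFmode registers, those are
   concatenated into two V4SFmode halves, and a final VEC_CONCAT forms
   the V8SFmode result.  */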
44132 static void
44133 ix86_expand_vector_init_concat (machine_mode mode,
44134 rtx target, rtx *ops, int n)
44136 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
44137 rtx first[16], second[8], third[4];
44138 rtvec v;
44139 int i, j;
44141 switch (n)
44143 case 2:
44144 switch (mode)
44146 case E_V16SImode:
44147 cmode = V8SImode;
44148 break;
44149 case E_V16SFmode:
44150 cmode = V8SFmode;
44151 break;
44152 case E_V8DImode:
44153 cmode = V4DImode;
44154 break;
44155 case E_V8DFmode:
44156 cmode = V4DFmode;
44157 break;
44158 case E_V8SImode:
44159 cmode = V4SImode;
44160 break;
44161 case E_V8SFmode:
44162 cmode = V4SFmode;
44163 break;
44164 case E_V4DImode:
44165 cmode = V2DImode;
44166 break;
44167 case E_V4DFmode:
44168 cmode = V2DFmode;
44169 break;
44170 case E_V4SImode:
44171 cmode = V2SImode;
44172 break;
44173 case E_V4SFmode:
44174 cmode = V2SFmode;
44175 break;
44176 case E_V2DImode:
44177 cmode = DImode;
44178 break;
44179 case E_V2SImode:
44180 cmode = SImode;
44181 break;
44182 case E_V2DFmode:
44183 cmode = DFmode;
44184 break;
44185 case E_V2SFmode:
44186 cmode = SFmode;
44187 break;
44188 default:
44189 gcc_unreachable ();
44192 if (!register_operand (ops[1], cmode))
44193 ops[1] = force_reg (cmode, ops[1]);
44194 if (!register_operand (ops[0], cmode))
44195 ops[0] = force_reg (cmode, ops[0]);
44196 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
44197 ops[1])));
44198 break;
44200 case 4:
44201 switch (mode)
44203 case E_V4DImode:
44204 cmode = V2DImode;
44205 break;
44206 case E_V4DFmode:
44207 cmode = V2DFmode;
44208 break;
44209 case E_V4SImode:
44210 cmode = V2SImode;
44211 break;
44212 case E_V4SFmode:
44213 cmode = V2SFmode;
44214 break;
44215 default:
44216 gcc_unreachable ();
44218 goto half;
44220 case 8:
44221 switch (mode)
44223 case E_V8DImode:
44224 cmode = V2DImode;
44225 hmode = V4DImode;
44226 break;
44227 case E_V8DFmode:
44228 cmode = V2DFmode;
44229 hmode = V4DFmode;
44230 break;
44231 case E_V8SImode:
44232 cmode = V2SImode;
44233 hmode = V4SImode;
44234 break;
44235 case E_V8SFmode:
44236 cmode = V2SFmode;
44237 hmode = V4SFmode;
44238 break;
44239 default:
44240 gcc_unreachable ();
44242 goto half;
44244 case 16:
44245 switch (mode)
44247 case E_V16SImode:
44248 cmode = V2SImode;
44249 hmode = V4SImode;
44250 gmode = V8SImode;
44251 break;
44252 case E_V16SFmode:
44253 cmode = V2SFmode;
44254 hmode = V4SFmode;
44255 gmode = V8SFmode;
44256 break;
44257 default:
44258 gcc_unreachable ();
44260 goto half;
44262 half:
44263 /* FIXME: We process inputs backward to help RA. PR 36222. */
44264 i = n - 1;
44265 j = (n >> 1) - 1;
44266 for (; i > 0; i -= 2, j--)
44268 first[j] = gen_reg_rtx (cmode);
44269 v = gen_rtvec (2, ops[i - 1], ops[i]);
44270 ix86_expand_vector_init (false, first[j],
44271 gen_rtx_PARALLEL (cmode, v));
44274 n >>= 1;
44275 if (n > 4)
44277 gcc_assert (hmode != VOIDmode);
44278 gcc_assert (gmode != VOIDmode);
44279 for (i = j = 0; i < n; i += 2, j++)
44281 second[j] = gen_reg_rtx (hmode);
44282 ix86_expand_vector_init_concat (hmode, second [j],
44283 &first [i], 2);
44285 n >>= 1;
44286 for (i = j = 0; i < n; i += 2, j++)
44288 third[j] = gen_reg_rtx (gmode);
44289 ix86_expand_vector_init_concat (gmode, third[j],
44290 &second[i], 2);
44292 n >>= 1;
44293 ix86_expand_vector_init_concat (mode, target, third, n);
44295 else if (n > 2)
44297 gcc_assert (hmode != VOIDmode);
44298 for (i = j = 0; i < n; i += 2, j++)
44300 second[j] = gen_reg_rtx (hmode);
44301 ix86_expand_vector_init_concat (hmode, second [j],
44302 &first [i], 2);
44304 n >>= 1;
44305 ix86_expand_vector_init_concat (mode, target, second, n);
44307 else
44308 ix86_expand_vector_init_concat (mode, target, first, n);
44309 break;
44311 default:
44312 gcc_unreachable ();
44316 /* A subroutine of ix86_expand_vector_init_general. Use vector
44317 interleave to handle the most general case: all values variable,
44318 and none identical. */
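/* An illustrative flow for V8HImode: each pair of adjacent HImode
   values is packed into the low 32 bits of its own vector (one element
   via the paradoxical SUBREG trick, its neighbor via the vec_set
   pattern), the resulting V4SImode views are interleaved low pairwise,
   and a final V2DImode interleave-low merges the two halves into the
   V8HImode target.  */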
44320 static void
44321 ix86_expand_vector_init_interleave (machine_mode mode,
44322 rtx target, rtx *ops, int n)
44324 machine_mode first_imode, second_imode, third_imode, inner_mode;
44325 int i, j;
44326 rtx op0, op1;
44327 rtx (*gen_load_even) (rtx, rtx, rtx);
44328 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
44329 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
44331 switch (mode)
44333 case E_V8HImode:
44334 gen_load_even = gen_vec_setv8hi;
44335 gen_interleave_first_low = gen_vec_interleave_lowv4si;
44336 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44337 inner_mode = HImode;
44338 first_imode = V4SImode;
44339 second_imode = V2DImode;
44340 third_imode = VOIDmode;
44341 break;
44342 case E_V16QImode:
44343 gen_load_even = gen_vec_setv16qi;
44344 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
44345 gen_interleave_second_low = gen_vec_interleave_lowv4si;
44346 inner_mode = QImode;
44347 first_imode = V8HImode;
44348 second_imode = V4SImode;
44349 third_imode = V2DImode;
44350 break;
44351 default:
44352 gcc_unreachable ();
44355 for (i = 0; i < n; i++)
44357 /* Extend the odd element to SImode using a paradoxical SUBREG. */
44358 op0 = gen_reg_rtx (SImode);
44359 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
44361 /* Insert the SImode value as low element of V4SImode vector. */
44362 op1 = gen_reg_rtx (V4SImode);
44363 op0 = gen_rtx_VEC_MERGE (V4SImode,
44364 gen_rtx_VEC_DUPLICATE (V4SImode,
44365 op0),
44366 CONST0_RTX (V4SImode),
44367 const1_rtx);
44368 emit_insn (gen_rtx_SET (op1, op0));
44370 /* Cast the V4SImode vector back to a vector in the original mode. */
44371 op0 = gen_reg_rtx (mode);
44372 emit_move_insn (op0, gen_lowpart (mode, op1));
44374 /* Load even elements into the second position. */
44375 emit_insn (gen_load_even (op0,
44376 force_reg (inner_mode,
44377 ops [i + i + 1]),
44378 const1_rtx));
44380 /* Cast vector to FIRST_IMODE vector. */
44381 ops[i] = gen_reg_rtx (first_imode);
44382 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44385 /* Interleave low FIRST_IMODE vectors. */
44386 for (i = j = 0; i < n; i += 2, j++)
44388 op0 = gen_reg_rtx (first_imode);
44389 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44391 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44392 ops[j] = gen_reg_rtx (second_imode);
44393 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44396 /* Interleave low SECOND_IMODE vectors. */
44397 switch (second_imode)
44399 case E_V4SImode:
44400 for (i = j = 0; i < n / 2; i += 2, j++)
44402 op0 = gen_reg_rtx (second_imode);
44403 emit_insn (gen_interleave_second_low (op0, ops[i],
44404 ops[i + 1]));
44406 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44407 vector. */
44408 ops[j] = gen_reg_rtx (third_imode);
44409 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44411 second_imode = V2DImode;
44412 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44413 /* FALLTHRU */
44415 case E_V2DImode:
44416 op0 = gen_reg_rtx (second_imode);
44417 emit_insn (gen_interleave_second_low (op0, ops[0],
44418 ops[1]));
44420 /* Cast the SECOND_IMODE vector back to a vector in the original
44421 mode. */
44422 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44423 break;
44425 default:
44426 gcc_unreachable ();
44430 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44431 all values variable, and none identical. */
44433 static void
44434 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44435 rtx target, rtx vals)
44437 rtx ops[64], op0, op1, op2, op3, op4, op5;
44438 machine_mode half_mode = VOIDmode;
44439 machine_mode quarter_mode = VOIDmode;
44440 int n, i;
44442 switch (mode)
44444 case E_V2SFmode:
44445 case E_V2SImode:
44446 if (!mmx_ok && !TARGET_SSE)
44447 break;
44448 /* FALLTHRU */
44450 case E_V16SImode:
44451 case E_V16SFmode:
44452 case E_V8DFmode:
44453 case E_V8DImode:
44454 case E_V8SFmode:
44455 case E_V8SImode:
44456 case E_V4DFmode:
44457 case E_V4DImode:
44458 case E_V4SFmode:
44459 case E_V4SImode:
44460 case E_V2DFmode:
44461 case E_V2DImode:
44462 n = GET_MODE_NUNITS (mode);
44463 for (i = 0; i < n; i++)
44464 ops[i] = XVECEXP (vals, 0, i);
44465 ix86_expand_vector_init_concat (mode, target, ops, n);
44466 return;
44468 case E_V2TImode:
44469 for (i = 0; i < 2; i++)
44470 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44471 op0 = gen_reg_rtx (V4DImode);
44472 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
44473 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44474 return;
44476 case E_V4TImode:
44477 for (i = 0; i < 4; i++)
44478 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44479 ops[4] = gen_reg_rtx (V4DImode);
44480 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
44481 ops[5] = gen_reg_rtx (V4DImode);
44482 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
44483 op0 = gen_reg_rtx (V8DImode);
44484 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
44485 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44486 return;
44488 case E_V32QImode:
44489 half_mode = V16QImode;
44490 goto half;
44492 case E_V16HImode:
44493 half_mode = V8HImode;
44494 goto half;
44496 half:
44497 n = GET_MODE_NUNITS (mode);
44498 for (i = 0; i < n; i++)
44499 ops[i] = XVECEXP (vals, 0, i);
44500 op0 = gen_reg_rtx (half_mode);
44501 op1 = gen_reg_rtx (half_mode);
44502 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44503 n >> 2);
44504 ix86_expand_vector_init_interleave (half_mode, op1,
44505 &ops [n >> 1], n >> 2);
44506 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44507 return;
44509 case E_V64QImode:
44510 quarter_mode = V16QImode;
44511 half_mode = V32QImode;
44512 goto quarter;
44514 case E_V32HImode:
44515 quarter_mode = V8HImode;
44516 half_mode = V16HImode;
44517 goto quarter;
44519 quarter:
44520 n = GET_MODE_NUNITS (mode);
44521 for (i = 0; i < n; i++)
44522 ops[i] = XVECEXP (vals, 0, i);
44523 op0 = gen_reg_rtx (quarter_mode);
44524 op1 = gen_reg_rtx (quarter_mode);
44525 op2 = gen_reg_rtx (quarter_mode);
44526 op3 = gen_reg_rtx (quarter_mode);
44527 op4 = gen_reg_rtx (half_mode);
44528 op5 = gen_reg_rtx (half_mode);
44529 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44530 n >> 3);
44531 ix86_expand_vector_init_interleave (quarter_mode, op1,
44532 &ops [n >> 2], n >> 3);
44533 ix86_expand_vector_init_interleave (quarter_mode, op2,
44534 &ops [n >> 1], n >> 3);
44535 ix86_expand_vector_init_interleave (quarter_mode, op3,
44536 &ops [(n >> 1) | (n >> 2)], n >> 3);
44537 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44538 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44539 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44540 return;
44542 case E_V16QImode:
44543 if (!TARGET_SSE4_1)
44544 break;
44545 /* FALLTHRU */
44547 case E_V8HImode:
44548 if (!TARGET_SSE2)
44549 break;
44551 /* Don't use ix86_expand_vector_init_interleave if we can't
44552 move from GPR to SSE register directly. */
44553 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44554 break;
44556 n = GET_MODE_NUNITS (mode);
44557 for (i = 0; i < n; i++)
44558 ops[i] = XVECEXP (vals, 0, i);
44559 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44560 return;
44562 case E_V4HImode:
44563 case E_V8QImode:
44564 break;
44566 default:
44567 gcc_unreachable ();
44571 int i, j, n_elts, n_words, n_elt_per_word;
44572 machine_mode inner_mode;
44573 rtx words[4], shift;
44575 inner_mode = GET_MODE_INNER (mode);
44576 n_elts = GET_MODE_NUNITS (mode);
44577 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44578 n_elt_per_word = n_elts / n_words;
44579 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
44581 for (i = 0; i < n_words; ++i)
44583 rtx word = NULL_RTX;
44585 for (j = 0; j < n_elt_per_word; ++j)
44587 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44588 elt = convert_modes (word_mode, inner_mode, elt, true);
44590 if (j == 0)
44591 word = elt;
44592 else
44594 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44595 word, 1, OPTAB_LIB_WIDEN);
44596 word = expand_simple_binop (word_mode, IOR, word, elt,
44597 word, 1, OPTAB_LIB_WIDEN);
44601 words[i] = word;
44604 if (n_words == 1)
44605 emit_move_insn (target, gen_lowpart (mode, words[0]));
44606 else if (n_words == 2)
44608 rtx tmp = gen_reg_rtx (mode);
44609 emit_clobber (tmp);
44610 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44611 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44612 emit_move_insn (target, tmp);
44614 else if (n_words == 4)
44616 rtx tmp = gen_reg_rtx (V4SImode);
44617 gcc_assert (word_mode == SImode);
44618 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44619 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44620 emit_move_insn (target, gen_lowpart (mode, tmp));
44622 else
44623 gcc_unreachable ();
44627 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44628 instructions unless MMX_OK is true. */
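/* A minimal usage sketch (hypothetical caller, not taken from this
   file): building a V4SImode vector from four SImode rtxes OP0..OP3:

     rtvec v = gen_rtvec (4, op0, op1, op2, op3);
     ix86_expand_vector_init (false, target, gen_rtx_PARALLEL (V4SImode, v));
*/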
44630 void
44631 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44633 machine_mode mode = GET_MODE (target);
44634 machine_mode inner_mode = GET_MODE_INNER (mode);
44635 int n_elts = GET_MODE_NUNITS (mode);
44636 int n_var = 0, one_var = -1;
44637 bool all_same = true, all_const_zero = true;
44638 int i;
44639 rtx x;
44641 /* Handle first initialization from vector elts. */
44642 if (n_elts != XVECLEN (vals, 0))
44644 rtx subtarget = target;
44645 x = XVECEXP (vals, 0, 0);
44646 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
44647 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
44649 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
44650 if (inner_mode == QImode || inner_mode == HImode)
44652 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
44653 mode = mode_for_vector (SImode, n_bits / 4).require ();
44654 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
44655 ops[0] = gen_lowpart (inner_mode, ops[0]);
44656 ops[1] = gen_lowpart (inner_mode, ops[1]);
44657 subtarget = gen_reg_rtx (mode);
44659 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
44660 if (subtarget != target)
44661 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
44662 return;
44664 gcc_unreachable ();
44667 for (i = 0; i < n_elts; ++i)
44669 x = XVECEXP (vals, 0, i);
44670 if (!(CONST_SCALAR_INT_P (x)
44671 || CONST_DOUBLE_P (x)
44672 || CONST_FIXED_P (x)))
44673 n_var++, one_var = i;
44674 else if (x != CONST0_RTX (inner_mode))
44675 all_const_zero = false;
44676 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44677 all_same = false;
44680 /* Constants are best loaded from the constant pool. */
44681 if (n_var == 0)
44683 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44684 return;
44687 /* If all values are identical, broadcast the value. */
44688 if (all_same
44689 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44690 XVECEXP (vals, 0, 0)))
44691 return;
44693 /* Values where only one field is non-constant are best loaded from
44694 the pool and overwritten via move later. */
44695 if (n_var == 1)
44697 if (all_const_zero
44698 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44699 XVECEXP (vals, 0, one_var),
44700 one_var))
44701 return;
44703 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44704 return;
44707 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44710 void
44711 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44713 machine_mode mode = GET_MODE (target);
44714 machine_mode inner_mode = GET_MODE_INNER (mode);
44715 machine_mode half_mode;
44716 bool use_vec_merge = false;
44717 rtx tmp;
44718 static rtx (*gen_extract[6][2]) (rtx, rtx)
44720 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44721 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44722 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44723 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44724 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44725 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44727 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44729 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44730 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44731 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44732 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44733 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44734 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44736 int i, j, n;
44737 machine_mode mmode = VOIDmode;
44738 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44740 switch (mode)
44742 case E_V2SFmode:
44743 case E_V2SImode:
44744 if (mmx_ok)
44746 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44747 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44748 if (elt == 0)
44749 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44750 else
44751 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44752 emit_insn (gen_rtx_SET (target, tmp));
44753 return;
44755 break;
44757 case E_V2DImode:
44758 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44759 if (use_vec_merge)
44760 break;
44762 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44763 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44764 if (elt == 0)
44765 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44766 else
44767 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44768 emit_insn (gen_rtx_SET (target, tmp));
44769 return;
44771 case E_V2DFmode:
44773 rtx op0, op1;
44775 /* For the two element vectors, we implement a VEC_CONCAT with
44776 the extraction of the other element. */
44778 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44779 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44781 if (elt == 0)
44782 op0 = val, op1 = tmp;
44783 else
44784 op0 = tmp, op1 = val;
44786 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44787 emit_insn (gen_rtx_SET (target, tmp));
44789 return;
44791 case E_V4SFmode:
44792 use_vec_merge = TARGET_SSE4_1;
44793 if (use_vec_merge)
44794 break;
44796 switch (elt)
44798 case 0:
44799 use_vec_merge = true;
44800 break;
44802 case 1:
44803 /* tmp = target = A B C D */
44804 tmp = copy_to_reg (target);
44805 /* target = A A B B */
44806 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44807 /* target = X A B B */
44808 ix86_expand_vector_set (false, target, val, 0);
44809 /* target = A X C D */
44810 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44811 const1_rtx, const0_rtx,
44812 GEN_INT (2+4), GEN_INT (3+4)));
44813 return;
44815 case 2:
44816 /* tmp = target = A B C D */
44817 tmp = copy_to_reg (target);
44818 /* tmp = X B C D */
44819 ix86_expand_vector_set (false, tmp, val, 0);
44820 /* target = A B X D */
44821 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44822 const0_rtx, const1_rtx,
44823 GEN_INT (0+4), GEN_INT (3+4)));
44824 return;
44826 case 3:
44827 /* tmp = target = A B C D */
44828 tmp = copy_to_reg (target);
44829 /* tmp = X B C D */
44830 ix86_expand_vector_set (false, tmp, val, 0);
44831 /* target = A B C X */
44832 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44833 const0_rtx, const1_rtx,
44834 GEN_INT (2+4), GEN_INT (0+4)));
44835 return;
44837 default:
44838 gcc_unreachable ();
44840 break;
44842 case E_V4SImode:
44843 use_vec_merge = TARGET_SSE4_1;
44844 if (use_vec_merge)
44845 break;
44847 /* Element 0 handled by vec_merge below. */
44848 if (elt == 0)
44850 use_vec_merge = true;
44851 break;
44854 if (TARGET_SSE2)
44856 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44857 store into element 0, then shuffle them back. */
44859 rtx order[4];
44861 order[0] = GEN_INT (elt);
44862 order[1] = const1_rtx;
44863 order[2] = const2_rtx;
44864 order[3] = GEN_INT (3);
44865 order[elt] = const0_rtx;
44867 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44868 order[1], order[2], order[3]));
44870 ix86_expand_vector_set (false, target, val, 0);
44872 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44873 order[1], order[2], order[3]));
44875 else
44877 /* For SSE1, we have to reuse the V4SF code. */
44878 rtx t = gen_reg_rtx (V4SFmode);
44879 emit_move_insn (t, gen_lowpart (V4SFmode, target));
44880 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
44881 emit_move_insn (target, gen_lowpart (mode, t));
44883 return;
44885 case E_V8HImode:
44886 use_vec_merge = TARGET_SSE2;
44887 break;
44888 case E_V4HImode:
44889 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44890 break;
44892 case E_V16QImode:
44893 use_vec_merge = TARGET_SSE4_1;
44894 break;
44896 case E_V8QImode:
44897 break;
44899 case E_V32QImode:
44900 half_mode = V16QImode;
44901 j = 0;
44902 n = 16;
44903 goto half;
44905 case E_V16HImode:
44906 half_mode = V8HImode;
44907 j = 1;
44908 n = 8;
44909 goto half;
44911 case E_V8SImode:
44912 half_mode = V4SImode;
44913 j = 2;
44914 n = 4;
44915 goto half;
44917 case E_V4DImode:
44918 half_mode = V2DImode;
44919 j = 3;
44920 n = 2;
44921 goto half;
44923 case E_V8SFmode:
44924 half_mode = V4SFmode;
44925 j = 4;
44926 n = 4;
44927 goto half;
44929 case E_V4DFmode:
44930 half_mode = V2DFmode;
44931 j = 5;
44932 n = 2;
44933 goto half;
44935 half:
44936 /* Compute offset. */
44937 i = elt / n;
44938 elt %= n;
44940 gcc_assert (i <= 1);
44942 /* Extract the half. */
44943 tmp = gen_reg_rtx (half_mode);
44944 emit_insn (gen_extract[j][i] (tmp, target));
44946 /* Put val in tmp at elt. */
44947 ix86_expand_vector_set (false, tmp, val, elt);
44949 /* Put it back. */
44950 emit_insn (gen_insert[j][i] (target, target, tmp));
44951 return;
44953 case E_V8DFmode:
44954 if (TARGET_AVX512F)
44956 mmode = QImode;
44957 gen_blendm = gen_avx512f_blendmv8df;
44959 break;
44961 case E_V8DImode:
44962 if (TARGET_AVX512F)
44964 mmode = QImode;
44965 gen_blendm = gen_avx512f_blendmv8di;
44967 break;
44969 case E_V16SFmode:
44970 if (TARGET_AVX512F)
44972 mmode = HImode;
44973 gen_blendm = gen_avx512f_blendmv16sf;
44975 break;
44977 case E_V16SImode:
44978 if (TARGET_AVX512F)
44980 mmode = HImode;
44981 gen_blendm = gen_avx512f_blendmv16si;
44983 break;
44985 case E_V32HImode:
44986 if (TARGET_AVX512F && TARGET_AVX512BW)
44988 mmode = SImode;
44989 gen_blendm = gen_avx512bw_blendmv32hi;
44991 break;
44993 case E_V64QImode:
44994 if (TARGET_AVX512F && TARGET_AVX512BW)
44996 mmode = DImode;
44997 gen_blendm = gen_avx512bw_blendmv64qi;
44999 break;
45001 default:
45002 break;
45005 if (mmode != VOIDmode)
45007 tmp = gen_reg_rtx (mode);
45008 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
45009 /* The avx512*_blendm<mode> expanders have different operand order
45010 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
45011 elements where the mask is set and second input operand otherwise,
45012 in {sse,avx}*_*blend* the first input operand is used for elements
45013 where the mask is clear and second input operand otherwise. */
45014 emit_insn (gen_blendm (target, target, tmp,
45015 force_reg (mmode,
45016 gen_int_mode (1 << elt, mmode))));
45018 else if (use_vec_merge)
45020 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
45021 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
45022 emit_insn (gen_rtx_SET (target, tmp));
45024 else
45026 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
45028 emit_move_insn (mem, target);
45030 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
45031 emit_move_insn (tmp, val);
45033 emit_move_insn (target, mem);
45037 void
45038 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
45040 machine_mode mode = GET_MODE (vec);
45041 machine_mode inner_mode = GET_MODE_INNER (mode);
45042 bool use_vec_extr = false;
45043 rtx tmp;
45045 switch (mode)
45047 case E_V2SImode:
45048 case E_V2SFmode:
45049 if (!mmx_ok)
45050 break;
45051 /* FALLTHRU */
45053 case E_V2DFmode:
45054 case E_V2DImode:
45055 case E_V2TImode:
45056 case E_V4TImode:
45057 use_vec_extr = true;
45058 break;
45060 case E_V4SFmode:
45061 use_vec_extr = TARGET_SSE4_1;
45062 if (use_vec_extr)
45063 break;
45065 switch (elt)
45067 case 0:
45068 tmp = vec;
45069 break;
45071 case 1:
45072 case 3:
45073 tmp = gen_reg_rtx (mode);
45074 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
45075 GEN_INT (elt), GEN_INT (elt),
45076 GEN_INT (elt+4), GEN_INT (elt+4)));
45077 break;
45079 case 2:
45080 tmp = gen_reg_rtx (mode);
45081 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
45082 break;
45084 default:
45085 gcc_unreachable ();
45087 vec = tmp;
45088 use_vec_extr = true;
45089 elt = 0;
45090 break;
45092 case E_V4SImode:
45093 use_vec_extr = TARGET_SSE4_1;
45094 if (use_vec_extr)
45095 break;
45097 if (TARGET_SSE2)
45099 switch (elt)
45101 case 0:
45102 tmp = vec;
45103 break;
45105 case 1:
45106 case 3:
45107 tmp = gen_reg_rtx (mode);
45108 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
45109 GEN_INT (elt), GEN_INT (elt),
45110 GEN_INT (elt), GEN_INT (elt)));
45111 break;
45113 case 2:
45114 tmp = gen_reg_rtx (mode);
45115 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
45116 break;
45118 default:
45119 gcc_unreachable ();
45121 vec = tmp;
45122 use_vec_extr = true;
45123 elt = 0;
45125 else
45127 /* For SSE1, we have to reuse the V4SF code. */
45128 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
45129 gen_lowpart (V4SFmode, vec), elt);
45130 return;
45132 break;
45134 case E_V8HImode:
45135 use_vec_extr = TARGET_SSE2;
45136 break;
45137 case E_V4HImode:
45138 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
45139 break;
45141 case E_V16QImode:
45142 use_vec_extr = TARGET_SSE4_1;
45143 break;
45145 case E_V8SFmode:
45146 if (TARGET_AVX)
45148 tmp = gen_reg_rtx (V4SFmode);
45149 if (elt < 4)
45150 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
45151 else
45152 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
45153 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45154 return;
45156 break;
45158 case E_V4DFmode:
45159 if (TARGET_AVX)
45161 tmp = gen_reg_rtx (V2DFmode);
45162 if (elt < 2)
45163 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
45164 else
45165 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
45166 ix86_expand_vector_extract (false, target, tmp, elt & 1);
45167 return;
45169 break;
45171 case E_V32QImode:
45172 if (TARGET_AVX)
45174 tmp = gen_reg_rtx (V16QImode);
45175 if (elt < 16)
45176 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
45177 else
45178 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
45179 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45180 return;
45182 break;
45184 case E_V16HImode:
45185 if (TARGET_AVX)
45187 tmp = gen_reg_rtx (V8HImode);
45188 if (elt < 8)
45189 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
45190 else
45191 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
45192 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45193 return;
45195 break;
45197 case E_V8SImode:
45198 if (TARGET_AVX)
45200 tmp = gen_reg_rtx (V4SImode);
45201 if (elt < 4)
45202 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
45203 else
45204 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
45205 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45206 return;
45208 break;
45210 case E_V4DImode:
45211 if (TARGET_AVX)
45213 tmp = gen_reg_rtx (V2DImode);
45214 if (elt < 2)
45215 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
45216 else
45217 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
45218 ix86_expand_vector_extract (false, target, tmp, elt & 1);
45219 return;
45221 break;
45223 case E_V32HImode:
45224 if (TARGET_AVX512BW)
45226 tmp = gen_reg_rtx (V16HImode);
45227 if (elt < 16)
45228 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
45229 else
45230 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
45231 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45232 return;
45234 break;
45236 case E_V64QImode:
45237 if (TARGET_AVX512BW)
45239 tmp = gen_reg_rtx (V32QImode);
45240 if (elt < 32)
45241 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
45242 else
45243 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
45244 ix86_expand_vector_extract (false, target, tmp, elt & 31);
45245 return;
45247 break;
45249 case E_V16SFmode:
45250 tmp = gen_reg_rtx (V8SFmode);
45251 if (elt < 8)
45252 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
45253 else
45254 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
45255 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45256 return;
45258 case E_V8DFmode:
45259 tmp = gen_reg_rtx (V4DFmode);
45260 if (elt < 4)
45261 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
45262 else
45263 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
45264 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45265 return;
45267 case E_V16SImode:
45268 tmp = gen_reg_rtx (V8SImode);
45269 if (elt < 8)
45270 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
45271 else
45272 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
45273 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45274 return;
45276 case E_V8DImode:
45277 tmp = gen_reg_rtx (V4DImode);
45278 if (elt < 4)
45279 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
45280 else
45281 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
45282 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45283 return;
45285 case E_V8QImode:
45286 /* ??? Could extract the appropriate HImode element and shift. */
45287 default:
45288 break;
45291 if (use_vec_extr)
45293 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
45294 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
45296 /* Let the rtl optimizers know about the zero extension performed. */
45297 if (inner_mode == QImode || inner_mode == HImode)
45299 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
45300 target = gen_lowpart (SImode, target);
45303 emit_insn (gen_rtx_SET (target, tmp));
45305 else
45307 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
45309 emit_move_insn (mem, vec);
45311 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
45312 emit_move_insn (target, tmp);
45316 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
45317 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
45318 The upper bits of DEST are undefined, though they shouldn't cause
45319 exceptions (some bits from src or all zeros are ok). */
45321 static void
45322 emit_reduc_half (rtx dest, rtx src, int i)
45324 rtx tem, d = dest;
45325 switch (GET_MODE (src))
45327 case E_V4SFmode:
45328 if (i == 128)
45329 tem = gen_sse_movhlps (dest, src, src);
45330 else
45331 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
45332 GEN_INT (1 + 4), GEN_INT (1 + 4));
45333 break;
45334 case E_V2DFmode:
45335 tem = gen_vec_interleave_highv2df (dest, src, src);
45336 break;
45337 case E_V16QImode:
45338 case E_V8HImode:
45339 case E_V4SImode:
45340 case E_V2DImode:
45341 d = gen_reg_rtx (V1TImode);
45342 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
45343 GEN_INT (i / 2));
45344 break;
45345 case E_V8SFmode:
45346 if (i == 256)
45347 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
45348 else
45349 tem = gen_avx_shufps256 (dest, src, src,
45350 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
45351 break;
45352 case E_V4DFmode:
45353 if (i == 256)
45354 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
45355 else
45356 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
45357 break;
45358 case E_V32QImode:
45359 case E_V16HImode:
45360 case E_V8SImode:
45361 case E_V4DImode:
45362 if (i == 256)
45364 if (GET_MODE (dest) != V4DImode)
45365 d = gen_reg_rtx (V4DImode);
45366 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
45367 gen_lowpart (V4DImode, src),
45368 const1_rtx);
45370 else
45372 d = gen_reg_rtx (V2TImode);
45373 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
45374 GEN_INT (i / 2));
45376 break;
45377 case E_V64QImode:
45378 case E_V32HImode:
45379 case E_V16SImode:
45380 case E_V16SFmode:
45381 case E_V8DImode:
45382 case E_V8DFmode:
45383 if (i > 128)
45384 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
45385 gen_lowpart (V16SImode, src),
45386 gen_lowpart (V16SImode, src),
45387 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
45388 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
45389 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
45390 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
45391 GEN_INT (0xC), GEN_INT (0xD),
45392 GEN_INT (0xE), GEN_INT (0xF),
45393 GEN_INT (0x10), GEN_INT (0x11),
45394 GEN_INT (0x12), GEN_INT (0x13),
45395 GEN_INT (0x14), GEN_INT (0x15),
45396 GEN_INT (0x16), GEN_INT (0x17));
45397 else
45398 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
45399 gen_lowpart (V16SImode, src),
45400 GEN_INT (i == 128 ? 0x2 : 0x1),
45401 GEN_INT (0x3),
45402 GEN_INT (0x3),
45403 GEN_INT (0x3),
45404 GEN_INT (i == 128 ? 0x6 : 0x5),
45405 GEN_INT (0x7),
45406 GEN_INT (0x7),
45407 GEN_INT (0x7),
45408 GEN_INT (i == 128 ? 0xA : 0x9),
45409 GEN_INT (0xB),
45410 GEN_INT (0xB),
45411 GEN_INT (0xB),
45412 GEN_INT (i == 128 ? 0xE : 0xD),
45413 GEN_INT (0xF),
45414 GEN_INT (0xF),
45415 GEN_INT (0xF));
45416 break;
45417 default:
45418 gcc_unreachable ();
45420 emit_insn (tem);
45421 if (d != dest)
45422 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
45425 /* Expand a vector reduction. FN is the binary pattern to reduce;
45426 DEST is the destination; IN is the input vector. */
45428 void
45429 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45431 rtx half, dst, vec = in;
45432 machine_mode mode = GET_MODE (in);
45433 int i;
45435 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
45436 if (TARGET_SSE4_1
45437 && mode == V8HImode
45438 && fn == gen_uminv8hi3)
45440 emit_insn (gen_sse4_1_phminposuw (dest, in));
45441 return;
45444 for (i = GET_MODE_BITSIZE (mode);
45445 i > GET_MODE_UNIT_BITSIZE (mode);
45446 i >>= 1)
45448 half = gen_reg_rtx (mode);
45449 emit_reduc_half (half, vec, i);
45450 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45451 dst = dest;
45452 else
45453 dst = gen_reg_rtx (mode);
45454 emit_insn (fn (dst, half, vec));
45455 vec = dst;
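/* Illustrative sketch of the loop above (not emitted code): for a
   V8SImode maximum reduction (FN an smax-style pattern) the
   iterations use i = 256, 128 and 64, so in effect

     half = reduc_half (in, 256);   v0 = smax (in, half);
     half = reduc_half (v0, 128);   v1 = smax (v0, half);
     half = reduc_half (v1, 64);    dest = smax (v1, half);

   leaving the maximum of all eight elements in element 0 of DEST;
   the remaining elements are not meaningful.  */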
45459 /* Target hook for scalar_mode_supported_p. */
45460 static bool
45461 ix86_scalar_mode_supported_p (scalar_mode mode)
45463 if (DECIMAL_FLOAT_MODE_P (mode))
45464 return default_decimal_float_supported_p ();
45465 else if (mode == TFmode)
45466 return true;
45467 else
45468 return default_scalar_mode_supported_p (mode);
45471 /* Implements target hook vector_mode_supported_p. */
45472 static bool
45473 ix86_vector_mode_supported_p (machine_mode mode)
45475 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45476 return true;
45477 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45478 return true;
45479 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45480 return true;
45481 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45482 return true;
45483 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45484 return true;
45485 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45486 return true;
45487 return false;
45490 /* Target hook for c_mode_for_suffix. */
45491 static machine_mode
45492 ix86_c_mode_for_suffix (char suffix)
45494 if (suffix == 'q')
45495 return TFmode;
45496 if (suffix == 'w')
45497 return XFmode;
45499 return VOIDmode;
45502 /* Worker function for TARGET_MD_ASM_ADJUST.
45504 We implement asm flag outputs, and maintain source compatibility
45505 with the old cc0-based compiler. */
45507 static rtx_insn *
45508 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45509 vec<const char *> &constraints,
45510 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45512 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45513 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45515 bool saw_asm_flag = false;
45517 start_sequence ();
45518 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45520 const char *con = constraints[i];
45521 if (strncmp (con, "=@cc", 4) != 0)
45522 continue;
45523 con += 4;
45524 if (strchr (con, ',') != NULL)
45526 error ("alternatives not allowed in asm flag output");
45527 continue;
45530 bool invert = false;
45531 if (con[0] == 'n')
45532 invert = true, con++;
45534 machine_mode mode = CCmode;
45535 rtx_code code = UNKNOWN;
45537 switch (con[0])
45539 case 'a':
45540 if (con[1] == 0)
45541 mode = CCAmode, code = EQ;
45542 else if (con[1] == 'e' && con[2] == 0)
45543 mode = CCCmode, code = NE;
45544 break;
45545 case 'b':
45546 if (con[1] == 0)
45547 mode = CCCmode, code = EQ;
45548 else if (con[1] == 'e' && con[2] == 0)
45549 mode = CCAmode, code = NE;
45550 break;
45551 case 'c':
45552 if (con[1] == 0)
45553 mode = CCCmode, code = EQ;
45554 break;
45555 case 'e':
45556 if (con[1] == 0)
45557 mode = CCZmode, code = EQ;
45558 break;
45559 case 'g':
45560 if (con[1] == 0)
45561 mode = CCGCmode, code = GT;
45562 else if (con[1] == 'e' && con[2] == 0)
45563 mode = CCGCmode, code = GE;
45564 break;
45565 case 'l':
45566 if (con[1] == 0)
45567 mode = CCGCmode, code = LT;
45568 else if (con[1] == 'e' && con[2] == 0)
45569 mode = CCGCmode, code = LE;
45570 break;
45571 case 'o':
45572 if (con[1] == 0)
45573 mode = CCOmode, code = EQ;
45574 break;
45575 case 'p':
45576 if (con[1] == 0)
45577 mode = CCPmode, code = EQ;
45578 break;
45579 case 's':
45580 if (con[1] == 0)
45581 mode = CCSmode, code = EQ;
45582 break;
45583 case 'z':
45584 if (con[1] == 0)
45585 mode = CCZmode, code = EQ;
45586 break;
45588 if (code == UNKNOWN)
45590 error ("unknown asm flag output %qs", constraints[i]);
45591 continue;
45593 if (invert)
45594 code = reverse_condition (code);
45596 rtx dest = outputs[i];
45597 if (!saw_asm_flag)
45599 /* This is the first asm flag output. Here we put the flags
45600 register in as the real output and adjust the condition to
45601 allow it. */
45602 constraints[i] = "=Bf";
45603 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45604 saw_asm_flag = true;
45606 else
45608 /* We don't need the flags register as output twice. */
45609 constraints[i] = "=X";
45610 outputs[i] = gen_rtx_SCRATCH (SImode);
45613 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45614 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45616 machine_mode dest_mode = GET_MODE (dest);
45617 if (!SCALAR_INT_MODE_P (dest_mode))
45619 error ("invalid type for asm flag output");
45620 continue;
45623 if (dest_mode == DImode && !TARGET_64BIT)
45624 dest_mode = SImode;
45626 if (dest_mode != QImode)
45628 rtx destqi = gen_reg_rtx (QImode);
45629 emit_insn (gen_rtx_SET (destqi, x));
45631 if (TARGET_ZERO_EXTEND_WITH_AND
45632 && optimize_function_for_speed_p (cfun))
45634 x = force_reg (dest_mode, const0_rtx);
45636 emit_insn (gen_movstrictqi
45637 (gen_lowpart (QImode, x), destqi));
45639 else
45640 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45643 if (dest_mode != GET_MODE (dest))
45645 rtx tmp = gen_reg_rtx (SImode);
45647 emit_insn (gen_rtx_SET (tmp, x));
45648 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45650 else
45651 emit_insn (gen_rtx_SET (dest, x));
45653 rtx_insn *seq = get_insns ();
45654 end_sequence ();
45656 if (saw_asm_flag)
45657 return seq;
45658 else
45660 /* If we had no asm flag outputs, clobber the flags. */
45661 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45662 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45663 return NULL;
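/* Example of the asm flag output syntax handled above (user code,
   illustration only; dst, src and carry are placeholder names):

     int carry;
     asm ("addq %2, %0" : "+r" (dst), "=@ccc" (carry) : "r" (src));

   The "=@ccc" constraint is rewritten here into a comparison of the
   flags register in CCCmode, zero-extended into CARRY, so the asm
   body does not need an explicit setc/movzx.  */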
45667 /* Implements target vector targetm.asm.encode_section_info. */
45669 static void ATTRIBUTE_UNUSED
45670 ix86_encode_section_info (tree decl, rtx rtl, int first)
45672 default_encode_section_info (decl, rtl, first);
45674 if (ix86_in_large_data_p (decl))
45675 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45678 /* Worker function for REVERSE_CONDITION. */
45680 enum rtx_code
45681 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45683 return (mode != CCFPmode && mode != CCFPUmode
45684 ? reverse_condition (code)
45685 : reverse_condition_maybe_unordered (code));
45688 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45689 to OPERANDS[0]. */
45691 const char *
45692 output_387_reg_move (rtx_insn *insn, rtx *operands)
45694 if (REG_P (operands[0]))
45696 if (REG_P (operands[1])
45697 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45699 if (REGNO (operands[0]) == FIRST_STACK_REG)
45700 return output_387_ffreep (operands, 0);
45701 return "fstp\t%y0";
45703 if (STACK_TOP_P (operands[0]))
45704 return "fld%Z1\t%y1";
45705 return "fst\t%y0";
45707 else if (MEM_P (operands[0]))
45709 gcc_assert (REG_P (operands[1]));
45710 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45711 return "fstp%Z0\t%y0";
45712 else
45714 /* There is no non-popping store to memory for XFmode.
45715 So if we need one, follow the store with a load. */
45716 if (GET_MODE (operands[0]) == XFmode)
45717 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45718 else
45719 return "fst%Z0\t%y0";
45722 else
45723 gcc_unreachable();
45726 /* Output code to perform a conditional jump to LABEL, if C2 flag in
45727 FP status register is set. */
45729 void
45730 ix86_emit_fp_unordered_jump (rtx label)
45732 rtx reg = gen_reg_rtx (HImode);
45733 rtx temp;
45735 emit_insn (gen_x86_fnstsw_1 (reg));
45737 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45739 emit_insn (gen_x86_sahf_1 (reg));
45741 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45742 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45744 else
45746 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
45748 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45749 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45752 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45753 gen_rtx_LABEL_REF (VOIDmode, label),
45754 pc_rtx);
45755 temp = gen_rtx_SET (pc_rtx, temp);
45757 emit_jump_insn (temp);
45758 predict_jump (REG_BR_PROB_BASE * 10 / 100);
45761 /* Output code to perform a log1p XFmode calculation. */
45763 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45765 rtx_code_label *label1 = gen_label_rtx ();
45766 rtx_code_label *label2 = gen_label_rtx ();
45768 rtx tmp = gen_reg_rtx (XFmode);
45769 rtx tmp2 = gen_reg_rtx (XFmode);
45770 rtx test;
45772 emit_insn (gen_absxf2 (tmp, op1));
45773 test = gen_rtx_GE (VOIDmode, tmp,
45774 const_double_from_real_value (
45775 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45776 XFmode));
45777 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45779 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45780 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45781 emit_jump (label2);
45783 emit_label (label1);
45784 emit_move_insn (tmp, CONST1_RTX (XFmode));
45785 emit_insn (gen_addxf3 (tmp, op1, tmp));
45786 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45787 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45789 emit_label (label2);
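/* Explanatory note (illustration only): the threshold above is
   1 - sqrt(2)/2 ~= 0.29289, which is the documented argument range of
   the x87 fyl2xp1 instruction.  For |op1| below it we compute
   op0 = ln(2) * log2 (1 + op1) directly with fyl2xp1, which keeps
   precision for small op1; otherwise we form 1 + op1 explicitly and
   use fyl2x instead.  */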
45792 /* Emit code for round calculation. */
45793 void ix86_emit_i387_round (rtx op0, rtx op1)
45795 machine_mode inmode = GET_MODE (op1);
45796 machine_mode outmode = GET_MODE (op0);
45797 rtx e1, e2, res, tmp, tmp1, half;
45798 rtx scratch = gen_reg_rtx (HImode);
45799 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45800 rtx_code_label *jump_label = gen_label_rtx ();
45801 rtx insn;
45802 rtx (*gen_abs) (rtx, rtx);
45803 rtx (*gen_neg) (rtx, rtx);
45805 switch (inmode)
45807 case E_SFmode:
45808 gen_abs = gen_abssf2;
45809 break;
45810 case E_DFmode:
45811 gen_abs = gen_absdf2;
45812 break;
45813 case E_XFmode:
45814 gen_abs = gen_absxf2;
45815 break;
45816 default:
45817 gcc_unreachable ();
45820 switch (outmode)
45822 case E_SFmode:
45823 gen_neg = gen_negsf2;
45824 break;
45825 case E_DFmode:
45826 gen_neg = gen_negdf2;
45827 break;
45828 case E_XFmode:
45829 gen_neg = gen_negxf2;
45830 break;
45831 case E_HImode:
45832 gen_neg = gen_neghi2;
45833 break;
45834 case E_SImode:
45835 gen_neg = gen_negsi2;
45836 break;
45837 case E_DImode:
45838 gen_neg = gen_negdi2;
45839 break;
45840 default:
45841 gcc_unreachable ();
45844 e1 = gen_reg_rtx (inmode);
45845 e2 = gen_reg_rtx (inmode);
45846 res = gen_reg_rtx (outmode);
45848 half = const_double_from_real_value (dconsthalf, inmode);
45850 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
45852 /* scratch = fxam(op1) */
45853 emit_insn (gen_rtx_SET (scratch,
45854 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45855 UNSPEC_FXAM)));
45856 /* e1 = fabs(op1) */
45857 emit_insn (gen_abs (e1, op1));
45859 /* e2 = e1 + 0.5 */
45860 half = force_reg (inmode, half);
45861 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45863 /* res = floor(e2) */
45864 if (inmode != XFmode)
45866 tmp1 = gen_reg_rtx (XFmode);
45868 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45870 else
45871 tmp1 = e2;
45873 switch (outmode)
45875 case E_SFmode:
45876 case E_DFmode:
45878 rtx tmp0 = gen_reg_rtx (XFmode);
45880 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45882 emit_insn (gen_rtx_SET (res,
45883 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45884 UNSPEC_TRUNC_NOOP)));
45886 break;
45887 case E_XFmode:
45888 emit_insn (gen_frndintxf2_floor (res, tmp1));
45889 break;
45890 case E_HImode:
45891 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45892 break;
45893 case E_SImode:
45894 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45895 break;
45896 case E_DImode:
45897 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45898 break;
45899 default:
45900 gcc_unreachable ();
45903 /* flags = signbit(a) */
45904 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45906 /* if (flags) then res = -res */
45907 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45908 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45909 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45910 pc_rtx);
45911 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45912 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45913 JUMP_LABEL (insn) = jump_label;
45915 emit_insn (gen_neg (res, res));
45917 emit_label (jump_label);
45918 LABEL_NUSES (jump_label) = 1;
45920 emit_move_insn (op0, res);
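/* Worked example of the sequence above (illustration only): for
   op1 = -2.5 we get e1 = fabs (-2.5) = 2.5, e2 = 3.0 and
   res = floor (3.0) = 3; fxam reported a negative sign (C1 set,
   tested with the 0x02 mask above), so res is negated to -3,
   i.e. halfway cases round away from zero as expected for round.  */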
45923 /* Output code to perform a Newton-Raphson approximation of a single precision
45924 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45926 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45928 rtx x0, x1, e0, e1;
45930 x0 = gen_reg_rtx (mode);
45931 e0 = gen_reg_rtx (mode);
45932 e1 = gen_reg_rtx (mode);
45933 x1 = gen_reg_rtx (mode);
45935 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
45937 b = force_reg (mode, b);
45939 /* x0 = rcp(b) estimate */
45940 if (mode == V16SFmode || mode == V8DFmode)
45942 if (TARGET_AVX512ER)
45944 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45945 UNSPEC_RCP28)));
45946 /* res = a * x0 */
45947 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45948 return;
45950 else
45951 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45952 UNSPEC_RCP14)));
45954 else
45955 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45956 UNSPEC_RCP)));
45958 /* e0 = x0 * b */
45959 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45961 /* e0 = x0 * e0 */
45962 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45964 /* e1 = x0 + x0 */
45965 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45967 /* x1 = e1 - e0 */
45968 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45970 /* res = a * x1 */
45971 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
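/* Explanatory note (illustration only): the sequence above is one
   Newton-Raphson step for the reciprocal.  With x0 ~= 1/b, solving
   f(x) = 1/x - b = 0 gives

     x1 = x0 - f(x0)/f'(x0) = 2*x0 - b*x0*x0 = (x0 + x0) - (b*x0)*x0

   which is exactly the e1 - e0 computed above, roughly doubling the
   number of correct bits of the hardware rcp estimate.  */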
45974 /* Output code to perform a Newton-Raphson approximation of a
45975 single precision floating point [reciprocal] square root. */
45977 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45979 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45980 REAL_VALUE_TYPE r;
45981 int unspec;
45983 x0 = gen_reg_rtx (mode);
45984 e0 = gen_reg_rtx (mode);
45985 e1 = gen_reg_rtx (mode);
45986 e2 = gen_reg_rtx (mode);
45987 e3 = gen_reg_rtx (mode);
45989 if (TARGET_AVX512ER && mode == V16SFmode)
45991 if (recip)
45992 /* res = rsqrt28(a) estimate */
45993 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45994 UNSPEC_RSQRT28)));
45995 else
45997 /* x0 = rsqrt28(a) estimate */
45998 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45999 UNSPEC_RSQRT28)));
46000 /* res = rcp28(x0) estimate */
46001 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
46002 UNSPEC_RCP28)));
46004 return;
46007 real_from_integer (&r, VOIDmode, -3, SIGNED);
46008 mthree = const_double_from_real_value (r, SFmode);
46010 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
46011 mhalf = const_double_from_real_value (r, SFmode);
46012 unspec = UNSPEC_RSQRT;
46014 if (VECTOR_MODE_P (mode))
46016 mthree = ix86_build_const_vector (mode, true, mthree);
46017 mhalf = ix86_build_const_vector (mode, true, mhalf);
46018 /* There is no 512-bit rsqrt. There is however rsqrt14. */
46019 if (GET_MODE_SIZE (mode) == 64)
46020 unspec = UNSPEC_RSQRT14;
46023 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
46024 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
46026 a = force_reg (mode, a);
46028 /* x0 = rsqrt(a) estimate */
46029 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
46030 unspec)));
46032 /* If a == 0.0, replace the infinite rsqrt estimate with zero to prevent NaN for sqrt (0.0). */
46033 if (!recip)
46035 rtx zero = force_reg (mode, CONST0_RTX(mode));
46036 rtx mask;
46038 /* Handle masked compare. */
46039 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
46041 mask = gen_reg_rtx (HImode);
46042 /* Imm value 0x4 corresponds to not-equal comparison. */
46043 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
46044 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
46046 else
46048 mask = gen_reg_rtx (mode);
46049 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
46050 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
46054 /* e0 = x0 * a */
46055 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
46056 /* e1 = e0 * x0 */
46057 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
46059 /* e2 = e1 - 3. */
46060 mthree = force_reg (mode, mthree);
46061 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
46063 mhalf = force_reg (mode, mhalf);
46064 if (recip)
46065 /* e3 = -.5 * x0 */
46066 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
46067 else
46068 /* e3 = -.5 * e0 */
46069 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
46070 /* ret = e2 * e3 */
46071 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
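/* Explanatory note (illustration only): the sequence above is one
   Newton-Raphson step for 1/sqrt(a).  Starting from x0 ~= rsqrt(a),

     x1 = x0 * (3 - a*x0*x0) / 2 = -0.5 * x0 * (a*x0*x0 - 3)

   and sqrt(a) is then a * x1 = -0.5 * (a*x0) * (a*x0*x0 - 3); these
   are the e3 * e2 products computed above for the recip and
   non-recip cases respectively.  */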
46074 #ifdef TARGET_SOLARIS
46075 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
46077 static void
46078 i386_solaris_elf_named_section (const char *name, unsigned int flags,
46079 tree decl)
46081 /* With Binutils 2.15, the "@unwind" marker must be specified on
46082 every occurrence of the ".eh_frame" section, not just the first
46083 one. */
46084 if (TARGET_64BIT
46085 && strcmp (name, ".eh_frame") == 0)
46087 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
46088 flags & SECTION_WRITE ? "aw" : "a");
46089 return;
46092 #ifndef USE_GAS
46093 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
46095 solaris_elf_asm_comdat_section (name, flags, decl);
46096 return;
46098 #endif
46100 default_elf_asm_named_section (name, flags, decl);
46102 #endif /* TARGET_SOLARIS */
46104 /* Return the mangling of TYPE if it is an extended fundamental type. */
46106 static const char *
46107 ix86_mangle_type (const_tree type)
46109 type = TYPE_MAIN_VARIANT (type);
46111 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
46112 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
46113 return NULL;
46115 switch (TYPE_MODE (type))
46117 case E_TFmode:
46118 /* __float128 is "g". */
46119 return "g";
46120 case E_XFmode:
46121 /* "long double" or __float80 is "e". */
46122 return "e";
46123 default:
46124 return NULL;
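/* Example (illustration only): under the Itanium C++ ABI these
   manglings make  void f (__float128);  come out as _Z1fg and
   void f (long double);  as _Z1fe.  */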
46128 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
46130 static tree
46131 ix86_stack_protect_guard (void)
46133 if (TARGET_SSP_TLS_GUARD)
46135 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
46136 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
46137 tree type = build_qualified_type (type_node, qual);
46138 tree t;
46140 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
46142 t = ix86_tls_stack_chk_guard_decl;
46144 if (t == NULL)
46146 rtx x;
46148 t = build_decl
46149 (UNKNOWN_LOCATION, VAR_DECL,
46150 get_identifier (ix86_stack_protector_guard_symbol_str),
46151 type);
46152 TREE_STATIC (t) = 1;
46153 TREE_PUBLIC (t) = 1;
46154 DECL_EXTERNAL (t) = 1;
46155 TREE_USED (t) = 1;
46156 TREE_THIS_VOLATILE (t) = 1;
46157 DECL_ARTIFICIAL (t) = 1;
46158 DECL_IGNORED_P (t) = 1;
46160 /* Do not share RTL as the declaration is visible outside of
46161 current function. */
46162 x = DECL_RTL (t);
46163 RTX_FLAG (x, used) = 1;
46165 ix86_tls_stack_chk_guard_decl = t;
46168 else
46170 tree asptrtype = build_pointer_type (type);
46172 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
46173 t = build2 (MEM_REF, asptrtype, t,
46174 build_int_cst (asptrtype, 0));
46177 return t;
46180 return default_stack_protect_guard ();
46183 /* For 32-bit code we can save PIC register setup by using
46184 __stack_chk_fail_local hidden function instead of calling
46185 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
46186 register, so it is better to call __stack_chk_fail directly. */
46188 static tree ATTRIBUTE_UNUSED
46189 ix86_stack_protect_fail (void)
46191 return TARGET_64BIT
46192 ? default_external_stack_protect_fail ()
46193 : default_hidden_stack_protect_fail ();
46196 /* Select a format to encode pointers in exception handling data. CODE
46197 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
46198 true if the symbol may be affected by dynamic relocations.
46200 ??? All x86 object file formats are capable of representing this.
46201 After all, the relocation needed is the same as for the call insn.
46202 Whether or not a particular assembler allows us to enter such, I
46203 guess we'll have to see. */
46205 asm_preferred_eh_data_format (int code, int global)
46207 if (flag_pic)
46209 int type = DW_EH_PE_sdata8;
46210 if (!TARGET_64BIT
46211 || ix86_cmodel == CM_SMALL_PIC
46212 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
46213 type = DW_EH_PE_sdata4;
46214 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
46216 if (ix86_cmodel == CM_SMALL
46217 || (ix86_cmodel == CM_MEDIUM && code))
46218 return DW_EH_PE_udata4;
46219 return DW_EH_PE_absptr;
46222 /* Expand copysign from SIGN to the positive value ABS_VALUE
46223 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
46224 the sign-bit. */
46225 static void
46226 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
46228 machine_mode mode = GET_MODE (sign);
46229 rtx sgn = gen_reg_rtx (mode);
46230 if (mask == NULL_RTX)
46232 machine_mode vmode;
46234 if (mode == SFmode)
46235 vmode = V4SFmode;
46236 else if (mode == DFmode)
46237 vmode = V2DFmode;
46238 else
46239 vmode = mode;
46241 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
46242 if (!VECTOR_MODE_P (mode))
46244 /* We need to generate a scalar mode mask in this case. */
46245 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46246 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46247 mask = gen_reg_rtx (mode);
46248 emit_insn (gen_rtx_SET (mask, tmp));
46251 else
46252 mask = gen_rtx_NOT (mode, mask);
46253 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
46254 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
46257 /* Expand fabs (OP0) and return a new rtx that holds the result. The
46258 mask for masking out the sign-bit is stored in *SMASK, if that is
46259 non-null. */
46260 static rtx
46261 ix86_expand_sse_fabs (rtx op0, rtx *smask)
46263 machine_mode vmode, mode = GET_MODE (op0);
46264 rtx xa, mask;
46266 xa = gen_reg_rtx (mode);
46267 if (mode == SFmode)
46268 vmode = V4SFmode;
46269 else if (mode == DFmode)
46270 vmode = V2DFmode;
46271 else
46272 vmode = mode;
46273 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
46274 if (!VECTOR_MODE_P (mode))
46276 /* We need to generate a scalar mode mask in this case. */
46277 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46278 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46279 mask = gen_reg_rtx (mode);
46280 emit_insn (gen_rtx_SET (mask, tmp));
46282 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
46284 if (smask)
46285 *smask = mask;
46287 return xa;
46290 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
46291 swapping the operands if SWAP_OPERANDS is true. The expanded
46292 code is a forward jump to a newly created label in case the
46293 comparison is true. The generated label rtx is returned. */
46294 static rtx_code_label *
46295 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
46296 bool swap_operands)
46298 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
46299 rtx_code_label *label;
46300 rtx tmp;
46302 if (swap_operands)
46303 std::swap (op0, op1);
46305 label = gen_label_rtx ();
46306 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
46307 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
46308 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
46309 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
46310 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
46311 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
46312 JUMP_LABEL (tmp) = label;
46314 return label;
46317 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
46318 using comparison code CODE. Operands are swapped for the comparison if
46319 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
46320 static rtx
46321 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
46322 bool swap_operands)
46324 rtx (*insn)(rtx, rtx, rtx, rtx);
46325 machine_mode mode = GET_MODE (op0);
46326 rtx mask = gen_reg_rtx (mode);
46328 if (swap_operands)
46329 std::swap (op0, op1);
46331 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
46333 emit_insn (insn (mask, op0, op1,
46334 gen_rtx_fmt_ee (code, mode, op0, op1)));
46335 return mask;
46338 /* Generate and return a rtx of mode MODE for 2**n where n is the number
46339 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
46340 static rtx
46341 ix86_gen_TWO52 (machine_mode mode)
46343 REAL_VALUE_TYPE TWO52r;
46344 rtx TWO52;
46346 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
46347 TWO52 = const_double_from_real_value (TWO52r, mode);
46348 TWO52 = force_reg (mode, TWO52);
46350 return TWO52;
46353 /* Expand SSE sequence for computing lround from OP1 storing
46354 into OP0. */
46355 void
46356 ix86_expand_lround (rtx op0, rtx op1)
46358 /* C code for the stuff we're doing below:
46359 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
46360 return (long)tmp;
46362 machine_mode mode = GET_MODE (op1);
46363 const struct real_format *fmt;
46364 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46365 rtx adj;
46367 /* load nextafter (0.5, 0.0) */
46368 fmt = REAL_MODE_FORMAT (mode);
46369 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46370 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46372 /* adj = copysign (0.5, op1) */
46373 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
46374 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
46376 /* adj = op1 + adj */
46377 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
46379 /* op0 = (imode)adj */
46380 expand_fix (op0, adj, 0);
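/* Explanatory note (illustration only): plain 0.5 would be wrong
   here.  For the largest double just below 0.5, x + 0.5 rounds up to
   exactly 1.0 and the final truncation would return 1 instead of 0;
   adding copysign (nextafter (0.5, 0.0), x) keeps the magnitude of
   the sum strictly below 1.0, so expand_fix truncates correctly.  */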
46383 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
46384 into OPERAND0. */
46385 void
46386 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
46388 /* C code for the stuff we're doing below (for do_floor):
46389 xi = (long)op1;
46390 xi -= (double)xi > op1 ? 1 : 0;
46391 return xi;
46393 machine_mode fmode = GET_MODE (op1);
46394 machine_mode imode = GET_MODE (op0);
46395 rtx ireg, freg, tmp;
46396 rtx_code_label *label;
46398 /* reg = (long)op1 */
46399 ireg = gen_reg_rtx (imode);
46400 expand_fix (ireg, op1, 0);
46402 /* freg = (double)reg */
46403 freg = gen_reg_rtx (fmode);
46404 expand_float (freg, ireg, 0);
46406 /* ireg = (freg > op1) ? ireg - 1 : ireg */
46407 label = ix86_expand_sse_compare_and_jump (UNLE,
46408 freg, op1, !do_floor);
46409 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
46410 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
46411 emit_move_insn (ireg, tmp);
46413 emit_label (label);
46414 LABEL_NUSES (label) = 1;
46416 emit_move_insn (op0, ireg);
46419 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
46420 result in OPERAND0. */
46421 void
46422 ix86_expand_rint (rtx operand0, rtx operand1)
46424 /* C code for the stuff we're doing below:
46425 xa = fabs (operand1);
46426 if (!isless (xa, 2**52))
46427 return operand1;
46428 xa = xa + 2**52 - 2**52;
46429 return copysign (xa, operand1);
46431 machine_mode mode = GET_MODE (operand0);
46432 rtx res, xa, TWO52, mask;
46433 rtx_code_label *label;
46435 res = gen_reg_rtx (mode);
46436 emit_move_insn (res, operand1);
46438 /* xa = abs (operand1) */
46439 xa = ix86_expand_sse_fabs (res, &mask);
46441 /* if (!isless (xa, TWO52)) goto label; */
46442 TWO52 = ix86_gen_TWO52 (mode);
46443 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46445 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46446 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46448 ix86_sse_copysign_to_positive (res, xa, res, mask);
46450 emit_label (label);
46451 LABEL_NUSES (label) = 1;
46453 emit_move_insn (operand0, res);
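/* Explanatory note (illustration only): for |x| < 2**52 the sum
   xa + 2**52 lies in a binade whose spacing is 1.0, so the addition
   rounds away the fraction in the current rounding mode; subtracting
   2**52 again is exact and leaves rint (|x|), after which the
   original sign is copied back (so -0.0 stays -0.0).  */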
46456 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46457 into OPERAND0. */
46458 void
46459 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
46461 /* C code for the stuff we expand below.
46462 double xa = fabs (x), x2;
46463 if (!isless (xa, TWO52))
46464 return x;
46465 xa = xa + TWO52 - TWO52;
46466 x2 = copysign (xa, x);
46467 Compensate. Floor:
46468 if (x2 > x)
46469 x2 -= 1;
46470 Compensate. Ceil:
46471 if (x2 < x)
46472 x2 -= -1;
46473 return x2;
46475 machine_mode mode = GET_MODE (operand0);
46476 rtx xa, TWO52, tmp, one, res, mask;
46477 rtx_code_label *label;
46479 TWO52 = ix86_gen_TWO52 (mode);
46481 /* Temporary for holding the result, initialized to the input
46482 operand to ease control flow. */
46483 res = gen_reg_rtx (mode);
46484 emit_move_insn (res, operand1);
46486 /* xa = abs (operand1) */
46487 xa = ix86_expand_sse_fabs (res, &mask);
46489 /* if (!isless (xa, TWO52)) goto label; */
46490 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46492 /* xa = xa + TWO52 - TWO52; */
46493 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46494 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46496 /* xa = copysign (xa, operand1) */
46497 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46499 /* generate 1.0 or -1.0 */
46500 one = force_reg (mode,
46501 const_double_from_real_value (do_floor
46502 ? dconst1 : dconstm1, mode));
46504 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46505 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46506 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46507 /* We always need to subtract here to preserve signed zero. */
46508 tmp = expand_simple_binop (mode, MINUS,
46509 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46510 emit_move_insn (res, tmp);
46512 emit_label (label);
46513 LABEL_NUSES (label) = 1;
46515 emit_move_insn (operand0, res);
46518 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46519 into OPERAND0. */
46520 void
46521 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46523 /* C code for the stuff we expand below.
46524 double xa = fabs (x), x2;
46525 if (!isless (xa, TWO52))
46526 return x;
46527 x2 = (double)(long)x;
46528 Compensate. Floor:
46529 if (x2 > x)
46530 x2 -= 1;
46531 Compensate. Ceil:
46532 if (x2 < x)
46533 x2 += 1;
46534 if (HONOR_SIGNED_ZEROS (mode))
46535 return copysign (x2, x);
46536 return x2;
46538 machine_mode mode = GET_MODE (operand0);
46539 rtx xa, xi, TWO52, tmp, one, res, mask;
46540 rtx_code_label *label;
46542 TWO52 = ix86_gen_TWO52 (mode);
46544 /* Temporary for holding the result, initialized to the input
46545 operand to ease control flow. */
46546 res = gen_reg_rtx (mode);
46547 emit_move_insn (res, operand1);
46549 /* xa = abs (operand1) */
46550 xa = ix86_expand_sse_fabs (res, &mask);
46552 /* if (!isless (xa, TWO52)) goto label; */
46553 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46555 /* xa = (double)(long)x */
46556 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46557 expand_fix (xi, res, 0);
46558 expand_float (xa, xi, 0);
46560 /* generate 1.0 */
46561 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46563 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46564 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46565 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46566 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46567 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46568 emit_move_insn (res, tmp);
46570 if (HONOR_SIGNED_ZEROS (mode))
46571 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46573 emit_label (label);
46574 LABEL_NUSES (label) = 1;
46576 emit_move_insn (operand0, res);
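/* Worked example of the compensation above (illustration only): for
   floor (-1.5) the truncation gives x2 = -1.0; since -1.0 > -1.5 the
   mask selects 1.0 and x2 - 1.0 = -2.0.  For ceil (1.5) the compare
   operands are swapped, 1.5 > 1.0 selects 1.0 and x2 + 1.0 = 2.0.  */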
46579 /* Expand SSE sequence for computing round from OPERAND1 storing
46580 into OPERAND0. Sequence that works without relying on DImode truncation
46581 via cvttsd2siq that is only available on 64bit targets. */
46582 void
46583 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46585 /* C code for the stuff we expand below.
46586 double xa = fabs (x), xa2, x2;
46587 if (!isless (xa, TWO52))
46588 return x;
46589 Using the absolute value and copying back sign makes
46590 -0.0 -> -0.0 correct.
46591 xa2 = xa + TWO52 - TWO52;
46592 Compensate.
46593 dxa = xa2 - xa;
46594 if (dxa <= -0.5)
46595 xa2 += 1;
46596 else if (dxa > 0.5)
46597 xa2 -= 1;
46598 x2 = copysign (xa2, x);
46599 return x2;
46601 machine_mode mode = GET_MODE (operand0);
46602 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46603 rtx_code_label *label;
46605 TWO52 = ix86_gen_TWO52 (mode);
46607 /* Temporary for holding the result, initialized to the input
46608 operand to ease control flow. */
46609 res = gen_reg_rtx (mode);
46610 emit_move_insn (res, operand1);
46612 /* xa = abs (operand1) */
46613 xa = ix86_expand_sse_fabs (res, &mask);
46615 /* if (!isless (xa, TWO52)) goto label; */
46616 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46618 /* xa2 = xa + TWO52 - TWO52; */
46619 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46620 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46622 /* dxa = xa2 - xa; */
46623 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46625 /* generate 0.5, 1.0 and -0.5 */
46626 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46627 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46628 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46629 0, OPTAB_DIRECT);
46631 /* Compensate. */
46632 tmp = gen_reg_rtx (mode);
46633 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46634 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46635 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46636 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46637 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46638 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46639 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46640 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46642 /* res = copysign (xa2, operand1) */
46643 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46645 emit_label (label);
46646 LABEL_NUSES (label) = 1;
46648 emit_move_insn (operand0, res);
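/* Worked example (illustration only, default round-to-nearest-even):
   for x = 2.5 the halfway sum 2.5 + 2**52 rounds to an even integer,
   so xa2 = 2.0 and dxa = -0.5; the dxa <= -0.5 compensation bumps
   xa2 to 3.0 and copying the sign back yields round (2.5) = 3.0 and
   round (-2.5) = -3.0, i.e. halfway away from zero.  */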
46651 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46652 into OPERAND0. */
46653 void
46654 ix86_expand_trunc (rtx operand0, rtx operand1)
46656 /* C code for SSE variant we expand below.
46657 double xa = fabs (x), x2;
46658 if (!isless (xa, TWO52))
46659 return x;
46660 x2 = (double)(long)x;
46661 if (HONOR_SIGNED_ZEROS (mode))
46662 return copysign (x2, x);
46663 return x2;
46665 machine_mode mode = GET_MODE (operand0);
46666 rtx xa, xi, TWO52, res, mask;
46667 rtx_code_label *label;
46669 TWO52 = ix86_gen_TWO52 (mode);
46671 /* Temporary for holding the result, initialized to the input
46672 operand to ease control flow. */
46673 res = gen_reg_rtx (mode);
46674 emit_move_insn (res, operand1);
46676 /* xa = abs (operand1) */
46677 xa = ix86_expand_sse_fabs (res, &mask);
46679 /* if (!isless (xa, TWO52)) goto label; */
46680 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46682 /* x = (double)(long)x */
46683 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46684 expand_fix (xi, res, 0);
46685 expand_float (res, xi, 0);
46687 if (HONOR_SIGNED_ZEROS (mode))
46688 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46690 emit_label (label);
46691 LABEL_NUSES (label) = 1;
46693 emit_move_insn (operand0, res);
46696 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46697 into OPERAND0. */
46698 void
46699 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46701 machine_mode mode = GET_MODE (operand0);
46702 rtx xa, mask, TWO52, one, res, smask, tmp;
46703 rtx_code_label *label;
46705 /* C code for SSE variant we expand below.
46706 double xa = fabs (x), x2;
46707 if (!isless (xa, TWO52))
46708 return x;
46709 xa2 = xa + TWO52 - TWO52;
46710 Compensate:
46711 if (xa2 > xa)
46712 xa2 -= 1.0;
46713 x2 = copysign (xa2, x);
46714 return x2;
46717 TWO52 = ix86_gen_TWO52 (mode);
46719 /* Temporary for holding the result, initialized to the input
46720 operand to ease control flow. */
46721 res = gen_reg_rtx (mode);
46722 emit_move_insn (res, operand1);
46724 /* xa = abs (operand1) */
46725 xa = ix86_expand_sse_fabs (res, &smask);
46727 /* if (!isless (xa, TWO52)) goto label; */
46728 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46730 /* res = xa + TWO52 - TWO52; */
46731 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46732 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46733 emit_move_insn (res, tmp);
46735 /* generate 1.0 */
46736 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46738 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46739 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46740 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46741 tmp = expand_simple_binop (mode, MINUS,
46742 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46743 emit_move_insn (res, tmp);
46745 /* res = copysign (res, operand1) */
46746 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46748 emit_label (label);
46749 LABEL_NUSES (label) = 1;
46751 emit_move_insn (operand0, res);
46754 /* Expand SSE sequence for computing round from OPERAND1 storing
46755 into OPERAND0. */
46756 void
46757 ix86_expand_round (rtx operand0, rtx operand1)
46759 /* C code for the stuff we're doing below:
46760 double xa = fabs (x);
46761 if (!isless (xa, TWO52))
46762 return x;
46763 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46764 return copysign (xa, x);
46766 machine_mode mode = GET_MODE (operand0);
46767 rtx res, TWO52, xa, xi, half, mask;
46768 rtx_code_label *label;
46769 const struct real_format *fmt;
46770 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46772 /* Temporary for holding the result, initialized to the input
46773 operand to ease control flow. */
46774 res = gen_reg_rtx (mode);
46775 emit_move_insn (res, operand1);
46777 TWO52 = ix86_gen_TWO52 (mode);
46778 xa = ix86_expand_sse_fabs (res, &mask);
46779 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46781 /* load nextafter (0.5, 0.0) */
46782 fmt = REAL_MODE_FORMAT (mode);
46783 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46784 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46786 /* xa = xa + 0.5 */
46787 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46788 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46790 /* xa = (double)(int64_t)xa */
46791 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46792 expand_fix (xi, xa, 0);
46793 expand_float (xa, xi, 0);
46795 /* res = copysign (xa, operand1) */
46796 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46798 emit_label (label);
46799 LABEL_NUSES (label) = 1;
46801 emit_move_insn (operand0, res);
46804 /* Expand SSE sequence for computing round
46805 from OP1 storing into OP0 using sse4 round insn. */
46806 void
46807 ix86_expand_round_sse4 (rtx op0, rtx op1)
46809 machine_mode mode = GET_MODE (op0);
46810 rtx e1, e2, res, half;
46811 const struct real_format *fmt;
46812 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46813 rtx (*gen_copysign) (rtx, rtx, rtx);
46814 rtx (*gen_round) (rtx, rtx, rtx);
46816 switch (mode)
46818 case E_SFmode:
46819 gen_copysign = gen_copysignsf3;
46820 gen_round = gen_sse4_1_roundsf2;
46821 break;
46822 case E_DFmode:
46823 gen_copysign = gen_copysigndf3;
46824 gen_round = gen_sse4_1_rounddf2;
46825 break;
46826 default:
46827 gcc_unreachable ();
46830 /* round (a) = trunc (a + copysign (0.5, a)) */
46832 /* load nextafter (0.5, 0.0) */
46833 fmt = REAL_MODE_FORMAT (mode);
46834 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46835 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46836 half = const_double_from_real_value (pred_half, mode);
46838 /* e1 = copysign (0.5, op1) */
46839 e1 = gen_reg_rtx (mode);
46840 emit_insn (gen_copysign (e1, half, op1));
46842 /* e2 = op1 + e1 */
46843 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46845 /* res = trunc (e2) */
46846 res = gen_reg_rtx (mode);
46847 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46849 emit_move_insn (op0, res);
46853 /* Table of valid machine attributes. */
46854 static const struct attribute_spec ix86_attribute_table[] =
46856 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46857 affects_type_identity } */
46858 /* Stdcall attribute says callee is responsible for popping arguments
46859 if they are not variable. */
46860 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46861 true },
46862 /* Fastcall attribute says callee is responsible for popping arguments
46863 if they are not variable. */
46864 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46865 true },
46866 /* Thiscall attribute says callee is responsible for popping arguments
46867 if they are not variable. */
46868 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46869 true },
46870 /* Cdecl attribute says the callee is a normal C declaration */
46871 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46872 true },
46873 /* Regparm attribute specifies how many integer arguments are to be
46874 passed in registers. */
46875 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
46876 true },
46877 /* Sseregparm attribute says we are using x86_64 calling conventions
46878 for FP arguments. */
46879 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46880 true },
46881 /* The transactional memory builtins are implicitly regparm or fastcall
46882 depending on the ABI. Override the generic do-nothing attribute that
46883 these builtins were declared with. */
46884 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
46885 true },
46886 /* force_align_arg_pointer says this function realigns the stack at entry. */
46887 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46888 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
46889 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46890 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
46891 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
46892 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
46893 false },
46894 #endif
46895 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46896 false },
46897 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46898 false },
46899 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46900 SUBTARGET_ATTRIBUTE_TABLE,
46901 #endif
46902 /* ms_abi and sysv_abi calling convention function attributes. */
46903 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46904 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46905 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
46906 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
46907 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
46908 false },
46909 { "callee_pop_aggregate_return", 1, 1, false, true, true,
46910 ix86_handle_callee_pop_aggregate_return, true },
46911 { "interrupt", 0, 0, false, true, true,
46912 ix86_handle_interrupt_attribute, false },
46913 { "no_caller_saved_registers", 0, 0, false, true, true,
46914 ix86_handle_no_caller_saved_registers_attribute, false },
46915 { "naked", 0, 0, true, false, false,
46916 ix86_handle_fndecl_attribute, false },
46918 /* End element. */
46919 { NULL, 0, 0, false, false, false, NULL, false }
46922 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46923 static int
46924 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46925 tree vectype, int)
46927 switch (type_of_cost)
46929 case scalar_stmt:
46930 return ix86_cost->scalar_stmt_cost;
46932 case scalar_load:
46933 return ix86_cost->scalar_load_cost;
46935 case scalar_store:
46936 return ix86_cost->scalar_store_cost;
46938 case vector_stmt:
46939 return ix86_cost->vec_stmt_cost;
46941 case vector_load:
46942 return ix86_cost->vec_align_load_cost;
46944 case vector_store:
46945 return ix86_cost->vec_store_cost;
46947 case vec_to_scalar:
46948 return ix86_cost->vec_to_scalar_cost;
46950 case scalar_to_vec:
46951 return ix86_cost->scalar_to_vec_cost;
46953 case unaligned_load:
46954 case unaligned_store:
46955 return ix86_cost->vec_unalign_load_cost;
46957 case cond_branch_taken:
46958 return ix86_cost->cond_taken_branch_cost;
46960 case cond_branch_not_taken:
46961 return ix86_cost->cond_not_taken_branch_cost;
46963 case vec_perm:
46964 case vec_promote_demote:
46965 return ix86_cost->vec_stmt_cost;
46967 case vec_construct:
46968 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
46970 default:
46971 gcc_unreachable ();
46975 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46976 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46977 insn every time. */
46979 static GTY(()) rtx_insn *vselect_insn;
46981 /* Initialize vselect_insn. */
46983 static void
46984 init_vselect_insn (void)
46986 unsigned i;
46987 rtx x;
46989 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46990 for (i = 0; i < MAX_VECT_LEN; ++i)
46991 XVECEXP (x, 0, i) = const0_rtx;
46992 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46993 const0_rtx), x);
46994 x = gen_rtx_SET (const0_rtx, x);
46995 start_sequence ();
46996 vselect_insn = emit_insn (x);
46997 end_sequence ();
47000 /* Construct (set target (vec_select op0 (parallel perm))) and
47001 return true if that's a valid instruction in the active ISA. */
47003 static bool
47004 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
47005 unsigned nelt, bool testing_p)
47007 unsigned int i;
47008 rtx x, save_vconcat;
47009 int icode;
47011 if (vselect_insn == NULL_RTX)
47012 init_vselect_insn ();
47014 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
47015 PUT_NUM_ELEM (XVEC (x, 0), nelt);
47016 for (i = 0; i < nelt; ++i)
47017 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
47018 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
47019 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
47020 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
47021 SET_DEST (PATTERN (vselect_insn)) = target;
47022 icode = recog_memoized (vselect_insn);
47024 if (icode >= 0 && !testing_p)
47025 emit_insn (copy_rtx (PATTERN (vselect_insn)));
47027 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
47028 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
47029 INSN_CODE (vselect_insn) = -1;
47031 return icode >= 0;
47034 /* Similar, but generate a vec_concat from op0 and op1 as well. */
47036 static bool
47037 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
47038 const unsigned char *perm, unsigned nelt,
47039 bool testing_p)
47041 machine_mode v2mode;
47042 rtx x;
47043 bool ok;
47045 if (vselect_insn == NULL_RTX)
47046 init_vselect_insn ();
47048 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
47049 return false;
47050 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
47051 PUT_MODE (x, v2mode);
47052 XEXP (x, 0) = op0;
47053 XEXP (x, 1) = op1;
47054 ok = expand_vselect (target, x, perm, nelt, testing_p);
47055 XEXP (x, 0) = const0_rtx;
47056 XEXP (x, 1) = const0_rtx;
47057 return ok;
47060 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47061 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
47063 static bool
47064 expand_vec_perm_blend (struct expand_vec_perm_d *d)
47066 machine_mode mmode, vmode = d->vmode;
47067 unsigned i, mask, nelt = d->nelt;
47068 rtx target, op0, op1, maskop, x;
47069 rtx rperm[32], vperm;
47071 if (d->one_operand_p)
47072 return false;
47073 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
47074 && (TARGET_AVX512BW
47075 || GET_MODE_UNIT_SIZE (vmode) >= 4))
47077 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47079 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47081 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47083 else
47084 return false;
47086 /* This is a blend, not a permute. Elements must stay in their
47087 respective lanes. */
47088 for (i = 0; i < nelt; ++i)
47090 unsigned e = d->perm[i];
47091 if (!(e == i || e == i + nelt))
47092 return false;
47095 if (d->testing_p)
47096 return true;
47098 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
47099 decision should be extracted elsewhere, so that we only try that
47100 sequence once all budget==3 options have been tried. */
47101 target = d->target;
47102 op0 = d->op0;
47103 op1 = d->op1;
47104 mask = 0;
47106 switch (vmode)
47108 case E_V8DFmode:
47109 case E_V16SFmode:
47110 case E_V4DFmode:
47111 case E_V8SFmode:
47112 case E_V2DFmode:
47113 case E_V4SFmode:
47114 case E_V8HImode:
47115 case E_V8SImode:
47116 case E_V32HImode:
47117 case E_V64QImode:
47118 case E_V16SImode:
47119 case E_V8DImode:
47120 for (i = 0; i < nelt; ++i)
47121 mask |= (d->perm[i] >= nelt) << i;
47122 break;
47124 case E_V2DImode:
47125 for (i = 0; i < 2; ++i)
47126 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
47127 vmode = V8HImode;
47128 goto do_subreg;
47130 case E_V4SImode:
47131 for (i = 0; i < 4; ++i)
47132 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
47133 vmode = V8HImode;
47134 goto do_subreg;
47136 case E_V16QImode:
47137 /* See if bytes move in pairs so we can use pblendw with
47138 an immediate argument, rather than pblendvb with a vector
47139 argument. */
47140 for (i = 0; i < 16; i += 2)
47141 if (d->perm[i] + 1 != d->perm[i + 1])
47143 use_pblendvb:
47144 for (i = 0; i < nelt; ++i)
47145 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
47147 finish_pblendvb:
47148 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
47149 vperm = force_reg (vmode, vperm);
47151 if (GET_MODE_SIZE (vmode) == 16)
47152 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
47153 else
47154 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
47155 if (target != d->target)
47156 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47157 return true;
47160 for (i = 0; i < 8; ++i)
47161 mask |= (d->perm[i * 2] >= 16) << i;
47162 vmode = V8HImode;
47163 /* FALLTHRU */
47165 do_subreg:
47166 target = gen_reg_rtx (vmode);
47167 op0 = gen_lowpart (vmode, op0);
47168 op1 = gen_lowpart (vmode, op1);
47169 break;
47171 case E_V32QImode:
47172 /* See if bytes move in pairs. If not, vpblendvb must be used. */
47173 for (i = 0; i < 32; i += 2)
47174 if (d->perm[i] + 1 != d->perm[i + 1])
47175 goto use_pblendvb;
47176 /* See if bytes move in quadruplets. If yes, vpblendd
47177 with immediate can be used. */
47178 for (i = 0; i < 32; i += 4)
47179 if (d->perm[i] + 2 != d->perm[i + 2])
47180 break;
47181 if (i < 32)
47183 /* See if bytes move the same in both lanes. If yes,
47184 vpblendw with immediate can be used. */
47185 for (i = 0; i < 16; i += 2)
47186 if (d->perm[i] + 16 != d->perm[i + 16])
47187 goto use_pblendvb;
47189 /* Use vpblendw. */
47190 for (i = 0; i < 16; ++i)
47191 mask |= (d->perm[i * 2] >= 32) << i;
47192 vmode = V16HImode;
47193 goto do_subreg;
47196 /* Use vpblendd. */
47197 for (i = 0; i < 8; ++i)
47198 mask |= (d->perm[i * 4] >= 32) << i;
47199 vmode = V8SImode;
47200 goto do_subreg;
47202 case E_V16HImode:
47203 /* See if words move in pairs. If yes, vpblendd can be used. */
47204 for (i = 0; i < 16; i += 2)
47205 if (d->perm[i] + 1 != d->perm[i + 1])
47206 break;
47207 if (i < 16)
47209 /* See if words move the same in both lanes. If not,
47210 vpblendvb must be used. */
47211 for (i = 0; i < 8; i++)
47212 if (d->perm[i] + 8 != d->perm[i + 8])
47214 /* Use vpblendvb. */
47215 for (i = 0; i < 32; ++i)
47216 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
47218 vmode = V32QImode;
47219 nelt = 32;
47220 target = gen_reg_rtx (vmode);
47221 op0 = gen_lowpart (vmode, op0);
47222 op1 = gen_lowpart (vmode, op1);
47223 goto finish_pblendvb;
47226 /* Use vpblendw. */
47227 for (i = 0; i < 16; ++i)
47228 mask |= (d->perm[i] >= 16) << i;
47229 break;
47232 /* Use vpblendd. */
47233 for (i = 0; i < 8; ++i)
47234 mask |= (d->perm[i * 2] >= 16) << i;
47235 vmode = V8SImode;
47236 goto do_subreg;
47238 case E_V4DImode:
47239 /* Use vpblendd. */
47240 for (i = 0; i < 4; ++i)
47241 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
47242 vmode = V8SImode;
47243 goto do_subreg;
47245 default:
47246 gcc_unreachable ();
47249 switch (vmode)
47251 case E_V8DFmode:
47252 case E_V8DImode:
47253 mmode = QImode;
47254 break;
47255 case E_V16SFmode:
47256 case E_V16SImode:
47257 mmode = HImode;
47258 break;
47259 case E_V32HImode:
47260 mmode = SImode;
47261 break;
47262 case E_V64QImode:
47263 mmode = DImode;
47264 break;
47265 default:
47266 mmode = VOIDmode;
47269 if (mmode != VOIDmode)
47270 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
47271 else
47272 maskop = GEN_INT (mask);
47274 /* This matches five different patterns, depending on the mode. */
47275 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
47276 x = gen_rtx_SET (target, x);
47277 emit_insn (x);
47278 if (target != d->target)
47279 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47281 return true;
47284 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47285 in terms of the variable form of vpermilps.
47287 Note that we will have already failed the immediate input vpermilps,
47288 which requires that the high and low part shuffle be identical; the
47289 variable form doesn't require that. */
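/* For instance, { 1 0 3 2 6 7 4 5 } keeps every element within its
   128-bit lane but uses different selectors in the two lanes, so the
   immediate form of vpermilps cannot express it while the variable
   form handled here can.  */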
47291 static bool
47292 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
47294 rtx rperm[8], vperm;
47295 unsigned i;
47297 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
47298 return false;
47300 /* We can only permute within the 128-bit lane. */
47301 for (i = 0; i < 8; ++i)
47303 unsigned e = d->perm[i];
47304 if (i < 4 ? e >= 4 : e < 4)
47305 return false;
47308 if (d->testing_p)
47309 return true;
47311 for (i = 0; i < 8; ++i)
47313 unsigned e = d->perm[i];
47315 /* Within each 128-bit lane, the elements of op0 are numbered
47316 from 0 and the elements of op1 are numbered from 4. */
47317 if (e >= 8 + 4)
47318 e -= 8;
47319 else if (e >= 4)
47320 e -= 4;
47322 rperm[i] = GEN_INT (e);
47325 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
47326 vperm = force_reg (V8SImode, vperm);
47327 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
47329 return true;
47332 /* Return true if permutation D can be performed as VMODE permutation
47333 instead. */
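/* For example, the V16QImode permutation { 4 5 6 7 0 1 2 3 12 13 14
   15 8 9 10 11 } moves aligned groups of four consecutive bytes, so
   it is also a valid V4SImode permutation, namely { 1 0 3 2 }.  */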
47335 static bool
47336 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
47338 unsigned int i, j, chunk;
47340 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
47341 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
47342 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
47343 return false;
47345 if (GET_MODE_NUNITS (vmode) >= d->nelt)
47346 return true;
47348 chunk = d->nelt / GET_MODE_NUNITS (vmode);
47349 for (i = 0; i < d->nelt; i += chunk)
47350 if (d->perm[i] & (chunk - 1))
47351 return false;
47352 else
47353 for (j = 1; j < chunk; ++j)
47354 if (d->perm[i] + j != d->perm[i + j])
47355 return false;
47357 return true;
47360 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47361 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
47363 static bool
47364 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
47366 unsigned i, nelt, eltsz, mask;
47367 unsigned char perm[64];
47368 machine_mode vmode = V16QImode;
47369 rtx rperm[64], vperm, target, op0, op1;
47371 nelt = d->nelt;
47373 if (!d->one_operand_p)
47375 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
47377 if (TARGET_AVX2
47378 && valid_perm_using_mode_p (V2TImode, d))
47380 if (d->testing_p)
47381 return true;
47383 /* Use vperm2i128 insn. The pattern uses
47384 V4DImode instead of V2TImode. */
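/* E.g. the two-operand V4DImode permutation { 2 3 4 5 } selects the
   high 128-bit half of op0 followed by the low half of op1, which is
   encoded below as the vperm2i128 immediate 0x21.  */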
47385 target = d->target;
47386 if (d->vmode != V4DImode)
47387 target = gen_reg_rtx (V4DImode);
47388 op0 = gen_lowpart (V4DImode, d->op0);
47389 op1 = gen_lowpart (V4DImode, d->op1);
47390 rperm[0]
47391 = GEN_INT ((d->perm[0] / (nelt / 2))
47392 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
47393 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
47394 if (target != d->target)
47395 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47396 return true;
47398 return false;
47401 else
47403 if (GET_MODE_SIZE (d->vmode) == 16)
47405 if (!TARGET_SSSE3)
47406 return false;
47408 else if (GET_MODE_SIZE (d->vmode) == 32)
47410 if (!TARGET_AVX2)
47411 return false;
47413 /* V4DImode should already be handled through
47414 expand_vselect by the vpermq instruction. */
47415 gcc_assert (d->vmode != V4DImode);
47417 vmode = V32QImode;
47418 if (d->vmode == V8SImode
47419 || d->vmode == V16HImode
47420 || d->vmode == V32QImode)
47422 /* First see if vpermq can be used for
47423 V8SImode/V16HImode/V32QImode. */
47424 if (valid_perm_using_mode_p (V4DImode, d))
47426 for (i = 0; i < 4; i++)
47427 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
47428 if (d->testing_p)
47429 return true;
47430 target = gen_reg_rtx (V4DImode);
47431 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
47432 perm, 4, false))
47434 emit_move_insn (d->target,
47435 gen_lowpart (d->vmode, target));
47436 return true;
47438 return false;
47441 /* Next see if vpermd can be used. */
47442 if (valid_perm_using_mode_p (V8SImode, d))
47443 vmode = V8SImode;
47445 /* Or if vpermps can be used. */
47446 else if (d->vmode == V8SFmode)
47447 vmode = V8SImode;
47449 if (vmode == V32QImode)
47451 /* vpshufb only works within 128-bit lanes; it is not
47452 possible to shuffle bytes in between the lanes. */
47453 for (i = 0; i < nelt; ++i)
47454 if ((d->perm[i] ^ i) & (nelt / 2))
47455 return false;
47458 else if (GET_MODE_SIZE (d->vmode) == 64)
47460 if (!TARGET_AVX512BW)
47461 return false;
47463 /* If vpermq didn't work, vpshufb won't work either. */
47464 if (d->vmode == V8DFmode || d->vmode == V8DImode)
47465 return false;
47467 vmode = V64QImode;
47468 if (d->vmode == V16SImode
47469 || d->vmode == V32HImode
47470 || d->vmode == V64QImode)
47472 /* First see if vpermq can be used for
47473 V16SImode/V32HImode/V64QImode. */
47474 if (valid_perm_using_mode_p (V8DImode, d))
47476 for (i = 0; i < 8; i++)
47477 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47478 if (d->testing_p)
47479 return true;
47480 target = gen_reg_rtx (V8DImode);
47481 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47482 perm, 8, false))
47484 emit_move_insn (d->target,
47485 gen_lowpart (d->vmode, target));
47486 return true;
47488 return false;
47491 /* Next see if vpermd can be used. */
47492 if (valid_perm_using_mode_p (V16SImode, d))
47493 vmode = V16SImode;
47495 /* Or if vpermps can be used. */
47496 else if (d->vmode == V16SFmode)
47497 vmode = V16SImode;
47498 if (vmode == V64QImode)
47500 /* vpshufb only works within 128-bit lanes; it is not
47501 possible to shuffle bytes in between the lanes. */
47502 for (i = 0; i < nelt; ++i)
47503 if ((d->perm[i] ^ i) & (nelt / 4))
47504 return false;
47507 else
47508 return false;
47511 if (d->testing_p)
47512 return true;
47514 if (vmode == V8SImode)
47515 for (i = 0; i < 8; ++i)
47516 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47517 else if (vmode == V16SImode)
47518 for (i = 0; i < 16; ++i)
47519 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47520 else
47522 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47523 if (!d->one_operand_p)
47524 mask = 2 * nelt - 1;
47525 else if (vmode == V16QImode)
47526 mask = nelt - 1;
47527 else if (vmode == V64QImode)
47528 mask = nelt / 4 - 1;
47529 else
47530 mask = nelt / 2 - 1;
47532 for (i = 0; i < nelt; ++i)
47534 unsigned j, e = d->perm[i] & mask;
47535 for (j = 0; j < eltsz; ++j)
47536 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
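/* E.g. for a one-operand V8HImode permutation expanded via a
   V16QImode pshufb, eltsz is 2 and mask is 7, so word index e
   becomes the byte pair { 2*e, 2*e+1 } in the control vector.  */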
47540 vperm = gen_rtx_CONST_VECTOR (vmode,
47541 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47542 vperm = force_reg (vmode, vperm);
47544 target = d->target;
47545 if (d->vmode != vmode)
47546 target = gen_reg_rtx (vmode);
47547 op0 = gen_lowpart (vmode, d->op0);
47548 if (d->one_operand_p)
47550 if (vmode == V16QImode)
47551 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47552 else if (vmode == V32QImode)
47553 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47554 else if (vmode == V64QImode)
47555 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47556 else if (vmode == V8SFmode)
47557 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47558 else if (vmode == V8SImode)
47559 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47560 else if (vmode == V16SFmode)
47561 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47562 else if (vmode == V16SImode)
47563 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47564 else
47565 gcc_unreachable ();
47567 else
47569 op1 = gen_lowpart (vmode, d->op1);
47570 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47572 if (target != d->target)
47573 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47575 return true;
47578 /* For V*[QHS]Imode permutations, check whether the same permutation
47579 can instead be performed in a 2x, 4x or 8x wider inner mode. */
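/* As an example, the V16QImode permutation { 2 3 0 1 6 7 4 5 10 11
   8 9 14 15 12 13 } becomes the V8HImode permutation { 1 0 3 2 5 4
   7 6 }; the recursion then stops because that one has odd indices
   at even positions.  */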
47581 static bool
47582 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47583 struct expand_vec_perm_d *nd)
47585 int i;
47586 machine_mode mode = VOIDmode;
47588 switch (d->vmode)
47590 case E_V16QImode: mode = V8HImode; break;
47591 case E_V32QImode: mode = V16HImode; break;
47592 case E_V64QImode: mode = V32HImode; break;
47593 case E_V8HImode: mode = V4SImode; break;
47594 case E_V16HImode: mode = V8SImode; break;
47595 case E_V32HImode: mode = V16SImode; break;
47596 case E_V4SImode: mode = V2DImode; break;
47597 case E_V8SImode: mode = V4DImode; break;
47598 case E_V16SImode: mode = V8DImode; break;
47599 default: return false;
47601 for (i = 0; i < d->nelt; i += 2)
47602 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47603 return false;
47604 nd->vmode = mode;
47605 nd->nelt = d->nelt / 2;
47606 for (i = 0; i < nd->nelt; i++)
47607 nd->perm[i] = d->perm[2 * i] / 2;
47608 if (GET_MODE_INNER (mode) != DImode)
47609 canonicalize_vector_int_perm (nd, nd);
47610 if (nd != d)
47612 nd->one_operand_p = d->one_operand_p;
47613 nd->testing_p = d->testing_p;
47614 if (d->op0 == d->op1)
47615 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47616 else
47618 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47619 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47621 if (d->testing_p)
47622 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47623 else
47624 nd->target = gen_reg_rtx (nd->vmode);
47626 return true;
47629 /* Try to expand one-operand permutation with constant mask. */
47631 static bool
47632 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47634 machine_mode mode = GET_MODE (d->op0);
47635 machine_mode maskmode = mode;
47636 rtx (*gen) (rtx, rtx, rtx) = NULL;
47637 rtx target, op0, mask;
47638 rtx vec[64];
47640 if (!rtx_equal_p (d->op0, d->op1))
47641 return false;
47643 if (!TARGET_AVX512F)
47644 return false;
47646 switch (mode)
47648 case E_V16SImode:
47649 gen = gen_avx512f_permvarv16si;
47650 break;
47651 case E_V16SFmode:
47652 gen = gen_avx512f_permvarv16sf;
47653 maskmode = V16SImode;
47654 break;
47655 case E_V8DImode:
47656 gen = gen_avx512f_permvarv8di;
47657 break;
47658 case E_V8DFmode:
47659 gen = gen_avx512f_permvarv8df;
47660 maskmode = V8DImode;
47661 break;
47662 default:
47663 return false;
47666 target = d->target;
47667 op0 = d->op0;
47668 for (int i = 0; i < d->nelt; ++i)
47669 vec[i] = GEN_INT (d->perm[i]);
47670 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47671 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47672 return true;
47675 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47676 in a single instruction. */
47678 static bool
47679 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47681 unsigned i, nelt = d->nelt;
47682 struct expand_vec_perm_d nd;
47684 /* Check plain VEC_SELECT first, because AVX has instructions that could
47685 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47686 input where SEL+CONCAT may not. */
47687 if (d->one_operand_p)
47689 int mask = nelt - 1;
47690 bool identity_perm = true;
47691 bool broadcast_perm = true;
47693 for (i = 0; i < nelt; i++)
47695 nd.perm[i] = d->perm[i] & mask;
47696 if (nd.perm[i] != i)
47697 identity_perm = false;
47698 if (nd.perm[i])
47699 broadcast_perm = false;
47702 if (identity_perm)
47704 if (!d->testing_p)
47705 emit_move_insn (d->target, d->op0);
47706 return true;
47708 else if (broadcast_perm && TARGET_AVX2)
47710 /* Use vpbroadcast{b,w,d}. */
47711 rtx (*gen) (rtx, rtx) = NULL;
47712 switch (d->vmode)
47714 case E_V64QImode:
47715 if (TARGET_AVX512BW)
47716 gen = gen_avx512bw_vec_dupv64qi_1;
47717 break;
47718 case E_V32QImode:
47719 gen = gen_avx2_pbroadcastv32qi_1;
47720 break;
47721 case E_V32HImode:
47722 if (TARGET_AVX512BW)
47723 gen = gen_avx512bw_vec_dupv32hi_1;
47724 break;
47725 case E_V16HImode:
47726 gen = gen_avx2_pbroadcastv16hi_1;
47727 break;
47728 case E_V16SImode:
47729 if (TARGET_AVX512F)
47730 gen = gen_avx512f_vec_dupv16si_1;
47731 break;
47732 case E_V8SImode:
47733 gen = gen_avx2_pbroadcastv8si_1;
47734 break;
47735 case E_V16QImode:
47736 gen = gen_avx2_pbroadcastv16qi;
47737 break;
47738 case E_V8HImode:
47739 gen = gen_avx2_pbroadcastv8hi;
47740 break;
47741 case E_V16SFmode:
47742 if (TARGET_AVX512F)
47743 gen = gen_avx512f_vec_dupv16sf_1;
47744 break;
47745 case E_V8SFmode:
47746 gen = gen_avx2_vec_dupv8sf_1;
47747 break;
47748 case E_V8DFmode:
47749 if (TARGET_AVX512F)
47750 gen = gen_avx512f_vec_dupv8df_1;
47751 break;
47752 case E_V8DImode:
47753 if (TARGET_AVX512F)
47754 gen = gen_avx512f_vec_dupv8di_1;
47755 break;
47756 /* For other modes prefer other shuffles this function creates. */
47757 default: break;
47759 if (gen != NULL)
47761 if (!d->testing_p)
47762 emit_insn (gen (d->target, d->op0));
47763 return true;
47767 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47768 return true;
47770 /* There are plenty of patterns in sse.md that are written for
47771 SEL+CONCAT and are not replicated for a single op. Perhaps
47772 that should be changed, to avoid the nastiness here. */
47774 /* Recognize interleave style patterns, which means incrementing
47775 every other permutation operand. */
47776 for (i = 0; i < nelt; i += 2)
47778 nd.perm[i] = d->perm[i] & mask;
47779 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47781 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47782 d->testing_p))
47783 return true;
47785 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
47786 if (nelt >= 4)
47788 for (i = 0; i < nelt; i += 4)
47790 nd.perm[i + 0] = d->perm[i + 0] & mask;
47791 nd.perm[i + 1] = d->perm[i + 1] & mask;
47792 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47793 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47796 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47797 d->testing_p))
47798 return true;
47802 /* Finally, try the fully general two operand permute. */
47803 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47804 d->testing_p))
47805 return true;
47807 /* Recognize interleave style patterns with reversed operands. */
47808 if (!d->one_operand_p)
47810 for (i = 0; i < nelt; ++i)
47812 unsigned e = d->perm[i];
47813 if (e >= nelt)
47814 e -= nelt;
47815 else
47816 e += nelt;
47817 nd.perm[i] = e;
47820 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47821 d->testing_p))
47822 return true;
47825 /* Try the SSE4.1 blend variable merge instructions. */
47826 if (expand_vec_perm_blend (d))
47827 return true;
47829 /* Try one of the AVX vpermil variable permutations. */
47830 if (expand_vec_perm_vpermil (d))
47831 return true;
47833 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47834 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47835 if (expand_vec_perm_pshufb (d))
47836 return true;
47838 /* Try the AVX2 vpalignr instruction. */
47839 if (expand_vec_perm_palignr (d, true))
47840 return true;
47842 /* Try the AVX512F vperm{s,d} instructions. */
47843 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47844 return true;
47846 /* Try the AVX512F vpermi2 instructions. */
47847 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47848 return true;
47850 /* See if we can get the same permutation in different vector integer
47851 mode. */
47852 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47854 if (!d->testing_p)
47855 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47856 return true;
47858 return false;
47861 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47862 in terms of a pair of pshuflw + pshufhw instructions. */
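/* For instance, { 3 1 2 0 5 6 4 7 } is emitted as pshuflw with
   { 3 1 2 0 4 5 6 7 } followed by pshufhw with { 0 1 2 3 5 6 4 7 }.  */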
47864 static bool
47865 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47867 unsigned char perm2[MAX_VECT_LEN];
47868 unsigned i;
47869 bool ok;
47871 if (d->vmode != V8HImode || !d->one_operand_p)
47872 return false;
47874 /* The two permutations only operate in 64-bit lanes. */
47875 for (i = 0; i < 4; ++i)
47876 if (d->perm[i] >= 4)
47877 return false;
47878 for (i = 4; i < 8; ++i)
47879 if (d->perm[i] < 4)
47880 return false;
47882 if (d->testing_p)
47883 return true;
47885 /* Emit the pshuflw. */
47886 memcpy (perm2, d->perm, 4);
47887 for (i = 4; i < 8; ++i)
47888 perm2[i] = i;
47889 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47890 gcc_assert (ok);
47892 /* Emit the pshufhw. */
47893 memcpy (perm2 + 4, d->perm + 4, 4);
47894 for (i = 0; i < 4; ++i)
47895 perm2[i] = i;
47896 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47897 gcc_assert (ok);
47899 return true;
47902 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47903 the permutation using the SSSE3 palignr instruction. This succeeds
47904 when all of the elements in PERM fit within one vector and we merely
47905 need to shift them down so that a single vector permutation has a
47906 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
47907 the vpalignr instruction itself can perform the requested permutation. */
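/* For example, the two-operand V16QImode permutation { 5 6 ... 20 }
   (perm[i] == i + 5) has min == 5 and max == 20, so a palignr by 5
   bytes already leaves the elements in order and no further
   permutation is needed.  */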
47909 static bool
47910 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47912 unsigned i, nelt = d->nelt;
47913 unsigned min, max, minswap, maxswap;
47914 bool in_order, ok, swap = false;
47915 rtx shift, target;
47916 struct expand_vec_perm_d dcopy;
47918 /* Even with AVX, palignr only operates on 128-bit vectors;
47919 with AVX2, palignr operates on both 128-bit lanes. */
47920 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47921 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47922 return false;
47924 min = 2 * nelt;
47925 max = 0;
47926 minswap = 2 * nelt;
47927 maxswap = 0;
47928 for (i = 0; i < nelt; ++i)
47930 unsigned e = d->perm[i];
47931 unsigned eswap = d->perm[i] ^ nelt;
47932 if (GET_MODE_SIZE (d->vmode) == 32)
47934 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47935 eswap = e ^ (nelt / 2);
47937 if (e < min)
47938 min = e;
47939 if (e > max)
47940 max = e;
47941 if (eswap < minswap)
47942 minswap = eswap;
47943 if (eswap > maxswap)
47944 maxswap = eswap;
47946 if (min == 0
47947 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47949 if (d->one_operand_p
47950 || minswap == 0
47951 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47952 ? nelt / 2 : nelt))
47953 return false;
47954 swap = true;
47955 min = minswap;
47956 max = maxswap;
47959 /* Given that we have SSSE3, we know we'll be able to implement the
47960 single operand permutation after the palignr with pshufb for
47961 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47962 first. */
47963 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47964 return true;
47966 dcopy = *d;
47967 if (swap)
47969 dcopy.op0 = d->op1;
47970 dcopy.op1 = d->op0;
47971 for (i = 0; i < nelt; ++i)
47972 dcopy.perm[i] ^= nelt;
47975 in_order = true;
47976 for (i = 0; i < nelt; ++i)
47978 unsigned e = dcopy.perm[i];
47979 if (GET_MODE_SIZE (d->vmode) == 32
47980 && e >= nelt
47981 && (e & (nelt / 2 - 1)) < min)
47982 e = e - min - (nelt / 2);
47983 else
47984 e = e - min;
47985 if (e != i)
47986 in_order = false;
47987 dcopy.perm[i] = e;
47989 dcopy.one_operand_p = true;
47991 if (single_insn_only_p && !in_order)
47992 return false;
47994 /* For AVX2, test whether we can permute the result in one instruction. */
47995 if (d->testing_p)
47997 if (in_order)
47998 return true;
47999 dcopy.op1 = dcopy.op0;
48000 return expand_vec_perm_1 (&dcopy);
48003 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
48004 if (GET_MODE_SIZE (d->vmode) == 16)
48006 target = gen_reg_rtx (TImode);
48007 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
48008 gen_lowpart (TImode, dcopy.op0), shift));
48010 else
48012 target = gen_reg_rtx (V2TImode);
48013 emit_insn (gen_avx2_palignrv2ti (target,
48014 gen_lowpart (V2TImode, dcopy.op1),
48015 gen_lowpart (V2TImode, dcopy.op0),
48016 shift));
48019 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
48021 /* Test for the degenerate case where the alignment by itself
48022 produces the desired permutation. */
48023 if (in_order)
48025 emit_move_insn (d->target, dcopy.op0);
48026 return true;
48029 ok = expand_vec_perm_1 (&dcopy);
48030 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
48032 return ok;
48035 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
48036 the permutation using the SSE4_1 pblendv instruction. Potentially
48037 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
48039 static bool
48040 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
48042 unsigned i, which, nelt = d->nelt;
48043 struct expand_vec_perm_d dcopy, dcopy1;
48044 machine_mode vmode = d->vmode;
48045 bool ok;
48047 /* Use the same checks as in expand_vec_perm_blend. */
48048 if (d->one_operand_p)
48049 return false;
48050 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
48052 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
48054 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
48056 else
48057 return false;
48059 /* Figure out which permutation elements do not stay in their
48060 respective lanes. */
48061 for (i = 0, which = 0; i < nelt; ++i)
48063 unsigned e = d->perm[i];
48064 if (e != i)
48065 which |= (e < nelt ? 1 : 2);
48067 /* We can pblend the part where elements stay not in their
48068 respective lanes only when these elements are all in one
48069 half of a permutation.
48070 {0 1 8 3 4 5 9 7} is ok as 8, 9 are not at their respective
48071 lanes, but both 8 and 9 >= 8
48072 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
48073 respective lanes and 8 >= 8, but 2 is not. */
48074 if (which != 1 && which != 2)
48075 return false;
48076 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
48077 return true;
48079 /* First we apply a one-operand permutation to the part where
48080 elements stay not in their respective lanes. */
48081 dcopy = *d;
48082 if (which == 2)
48083 dcopy.op0 = dcopy.op1 = d->op1;
48084 else
48085 dcopy.op0 = dcopy.op1 = d->op0;
48086 if (!d->testing_p)
48087 dcopy.target = gen_reg_rtx (vmode);
48088 dcopy.one_operand_p = true;
48090 for (i = 0; i < nelt; ++i)
48091 dcopy.perm[i] = d->perm[i] & (nelt - 1);
48093 ok = expand_vec_perm_1 (&dcopy);
48094 if (GET_MODE_SIZE (vmode) != 16 && !ok)
48095 return false;
48096 else
48097 gcc_assert (ok);
48098 if (d->testing_p)
48099 return true;
48101 /* Next we put permuted elements into their positions. */
48102 dcopy1 = *d;
48103 if (which == 2)
48104 dcopy1.op1 = dcopy.target;
48105 else
48106 dcopy1.op0 = dcopy.target;
48108 for (i = 0; i < nelt; ++i)
48109 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
48111 ok = expand_vec_perm_blend (&dcopy1);
48112 gcc_assert (ok);
48114 return true;
48117 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
48119 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48120 a two vector permutation into a single vector permutation by using
48121 an interleave operation to merge the vectors. */
48123 static bool
48124 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
48126 struct expand_vec_perm_d dremap, dfinal;
48127 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
48128 unsigned HOST_WIDE_INT contents;
48129 unsigned char remap[2 * MAX_VECT_LEN];
48130 rtx_insn *seq;
48131 bool ok, same_halves = false;
48133 if (GET_MODE_SIZE (d->vmode) == 16)
48135 if (d->one_operand_p)
48136 return false;
48138 else if (GET_MODE_SIZE (d->vmode) == 32)
48140 if (!TARGET_AVX)
48141 return false;
48142 /* For 32-byte modes allow even d->one_operand_p.
48143 The lack of cross-lane shuffling in some instructions
48144 might prevent a single insn shuffle. */
48145 dfinal = *d;
48146 dfinal.testing_p = true;
48147 /* If expand_vec_perm_interleave3 can expand this into
48148 a 3 insn sequence, give up and let it be expanded as
48149 a 3 insn sequence. While that is one insn longer,
48150 it doesn't need a memory operand, and in the common
48151 case where the interleave low and interleave high
48152 permutations with the same operands are adjacent, only
48153 4 insns are needed for both after CSE. */
48154 if (expand_vec_perm_interleave3 (&dfinal))
48155 return false;
48157 else
48158 return false;
48160 /* Examine from whence the elements come. */
48161 contents = 0;
48162 for (i = 0; i < nelt; ++i)
48163 contents |= HOST_WIDE_INT_1U << d->perm[i];
48165 memset (remap, 0xff, sizeof (remap));
48166 dremap = *d;
48168 if (GET_MODE_SIZE (d->vmode) == 16)
48170 unsigned HOST_WIDE_INT h1, h2, h3, h4;
48172 /* Split the two input vectors into 4 halves. */
48173 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
48174 h2 = h1 << nelt2;
48175 h3 = h2 << nelt2;
48176 h4 = h3 << nelt2;
48178 /* If the elements are all from the low halves, use interleave low;
48179 similarly for interleave high. If the elements are from mis-matched
48180 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
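/* E.g. the two-operand V8HImode permutation { 0 9 1 8 3 11 2 10 }
   only uses elements 0-3 and 8-11, so dremap becomes the punpcklwd
   interleave { 0 8 1 9 2 10 3 11 } and dfinal becomes the
   one-operand shuffle { 0 3 2 1 6 7 4 5 } on its result.  */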
48181 if ((contents & (h1 | h3)) == contents)
48183 /* punpckl* */
48184 for (i = 0; i < nelt2; ++i)
48186 remap[i] = i * 2;
48187 remap[i + nelt] = i * 2 + 1;
48188 dremap.perm[i * 2] = i;
48189 dremap.perm[i * 2 + 1] = i + nelt;
48191 if (!TARGET_SSE2 && d->vmode == V4SImode)
48192 dremap.vmode = V4SFmode;
48194 else if ((contents & (h2 | h4)) == contents)
48196 /* punpckh* */
48197 for (i = 0; i < nelt2; ++i)
48199 remap[i + nelt2] = i * 2;
48200 remap[i + nelt + nelt2] = i * 2 + 1;
48201 dremap.perm[i * 2] = i + nelt2;
48202 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
48204 if (!TARGET_SSE2 && d->vmode == V4SImode)
48205 dremap.vmode = V4SFmode;
48207 else if ((contents & (h1 | h4)) == contents)
48209 /* shufps */
48210 for (i = 0; i < nelt2; ++i)
48212 remap[i] = i;
48213 remap[i + nelt + nelt2] = i + nelt2;
48214 dremap.perm[i] = i;
48215 dremap.perm[i + nelt2] = i + nelt + nelt2;
48217 if (nelt != 4)
48219 /* shufpd */
48220 dremap.vmode = V2DImode;
48221 dremap.nelt = 2;
48222 dremap.perm[0] = 0;
48223 dremap.perm[1] = 3;
48226 else if ((contents & (h2 | h3)) == contents)
48228 /* shufps */
48229 for (i = 0; i < nelt2; ++i)
48231 remap[i + nelt2] = i;
48232 remap[i + nelt] = i + nelt2;
48233 dremap.perm[i] = i + nelt2;
48234 dremap.perm[i + nelt2] = i + nelt;
48236 if (nelt != 4)
48238 /* shufpd */
48239 dremap.vmode = V2DImode;
48240 dremap.nelt = 2;
48241 dremap.perm[0] = 1;
48242 dremap.perm[1] = 2;
48245 else
48246 return false;
48248 else
48250 unsigned int nelt4 = nelt / 4, nzcnt = 0;
48251 unsigned HOST_WIDE_INT q[8];
48252 unsigned int nonzero_halves[4];
48254 /* Split the two input vectors into 8 quarters. */
48255 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
48256 for (i = 1; i < 8; ++i)
48257 q[i] = q[0] << (nelt4 * i);
48258 for (i = 0; i < 4; ++i)
48259 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
48261 nonzero_halves[nzcnt] = i;
48262 ++nzcnt;
48265 if (nzcnt == 1)
48267 gcc_assert (d->one_operand_p);
48268 nonzero_halves[1] = nonzero_halves[0];
48269 same_halves = true;
48271 else if (d->one_operand_p)
48273 gcc_assert (nonzero_halves[0] == 0);
48274 gcc_assert (nonzero_halves[1] == 1);
48277 if (nzcnt <= 2)
48279 if (d->perm[0] / nelt2 == nonzero_halves[1])
48281 /* Attempt to increase the likelihood that dfinal
48282 shuffle will be intra-lane. */
48283 std::swap (nonzero_halves[0], nonzero_halves[1]);
48286 /* vperm2f128 or vperm2i128. */
48287 for (i = 0; i < nelt2; ++i)
48289 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
48290 remap[i + nonzero_halves[0] * nelt2] = i;
48291 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
48292 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
48295 if (d->vmode != V8SFmode
48296 && d->vmode != V4DFmode
48297 && d->vmode != V8SImode)
48299 dremap.vmode = V8SImode;
48300 dremap.nelt = 8;
48301 for (i = 0; i < 4; ++i)
48303 dremap.perm[i] = i + nonzero_halves[0] * 4;
48304 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
48308 else if (d->one_operand_p)
48309 return false;
48310 else if (TARGET_AVX2
48311 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
48313 /* vpunpckl* */
48314 for (i = 0; i < nelt4; ++i)
48316 remap[i] = i * 2;
48317 remap[i + nelt] = i * 2 + 1;
48318 remap[i + nelt2] = i * 2 + nelt2;
48319 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
48320 dremap.perm[i * 2] = i;
48321 dremap.perm[i * 2 + 1] = i + nelt;
48322 dremap.perm[i * 2 + nelt2] = i + nelt2;
48323 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
48326 else if (TARGET_AVX2
48327 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
48329 /* vpunpckh* */
48330 for (i = 0; i < nelt4; ++i)
48332 remap[i + nelt4] = i * 2;
48333 remap[i + nelt + nelt4] = i * 2 + 1;
48334 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
48335 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
48336 dremap.perm[i * 2] = i + nelt4;
48337 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
48338 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
48339 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
48342 else
48343 return false;
48346 /* Use the remapping array set up above to move the elements from their
48347 swizzled locations into their final destinations. */
48348 dfinal = *d;
48349 for (i = 0; i < nelt; ++i)
48351 unsigned e = remap[d->perm[i]];
48352 gcc_assert (e < nelt);
48353 /* If same_halves is true, both halves of the remapped vector are the
48354 same. Avoid cross-lane accesses if possible. */
48355 if (same_halves && i >= nelt2)
48357 gcc_assert (e < nelt2);
48358 dfinal.perm[i] = e + nelt2;
48360 else
48361 dfinal.perm[i] = e;
48363 if (!d->testing_p)
48365 dremap.target = gen_reg_rtx (dremap.vmode);
48366 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48368 dfinal.op1 = dfinal.op0;
48369 dfinal.one_operand_p = true;
48371 /* Test if the final remap can be done with a single insn. For V4SFmode or
48372 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
48373 start_sequence ();
48374 ok = expand_vec_perm_1 (&dfinal);
48375 seq = get_insns ();
48376 end_sequence ();
48378 if (!ok)
48379 return false;
48381 if (d->testing_p)
48382 return true;
48384 if (dremap.vmode != dfinal.vmode)
48386 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
48387 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
48390 ok = expand_vec_perm_1 (&dremap);
48391 gcc_assert (ok);
48393 emit_insn (seq);
48394 return true;
48397 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48398 a single vector cross-lane permutation into vpermq followed
48399 by any of the single insn permutations. */
48401 static bool
48402 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
48404 struct expand_vec_perm_d dremap, dfinal;
48405 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
48406 unsigned contents[2];
48407 bool ok;
48409 if (!(TARGET_AVX2
48410 && (d->vmode == V32QImode || d->vmode == V16HImode)
48411 && d->one_operand_p))
48412 return false;
48414 contents[0] = 0;
48415 contents[1] = 0;
48416 for (i = 0; i < nelt2; ++i)
48418 contents[0] |= 1u << (d->perm[i] / nelt4);
48419 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
48422 for (i = 0; i < 2; ++i)
48424 unsigned int cnt = 0;
48425 for (j = 0; j < 4; ++j)
48426 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
48427 return false;
48430 if (d->testing_p)
48431 return true;
48433 dremap = *d;
48434 dremap.vmode = V4DImode;
48435 dremap.nelt = 4;
48436 dremap.target = gen_reg_rtx (V4DImode);
48437 dremap.op0 = gen_lowpart (V4DImode, d->op0);
48438 dremap.op1 = dremap.op0;
48439 dremap.one_operand_p = true;
48440 for (i = 0; i < 2; ++i)
48442 unsigned int cnt = 0;
48443 for (j = 0; j < 4; ++j)
48444 if ((contents[i] & (1u << j)) != 0)
48445 dremap.perm[2 * i + cnt++] = j;
48446 for (; cnt < 2; ++cnt)
48447 dremap.perm[2 * i + cnt] = 0;
48450 dfinal = *d;
48451 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48452 dfinal.op1 = dfinal.op0;
48453 dfinal.one_operand_p = true;
48454 for (i = 0, j = 0; i < nelt; ++i)
48456 if (i == nelt2)
48457 j = 2;
48458 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
48459 if ((d->perm[i] / nelt4) == dremap.perm[j])
48461 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
48462 dfinal.perm[i] |= nelt4;
48463 else
48464 gcc_unreachable ();
48467 ok = expand_vec_perm_1 (&dremap);
48468 gcc_assert (ok);
48470 ok = expand_vec_perm_1 (&dfinal);
48471 gcc_assert (ok);
48473 return true;
48476 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
48477 a vector permutation using two instructions, vperm2f128 resp.
48478 vperm2i128 followed by any single in-lane permutation. */
48480 static bool
48481 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48483 struct expand_vec_perm_d dfirst, dsecond;
48484 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48485 bool ok;
48487 if (!TARGET_AVX
48488 || GET_MODE_SIZE (d->vmode) != 32
48489 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48490 return false;
48492 dsecond = *d;
48493 dsecond.one_operand_p = false;
48494 dsecond.testing_p = true;
48496 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48497 immediate. For perm < 16 the second permutation uses
48498 d->op0 as the first operand; for perm >= 16 it uses d->op1
48499 as the first operand. The second operand is the result of
48500 vperm2[fi]128. */
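/* E.g. perm == 6 makes the first shuffle put the low lane of op1
   into the low result lane and the high lane of op0 into the high
   result lane, i.e. vperm2f128 with immediate 0x12.  */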
48501 for (perm = 0; perm < 32; perm++)
48503 /* Ignore permutations which do not move anything cross-lane. */
48504 if (perm < 16)
48506 /* The second shuffle for e.g. V4DFmode has
48507 0123 and ABCD operands.
48508 Ignore AB23, as 23 is already in the second lane
48509 of the first operand. */
48510 if ((perm & 0xc) == (1 << 2)) continue;
48511 /* And 01CD, as 01 is in the first lane of the first
48512 operand. */
48513 if ((perm & 3) == 0) continue;
48514 /* And 4567, as then the vperm2[fi]128 doesn't change
48515 anything on the original 4567 second operand. */
48516 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48518 else
48520 /* The second shuffle for e.g. V4DFmode has
48521 4567 and ABCD operands.
48522 Ignore AB67, as 67 is already in the second lane
48523 of the first operand. */
48524 if ((perm & 0xc) == (3 << 2)) continue;
48525 /* And 45CD, as 45 is in the first lane of the first
48526 operand. */
48527 if ((perm & 3) == 2) continue;
48528 /* And 0123, as then the vperm2[fi]128 doesn't change
48529 anything on the original 0123 first operand. */
48530 if ((perm & 0xf) == (1 << 2)) continue;
48533 for (i = 0; i < nelt; i++)
48535 j = d->perm[i] / nelt2;
48536 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48537 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48538 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48539 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48540 else
48541 break;
48544 if (i == nelt)
48546 start_sequence ();
48547 ok = expand_vec_perm_1 (&dsecond);
48548 end_sequence ();
48550 else
48551 ok = false;
48553 if (ok)
48555 if (d->testing_p)
48556 return true;
48558 /* Found a usable second shuffle. dfirst will be
48559 vperm2f128 on d->op0 and d->op1. */
48560 dsecond.testing_p = false;
48561 dfirst = *d;
48562 dfirst.target = gen_reg_rtx (d->vmode);
48563 for (i = 0; i < nelt; i++)
48564 dfirst.perm[i] = (i & (nelt2 - 1))
48565 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48567 canonicalize_perm (&dfirst);
48568 ok = expand_vec_perm_1 (&dfirst);
48569 gcc_assert (ok);
48571 /* And dsecond is some single insn shuffle, taking
48572 d->op0 and result of vperm2f128 (if perm < 16) or
48573 d->op1 and result of vperm2f128 (otherwise). */
48574 if (perm >= 16)
48575 dsecond.op0 = dsecond.op1;
48576 dsecond.op1 = dfirst.target;
48578 ok = expand_vec_perm_1 (&dsecond);
48579 gcc_assert (ok);
48581 return true;
48584 /* For one operand, the only useful vperm2f128 permutation is 0x01
48585 aka lanes swap. */
48586 if (d->one_operand_p)
48587 return false;
48590 return false;
48593 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48594 a two vector permutation using 2 intra-lane interleave insns
48595 and cross-lane shuffle for 32-byte vectors. */
48597 static bool
48598 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48600 unsigned i, nelt;
48601 rtx (*gen) (rtx, rtx, rtx);
48603 if (d->one_operand_p)
48604 return false;
48605 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48607 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48609 else
48610 return false;
48612 nelt = d->nelt;
48613 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48614 return false;
48615 for (i = 0; i < nelt; i += 2)
48616 if (d->perm[i] != d->perm[0] + i / 2
48617 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48618 return false;
48620 if (d->testing_p)
48621 return true;
48623 switch (d->vmode)
48625 case E_V32QImode:
48626 if (d->perm[0])
48627 gen = gen_vec_interleave_highv32qi;
48628 else
48629 gen = gen_vec_interleave_lowv32qi;
48630 break;
48631 case E_V16HImode:
48632 if (d->perm[0])
48633 gen = gen_vec_interleave_highv16hi;
48634 else
48635 gen = gen_vec_interleave_lowv16hi;
48636 break;
48637 case E_V8SImode:
48638 if (d->perm[0])
48639 gen = gen_vec_interleave_highv8si;
48640 else
48641 gen = gen_vec_interleave_lowv8si;
48642 break;
48643 case E_V4DImode:
48644 if (d->perm[0])
48645 gen = gen_vec_interleave_highv4di;
48646 else
48647 gen = gen_vec_interleave_lowv4di;
48648 break;
48649 case E_V8SFmode:
48650 if (d->perm[0])
48651 gen = gen_vec_interleave_highv8sf;
48652 else
48653 gen = gen_vec_interleave_lowv8sf;
48654 break;
48655 case E_V4DFmode:
48656 if (d->perm[0])
48657 gen = gen_vec_interleave_highv4df;
48658 else
48659 gen = gen_vec_interleave_lowv4df;
48660 break;
48661 default:
48662 gcc_unreachable ();
48665 emit_insn (gen (d->target, d->op0, d->op1));
48666 return true;
48669 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
48670 a single vector permutation using a single intra-lane vector
48671 permutation, vperm2f128 swapping the lanes and vblend* insn blending
48672 the non-swapped and swapped vectors together. */
48674 static bool
48675 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48677 struct expand_vec_perm_d dfirst, dsecond;
48678 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48679 rtx_insn *seq;
48680 bool ok;
48681 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48683 if (!TARGET_AVX
48684 || TARGET_AVX2
48685 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48686 || !d->one_operand_p)
48687 return false;
48689 dfirst = *d;
48690 for (i = 0; i < nelt; i++)
48691 dfirst.perm[i] = 0xff;
48692 for (i = 0, msk = 0; i < nelt; i++)
48694 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48695 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48696 return false;
48697 dfirst.perm[j] = d->perm[i];
48698 if (j != i)
48699 msk |= (1 << i);
48701 for (i = 0; i < nelt; i++)
48702 if (dfirst.perm[i] == 0xff)
48703 dfirst.perm[i] = i;
48705 if (!d->testing_p)
48706 dfirst.target = gen_reg_rtx (dfirst.vmode);
48708 start_sequence ();
48709 ok = expand_vec_perm_1 (&dfirst);
48710 seq = get_insns ();
48711 end_sequence ();
48713 if (!ok)
48714 return false;
48716 if (d->testing_p)
48717 return true;
48719 emit_insn (seq);
48721 dsecond = *d;
48722 dsecond.op0 = dfirst.target;
48723 dsecond.op1 = dfirst.target;
48724 dsecond.one_operand_p = true;
48725 dsecond.target = gen_reg_rtx (dsecond.vmode);
48726 for (i = 0; i < nelt; i++)
48727 dsecond.perm[i] = i ^ nelt2;
48729 ok = expand_vec_perm_1 (&dsecond);
48730 gcc_assert (ok);
48732 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48733 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48734 return true;
48737 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
48738 permutation using two vperm2f128, followed by a vshufpd insn blending
48739 the two vectors together. */
48741 static bool
48742 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48744 struct expand_vec_perm_d dfirst, dsecond, dthird;
48745 bool ok;
48747 if (!TARGET_AVX || (d->vmode != V4DFmode))
48748 return false;
48750 if (d->testing_p)
48751 return true;
48753 dfirst = *d;
48754 dsecond = *d;
48755 dthird = *d;
48757 dfirst.perm[0] = (d->perm[0] & ~1);
48758 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48759 dfirst.perm[2] = (d->perm[2] & ~1);
48760 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48761 dsecond.perm[0] = (d->perm[1] & ~1);
48762 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48763 dsecond.perm[2] = (d->perm[3] & ~1);
48764 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48765 dthird.perm[0] = (d->perm[0] % 2);
48766 dthird.perm[1] = (d->perm[1] % 2) + 4;
48767 dthird.perm[2] = (d->perm[2] % 2) + 2;
48768 dthird.perm[3] = (d->perm[3] % 2) + 6;
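/* E.g. for d->perm = { 2 4 1 7 } this yields dfirst = { 2 3 0 1 },
   dsecond = { 4 5 6 7 } and dthird = { 0 4 3 7 }, the last of which
   is a single vshufpd on the two intermediate results.  */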
48770 dfirst.target = gen_reg_rtx (dfirst.vmode);
48771 dsecond.target = gen_reg_rtx (dsecond.vmode);
48772 dthird.op0 = dfirst.target;
48773 dthird.op1 = dsecond.target;
48774 dthird.one_operand_p = false;
48776 canonicalize_perm (&dfirst);
48777 canonicalize_perm (&dsecond);
48779 ok = expand_vec_perm_1 (&dfirst)
48780 && expand_vec_perm_1 (&dsecond)
48781 && expand_vec_perm_1 (&dthird);
48783 gcc_assert (ok);
48785 return true;
48788 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48789 permutation with two pshufb insns and an ior. We should have already
48790 failed all two instruction sequences. */
48792 static bool
48793 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48795 rtx rperm[2][16], vperm, l, h, op, m128;
48796 unsigned int i, nelt, eltsz;
48798 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48799 return false;
48800 gcc_assert (!d->one_operand_p);
48802 if (d->testing_p)
48803 return true;
48805 nelt = d->nelt;
48806 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48808 /* Generate two permutation masks. If the required element is within
48809 the given vector it is shuffled into the proper lane. If the required
48810 element is in the other vector, force a zero into the lane by setting
48811 bit 7 in the permutation mask. */
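/* For the even-word extraction { 0 2 4 6 8 10 12 14 } of two
   V8HImode operands, for instance, the first control vector is
   { 0 1 4 5 8 9 12 13 -128 ... -128 } and the second is
   { -128 ... -128 0 1 4 5 8 9 12 13 }; the ior then merges the two
   shuffled halves.  */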
48812 m128 = GEN_INT (-128);
48813 for (i = 0; i < nelt; ++i)
48815 unsigned j, e = d->perm[i];
48816 unsigned which = (e >= nelt);
48817 if (e >= nelt)
48818 e -= nelt;
48820 for (j = 0; j < eltsz; ++j)
48822 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48823 rperm[1-which][i*eltsz + j] = m128;
48827 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48828 vperm = force_reg (V16QImode, vperm);
48830 l = gen_reg_rtx (V16QImode);
48831 op = gen_lowpart (V16QImode, d->op0);
48832 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48834 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48835 vperm = force_reg (V16QImode, vperm);
48837 h = gen_reg_rtx (V16QImode);
48838 op = gen_lowpart (V16QImode, d->op1);
48839 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48841 op = d->target;
48842 if (d->vmode != V16QImode)
48843 op = gen_reg_rtx (V16QImode);
48844 emit_insn (gen_iorv16qi3 (op, l, h));
48845 if (op != d->target)
48846 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48848 return true;
48851 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48852 with two vpshufb insns, vpermq and vpor. We should have already failed
48853 all two or three instruction sequences. */
48855 static bool
48856 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48858 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48859 unsigned int i, nelt, eltsz;
48861 if (!TARGET_AVX2
48862 || !d->one_operand_p
48863 || (d->vmode != V32QImode && d->vmode != V16HImode))
48864 return false;
48866 if (d->testing_p)
48867 return true;
48869 nelt = d->nelt;
48870 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48872 /* Generate two permutation masks. If the required element is within
48873 the same lane, it is shuffled in. If the required element is from the
48874 other lane, force a zero by setting bit 7 in the permutation mask.
48875 In the other mask, the mask has non-negative elements if the element
48876 is requested from the other lane, but it is also moved to the other lane,
48877 so that the result of vpshufb can have the two V2TImode halves
48878 swapped. */
48879 m128 = GEN_INT (-128);
48880 for (i = 0; i < nelt; ++i)
48882 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48883 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48885 for (j = 0; j < eltsz; ++j)
48887 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48888 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48892 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48893 vperm = force_reg (V32QImode, vperm);
48895 h = gen_reg_rtx (V32QImode);
48896 op = gen_lowpart (V32QImode, d->op0);
48897 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48899 /* Swap the 128-bit lanes of h into hp. */
48900 hp = gen_reg_rtx (V4DImode);
48901 op = gen_lowpart (V4DImode, h);
48902 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48903 const1_rtx));
48905 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48906 vperm = force_reg (V32QImode, vperm);
48908 l = gen_reg_rtx (V32QImode);
48909 op = gen_lowpart (V32QImode, d->op0);
48910 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48912 op = d->target;
48913 if (d->vmode != V32QImode)
48914 op = gen_reg_rtx (V32QImode);
48915 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48916 if (op != d->target)
48917 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48919 return true;
48922 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48923 and extract-odd permutations of two V32QImode or V16HImode operands
48924 with two vpshufb insns, vpor and vpermq. We should have already
48925 failed all two or three instruction sequences. */
48927 static bool
48928 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48930 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48931 unsigned int i, nelt, eltsz;
48933 if (!TARGET_AVX2
48934 || d->one_operand_p
48935 || (d->vmode != V32QImode && d->vmode != V16HImode))
48936 return false;
48938 for (i = 0; i < d->nelt; ++i)
48939 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48940 return false;
48942 if (d->testing_p)
48943 return true;
48945 nelt = d->nelt;
48946 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48948 /* Generate two permutation masks. In the first permutation mask
48949 the first quarter will contain indexes for the first half
48950 of the op0, the second quarter will contain bit 7 set, third quarter
48951 will contain indexes for the second half of the op0 and the
48952 last quarter bit 7 set. In the second permutation mask
48953 the first quarter will contain bit 7 set, the second quarter
48954 indexes for the first half of the op1, the third quarter bit 7 set
48955 and last quarter indexes for the second half of the op1.
48956 I.e. the first mask e.g. for V32QImode extract even will be:
48957 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48958 (all values masked with 0xf except for -128) and second mask
48959 for extract even will be
48960 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48961 m128 = GEN_INT (-128);
48962 for (i = 0; i < nelt; ++i)
48964 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48965 unsigned which = d->perm[i] >= nelt;
48966 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48968 for (j = 0; j < eltsz; ++j)
48970 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48971 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48975 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48976 vperm = force_reg (V32QImode, vperm);
48978 l = gen_reg_rtx (V32QImode);
48979 op = gen_lowpart (V32QImode, d->op0);
48980 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48982 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48983 vperm = force_reg (V32QImode, vperm);
48985 h = gen_reg_rtx (V32QImode);
48986 op = gen_lowpart (V32QImode, d->op1);
48987 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48989 ior = gen_reg_rtx (V32QImode);
48990 emit_insn (gen_iorv32qi3 (ior, l, h));
48992 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48993 op = gen_reg_rtx (V4DImode);
48994 ior = gen_lowpart (V4DImode, ior);
48995 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48996 const1_rtx, GEN_INT (3)));
48997 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48999 return true;
49002 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
49003 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
49004 with two "and" and "pack" or two "shift" and "pack" insns. We should
49005 have already failed all two instruction sequences. */
49007 static bool
49008 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
49010 rtx op, dop0, dop1, t, rperm[16];
49011 unsigned i, odd, c, s, nelt = d->nelt;
49012 bool end_perm = false;
49013 machine_mode half_mode;
49014 rtx (*gen_and) (rtx, rtx, rtx);
49015 rtx (*gen_pack) (rtx, rtx, rtx);
49016 rtx (*gen_shift) (rtx, rtx, rtx);
49018 if (d->one_operand_p)
49019 return false;
49021 switch (d->vmode)
49023 case E_V8HImode:
49024 /* Required for "pack". */
49025 if (!TARGET_SSE4_1)
49026 return false;
49027 c = 0xffff;
49028 s = 16;
49029 half_mode = V4SImode;
49030 gen_and = gen_andv4si3;
49031 gen_pack = gen_sse4_1_packusdw;
49032 gen_shift = gen_lshrv4si3;
49033 break;
49034 case E_V16QImode:
49035 /* No check as all instructions are SSE2. */
49036 c = 0xff;
49037 s = 8;
49038 half_mode = V8HImode;
49039 gen_and = gen_andv8hi3;
49040 gen_pack = gen_sse2_packuswb;
49041 gen_shift = gen_lshrv8hi3;
49042 break;
49043 case E_V16HImode:
49044 if (!TARGET_AVX2)
49045 return false;
49046 c = 0xffff;
49047 s = 16;
49048 half_mode = V8SImode;
49049 gen_and = gen_andv8si3;
49050 gen_pack = gen_avx2_packusdw;
49051 gen_shift = gen_lshrv8si3;
49052 end_perm = true;
49053 break;
49054 case E_V32QImode:
49055 if (!TARGET_AVX2)
49056 return false;
49057 c = 0xff;
49058 s = 8;
49059 half_mode = V16HImode;
49060 gen_and = gen_andv16hi3;
49061 gen_pack = gen_avx2_packuswb;
49062 gen_shift = gen_lshrv16hi3;
49063 end_perm = true;
49064 break;
49065 default:
49066 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
49067 general shuffles. */
49068 return false;
49071 /* Check that permutation is even or odd. */
49072 odd = d->perm[0];
49073 if (odd > 1)
49074 return false;
49076 for (i = 1; i < nelt; ++i)
49077 if (d->perm[i] != 2 * i + odd)
49078 return false;
49080 if (d->testing_p)
49081 return true;
49083 dop0 = gen_reg_rtx (half_mode);
49084 dop1 = gen_reg_rtx (half_mode);
49085 if (odd == 0)
49087 for (i = 0; i < nelt / 2; i++)
49088 rperm[i] = GEN_INT (c);
49089 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
49090 t = force_reg (half_mode, t);
49091 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
49092 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
49094 else
49096 emit_insn (gen_shift (dop0,
49097 gen_lowpart (half_mode, d->op0),
49098 GEN_INT (s)));
49099 emit_insn (gen_shift (dop1,
49100 gen_lowpart (half_mode, d->op1),
49101 GEN_INT (s)));
49103 /* In the AVX2 256-bit case we need to permute the pack result. */
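/* The 256-bit vpackus* insns pack within each 128-bit lane, so the
   V4DImode quarters come out as { A0 B0 A1 B1 }; the vpermq with
   { 0 2 1 3 } below reorders them to the desired { A0 A1 B0 B1 }.  */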
49104 if (TARGET_AVX2 && end_perm)
49106 op = gen_reg_rtx (d->vmode);
49107 t = gen_reg_rtx (V4DImode);
49108 emit_insn (gen_pack (op, dop0, dop1));
49109 emit_insn (gen_avx2_permv4di_1 (t,
49110 gen_lowpart (V4DImode, op),
49111 const0_rtx,
49112 const2_rtx,
49113 const1_rtx,
49114 GEN_INT (3)));
49115 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
49117 else
49118 emit_insn (gen_pack (d->target, dop0, dop1));
49120 return true;
49123 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
49124 and extract-odd permutations of two V64QI operands
49125 with two "shifts", two "truncs" and one "concat" insns for "odd"
49126 and two "truncs" and one "concat" insn for "even".
49127 We should have already failed all two instruction sequences. */
49129 static bool
49130 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
49132 rtx t1, t2, t3, t4;
49133 unsigned i, odd, nelt = d->nelt;
49135 if (!TARGET_AVX512BW
49136 || d->one_operand_p
49137 || d->vmode != V64QImode)
49138 return false;
49140 /* Check that permutation is even or odd. */
49141 odd = d->perm[0];
49142 if (odd > 1)
49143 return false;
49145 for (i = 1; i < nelt; ++i)
49146 if (d->perm[i] != 2 * i + odd)
49147 return false;
49149 if (d->testing_p)
49150 return true;
49153 if (odd)
49155 t1 = gen_reg_rtx (V32HImode);
49156 t2 = gen_reg_rtx (V32HImode);
49157 emit_insn (gen_lshrv32hi3 (t1,
49158 gen_lowpart (V32HImode, d->op0),
49159 GEN_INT (8)));
49160 emit_insn (gen_lshrv32hi3 (t2,
49161 gen_lowpart (V32HImode, d->op1),
49162 GEN_INT (8)));
49164 else
49166 t1 = gen_lowpart (V32HImode, d->op0);
49167 t2 = gen_lowpart (V32HImode, d->op1);
49170 t3 = gen_reg_rtx (V32QImode);
49171 t4 = gen_reg_rtx (V32QImode);
49172 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
49173 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
49174 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
49176 return true;
49179 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
49180 and extract-odd permutations. */
49182 static bool
49183 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
49185 rtx t1, t2, t3, t4, t5;
49187 switch (d->vmode)
49189 case E_V4DFmode:
49190 if (d->testing_p)
49191 break;
49192 t1 = gen_reg_rtx (V4DFmode);
49193 t2 = gen_reg_rtx (V4DFmode);
49195 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49196 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
49197 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
49199 /* Now an unpck[lh]pd will produce the result required. */
49200 if (odd)
49201 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
49202 else
49203 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
49204 emit_insn (t3);
49205 break;
49207 case E_V8SFmode:
49209 int mask = odd ? 0xdd : 0x88;
49211 if (d->testing_p)
49212 break;
49213 t1 = gen_reg_rtx (V8SFmode);
49214 t2 = gen_reg_rtx (V8SFmode);
49215 t3 = gen_reg_rtx (V8SFmode);
49217 /* Shuffle within the 128-bit lanes to produce:
49218 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
49219 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
49220 GEN_INT (mask)));
49222 /* Shuffle the lanes around to produce:
49223 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
49224 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
49225 GEN_INT (0x3)));
49227 /* Shuffle within the 128-bit lanes to produce:
49228 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
49229 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
49231 /* Shuffle within the 128-bit lanes to produce:
49232 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
49233 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
49235 /* Shuffle the lanes around to produce:
49236 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
49237 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
49238 GEN_INT (0x20)));
49240 break;
49242 case E_V2DFmode:
49243 case E_V4SFmode:
49244 case E_V2DImode:
49245 case E_V4SImode:
49246 /* These are always directly implementable by expand_vec_perm_1. */
49247 gcc_unreachable ();
49249 case E_V8HImode:
49250 if (TARGET_SSE4_1)
49251 return expand_vec_perm_even_odd_pack (d);
49252 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
49253 return expand_vec_perm_pshufb2 (d);
49254 else
49256 if (d->testing_p)
49257 break;
49258 /* We need 2*log2(N)-1 operations to achieve odd/even
49259 with interleave. */
49260 t1 = gen_reg_rtx (V8HImode);
49261 t2 = gen_reg_rtx (V8HImode);
49262 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
49263 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
49264 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
49265 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
49266 if (odd)
49267 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
49268 else
49269 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
49270 emit_insn (t3);
49272 break;
49274 case E_V16QImode:
49275 return expand_vec_perm_even_odd_pack (d);
49277 case E_V16HImode:
49278 case E_V32QImode:
49279 return expand_vec_perm_even_odd_pack (d);
49281 case E_V64QImode:
49282 return expand_vec_perm_even_odd_trunc (d);
49284 case E_V4DImode:
49285 if (!TARGET_AVX2)
49287 struct expand_vec_perm_d d_copy = *d;
49288 d_copy.vmode = V4DFmode;
49289 if (d->testing_p)
49290 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
49291 else
49292 d_copy.target = gen_reg_rtx (V4DFmode);
49293 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
49294 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
49295 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49297 if (!d->testing_p)
49298 emit_move_insn (d->target,
49299 gen_lowpart (V4DImode, d_copy.target));
49300 return true;
49302 return false;
49305 if (d->testing_p)
49306 break;
49308 t1 = gen_reg_rtx (V4DImode);
49309 t2 = gen_reg_rtx (V4DImode);
49311 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49312 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
49313 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
49315 /* Now a vpunpck[lh]qdq will produce the result required. */
49316 if (odd)
49317 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
49318 else
49319 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
49320 emit_insn (t3);
49321 break;
49323 case E_V8SImode:
49324 if (!TARGET_AVX2)
49326 struct expand_vec_perm_d d_copy = *d;
49327 d_copy.vmode = V8SFmode;
49328 if (d->testing_p)
49329 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
49330 else
49331 d_copy.target = gen_reg_rtx (V8SFmode);
49332 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
49333 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
49334 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49336 if (!d->testing_p)
49337 emit_move_insn (d->target,
49338 gen_lowpart (V8SImode, d_copy.target));
49339 return true;
49341 return false;
49344 if (d->testing_p)
49345 break;
49347 t1 = gen_reg_rtx (V8SImode);
49348 t2 = gen_reg_rtx (V8SImode);
49349 t3 = gen_reg_rtx (V4DImode);
49350 t4 = gen_reg_rtx (V4DImode);
49351 t5 = gen_reg_rtx (V4DImode);
49353 /* Shuffle the lanes around into
49354 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
49355 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
49356 gen_lowpart (V4DImode, d->op1),
49357 GEN_INT (0x20)));
49358 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
49359 gen_lowpart (V4DImode, d->op1),
49360 GEN_INT (0x31)));
49362 /* Swap the 2nd and 3rd position in each lane into
49363 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
49364 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
49365 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49366 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
49367 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
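/* The pshufd immediate is 0 + 2*4 + 1*16 + 3*64 == 0xd8, i.e. the
   per-lane element order { 0, 2, 1, 3 }, which swaps the two middle
   dwords within each 128-bit lane.  */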
49369 /* Now a vpunpck[lh]qdq will produce
49370 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
49371 if (odd)
49372 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
49373 gen_lowpart (V4DImode, t2));
49374 else
49375 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
49376 gen_lowpart (V4DImode, t2));
49377 emit_insn (t3);
49378 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
49379 break;
49381 default:
49382 gcc_unreachable ();
49385 return true;
49388 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49389 extract-even and extract-odd permutations. */
49391 static bool
49392 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
49394 unsigned i, odd, nelt = d->nelt;
49396 odd = d->perm[0];
49397 if (odd != 0 && odd != 1)
49398 return false;
49400 for (i = 1; i < nelt; ++i)
49401 if (d->perm[i] != 2 * i + odd)
49402 return false;
49404 return expand_vec_perm_even_odd_1 (d, odd);
49407 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
49408 permutations. We assume that expand_vec_perm_1 has already failed. */
49410 static bool
49411 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
49413 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
49414 machine_mode vmode = d->vmode;
49415 unsigned char perm2[4];
49416 rtx op0 = d->op0, dest;
49417 bool ok;
49419 switch (vmode)
49421 case E_V4DFmode:
49422 case E_V8SFmode:
49423 /* These are special-cased in sse.md so that we can optionally
49424 use the vbroadcast instruction. They expand to two insns
49425 if the input happens to be in a register. */
49426 gcc_unreachable ();
49428 case E_V2DFmode:
49429 case E_V2DImode:
49430 case E_V4SFmode:
49431 case E_V4SImode:
49432 /* These are always implementable using standard shuffle patterns. */
49433 gcc_unreachable ();
49435 case E_V8HImode:
49436 case E_V16QImode:
49437 /* These can be implemented via interleave. We save one insn by
49438 stopping once we have promoted to V4SImode and then use pshufd. */
49439 if (d->testing_p)
49440 return true;
49443 rtx dest;
49444 rtx (*gen) (rtx, rtx, rtx)
49445 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
49446 : gen_vec_interleave_lowv8hi;
49448 if (elt >= nelt2)
49450 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
49451 : gen_vec_interleave_highv8hi;
49452 elt -= nelt2;
49454 nelt2 /= 2;
49456 dest = gen_reg_rtx (vmode);
49457 emit_insn (gen (dest, op0, op0));
49458 vmode = get_mode_wider_vector (vmode);
49459 op0 = gen_lowpart (vmode, dest);
49461 while (vmode != V4SImode);
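/* Worked example: broadcasting byte 5 of a V16QImode vector takes two
   rounds.  Round 1 (nelt2 == 8, elt == 5 < 8) uses the low interleave,
   leaving the byte replicated in halfword 5 of the V8HImode view; round 2
   (nelt2 == 4, elt >= 4) uses the high interleave and sets elt to 1,
   leaving it in dword 1 of the V4SImode view, which the pshufd below then
   broadcasts.  */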
49463 memset (perm2, elt, 4);
49464 dest = gen_reg_rtx (V4SImode);
49465 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
49466 gcc_assert (ok);
49467 if (!d->testing_p)
49468 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
49469 return true;
49471 case E_V64QImode:
49472 case E_V32QImode:
49473 case E_V16HImode:
49474 case E_V8SImode:
49475 case E_V4DImode:
49476 /* For AVX2 broadcasts of the first element vpbroadcast* or
49477 vpermq should be used by expand_vec_perm_1. */
49478 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49479 return false;
49481 default:
49482 gcc_unreachable ();
49486 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49487 broadcast permutations. */
49489 static bool
49490 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49492 unsigned i, elt, nelt = d->nelt;
49494 if (!d->one_operand_p)
49495 return false;
49497 elt = d->perm[0];
49498 for (i = 1; i < nelt; ++i)
49499 if (d->perm[i] != elt)
49500 return false;
49502 return expand_vec_perm_broadcast_1 (d);
49505 /* Implement arbitrary permutations of two V64QImode operands
49506 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49507 static bool
49508 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49510 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49511 return false;
49513 if (d->testing_p)
49514 return true;
49516 struct expand_vec_perm_d ds[2];
49517 rtx rperm[128], vperm, target0, target1;
49518 unsigned int i, nelt;
49519 machine_mode vmode;
49521 nelt = d->nelt;
49522 vmode = V64QImode;
49524 for (i = 0; i < 2; i++)
49526 ds[i] = *d;
49527 ds[i].vmode = V32HImode;
49528 ds[i].nelt = 32;
49529 ds[i].target = gen_reg_rtx (V32HImode);
49530 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49531 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49534 /* Prepare permutations such that the first one takes care of
49535 putting the even bytes into the right positions or one higher
49536 positions (ds[0]) and the second one takes care of
49537 putting the odd bytes into the right positions or one below
49538 (ds[1]). */
49540 for (i = 0; i < nelt; i++)
49542 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49543 if (i & 1)
49545 rperm[i] = constm1_rtx;
49546 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49548 else
49550 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49551 rperm[i + 64] = constm1_rtx;
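/* For instance, if d->perm[6] == 13 then ds[0].perm[3] == 6, so after the
   word-level permutation word 3 of ds[0].target holds source bytes 12 and
   13; rperm[6] == (6 & 14) + (13 & 1) == 7 then selects the odd byte of
   that word (byte 13) for result position 6.  */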
49555 bool ok = expand_vec_perm_1 (&ds[0]);
49556 gcc_assert (ok);
49557 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49559 ok = expand_vec_perm_1 (&ds[1]);
49560 gcc_assert (ok);
49561 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49563 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49564 vperm = force_reg (vmode, vperm);
49565 target0 = gen_reg_rtx (V64QImode);
49566 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49568 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49569 vperm = force_reg (vmode, vperm);
49570 target1 = gen_reg_rtx (V64QImode);
49571 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49573 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49574 return true;
49577 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
49578 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49579 all the shorter instruction sequences. */
49581 static bool
49582 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49584 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49585 unsigned int i, nelt, eltsz;
49586 bool used[4];
49588 if (!TARGET_AVX2
49589 || d->one_operand_p
49590 || (d->vmode != V32QImode && d->vmode != V16HImode))
49591 return false;
49593 if (d->testing_p)
49594 return true;
49596 nelt = d->nelt;
49597 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49599 /* Generate 4 permutation masks. If the required element is within
49600 the same lane, it is shuffled in.  If the required element is from the
49601 other lane, force a zero by setting bit 7 in the permutation mask.
49602 In the other mask, the elements are non-negative if the element is
49603 requested from the other lane; it is also moved to the other lane,
49604 so that the result of vpshufb can have the two V2TImode halves
49605 swapped. */
49606 m128 = GEN_INT (-128);
49607 for (i = 0; i < 32; ++i)
49609 rperm[0][i] = m128;
49610 rperm[1][i] = m128;
49611 rperm[2][i] = m128;
49612 rperm[3][i] = m128;
49614 used[0] = false;
49615 used[1] = false;
49616 used[2] = false;
49617 used[3] = false;
49618 for (i = 0; i < nelt; ++i)
49620 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49621 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49622 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49624 for (j = 0; j < eltsz; ++j)
49625 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49626 used[which] = true;
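/* Worked example for V32QImode: if d->perm[20] == 5 (an op0 element in
   the other 128-bit lane), then e == 5, xlane == 16 and which == 1, so
   rperm[1][4] == 5.  The pshufb using mask 1 fetches op0 byte 5 into
   position 4, and the lane swap below moves it to position 20.  */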
49629 for (i = 0; i < 2; ++i)
49631 if (!used[2 * i + 1])
49633 h[i] = NULL_RTX;
49634 continue;
49636 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49637 gen_rtvec_v (32, rperm[2 * i + 1]));
49638 vperm = force_reg (V32QImode, vperm);
49639 h[i] = gen_reg_rtx (V32QImode);
49640 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49641 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49644 /* Swap the 128-bit lanes of h[X]. */
49645 for (i = 0; i < 2; ++i)
49647 if (h[i] == NULL_RTX)
49648 continue;
49649 op = gen_reg_rtx (V4DImode);
49650 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49651 const2_rtx, GEN_INT (3), const0_rtx,
49652 const1_rtx));
49653 h[i] = gen_lowpart (V32QImode, op);
49656 for (i = 0; i < 2; ++i)
49658 if (!used[2 * i])
49660 l[i] = NULL_RTX;
49661 continue;
49663 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49664 vperm = force_reg (V32QImode, vperm);
49665 l[i] = gen_reg_rtx (V32QImode);
49666 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49667 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49670 for (i = 0; i < 2; ++i)
49672 if (h[i] && l[i])
49674 op = gen_reg_rtx (V32QImode);
49675 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49676 l[i] = op;
49678 else if (h[i])
49679 l[i] = h[i];
49682 gcc_assert (l[0] && l[1]);
49683 op = d->target;
49684 if (d->vmode != V32QImode)
49685 op = gen_reg_rtx (V32QImode);
49686 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49687 if (op != d->target)
49688 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49689 return true;
49692 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49693 With all of the interface bits taken care of, perform the expansion
49694 in D and return true on success. */
49696 static bool
49697 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49699 /* Try a single instruction expansion. */
49700 if (expand_vec_perm_1 (d))
49701 return true;
49703 /* Try sequences of two instructions. */
49705 if (expand_vec_perm_pshuflw_pshufhw (d))
49706 return true;
49708 if (expand_vec_perm_palignr (d, false))
49709 return true;
49711 if (expand_vec_perm_interleave2 (d))
49712 return true;
49714 if (expand_vec_perm_broadcast (d))
49715 return true;
49717 if (expand_vec_perm_vpermq_perm_1 (d))
49718 return true;
49720 if (expand_vec_perm_vperm2f128 (d))
49721 return true;
49723 if (expand_vec_perm_pblendv (d))
49724 return true;
49726 /* Try sequences of three instructions. */
49728 if (expand_vec_perm_even_odd_pack (d))
49729 return true;
49731 if (expand_vec_perm_2vperm2f128_vshuf (d))
49732 return true;
49734 if (expand_vec_perm_pshufb2 (d))
49735 return true;
49737 if (expand_vec_perm_interleave3 (d))
49738 return true;
49740 if (expand_vec_perm_vperm2f128_vblend (d))
49741 return true;
49743 /* Try sequences of four instructions. */
49745 if (expand_vec_perm_even_odd_trunc (d))
49746 return true;
49747 if (expand_vec_perm_vpshufb2_vpermq (d))
49748 return true;
49750 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49751 return true;
49753 if (expand_vec_perm_vpermi2_vpshub2 (d))
49754 return true;
49756 /* ??? Look for narrow permutations whose element orderings would
49757 allow the promotion to a wider mode. */
49759 /* ??? Look for sequences of interleave or a wider permute that place
49760 the data into the correct lanes for a half-vector shuffle like
49761 pshuf[lh]w or vpermilps. */
49763 /* ??? Look for sequences of interleave that produce the desired results.
49764 The combinatorics of punpck[lh] get pretty ugly... */
49766 if (expand_vec_perm_even_odd (d))
49767 return true;
49769 /* Even longer sequences. */
49770 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49771 return true;
49773 /* See if we can get the same permutation in different vector integer
49774 mode. */
49775 struct expand_vec_perm_d nd;
49776 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49778 if (!d->testing_p)
49779 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49780 return true;
49783 return false;
49786 /* If a permutation only uses one operand, make it clear. Returns true
49787 if the permutation references both operands. */
49789 static bool
49790 canonicalize_perm (struct expand_vec_perm_d *d)
49792 int i, which, nelt = d->nelt;
49794 for (i = which = 0; i < nelt; ++i)
49795 which |= (d->perm[i] < nelt ? 1 : 2);
49797 d->one_operand_p = true;
49798 switch (which)
49800 default:
49801 gcc_unreachable();
49803 case 3:
49804 if (!rtx_equal_p (d->op0, d->op1))
49806 d->one_operand_p = false;
49807 break;
49809 /* The elements of PERM do not suggest that only the first operand
49810 is used, but both operands are identical. Allow easier matching
49811 of the permutation by folding the permutation into the single
49812 input vector. */
49813 /* FALLTHRU */
49815 case 2:
49816 for (i = 0; i < nelt; ++i)
49817 d->perm[i] &= nelt - 1;
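/* E.g. with nelt == 4, a selector { 0, 5, 2, 7 } on identical operands
   is folded to { 0, 1, 2, 3 } here.  */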
49818 d->op0 = d->op1;
49819 break;
49821 case 1:
49822 d->op1 = d->op0;
49823 break;
49826 return (which == 3);
49829 bool
49830 ix86_expand_vec_perm_const (rtx operands[4])
49832 struct expand_vec_perm_d d;
49833 unsigned char perm[MAX_VECT_LEN];
49834 int i, nelt;
49835 bool two_args;
49836 rtx sel;
49838 d.target = operands[0];
49839 d.op0 = operands[1];
49840 d.op1 = operands[2];
49841 sel = operands[3];
49843 d.vmode = GET_MODE (d.target);
49844 gcc_assert (VECTOR_MODE_P (d.vmode));
49845 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49846 d.testing_p = false;
49848 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49849 gcc_assert (XVECLEN (sel, 0) == nelt);
49850 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49852 for (i = 0; i < nelt; ++i)
49854 rtx e = XVECEXP (sel, 0, i);
49855 int ei = INTVAL (e) & (2 * nelt - 1);
49856 d.perm[i] = ei;
49857 perm[i] = ei;
49860 two_args = canonicalize_perm (&d);
49862 if (ix86_expand_vec_perm_const_1 (&d))
49863 return true;
49865 /* If the selector says both arguments are needed, but the operands are the
49866 same, the above tried to expand with one_operand_p and flattened selector.
49867 If that didn't work, retry without one_operand_p; we succeeded with that
49868 during testing. */
49869 if (two_args && d.one_operand_p)
49871 d.one_operand_p = false;
49872 memcpy (d.perm, perm, sizeof (perm));
49873 return ix86_expand_vec_perm_const_1 (&d);
49876 return false;
49879 /* Implement targetm.vectorize.vec_perm_const_ok. */
49881 static bool
49882 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
49883 const unsigned char *sel)
49885 struct expand_vec_perm_d d;
49886 unsigned int i, nelt, which;
49887 bool ret;
49889 d.vmode = vmode;
49890 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49891 d.testing_p = true;
49893 /* Given sufficient ISA support we can just return true here
49894 for selected vector modes. */
49895 switch (d.vmode)
49897 case E_V16SFmode:
49898 case E_V16SImode:
49899 case E_V8DImode:
49900 case E_V8DFmode:
49901 if (TARGET_AVX512F)
49902 /* All implementable with a single vpermi2 insn. */
49903 return true;
49904 break;
49905 case E_V32HImode:
49906 if (TARGET_AVX512BW)
49907 /* All implementable with a single vpermi2 insn. */
49908 return true;
49909 break;
49910 case E_V64QImode:
49911 if (TARGET_AVX512BW)
49912 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
49913 return true;
49914 break;
49915 case E_V8SImode:
49916 case E_V8SFmode:
49917 case E_V4DFmode:
49918 case E_V4DImode:
49919 if (TARGET_AVX512VL)
49920 /* All implementable with a single vpermi2 insn. */
49921 return true;
49922 break;
49923 case E_V16HImode:
49924 if (TARGET_AVX2)
49925 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49926 return true;
49927 break;
49928 case E_V32QImode:
49929 if (TARGET_AVX2)
49930 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49931 return true;
49932 break;
49933 case E_V4SImode:
49934 case E_V4SFmode:
49935 case E_V8HImode:
49936 case E_V16QImode:
49937 /* All implementable with a single vpperm insn. */
49938 if (TARGET_XOP)
49939 return true;
49940 /* All implementable with 2 pshufb + 1 ior. */
49941 if (TARGET_SSSE3)
49942 return true;
49943 break;
49944 case E_V2DImode:
49945 case E_V2DFmode:
49946 /* All implementable with shufpd or unpck[lh]pd. */
49947 return true;
49948 default:
49949 return false;
49952 /* Extract the values from the vector CST into the permutation
49953 array in D. */
49954 memcpy (d.perm, sel, nelt);
49955 for (i = which = 0; i < nelt; ++i)
49957 unsigned char e = d.perm[i];
49958 gcc_assert (e < 2 * nelt);
49959 which |= (e < nelt ? 1 : 2);
49962 /* For all elements from second vector, fold the elements to first. */
49963 if (which == 2)
49964 for (i = 0; i < nelt; ++i)
49965 d.perm[i] -= nelt;
49967 /* Check whether the mask can be applied to the vector type. */
49968 d.one_operand_p = (which != 3);
49970 /* Implementable with shufps or pshufd. */
49971 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49972 return true;
49974 /* Otherwise we have to go through the motions and see if we can
49975 figure out how to generate the requested permutation. */
49976 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49977 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49978 if (!d.one_operand_p)
49979 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49981 start_sequence ();
49982 ret = ix86_expand_vec_perm_const_1 (&d);
49983 end_sequence ();
49985 return ret;
49988 void
49989 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49991 struct expand_vec_perm_d d;
49992 unsigned i, nelt;
49994 d.target = targ;
49995 d.op0 = op0;
49996 d.op1 = op1;
49997 d.vmode = GET_MODE (targ);
49998 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49999 d.one_operand_p = false;
50000 d.testing_p = false;
50002 for (i = 0; i < nelt; ++i)
50003 d.perm[i] = i * 2 + odd;
50005 /* We'll either be able to implement the permutation directly... */
50006 if (expand_vec_perm_1 (&d))
50007 return;
50009 /* ... or we use the special-case patterns. */
50010 expand_vec_perm_even_odd_1 (&d, odd);
50013 static void
50014 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
50016 struct expand_vec_perm_d d;
50017 unsigned i, nelt, base;
50018 bool ok;
50020 d.target = targ;
50021 d.op0 = op0;
50022 d.op1 = op1;
50023 d.vmode = GET_MODE (targ);
50024 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
50025 d.one_operand_p = false;
50026 d.testing_p = false;
50028 base = high_p ? nelt / 2 : 0;
50029 for (i = 0; i < nelt / 2; ++i)
50031 d.perm[i * 2] = i + base;
50032 d.perm[i * 2 + 1] = i + base + nelt;
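/* E.g. for V8SImode with high_p set this builds the selector
   { 4, 12, 5, 13, 6, 14, 7, 15 }, interleaving the high halves of the
   two inputs.  */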
50035 /* Note that for AVX this isn't one instruction. */
50036 ok = ix86_expand_vec_perm_const_1 (&d);
50037 gcc_assert (ok);
50041 /* Expand a vector operation CODE for a V*QImode in terms of the
50042 same operation on V*HImode. */
50044 void
50045 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
50047 machine_mode qimode = GET_MODE (dest);
50048 machine_mode himode;
50049 rtx (*gen_il) (rtx, rtx, rtx);
50050 rtx (*gen_ih) (rtx, rtx, rtx);
50051 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
50052 struct expand_vec_perm_d d;
50053 bool ok, full_interleave;
50054 bool uns_p = false;
50055 int i;
50057 switch (qimode)
50059 case E_V16QImode:
50060 himode = V8HImode;
50061 gen_il = gen_vec_interleave_lowv16qi;
50062 gen_ih = gen_vec_interleave_highv16qi;
50063 break;
50064 case E_V32QImode:
50065 himode = V16HImode;
50066 gen_il = gen_avx2_interleave_lowv32qi;
50067 gen_ih = gen_avx2_interleave_highv32qi;
50068 break;
50069 case E_V64QImode:
50070 himode = V32HImode;
50071 gen_il = gen_avx512bw_interleave_lowv64qi;
50072 gen_ih = gen_avx512bw_interleave_highv64qi;
50073 break;
50074 default:
50075 gcc_unreachable ();
50078 op2_l = op2_h = op2;
50079 switch (code)
50081 case MULT:
50082 /* Unpack data such that we've got a source byte in each low byte of
50083 each word. We don't care what goes into the high byte of each word.
50084 Rather than trying to get zero in there, most convenient is to let
50085 it be a copy of the low byte. */
50086 op2_l = gen_reg_rtx (qimode);
50087 op2_h = gen_reg_rtx (qimode);
50088 emit_insn (gen_il (op2_l, op2, op2));
50089 emit_insn (gen_ih (op2_h, op2, op2));
50090 /* FALLTHRU */
50092 op1_l = gen_reg_rtx (qimode);
50093 op1_h = gen_reg_rtx (qimode);
50094 emit_insn (gen_il (op1_l, op1, op1));
50095 emit_insn (gen_ih (op1_h, op1, op1));
50096 full_interleave = qimode == V16QImode;
50097 break;
50099 case ASHIFT:
50100 case LSHIFTRT:
50101 uns_p = true;
50102 /* FALLTHRU */
50103 case ASHIFTRT:
50104 op1_l = gen_reg_rtx (himode);
50105 op1_h = gen_reg_rtx (himode);
50106 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
50107 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
50108 full_interleave = true;
50109 break;
50110 default:
50111 gcc_unreachable ();
50114 /* Perform the operation. */
50115 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
50116 1, OPTAB_DIRECT);
50117 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
50118 1, OPTAB_DIRECT);
50119 gcc_assert (res_l && res_h);
50121 /* Merge the data back into the right place. */
50122 d.target = dest;
50123 d.op0 = gen_lowpart (qimode, res_l);
50124 d.op1 = gen_lowpart (qimode, res_h);
50125 d.vmode = qimode;
50126 d.nelt = GET_MODE_NUNITS (qimode);
50127 d.one_operand_p = false;
50128 d.testing_p = false;
50130 if (full_interleave)
50132 /* For SSE2, we used a full interleave, so the desired
50133 results are in the even elements. */
50134 for (i = 0; i < d.nelt; ++i)
50135 d.perm[i] = i * 2;
50137 else
50139 /* For AVX, the interleave used above was not cross-lane. So the
50140 extraction is evens but with the second and third quarters swapped.
50141 Happily, that is even one insn shorter than even extraction.
50142 For AVX512BW we have 4 lanes. We extract evens from within a lane,
50143 always first from the first and then from the second source operand,
50144 while the index bits above the low 4 bits remain the same.
50145 Thus, for d.nelt == 32 we want permutation
50146 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
50147 and for d.nelt == 64 we want permutation
50148 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
50149 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
50150 for (i = 0; i < d.nelt; ++i)
50151 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
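/* Sanity check of the formula for d.nelt == 32: i = 0..7 yields
   0,2,...,14; i = 8..15 yields 32,34,...,46; i = 16..23 yields
   16,18,...,30; and i = 24..31 yields 48,50,...,62, matching the
   permutation quoted above.  */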
50154 ok = ix86_expand_vec_perm_const_1 (&d);
50155 gcc_assert (ok);
50157 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50158 gen_rtx_fmt_ee (code, qimode, op1, op2));
50161 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
50162 if op is CONST_VECTOR with all odd elements equal to their
50163 preceding element. */
50165 static bool
50166 const_vector_equal_evenodd_p (rtx op)
50168 machine_mode mode = GET_MODE (op);
50169 int i, nunits = GET_MODE_NUNITS (mode);
50170 if (GET_CODE (op) != CONST_VECTOR
50171 || nunits != CONST_VECTOR_NUNITS (op))
50172 return false;
50173 for (i = 0; i < nunits; i += 2)
50174 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
50175 return false;
50176 return true;
50179 void
50180 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
50181 bool uns_p, bool odd_p)
50183 machine_mode mode = GET_MODE (op1);
50184 machine_mode wmode = GET_MODE (dest);
50185 rtx x;
50186 rtx orig_op1 = op1, orig_op2 = op2;
50188 if (!nonimmediate_operand (op1, mode))
50189 op1 = force_reg (mode, op1);
50190 if (!nonimmediate_operand (op2, mode))
50191 op2 = force_reg (mode, op2);
50193 /* We only play even/odd games with vectors of SImode. */
50194 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
50196 /* If we're looking for the odd results, shift those members down to
50197 the even slots. For some cpus this is faster than a PSHUFD. */
50198 if (odd_p)
50200 /* For XOP use vpmacsdqh, but only for smult, as it is only
50201 signed. */
50202 if (TARGET_XOP && mode == V4SImode && !uns_p)
50204 x = force_reg (wmode, CONST0_RTX (wmode));
50205 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
50206 return;
50209 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
50210 if (!const_vector_equal_evenodd_p (orig_op1))
50211 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
50212 x, NULL, 1, OPTAB_DIRECT);
50213 if (!const_vector_equal_evenodd_p (orig_op2))
50214 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
50215 x, NULL, 1, OPTAB_DIRECT);
50216 op1 = gen_lowpart (mode, op1);
50217 op2 = gen_lowpart (mode, op2);
50220 if (mode == V16SImode)
50222 if (uns_p)
50223 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
50224 else
50225 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
50227 else if (mode == V8SImode)
50229 if (uns_p)
50230 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
50231 else
50232 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
50234 else if (uns_p)
50235 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
50236 else if (TARGET_SSE4_1)
50237 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
50238 else
50240 rtx s1, s2, t0, t1, t2;
50242 /* The easiest way to implement this without PMULDQ is to go through
50243 the motions as if we are performing a full 64-bit multiply. With
50244 the exception that we need to do less shuffling of the elements. */
50246 /* Compute the sign-extension, aka highparts, of the two operands. */
50247 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50248 op1, pc_rtx, pc_rtx);
50249 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50250 op2, pc_rtx, pc_rtx);
50252 /* Multiply LO(A) * HI(B), and vice-versa. */
50253 t1 = gen_reg_rtx (wmode);
50254 t2 = gen_reg_rtx (wmode);
50255 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
50256 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
50258 /* Multiply LO(A) * LO(B). */
50259 t0 = gen_reg_rtx (wmode);
50260 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
50262 /* Combine and shift the highparts into place. */
50263 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
50264 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
50265 1, OPTAB_DIRECT);
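/* Why this works: s1 (resp. s2) is all-ones, i.e. 2^32 - 1, exactly when
   the corresponding op1 (resp. op2) element is negative.  Hence
   (t1 + t2) << 32 is congruent to
   -2^32 * (lo(op2)*[op1<0] + lo(op1)*[op2<0]) mod 2^64, which added to
   the unsigned low product t0 gives the signed product.  */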
50267 /* Combine high and low parts. */
50268 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
50269 return;
50271 emit_insn (x);
50274 void
50275 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
50276 bool uns_p, bool high_p)
50278 machine_mode wmode = GET_MODE (dest);
50279 machine_mode mode = GET_MODE (op1);
50280 rtx t1, t2, t3, t4, mask;
50282 switch (mode)
50284 case E_V4SImode:
50285 t1 = gen_reg_rtx (mode);
50286 t2 = gen_reg_rtx (mode);
50287 if (TARGET_XOP && !uns_p)
50289 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
50290 shuffle the elements once so that all elements are in the right
50291 place for immediate use: { A C B D }. */
50292 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
50293 const1_rtx, GEN_INT (3)));
50294 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
50295 const1_rtx, GEN_INT (3)));
50297 else
50299 /* Put the elements into place for the multiply. */
50300 ix86_expand_vec_interleave (t1, op1, op1, high_p);
50301 ix86_expand_vec_interleave (t2, op2, op2, high_p);
50302 high_p = false;
50304 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
50305 break;
50307 case E_V8SImode:
50308 /* Shuffle the elements between the lanes. After this we
50309 have { A B E F | C D G H } for each operand. */
50310 t1 = gen_reg_rtx (V4DImode);
50311 t2 = gen_reg_rtx (V4DImode);
50312 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
50313 const0_rtx, const2_rtx,
50314 const1_rtx, GEN_INT (3)));
50315 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
50316 const0_rtx, const2_rtx,
50317 const1_rtx, GEN_INT (3)));
50319 /* Shuffle the elements within the lanes. After this we
50320 have { A A B B | C C D D } or { E E F F | G G H H }. */
50321 t3 = gen_reg_rtx (V8SImode);
50322 t4 = gen_reg_rtx (V8SImode);
50323 mask = GEN_INT (high_p
50324 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
50325 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
50326 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
50327 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
50329 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
50330 break;
50332 case E_V8HImode:
50333 case E_V16HImode:
50334 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
50335 uns_p, OPTAB_DIRECT);
50336 t2 = expand_binop (mode,
50337 uns_p ? umul_highpart_optab : smul_highpart_optab,
50338 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
50339 gcc_assert (t1 && t2);
50341 t3 = gen_reg_rtx (mode);
50342 ix86_expand_vec_interleave (t3, t1, t2, high_p);
50343 emit_move_insn (dest, gen_lowpart (wmode, t3));
50344 break;
50346 case E_V16QImode:
50347 case E_V32QImode:
50348 case E_V32HImode:
50349 case E_V16SImode:
50350 case E_V64QImode:
50351 t1 = gen_reg_rtx (wmode);
50352 t2 = gen_reg_rtx (wmode);
50353 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
50354 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
50356 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
50357 break;
50359 default:
50360 gcc_unreachable ();
50364 void
50365 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
50367 rtx res_1, res_2, res_3, res_4;
50369 res_1 = gen_reg_rtx (V4SImode);
50370 res_2 = gen_reg_rtx (V4SImode);
50371 res_3 = gen_reg_rtx (V2DImode);
50372 res_4 = gen_reg_rtx (V2DImode);
50373 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
50374 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
50376 /* Move the results in element 2 down to element 1; we don't care
50377 what goes in elements 2 and 3. Then we can merge the parts
50378 back together with an interleave.
50380 Note that two other sequences were tried:
50381 (1) Use interleaves at the start instead of psrldq, which allows
50382 us to use a single shufps to merge things back at the end.
50383 (2) Use shufps here to combine the two vectors, then pshufd to
50384 put the elements in the correct order.
50385 In both cases the cost of the reformatting stall was too high
50386 and the overall sequence slower. */
50388 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
50389 const0_rtx, const2_rtx,
50390 const0_rtx, const0_rtx));
50391 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
50392 const0_rtx, const2_rtx,
50393 const0_rtx, const0_rtx));
50394 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
50396 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
50399 void
50400 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
50402 machine_mode mode = GET_MODE (op0);
50403 rtx t1, t2, t3, t4, t5, t6;
50405 if (TARGET_AVX512DQ && mode == V8DImode)
50406 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
50407 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
50408 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
50409 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
50410 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
50411 else if (TARGET_XOP && mode == V2DImode)
50413 /* op1: A,B,C,D, op2: E,F,G,H */
50414 op1 = gen_lowpart (V4SImode, op1);
50415 op2 = gen_lowpart (V4SImode, op2);
50417 t1 = gen_reg_rtx (V4SImode);
50418 t2 = gen_reg_rtx (V4SImode);
50419 t3 = gen_reg_rtx (V2DImode);
50420 t4 = gen_reg_rtx (V2DImode);
50422 /* t1: B,A,D,C */
50423 emit_insn (gen_sse2_pshufd_1 (t1, op1,
50424 GEN_INT (1),
50425 GEN_INT (0),
50426 GEN_INT (3),
50427 GEN_INT (2)));
50429 /* t2: (B*E),(A*F),(D*G),(C*H) */
50430 emit_insn (gen_mulv4si3 (t2, t1, op2));
50432 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
50433 emit_insn (gen_xop_phadddq (t3, t2));
50435 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
50436 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
50438 /* Multiply lower parts and add all */
50439 t5 = gen_reg_rtx (V2DImode);
50440 emit_insn (gen_vec_widen_umult_even_v4si (t5,
50441 gen_lowpart (V4SImode, op1),
50442 gen_lowpart (V4SImode, op2)));
50443 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
50446 else
50448 machine_mode nmode;
50449 rtx (*umul) (rtx, rtx, rtx);
50451 if (mode == V2DImode)
50453 umul = gen_vec_widen_umult_even_v4si;
50454 nmode = V4SImode;
50456 else if (mode == V4DImode)
50458 umul = gen_vec_widen_umult_even_v8si;
50459 nmode = V8SImode;
50461 else if (mode == V8DImode)
50463 umul = gen_vec_widen_umult_even_v16si;
50464 nmode = V16SImode;
50466 else
50467 gcc_unreachable ();
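/* The sequence below relies on the identity
   (2^32*hi1 + lo1) * (2^32*hi2 + lo2)
     == lo1*lo2 + 2^32*(hi1*lo2 + hi2*lo1)  (mod 2^64),
   computed per element with widening unsigned multiplies.  */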
50470 /* Multiply low parts. */
50471 t1 = gen_reg_rtx (mode);
50472 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50474 /* Shift input vectors right 32 bits so we can multiply high parts. */
50475 t6 = GEN_INT (32);
50476 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50477 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50479 /* Multiply high parts by low parts. */
50480 t4 = gen_reg_rtx (mode);
50481 t5 = gen_reg_rtx (mode);
50482 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50483 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50485 /* Combine and shift the highparts back. */
50486 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50487 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50489 /* Combine high and low parts. */
50490 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50493 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50494 gen_rtx_MULT (mode, op1, op2));
50497 /* Return 1 if control transfer instruction INSN
50498 should be encoded with bnd prefix.
50499 If insn is NULL then return 1 when control
50500 transfer instructions should be prefixed with
50501 bnd by default for current function. */
50503 bool
50504 ix86_bnd_prefixed_insn_p (rtx insn)
50506 /* For call insns check special flag. */
50507 if (insn && CALL_P (insn))
50509 rtx call = get_call_rtx_from (insn);
50510 if (call)
50511 return CALL_EXPR_WITH_BOUNDS_P (call);
50514 /* All other insns are prefixed only if function is instrumented. */
50515 return chkp_function_instrumented_p (current_function_decl);
50518 /* Calculate integer abs() using only SSE2 instructions. */
50520 void
50521 ix86_expand_sse2_abs (rtx target, rtx input)
50523 machine_mode mode = GET_MODE (target);
50524 rtx tmp0, tmp1, x;
50526 switch (mode)
50528 /* For 32-bit signed integer X, the best way to calculate the absolute
50529 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
50530 case E_V4SImode:
50531 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50532 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50533 NULL, 0, OPTAB_DIRECT);
50534 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50535 NULL, 0, OPTAB_DIRECT);
50536 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50537 target, 0, OPTAB_DIRECT);
50538 break;
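/* E.g. X == -5: the arithmetic shift yields the mask -1, (-1 ^ -5) == 4
   and 4 - (-1) == 5; for non-negative X the mask is 0 and X is returned
   unchanged.  */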
50540 /* For 16-bit signed integer X, the best way to calculate the absolute
50541 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50542 case E_V8HImode:
50543 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50545 x = expand_simple_binop (mode, SMAX, tmp0, input,
50546 target, 0, OPTAB_DIRECT);
50547 break;
50549 /* For 8-bit signed integer X, the best way to calculate the absolute
50550 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50551 as SSE2 provides the PMINUB insn. */
50552 case E_V16QImode:
50553 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50555 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50556 target, 0, OPTAB_DIRECT);
50557 break;
50559 default:
50560 gcc_unreachable ();
50563 if (x != target)
50564 emit_move_insn (target, x);
50567 /* Expand an extract from a vector register through pextr insn.
50568 Return true if successful. */
50570 bool
50571 ix86_expand_pextr (rtx *operands)
50573 rtx dst = operands[0];
50574 rtx src = operands[1];
50576 unsigned int size = INTVAL (operands[2]);
50577 unsigned int pos = INTVAL (operands[3]);
50579 if (SUBREG_P (dst))
50581 /* Reject non-lowpart subregs. */
50582 if (SUBREG_BYTE (dst) > 0)
50583 return false;
50584 dst = SUBREG_REG (dst);
50587 if (SUBREG_P (src))
50589 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50590 src = SUBREG_REG (src);
50593 switch (GET_MODE (src))
50595 case E_V16QImode:
50596 case E_V8HImode:
50597 case E_V4SImode:
50598 case E_V2DImode:
50599 case E_V1TImode:
50600 case E_TImode:
50602 machine_mode srcmode, dstmode;
50603 rtx d, pat;
50605 if (!int_mode_for_size (size, 0).exists (&dstmode))
50606 return false;
50608 switch (dstmode)
50610 case E_QImode:
50611 if (!TARGET_SSE4_1)
50612 return false;
50613 srcmode = V16QImode;
50614 break;
50616 case E_HImode:
50617 if (!TARGET_SSE2)
50618 return false;
50619 srcmode = V8HImode;
50620 break;
50622 case E_SImode:
50623 if (!TARGET_SSE4_1)
50624 return false;
50625 srcmode = V4SImode;
50626 break;
50628 case E_DImode:
50629 gcc_assert (TARGET_64BIT);
50630 if (!TARGET_SSE4_1)
50631 return false;
50632 srcmode = V2DImode;
50633 break;
50635 default:
50636 return false;
50639 /* Reject extractions from misaligned positions. */
50640 if (pos & (size-1))
50641 return false;
50643 if (GET_MODE (dst) == dstmode)
50644 d = dst;
50645 else
50646 d = gen_reg_rtx (dstmode);
50648 /* Construct insn pattern. */
50649 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50650 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
50652 /* Let the rtl optimizers know about the zero extension performed. */
50653 if (dstmode == QImode || dstmode == HImode)
50655 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50656 d = gen_lowpart (SImode, d);
50659 emit_insn (gen_rtx_SET (d, pat));
50661 if (d != dst)
50662 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50663 return true;
50666 default:
50667 return false;
50671 /* Expand an insert into a vector register through pinsr insn.
50672 Return true if successful. */
50674 bool
50675 ix86_expand_pinsr (rtx *operands)
50677 rtx dst = operands[0];
50678 rtx src = operands[3];
50680 unsigned int size = INTVAL (operands[1]);
50681 unsigned int pos = INTVAL (operands[2]);
50683 if (SUBREG_P (dst))
50685 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50686 dst = SUBREG_REG (dst);
50689 switch (GET_MODE (dst))
50691 case E_V16QImode:
50692 case E_V8HImode:
50693 case E_V4SImode:
50694 case E_V2DImode:
50695 case E_V1TImode:
50696 case E_TImode:
50698 machine_mode srcmode, dstmode;
50699 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50700 rtx d;
50702 if (!int_mode_for_size (size, 0).exists (&srcmode))
50703 return false;
50705 switch (srcmode)
50707 case E_QImode:
50708 if (!TARGET_SSE4_1)
50709 return false;
50710 dstmode = V16QImode;
50711 pinsr = gen_sse4_1_pinsrb;
50712 break;
50714 case E_HImode:
50715 if (!TARGET_SSE2)
50716 return false;
50717 dstmode = V8HImode;
50718 pinsr = gen_sse2_pinsrw;
50719 break;
50721 case E_SImode:
50722 if (!TARGET_SSE4_1)
50723 return false;
50724 dstmode = V4SImode;
50725 pinsr = gen_sse4_1_pinsrd;
50726 break;
50728 case E_DImode:
50729 gcc_assert (TARGET_64BIT);
50730 if (!TARGET_SSE4_1)
50731 return false;
50732 dstmode = V2DImode;
50733 pinsr = gen_sse4_1_pinsrq;
50734 break;
50736 default:
50737 return false;
50740 /* Reject insertions to misaligned positions. */
50741 if (pos & (size-1))
50742 return false;
50744 if (SUBREG_P (src))
50746 unsigned int srcpos = SUBREG_BYTE (src);
50748 if (srcpos > 0)
50750 rtx extr_ops[4];
50752 extr_ops[0] = gen_reg_rtx (srcmode);
50753 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50754 extr_ops[2] = GEN_INT (size);
50755 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50757 if (!ix86_expand_pextr (extr_ops))
50758 return false;
50760 src = extr_ops[0];
50762 else
50763 src = gen_lowpart (srcmode, SUBREG_REG (src));
50766 if (GET_MODE (dst) == dstmode)
50767 d = dst;
50768 else
50769 d = gen_reg_rtx (dstmode);
50771 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50772 gen_lowpart (srcmode, src),
50773 GEN_INT (1 << (pos / size))));
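/* The last operand is the vec_merge mask with only bit pos / size set;
   the pinsr insn pattern is expected to convert it back to the
   element-index immediate that the instruction takes.  */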
50774 if (d != dst)
50775 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50776 return true;
50779 default:
50780 return false;
50784 /* This function returns the calling abi specific va_list type node.
50785 It returns the FNDECL specific va_list type. */
50787 static tree
50788 ix86_fn_abi_va_list (tree fndecl)
50790 if (!TARGET_64BIT)
50791 return va_list_type_node;
50792 gcc_assert (fndecl != NULL_TREE);
50794 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50795 return ms_va_list_type_node;
50796 else
50797 return sysv_va_list_type_node;
50800 /* Returns the canonical va_list type specified by TYPE. If there
50801 is no valid TYPE provided, it returns NULL_TREE. */
50803 static tree
50804 ix86_canonical_va_list_type (tree type)
50806 if (TARGET_64BIT)
50808 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50809 return ms_va_list_type_node;
50811 if ((TREE_CODE (type) == ARRAY_TYPE
50812 && integer_zerop (array_type_nelts (type)))
50813 || POINTER_TYPE_P (type))
50815 tree elem_type = TREE_TYPE (type);
50816 if (TREE_CODE (elem_type) == RECORD_TYPE
50817 && lookup_attribute ("sysv_abi va_list",
50818 TYPE_ATTRIBUTES (elem_type)))
50819 return sysv_va_list_type_node;
50822 return NULL_TREE;
50825 return std_canonical_va_list_type (type);
50828 /* Iterate through the target-specific builtin types for va_list.
50829 IDX denotes the iterator, *PTREE is set to the result type of
50830 the va_list builtin, and *PNAME to its internal type.
50831 Returns zero if there is no element for this index, otherwise
50832 IDX should be increased upon the next call.
50833 Note, do not iterate a base builtin's name like __builtin_va_list.
50834 Used from c_common_nodes_and_builtins. */
50836 static int
50837 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50839 if (TARGET_64BIT)
50841 switch (idx)
50843 default:
50844 break;
50846 case 0:
50847 *ptree = ms_va_list_type_node;
50848 *pname = "__builtin_ms_va_list";
50849 return 1;
50851 case 1:
50852 *ptree = sysv_va_list_type_node;
50853 *pname = "__builtin_sysv_va_list";
50854 return 1;
50858 return 0;
50861 #undef TARGET_SCHED_DISPATCH
50862 #define TARGET_SCHED_DISPATCH has_dispatch
50863 #undef TARGET_SCHED_DISPATCH_DO
50864 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50865 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50866 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50867 #undef TARGET_SCHED_REORDER
50868 #define TARGET_SCHED_REORDER ix86_sched_reorder
50869 #undef TARGET_SCHED_ADJUST_PRIORITY
50870 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50871 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50872 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50873 ix86_dependencies_evaluation_hook
50875 /* The size of the dispatch window is the total number of bytes of
50876 object code allowed in a window. */
50877 #define DISPATCH_WINDOW_SIZE 16
50879 /* Number of dispatch windows considered for scheduling. */
50880 #define MAX_DISPATCH_WINDOWS 3
50882 /* Maximum number of instructions in a window. */
50883 #define MAX_INSN 4
50885 /* Maximum number of immediate operands in a window. */
50886 #define MAX_IMM 4
50888 /* Maximum number of immediate bits allowed in a window. */
50889 #define MAX_IMM_SIZE 128
50891 /* Maximum number of 32 bit immediates allowed in a window. */
50892 #define MAX_IMM_32 4
50894 /* Maximum number of 64 bit immediates allowed in a window. */
50895 #define MAX_IMM_64 2
50897 /* Maximum total of loads or prefetches allowed in a window. */
50898 #define MAX_LOAD 2
50900 /* Maximum total of stores allowed in a window. */
50901 #define MAX_STORE 1
50903 #undef BIG
50904 #define BIG 100
50907 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
50908 enum dispatch_group {
50909 disp_no_group = 0,
50910 disp_load,
50911 disp_store,
50912 disp_load_store,
50913 disp_prefetch,
50914 disp_imm,
50915 disp_imm_32,
50916 disp_imm_64,
50917 disp_branch,
50918 disp_cmp,
50919 disp_jcc,
50920 disp_last
50923 /* Number of allowable groups in a dispatch window. It is an array
50924 indexed by dispatch_group enum. 100 is used as a big number,
50925 because the number of these kinds of operations does not have any
50926 effect on the dispatch window, but we need them for other reasons in
50927 the table. */
50928 static unsigned int num_allowable_groups[disp_last] = {
50929 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
50932 char group_name[disp_last + 1][16] = {
50933 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
50934 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
50935 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
50938 /* Instruction path. */
50939 enum insn_path {
50940 no_path = 0,
50941 path_single, /* Single micro op. */
50942 path_double, /* Double micro op. */
50943 path_multi, /* Instructions with more than 2 micro ops. */
50944 last_path
50947 /* sched_insn_info defines a window to the instructions scheduled in
50948 the basic block. It contains a pointer to the insn_info table and
50949 the instruction scheduled.
50951 Windows are allocated for each basic block and are linked
50952 together. */
50953 typedef struct sched_insn_info_s {
50954 rtx insn;
50955 enum dispatch_group group;
50956 enum insn_path path;
50957 int byte_len;
50958 int imm_bytes;
50959 } sched_insn_info;
50961 /* Linked list of dispatch windows. This is a two way list of
50962 dispatch windows of a basic block. It contains information about
50963 the number of uops in the window and the total number of
50964 instructions and of bytes in the object code for this dispatch
50965 window. */
50966 typedef struct dispatch_windows_s {
50967 int num_insn; /* Number of insn in the window. */
50968 int num_uops; /* Number of uops in the window. */
50969 int window_size; /* Number of bytes in the window. */
50970 int window_num; /* Window number, either 0 or 1. */
50971 int num_imm; /* Number of immediates in an insn. */
50972 int num_imm_32; /* Number of 32 bit immediates in an insn. */
50973 int num_imm_64; /* Number of 64 bit immediates in an insn. */
50974 int imm_size; /* Total immediates in the window. */
50975 int num_loads; /* Total memory loads in the window. */
50976 int num_stores; /* Total memory stores in the window. */
50977 int violation; /* Violation exists in window. */
50978 sched_insn_info *window; /* Pointer to the window. */
50979 struct dispatch_windows_s *next;
50980 struct dispatch_windows_s *prev;
50981 } dispatch_windows;
50983 /* Immediate values used in an insn. */
50984 typedef struct imm_info_s
50986 int imm;
50987 int imm32;
50988 int imm64;
50989 } imm_info;
50991 static dispatch_windows *dispatch_window_list;
50992 static dispatch_windows *dispatch_window_list1;
50994 /* Get dispatch group of insn. */
50996 static enum dispatch_group
50997 get_mem_group (rtx_insn *insn)
50999 enum attr_memory memory;
51001 if (INSN_CODE (insn) < 0)
51002 return disp_no_group;
51003 memory = get_attr_memory (insn);
51004 if (memory == MEMORY_STORE)
51005 return disp_store;
51007 if (memory == MEMORY_LOAD)
51008 return disp_load;
51010 if (memory == MEMORY_BOTH)
51011 return disp_load_store;
51013 return disp_no_group;
51016 /* Return true if insn is a compare instruction. */
51018 static bool
51019 is_cmp (rtx_insn *insn)
51021 enum attr_type type;
51023 type = get_attr_type (insn);
51024 return (type == TYPE_TEST
51025 || type == TYPE_ICMP
51026 || type == TYPE_FCMP
51027 || GET_CODE (PATTERN (insn)) == COMPARE);
51030 /* Return true if a dispatch violation was encountered. */
51032 static bool
51033 dispatch_violation (void)
51035 if (dispatch_window_list->next)
51036 return dispatch_window_list->next->violation;
51037 return dispatch_window_list->violation;
51040 /* Return true if insn is a branch instruction. */
51042 static bool
51043 is_branch (rtx_insn *insn)
51045 return (CALL_P (insn) || JUMP_P (insn));
51048 /* Return true if insn is a prefetch instruction. */
51050 static bool
51051 is_prefetch (rtx_insn *insn)
51053 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
51056 /* This function initializes a dispatch window and the list container holding a
51057 pointer to the window. */
51059 static void
51060 init_window (int window_num)
51062 int i;
51063 dispatch_windows *new_list;
51065 if (window_num == 0)
51066 new_list = dispatch_window_list;
51067 else
51068 new_list = dispatch_window_list1;
51070 new_list->num_insn = 0;
51071 new_list->num_uops = 0;
51072 new_list->window_size = 0;
51073 new_list->next = NULL;
51074 new_list->prev = NULL;
51075 new_list->window_num = window_num;
51076 new_list->num_imm = 0;
51077 new_list->num_imm_32 = 0;
51078 new_list->num_imm_64 = 0;
51079 new_list->imm_size = 0;
51080 new_list->num_loads = 0;
51081 new_list->num_stores = 0;
51082 new_list->violation = false;
51084 for (i = 0; i < MAX_INSN; i++)
51086 new_list->window[i].insn = NULL;
51087 new_list->window[i].group = disp_no_group;
51088 new_list->window[i].path = no_path;
51089 new_list->window[i].byte_len = 0;
51090 new_list->window[i].imm_bytes = 0;
51092 return;
51095 /* This function allocates and initializes a dispatch window and the
51096 list container holding a pointer to the window. */
51098 static dispatch_windows *
51099 allocate_window (void)
51101 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
51102 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
51104 return new_list;
51107 /* This routine initializes the dispatch scheduling information. It
51108 initiates building dispatch scheduler tables and constructs the
51109 first dispatch window. */
51111 static void
51112 init_dispatch_sched (void)
51114 /* Allocate a dispatch list and a window. */
51115 dispatch_window_list = allocate_window ();
51116 dispatch_window_list1 = allocate_window ();
51117 init_window (0);
51118 init_window (1);
51121 /* This function returns true if a branch is detected. End of a basic block
51122 does not have to be a branch, but here we assume only branches end a
51123 window. */
51125 static bool
51126 is_end_basic_block (enum dispatch_group group)
51128 return group == disp_branch;
51131 /* This function is called when the end of a window processing is reached. */
51133 static void
51134 process_end_window (void)
51136 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
51137 if (dispatch_window_list->next)
51139 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
51140 gcc_assert (dispatch_window_list->window_size
51141 + dispatch_window_list1->window_size <= 48);
51142 init_window (1);
51144 init_window (0);
51147 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
51148 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
51149 for 48 bytes of instructions. Note that these windows are not dispatch
51150 windows whose sizes are DISPATCH_WINDOW_SIZE. */
51152 static dispatch_windows *
51153 allocate_next_window (int window_num)
51155 if (window_num == 0)
51157 if (dispatch_window_list->next)
51158 init_window (1);
51159 init_window (0);
51160 return dispatch_window_list;
51163 dispatch_window_list->next = dispatch_window_list1;
51164 dispatch_window_list1->prev = dispatch_window_list;
51166 return dispatch_window_list1;
51169 /* Compute number of immediate operands of an instruction. */
51171 static void
51172 find_constant (rtx in_rtx, imm_info *imm_values)
51174 if (INSN_P (in_rtx))
51175 in_rtx = PATTERN (in_rtx);
51176 subrtx_iterator::array_type array;
51177 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
51178 if (const_rtx x = *iter)
51179 switch (GET_CODE (x))
51181 case CONST:
51182 case SYMBOL_REF:
51183 case CONST_INT:
51184 (imm_values->imm)++;
51185 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
51186 (imm_values->imm32)++;
51187 else
51188 (imm_values->imm64)++;
51189 break;
51191 case CONST_DOUBLE:
51192 case CONST_WIDE_INT:
51193 (imm_values->imm)++;
51194 (imm_values->imm64)++;
51195 break;
51197 case CODE_LABEL:
51198 if (LABEL_KIND (x) == LABEL_NORMAL)
51200 (imm_values->imm)++;
51201 (imm_values->imm32)++;
51203 break;
51205 default:
51206 break;
51210 /* Return total size of immediate operands of an instruction along with number
51211 of corresponding immediate operands. It initializes its parameters to zero
51212 before calling FIND_CONSTANT.
51213 INSN is the input instruction. IMM is the total of immediates.
51214 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
51215 bit immediates. */
51217 static int
51218 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
51220 imm_info imm_values = {0, 0, 0};
51222 find_constant (insn, &imm_values);
51223 *imm = imm_values.imm;
51224 *imm32 = imm_values.imm32;
51225 *imm64 = imm_values.imm64;
51226 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
51229 /* This function indicates whether an instruction has any immediate
51230 operand. */
51232 static bool
51233 has_immediate (rtx_insn *insn)
51235 int num_imm_operand;
51236 int num_imm32_operand;
51237 int num_imm64_operand;
51239 if (insn)
51240 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51241 &num_imm64_operand);
51242 return false;
51245 /* Return single or double path for instructions. */
51247 static enum insn_path
51248 get_insn_path (rtx_insn *insn)
51250 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
51252 if ((int)path == 0)
51253 return path_single;
51255 if ((int)path == 1)
51256 return path_double;
51258 return path_multi;
51261 /* Return insn dispatch group. */
51263 static enum dispatch_group
51264 get_insn_group (rtx_insn *insn)
51266 enum dispatch_group group = get_mem_group (insn);
51267 if (group)
51268 return group;
51270 if (is_branch (insn))
51271 return disp_branch;
51273 if (is_cmp (insn))
51274 return disp_cmp;
51276 if (has_immediate (insn))
51277 return disp_imm;
51279 if (is_prefetch (insn))
51280 return disp_prefetch;
51282 return disp_no_group;
51285 /* Count the number of GROUP-restricted instructions in dispatch
51286 window WINDOW_LIST. */
51288 static int
51289 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
51291 enum dispatch_group group = get_insn_group (insn);
51292 int imm_size;
51293 int num_imm_operand;
51294 int num_imm32_operand;
51295 int num_imm64_operand;
51297 if (group == disp_no_group)
51298 return 0;
51300 if (group == disp_imm)
51302 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51303 &num_imm64_operand);
51304 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
51305 || num_imm_operand + window_list->num_imm > MAX_IMM
51306 || (num_imm32_operand > 0
51307 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
51308 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
51309 || (num_imm64_operand > 0
51310 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
51311 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
51312 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
51313 && num_imm64_operand > 0
51314 && ((window_list->num_imm_64 > 0
51315 && window_list->num_insn >= 2)
51316 || window_list->num_insn >= 3)))
51317 return BIG;
51319 return 1;
51322 if ((group == disp_load_store
51323 && (window_list->num_loads >= MAX_LOAD
51324 || window_list->num_stores >= MAX_STORE))
51325 || ((group == disp_load
51326 || group == disp_prefetch)
51327 && window_list->num_loads >= MAX_LOAD)
51328 || (group == disp_store
51329 && window_list->num_stores >= MAX_STORE))
51330 return BIG;
51332 return 1;
51335 /* This function returns true if INSN satisfies the dispatch rules in the
51336 last scheduled window. */
51338 static bool
51339 fits_dispatch_window (rtx_insn *insn)
51341 dispatch_windows *window_list = dispatch_window_list;
51342 dispatch_windows *window_list_next = dispatch_window_list->next;
51343 unsigned int num_restrict;
51344 enum dispatch_group group = get_insn_group (insn);
51345 enum insn_path path = get_insn_path (insn);
51346 int sum;
51348 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
51349 instructions should be given the lowest priority in the
51350 scheduling process in the Haifa scheduler to make sure they will be
51351 scheduled in the same dispatch window as the instructions that reference them. */
51352 if (group == disp_jcc || group == disp_cmp)
51353 return false;
51355 /* Check nonrestricted. */
51356 if (group == disp_no_group || group == disp_branch)
51357 return true;
51359 /* Get last dispatch window. */
51360 if (window_list_next)
51361 window_list = window_list_next;
51363 if (window_list->window_num == 1)
51365 sum = window_list->prev->window_size + window_list->window_size;
51367 if (sum == 32
51368 || (min_insn_size (insn) + sum) >= 48)
51369 /* Window 1 is full. Go for next window. */
51370 return true;
51373 num_restrict = count_num_restricted (insn, window_list);
51375 if (num_restrict > num_allowable_groups[group])
51376 return false;
51378 /* See if it fits in the first window. */
51379 if (window_list->window_num == 0)
51381 /* The first window should have only single- and double-path
51382 uops. */
51383 if (path == path_double
51384 && (window_list->num_uops + 2) > MAX_INSN)
51385 return false;
51386 else if (path != path_single)
51387 return false;
51389 return true;
51392 /* Add an instruction INSN with NUM_UOPS micro-operations to the
51393 dispatch window WINDOW_LIST. */
51395 static void
51396 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
51398 int byte_len = min_insn_size (insn);
51399 int num_insn = window_list->num_insn;
51400 int imm_size;
51401 sched_insn_info *window = window_list->window;
51402 enum dispatch_group group = get_insn_group (insn);
51403 enum insn_path path = get_insn_path (insn);
51404 int num_imm_operand;
51405 int num_imm32_operand;
51406 int num_imm64_operand;
51408 if (!window_list->violation && group != disp_cmp
51409 && !fits_dispatch_window (insn))
51410 window_list->violation = true;
51412 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51413 &num_imm64_operand);
51415 /* Initialize window with new instruction. */
51416 window[num_insn].insn = insn;
51417 window[num_insn].byte_len = byte_len;
51418 window[num_insn].group = group;
51419 window[num_insn].path = path;
51420 window[num_insn].imm_bytes = imm_size;
51422 window_list->window_size += byte_len;
51423 window_list->num_insn = num_insn + 1;
51424 window_list->num_uops = window_list->num_uops + num_uops;
51425 window_list->imm_size += imm_size;
51426 window_list->num_imm += num_imm_operand;
51427 window_list->num_imm_32 += num_imm32_operand;
51428 window_list->num_imm_64 += num_imm64_operand;
51430 if (group == disp_store)
51431 window_list->num_stores += 1;
51432 else if (group == disp_load
51433 || group == disp_prefetch)
51434 window_list->num_loads += 1;
51435 else if (group == disp_load_store)
51437 window_list->num_stores += 1;
51438 window_list->num_loads += 1;
51442 /* Adds a scheduled instruction, INSN, to the current dispatch window.
51443 If the total bytes of instructions or the number of instructions in
51444 the window exceeds the allowable limit, it allocates a new window. */
51446 static void
51447 add_to_dispatch_window (rtx_insn *insn)
51449 int byte_len;
51450 dispatch_windows *window_list;
51451 dispatch_windows *next_list;
51452 dispatch_windows *window0_list;
51453 enum insn_path path;
51454 enum dispatch_group insn_group;
51455 bool insn_fits;
51456 int num_insn;
51457 int num_uops;
51458 int window_num;
51459 int insn_num_uops;
51460 int sum;
51462 if (INSN_CODE (insn) < 0)
51463 return;
51465 byte_len = min_insn_size (insn);
51466 window_list = dispatch_window_list;
51467 next_list = window_list->next;
51468 path = get_insn_path (insn);
51469 insn_group = get_insn_group (insn);
51471 /* Get the last dispatch window. */
51472 if (next_list)
51473 window_list = dispatch_window_list->next;
51475 if (path == path_single)
51476 insn_num_uops = 1;
51477 else if (path == path_double)
51478 insn_num_uops = 2;
51479 else
51480 insn_num_uops = (int) path;
51482 /* If the current window is full, get a new window.
51483 Window number zero is full if MAX_INSN uops are scheduled in it.
51484 Window number one is full if window zero's bytes plus window
51485 one's bytes equal 32, or if the bytes of the new instruction added
51486 to the total make it greater than 48, or if it already has MAX_INSN
51487 instructions in it. */
51488 num_insn = window_list->num_insn;
51489 num_uops = window_list->num_uops;
51490 window_num = window_list->window_num;
51491 insn_fits = fits_dispatch_window (insn);
51493 if (num_insn >= MAX_INSN
51494 || num_uops + insn_num_uops > MAX_INSN
51495 || !(insn_fits))
51497 window_num = ~window_num & 1;
51498 window_list = allocate_next_window (window_num);
51501 if (window_num == 0)
51503 add_insn_window (insn, window_list, insn_num_uops);
51504 if (window_list->num_insn >= MAX_INSN
51505 && insn_group == disp_branch)
51507 process_end_window ();
51508 return;
51511 else if (window_num == 1)
51513 window0_list = window_list->prev;
51514 sum = window0_list->window_size + window_list->window_size;
51515 if (sum == 32
51516 || (byte_len + sum) >= 48)
51518 process_end_window ();
51519 window_list = dispatch_window_list;
51522 add_insn_window (insn, window_list, insn_num_uops);
51524 else
51525 gcc_unreachable ();
51527 if (is_end_basic_block (insn_group))
51529 /* The end of the basic block is reached; do end-basic-block processing. */
51530 process_end_window ();
51531 return;
51535 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51537 DEBUG_FUNCTION static void
51538 debug_dispatch_window_file (FILE *file, int window_num)
51540 dispatch_windows *list;
51541 int i;
51543 if (window_num == 0)
51544 list = dispatch_window_list;
51545 else
51546 list = dispatch_window_list1;
51548 fprintf (file, "Window #%d:\n", list->window_num);
51549 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51550 list->num_insn, list->num_uops, list->window_size);
51551 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51552 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51554 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51555 list->num_stores);
51556 fprintf (file, " insn info:\n");
51558 for (i = 0; i < MAX_INSN; i++)
51560 if (!list->window[i].insn)
51561 break;
51562 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51563 i, group_name[list->window[i].group],
51564 i, (void *)list->window[i].insn,
51565 i, list->window[i].path,
51566 i, list->window[i].byte_len,
51567 i, list->window[i].imm_bytes);
51571 /* Print to stdout a dispatch window. */
51573 DEBUG_FUNCTION void
51574 debug_dispatch_window (int window_num)
51576 debug_dispatch_window_file (stdout, window_num);
51579 /* Print INSN dispatch information to FILE. */
51581 DEBUG_FUNCTION static void
51582 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51584 int byte_len;
51585 enum insn_path path;
51586 enum dispatch_group group;
51587 int imm_size;
51588 int num_imm_operand;
51589 int num_imm32_operand;
51590 int num_imm64_operand;
51592 if (INSN_CODE (insn) < 0)
51593 return;
51595 byte_len = min_insn_size (insn);
51596 path = get_insn_path (insn);
51597 group = get_insn_group (insn);
51598 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51599 &num_imm64_operand);
51601 fprintf (file, " insn info:\n");
51602 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51603 group_name[group], path, byte_len);
51604 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51605 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51608 /* Print to stdout the status of the ready list with respect to
51609 dispatch windows. */
51611 DEBUG_FUNCTION void
51612 debug_ready_dispatch (void)
51614 int i;
51615 int no_ready = number_in_ready ();
51617 fprintf (stdout, "Number of ready: %d\n", no_ready);
51619 for (i = 0; i < no_ready; i++)
51620 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51623 /* This routine is the driver of the dispatch scheduler. */
51625 static void
51626 do_dispatch (rtx_insn *insn, int mode)
51628 if (mode == DISPATCH_INIT)
51629 init_dispatch_sched ();
51630 else if (mode == ADD_TO_DISPATCH_WINDOW)
51631 add_to_dispatch_window (insn);
51634 /* Return TRUE if Dispatch Scheduling is supported. */
51636 static bool
51637 has_dispatch (rtx_insn *insn, int action)
51639 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51640 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51641 switch (action)
51643 default:
51644 return false;
51646 case IS_DISPATCH_ON:
51647 return true;
51649 case IS_CMP:
51650 return is_cmp (insn);
51652 case DISPATCH_VIOLATION:
51653 return dispatch_violation ();
51655 case FITS_DISPATCH_WINDOW:
51656 return fits_dispatch_window (insn);
51659 return false;
51662 /* Implementation of the reassociation_width target hook used by
51663 the reassoc phase to identify the parallelism level in a reassociated
51664 tree. The statement's tree_code is passed in OPC. The arguments' type
51665 is passed in MODE.
51667 Currently parallel reassociation is enabled for Atom
51668 processors only and we set reassociation width to be 2
51669 because Atom may issue up to 2 instructions per cycle.
51671 Return value should be fixed if parallel reassociation is
51672 enabled for other processors. */
51674 static int
51675 ix86_reassociation_width (unsigned int, machine_mode mode)
51677 /* Vector part. */
51678 if (VECTOR_MODE_P (mode))
51680 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51681 return 2;
51682 else
51683 return 1;
51686 /* Scalar part. */
51687 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51688 return 2;
51689 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51690 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
51691 else
51692 return 1;
51695 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51696 place emms and femms instructions. */
51698 static machine_mode
51699 ix86_preferred_simd_mode (scalar_mode mode)
51701 if (!TARGET_SSE)
51702 return word_mode;
51704 switch (mode)
51706 case E_QImode:
51707 return TARGET_AVX512BW ? V64QImode :
51708 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51709 case E_HImode:
51710 return TARGET_AVX512BW ? V32HImode :
51711 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51712 case E_SImode:
51713 return TARGET_AVX512F ? V16SImode :
51714 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51715 case E_DImode:
51716 return TARGET_AVX512F ? V8DImode :
51717 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51719 case E_SFmode:
51720 if (TARGET_AVX512F)
51721 return V16SFmode;
51722 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51723 return V8SFmode;
51724 else
51725 return V4SFmode;
51727 case E_DFmode:
51728 if (TARGET_AVX512F)
51729 return V8DFmode;
51730 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51731 return V4DFmode;
51732 else if (TARGET_SSE2)
51733 return V2DFmode;
51734 /* FALLTHRU */
51736 default:
51737 return word_mode;
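/* Illustrative summary of the cases above (not part of the original
   source): for SImode elements this returns V16SImode with AVX-512F,
   V8SImode with AVX when 256-bit vectors are preferred, and V4SImode
   otherwise; without SSE the scalar word_mode is returned.  */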
51741 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
51742 vectors. If AVX512F is enabled then try vectorizing with 512bit,
51743 256bit and 128bit vectors. */
51745 static unsigned int
51746 ix86_autovectorize_vector_sizes (void)
51748 return TARGET_AVX512F ? 64 | 32 | 16 :
51749 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
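/* The returned value is a bitmask of candidate vector sizes in bytes;
   e.g. with AVX-512F it is 64 | 32 | 16, so the vectorizer may retry a
   loop with 512-, 256- and 128-bit vectors.  A value of 0 leaves only
   the preferred SIMD mode to be tried.  (Illustrative note, not part of
   the original source.)  */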
51752 /* Implementation of targetm.vectorize.get_mask_mode. */
51754 static opt_machine_mode
51755 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51757 unsigned elem_size = vector_size / nunits;
51759 /* Scalar mask case. */
51760 if ((TARGET_AVX512F && vector_size == 64)
51761 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51763 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51764 return smallest_int_mode_for_size (nunits);
51767 scalar_int_mode elem_mode
51768 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
51770 gcc_assert (elem_size * nunits == vector_size);
51772 return mode_for_vector (elem_mode, nunits);
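/* Illustrative example of the logic above (not part of the original
   source): for a 32-byte vector of 8 SImode elements, the scalar-mask
   branch taken with AVX-512VL picks the smallest integer mode holding
   8 bits (QImode); without AVX-512 scalar masks a vector mask mode with
   8 4-byte elements (V8SImode) is returned instead.  */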
51777 /* Return class of registers which could be used for pseudo of MODE
51778 and of class RCLASS for spilling instead of memory. Return NO_REGS
51779 if it is not possible or non-profitable. */
51781 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51783 static reg_class_t
51784 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51786 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51787 && TARGET_SSE2
51788 && TARGET_INTER_UNIT_MOVES_TO_VEC
51789 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51790 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51791 && INTEGER_CLASS_P (rclass))
51792 return ALL_SSE_REGS;
51793 return NO_REGS;
51796 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51797 but returns a lower bound. */
51799 static unsigned int
51800 ix86_max_noce_ifcvt_seq_cost (edge e)
51802 bool predictable_p = predictable_edge_p (e);
51804 enum compiler_param param
51805 = (predictable_p
51806 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51807 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51809 /* If we have a parameter set, use that, otherwise take a guess using
51810 BRANCH_COST. */
51811 if (global_options_set.x_param_values[param])
51812 return PARAM_VALUE (param);
51813 else
51814 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
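/* Illustrative note (not part of the original source): the two parameters
   above correspond to --param max-rtl-if-conversion-predictable-cost and
   --param max-rtl-if-conversion-unpredictable-cost; when neither is set
   explicitly, the bound falls back to BRANCH_COST scaled by
   COSTS_N_INSNS (2).  */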
51817 /* Return true if SEQ is a good candidate as a replacement for the
51818 if-convertible sequence described in IF_INFO. */
51820 static bool
51821 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51823 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51825 int cmov_cnt = 0;
51826 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51827 Maybe we should allow even more conditional moves as long as they
51828 are used far enough not to stall the CPU, or also consider
51829 IF_INFO->TEST_BB succ edge probabilities. */
51830 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51832 rtx set = single_set (insn);
51833 if (!set)
51834 continue;
51835 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51836 continue;
51837 rtx src = SET_SRC (set);
51838 machine_mode mode = GET_MODE (src);
51839 if (GET_MODE_CLASS (mode) != MODE_INT
51840 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51841 continue;
51842 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51843 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51844 continue;
51845 /* insn is CMOV or FCMOV. */
51846 if (++cmov_cnt > 1)
51847 return false;
51850 return default_noce_conversion_profitable_p (seq, if_info);
51853 /* Implement targetm.vectorize.init_cost. */
51855 static void *
51856 ix86_init_cost (struct loop *)
51858 unsigned *cost = XNEWVEC (unsigned, 3);
51859 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51860 return cost;
51863 /* Implement targetm.vectorize.add_stmt_cost. */
51865 static unsigned
51866 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51867 struct _stmt_vec_info *stmt_info, int misalign,
51868 enum vect_cost_model_location where)
51870 unsigned *cost = (unsigned *) data;
51871 unsigned retval = 0;
51873 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51874 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
51876 /* Penalize DFmode vector operations for Bonnell. */
51877 if (TARGET_BONNELL && kind == vector_stmt
51878 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
51879 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
51881 /* Statements in an inner loop relative to the loop being
51882 vectorized are weighted more heavily. The value here is
51883 arbitrary and could potentially be improved with analysis. */
51884 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
51885 count *= 50; /* FIXME. */
51887 retval = (unsigned) (count * stmt_cost);
51889 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
51890 for Silvermont as it has an out-of-order integer pipeline and can execute
51891 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
51892 if ((TARGET_SILVERMONT || TARGET_INTEL)
51893 && stmt_info && stmt_info->stmt)
51895 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
51896 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
51897 retval = (retval * 17) / 10;
51900 cost[where] += retval;
51902 return retval;
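/* Worked example of the Silvermont adjustment above (illustrative, not
   part of the original source): a vector statement with COUNT == 1 and a
   base stmt_cost of 10 that produces an integer result is accounted as
   (10 * 17) / 10 == 17, i.e. roughly the 1.7x factor mentioned above.  */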
51905 /* Implement targetm.vectorize.finish_cost. */
51907 static void
51908 ix86_finish_cost (void *data, unsigned *prologue_cost,
51909 unsigned *body_cost, unsigned *epilogue_cost)
51911 unsigned *cost = (unsigned *) data;
51912 *prologue_cost = cost[vect_prologue];
51913 *body_cost = cost[vect_body];
51914 *epilogue_cost = cost[vect_epilogue];
51917 /* Implement targetm.vectorize.destroy_cost_data. */
51919 static void
51920 ix86_destroy_cost_data (void *data)
51922 free (data);
51925 /* Validate target specific memory model bits in VAL. */
51927 static unsigned HOST_WIDE_INT
51928 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
51930 enum memmodel model = memmodel_from_int (val);
51931 bool strong;
51933 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
51934 |MEMMODEL_MASK)
51935 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
51937 warning (OPT_Winvalid_memory_model,
51938 "Unknown architecture specific memory model");
51939 return MEMMODEL_SEQ_CST;
51941 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
51942 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
51944 warning (OPT_Winvalid_memory_model,
51945 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
51946 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
51948 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
51950 warning (OPT_Winvalid_memory_model,
51951 "HLE_RELEASE not used with RELEASE or stronger memory model");
51952 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
51954 return val;
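/* Illustrative sketch of how this hook is reached (assumption, not part
   of the original source): user code such as
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
   passes a VAL with IX86_HLE_RELEASE set on top of the release memory
   model; pairing an HLE bit with a weaker model is diagnosed above and
   falls back to MEMMODEL_SEQ_CST plus the HLE bit.  */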
51957 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
51958 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
51959 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
51960 or number of vecsize_mangle variants that should be emitted. */
51962 static int
51963 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
51964 struct cgraph_simd_clone *clonei,
51965 tree base_type, int num)
51967 int ret = 1;
51969 if (clonei->simdlen
51970 && (clonei->simdlen < 2
51971 || clonei->simdlen > 1024
51972 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
51974 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51975 "unsupported simdlen %d", clonei->simdlen);
51976 return 0;
51979 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
51980 if (TREE_CODE (ret_type) != VOID_TYPE)
51981 switch (TYPE_MODE (ret_type))
51983 case E_QImode:
51984 case E_HImode:
51985 case E_SImode:
51986 case E_DImode:
51987 case E_SFmode:
51988 case E_DFmode:
51989 /* case E_SCmode: */
51990 /* case E_DCmode: */
51991 break;
51992 default:
51993 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51994 "unsupported return type %qT for simd\n", ret_type);
51995 return 0;
51998 tree t;
51999 int i;
52001 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
52002 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
52003 switch (TYPE_MODE (TREE_TYPE (t)))
52005 case E_QImode:
52006 case E_HImode:
52007 case E_SImode:
52008 case E_DImode:
52009 case E_SFmode:
52010 case E_DFmode:
52011 /* case E_SCmode: */
52012 /* case E_DCmode: */
52013 break;
52014 default:
52015 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
52016 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
52017 return 0;
52020 if (clonei->cilk_elemental)
52022 /* Parse the processor clause here. If not present, default to 'b'. */
52023 clonei->vecsize_mangle = 'b';
52025 else if (!TREE_PUBLIC (node->decl))
52027 /* If the function isn't exported, we can pick up just one ISA
52028 for the clones. */
52029 if (TARGET_AVX512F)
52030 clonei->vecsize_mangle = 'e';
52031 else if (TARGET_AVX2)
52032 clonei->vecsize_mangle = 'd';
52033 else if (TARGET_AVX)
52034 clonei->vecsize_mangle = 'c';
52035 else
52036 clonei->vecsize_mangle = 'b';
52037 ret = 1;
52039 else
52041 clonei->vecsize_mangle = "bcde"[num];
52042 ret = 4;
52044 clonei->mask_mode = VOIDmode;
52045 switch (clonei->vecsize_mangle)
52047 case 'b':
52048 clonei->vecsize_int = 128;
52049 clonei->vecsize_float = 128;
52050 break;
52051 case 'c':
52052 clonei->vecsize_int = 128;
52053 clonei->vecsize_float = 256;
52054 break;
52055 case 'd':
52056 clonei->vecsize_int = 256;
52057 clonei->vecsize_float = 256;
52058 break;
52059 case 'e':
52060 clonei->vecsize_int = 512;
52061 clonei->vecsize_float = 512;
52062 if (TYPE_MODE (base_type) == QImode)
52063 clonei->mask_mode = DImode;
52064 else
52065 clonei->mask_mode = SImode;
52066 break;
52068 if (clonei->simdlen == 0)
52070 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
52071 clonei->simdlen = clonei->vecsize_int;
52072 else
52073 clonei->simdlen = clonei->vecsize_float;
52074 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
52076 else if (clonei->simdlen > 16)
52078 /* For compatibility with ICC, use the same upper bounds
52079 for simdlen. In particular, for CTYPE below, use the return type,
52080 unless the function returns void, in which case use the characteristic
52081 type. If it is possible for the given SIMDLEN to pass the CTYPE value
52082 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
52083 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
52084 emit the corresponding clone. */
52085 tree ctype = ret_type;
52086 if (TREE_CODE (ret_type) == VOID_TYPE)
52087 ctype = base_type;
52088 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
52089 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
52090 cnt /= clonei->vecsize_int;
52091 else
52092 cnt /= clonei->vecsize_float;
52093 if (cnt > (TARGET_64BIT ? 16 : 8))
52095 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
52096 "unsupported simdlen %d", clonei->simdlen);
52097 return 0;
52100 return ret;
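/* Illustrative sketch (assumption, not part of the original source): for
   an exported function declared as
     #pragma omp declare simd
     float f (float x);
   all four vecsize_mangle variants 'b'..'e' are requested (ret == 4), and
   for the 'b' (128-bit) variant the default simdlen is
   vecsize_float / GET_MODE_BITSIZE (SFmode) == 128 / 32 == 4.  */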
52103 /* Add target attribute to SIMD clone NODE if needed. */
52105 static void
52106 ix86_simd_clone_adjust (struct cgraph_node *node)
52108 const char *str = NULL;
52109 gcc_assert (node->decl == cfun->decl);
52110 switch (node->simdclone->vecsize_mangle)
52112 case 'b':
52113 if (!TARGET_SSE2)
52114 str = "sse2";
52115 break;
52116 case 'c':
52117 if (!TARGET_AVX)
52118 str = "avx";
52119 break;
52120 case 'd':
52121 if (!TARGET_AVX2)
52122 str = "avx2";
52123 break;
52124 case 'e':
52125 if (!TARGET_AVX512F)
52126 str = "avx512f";
52127 break;
52128 default:
52129 gcc_unreachable ();
52131 if (str == NULL)
52132 return;
52133 push_cfun (NULL);
52134 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
52135 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
52136 gcc_assert (ok);
52137 pop_cfun ();
52138 ix86_reset_previous_fndecl ();
52139 ix86_set_current_function (node->decl);
52142 /* If SIMD clone NODE can't be used in a vectorized loop
52143 in the current function, return -1; otherwise return the badness of using it
52144 (0 if it is most desirable from the vecsize_mangle point of view, 1 if
52145 slightly less desirable, etc.). */
52147 static int
52148 ix86_simd_clone_usable (struct cgraph_node *node)
52150 switch (node->simdclone->vecsize_mangle)
52152 case 'b':
52153 if (!TARGET_SSE2)
52154 return -1;
52155 if (!TARGET_AVX)
52156 return 0;
52157 return TARGET_AVX2 ? 2 : 1;
52158 case 'c':
52159 if (!TARGET_AVX)
52160 return -1;
52161 return TARGET_AVX2 ? 1 : 0;
52162 case 'd':
52163 if (!TARGET_AVX2)
52164 return -1;
52165 return 0;
52166 case 'e':
52167 if (!TARGET_AVX512F)
52168 return -1;
52169 return 0;
52170 default:
52171 gcc_unreachable ();
52175 /* This function adjusts the unroll factor based on
52176 the hardware capabilities. For example, bdver3 has
52177 a loop buffer which makes unrolling of smaller
52178 loops less important. This function decides the
52179 unroll factor using the number of memory references
52180 (the value 32 is used) as a heuristic. */
52182 static unsigned
52183 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
52185 basic_block *bbs;
52186 rtx_insn *insn;
52187 unsigned i;
52188 unsigned mem_count = 0;
52190 if (!TARGET_ADJUST_UNROLL)
52191 return nunroll;
52193 /* Count the number of memory references within the loop body.
52194 This value determines the unrolling factor for bdver3 and bdver4
52195 architectures. */
52196 subrtx_iterator::array_type array;
52197 bbs = get_loop_body (loop);
52198 for (i = 0; i < loop->num_nodes; i++)
52199 FOR_BB_INSNS (bbs[i], insn)
52200 if (NONDEBUG_INSN_P (insn))
52201 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
52202 if (const_rtx x = *iter)
52203 if (MEM_P (x))
52205 machine_mode mode = GET_MODE (x);
52206 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
52207 if (n_words > 4)
52208 mem_count += 2;
52209 else
52210 mem_count += 1;
52212 free (bbs);
52214 if (mem_count && mem_count <= 32)
52215 return 32 / mem_count;
52217 return nunroll;
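/* Worked example of the heuristic above (illustrative, not part of the
   original source): a loop body with 8 word-sized memory references gives
   mem_count == 8, so the unroll factor becomes 32 / 8 == 4; with more
   than 32 references the caller's NUNROLL is returned unchanged.  */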
52221 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
52223 static bool
52224 ix86_float_exceptions_rounding_supported_p (void)
52226 /* For x87 floating point with standard excess precision handling,
52227 there is no adddf3 pattern (since x87 floating point only has
52228 XFmode operations) so the default hook implementation gets this
52229 wrong. */
52230 return TARGET_80387 || TARGET_SSE_MATH;
52233 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
52235 static void
52236 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
52238 if (!TARGET_80387 && !TARGET_SSE_MATH)
52239 return;
52240 tree exceptions_var = create_tmp_var_raw (integer_type_node);
52241 if (TARGET_80387)
52243 tree fenv_index_type = build_index_type (size_int (6));
52244 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
52245 tree fenv_var = create_tmp_var_raw (fenv_type);
52246 TREE_ADDRESSABLE (fenv_var) = 1;
52247 tree fenv_ptr = build_pointer_type (fenv_type);
52248 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
52249 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
52250 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
52251 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
52252 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
52253 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
52254 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
52255 tree hold_fnclex = build_call_expr (fnclex, 0);
52256 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
52257 NULL_TREE, NULL_TREE);
52258 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
52259 hold_fnclex);
52260 *clear = build_call_expr (fnclex, 0);
52261 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
52262 tree fnstsw_call = build_call_expr (fnstsw, 0);
52263 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
52264 sw_var, fnstsw_call);
52265 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
52266 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
52267 exceptions_var, exceptions_x87);
52268 *update = build2 (COMPOUND_EXPR, integer_type_node,
52269 sw_mod, update_mod);
52270 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
52271 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
52273 if (TARGET_SSE_MATH)
52275 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
52276 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
52277 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
52278 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
52279 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
52280 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
52281 mxcsr_orig_var, stmxcsr_hold_call);
52282 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
52283 mxcsr_orig_var,
52284 build_int_cst (unsigned_type_node, 0x1f80));
52285 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
52286 build_int_cst (unsigned_type_node, 0xffffffc0));
52287 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
52288 mxcsr_mod_var, hold_mod_val);
52289 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52290 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
52291 hold_assign_orig, hold_assign_mod);
52292 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
52293 ldmxcsr_hold_call);
52294 if (*hold)
52295 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
52296 else
52297 *hold = hold_all;
52298 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52299 if (*clear)
52300 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
52301 ldmxcsr_clear_call);
52302 else
52303 *clear = ldmxcsr_clear_call;
52304 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
52305 tree exceptions_sse = fold_convert (integer_type_node,
52306 stxmcsr_update_call);
52307 if (*update)
52309 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
52310 exceptions_var, exceptions_sse);
52311 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
52312 exceptions_var, exceptions_mod);
52313 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
52314 exceptions_assign);
52316 else
52317 *update = build2 (MODIFY_EXPR, integer_type_node,
52318 exceptions_var, exceptions_sse);
52319 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
52320 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52321 ldmxcsr_update_call);
52323 tree atomic_feraiseexcept
52324 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
52325 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
52326 1, exceptions_var);
52327 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52328 atomic_feraiseexcept_call);
52331 /* Return the mode to be used for bounds, or VOIDmode
52332 if bounds are not supported. */
52334 static machine_mode
52335 ix86_mpx_bound_mode ()
52337 /* Do not support pointer checker if MPX
52338 is not enabled. */
52339 if (!TARGET_MPX)
52341 if (flag_check_pointer_bounds)
52342 warning (0, "Pointer Checker requires MPX support on this target."
52343 " Use -mmpx options to enable MPX.");
52344 return VOIDmode;
52347 return BNDmode;
52350 /* Return constant used to statically initialize constant bounds.
52352 This function is used to create special bound values. For now
52353 only INIT bounds and NONE bounds are expected. More special
52354 values may be added later. */
52356 static tree
52357 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
52359 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
52360 : build_zero_cst (pointer_sized_int_node);
52361 tree high = ub ? build_zero_cst (pointer_sized_int_node)
52362 : build_minus_one_cst (pointer_sized_int_node);
52364 /* This function is supposed to be used to create INIT and
52365 NONE bounds only. */
52366 gcc_assert ((lb == 0 && ub == -1)
52367 || (lb == -1 && ub == 0));
52369 return build_complex (NULL, low, high);
52372 /* Generate a list of statements STMTS to initialize pointer bounds
52373 variable VAR with bounds LB and UB. Return the number of generated
52374 statements. */
52376 static int
52377 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
52379 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
52380 tree lhs, modify, var_p;
52382 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
52383 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
52385 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
52386 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
52387 append_to_statement_list (modify, stmts);
52389 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
52390 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
52391 TYPE_SIZE_UNIT (pointer_sized_int_node)));
52392 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
52393 append_to_statement_list (modify, stmts);
52395 return 2;
52398 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
52399 /* For i386, a common symbol is local only for non-PIE binaries. For
52400 x86-64, a common symbol is local only for non-PIE binaries or when the
52401 linker supports copy relocations in PIE binaries. */
52403 static bool
52404 ix86_binds_local_p (const_tree exp)
52406 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
52407 (!flag_pic
52408 || (TARGET_64BIT
52409 && HAVE_LD_PIE_COPYRELOC != 0)));
52411 #endif
52413 /* If MEM is in the form of [base+offset], extract the two parts
52414 of the address, store them in BASE and OFFSET, and return true; otherwise return false. */
52416 static bool
52417 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
52419 rtx addr;
52421 gcc_assert (MEM_P (mem));
52423 addr = XEXP (mem, 0);
52425 if (GET_CODE (addr) == CONST)
52426 addr = XEXP (addr, 0);
52428 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
52430 *base = addr;
52431 *offset = const0_rtx;
52432 return true;
52435 if (GET_CODE (addr) == PLUS
52436 && (REG_P (XEXP (addr, 0))
52437 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
52438 && CONST_INT_P (XEXP (addr, 1)))
52440 *base = XEXP (addr, 0);
52441 *offset = XEXP (addr, 1);
52442 return true;
52445 return false;
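/* Illustrative note (not part of the original source): the accepted
   address forms are e.g. (reg R), (symbol_ref S) and
   (plus (reg R) (const_int N)); scaled-index or otherwise more complex
   addresses make the function return false.  */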
52448 /* Given OPERANDS of consecutive load/store, check if we can merge
52449 them into move multiple. LOAD is true if they are load instructions.
52450 MODE is the mode of memory operands. */
52452 bool
52453 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
52454 machine_mode mode)
52456 HOST_WIDE_INT offval_1, offval_2, msize;
52457 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
52459 if (load)
52461 mem_1 = operands[1];
52462 mem_2 = operands[3];
52463 reg_1 = operands[0];
52464 reg_2 = operands[2];
52466 else
52468 mem_1 = operands[0];
52469 mem_2 = operands[2];
52470 reg_1 = operands[1];
52471 reg_2 = operands[3];
52474 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52476 if (REGNO (reg_1) != REGNO (reg_2))
52477 return false;
52479 /* Check if the addresses are in the form of [base+offset]. */
52480 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52481 return false;
52482 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52483 return false;
52485 /* Check if the bases are the same. */
52486 if (!rtx_equal_p (base_1, base_2))
52487 return false;
52489 offval_1 = INTVAL (offset_1);
52490 offval_2 = INTVAL (offset_2);
52491 msize = GET_MODE_SIZE (mode);
52492 /* Check if mem_1 is adjacent to mem_2 and mem_1 has the lower address. */
52493 if (offval_1 + msize != offval_2)
52494 return false;
52496 return true;
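/* Illustrative note (not part of the original source): for two DImode
   operands the check requires both register operands to name the same
   register, both addresses to share the same base, and the first memory
   operand to lie exactly GET_MODE_SIZE (DImode) == 8 bytes below the
   second.  */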
52499 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52501 static bool
52502 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52503 optimization_type opt_type)
52505 switch (op)
52507 case asin_optab:
52508 case acos_optab:
52509 case log1p_optab:
52510 case exp_optab:
52511 case exp10_optab:
52512 case exp2_optab:
52513 case expm1_optab:
52514 case ldexp_optab:
52515 case scalb_optab:
52516 case round_optab:
52517 return opt_type == OPTIMIZE_FOR_SPEED;
52519 case rint_optab:
52520 if (SSE_FLOAT_MODE_P (mode1)
52521 && TARGET_SSE_MATH
52522 && !flag_trapping_math
52523 && !TARGET_SSE4_1)
52524 return opt_type == OPTIMIZE_FOR_SPEED;
52525 return true;
52527 case floor_optab:
52528 case ceil_optab:
52529 case btrunc_optab:
52530 if (SSE_FLOAT_MODE_P (mode1)
52531 && TARGET_SSE_MATH
52532 && !flag_trapping_math
52533 && TARGET_SSE4_1)
52534 return true;
52535 return opt_type == OPTIMIZE_FOR_SPEED;
52537 case rsqrt_optab:
52538 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52540 default:
52541 return true;
52545 /* Address space support.
52547 This is not "far pointers" in the 16-bit sense, but an easy way
52548 to use %fs and %gs segment prefixes. Therefore:
52550 (a) All address spaces have the same modes,
52551 (b) All address spaces have the same address forms,
52552 (c) While %fs and %gs are technically subsets of the generic
52553 address space, they are probably not subsets of each other.
52554 (d) Since we have no access to the segment base register values
52555 without resorting to a system call, we cannot convert a
52556 non-default address space to a default address space.
52557 Therefore we do not claim %fs or %gs are subsets of generic.
52559 Therefore we can (mostly) use the default hooks. */
52561 /* All use of segmentation is assumed to make address 0 valid. */
52563 static bool
52564 ix86_addr_space_zero_address_valid (addr_space_t as)
52566 return as != ADDR_SPACE_GENERIC;
52569 static void
52570 ix86_init_libfuncs (void)
52572 if (TARGET_64BIT)
52574 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52575 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52577 else
52579 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52580 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52583 #if TARGET_MACHO
52584 darwin_rename_builtins ();
52585 #endif
52588 /* Generate a call to a divmod libfunc such as __divmoddi4. */
52590 static void
52591 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52592 rtx op0, rtx op1,
52593 rtx *quot_p, rtx *rem_p)
52595 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52597 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52598 mode,
52599 op0, GET_MODE (op0),
52600 op1, GET_MODE (op1),
52601 XEXP (rem, 0), Pmode);
52602 *quot_p = quot;
52603 *rem_p = rem;
52606 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52607 FPU, assume that the fpcw is set to extended precision; when using
52608 only SSE, rounding is correct; when using both SSE and the FPU,
52609 the rounding precision is indeterminate, since either may be chosen
52610 apparently at random. */
52612 static enum flt_eval_method
52613 ix86_excess_precision (enum excess_precision_type type)
52615 switch (type)
52617 case EXCESS_PRECISION_TYPE_FAST:
52618 /* The fastest type to promote to will always be the native type,
52619 whether that occurs with implicit excess precision or
52620 otherwise. */
52621 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52622 case EXCESS_PRECISION_TYPE_STANDARD:
52623 case EXCESS_PRECISION_TYPE_IMPLICIT:
52624 /* Otherwise, the excess precision we want when we are
52625 in a standards compliant mode, and the implicit precision we
52626 provide would be identical were it not for the unpredictable
52627 cases. */
52628 if (!TARGET_80387)
52629 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52630 else if (!TARGET_MIX_SSE_I387)
52632 if (!TARGET_SSE_MATH)
52633 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52634 else if (TARGET_SSE2)
52635 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52638 /* If we are in standards compliant mode, but we know we will
52639 calculate in unpredictable precision, return
52640 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
52641 excess precision if the target can't guarantee it will honor
52642 it. */
52643 return (type == EXCESS_PRECISION_TYPE_STANDARD
52644 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52645 : FLT_EVAL_METHOD_UNPREDICTABLE);
52646 default:
52647 gcc_unreachable ();
52650 return FLT_EVAL_METHOD_UNPREDICTABLE;
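/* Illustrative consequences of the cases above (not part of the original
   source): x87-only math yields FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE
   for both the standard and implicit cases, SSE2-only math yields
   FLT_EVAL_METHOD_PROMOTE_TO_FLOAT, and mixed i387/SSE math is treated
   as unpredictable except in the standards-compliant case.  */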
52653 /* Target-specific selftests. */
52655 #if CHECKING_P
52657 namespace selftest {
52659 /* Verify that hard regs are dumped as expected (in compact mode). */
52661 static void
52662 ix86_test_dumping_hard_regs ()
52664 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52665 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52668 /* Test dumping an insn with repeated references to the same SCRATCH,
52669 to verify the rtx_reuse code. */
52671 static void
52672 ix86_test_dumping_memory_blockage ()
52674 set_new_first_and_last_insn (NULL, NULL);
52676 rtx pat = gen_memory_blockage ();
52677 rtx_reuse_manager r;
52678 r.preprocess (pat);
52680 /* Verify that the repeated references to the SCRATCH show the use of
52681 reuse IDs. The first should be prefixed with a reuse ID,
52682 and the second should be dumped as a "reuse_rtx" of that ID.
52683 The expected string assumes Pmode == DImode. */
52684 if (Pmode == DImode)
52685 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52686 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52687 " (unspec:BLK [\n"
52688 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52689 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52692 /* Verify loading an RTL dump; specifically a dump of copying
52693 a param on x86_64 from a hard reg into the frame.
52694 This test is target-specific since the dump contains target-specific
52695 hard reg names. */
52697 static void
52698 ix86_test_loading_dump_fragment_1 ()
52700 rtl_dump_test t (SELFTEST_LOCATION,
52701 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52703 rtx_insn *insn = get_insn_by_uid (1);
52705 /* The block structure and indentation here is purely for
52706 readability; it mirrors the structure of the rtx. */
52707 tree mem_expr;
52709 rtx pat = PATTERN (insn);
52710 ASSERT_EQ (SET, GET_CODE (pat));
52712 rtx dest = SET_DEST (pat);
52713 ASSERT_EQ (MEM, GET_CODE (dest));
52714 /* Verify that the "/c" was parsed. */
52715 ASSERT_TRUE (RTX_FLAG (dest, call));
52716 ASSERT_EQ (SImode, GET_MODE (dest));
52718 rtx addr = XEXP (dest, 0);
52719 ASSERT_EQ (PLUS, GET_CODE (addr));
52720 ASSERT_EQ (DImode, GET_MODE (addr));
52722 rtx lhs = XEXP (addr, 0);
52723 /* Verify that the "frame" REG was consolidated. */
52724 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52727 rtx rhs = XEXP (addr, 1);
52728 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52729 ASSERT_EQ (-4, INTVAL (rhs));
52732 /* Verify that the "[1 i+0 S4 A32]" was parsed. */
52733 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52734 /* "i" should have been handled by synthesizing a global int
52735 variable named "i". */
52736 mem_expr = MEM_EXPR (dest);
52737 ASSERT_NE (mem_expr, NULL);
52738 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52739 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52740 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52741 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52742 /* "+0". */
52743 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52744 ASSERT_EQ (0, MEM_OFFSET (dest));
52745 /* "S4". */
52746 ASSERT_EQ (4, MEM_SIZE (dest));
52747 /* "A32. */
52748 ASSERT_EQ (32, MEM_ALIGN (dest));
52751 rtx src = SET_SRC (pat);
52752 ASSERT_EQ (REG, GET_CODE (src));
52753 ASSERT_EQ (SImode, GET_MODE (src));
52754 ASSERT_EQ (5, REGNO (src));
52755 tree reg_expr = REG_EXPR (src);
52756 /* "i" here should point to the same var as for the MEM_EXPR. */
52757 ASSERT_EQ (reg_expr, mem_expr);
52762 /* Verify that the RTL loader copes with a call_insn dump.
52763 This test is target-specific since the dump contains a target-specific
52764 hard reg name. */
52766 static void
52767 ix86_test_loading_call_insn ()
52769 /* The test dump includes register "xmm0", which requires TARGET_SSE
52770 to exist. */
52771 if (!TARGET_SSE)
52772 return;
52774 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52776 rtx_insn *insn = get_insns ();
52777 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52779 /* "/j". */
52780 ASSERT_TRUE (RTX_FLAG (insn, jump));
52782 rtx pat = PATTERN (insn);
52783 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52785 /* Verify REG_NOTES. */
52787 /* "(expr_list:REG_CALL_DECL". */
52788 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52789 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52790 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52792 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52793 rtx_expr_list *note1 = note0->next ();
52794 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52796 ASSERT_EQ (NULL, note1->next ());
52799 /* Verify CALL_INSN_FUNCTION_USAGE. */
52801 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52802 rtx_expr_list *usage
52803 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52804 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52805 ASSERT_EQ (DFmode, GET_MODE (usage));
52806 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52807 ASSERT_EQ (NULL, usage->next ());
52811 /* Verify that the RTL loader copes with a dump from print_rtx_function.
52812 This test is target-specific since the dump contains target-specific
52813 hard reg names. */
52815 static void
52816 ix86_test_loading_full_dump ()
52818 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52820 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52822 rtx_insn *insn_1 = get_insn_by_uid (1);
52823 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52825 rtx_insn *insn_7 = get_insn_by_uid (7);
52826 ASSERT_EQ (INSN, GET_CODE (insn_7));
52827 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52829 rtx_insn *insn_15 = get_insn_by_uid (15);
52830 ASSERT_EQ (INSN, GET_CODE (insn_15));
52831 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52833 /* Verify crtl->return_rtx. */
52834 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52835 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52836 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52839 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52840 In particular, verify that it correctly loads the 2nd operand.
52841 This test is target-specific since these are machine-specific
52842 operands (and enums). */
52844 static void
52845 ix86_test_loading_unspec ()
52847 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52849 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52851 ASSERT_TRUE (cfun);
52853 /* Test of an UNSPEC. */
52854 rtx_insn *insn = get_insns ();
52855 ASSERT_EQ (INSN, GET_CODE (insn));
52856 rtx set = single_set (insn);
52857 ASSERT_NE (NULL, set);
52858 rtx dst = SET_DEST (set);
52859 ASSERT_EQ (MEM, GET_CODE (dst));
52860 rtx src = SET_SRC (set);
52861 ASSERT_EQ (UNSPEC, GET_CODE (src));
52862 ASSERT_EQ (BLKmode, GET_MODE (src));
52863 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52865 rtx v0 = XVECEXP (src, 0, 0);
52867 /* Verify that the two uses of the first SCRATCH have pointer
52868 equality. */
52869 rtx scratch_a = XEXP (dst, 0);
52870 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52872 rtx scratch_b = XEXP (v0, 0);
52873 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52875 ASSERT_EQ (scratch_a, scratch_b);
52877 /* Verify that the two mems are thus treated as equal. */
52878 ASSERT_TRUE (rtx_equal_p (dst, v0));
52880 /* Verify that the insn is recognized. */
52881 ASSERT_NE (-1, recog_memoized (insn));
52883 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
52884 insn = NEXT_INSN (insn);
52885 ASSERT_EQ (INSN, GET_CODE (insn));
52887 set = single_set (insn);
52888 ASSERT_NE (NULL, set);
52890 src = SET_SRC (set);
52891 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
52892 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
52895 /* Run all target-specific selftests. */
52897 static void
52898 ix86_run_selftests (void)
52900 ix86_test_dumping_hard_regs ();
52901 ix86_test_dumping_memory_blockage ();
52903 /* Various tests of loading RTL dumps, here because they contain
52904 ix86-isms (e.g. names of hard regs). */
52905 ix86_test_loading_dump_fragment_1 ();
52906 ix86_test_loading_call_insn ();
52907 ix86_test_loading_full_dump ();
52908 ix86_test_loading_unspec ();
52911 } // namespace selftest
52913 #endif /* CHECKING_P */
52915 /* Initialize the GCC target structure. */
52916 #undef TARGET_RETURN_IN_MEMORY
52917 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
52919 #undef TARGET_LEGITIMIZE_ADDRESS
52920 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
52922 #undef TARGET_ATTRIBUTE_TABLE
52923 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
52924 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
52925 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
52926 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52927 # undef TARGET_MERGE_DECL_ATTRIBUTES
52928 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
52929 #endif
52931 #undef TARGET_COMP_TYPE_ATTRIBUTES
52932 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
52934 #undef TARGET_INIT_BUILTINS
52935 #define TARGET_INIT_BUILTINS ix86_init_builtins
52936 #undef TARGET_BUILTIN_DECL
52937 #define TARGET_BUILTIN_DECL ix86_builtin_decl
52938 #undef TARGET_EXPAND_BUILTIN
52939 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
52941 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
52942 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
52943 ix86_builtin_vectorized_function
52945 #undef TARGET_VECTORIZE_BUILTIN_GATHER
52946 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
52948 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
52949 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
52951 #undef TARGET_BUILTIN_RECIPROCAL
52952 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
52954 #undef TARGET_ASM_FUNCTION_EPILOGUE
52955 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
52957 #undef TARGET_ENCODE_SECTION_INFO
52958 #ifndef SUBTARGET_ENCODE_SECTION_INFO
52959 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
52960 #else
52961 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
52962 #endif
52964 #undef TARGET_ASM_OPEN_PAREN
52965 #define TARGET_ASM_OPEN_PAREN ""
52966 #undef TARGET_ASM_CLOSE_PAREN
52967 #define TARGET_ASM_CLOSE_PAREN ""
52969 #undef TARGET_ASM_BYTE_OP
52970 #define TARGET_ASM_BYTE_OP ASM_BYTE
52972 #undef TARGET_ASM_ALIGNED_HI_OP
52973 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
52974 #undef TARGET_ASM_ALIGNED_SI_OP
52975 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
52976 #ifdef ASM_QUAD
52977 #undef TARGET_ASM_ALIGNED_DI_OP
52978 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
52979 #endif
52981 #undef TARGET_PROFILE_BEFORE_PROLOGUE
52982 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
52984 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
52985 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
52987 #undef TARGET_ASM_UNALIGNED_HI_OP
52988 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
52989 #undef TARGET_ASM_UNALIGNED_SI_OP
52990 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
52991 #undef TARGET_ASM_UNALIGNED_DI_OP
52992 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
52994 #undef TARGET_PRINT_OPERAND
52995 #define TARGET_PRINT_OPERAND ix86_print_operand
52996 #undef TARGET_PRINT_OPERAND_ADDRESS
52997 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
52998 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
52999 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
53000 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
53001 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
53003 #undef TARGET_SCHED_INIT_GLOBAL
53004 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
53005 #undef TARGET_SCHED_ADJUST_COST
53006 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
53007 #undef TARGET_SCHED_ISSUE_RATE
53008 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
53009 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
53010 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
53011 ia32_multipass_dfa_lookahead
53012 #undef TARGET_SCHED_MACRO_FUSION_P
53013 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
53014 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
53015 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
53017 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
53018 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
53020 #undef TARGET_MEMMODEL_CHECK
53021 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
53023 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
53024 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
53026 #ifdef HAVE_AS_TLS
53027 #undef TARGET_HAVE_TLS
53028 #define TARGET_HAVE_TLS true
53029 #endif
53030 #undef TARGET_CANNOT_FORCE_CONST_MEM
53031 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
53032 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
53033 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
53035 #undef TARGET_DELEGITIMIZE_ADDRESS
53036 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
53038 #undef TARGET_MS_BITFIELD_LAYOUT_P
53039 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
53041 #if TARGET_MACHO
53042 #undef TARGET_BINDS_LOCAL_P
53043 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
53044 #else
53045 #undef TARGET_BINDS_LOCAL_P
53046 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
53047 #endif
53048 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
53049 #undef TARGET_BINDS_LOCAL_P
53050 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
53051 #endif
53053 #undef TARGET_ASM_OUTPUT_MI_THUNK
53054 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
53055 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
53056 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
53058 #undef TARGET_ASM_FILE_START
53059 #define TARGET_ASM_FILE_START x86_file_start
53061 #undef TARGET_OPTION_OVERRIDE
53062 #define TARGET_OPTION_OVERRIDE ix86_option_override
53064 #undef TARGET_REGISTER_MOVE_COST
53065 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
53066 #undef TARGET_MEMORY_MOVE_COST
53067 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
53068 #undef TARGET_RTX_COSTS
53069 #define TARGET_RTX_COSTS ix86_rtx_costs
53070 #undef TARGET_ADDRESS_COST
53071 #define TARGET_ADDRESS_COST ix86_address_cost
53073 #undef TARGET_FLAGS_REGNUM
53074 #define TARGET_FLAGS_REGNUM FLAGS_REG
53075 #undef TARGET_FIXED_CONDITION_CODE_REGS
53076 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
53077 #undef TARGET_CC_MODES_COMPATIBLE
53078 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
53080 #undef TARGET_MACHINE_DEPENDENT_REORG
53081 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
53083 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
53084 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
53086 #undef TARGET_BUILD_BUILTIN_VA_LIST
53087 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
53089 #undef TARGET_FOLD_BUILTIN
53090 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
53092 #undef TARGET_GIMPLE_FOLD_BUILTIN
53093 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
53095 #undef TARGET_COMPARE_VERSION_PRIORITY
53096 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
53098 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
53099 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
53100 ix86_generate_version_dispatcher_body
53102 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
53103 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
53104 ix86_get_function_versions_dispatcher
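/* va_list handling hooks; the 64-bit SysV and MS ABIs use different
   va_list types, so the canonical type is selected per function ABI.  */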
53106 #undef TARGET_ENUM_VA_LIST_P
53107 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
53109 #undef TARGET_FN_ABI_VA_LIST
53110 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
53112 #undef TARGET_CANONICAL_VA_LIST_TYPE
53113 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
53115 #undef TARGET_EXPAND_BUILTIN_VA_START
53116 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
53118 #undef TARGET_MD_ASM_ADJUST
53119 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
53121 #undef TARGET_C_EXCESS_PRECISION
53122 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
53123 #undef TARGET_PROMOTE_PROTOTYPES
53124 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
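/* Argument passing and calling-convention hooks.  */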
53125 #undef TARGET_SETUP_INCOMING_VARARGS
53126 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
53127 #undef TARGET_MUST_PASS_IN_STACK
53128 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
53129 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
53130 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
53131 #undef TARGET_FUNCTION_ARG_ADVANCE
53132 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
53133 #undef TARGET_FUNCTION_ARG
53134 #define TARGET_FUNCTION_ARG ix86_function_arg
53135 #undef TARGET_INIT_PIC_REG
53136 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
53137 #undef TARGET_USE_PSEUDO_PIC_REG
53138 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
53139 #undef TARGET_FUNCTION_ARG_BOUNDARY
53140 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
53141 #undef TARGET_PASS_BY_REFERENCE
53142 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
53143 #undef TARGET_INTERNAL_ARG_POINTER
53144 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
53145 #undef TARGET_UPDATE_STACK_BOUNDARY
53146 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
53147 #undef TARGET_GET_DRAP_RTX
53148 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
53149 #undef TARGET_STRICT_ARGUMENT_NAMING
53150 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
53151 #undef TARGET_STATIC_CHAIN
53152 #define TARGET_STATIC_CHAIN ix86_static_chain
53153 #undef TARGET_TRAMPOLINE_INIT
53154 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
53155 #undef TARGET_RETURN_POPS_ARGS
53156 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
53158 #undef TARGET_WARN_FUNC_RETURN
53159 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
53161 #undef TARGET_LEGITIMATE_COMBINED_INSN
53162 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
53164 #undef TARGET_ASAN_SHADOW_OFFSET
53165 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
53167 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
53168 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
53170 #undef TARGET_SCALAR_MODE_SUPPORTED_P
53171 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
53173 #undef TARGET_VECTOR_MODE_SUPPORTED_P
53174 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
53176 #undef TARGET_C_MODE_FOR_SUFFIX
53177 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
53179 #ifdef HAVE_AS_TLS
53180 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
53181 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
53182 #endif
53184 #ifdef SUBTARGET_INSERT_ATTRIBUTES
53185 #undef TARGET_INSERT_ATTRIBUTES
53186 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
53187 #endif
53189 #undef TARGET_MANGLE_TYPE
53190 #define TARGET_MANGLE_TYPE ix86_mangle_type
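/* Stack protector hooks.  */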
53192 #undef TARGET_STACK_PROTECT_GUARD
53193 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
53195 #if !TARGET_MACHO
53196 #undef TARGET_STACK_PROTECT_FAIL
53197 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
53198 #endif
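/* Function return value hooks.  */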
53200 #undef TARGET_FUNCTION_VALUE
53201 #define TARGET_FUNCTION_VALUE ix86_function_value
53203 #undef TARGET_FUNCTION_VALUE_REGNO_P
53204 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
53206 #undef TARGET_PROMOTE_FUNCTION_MODE
53207 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
53209 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
53210 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
53212 #undef TARGET_MEMBER_TYPE_FORCES_BLK
53213 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
53215 #undef TARGET_INSTANTIATE_DECLS
53216 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
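/* Register class and reload hooks.  */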
53218 #undef TARGET_SECONDARY_RELOAD
53219 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
53220 #undef TARGET_SECONDARY_MEMORY_NEEDED
53221 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
53222 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
53223 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
53225 #undef TARGET_CLASS_MAX_NREGS
53226 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
53228 #undef TARGET_PREFERRED_RELOAD_CLASS
53229 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
53230 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
53231 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
53232 #undef TARGET_CLASS_LIKELY_SPILLED_P
53233 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
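/* Vectorizer hooks: cost model, constant permutations and SIMD mode
   selection.  */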
53235 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
53236 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
53237 ix86_builtin_vectorization_cost
53238 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
53239 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
53240 ix86_vectorize_vec_perm_const_ok
53241 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
53242 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
53243 ix86_preferred_simd_mode
53244 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
53245 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
53246 ix86_autovectorize_vector_sizes
53247 #undef TARGET_VECTORIZE_GET_MASK_MODE
53248 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
53249 #undef TARGET_VECTORIZE_INIT_COST
53250 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
53251 #undef TARGET_VECTORIZE_ADD_STMT_COST
53252 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
53253 #undef TARGET_VECTORIZE_FINISH_COST
53254 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
53255 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
53256 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
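/* Support for per-function target options (the "target" attribute and
   function multi-versioning).  */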
53258 #undef TARGET_SET_CURRENT_FUNCTION
53259 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
53261 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
53262 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
53264 #undef TARGET_OPTION_SAVE
53265 #define TARGET_OPTION_SAVE ix86_function_specific_save
53267 #undef TARGET_OPTION_RESTORE
53268 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
53270 #undef TARGET_OPTION_POST_STREAM_IN
53271 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
53273 #undef TARGET_OPTION_PRINT
53274 #define TARGET_OPTION_PRINT ix86_function_specific_print
53276 #undef TARGET_OPTION_FUNCTION_VERSIONS
53277 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
53279 #undef TARGET_CAN_INLINE_P
53280 #define TARGET_CAN_INLINE_P ix86_can_inline_p
53282 #undef TARGET_LEGITIMATE_ADDRESS_P
53283 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
53285 #undef TARGET_REGISTER_PRIORITY
53286 #define TARGET_REGISTER_PRIORITY ix86_register_priority
53288 #undef TARGET_REGISTER_USAGE_LEVELING_P
53289 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
53291 #undef TARGET_LEGITIMATE_CONSTANT_P
53292 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
53294 #undef TARGET_COMPUTE_FRAME_LAYOUT
53295 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
53297 #undef TARGET_FRAME_POINTER_REQUIRED
53298 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
53300 #undef TARGET_CAN_ELIMINATE
53301 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
53303 #undef TARGET_EXTRA_LIVE_ON_ENTRY
53304 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
53306 #undef TARGET_ASM_CODE_END
53307 #define TARGET_ASM_CODE_END ix86_code_end
53309 #undef TARGET_CONDITIONAL_REGISTER_USAGE
53310 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
53312 #undef TARGET_LOOP_UNROLL_ADJUST
53313 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
53315 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
53316 #undef TARGET_SPILL_CLASS
53317 #define TARGET_SPILL_CLASS ix86_spill_class
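/* SIMD clone ("omp declare simd") hooks.  */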
53319 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
53320 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
53321 ix86_simd_clone_compute_vecsize_and_simdlen
53323 #undef TARGET_SIMD_CLONE_ADJUST
53324 #define TARGET_SIMD_CLONE_ADJUST \
53325 ix86_simd_clone_adjust
53327 #undef TARGET_SIMD_CLONE_USABLE
53328 #define TARGET_SIMD_CLONE_USABLE \
53329 ix86_simd_clone_usable
53331 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
53332 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
53333 ix86_float_exceptions_rounding_supported_p
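/* Mode-switching hooks for the optimize_mode_switching pass (used,
   e.g., to insert vzeroupper).  */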
53335 #undef TARGET_MODE_EMIT
53336 #define TARGET_MODE_EMIT ix86_emit_mode_set
53338 #undef TARGET_MODE_NEEDED
53339 #define TARGET_MODE_NEEDED ix86_mode_needed
53341 #undef TARGET_MODE_AFTER
53342 #define TARGET_MODE_AFTER ix86_mode_after
53344 #undef TARGET_MODE_ENTRY
53345 #define TARGET_MODE_ENTRY ix86_mode_entry
53347 #undef TARGET_MODE_EXIT
53348 #define TARGET_MODE_EXIT ix86_mode_exit
53350 #undef TARGET_MODE_PRIORITY
53351 #define TARGET_MODE_PRIORITY ix86_mode_priority
53353 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
53354 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
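/* Pointer Bounds Checker (MPX) hooks for passing, returning and
   initializing bounds.  */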
53356 #undef TARGET_LOAD_BOUNDS_FOR_ARG
53357 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
53359 #undef TARGET_STORE_BOUNDS_FOR_ARG
53360 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
53362 #undef TARGET_LOAD_RETURNED_BOUNDS
53363 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
53365 #undef TARGET_STORE_RETURNED_BOUNDS
53366 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
53368 #undef TARGET_CHKP_BOUND_MODE
53369 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
53371 #undef TARGET_BUILTIN_CHKP_FUNCTION
53372 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
53374 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
53375 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
53377 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
53378 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
53380 #undef TARGET_CHKP_INITIALIZE_BOUNDS
53381 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
53383 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
53384 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
53386 #undef TARGET_OFFLOAD_OPTIONS
53387 #define TARGET_OFFLOAD_OPTIONS \
53388 ix86_offload_options
53390 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
53391 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
53393 #undef TARGET_OPTAB_SUPPORTED_P
53394 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
53396 #undef TARGET_HARD_REGNO_SCRATCH_OK
53397 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
53399 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
53400 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
53402 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
53403 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
53405 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
53406 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
53408 #undef TARGET_INIT_LIBFUNCS
53409 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
53411 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
53412 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
53414 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
53415 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
53417 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
53418 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
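/* Hard register usage hooks.  */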
53420 #undef TARGET_HARD_REGNO_NREGS
53421 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
53422 #undef TARGET_HARD_REGNO_MODE_OK
53423 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
53425 #undef TARGET_MODES_TIEABLE_P
53426 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
53428 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
53429 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
53430 ix86_hard_regno_call_part_clobbered
53432 #undef TARGET_CAN_CHANGE_MODE_CLASS
53433 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
53435 #if CHECKING_P
53436 #undef TARGET_RUN_TARGET_SELFTESTS
53437 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
53438 #endif /* #if CHECKING_P */
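/* Build the target hook vector from the definitions above.  */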
53440 struct gcc_target targetm = TARGET_INITIALIZER;
53442 #include "gt-i386.h"