* combine-stack-adj.c (combine_stack_adjustments_for_block): Do
[official-gcc.git] / gcc / config / i386 / i386.c
blobd19c770d4cb6877f9a8fb9c69a7a24fa72a02c61
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
92 /* This file should be included last. */
93 #include "target-def.h"
95 static rtx legitimize_dllimport_symbol (rtx, bool);
96 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
97 static rtx legitimize_pe_coff_symbol (rtx, bool);
98 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
99 static bool ix86_save_reg (unsigned int, bool, bool);
100 static bool ix86_function_naked (const_tree);
102 #ifndef CHECK_STACK_LIMIT
103 #define CHECK_STACK_LIMIT (-1)
104 #endif
106 /* Return index of given mode in mult and division cost tables. */
107 #define MODE_INDEX(mode) \
108 ((mode) == QImode ? 0 \
109 : (mode) == HImode ? 1 \
110 : (mode) == SImode ? 2 \
111 : (mode) == DImode ? 3 \
112 : 4)
114 /* Processor costs (relative to an add) */
115 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
116 #define COSTS_N_BYTES(N) ((N) * 2)
118 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
120 static stringop_algs ix86_size_memcpy[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
123 static stringop_algs ix86_size_memset[2] = {
124 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
127 const
128 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
129 COSTS_N_BYTES (2), /* cost of an add instruction */
130 COSTS_N_BYTES (3), /* cost of a lea instruction */
131 COSTS_N_BYTES (2), /* variable shift costs */
132 COSTS_N_BYTES (3), /* constant shift costs */
133 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
134 COSTS_N_BYTES (3), /* HI */
135 COSTS_N_BYTES (3), /* SI */
136 COSTS_N_BYTES (3), /* DI */
137 COSTS_N_BYTES (5)}, /* other */
138 0, /* cost of multiply per each bit set */
139 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
140 COSTS_N_BYTES (3), /* HI */
141 COSTS_N_BYTES (3), /* SI */
142 COSTS_N_BYTES (3), /* DI */
143 COSTS_N_BYTES (5)}, /* other */
144 COSTS_N_BYTES (3), /* cost of movsx */
145 COSTS_N_BYTES (3), /* cost of movzx */
146 0, /* "large" insn */
147 2, /* MOVE_RATIO */
148 2, /* cost for loading QImode using movzbl */
149 {2, 2, 2}, /* cost of loading integer registers
150 in QImode, HImode and SImode.
151 Relative to reg-reg move (2). */
152 {2, 2, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {2, 2, 2}, /* cost of loading fp registers
155 in SFmode, DFmode and XFmode */
156 {2, 2, 2}, /* cost of storing fp registers
157 in SFmode, DFmode and XFmode */
158 3, /* cost of moving MMX register */
159 {3, 3}, /* cost of loading MMX registers
160 in SImode and DImode */
161 {3, 3}, /* cost of storing MMX registers
162 in SImode and DImode */
163 3, /* cost of moving SSE register */
164 {3, 3, 3}, /* cost of loading SSE registers
165 in SImode, DImode and TImode */
166 {3, 3, 3}, /* cost of storing SSE registers
167 in SImode, DImode and TImode */
168 3, /* MMX or SSE register to integer */
169 0, /* size of l1 cache */
170 0, /* size of l2 cache */
171 0, /* size of prefetch block */
172 0, /* number of parallel prefetches */
173 2, /* Branch cost */
174 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
175 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
176 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
177 COSTS_N_BYTES (2), /* cost of FABS instruction. */
178 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
179 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
180 ix86_size_memcpy,
181 ix86_size_memset,
182 1, /* scalar_stmt_cost. */
183 1, /* scalar load_cost. */
184 1, /* scalar_store_cost. */
185 1, /* vec_stmt_cost. */
186 1, /* vec_to_scalar_cost. */
187 1, /* scalar_to_vec_cost. */
188 1, /* vec_align_load_cost. */
189 1, /* vec_unalign_load_cost. */
190 1, /* vec_store_cost. */
191 1, /* cond_taken_branch_cost. */
192 1, /* cond_not_taken_branch_cost. */
195 /* Processor costs (relative to an add) */
196 static stringop_algs i386_memcpy[2] = {
197 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
198 DUMMY_STRINGOP_ALGS};
199 static stringop_algs i386_memset[2] = {
200 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
201 DUMMY_STRINGOP_ALGS};
203 static const
204 struct processor_costs i386_cost = { /* 386 specific costs */
205 COSTS_N_INSNS (1), /* cost of an add instruction */
206 COSTS_N_INSNS (1), /* cost of a lea instruction */
207 COSTS_N_INSNS (3), /* variable shift costs */
208 COSTS_N_INSNS (2), /* constant shift costs */
209 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
210 COSTS_N_INSNS (6), /* HI */
211 COSTS_N_INSNS (6), /* SI */
212 COSTS_N_INSNS (6), /* DI */
213 COSTS_N_INSNS (6)}, /* other */
214 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
215 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
216 COSTS_N_INSNS (23), /* HI */
217 COSTS_N_INSNS (23), /* SI */
218 COSTS_N_INSNS (23), /* DI */
219 COSTS_N_INSNS (23)}, /* other */
220 COSTS_N_INSNS (3), /* cost of movsx */
221 COSTS_N_INSNS (2), /* cost of movzx */
222 15, /* "large" insn */
223 3, /* MOVE_RATIO */
224 4, /* cost for loading QImode using movzbl */
225 {2, 4, 2}, /* cost of loading integer registers
226 in QImode, HImode and SImode.
227 Relative to reg-reg move (2). */
228 {2, 4, 2}, /* cost of storing integer registers */
229 2, /* cost of reg,reg fld/fst */
230 {8, 8, 8}, /* cost of loading fp registers
231 in SFmode, DFmode and XFmode */
232 {8, 8, 8}, /* cost of storing fp registers
233 in SFmode, DFmode and XFmode */
234 2, /* cost of moving MMX register */
235 {4, 8}, /* cost of loading MMX registers
236 in SImode and DImode */
237 {4, 8}, /* cost of storing MMX registers
238 in SImode and DImode */
239 2, /* cost of moving SSE register */
240 {4, 8, 16}, /* cost of loading SSE registers
241 in SImode, DImode and TImode */
242 {4, 8, 16}, /* cost of storing SSE registers
243 in SImode, DImode and TImode */
244 3, /* MMX or SSE register to integer */
245 0, /* size of l1 cache */
246 0, /* size of l2 cache */
247 0, /* size of prefetch block */
248 0, /* number of parallel prefetches */
249 1, /* Branch cost */
250 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
251 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
252 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
253 COSTS_N_INSNS (22), /* cost of FABS instruction. */
254 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
255 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
256 i386_memcpy,
257 i386_memset,
258 1, /* scalar_stmt_cost. */
259 1, /* scalar load_cost. */
260 1, /* scalar_store_cost. */
261 1, /* vec_stmt_cost. */
262 1, /* vec_to_scalar_cost. */
263 1, /* scalar_to_vec_cost. */
264 1, /* vec_align_load_cost. */
265 2, /* vec_unalign_load_cost. */
266 1, /* vec_store_cost. */
267 3, /* cond_taken_branch_cost. */
268 1, /* cond_not_taken_branch_cost. */
271 static stringop_algs i486_memcpy[2] = {
272 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
273 DUMMY_STRINGOP_ALGS};
274 static stringop_algs i486_memset[2] = {
275 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
276 DUMMY_STRINGOP_ALGS};
278 static const
279 struct processor_costs i486_cost = { /* 486 specific costs */
280 COSTS_N_INSNS (1), /* cost of an add instruction */
281 COSTS_N_INSNS (1), /* cost of a lea instruction */
282 COSTS_N_INSNS (3), /* variable shift costs */
283 COSTS_N_INSNS (2), /* constant shift costs */
284 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
285 COSTS_N_INSNS (12), /* HI */
286 COSTS_N_INSNS (12), /* SI */
287 COSTS_N_INSNS (12), /* DI */
288 COSTS_N_INSNS (12)}, /* other */
289 1, /* cost of multiply per each bit set */
290 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
291 COSTS_N_INSNS (40), /* HI */
292 COSTS_N_INSNS (40), /* SI */
293 COSTS_N_INSNS (40), /* DI */
294 COSTS_N_INSNS (40)}, /* other */
295 COSTS_N_INSNS (3), /* cost of movsx */
296 COSTS_N_INSNS (2), /* cost of movzx */
297 15, /* "large" insn */
298 3, /* MOVE_RATIO */
299 4, /* cost for loading QImode using movzbl */
300 {2, 4, 2}, /* cost of loading integer registers
301 in QImode, HImode and SImode.
302 Relative to reg-reg move (2). */
303 {2, 4, 2}, /* cost of storing integer registers */
304 2, /* cost of reg,reg fld/fst */
305 {8, 8, 8}, /* cost of loading fp registers
306 in SFmode, DFmode and XFmode */
307 {8, 8, 8}, /* cost of storing fp registers
308 in SFmode, DFmode and XFmode */
309 2, /* cost of moving MMX register */
310 {4, 8}, /* cost of loading MMX registers
311 in SImode and DImode */
312 {4, 8}, /* cost of storing MMX registers
313 in SImode and DImode */
314 2, /* cost of moving SSE register */
315 {4, 8, 16}, /* cost of loading SSE registers
316 in SImode, DImode and TImode */
317 {4, 8, 16}, /* cost of storing SSE registers
318 in SImode, DImode and TImode */
319 3, /* MMX or SSE register to integer */
320 4, /* size of l1 cache. 486 has 8kB cache
321 shared for code and data, so 4kB is
322 not really precise. */
323 4, /* size of l2 cache */
324 0, /* size of prefetch block */
325 0, /* number of parallel prefetches */
326 1, /* Branch cost */
327 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
328 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
329 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
330 COSTS_N_INSNS (3), /* cost of FABS instruction. */
331 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
332 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
333 i486_memcpy,
334 i486_memset,
335 1, /* scalar_stmt_cost. */
336 1, /* scalar load_cost. */
337 1, /* scalar_store_cost. */
338 1, /* vec_stmt_cost. */
339 1, /* vec_to_scalar_cost. */
340 1, /* scalar_to_vec_cost. */
341 1, /* vec_align_load_cost. */
342 2, /* vec_unalign_load_cost. */
343 1, /* vec_store_cost. */
344 3, /* cond_taken_branch_cost. */
345 1, /* cond_not_taken_branch_cost. */
348 static stringop_algs pentium_memcpy[2] = {
349 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
350 DUMMY_STRINGOP_ALGS};
351 static stringop_algs pentium_memset[2] = {
352 {libcall, {{-1, rep_prefix_4_byte, false}}},
353 DUMMY_STRINGOP_ALGS};
355 static const
356 struct processor_costs pentium_cost = {
357 COSTS_N_INSNS (1), /* cost of an add instruction */
358 COSTS_N_INSNS (1), /* cost of a lea instruction */
359 COSTS_N_INSNS (4), /* variable shift costs */
360 COSTS_N_INSNS (1), /* constant shift costs */
361 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
362 COSTS_N_INSNS (11), /* HI */
363 COSTS_N_INSNS (11), /* SI */
364 COSTS_N_INSNS (11), /* DI */
365 COSTS_N_INSNS (11)}, /* other */
366 0, /* cost of multiply per each bit set */
367 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
368 COSTS_N_INSNS (25), /* HI */
369 COSTS_N_INSNS (25), /* SI */
370 COSTS_N_INSNS (25), /* DI */
371 COSTS_N_INSNS (25)}, /* other */
372 COSTS_N_INSNS (3), /* cost of movsx */
373 COSTS_N_INSNS (2), /* cost of movzx */
374 8, /* "large" insn */
375 6, /* MOVE_RATIO */
376 6, /* cost for loading QImode using movzbl */
377 {2, 4, 2}, /* cost of loading integer registers
378 in QImode, HImode and SImode.
379 Relative to reg-reg move (2). */
380 {2, 4, 2}, /* cost of storing integer registers */
381 2, /* cost of reg,reg fld/fst */
382 {2, 2, 6}, /* cost of loading fp registers
383 in SFmode, DFmode and XFmode */
384 {4, 4, 6}, /* cost of storing fp registers
385 in SFmode, DFmode and XFmode */
386 8, /* cost of moving MMX register */
387 {8, 8}, /* cost of loading MMX registers
388 in SImode and DImode */
389 {8, 8}, /* cost of storing MMX registers
390 in SImode and DImode */
391 2, /* cost of moving SSE register */
392 {4, 8, 16}, /* cost of loading SSE registers
393 in SImode, DImode and TImode */
394 {4, 8, 16}, /* cost of storing SSE registers
395 in SImode, DImode and TImode */
396 3, /* MMX or SSE register to integer */
397 8, /* size of l1 cache. */
398 8, /* size of l2 cache */
399 0, /* size of prefetch block */
400 0, /* number of parallel prefetches */
401 2, /* Branch cost */
402 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
403 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
404 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
405 COSTS_N_INSNS (1), /* cost of FABS instruction. */
406 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
407 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
408 pentium_memcpy,
409 pentium_memset,
410 1, /* scalar_stmt_cost. */
411 1, /* scalar load_cost. */
412 1, /* scalar_store_cost. */
413 1, /* vec_stmt_cost. */
414 1, /* vec_to_scalar_cost. */
415 1, /* scalar_to_vec_cost. */
416 1, /* vec_align_load_cost. */
417 2, /* vec_unalign_load_cost. */
418 1, /* vec_store_cost. */
419 3, /* cond_taken_branch_cost. */
420 1, /* cond_not_taken_branch_cost. */
423 static const
424 struct processor_costs lakemont_cost = {
425 COSTS_N_INSNS (1), /* cost of an add instruction */
426 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
427 COSTS_N_INSNS (1), /* variable shift costs */
428 COSTS_N_INSNS (1), /* constant shift costs */
429 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
430 COSTS_N_INSNS (11), /* HI */
431 COSTS_N_INSNS (11), /* SI */
432 COSTS_N_INSNS (11), /* DI */
433 COSTS_N_INSNS (11)}, /* other */
434 0, /* cost of multiply per each bit set */
435 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
436 COSTS_N_INSNS (25), /* HI */
437 COSTS_N_INSNS (25), /* SI */
438 COSTS_N_INSNS (25), /* DI */
439 COSTS_N_INSNS (25)}, /* other */
440 COSTS_N_INSNS (3), /* cost of movsx */
441 COSTS_N_INSNS (2), /* cost of movzx */
442 8, /* "large" insn */
443 17, /* MOVE_RATIO */
444 6, /* cost for loading QImode using movzbl */
445 {2, 4, 2}, /* cost of loading integer registers
446 in QImode, HImode and SImode.
447 Relative to reg-reg move (2). */
448 {2, 4, 2}, /* cost of storing integer registers */
449 2, /* cost of reg,reg fld/fst */
450 {2, 2, 6}, /* cost of loading fp registers
451 in SFmode, DFmode and XFmode */
452 {4, 4, 6}, /* cost of storing fp registers
453 in SFmode, DFmode and XFmode */
454 8, /* cost of moving MMX register */
455 {8, 8}, /* cost of loading MMX registers
456 in SImode and DImode */
457 {8, 8}, /* cost of storing MMX registers
458 in SImode and DImode */
459 2, /* cost of moving SSE register */
460 {4, 8, 16}, /* cost of loading SSE registers
461 in SImode, DImode and TImode */
462 {4, 8, 16}, /* cost of storing SSE registers
463 in SImode, DImode and TImode */
464 3, /* MMX or SSE register to integer */
465 8, /* size of l1 cache. */
466 8, /* size of l2 cache */
467 0, /* size of prefetch block */
468 0, /* number of parallel prefetches */
469 2, /* Branch cost */
470 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
471 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
472 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
473 COSTS_N_INSNS (1), /* cost of FABS instruction. */
474 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
475 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
476 pentium_memcpy,
477 pentium_memset,
478 1, /* scalar_stmt_cost. */
479 1, /* scalar load_cost. */
480 1, /* scalar_store_cost. */
481 1, /* vec_stmt_cost. */
482 1, /* vec_to_scalar_cost. */
483 1, /* scalar_to_vec_cost. */
484 1, /* vec_align_load_cost. */
485 2, /* vec_unalign_load_cost. */
486 1, /* vec_store_cost. */
487 3, /* cond_taken_branch_cost. */
488 1, /* cond_not_taken_branch_cost. */
491 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
492 (we ensure the alignment). For small blocks inline loop is still a
493 noticeable win, for bigger blocks either rep movsl or rep movsb is
494 way to go. Rep movsb has apparently more expensive startup time in CPU,
495 but after 4K the difference is down in the noise. */
496 static stringop_algs pentiumpro_memcpy[2] = {
497 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
498 {8192, rep_prefix_4_byte, false},
499 {-1, rep_prefix_1_byte, false}}},
500 DUMMY_STRINGOP_ALGS};
501 static stringop_algs pentiumpro_memset[2] = {
502 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
503 {8192, rep_prefix_4_byte, false},
504 {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS};
506 static const
507 struct processor_costs pentiumpro_cost = {
508 COSTS_N_INSNS (1), /* cost of an add instruction */
509 COSTS_N_INSNS (1), /* cost of a lea instruction */
510 COSTS_N_INSNS (1), /* variable shift costs */
511 COSTS_N_INSNS (1), /* constant shift costs */
512 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
513 COSTS_N_INSNS (4), /* HI */
514 COSTS_N_INSNS (4), /* SI */
515 COSTS_N_INSNS (4), /* DI */
516 COSTS_N_INSNS (4)}, /* other */
517 0, /* cost of multiply per each bit set */
518 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
519 COSTS_N_INSNS (17), /* HI */
520 COSTS_N_INSNS (17), /* SI */
521 COSTS_N_INSNS (17), /* DI */
522 COSTS_N_INSNS (17)}, /* other */
523 COSTS_N_INSNS (1), /* cost of movsx */
524 COSTS_N_INSNS (1), /* cost of movzx */
525 8, /* "large" insn */
526 6, /* MOVE_RATIO */
527 2, /* cost for loading QImode using movzbl */
528 {4, 4, 4}, /* cost of loading integer registers
529 in QImode, HImode and SImode.
530 Relative to reg-reg move (2). */
531 {2, 2, 2}, /* cost of storing integer registers */
532 2, /* cost of reg,reg fld/fst */
533 {2, 2, 6}, /* cost of loading fp registers
534 in SFmode, DFmode and XFmode */
535 {4, 4, 6}, /* cost of storing fp registers
536 in SFmode, DFmode and XFmode */
537 2, /* cost of moving MMX register */
538 {2, 2}, /* cost of loading MMX registers
539 in SImode and DImode */
540 {2, 2}, /* cost of storing MMX registers
541 in SImode and DImode */
542 2, /* cost of moving SSE register */
543 {2, 2, 8}, /* cost of loading SSE registers
544 in SImode, DImode and TImode */
545 {2, 2, 8}, /* cost of storing SSE registers
546 in SImode, DImode and TImode */
547 3, /* MMX or SSE register to integer */
548 8, /* size of l1 cache. */
549 256, /* size of l2 cache */
550 32, /* size of prefetch block */
551 6, /* number of parallel prefetches */
552 2, /* Branch cost */
553 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
554 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
555 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
556 COSTS_N_INSNS (2), /* cost of FABS instruction. */
557 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
558 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
559 pentiumpro_memcpy,
560 pentiumpro_memset,
561 1, /* scalar_stmt_cost. */
562 1, /* scalar load_cost. */
563 1, /* scalar_store_cost. */
564 1, /* vec_stmt_cost. */
565 1, /* vec_to_scalar_cost. */
566 1, /* scalar_to_vec_cost. */
567 1, /* vec_align_load_cost. */
568 2, /* vec_unalign_load_cost. */
569 1, /* vec_store_cost. */
570 3, /* cond_taken_branch_cost. */
571 1, /* cond_not_taken_branch_cost. */
574 static stringop_algs geode_memcpy[2] = {
575 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS};
577 static stringop_algs geode_memset[2] = {
578 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
579 DUMMY_STRINGOP_ALGS};
580 static const
581 struct processor_costs geode_cost = {
582 COSTS_N_INSNS (1), /* cost of an add instruction */
583 COSTS_N_INSNS (1), /* cost of a lea instruction */
584 COSTS_N_INSNS (2), /* variable shift costs */
585 COSTS_N_INSNS (1), /* constant shift costs */
586 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
587 COSTS_N_INSNS (4), /* HI */
588 COSTS_N_INSNS (7), /* SI */
589 COSTS_N_INSNS (7), /* DI */
590 COSTS_N_INSNS (7)}, /* other */
591 0, /* cost of multiply per each bit set */
592 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
593 COSTS_N_INSNS (23), /* HI */
594 COSTS_N_INSNS (39), /* SI */
595 COSTS_N_INSNS (39), /* DI */
596 COSTS_N_INSNS (39)}, /* other */
597 COSTS_N_INSNS (1), /* cost of movsx */
598 COSTS_N_INSNS (1), /* cost of movzx */
599 8, /* "large" insn */
600 4, /* MOVE_RATIO */
601 1, /* cost for loading QImode using movzbl */
602 {1, 1, 1}, /* cost of loading integer registers
603 in QImode, HImode and SImode.
604 Relative to reg-reg move (2). */
605 {1, 1, 1}, /* cost of storing integer registers */
606 1, /* cost of reg,reg fld/fst */
607 {1, 1, 1}, /* cost of loading fp registers
608 in SFmode, DFmode and XFmode */
609 {4, 6, 6}, /* cost of storing fp registers
610 in SFmode, DFmode and XFmode */
612 2, /* cost of moving MMX register */
613 {2, 2}, /* cost of loading MMX registers
614 in SImode and DImode */
615 {2, 2}, /* cost of storing MMX registers
616 in SImode and DImode */
617 2, /* cost of moving SSE register */
618 {2, 2, 8}, /* cost of loading SSE registers
619 in SImode, DImode and TImode */
620 {2, 2, 8}, /* cost of storing SSE registers
621 in SImode, DImode and TImode */
622 3, /* MMX or SSE register to integer */
623 64, /* size of l1 cache. */
624 128, /* size of l2 cache. */
625 32, /* size of prefetch block */
626 1, /* number of parallel prefetches */
627 1, /* Branch cost */
628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
629 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
630 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
631 COSTS_N_INSNS (1), /* cost of FABS instruction. */
632 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
633 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
634 geode_memcpy,
635 geode_memset,
636 1, /* scalar_stmt_cost. */
637 1, /* scalar load_cost. */
638 1, /* scalar_store_cost. */
639 1, /* vec_stmt_cost. */
640 1, /* vec_to_scalar_cost. */
641 1, /* scalar_to_vec_cost. */
642 1, /* vec_align_load_cost. */
643 2, /* vec_unalign_load_cost. */
644 1, /* vec_store_cost. */
645 3, /* cond_taken_branch_cost. */
646 1, /* cond_not_taken_branch_cost. */
649 static stringop_algs k6_memcpy[2] = {
650 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static stringop_algs k6_memset[2] = {
653 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
654 DUMMY_STRINGOP_ALGS};
655 static const
656 struct processor_costs k6_cost = {
657 COSTS_N_INSNS (1), /* cost of an add instruction */
658 COSTS_N_INSNS (2), /* cost of a lea instruction */
659 COSTS_N_INSNS (1), /* variable shift costs */
660 COSTS_N_INSNS (1), /* constant shift costs */
661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
662 COSTS_N_INSNS (3), /* HI */
663 COSTS_N_INSNS (3), /* SI */
664 COSTS_N_INSNS (3), /* DI */
665 COSTS_N_INSNS (3)}, /* other */
666 0, /* cost of multiply per each bit set */
667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
668 COSTS_N_INSNS (18), /* HI */
669 COSTS_N_INSNS (18), /* SI */
670 COSTS_N_INSNS (18), /* DI */
671 COSTS_N_INSNS (18)}, /* other */
672 COSTS_N_INSNS (2), /* cost of movsx */
673 COSTS_N_INSNS (2), /* cost of movzx */
674 8, /* "large" insn */
675 4, /* MOVE_RATIO */
676 3, /* cost for loading QImode using movzbl */
677 {4, 5, 4}, /* cost of loading integer registers
678 in QImode, HImode and SImode.
679 Relative to reg-reg move (2). */
680 {2, 3, 2}, /* cost of storing integer registers */
681 4, /* cost of reg,reg fld/fst */
682 {6, 6, 6}, /* cost of loading fp registers
683 in SFmode, DFmode and XFmode */
684 {4, 4, 4}, /* cost of storing fp registers
685 in SFmode, DFmode and XFmode */
686 2, /* cost of moving MMX register */
687 {2, 2}, /* cost of loading MMX registers
688 in SImode and DImode */
689 {2, 2}, /* cost of storing MMX registers
690 in SImode and DImode */
691 2, /* cost of moving SSE register */
692 {2, 2, 8}, /* cost of loading SSE registers
693 in SImode, DImode and TImode */
694 {2, 2, 8}, /* cost of storing SSE registers
695 in SImode, DImode and TImode */
696 6, /* MMX or SSE register to integer */
697 32, /* size of l1 cache. */
698 32, /* size of l2 cache. Some models
699 have integrated l2 cache, but
700 optimizing for k6 is not important
701 enough to worry about that. */
702 32, /* size of prefetch block */
703 1, /* number of parallel prefetches */
704 1, /* Branch cost */
705 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
706 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
707 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
708 COSTS_N_INSNS (2), /* cost of FABS instruction. */
709 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
710 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
711 k6_memcpy,
712 k6_memset,
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
726 /* For some reason, Athlon deals better with REP prefix (relative to loops)
727 compared to K8. Alignment becomes important after 8 bytes for memcpy and
728 128 bytes for memset. */
729 static stringop_algs athlon_memcpy[2] = {
730 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 DUMMY_STRINGOP_ALGS};
732 static stringop_algs athlon_memset[2] = {
733 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
734 DUMMY_STRINGOP_ALGS};
735 static const
736 struct processor_costs athlon_cost = {
737 COSTS_N_INSNS (1), /* cost of an add instruction */
738 COSTS_N_INSNS (2), /* cost of a lea instruction */
739 COSTS_N_INSNS (1), /* variable shift costs */
740 COSTS_N_INSNS (1), /* constant shift costs */
741 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
742 COSTS_N_INSNS (5), /* HI */
743 COSTS_N_INSNS (5), /* SI */
744 COSTS_N_INSNS (5), /* DI */
745 COSTS_N_INSNS (5)}, /* other */
746 0, /* cost of multiply per each bit set */
747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
748 COSTS_N_INSNS (26), /* HI */
749 COSTS_N_INSNS (42), /* SI */
750 COSTS_N_INSNS (74), /* DI */
751 COSTS_N_INSNS (74)}, /* other */
752 COSTS_N_INSNS (1), /* cost of movsx */
753 COSTS_N_INSNS (1), /* cost of movzx */
754 8, /* "large" insn */
755 9, /* MOVE_RATIO */
756 4, /* cost for loading QImode using movzbl */
757 {3, 4, 3}, /* cost of loading integer registers
758 in QImode, HImode and SImode.
759 Relative to reg-reg move (2). */
760 {3, 4, 3}, /* cost of storing integer registers */
761 4, /* cost of reg,reg fld/fst */
762 {4, 4, 12}, /* cost of loading fp registers
763 in SFmode, DFmode and XFmode */
764 {6, 6, 8}, /* cost of storing fp registers
765 in SFmode, DFmode and XFmode */
766 2, /* cost of moving MMX register */
767 {4, 4}, /* cost of loading MMX registers
768 in SImode and DImode */
769 {4, 4}, /* cost of storing MMX registers
770 in SImode and DImode */
771 2, /* cost of moving SSE register */
772 {4, 4, 6}, /* cost of loading SSE registers
773 in SImode, DImode and TImode */
774 {4, 4, 5}, /* cost of storing SSE registers
775 in SImode, DImode and TImode */
776 5, /* MMX or SSE register to integer */
777 64, /* size of l1 cache. */
778 256, /* size of l2 cache. */
779 64, /* size of prefetch block */
780 6, /* number of parallel prefetches */
781 5, /* Branch cost */
782 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
783 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
784 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
785 COSTS_N_INSNS (2), /* cost of FABS instruction. */
786 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
787 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
788 athlon_memcpy,
789 athlon_memset,
790 1, /* scalar_stmt_cost. */
791 1, /* scalar load_cost. */
792 1, /* scalar_store_cost. */
793 1, /* vec_stmt_cost. */
794 1, /* vec_to_scalar_cost. */
795 1, /* scalar_to_vec_cost. */
796 1, /* vec_align_load_cost. */
797 2, /* vec_unalign_load_cost. */
798 1, /* vec_store_cost. */
799 3, /* cond_taken_branch_cost. */
800 1, /* cond_not_taken_branch_cost. */
803 /* K8 has optimized REP instruction for medium sized blocks, but for very
804 small blocks it is better to use loop. For large blocks, libcall can
805 do nontemporary accesses and beat inline considerably. */
806 static stringop_algs k8_memcpy[2] = {
807 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
808 {-1, rep_prefix_4_byte, false}}},
809 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
810 {-1, libcall, false}}}};
811 static stringop_algs k8_memset[2] = {
812 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
813 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
814 {libcall, {{48, unrolled_loop, false},
815 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
816 static const
817 struct processor_costs k8_cost = {
818 COSTS_N_INSNS (1), /* cost of an add instruction */
819 COSTS_N_INSNS (2), /* cost of a lea instruction */
820 COSTS_N_INSNS (1), /* variable shift costs */
821 COSTS_N_INSNS (1), /* constant shift costs */
822 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
823 COSTS_N_INSNS (4), /* HI */
824 COSTS_N_INSNS (3), /* SI */
825 COSTS_N_INSNS (4), /* DI */
826 COSTS_N_INSNS (5)}, /* other */
827 0, /* cost of multiply per each bit set */
828 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
829 COSTS_N_INSNS (26), /* HI */
830 COSTS_N_INSNS (42), /* SI */
831 COSTS_N_INSNS (74), /* DI */
832 COSTS_N_INSNS (74)}, /* other */
833 COSTS_N_INSNS (1), /* cost of movsx */
834 COSTS_N_INSNS (1), /* cost of movzx */
835 8, /* "large" insn */
836 9, /* MOVE_RATIO */
837 4, /* cost for loading QImode using movzbl */
838 {3, 4, 3}, /* cost of loading integer registers
839 in QImode, HImode and SImode.
840 Relative to reg-reg move (2). */
841 {3, 4, 3}, /* cost of storing integer registers */
842 4, /* cost of reg,reg fld/fst */
843 {4, 4, 12}, /* cost of loading fp registers
844 in SFmode, DFmode and XFmode */
845 {6, 6, 8}, /* cost of storing fp registers
846 in SFmode, DFmode and XFmode */
847 2, /* cost of moving MMX register */
848 {3, 3}, /* cost of loading MMX registers
849 in SImode and DImode */
850 {4, 4}, /* cost of storing MMX registers
851 in SImode and DImode */
852 2, /* cost of moving SSE register */
853 {4, 3, 6}, /* cost of loading SSE registers
854 in SImode, DImode and TImode */
855 {4, 4, 5}, /* cost of storing SSE registers
856 in SImode, DImode and TImode */
857 5, /* MMX or SSE register to integer */
858 64, /* size of l1 cache. */
859 512, /* size of l2 cache. */
860 64, /* size of prefetch block */
861 /* New AMD processors never drop prefetches; if they cannot be performed
862 immediately, they are queued. We set number of simultaneous prefetches
863 to a large constant to reflect this (it probably is not a good idea not
864 to limit number of prefetches at all, as their execution also takes some
865 time). */
866 100, /* number of parallel prefetches */
867 3, /* Branch cost */
868 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
869 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
870 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
871 COSTS_N_INSNS (2), /* cost of FABS instruction. */
872 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
873 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
875 k8_memcpy,
876 k8_memset,
877 4, /* scalar_stmt_cost. */
878 2, /* scalar load_cost. */
879 2, /* scalar_store_cost. */
880 5, /* vec_stmt_cost. */
881 0, /* vec_to_scalar_cost. */
882 2, /* scalar_to_vec_cost. */
883 2, /* vec_align_load_cost. */
884 3, /* vec_unalign_load_cost. */
885 3, /* vec_store_cost. */
886 3, /* cond_taken_branch_cost. */
887 2, /* cond_not_taken_branch_cost. */
890 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
891 very small blocks it is better to use loop. For large blocks, libcall can
892 do nontemporary accesses and beat inline considerably. */
893 static stringop_algs amdfam10_memcpy[2] = {
894 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
895 {-1, rep_prefix_4_byte, false}}},
896 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
897 {-1, libcall, false}}}};
898 static stringop_algs amdfam10_memset[2] = {
899 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
900 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
901 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
902 {-1, libcall, false}}}};
903 struct processor_costs amdfam10_cost = {
904 COSTS_N_INSNS (1), /* cost of an add instruction */
905 COSTS_N_INSNS (2), /* cost of a lea instruction */
906 COSTS_N_INSNS (1), /* variable shift costs */
907 COSTS_N_INSNS (1), /* constant shift costs */
908 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
909 COSTS_N_INSNS (4), /* HI */
910 COSTS_N_INSNS (3), /* SI */
911 COSTS_N_INSNS (4), /* DI */
912 COSTS_N_INSNS (5)}, /* other */
913 0, /* cost of multiply per each bit set */
914 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
915 COSTS_N_INSNS (35), /* HI */
916 COSTS_N_INSNS (51), /* SI */
917 COSTS_N_INSNS (83), /* DI */
918 COSTS_N_INSNS (83)}, /* other */
919 COSTS_N_INSNS (1), /* cost of movsx */
920 COSTS_N_INSNS (1), /* cost of movzx */
921 8, /* "large" insn */
922 9, /* MOVE_RATIO */
923 4, /* cost for loading QImode using movzbl */
924 {3, 4, 3}, /* cost of loading integer registers
925 in QImode, HImode and SImode.
926 Relative to reg-reg move (2). */
927 {3, 4, 3}, /* cost of storing integer registers */
928 4, /* cost of reg,reg fld/fst */
929 {4, 4, 12}, /* cost of loading fp registers
930 in SFmode, DFmode and XFmode */
931 {6, 6, 8}, /* cost of storing fp registers
932 in SFmode, DFmode and XFmode */
933 2, /* cost of moving MMX register */
934 {3, 3}, /* cost of loading MMX registers
935 in SImode and DImode */
936 {4, 4}, /* cost of storing MMX registers
937 in SImode and DImode */
938 2, /* cost of moving SSE register */
939 {4, 4, 3}, /* cost of loading SSE registers
940 in SImode, DImode and TImode */
941 {4, 4, 5}, /* cost of storing SSE registers
942 in SImode, DImode and TImode */
943 3, /* MMX or SSE register to integer */
944 /* On K8:
945 MOVD reg64, xmmreg Double FSTORE 4
946 MOVD reg32, xmmreg Double FSTORE 4
947 On AMDFAM10:
948 MOVD reg64, xmmreg Double FADD 3
949 1/1 1/1
950 MOVD reg32, xmmreg Double FADD 3
951 1/1 1/1 */
952 64, /* size of l1 cache. */
953 512, /* size of l2 cache. */
954 64, /* size of prefetch block */
955 /* New AMD processors never drop prefetches; if they cannot be performed
956 immediately, they are queued. We set number of simultaneous prefetches
957 to a large constant to reflect this (it probably is not a good idea not
958 to limit number of prefetches at all, as their execution also takes some
959 time). */
960 100, /* number of parallel prefetches */
961 2, /* Branch cost */
962 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
963 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
964 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
965 COSTS_N_INSNS (2), /* cost of FABS instruction. */
966 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
967 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
969 amdfam10_memcpy,
970 amdfam10_memset,
971 4, /* scalar_stmt_cost. */
972 2, /* scalar load_cost. */
973 2, /* scalar_store_cost. */
974 6, /* vec_stmt_cost. */
975 0, /* vec_to_scalar_cost. */
976 2, /* scalar_to_vec_cost. */
977 2, /* vec_align_load_cost. */
978 2, /* vec_unalign_load_cost. */
979 2, /* vec_store_cost. */
980 2, /* cond_taken_branch_cost. */
981 1, /* cond_not_taken_branch_cost. */
984 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
985 very small blocks it is better to use loop. For large blocks, libcall
986 can do nontemporary accesses and beat inline considerably. */
987 static stringop_algs bdver1_memcpy[2] = {
988 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
989 {-1, rep_prefix_4_byte, false}}},
990 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
991 {-1, libcall, false}}}};
992 static stringop_algs bdver1_memset[2] = {
993 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
994 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
995 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
996 {-1, libcall, false}}}};
998 const struct processor_costs bdver1_cost = {
999 COSTS_N_INSNS (1), /* cost of an add instruction */
1000 COSTS_N_INSNS (1), /* cost of a lea instruction */
1001 COSTS_N_INSNS (1), /* variable shift costs */
1002 COSTS_N_INSNS (1), /* constant shift costs */
1003 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1004 COSTS_N_INSNS (4), /* HI */
1005 COSTS_N_INSNS (4), /* SI */
1006 COSTS_N_INSNS (6), /* DI */
1007 COSTS_N_INSNS (6)}, /* other */
1008 0, /* cost of multiply per each bit set */
1009 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1010 COSTS_N_INSNS (35), /* HI */
1011 COSTS_N_INSNS (51), /* SI */
1012 COSTS_N_INSNS (83), /* DI */
1013 COSTS_N_INSNS (83)}, /* other */
1014 COSTS_N_INSNS (1), /* cost of movsx */
1015 COSTS_N_INSNS (1), /* cost of movzx */
1016 8, /* "large" insn */
1017 9, /* MOVE_RATIO */
1018 4, /* cost for loading QImode using movzbl */
1019 {5, 5, 4}, /* cost of loading integer registers
1020 in QImode, HImode and SImode.
1021 Relative to reg-reg move (2). */
1022 {4, 4, 4}, /* cost of storing integer registers */
1023 2, /* cost of reg,reg fld/fst */
1024 {5, 5, 12}, /* cost of loading fp registers
1025 in SFmode, DFmode and XFmode */
1026 {4, 4, 8}, /* cost of storing fp registers
1027 in SFmode, DFmode and XFmode */
1028 2, /* cost of moving MMX register */
1029 {4, 4}, /* cost of loading MMX registers
1030 in SImode and DImode */
1031 {4, 4}, /* cost of storing MMX registers
1032 in SImode and DImode */
1033 2, /* cost of moving SSE register */
1034 {4, 4, 4}, /* cost of loading SSE registers
1035 in SImode, DImode and TImode */
1036 {4, 4, 4}, /* cost of storing SSE registers
1037 in SImode, DImode and TImode */
1038 2, /* MMX or SSE register to integer */
1039 /* On K8:
1040 MOVD reg64, xmmreg Double FSTORE 4
1041 MOVD reg32, xmmreg Double FSTORE 4
1042 On AMDFAM10:
1043 MOVD reg64, xmmreg Double FADD 3
1044 1/1 1/1
1045 MOVD reg32, xmmreg Double FADD 3
1046 1/1 1/1 */
1047 16, /* size of l1 cache. */
1048 2048, /* size of l2 cache. */
1049 64, /* size of prefetch block */
1050 /* New AMD processors never drop prefetches; if they cannot be performed
1051 immediately, they are queued. We set number of simultaneous prefetches
1052 to a large constant to reflect this (it probably is not a good idea not
1053 to limit number of prefetches at all, as their execution also takes some
1054 time). */
1055 100, /* number of parallel prefetches */
1056 2, /* Branch cost */
1057 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1058 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1059 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1060 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1061 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1062 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1064 bdver1_memcpy,
1065 bdver1_memset,
1066 6, /* scalar_stmt_cost. */
1067 4, /* scalar load_cost. */
1068 4, /* scalar_store_cost. */
1069 6, /* vec_stmt_cost. */
1070 0, /* vec_to_scalar_cost. */
1071 2, /* scalar_to_vec_cost. */
1072 4, /* vec_align_load_cost. */
1073 4, /* vec_unalign_load_cost. */
1074 4, /* vec_store_cost. */
1075 4, /* cond_taken_branch_cost. */
1076 2, /* cond_not_taken_branch_cost. */
1079 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1080 very small blocks it is better to use loop. For large blocks, libcall
1081 can do nontemporary accesses and beat inline considerably. */
1083 static stringop_algs bdver2_memcpy[2] = {
1084 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1085 {-1, rep_prefix_4_byte, false}}},
1086 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1087 {-1, libcall, false}}}};
1088 static stringop_algs bdver2_memset[2] = {
1089 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1090 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1091 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1092 {-1, libcall, false}}}};
1094 const struct processor_costs bdver2_cost = {
1095 COSTS_N_INSNS (1), /* cost of an add instruction */
1096 COSTS_N_INSNS (1), /* cost of a lea instruction */
1097 COSTS_N_INSNS (1), /* variable shift costs */
1098 COSTS_N_INSNS (1), /* constant shift costs */
1099 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1100 COSTS_N_INSNS (4), /* HI */
1101 COSTS_N_INSNS (4), /* SI */
1102 COSTS_N_INSNS (6), /* DI */
1103 COSTS_N_INSNS (6)}, /* other */
1104 0, /* cost of multiply per each bit set */
1105 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1106 COSTS_N_INSNS (35), /* HI */
1107 COSTS_N_INSNS (51), /* SI */
1108 COSTS_N_INSNS (83), /* DI */
1109 COSTS_N_INSNS (83)}, /* other */
1110 COSTS_N_INSNS (1), /* cost of movsx */
1111 COSTS_N_INSNS (1), /* cost of movzx */
1112 8, /* "large" insn */
1113 9, /* MOVE_RATIO */
1114 4, /* cost for loading QImode using movzbl */
1115 {5, 5, 4}, /* cost of loading integer registers
1116 in QImode, HImode and SImode.
1117 Relative to reg-reg move (2). */
1118 {4, 4, 4}, /* cost of storing integer registers */
1119 2, /* cost of reg,reg fld/fst */
1120 {5, 5, 12}, /* cost of loading fp registers
1121 in SFmode, DFmode and XFmode */
1122 {4, 4, 8}, /* cost of storing fp registers
1123 in SFmode, DFmode and XFmode */
1124 2, /* cost of moving MMX register */
1125 {4, 4}, /* cost of loading MMX registers
1126 in SImode and DImode */
1127 {4, 4}, /* cost of storing MMX registers
1128 in SImode and DImode */
1129 2, /* cost of moving SSE register */
1130 {4, 4, 4}, /* cost of loading SSE registers
1131 in SImode, DImode and TImode */
1132 {4, 4, 4}, /* cost of storing SSE registers
1133 in SImode, DImode and TImode */
1134 2, /* MMX or SSE register to integer */
1135 /* On K8:
1136 MOVD reg64, xmmreg Double FSTORE 4
1137 MOVD reg32, xmmreg Double FSTORE 4
1138 On AMDFAM10:
1139 MOVD reg64, xmmreg Double FADD 3
1140 1/1 1/1
1141 MOVD reg32, xmmreg Double FADD 3
1142 1/1 1/1 */
1143 16, /* size of l1 cache. */
1144 2048, /* size of l2 cache. */
1145 64, /* size of prefetch block */
1146 /* New AMD processors never drop prefetches; if they cannot be performed
1147 immediately, they are queued. We set number of simultaneous prefetches
1148 to a large constant to reflect this (it probably is not a good idea not
1149 to limit number of prefetches at all, as their execution also takes some
1150 time). */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1160 bdver2_memcpy,
1161 bdver2_memset,
1162 6, /* scalar_stmt_cost. */
1163 4, /* scalar load_cost. */
1164 4, /* scalar_store_cost. */
1165 6, /* vec_stmt_cost. */
1166 0, /* vec_to_scalar_cost. */
1167 2, /* scalar_to_vec_cost. */
1168 4, /* vec_align_load_cost. */
1169 4, /* vec_unalign_load_cost. */
1170 4, /* vec_store_cost. */
1171 4, /* cond_taken_branch_cost. */
1172 2, /* cond_not_taken_branch_cost. */
1176 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1177 very small blocks it is better to use loop. For large blocks, libcall
1178 can do nontemporary accesses and beat inline considerably. */
1179 static stringop_algs bdver3_memcpy[2] = {
1180 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1181 {-1, rep_prefix_4_byte, false}}},
1182 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1183 {-1, libcall, false}}}};
1184 static stringop_algs bdver3_memset[2] = {
1185 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1186 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1187 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1188 {-1, libcall, false}}}};
1189 struct processor_costs bdver3_cost = {
1190 COSTS_N_INSNS (1), /* cost of an add instruction */
1191 COSTS_N_INSNS (1), /* cost of a lea instruction */
1192 COSTS_N_INSNS (1), /* variable shift costs */
1193 COSTS_N_INSNS (1), /* constant shift costs */
1194 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1195 COSTS_N_INSNS (4), /* HI */
1196 COSTS_N_INSNS (4), /* SI */
1197 COSTS_N_INSNS (6), /* DI */
1198 COSTS_N_INSNS (6)}, /* other */
1199 0, /* cost of multiply per each bit set */
1200 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1201 COSTS_N_INSNS (35), /* HI */
1202 COSTS_N_INSNS (51), /* SI */
1203 COSTS_N_INSNS (83), /* DI */
1204 COSTS_N_INSNS (83)}, /* other */
1205 COSTS_N_INSNS (1), /* cost of movsx */
1206 COSTS_N_INSNS (1), /* cost of movzx */
1207 8, /* "large" insn */
1208 9, /* MOVE_RATIO */
1209 4, /* cost for loading QImode using movzbl */
1210 {5, 5, 4}, /* cost of loading integer registers
1211 in QImode, HImode and SImode.
1212 Relative to reg-reg move (2). */
1213 {4, 4, 4}, /* cost of storing integer registers */
1214 2, /* cost of reg,reg fld/fst */
1215 {5, 5, 12}, /* cost of loading fp registers
1216 in SFmode, DFmode and XFmode */
1217 {4, 4, 8}, /* cost of storing fp registers
1218 in SFmode, DFmode and XFmode */
1219 2, /* cost of moving MMX register */
1220 {4, 4}, /* cost of loading MMX registers
1221 in SImode and DImode */
1222 {4, 4}, /* cost of storing MMX registers
1223 in SImode and DImode */
1224 2, /* cost of moving SSE register */
1225 {4, 4, 4}, /* cost of loading SSE registers
1226 in SImode, DImode and TImode */
1227 {4, 4, 4}, /* cost of storing SSE registers
1228 in SImode, DImode and TImode */
1229 2, /* MMX or SSE register to integer */
1230 16, /* size of l1 cache. */
1231 2048, /* size of l2 cache. */
1232 64, /* size of prefetch block */
1233 /* New AMD processors never drop prefetches; if they cannot be performed
1234 immediately, they are queued. We set number of simultaneous prefetches
1235 to a large constant to reflect this (it probably is not a good idea not
1236 to limit number of prefetches at all, as their execution also takes some
1237 time). */
1238 100, /* number of parallel prefetches */
1239 2, /* Branch cost */
1240 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1241 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1242 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1243 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1244 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1245 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1247 bdver3_memcpy,
1248 bdver3_memset,
1249 6, /* scalar_stmt_cost. */
1250 4, /* scalar load_cost. */
1251 4, /* scalar_store_cost. */
1252 6, /* vec_stmt_cost. */
1253 0, /* vec_to_scalar_cost. */
1254 2, /* scalar_to_vec_cost. */
1255 4, /* vec_align_load_cost. */
1256 4, /* vec_unalign_load_cost. */
1257 4, /* vec_store_cost. */
1258 4, /* cond_taken_branch_cost. */
1259 2, /* cond_not_taken_branch_cost. */
1262 /* BDVER4 has optimized REP instruction for medium sized blocks, but for
1263 very small blocks it is better to use loop. For large blocks, libcall
1264 can do nontemporary accesses and beat inline considerably. */
1265 static stringop_algs bdver4_memcpy[2] = {
1266 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1267 {-1, rep_prefix_4_byte, false}}},
1268 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1269 {-1, libcall, false}}}};
1270 static stringop_algs bdver4_memset[2] = {
1271 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1272 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1273 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1274 {-1, libcall, false}}}};
1275 struct processor_costs bdver4_cost = {
1276 COSTS_N_INSNS (1), /* cost of an add instruction */
1277 COSTS_N_INSNS (1), /* cost of a lea instruction */
1278 COSTS_N_INSNS (1), /* variable shift costs */
1279 COSTS_N_INSNS (1), /* constant shift costs */
1280 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1281 COSTS_N_INSNS (4), /* HI */
1282 COSTS_N_INSNS (4), /* SI */
1283 COSTS_N_INSNS (6), /* DI */
1284 COSTS_N_INSNS (6)}, /* other */
1285 0, /* cost of multiply per each bit set */
1286 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1287 COSTS_N_INSNS (35), /* HI */
1288 COSTS_N_INSNS (51), /* SI */
1289 COSTS_N_INSNS (83), /* DI */
1290 COSTS_N_INSNS (83)}, /* other */
1291 COSTS_N_INSNS (1), /* cost of movsx */
1292 COSTS_N_INSNS (1), /* cost of movzx */
1293 8, /* "large" insn */
1294 9, /* MOVE_RATIO */
1295 4, /* cost for loading QImode using movzbl */
1296 {5, 5, 4}, /* cost of loading integer registers
1297 in QImode, HImode and SImode.
1298 Relative to reg-reg move (2). */
1299 {4, 4, 4}, /* cost of storing integer registers */
1300 2, /* cost of reg,reg fld/fst */
1301 {5, 5, 12}, /* cost of loading fp registers
1302 in SFmode, DFmode and XFmode */
1303 {4, 4, 8}, /* cost of storing fp registers
1304 in SFmode, DFmode and XFmode */
1305 2, /* cost of moving MMX register */
1306 {4, 4}, /* cost of loading MMX registers
1307 in SImode and DImode */
1308 {4, 4}, /* cost of storing MMX registers
1309 in SImode and DImode */
1310 2, /* cost of moving SSE register */
1311 {4, 4, 4}, /* cost of loading SSE registers
1312 in SImode, DImode and TImode */
1313 {4, 4, 4}, /* cost of storing SSE registers
1314 in SImode, DImode and TImode */
1315 2, /* MMX or SSE register to integer */
1316 16, /* size of l1 cache. */
1317 2048, /* size of l2 cache. */
1318 64, /* size of prefetch block */
1319 /* New AMD processors never drop prefetches; if they cannot be performed
1320 immediately, they are queued. We set number of simultaneous prefetches
1321 to a large constant to reflect this (it probably is not a good idea not
1322 to limit number of prefetches at all, as their execution also takes some
1323 time). */
1324 100, /* number of parallel prefetches */
1325 2, /* Branch cost */
1326 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1327 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1328 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1329 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1330 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1331 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1333 bdver4_memcpy,
1334 bdver4_memset,
1335 6, /* scalar_stmt_cost. */
1336 4, /* scalar load_cost. */
1337 4, /* scalar_store_cost. */
1338 6, /* vec_stmt_cost. */
1339 0, /* vec_to_scalar_cost. */
1340 2, /* scalar_to_vec_cost. */
1341 4, /* vec_align_load_cost. */
1342 4, /* vec_unalign_load_cost. */
1343 4, /* vec_store_cost. */
1344 4, /* cond_taken_branch_cost. */
1345 2, /* cond_not_taken_branch_cost. */
1349 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1350 very small blocks it is better to use loop. For large blocks, libcall
1351 can do nontemporary accesses and beat inline considerably. */
1352 static stringop_algs znver1_memcpy[2] = {
1353 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1354 {-1, rep_prefix_4_byte, false}}},
1355 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1356 {-1, libcall, false}}}};
1357 static stringop_algs znver1_memset[2] = {
1358 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1359 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1360 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1361 {-1, libcall, false}}}};
1362 struct processor_costs znver1_cost = {
1363 COSTS_N_INSNS (1), /* cost of an add instruction. */
1364 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1365 COSTS_N_INSNS (1), /* variable shift costs. */
1366 COSTS_N_INSNS (1), /* constant shift costs. */
1367 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1368 COSTS_N_INSNS (3), /* HI. */
1369 COSTS_N_INSNS (3), /* SI. */
1370 COSTS_N_INSNS (4), /* DI. */
1371 COSTS_N_INSNS (4)}, /* other. */
1372 0, /* cost of multiply per each bit
1373 set. */
1374 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1375 COSTS_N_INSNS (35), /* HI. */
1376 COSTS_N_INSNS (51), /* SI. */
1377 COSTS_N_INSNS (83), /* DI. */
1378 COSTS_N_INSNS (83)}, /* other. */
1379 COSTS_N_INSNS (1), /* cost of movsx. */
1380 COSTS_N_INSNS (1), /* cost of movzx. */
1381 8, /* "large" insn. */
1382 9, /* MOVE_RATIO. */
1383 4, /* cost for loading QImode using
1384 movzbl. */
1385 {5, 5, 4}, /* cost of loading integer registers
1386 in QImode, HImode and SImode.
1387 Relative to reg-reg move (2). */
1388 {4, 4, 4}, /* cost of storing integer
1389 registers. */
1390 2, /* cost of reg,reg fld/fst. */
1391 {5, 5, 12}, /* cost of loading fp registers
1392 in SFmode, DFmode and XFmode. */
1393 {4, 4, 8}, /* cost of storing fp registers
1394 in SFmode, DFmode and XFmode. */
1395 2, /* cost of moving MMX register. */
1396 {4, 4}, /* cost of loading MMX registers
1397 in SImode and DImode. */
1398 {4, 4}, /* cost of storing MMX registers
1399 in SImode and DImode. */
1400 2, /* cost of moving SSE register. */
1401 {4, 4, 4}, /* cost of loading SSE registers
1402 in SImode, DImode and TImode. */
1403 {4, 4, 4}, /* cost of storing SSE registers
1404 in SImode, DImode and TImode. */
1405 2, /* MMX or SSE register to integer. */
1406 32, /* size of l1 cache. */
1407 512, /* size of l2 cache. */
1408 64, /* size of prefetch block. */
1409 /* New AMD processors never drop prefetches; if they cannot be performed
1410 immediately, they are queued. We set number of simultaneous prefetches
1411 to a large constant to reflect this (it probably is not a good idea not
1412 to limit number of prefetches at all, as their execution also takes some
1413 time). */
1414 100, /* number of parallel prefetches. */
1415 2, /* Branch cost. */
1416 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1417 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1418 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1421 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1423 znver1_memcpy,
1424 znver1_memset,
1425 6, /* scalar_stmt_cost. */
1426 4, /* scalar load_cost. */
1427 4, /* scalar_store_cost. */
1428 6, /* vec_stmt_cost. */
1429 0, /* vec_to_scalar_cost. */
1430 2, /* scalar_to_vec_cost. */
1431 4, /* vec_align_load_cost. */
1432 4, /* vec_unalign_load_cost. */
1433 4, /* vec_store_cost. */
1434 4, /* cond_taken_branch_cost. */
1435 2, /* cond_not_taken_branch_cost. */
1438 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1439 very small blocks it is better to use a loop.  For large blocks, a libcall
1440 can do nontemporal accesses and beat inline expansion considerably. */
1441 static stringop_algs btver1_memcpy[2] = {
1442 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1443 {-1, rep_prefix_4_byte, false}}},
1444 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1445 {-1, libcall, false}}}};
1446 static stringop_algs btver1_memset[2] = {
1447 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1448 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1449 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1450 {-1, libcall, false}}}};
1451 const struct processor_costs btver1_cost = {
1452 COSTS_N_INSNS (1), /* cost of an add instruction */
1453 COSTS_N_INSNS (2), /* cost of a lea instruction */
1454 COSTS_N_INSNS (1), /* variable shift costs */
1455 COSTS_N_INSNS (1), /* constant shift costs */
1456 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1457 COSTS_N_INSNS (4), /* HI */
1458 COSTS_N_INSNS (3), /* SI */
1459 COSTS_N_INSNS (4), /* DI */
1460 COSTS_N_INSNS (5)}, /* other */
1461 0, /* cost of multiply per each bit set */
1462 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1463 COSTS_N_INSNS (35), /* HI */
1464 COSTS_N_INSNS (51), /* SI */
1465 COSTS_N_INSNS (83), /* DI */
1466 COSTS_N_INSNS (83)}, /* other */
1467 COSTS_N_INSNS (1), /* cost of movsx */
1468 COSTS_N_INSNS (1), /* cost of movzx */
1469 8, /* "large" insn */
1470 9, /* MOVE_RATIO */
1471 4, /* cost for loading QImode using movzbl */
1472 {3, 4, 3}, /* cost of loading integer registers
1473 in QImode, HImode and SImode.
1474 Relative to reg-reg move (2). */
1475 {3, 4, 3}, /* cost of storing integer registers */
1476 4, /* cost of reg,reg fld/fst */
1477 {4, 4, 12}, /* cost of loading fp registers
1478 in SFmode, DFmode and XFmode */
1479 {6, 6, 8}, /* cost of storing fp registers
1480 in SFmode, DFmode and XFmode */
1481 2, /* cost of moving MMX register */
1482 {3, 3}, /* cost of loading MMX registers
1483 in SImode and DImode */
1484 {4, 4}, /* cost of storing MMX registers
1485 in SImode and DImode */
1486 2, /* cost of moving SSE register */
1487 {4, 4, 3}, /* cost of loading SSE registers
1488 in SImode, DImode and TImode */
1489 {4, 4, 5}, /* cost of storing SSE registers
1490 in SImode, DImode and TImode */
1491 3, /* MMX or SSE register to integer */
1492 /* On K8:
1493 MOVD reg64, xmmreg Double FSTORE 4
1494 MOVD reg32, xmmreg Double FSTORE 4
1495 On AMDFAM10:
1496 MOVD reg64, xmmreg Double FADD 3
1497 1/1 1/1
1498 MOVD reg32, xmmreg Double FADD 3
1499 1/1 1/1 */
1500 32, /* size of l1 cache. */
1501 512, /* size of l2 cache. */
1502 64, /* size of prefetch block */
1503 100, /* number of parallel prefetches */
1504 2, /* Branch cost */
1505 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1506 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1507 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1508 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1509 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1510 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1512 btver1_memcpy,
1513 btver1_memset,
1514 4, /* scalar_stmt_cost. */
1515 2, /* scalar load_cost. */
1516 2, /* scalar_store_cost. */
1517 6, /* vec_stmt_cost. */
1518 0, /* vec_to_scalar_cost. */
1519 2, /* scalar_to_vec_cost. */
1520 2, /* vec_align_load_cost. */
1521 2, /* vec_unalign_load_cost. */
1522 2, /* vec_store_cost. */
1523 2, /* cond_taken_branch_cost. */
1524 1, /* cond_not_taken_branch_cost. */
1527 static stringop_algs btver2_memcpy[2] = {
1528 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1529 {-1, rep_prefix_4_byte, false}}},
1530 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1531 {-1, libcall, false}}}};
1532 static stringop_algs btver2_memset[2] = {
1533 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1534 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1535 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1536 {-1, libcall, false}}}};
1537 const struct processor_costs btver2_cost = {
1538 COSTS_N_INSNS (1), /* cost of an add instruction */
1539 COSTS_N_INSNS (2), /* cost of a lea instruction */
1540 COSTS_N_INSNS (1), /* variable shift costs */
1541 COSTS_N_INSNS (1), /* constant shift costs */
1542 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1543 COSTS_N_INSNS (4), /* HI */
1544 COSTS_N_INSNS (3), /* SI */
1545 COSTS_N_INSNS (4), /* DI */
1546 COSTS_N_INSNS (5)}, /* other */
1547 0, /* cost of multiply per each bit set */
1548 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1549 COSTS_N_INSNS (35), /* HI */
1550 COSTS_N_INSNS (51), /* SI */
1551 COSTS_N_INSNS (83), /* DI */
1552 COSTS_N_INSNS (83)}, /* other */
1553 COSTS_N_INSNS (1), /* cost of movsx */
1554 COSTS_N_INSNS (1), /* cost of movzx */
1555 8, /* "large" insn */
1556 9, /* MOVE_RATIO */
1557 4, /* cost for loading QImode using movzbl */
1558 {3, 4, 3}, /* cost of loading integer registers
1559 in QImode, HImode and SImode.
1560 Relative to reg-reg move (2). */
1561 {3, 4, 3}, /* cost of storing integer registers */
1562 4, /* cost of reg,reg fld/fst */
1563 {4, 4, 12}, /* cost of loading fp registers
1564 in SFmode, DFmode and XFmode */
1565 {6, 6, 8}, /* cost of storing fp registers
1566 in SFmode, DFmode and XFmode */
1567 2, /* cost of moving MMX register */
1568 {3, 3}, /* cost of loading MMX registers
1569 in SImode and DImode */
1570 {4, 4}, /* cost of storing MMX registers
1571 in SImode and DImode */
1572 2, /* cost of moving SSE register */
1573 {4, 4, 3}, /* cost of loading SSE registers
1574 in SImode, DImode and TImode */
1575 {4, 4, 5}, /* cost of storing SSE registers
1576 in SImode, DImode and TImode */
1577 3, /* MMX or SSE register to integer */
1578 /* On K8:
1579 MOVD reg64, xmmreg Double FSTORE 4
1580 MOVD reg32, xmmreg Double FSTORE 4
1581 On AMDFAM10:
1582 MOVD reg64, xmmreg Double FADD 3
1583 1/1 1/1
1584 MOVD reg32, xmmreg Double FADD 3
1585 1/1 1/1 */
1586 32, /* size of l1 cache. */
1587 2048, /* size of l2 cache. */
1588 64, /* size of prefetch block */
1589 100, /* number of parallel prefetches */
1590 2, /* Branch cost */
1591 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1592 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1593 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1594 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1595 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1596 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1597 btver2_memcpy,
1598 btver2_memset,
1599 4, /* scalar_stmt_cost. */
1600 2, /* scalar load_cost. */
1601 2, /* scalar_store_cost. */
1602 6, /* vec_stmt_cost. */
1603 0, /* vec_to_scalar_cost. */
1604 2, /* scalar_to_vec_cost. */
1605 2, /* vec_align_load_cost. */
1606 2, /* vec_unalign_load_cost. */
1607 2, /* vec_store_cost. */
1608 2, /* cond_taken_branch_cost. */
1609 1, /* cond_not_taken_branch_cost. */
1612 static stringop_algs pentium4_memcpy[2] = {
1613 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1614 DUMMY_STRINGOP_ALGS};
1615 static stringop_algs pentium4_memset[2] = {
1616 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1617 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1618 DUMMY_STRINGOP_ALGS};
1620 static const
1621 struct processor_costs pentium4_cost = {
1622 COSTS_N_INSNS (1), /* cost of an add instruction */
1623 COSTS_N_INSNS (3), /* cost of a lea instruction */
1624 COSTS_N_INSNS (4), /* variable shift costs */
1625 COSTS_N_INSNS (4), /* constant shift costs */
1626 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1627 COSTS_N_INSNS (15), /* HI */
1628 COSTS_N_INSNS (15), /* SI */
1629 COSTS_N_INSNS (15), /* DI */
1630 COSTS_N_INSNS (15)}, /* other */
1631 0, /* cost of multiply per each bit set */
1632 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1633 COSTS_N_INSNS (56), /* HI */
1634 COSTS_N_INSNS (56), /* SI */
1635 COSTS_N_INSNS (56), /* DI */
1636 COSTS_N_INSNS (56)}, /* other */
1637 COSTS_N_INSNS (1), /* cost of movsx */
1638 COSTS_N_INSNS (1), /* cost of movzx */
1639 16, /* "large" insn */
1640 6, /* MOVE_RATIO */
1641 2, /* cost for loading QImode using movzbl */
1642 {4, 5, 4}, /* cost of loading integer registers
1643 in QImode, HImode and SImode.
1644 Relative to reg-reg move (2). */
1645 {2, 3, 2}, /* cost of storing integer registers */
1646 2, /* cost of reg,reg fld/fst */
1647 {2, 2, 6}, /* cost of loading fp registers
1648 in SFmode, DFmode and XFmode */
1649 {4, 4, 6}, /* cost of storing fp registers
1650 in SFmode, DFmode and XFmode */
1651 2, /* cost of moving MMX register */
1652 {2, 2}, /* cost of loading MMX registers
1653 in SImode and DImode */
1654 {2, 2}, /* cost of storing MMX registers
1655 in SImode and DImode */
1656 12, /* cost of moving SSE register */
1657 {12, 12, 12}, /* cost of loading SSE registers
1658 in SImode, DImode and TImode */
1659 {2, 2, 8}, /* cost of storing SSE registers
1660 in SImode, DImode and TImode */
1661 10, /* MMX or SSE register to integer */
1662 8, /* size of l1 cache. */
1663 256, /* size of l2 cache. */
1664 64, /* size of prefetch block */
1665 6, /* number of parallel prefetches */
1666 2, /* Branch cost */
1667 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1668 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1669 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1670 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1671 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1672 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1673 pentium4_memcpy,
1674 pentium4_memset,
1675 1, /* scalar_stmt_cost. */
1676 1, /* scalar load_cost. */
1677 1, /* scalar_store_cost. */
1678 1, /* vec_stmt_cost. */
1679 1, /* vec_to_scalar_cost. */
1680 1, /* scalar_to_vec_cost. */
1681 1, /* vec_align_load_cost. */
1682 2, /* vec_unalign_load_cost. */
1683 1, /* vec_store_cost. */
1684 3, /* cond_taken_branch_cost. */
1685 1, /* cond_not_taken_branch_cost. */
1688 static stringop_algs nocona_memcpy[2] = {
1689 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1690 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1691 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1693 static stringop_algs nocona_memset[2] = {
1694 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1695 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1696 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1697 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1699 static const
1700 struct processor_costs nocona_cost = {
1701 COSTS_N_INSNS (1), /* cost of an add instruction */
1702 COSTS_N_INSNS (1), /* cost of a lea instruction */
1703 COSTS_N_INSNS (1), /* variable shift costs */
1704 COSTS_N_INSNS (1), /* constant shift costs */
1705 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1706 COSTS_N_INSNS (10), /* HI */
1707 COSTS_N_INSNS (10), /* SI */
1708 COSTS_N_INSNS (10), /* DI */
1709 COSTS_N_INSNS (10)}, /* other */
1710 0, /* cost of multiply per each bit set */
1711 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1712 COSTS_N_INSNS (66), /* HI */
1713 COSTS_N_INSNS (66), /* SI */
1714 COSTS_N_INSNS (66), /* DI */
1715 COSTS_N_INSNS (66)}, /* other */
1716 COSTS_N_INSNS (1), /* cost of movsx */
1717 COSTS_N_INSNS (1), /* cost of movzx */
1718 16, /* "large" insn */
1719 17, /* MOVE_RATIO */
1720 4, /* cost for loading QImode using movzbl */
1721 {4, 4, 4}, /* cost of loading integer registers
1722 in QImode, HImode and SImode.
1723 Relative to reg-reg move (2). */
1724 {4, 4, 4}, /* cost of storing integer registers */
1725 3, /* cost of reg,reg fld/fst */
1726 {12, 12, 12}, /* cost of loading fp registers
1727 in SFmode, DFmode and XFmode */
1728 {4, 4, 4}, /* cost of storing fp registers
1729 in SFmode, DFmode and XFmode */
1730 6, /* cost of moving MMX register */
1731 {12, 12}, /* cost of loading MMX registers
1732 in SImode and DImode */
1733 {12, 12}, /* cost of storing MMX registers
1734 in SImode and DImode */
1735 6, /* cost of moving SSE register */
1736 {12, 12, 12}, /* cost of loading SSE registers
1737 in SImode, DImode and TImode */
1738 {12, 12, 12}, /* cost of storing SSE registers
1739 in SImode, DImode and TImode */
1740 8, /* MMX or SSE register to integer */
1741 8, /* size of l1 cache. */
1742 1024, /* size of l2 cache. */
1743 64, /* size of prefetch block */
1744 8, /* number of parallel prefetches */
1745 1, /* Branch cost */
1746 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1747 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1748 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1749 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1750 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1751 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1752 nocona_memcpy,
1753 nocona_memset,
1754 1, /* scalar_stmt_cost. */
1755 1, /* scalar load_cost. */
1756 1, /* scalar_store_cost. */
1757 1, /* vec_stmt_cost. */
1758 1, /* vec_to_scalar_cost. */
1759 1, /* scalar_to_vec_cost. */
1760 1, /* vec_align_load_cost. */
1761 2, /* vec_unalign_load_cost. */
1762 1, /* vec_store_cost. */
1763 3, /* cond_taken_branch_cost. */
1764 1, /* cond_not_taken_branch_cost. */
1767 static stringop_algs atom_memcpy[2] = {
1768 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1769 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1770 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1771 static stringop_algs atom_memset[2] = {
1772 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1773 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1774 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1775 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1776 static const
1777 struct processor_costs atom_cost = {
1778 COSTS_N_INSNS (1), /* cost of an add instruction */
1779 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1780 COSTS_N_INSNS (1), /* variable shift costs */
1781 COSTS_N_INSNS (1), /* constant shift costs */
1782 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1783 COSTS_N_INSNS (4), /* HI */
1784 COSTS_N_INSNS (3), /* SI */
1785 COSTS_N_INSNS (4), /* DI */
1786 COSTS_N_INSNS (2)}, /* other */
1787 0, /* cost of multiply per each bit set */
1788 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1789 COSTS_N_INSNS (26), /* HI */
1790 COSTS_N_INSNS (42), /* SI */
1791 COSTS_N_INSNS (74), /* DI */
1792 COSTS_N_INSNS (74)}, /* other */
1793 COSTS_N_INSNS (1), /* cost of movsx */
1794 COSTS_N_INSNS (1), /* cost of movzx */
1795 8, /* "large" insn */
1796 17, /* MOVE_RATIO */
1797 4, /* cost for loading QImode using movzbl */
1798 {4, 4, 4}, /* cost of loading integer registers
1799 in QImode, HImode and SImode.
1800 Relative to reg-reg move (2). */
1801 {4, 4, 4}, /* cost of storing integer registers */
1802 4, /* cost of reg,reg fld/fst */
1803 {12, 12, 12}, /* cost of loading fp registers
1804 in SFmode, DFmode and XFmode */
1805 {6, 6, 8}, /* cost of storing fp registers
1806 in SFmode, DFmode and XFmode */
1807 2, /* cost of moving MMX register */
1808 {8, 8}, /* cost of loading MMX registers
1809 in SImode and DImode */
1810 {8, 8}, /* cost of storing MMX registers
1811 in SImode and DImode */
1812 2, /* cost of moving SSE register */
1813 {8, 8, 8}, /* cost of loading SSE registers
1814 in SImode, DImode and TImode */
1815 {8, 8, 8}, /* cost of storing SSE registers
1816 in SImode, DImode and TImode */
1817 5, /* MMX or SSE register to integer */
1818 32, /* size of l1 cache. */
1819 256, /* size of l2 cache. */
1820 64, /* size of prefetch block */
1821 6, /* number of parallel prefetches */
1822 3, /* Branch cost */
1823 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1824 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1825 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1826 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1827 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1828 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1829 atom_memcpy,
1830 atom_memset,
1831 1, /* scalar_stmt_cost. */
1832 1, /* scalar load_cost. */
1833 1, /* scalar_store_cost. */
1834 1, /* vec_stmt_cost. */
1835 1, /* vec_to_scalar_cost. */
1836 1, /* scalar_to_vec_cost. */
1837 1, /* vec_align_load_cost. */
1838 2, /* vec_unalign_load_cost. */
1839 1, /* vec_store_cost. */
1840 3, /* cond_taken_branch_cost. */
1841 1, /* cond_not_taken_branch_cost. */
1844 static stringop_algs slm_memcpy[2] = {
1845 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1846 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1847 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1848 static stringop_algs slm_memset[2] = {
1849 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1850 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1851 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1852 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1853 static const
1854 struct processor_costs slm_cost = {
1855 COSTS_N_INSNS (1), /* cost of an add instruction */
1856 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1857 COSTS_N_INSNS (1), /* variable shift costs */
1858 COSTS_N_INSNS (1), /* constant shift costs */
1859 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1860 COSTS_N_INSNS (3), /* HI */
1861 COSTS_N_INSNS (3), /* SI */
1862 COSTS_N_INSNS (4), /* DI */
1863 COSTS_N_INSNS (2)}, /* other */
1864 0, /* cost of multiply per each bit set */
1865 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1866 COSTS_N_INSNS (26), /* HI */
1867 COSTS_N_INSNS (42), /* SI */
1868 COSTS_N_INSNS (74), /* DI */
1869 COSTS_N_INSNS (74)}, /* other */
1870 COSTS_N_INSNS (1), /* cost of movsx */
1871 COSTS_N_INSNS (1), /* cost of movzx */
1872 8, /* "large" insn */
1873 17, /* MOVE_RATIO */
1874 4, /* cost for loading QImode using movzbl */
1875 {4, 4, 4}, /* cost of loading integer registers
1876 in QImode, HImode and SImode.
1877 Relative to reg-reg move (2). */
1878 {4, 4, 4}, /* cost of storing integer registers */
1879 4, /* cost of reg,reg fld/fst */
1880 {12, 12, 12}, /* cost of loading fp registers
1881 in SFmode, DFmode and XFmode */
1882 {6, 6, 8}, /* cost of storing fp registers
1883 in SFmode, DFmode and XFmode */
1884 2, /* cost of moving MMX register */
1885 {8, 8}, /* cost of loading MMX registers
1886 in SImode and DImode */
1887 {8, 8}, /* cost of storing MMX registers
1888 in SImode and DImode */
1889 2, /* cost of moving SSE register */
1890 {8, 8, 8}, /* cost of loading SSE registers
1891 in SImode, DImode and TImode */
1892 {8, 8, 8}, /* cost of storing SSE registers
1893 in SImode, DImode and TImode */
1894 5, /* MMX or SSE register to integer */
1895 32, /* size of l1 cache. */
1896 256, /* size of l2 cache. */
1897 64, /* size of prefetch block */
1898 6, /* number of parallel prefetches */
1899 3, /* Branch cost */
1900 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1901 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1902 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1903 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1904 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1905 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1906 slm_memcpy,
1907 slm_memset,
1908 1, /* scalar_stmt_cost. */
1909 1, /* scalar load_cost. */
1910 1, /* scalar_store_cost. */
1911 1, /* vec_stmt_cost. */
1912 4, /* vec_to_scalar_cost. */
1913 1, /* scalar_to_vec_cost. */
1914 1, /* vec_align_load_cost. */
1915 2, /* vec_unalign_load_cost. */
1916 1, /* vec_store_cost. */
1917 3, /* cond_taken_branch_cost. */
1918 1, /* cond_not_taken_branch_cost. */
1921 static stringop_algs intel_memcpy[2] = {
1922 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1923 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1924 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1925 static stringop_algs intel_memset[2] = {
1926 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1927 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1928 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1929 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1930 static const
1931 struct processor_costs intel_cost = {
1932 COSTS_N_INSNS (1), /* cost of an add instruction */
1933 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1934 COSTS_N_INSNS (1), /* variable shift costs */
1935 COSTS_N_INSNS (1), /* constant shift costs */
1936 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1937 COSTS_N_INSNS (3), /* HI */
1938 COSTS_N_INSNS (3), /* SI */
1939 COSTS_N_INSNS (4), /* DI */
1940 COSTS_N_INSNS (2)}, /* other */
1941 0, /* cost of multiply per each bit set */
1942 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1943 COSTS_N_INSNS (26), /* HI */
1944 COSTS_N_INSNS (42), /* SI */
1945 COSTS_N_INSNS (74), /* DI */
1946 COSTS_N_INSNS (74)}, /* other */
1947 COSTS_N_INSNS (1), /* cost of movsx */
1948 COSTS_N_INSNS (1), /* cost of movzx */
1949 8, /* "large" insn */
1950 17, /* MOVE_RATIO */
1951 4, /* cost for loading QImode using movzbl */
1952 {4, 4, 4}, /* cost of loading integer registers
1953 in QImode, HImode and SImode.
1954 Relative to reg-reg move (2). */
1955 {4, 4, 4}, /* cost of storing integer registers */
1956 4, /* cost of reg,reg fld/fst */
1957 {12, 12, 12}, /* cost of loading fp registers
1958 in SFmode, DFmode and XFmode */
1959 {6, 6, 8}, /* cost of storing fp registers
1960 in SFmode, DFmode and XFmode */
1961 2, /* cost of moving MMX register */
1962 {8, 8}, /* cost of loading MMX registers
1963 in SImode and DImode */
1964 {8, 8}, /* cost of storing MMX registers
1965 in SImode and DImode */
1966 2, /* cost of moving SSE register */
1967 {8, 8, 8}, /* cost of loading SSE registers
1968 in SImode, DImode and TImode */
1969 {8, 8, 8}, /* cost of storing SSE registers
1970 in SImode, DImode and TImode */
1971 5, /* MMX or SSE register to integer */
1972 32, /* size of l1 cache. */
1973 256, /* size of l2 cache. */
1974 64, /* size of prefetch block */
1975 6, /* number of parallel prefetches */
1976 3, /* Branch cost */
1977 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1978 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1979 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1980 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1981 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1982 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1983 intel_memcpy,
1984 intel_memset,
1985 1, /* scalar_stmt_cost. */
1986 1, /* scalar load_cost. */
1987 1, /* scalar_store_cost. */
1988 1, /* vec_stmt_cost. */
1989 4, /* vec_to_scalar_cost. */
1990 1, /* scalar_to_vec_cost. */
1991 1, /* vec_align_load_cost. */
1992 2, /* vec_unalign_load_cost. */
1993 1, /* vec_store_cost. */
1994 3, /* cond_taken_branch_cost. */
1995 1, /* cond_not_taken_branch_cost. */
1998 /* Generic should produce code tuned for Core i7 (and newer chips)
1999 and btver1 (and newer chips). */
2001 static stringop_algs generic_memcpy[2] = {
2002 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2003 {-1, libcall, false}}},
2004 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2005 {-1, libcall, false}}}};
2006 static stringop_algs generic_memset[2] = {
2007 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2008 {-1, libcall, false}}},
2009 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2010 {-1, libcall, false}}}};
2011 static const
2012 struct processor_costs generic_cost = {
2013 COSTS_N_INSNS (1), /* cost of an add instruction */
2014 /* On all chips taken into consideration lea is 2 cycles or more.  With
2015 this cost, however, our current implementation of synth_mult results in
2016 the use of unnecessary temporary registers, causing regressions on several
2017 SPECfp benchmarks. */
2018 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2019 COSTS_N_INSNS (1), /* variable shift costs */
2020 COSTS_N_INSNS (1), /* constant shift costs */
2021 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2022 COSTS_N_INSNS (4), /* HI */
2023 COSTS_N_INSNS (3), /* SI */
2024 COSTS_N_INSNS (4), /* DI */
2025 COSTS_N_INSNS (2)}, /* other */
2026 0, /* cost of multiply per each bit set */
2027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2028 COSTS_N_INSNS (26), /* HI */
2029 COSTS_N_INSNS (42), /* SI */
2030 COSTS_N_INSNS (74), /* DI */
2031 COSTS_N_INSNS (74)}, /* other */
2032 COSTS_N_INSNS (1), /* cost of movsx */
2033 COSTS_N_INSNS (1), /* cost of movzx */
2034 8, /* "large" insn */
2035 17, /* MOVE_RATIO */
2036 4, /* cost for loading QImode using movzbl */
2037 {4, 4, 4}, /* cost of loading integer registers
2038 in QImode, HImode and SImode.
2039 Relative to reg-reg move (2). */
2040 {4, 4, 4}, /* cost of storing integer registers */
2041 4, /* cost of reg,reg fld/fst */
2042 {12, 12, 12}, /* cost of loading fp registers
2043 in SFmode, DFmode and XFmode */
2044 {6, 6, 8}, /* cost of storing fp registers
2045 in SFmode, DFmode and XFmode */
2046 2, /* cost of moving MMX register */
2047 {8, 8}, /* cost of loading MMX registers
2048 in SImode and DImode */
2049 {8, 8}, /* cost of storing MMX registers
2050 in SImode and DImode */
2051 2, /* cost of moving SSE register */
2052 {8, 8, 8}, /* cost of loading SSE registers
2053 in SImode, DImode and TImode */
2054 {8, 8, 8}, /* cost of storing SSE registers
2055 in SImode, DImode and TImode */
2056 5, /* MMX or SSE register to integer */
2057 32, /* size of l1 cache. */
2058 512, /* size of l2 cache. */
2059 64, /* size of prefetch block */
2060 6, /* number of parallel prefetches */
2061 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2062 value is increased to the perhaps more appropriate value of 5. */
2063 3, /* Branch cost */
2064 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2065 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2066 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2067 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2068 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2069 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2070 generic_memcpy,
2071 generic_memset,
2072 1, /* scalar_stmt_cost. */
2073 1, /* scalar load_cost. */
2074 1, /* scalar_store_cost. */
2075 1, /* vec_stmt_cost. */
2076 1, /* vec_to_scalar_cost. */
2077 1, /* scalar_to_vec_cost. */
2078 1, /* vec_align_load_cost. */
2079 2, /* vec_unalign_load_cost. */
2080 1, /* vec_store_cost. */
2081 3, /* cond_taken_branch_cost. */
2082 1, /* cond_not_taken_branch_cost. */
2085 /* core_cost should produce code tuned for the Core family of CPUs. */
2086 static stringop_algs core_memcpy[2] = {
2087 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2088 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2089 {-1, libcall, false}}}};
2090 static stringop_algs core_memset[2] = {
2091 {libcall, {{6, loop_1_byte, true},
2092 {24, loop, true},
2093 {8192, rep_prefix_4_byte, true},
2094 {-1, libcall, false}}},
2095 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2096 {-1, libcall, false}}}};
2098 static const
2099 struct processor_costs core_cost = {
2100 COSTS_N_INSNS (1), /* cost of an add instruction */
2101 /* On all chips taken into consideration lea is 2 cycles or more.  With
2102 this cost, however, our current implementation of synth_mult results in
2103 the use of unnecessary temporary registers, causing regressions on several
2104 SPECfp benchmarks. */
2105 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2106 COSTS_N_INSNS (1), /* variable shift costs */
2107 COSTS_N_INSNS (1), /* constant shift costs */
2108 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2109 COSTS_N_INSNS (4), /* HI */
2110 COSTS_N_INSNS (3), /* SI */
2111 COSTS_N_INSNS (4), /* DI */
2112 COSTS_N_INSNS (2)}, /* other */
2113 0, /* cost of multiply per each bit set */
2114 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2115 COSTS_N_INSNS (26), /* HI */
2116 COSTS_N_INSNS (42), /* SI */
2117 COSTS_N_INSNS (74), /* DI */
2118 COSTS_N_INSNS (74)}, /* other */
2119 COSTS_N_INSNS (1), /* cost of movsx */
2120 COSTS_N_INSNS (1), /* cost of movzx */
2121 8, /* "large" insn */
2122 17, /* MOVE_RATIO */
2123 4, /* cost for loading QImode using movzbl */
2124 {4, 4, 4}, /* cost of loading integer registers
2125 in QImode, HImode and SImode.
2126 Relative to reg-reg move (2). */
2127 {4, 4, 4}, /* cost of storing integer registers */
2128 4, /* cost of reg,reg fld/fst */
2129 {12, 12, 12}, /* cost of loading fp registers
2130 in SFmode, DFmode and XFmode */
2131 {6, 6, 8}, /* cost of storing fp registers
2132 in SFmode, DFmode and XFmode */
2133 2, /* cost of moving MMX register */
2134 {8, 8}, /* cost of loading MMX registers
2135 in SImode and DImode */
2136 {8, 8}, /* cost of storing MMX registers
2137 in SImode and DImode */
2138 2, /* cost of moving SSE register */
2139 {8, 8, 8}, /* cost of loading SSE registers
2140 in SImode, DImode and TImode */
2141 {8, 8, 8}, /* cost of storing SSE registers
2142 in SImode, DImode and TImode */
2143 5, /* MMX or SSE register to integer */
2144 64, /* size of l1 cache. */
2145 512, /* size of l2 cache. */
2146 64, /* size of prefetch block */
2147 6, /* number of parallel prefetches */
2148 /* FIXME perhaps more appropriate value is 5. */
2149 3, /* Branch cost */
2150 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2151 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2152 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2153 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2154 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2155 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2156 core_memcpy,
2157 core_memset,
2158 1, /* scalar_stmt_cost. */
2159 1, /* scalar load_cost. */
2160 1, /* scalar_store_cost. */
2161 1, /* vec_stmt_cost. */
2162 1, /* vec_to_scalar_cost. */
2163 1, /* scalar_to_vec_cost. */
2164 1, /* vec_align_load_cost. */
2165 2, /* vec_unalign_load_cost. */
2166 1, /* vec_store_cost. */
2167 3, /* cond_taken_branch_cost. */
2168 1, /* cond_not_taken_branch_cost. */
2172 /* Set by -mtune. */
2173 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2175 /* Set by -mtune or -Os. */
2176 const struct processor_costs *ix86_cost = &pentium_cost;
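/* A sketch of how these pointers are consumed later in this file: RTL cost
   queries such as ix86_rtx_costs read the scalar fields (e.g. ix86_cost->add
   or ix86_cost->mult_init[MODE_INDEX (mode)]), while the vectorizer cost
   hook reads the trailing *_cost fields (scalar_stmt_cost,
   vec_align_load_cost, ...).  ix86_tune_cost always follows -mtune; ix86_cost
   is redirected to the size-oriented cost table when optimizing for size,
   which is what the "-mtune or -Os" note above refers to.  */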
2178 /* Processor feature/optimization bitmasks. */
2179 #define m_386 (1U<<PROCESSOR_I386)
2180 #define m_486 (1U<<PROCESSOR_I486)
2181 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2182 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2183 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2184 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2185 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2186 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2187 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2188 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2189 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2190 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2191 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2192 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2193 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2194 #define m_KNL (1U<<PROCESSOR_KNL)
2195 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2196 #define m_INTEL (1U<<PROCESSOR_INTEL)
2198 #define m_GEODE (1U<<PROCESSOR_GEODE)
2199 #define m_K6 (1U<<PROCESSOR_K6)
2200 #define m_K6_GEODE (m_K6 | m_GEODE)
2201 #define m_K8 (1U<<PROCESSOR_K8)
2202 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2203 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2204 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2205 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2206 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2207 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2208 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2209 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2210 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2211 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2212 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2213 #define m_BTVER (m_BTVER1 | m_BTVER2)
2214 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2215 | m_ZNVER1)
2217 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2219 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2220 #undef DEF_TUNE
2221 #define DEF_TUNE(tune, name, selector) name,
2222 #include "x86-tune.def"
2223 #undef DEF_TUNE
2226 /* Feature tests against the various tunings. */
2227 unsigned char ix86_tune_features[X86_TUNE_LAST];
2229 /* Feature tests against the various tunings used to create ix86_tune_features
2230 based on the processor mask. */
2231 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2232 #undef DEF_TUNE
2233 #define DEF_TUNE(tune, name, selector) selector,
2234 #include "x86-tune.def"
2235 #undef DEF_TUNE
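/* Illustrative shape of an x86-tune.def entry (approximate, not a verbatim
   copy): something like
     DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
	       m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
   expands to the string "use_leave" in ix86_tune_feature_names and to the
   processor mask in initial_ix86_tune_features; the per-compilation
   ix86_tune_features[] entry is then set when the bit for the selected
   -mtune processor is present in that mask.  */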
2238 /* Feature tests against the various architecture variations. */
2239 unsigned char ix86_arch_features[X86_ARCH_LAST];
2241 /* Feature tests against the various architecture variations, used to create
2242 ix86_arch_features based on the processor mask. */
2243 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2244 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2245 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2247 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2248 ~m_386,
2250 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2251 ~(m_386 | m_486),
2253 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2254 ~m_386,
2256 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2257 ~m_386,
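/* These masks are turned into booleans the same way as the tune features:
   ix86_arch_features[i] ends up nonzero when the bit for the selected
   -march processor is present in initial_ix86_arch_features[i], so e.g.
   X86_ARCH_CMOV is reported for everything except the 386/486/Pentium/
   Lakemont/K6 entries excluded above.  */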
2260 /* In case the average insn count for a single function invocation is
2261 lower than this constant, emit fast (but longer) prologue and
2262 epilogue code. */
2263 #define FAST_PROLOGUE_INSN_COUNT 20
2265 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2266 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2267 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2268 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2270 /* Array of the smallest class containing reg number REGNO, indexed by
2271 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2273 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2275 /* ax, dx, cx, bx */
2276 AREG, DREG, CREG, BREG,
2277 /* si, di, bp, sp */
2278 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2279 /* FP registers */
2280 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2281 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2282 /* arg pointer */
2283 NON_Q_REGS,
2284 /* flags, fpsr, fpcr, frame */
2285 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2286 /* SSE registers */
2287 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2288 SSE_REGS, SSE_REGS,
2289 /* MMX registers */
2290 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2291 MMX_REGS, MMX_REGS,
2292 /* REX registers */
2293 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2294 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2295 /* SSE REX registers */
2296 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2297 SSE_REGS, SSE_REGS,
2298 /* AVX-512 SSE registers */
2299 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2300 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2301 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2302 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2303 /* Mask registers. */
2304 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2305 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2306 /* MPX bound registers */
2307 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2310 /* The "default" register map used in 32bit mode. */
2312 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2314 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2315 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2316 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2317 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2318 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2319 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2320 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2321 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2322 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2323 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2324 101, 102, 103, 104, /* bound registers */
2327 /* The "default" register map used in 64bit mode. */
2329 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2331 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2332 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2333 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2334 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2335 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2336 8,9,10,11,12,13,14,15, /* extended integer registers */
2337 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2338 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2339 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2340 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2341 126, 127, 128, 129, /* bound registers */
2344 /* Define the register numbers to be used in Dwarf debugging information.
2345 The SVR4 reference port C compiler uses the following register numbers
2346 in its Dwarf output code:
2347 0 for %eax (gcc regno = 0)
2348 1 for %ecx (gcc regno = 2)
2349 2 for %edx (gcc regno = 1)
2350 3 for %ebx (gcc regno = 3)
2351 4 for %esp (gcc regno = 7)
2352 5 for %ebp (gcc regno = 6)
2353 6 for %esi (gcc regno = 4)
2354 7 for %edi (gcc regno = 5)
2355 The following three DWARF register numbers are never generated by
2356 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2357 believes these numbers have these meanings.
2358 8 for %eip (no gcc equivalent)
2359 9 for %eflags (gcc regno = 17)
2360 10 for %trapno (no gcc equivalent)
2361 It is not at all clear how we should number the FP stack registers
2362 for the x86 architecture. If the version of SDB on x86/svr4 were
2363 a bit less brain dead with respect to floating-point then we would
2364 have a precedent to follow with respect to DWARF register numbers
2365 for x86 FP registers, but the SDB on x86/svr4 is so completely
2366 broken with respect to FP registers that it is hardly worth thinking
2367 of it as something to strive for compatibility with.
2368 The version of x86/svr4 SDB I have at the moment does (partially)
2369 seem to believe that DWARF register number 11 is associated with
2370 the x86 register %st(0), but that's about all. Higher DWARF
2371 register numbers don't seem to be associated with anything in
2372 particular, and even for DWARF regno 11, SDB only seems to under-
2373 stand that it should say that a variable lives in %st(0) (when
2374 asked via an `=' command) if we said it was in DWARF regno 11,
2375 but SDB still prints garbage when asked for the value of the
2376 variable in question (via a `/' command).
2377 (Also note that the labels SDB prints for various FP stack regs
2378 when doing an `x' command are all wrong.)
2379 Note that these problems generally don't affect the native SVR4
2380 C compiler because it doesn't allow the use of -O with -g and
2381 because when it is *not* optimizing, it allocates a memory
2382 location for each floating-point variable, and the memory
2383 location is what gets described in the DWARF AT_location
2384 attribute for the variable in question.
2385 Regardless of the severe mental illness of the x86/svr4 SDB, we
2386 do something sensible here and we use the following DWARF
2387 register numbers. Note that these are all stack-top-relative
2388 numbers.
2389 11 for %st(0) (gcc regno = 8)
2390 12 for %st(1) (gcc regno = 9)
2391 13 for %st(2) (gcc regno = 10)
2392 14 for %st(3) (gcc regno = 11)
2393 15 for %st(4) (gcc regno = 12)
2394 16 for %st(5) (gcc regno = 13)
2395 17 for %st(6) (gcc regno = 14)
2396 18 for %st(7) (gcc regno = 15)
2398 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2400 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2401 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2402 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2403 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2404 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2405 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2406 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2407 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2408 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2409 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2410 101, 102, 103, 104, /* bound registers */
2413 /* Define parameter passing and return registers. */
2415 static int const x86_64_int_parameter_registers[6] =
2417 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2420 static int const x86_64_ms_abi_int_parameter_registers[4] =
2422 CX_REG, DX_REG, R8_REG, R9_REG
2425 static int const x86_64_int_return_registers[4] =
2427 AX_REG, DX_REG, DI_REG, SI_REG
2430 /* Additional registers that are clobbered by SYSV calls. */
2432 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2433 static int const x86_64_ms_sysv_extra_clobbered_registers
2434 [NUM_X86_64_MS_CLOBBERED_REGS] =
2436 SI_REG, DI_REG,
2437 XMM6_REG, XMM7_REG,
2438 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2439 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
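/* Background for the list above: RSI, RDI and XMM6-XMM15 are callee-saved
   under the Microsoft x64 calling convention but not under the SysV ABI, so
   when an ms_abi function calls a sysv_abi function they must be assumed
   clobbered; the out-of-line save/restore stubs described by xlogue_layout
   below exist to keep that save/restore code compact.  */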
2442 enum xlogue_stub {
2443 XLOGUE_STUB_SAVE,
2444 XLOGUE_STUB_RESTORE,
2445 XLOGUE_STUB_RESTORE_TAIL,
2446 XLOGUE_STUB_SAVE_HFP,
2447 XLOGUE_STUB_RESTORE_HFP,
2448 XLOGUE_STUB_RESTORE_HFP_TAIL,
2450 XLOGUE_STUB_COUNT
2453 enum xlogue_stub_sets {
2454 XLOGUE_SET_ALIGNED,
2455 XLOGUE_SET_ALIGNED_PLUS_8,
2456 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2457 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2459 XLOGUE_SET_COUNT
2462 /* Register save/restore layout used by out-of-line stubs. */
2463 class xlogue_layout {
2464 public:
2465 struct reginfo
2467 unsigned regno;
2468 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
2469 rsi) to where each register is stored. */
2472 unsigned get_nregs () const {return m_nregs;}
2473 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2475 const reginfo &get_reginfo (unsigned reg) const
2477 gcc_assert (reg < m_nregs);
2478 return m_regs[reg];
2481 static const char *get_stub_name (enum xlogue_stub stub,
2482 unsigned n_extra_args);
2484 /* Returns an rtx for the stub's symbol based upon
2485 1.) the specified stub (save, restore or restore_ret) and
2486 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
2487 3.) whether or not stack alignment is being performed. */
2488 static rtx get_stub_rtx (enum xlogue_stub stub);
2490 /* Returns the amount of stack space (including padding) that the stub
2491 needs to store registers based upon data in the machine_function. */
2492 HOST_WIDE_INT get_stack_space_used () const
2494 const struct machine_function *m = cfun->machine;
2495 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2497 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2498 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
2501 /* Returns the offset for the base pointer used by the stub. */
2502 HOST_WIDE_INT get_stub_ptr_offset () const
2504 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2507 static const struct xlogue_layout &get_instance ();
2508 static unsigned count_stub_managed_regs ();
2509 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2511 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2512 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2513 static const unsigned MAX_REGS = 18;
2514 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2515 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2516 static const unsigned STUB_NAME_MAX_LEN = 20;
2517 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2518 static const unsigned REG_ORDER[MAX_REGS];
2519 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2521 private:
2522 xlogue_layout ();
2523 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2524 xlogue_layout (const xlogue_layout &);
2526 /* True if hard frame pointer is used. */
2527 bool m_hfp;
2529 /* Max number of registers this layout manages. */
2530 unsigned m_nregs;
2532 /* Incoming offset from 16-byte alignment. */
2533 HOST_WIDE_INT m_stack_align_off_in;
2535 /* Register order and offsets. */
2536 struct reginfo m_regs[MAX_REGS];
2538 /* Lazy-inited cache of symbol names for stubs. */
2539 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
2540 [STUB_NAME_MAX_LEN];
2542 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2545 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2546 "savms64",
2547 "resms64",
2548 "resms64x",
2549 "savms64f",
2550 "resms64f",
2551 "resms64fx"
2554 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2555 /* The below offset values are where each register is stored for the layout
2556 relative to incoming stack pointer. The value of each m_regs[].offset will
2557 be relative to the incoming base pointer (rax or rsi) used by the stub.
2559 s_instances: 0 1 2 3
2560 Offset: realigned or aligned + 8
2561 Register aligned aligned + 8 aligned w/HFP w/HFP */
2562 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2563 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2564 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2565 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2566 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2567 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2568 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2569 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2570 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2571 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2572 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2573 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2574 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2575 BP_REG, /* 0xc0 0xc8 N/A N/A */
2576 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2577 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2578 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2579 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
2582 /* Instantiate static const values. */
2583 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2584 const unsigned xlogue_layout::MIN_REGS;
2585 const unsigned xlogue_layout::MAX_REGS;
2586 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2587 const unsigned xlogue_layout::VARIANT_COUNT;
2588 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2590 /* Initialize xlogue_layout::s_stub_names to zero. */
2591 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
2592 [STUB_NAME_MAX_LEN];
2594 /* Instantiates all xlogue_layout instances. */
2595 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2596 xlogue_layout (0, false),
2597 xlogue_layout (8, false),
2598 xlogue_layout (0, true),
2599 xlogue_layout (8, true)
2602 /* Return an appropriate const instance of xlogue_layout based upon values
2603 in cfun->machine and crtl. */
2604 const struct xlogue_layout &
2605 xlogue_layout::get_instance ()
2607 enum xlogue_stub_sets stub_set;
2608 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2610 if (stack_realign_fp)
2611 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2612 else if (frame_pointer_needed)
2613 stub_set = aligned_plus_8
2614 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2615 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2616 else
2617 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2619 return s_instances[stub_set];
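/* Restating the selection above: a realigned stack always uses the
   HFP_ALIGNED_OR_REALIGN layout; with a hard frame pointer the choice
   between HFP_ALIGNED_PLUS_8 and HFP_ALIGNED_OR_REALIGN depends on the
   8-byte pad recorded in call_ms2sysv_pad_in, and without a frame pointer
   the same pad picks between ALIGNED_PLUS_8 and ALIGNED.  */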
2622 /* Determine how many clobbered registers can be saved by the stub.
2623 Returns the count of registers the stub will save and restore. */
2624 unsigned
2625 xlogue_layout::count_stub_managed_regs ()
2627 bool hfp = frame_pointer_needed || stack_realign_fp;
2628 unsigned i, count;
2629 unsigned regno;
2631 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2633 regno = REG_ORDER[i];
2634 if (regno == BP_REG && hfp)
2635 continue;
2636 if (!ix86_save_reg (regno, false, false))
2637 break;
2638 ++count;
2640 return count;
2643 /* Determine if register REGNO is a stub managed register given the
2644 total COUNT of stub managed registers. */
2645 bool
2646 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2648 bool hfp = frame_pointer_needed || stack_realign_fp;
2649 unsigned i;
2651 for (i = 0; i < count; ++i)
2653 gcc_assert (i < MAX_REGS);
2654 if (REG_ORDER[i] == BP_REG && hfp)
2655 ++count;
2656 else if (REG_ORDER[i] == regno)
2657 return true;
2659 return false;
2662 /* Constructor for xlogue_layout. */
2663 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2664 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2665 m_stack_align_off_in (stack_align_off_in)
2667 HOST_WIDE_INT offset = stack_align_off_in;
2668 unsigned i, j;
2670 for (i = j = 0; i < MAX_REGS; ++i)
2672 unsigned regno = REG_ORDER[i];
2674 if (regno == BP_REG && hfp)
2675 continue;
2676 if (SSE_REGNO_P (regno))
2678 offset += 16;
2679 /* Verify that SSE regs are always aligned. */
2680 gcc_assert (!((stack_align_off_in + offset) & 15));
2682 else
2683 offset += 8;
2685 m_regs[j].regno = regno;
2686 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2688 gcc_assert (j == m_nregs);
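/* Worked example of the loop above for the plain ALIGNED layout
   (stack_align_off_in == 0, no hard frame pointer): the ten SSE registers
   occupy offsets 0x10 through 0xa0 (16 bytes each), then SI_REG lands at
   0xa8, DI_REG at 0xb0, and so on through R15_REG at 0xe0, matching the
   first column of the REG_ORDER table; each m_regs[].offset stored here is
   that value minus STUB_INDEX_OFFSET (0x70).  */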
2691 const char *
2692 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2693 unsigned n_extra_regs)
2695 const int have_avx = TARGET_AVX;
2696 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
2698 /* Lazy init */
2699 if (!*name)
2701 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
2702 (have_avx ? "avx" : "sse"),
2703 STUB_BASE_NAMES[stub],
2704 MIN_REGS + n_extra_regs);
2705 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2708 return name;
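/* For example, following the format string above: with AVX enabled,
   XLOGUE_STUB_SAVE and n_extra_regs == 0 yield "__avx_savms64_12"
   (MIN_REGS is 12), while the SSE variant that manages all 18 registers
   would be named "__sse_savms64_18".  */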
2711 /* Return rtx of a symbol ref for the entry point (based upon
2712 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2714 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2716 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2717 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2718 gcc_assert (stub < XLOGUE_STUB_COUNT);
2719 gcc_assert (crtl->stack_realign_finalized);
2721 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
2724 /* Define the structure for the machine field in struct function. */
2726 struct GTY(()) stack_local_entry {
2727 unsigned short mode;
2728 unsigned short n;
2729 rtx rtl;
2730 struct stack_local_entry *next;
2733 /* Which cpu are we scheduling for. */
2734 enum attr_cpu ix86_schedule;
2736 /* Which cpu are we optimizing for. */
2737 enum processor_type ix86_tune;
2739 /* Which instruction set architecture to use. */
2740 enum processor_type ix86_arch;
2742 /* True if processor has SSE prefetch instruction. */
2743 unsigned char x86_prefetch_sse;
2745 /* -mstackrealign option */
2746 static const char ix86_force_align_arg_pointer_string[]
2747 = "force_align_arg_pointer";
2749 static rtx (*ix86_gen_leave) (void);
2750 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2751 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2752 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2753 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2754 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2755 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2756 static rtx (*ix86_gen_clzero) (rtx);
2757 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2758 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2759 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2760 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2761 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2762 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2764 /* Preferred alignment for stack boundary in bits. */
2765 unsigned int ix86_preferred_stack_boundary;
2767 /* Alignment for incoming stack boundary in bits specified at
2768 command line. */
2769 static unsigned int ix86_user_incoming_stack_boundary;
2771 /* Default alignment for incoming stack boundary in bits. */
2772 static unsigned int ix86_default_incoming_stack_boundary;
2774 /* Alignment for incoming stack boundary in bits. */
2775 unsigned int ix86_incoming_stack_boundary;
2777 /* Calling abi specific va_list type nodes. */
2778 static GTY(()) tree sysv_va_list_type_node;
2779 static GTY(()) tree ms_va_list_type_node;
2781 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2782 char internal_label_prefix[16];
2783 int internal_label_prefix_len;
2785 /* Fence to use after loop using movnt. */
2786 tree x86_mfence;
2788 /* Register class used for passing a given 64-bit part of the argument.
2789 These represent the classes documented by the psABI, with the exception
2790 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2791 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2793 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2794 whenever possible (the upper half does contain padding). */
2795 enum x86_64_reg_class
2797 X86_64_NO_CLASS,
2798 X86_64_INTEGER_CLASS,
2799 X86_64_INTEGERSI_CLASS,
2800 X86_64_SSE_CLASS,
2801 X86_64_SSESF_CLASS,
2802 X86_64_SSEDF_CLASS,
2803 X86_64_SSEUP_CLASS,
2804 X86_64_X87_CLASS,
2805 X86_64_X87UP_CLASS,
2806 X86_64_COMPLEX_X87_CLASS,
2807 X86_64_MEMORY_CLASS
2810 #define MAX_CLASSES 8
2812 /* Table of constants used by fldpi, fldln2, etc.... */
2813 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2814 static bool ext_80387_constants_init;
2817 static struct machine_function * ix86_init_machine_status (void);
2818 static rtx ix86_function_value (const_tree, const_tree, bool);
2819 static bool ix86_function_value_regno_p (const unsigned int);
2820 static unsigned int ix86_function_arg_boundary (machine_mode,
2821 const_tree);
2822 static rtx ix86_static_chain (const_tree, bool);
2823 static int ix86_function_regparm (const_tree, const_tree);
2824 static void ix86_compute_frame_layout (void);
2825 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2826 rtx, rtx, int);
2827 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2828 static tree ix86_canonical_va_list_type (tree);
2829 static void predict_jump (int);
2830 static unsigned int split_stack_prologue_scratch_regno (void);
2831 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2833 enum ix86_function_specific_strings
2835 IX86_FUNCTION_SPECIFIC_ARCH,
2836 IX86_FUNCTION_SPECIFIC_TUNE,
2837 IX86_FUNCTION_SPECIFIC_MAX
2840 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2841 const char *, const char *, enum fpmath_unit,
2842 bool);
2843 static void ix86_function_specific_save (struct cl_target_option *,
2844 struct gcc_options *opts);
2845 static void ix86_function_specific_restore (struct gcc_options *opts,
2846 struct cl_target_option *);
2847 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2848 static void ix86_function_specific_print (FILE *, int,
2849 struct cl_target_option *);
2850 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2851 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2852 struct gcc_options *,
2853 struct gcc_options *,
2854 struct gcc_options *);
2855 static bool ix86_can_inline_p (tree, tree);
2856 static void ix86_set_current_function (tree);
2857 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2859 static enum calling_abi ix86_function_abi (const_tree);
2862 #ifndef SUBTARGET32_DEFAULT_CPU
2863 #define SUBTARGET32_DEFAULT_CPU "i386"
2864 #endif
2866 /* Whether -mtune= or -march= were specified */
2867 static int ix86_tune_defaulted;
2868 static int ix86_arch_specified;
2870 /* Vectorization library interface and handlers. */
2871 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2873 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2874 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2876 /* Processor target table, indexed by processor number */
2877 struct ptt
2879 const char *const name; /* processor name */
2880 const struct processor_costs *cost; /* Processor costs */
2881 const int align_loop; /* Default alignments. */
2882 const int align_loop_max_skip;
2883 const int align_jump;
2884 const int align_jump_max_skip;
2885 const int align_func;
2888 /* This table must be in sync with enum processor_type in i386.h. */
2889 static const struct ptt processor_target_table[PROCESSOR_max] =
2891 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2892 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2893 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2894 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2895 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2896 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2897 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2898 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2899 {"core2", &core_cost, 16, 10, 16, 10, 16},
2900 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2901 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2902 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2903 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2904 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2905 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2906 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2907 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2908 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2909 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2910 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2911 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2912 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2913 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2914 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2915 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2916 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2917 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2918 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2919 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
2922 static unsigned int
2923 rest_of_handle_insert_vzeroupper (void)
2925 int i;
2927 /* vzeroupper instructions are inserted immediately after reload to
2928 account for possible spills from 256-bit registers. The pass
2929 reuses the mode switching infrastructure by re-running the mode
2930 insertion pass, so disable entities that have already been processed. */
2931 for (i = 0; i < MAX_386_ENTITIES; i++)
2932 ix86_optimize_mode_switching[i] = 0;
2934 ix86_optimize_mode_switching[AVX_U128] = 1;
2936 /* Call optimize_mode_switching. */
2937 g->get_passes ()->execute_pass_mode_switching ();
2938 return 0;
2941 /* Return 1 if INSN uses or defines a hard register.
2942 Hard register uses in a memory address are ignored.
2943 Clobbers and flags definitions are ignored. */
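/* For example, a use of %eax only inside a memory address (as a base or
   index register) does not disqualify an insn, whereas %eax appearing as
   a direct operand does.  */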
2945 static bool
2946 has_non_address_hard_reg (rtx_insn *insn)
2948 df_ref ref;
2949 FOR_EACH_INSN_DEF (ref, insn)
2950 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2951 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2952 && DF_REF_REGNO (ref) != FLAGS_REG)
2953 return true;
2955 FOR_EACH_INSN_USE (ref, insn)
2956 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2957 return true;
2959 return false;
2962 /* Check if comparison INSN may be transformed
2963 into a vector comparison. Currently we transform
2964 only zero checks that look like:
2966 (set (reg:CCZ 17 flags)
2967 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2968 (subreg:SI (reg:DI x) 0))
2969 (const_int 0 [0]))) */
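/* For instance, on a 32-bit target a test such as

     long long x;
     ...
     if (x == 0)

   is typically expanded as an IOR of the two SImode halves of X followed
   by a compare against zero, which matches the shape above and can then
   be performed with a PTEST on the vector register holding X.  */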
2971 static bool
2972 convertible_comparison_p (rtx_insn *insn)
2974 if (!TARGET_SSE4_1)
2975 return false;
2977 rtx def_set = single_set (insn);
2979 gcc_assert (def_set);
2981 rtx src = SET_SRC (def_set);
2982 rtx dst = SET_DEST (def_set);
2984 gcc_assert (GET_CODE (src) == COMPARE);
2986 if (GET_CODE (dst) != REG
2987 || REGNO (dst) != FLAGS_REG
2988 || GET_MODE (dst) != CCZmode)
2989 return false;
2991 rtx op1 = XEXP (src, 0);
2992 rtx op2 = XEXP (src, 1);
2994 if (op2 != CONST0_RTX (GET_MODE (op2)))
2995 return false;
2997 if (GET_CODE (op1) != IOR)
2998 return false;
3000 op2 = XEXP (op1, 1);
3001 op1 = XEXP (op1, 0);
3003 if (!SUBREG_P (op1)
3004 || !SUBREG_P (op2)
3005 || GET_MODE (op1) != SImode
3006 || GET_MODE (op2) != SImode
3007 || ((SUBREG_BYTE (op1) != 0
3008 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3009 && (SUBREG_BYTE (op2) != 0
3010 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3011 return false;
3013 op1 = SUBREG_REG (op1);
3014 op2 = SUBREG_REG (op2);
3016 if (op1 != op2
3017 || !REG_P (op1)
3018 || GET_MODE (op1) != DImode)
3019 return false;
3021 return true;
3024 /* The DImode version of scalar_to_vector_candidate_p. */
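/* Roughly, a candidate is a single_set DImode insn whose source is a
   shift, a PLUS/MINUS/IOR/XOR/AND, a NEG/NOT, a plain move or a
   convertible compare, with only register, memory or immediate operands
   and no other hard register references, e.g.

     (set (reg:DI 90) (and:DI (reg:DI 91) (mem:DI ...)))

   which can later be rewritten to operate on V2DImode subregs.  */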
3026 static bool
3027 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3029 rtx def_set = single_set (insn);
3031 if (!def_set)
3032 return false;
3034 if (has_non_address_hard_reg (insn))
3035 return false;
3037 rtx src = SET_SRC (def_set);
3038 rtx dst = SET_DEST (def_set);
3040 if (GET_CODE (src) == COMPARE)
3041 return convertible_comparison_p (insn);
3043 /* We are interested in DImode promotion only. */
3044 if ((GET_MODE (src) != DImode
3045 && !CONST_INT_P (src))
3046 || GET_MODE (dst) != DImode)
3047 return false;
3049 if (!REG_P (dst) && !MEM_P (dst))
3050 return false;
3052 switch (GET_CODE (src))
3054 case ASHIFTRT:
3055 if (!TARGET_AVX512VL)
3056 return false;
3057 /* FALLTHRU */
3059 case ASHIFT:
3060 case LSHIFTRT:
3061 if (!REG_P (XEXP (src, 1))
3062 && (!SUBREG_P (XEXP (src, 1))
3063 || SUBREG_BYTE (XEXP (src, 1)) != 0
3064 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3065 && (!CONST_INT_P (XEXP (src, 1))
3066 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3067 return false;
3069 if (GET_MODE (XEXP (src, 1)) != QImode
3070 && !CONST_INT_P (XEXP (src, 1)))
3071 return false;
3072 break;
3074 case PLUS:
3075 case MINUS:
3076 case IOR:
3077 case XOR:
3078 case AND:
3079 if (!REG_P (XEXP (src, 1))
3080 && !MEM_P (XEXP (src, 1))
3081 && !CONST_INT_P (XEXP (src, 1)))
3082 return false;
3084 if (GET_MODE (XEXP (src, 1)) != DImode
3085 && !CONST_INT_P (XEXP (src, 1)))
3086 return false;
3087 break;
3089 case NEG:
3090 case NOT:
3091 break;
3093 case REG:
3094 return true;
3096 case MEM:
3097 case CONST_INT:
3098 return REG_P (dst);
3100 default:
3101 return false;
3104 if (!REG_P (XEXP (src, 0))
3105 && !MEM_P (XEXP (src, 0))
3106 && !CONST_INT_P (XEXP (src, 0))
3107 /* Check for andnot case. */
3108 && (GET_CODE (src) != AND
3109 || GET_CODE (XEXP (src, 0)) != NOT
3110 || !REG_P (XEXP (XEXP (src, 0), 0))))
3111 return false;
3113 if (GET_MODE (XEXP (src, 0)) != DImode
3114 && !CONST_INT_P (XEXP (src, 0)))
3115 return false;
3117 return true;
3120 /* The TImode version of scalar_to_vector_candidate_p. */
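/* Here only full TImode loads and stores qualify, i.e. moves that can be
   done with a single 16-byte SSE access once rewritten to V1TImode, for
   example a copy of an __int128 value:

     (set (reg:TI 90) (mem:TI ...))
     (set (mem:TI ...) (reg:TI 90))

   provided the memory is aligned or unaligned accesses are optimal.  */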
3122 static bool
3123 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3125 rtx def_set = single_set (insn);
3127 if (!def_set)
3128 return false;
3130 if (has_non_address_hard_reg (insn))
3131 return false;
3133 rtx src = SET_SRC (def_set);
3134 rtx dst = SET_DEST (def_set);
3136 /* Only TImode load and store are allowed. */
3137 if (GET_MODE (dst) != TImode)
3138 return false;
3140 if (MEM_P (dst))
3142 /* Check for a store. The memory must be aligned, or an unaligned
3143 store must be optimal. Only stores from a register, a standard SSE
3144 constant or a CONST_WIDE_INT generated from a piecewise store are supported.
3146 ??? Verify performance impact before enabling CONST_INT for
3147 __int128 store. */
3148 if (misaligned_operand (dst, TImode)
3149 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3150 return false;
3152 switch (GET_CODE (src))
3154 default:
3155 return false;
3157 case REG:
3158 case CONST_WIDE_INT:
3159 return true;
3161 case CONST_INT:
3162 return standard_sse_constant_p (src, TImode);
3165 else if (MEM_P (src))
3167 /* Check for a load. The memory must be aligned, or an unaligned
3168 load must be optimal. */
3169 return (REG_P (dst)
3170 && (!misaligned_operand (src, TImode)
3171 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3174 return false;
3177 /* Return true if INSN may be converted into a vector
3178 instruction. */
3180 static bool
3181 scalar_to_vector_candidate_p (rtx_insn *insn)
3183 if (TARGET_64BIT)
3184 return timode_scalar_to_vector_candidate_p (insn);
3185 else
3186 return dimode_scalar_to_vector_candidate_p (insn);
3189 /* The DImode version of remove_non_convertible_regs. */
3191 static void
3192 dimode_remove_non_convertible_regs (bitmap candidates)
3194 bitmap_iterator bi;
3195 unsigned id;
3196 bitmap regs = BITMAP_ALLOC (NULL);
3198 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3200 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3201 rtx reg = SET_DEST (def_set);
3203 if (!REG_P (reg)
3204 || bitmap_bit_p (regs, REGNO (reg))
3205 || HARD_REGISTER_P (reg))
3206 continue;
3208 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3209 def;
3210 def = DF_REF_NEXT_REG (def))
3212 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3214 if (dump_file)
3215 fprintf (dump_file,
3216 "r%d has non convertible definition in insn %d\n",
3217 REGNO (reg), DF_REF_INSN_UID (def));
3219 bitmap_set_bit (regs, REGNO (reg));
3220 break;
3225 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3227 for (df_ref def = DF_REG_DEF_CHAIN (id);
3228 def;
3229 def = DF_REF_NEXT_REG (def))
3230 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3232 if (dump_file)
3233 fprintf (dump_file, "Removing insn %d from candidates list\n",
3234 DF_REF_INSN_UID (def));
3236 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3240 BITMAP_FREE (regs);
3243 /* For a register REGNO, scan instructions for its defs and uses.
3244 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3246 static void
3247 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3248 unsigned int regno)
3250 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3251 def;
3252 def = DF_REF_NEXT_REG (def))
3254 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3256 if (dump_file)
3257 fprintf (dump_file,
3258 "r%d has non convertible def in insn %d\n",
3259 regno, DF_REF_INSN_UID (def));
3261 bitmap_set_bit (regs, regno);
3262 break;
3266 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3267 ref;
3268 ref = DF_REF_NEXT_REG (ref))
3270 /* Debug instructions are skipped. */
3271 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3272 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3274 if (dump_file)
3275 fprintf (dump_file,
3276 "r%d has non convertible use in insn %d\n",
3277 regno, DF_REF_INSN_UID (ref));
3279 bitmap_set_bit (regs, regno);
3280 break;
3285 /* The TImode version of remove_non_convertible_regs. */
3287 static void
3288 timode_remove_non_convertible_regs (bitmap candidates)
3290 bitmap_iterator bi;
3291 unsigned id;
3292 bitmap regs = BITMAP_ALLOC (NULL);
3294 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3296 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3297 rtx dest = SET_DEST (def_set);
3298 rtx src = SET_SRC (def_set);
3300 if ((!REG_P (dest)
3301 || bitmap_bit_p (regs, REGNO (dest))
3302 || HARD_REGISTER_P (dest))
3303 && (!REG_P (src)
3304 || bitmap_bit_p (regs, REGNO (src))
3305 || HARD_REGISTER_P (src)))
3306 continue;
3308 if (REG_P (dest))
3309 timode_check_non_convertible_regs (candidates, regs,
3310 REGNO (dest));
3312 if (REG_P (src))
3313 timode_check_non_convertible_regs (candidates, regs,
3314 REGNO (src));
3317 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3319 for (df_ref def = DF_REG_DEF_CHAIN (id);
3320 def;
3321 def = DF_REF_NEXT_REG (def))
3322 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3324 if (dump_file)
3325 fprintf (dump_file, "Removing insn %d from candidates list\n",
3326 DF_REF_INSN_UID (def));
3328 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3331 for (df_ref ref = DF_REG_USE_CHAIN (id);
3332 ref;
3333 ref = DF_REF_NEXT_REG (ref))
3334 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3336 if (dump_file)
3337 fprintf (dump_file, "Removing insn %d from candidates list\n",
3338 DF_REF_INSN_UID (ref));
3340 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3344 BITMAP_FREE (regs);
3347 /* For a given bitmap of insn UIDs, scan all instructions and
3348 remove an insn from CANDIDATES in case it has both convertible
3349 and non-convertible definitions.
3351 All insns in a bitmap are conversion candidates according to
3352 scalar_to_vector_candidate_p. Currently it implies all insns
3353 are single_set. */
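/* For example, if pseudo r90 is set both in a candidate insn and in a
   non-candidate insn, converting only one definition would leave r90
   needing to be valid in both scalar and vector modes, so every candidate
   insn defining r90 is dropped from the set.  */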
3355 static void
3356 remove_non_convertible_regs (bitmap candidates)
3358 if (TARGET_64BIT)
3359 timode_remove_non_convertible_regs (candidates);
3360 else
3361 dimode_remove_non_convertible_regs (candidates);
3364 class scalar_chain
3366 public:
3367 scalar_chain ();
3368 virtual ~scalar_chain ();
3370 static unsigned max_id;
3372 /* ID of a chain. */
3373 unsigned int chain_id;
3374 /* A queue of instructions to be included into a chain. */
3375 bitmap queue;
3376 /* Instructions included into a chain. */
3377 bitmap insns;
3378 /* All registers defined by a chain. */
3379 bitmap defs;
3380 /* Registers used in both vector and scalar modes. */
3381 bitmap defs_conv;
3383 void build (bitmap candidates, unsigned insn_uid);
3384 virtual int compute_convert_gain () = 0;
3385 int convert ();
3387 protected:
3388 void add_to_queue (unsigned insn_uid);
3389 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3391 private:
3392 void add_insn (bitmap candidates, unsigned insn_uid);
3393 void analyze_register_chain (bitmap candidates, df_ref ref);
3394 virtual void mark_dual_mode_def (df_ref def) = 0;
3395 virtual void convert_insn (rtx_insn *insn) = 0;
3396 virtual void convert_registers () = 0;
3399 class dimode_scalar_chain : public scalar_chain
3401 public:
3402 int compute_convert_gain ();
3403 private:
3404 void mark_dual_mode_def (df_ref def);
3405 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3406 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3407 void convert_insn (rtx_insn *insn);
3408 void convert_op (rtx *op, rtx_insn *insn);
3409 void convert_reg (unsigned regno);
3410 void make_vector_copies (unsigned regno);
3411 void convert_registers ();
3412 int vector_const_cost (rtx exp);
3415 class timode_scalar_chain : public scalar_chain
3417 public:
3418 /* Converting from TImode to V1TImode is always faster. */
3419 int compute_convert_gain () { return 1; }
3421 private:
3422 void mark_dual_mode_def (df_ref def);
3423 void fix_debug_reg_uses (rtx reg);
3424 void convert_insn (rtx_insn *insn);
3425 /* We don't convert registers to a different size. */
3426 void convert_registers () {}
3429 unsigned scalar_chain::max_id = 0;
3431 /* Initialize new chain. */
3433 scalar_chain::scalar_chain ()
3435 chain_id = ++max_id;
3437 if (dump_file)
3438 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3440 bitmap_obstack_initialize (NULL);
3441 insns = BITMAP_ALLOC (NULL);
3442 defs = BITMAP_ALLOC (NULL);
3443 defs_conv = BITMAP_ALLOC (NULL);
3444 queue = NULL;
3447 /* Free chain's data. */
3449 scalar_chain::~scalar_chain ()
3451 BITMAP_FREE (insns);
3452 BITMAP_FREE (defs);
3453 BITMAP_FREE (defs_conv);
3454 bitmap_obstack_release (NULL);
3457 /* Add an instruction into the chain's queue. */
3459 void
3460 scalar_chain::add_to_queue (unsigned insn_uid)
3462 if (bitmap_bit_p (insns, insn_uid)
3463 || bitmap_bit_p (queue, insn_uid))
3464 return;
3466 if (dump_file)
3467 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3468 insn_uid, chain_id);
3469 bitmap_set_bit (queue, insn_uid);
3472 /* For DImode conversion, mark register defined by DEF as requiring
3473 conversion. */
3475 void
3476 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3478 gcc_assert (DF_REF_REG_DEF_P (def));
3480 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3481 return;
3483 if (dump_file)
3484 fprintf (dump_file,
3485 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3486 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3488 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3491 /* For TImode conversion, it is unused. */
3493 void
3494 timode_scalar_chain::mark_dual_mode_def (df_ref)
3496 gcc_unreachable ();
3499 /* Check REF's chain to add new insns into a queue
3500 and find registers requiring conversion. */
3502 void
3503 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3505 df_link *chain;
3507 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3508 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3509 add_to_queue (DF_REF_INSN_UID (ref));
3511 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3513 unsigned uid = DF_REF_INSN_UID (chain->ref);
3515 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3516 continue;
3518 if (!DF_REF_REG_MEM_P (chain->ref))
3520 if (bitmap_bit_p (insns, uid))
3521 continue;
3523 if (bitmap_bit_p (candidates, uid))
3525 add_to_queue (uid);
3526 continue;
3530 if (DF_REF_REG_DEF_P (chain->ref))
3532 if (dump_file)
3533 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3534 DF_REF_REGNO (chain->ref), uid);
3535 mark_dual_mode_def (chain->ref);
3537 else
3539 if (dump_file)
3540 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3541 DF_REF_REGNO (chain->ref), uid);
3542 mark_dual_mode_def (ref);
3547 /* Add instruction into a chain. */
3549 void
3550 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3552 if (bitmap_bit_p (insns, insn_uid))
3553 return;
3555 if (dump_file)
3556 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3558 bitmap_set_bit (insns, insn_uid);
3560 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3561 rtx def_set = single_set (insn);
3562 if (def_set && REG_P (SET_DEST (def_set))
3563 && !HARD_REGISTER_P (SET_DEST (def_set)))
3564 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3566 df_ref ref;
3567 df_ref def;
3568 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3569 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3570 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3571 def;
3572 def = DF_REF_NEXT_REG (def))
3573 analyze_register_chain (candidates, def);
3574 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3575 if (!DF_REF_REG_MEM_P (ref))
3576 analyze_register_chain (candidates, ref);
3579 /* Build a new chain starting from insn INSN_UID, recursively
3580 adding all dependent uses and definitions. */
3582 void
3583 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3585 queue = BITMAP_ALLOC (NULL);
3586 bitmap_set_bit (queue, insn_uid);
3588 if (dump_file)
3589 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3591 while (!bitmap_empty_p (queue))
3593 insn_uid = bitmap_first_set_bit (queue);
3594 bitmap_clear_bit (queue, insn_uid);
3595 bitmap_clear_bit (candidates, insn_uid);
3596 add_insn (candidates, insn_uid);
3599 if (dump_file)
3601 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3602 fprintf (dump_file, " insns: ");
3603 dump_bitmap (dump_file, insns);
3604 if (!bitmap_empty_p (defs_conv))
3606 bitmap_iterator bi;
3607 unsigned id;
3608 const char *comma = "";
3609 fprintf (dump_file, " defs to convert: ");
3610 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3612 fprintf (dump_file, "%sr%d", comma, id);
3613 comma = ", ";
3615 fprintf (dump_file, "\n");
3619 BITMAP_FREE (queue);
3622 /* Return the cost of building a vector constant
3623 instead of using a scalar one. */
3626 dimode_scalar_chain::vector_const_cost (rtx exp)
3628 gcc_assert (CONST_INT_P (exp));
3630 if (standard_sse_constant_p (exp, V2DImode))
3631 return COSTS_N_INSNS (1);
3632 return ix86_cost->sse_load[1];
3635 /* Compute a gain for chain conversion. */
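/* Informally, the computation below is

     gain = sum over chain insns of (scalar cost - vector cost)
	    - sum over dual-mode registers of
	      (number of definitions * mmxsse_to_integer)

   e.g. a 64-bit register-to-register add on a 32-bit target is worth
   roughly ix86_cost->add, while every register that must also stay live
   in scalar form is charged for the required inter-unit moves.  */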
3638 dimode_scalar_chain::compute_convert_gain ()
3640 bitmap_iterator bi;
3641 unsigned insn_uid;
3642 int gain = 0;
3643 int cost = 0;
3645 if (dump_file)
3646 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3648 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3650 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3651 rtx def_set = single_set (insn);
3652 rtx src = SET_SRC (def_set);
3653 rtx dst = SET_DEST (def_set);
3655 if (REG_P (src) && REG_P (dst))
3656 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3657 else if (REG_P (src) && MEM_P (dst))
3658 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3659 else if (MEM_P (src) && REG_P (dst))
3660 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3661 else if (GET_CODE (src) == ASHIFT
3662 || GET_CODE (src) == ASHIFTRT
3663 || GET_CODE (src) == LSHIFTRT)
3665 if (CONST_INT_P (XEXP (src, 0)))
3666 gain -= vector_const_cost (XEXP (src, 0));
3667 if (CONST_INT_P (XEXP (src, 1)))
3669 gain += ix86_cost->shift_const;
3670 if (INTVAL (XEXP (src, 1)) >= 32)
3671 gain -= COSTS_N_INSNS (1);
3673 else
3674 /* Additional gain for omitting two CMOVs. */
3675 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3677 else if (GET_CODE (src) == PLUS
3678 || GET_CODE (src) == MINUS
3679 || GET_CODE (src) == IOR
3680 || GET_CODE (src) == XOR
3681 || GET_CODE (src) == AND)
3683 gain += ix86_cost->add;
3684 /* Additional gain for andnot for targets without BMI. */
3685 if (GET_CODE (XEXP (src, 0)) == NOT
3686 && !TARGET_BMI)
3687 gain += 2 * ix86_cost->add;
3689 if (CONST_INT_P (XEXP (src, 0)))
3690 gain -= vector_const_cost (XEXP (src, 0));
3691 if (CONST_INT_P (XEXP (src, 1)))
3692 gain -= vector_const_cost (XEXP (src, 1));
3694 else if (GET_CODE (src) == NEG
3695 || GET_CODE (src) == NOT)
3696 gain += ix86_cost->add - COSTS_N_INSNS (1);
3697 else if (GET_CODE (src) == COMPARE)
3699 /* Assume comparison cost is the same. */
3701 else if (CONST_INT_P (src))
3703 if (REG_P (dst))
3704 gain += COSTS_N_INSNS (2);
3705 else if (MEM_P (dst))
3706 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3707 gain -= vector_const_cost (src);
3709 else
3710 gcc_unreachable ();
3713 if (dump_file)
3714 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3716 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3717 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3719 if (dump_file)
3720 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3722 gain -= cost;
3724 if (dump_file)
3725 fprintf (dump_file, " Total gain: %d\n", gain);
3727 return gain;
3730 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3733 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3735 if (x == reg)
3736 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3738 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3739 int i, j;
3740 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3742 if (fmt[i] == 'e')
3743 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3744 else if (fmt[i] == 'E')
3745 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3746 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3747 reg, new_reg);
3750 return x;
3753 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3755 void
3756 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3757 rtx reg, rtx new_reg)
3759 replace_with_subreg (single_set (insn), reg, new_reg);
3762 /* Insert generated conversion instruction sequence INSNS
3763 after instruction AFTER. A new BB may be required in case the
3764 instruction has an EH region attached. */
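/* E.g. if AFTER is a call that may throw, the sequence cannot simply be
   emitted after it inside the same basic block; instead the fallthru edge
   is split and the sequence is placed at the head of the new block.  */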
3766 void
3767 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3769 if (!control_flow_insn_p (after))
3771 emit_insn_after (insns, after);
3772 return;
3775 basic_block bb = BLOCK_FOR_INSN (after);
3776 edge e = find_fallthru_edge (bb->succs);
3777 gcc_assert (e);
3779 basic_block new_bb = split_edge (e);
3780 emit_insn_after (insns, BB_HEAD (new_bb));
3783 /* Make vector copies for all register REGNO definitions
3784 and replace its uses in a chain. */
3786 void
3787 dimode_scalar_chain::make_vector_copies (unsigned regno)
3789 rtx reg = regno_reg_rtx[regno];
3790 rtx vreg = gen_reg_rtx (DImode);
3791 bool count_reg = false;
3792 df_ref ref;
3794 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3795 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3797 df_ref use;
3799 /* Detect the count register of a shift instruction. */
3800 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3801 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3803 rtx_insn *insn = DF_REF_INSN (use);
3804 rtx def_set = single_set (insn);
3806 gcc_assert (def_set);
3808 rtx src = SET_SRC (def_set);
3810 if ((GET_CODE (src) == ASHIFT
3811 || GET_CODE (src) == ASHIFTRT
3812 || GET_CODE (src) == LSHIFTRT)
3813 && !CONST_INT_P (XEXP (src, 1))
3814 && reg_or_subregno (XEXP (src, 1)) == regno)
3815 count_reg = true;
3818 start_sequence ();
3819 if (count_reg)
3821 rtx qreg = gen_lowpart (QImode, reg);
3822 rtx tmp = gen_reg_rtx (SImode);
3824 if (TARGET_ZERO_EXTEND_WITH_AND
3825 && optimize_function_for_speed_p (cfun))
3827 emit_move_insn (tmp, const0_rtx);
3828 emit_insn (gen_movstrictqi
3829 (gen_lowpart (QImode, tmp), qreg));
3831 else
3832 emit_insn (gen_rtx_SET
3833 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3835 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3837 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3838 emit_move_insn (slot, tmp);
3839 tmp = copy_rtx (slot);
3842 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3844 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3846 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3847 emit_move_insn (adjust_address (tmp, SImode, 0),
3848 gen_rtx_SUBREG (SImode, reg, 0));
3849 emit_move_insn (adjust_address (tmp, SImode, 4),
3850 gen_rtx_SUBREG (SImode, reg, 4));
3851 emit_move_insn (vreg, tmp);
3853 else if (TARGET_SSE4_1)
3855 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3856 CONST0_RTX (V4SImode),
3857 gen_rtx_SUBREG (SImode, reg, 0)));
3858 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3859 gen_rtx_SUBREG (V4SImode, vreg, 0),
3860 gen_rtx_SUBREG (SImode, reg, 4),
3861 GEN_INT (2)));
3863 else
3865 rtx tmp = gen_reg_rtx (DImode);
3866 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3867 CONST0_RTX (V4SImode),
3868 gen_rtx_SUBREG (SImode, reg, 0)));
3869 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3870 CONST0_RTX (V4SImode),
3871 gen_rtx_SUBREG (SImode, reg, 4)));
3872 emit_insn (gen_vec_interleave_lowv4si
3873 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3874 gen_rtx_SUBREG (V4SImode, vreg, 0),
3875 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3877 rtx_insn *seq = get_insns ();
3878 end_sequence ();
3879 rtx_insn *insn = DF_REF_INSN (ref);
3880 emit_conversion_insns (seq, insn);
3882 if (dump_file)
3883 fprintf (dump_file,
3884 " Copied r%d to a vector register r%d for insn %d\n",
3885 regno, REGNO (vreg), INSN_UID (insn));
3888 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3889 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3891 rtx_insn *insn = DF_REF_INSN (ref);
3892 if (count_reg)
3894 rtx def_set = single_set (insn);
3895 gcc_assert (def_set);
3897 rtx src = SET_SRC (def_set);
3899 if ((GET_CODE (src) == ASHIFT
3900 || GET_CODE (src) == ASHIFTRT
3901 || GET_CODE (src) == LSHIFTRT)
3902 && !CONST_INT_P (XEXP (src, 1))
3903 && reg_or_subregno (XEXP (src, 1)) == regno)
3904 XEXP (src, 1) = vreg;
3906 else
3907 replace_with_subreg_in_insn (insn, reg, vreg);
3909 if (dump_file)
3910 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3911 regno, REGNO (vreg), INSN_UID (insn));
3915 /* Convert all definitions of register REGNO
3916 and fix its uses. Scalar copies may be created
3917 in case the register is used in a non-convertible insn. */
3919 void
3920 dimode_scalar_chain::convert_reg (unsigned regno)
3922 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3923 rtx reg = regno_reg_rtx[regno];
3924 rtx scopy = NULL_RTX;
3925 df_ref ref;
3926 bitmap conv;
3928 conv = BITMAP_ALLOC (NULL);
3929 bitmap_copy (conv, insns);
3931 if (scalar_copy)
3932 scopy = gen_reg_rtx (DImode);
3934 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3936 rtx_insn *insn = DF_REF_INSN (ref);
3937 rtx def_set = single_set (insn);
3938 rtx src = SET_SRC (def_set);
3939 rtx reg = DF_REF_REG (ref);
3941 if (!MEM_P (src))
3943 replace_with_subreg_in_insn (insn, reg, reg);
3944 bitmap_clear_bit (conv, INSN_UID (insn));
3947 if (scalar_copy)
3949 start_sequence ();
3950 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3952 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3953 emit_move_insn (tmp, reg);
3954 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3955 adjust_address (tmp, SImode, 0));
3956 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3957 adjust_address (tmp, SImode, 4));
3959 else if (TARGET_SSE4_1)
3961 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3962 emit_insn
3963 (gen_rtx_SET
3964 (gen_rtx_SUBREG (SImode, scopy, 0),
3965 gen_rtx_VEC_SELECT (SImode,
3966 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3968 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3969 emit_insn
3970 (gen_rtx_SET
3971 (gen_rtx_SUBREG (SImode, scopy, 4),
3972 gen_rtx_VEC_SELECT (SImode,
3973 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3975 else
3977 rtx vcopy = gen_reg_rtx (V2DImode);
3978 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3979 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3980 gen_rtx_SUBREG (SImode, vcopy, 0));
3981 emit_move_insn (vcopy,
3982 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3983 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3984 gen_rtx_SUBREG (SImode, vcopy, 0));
3986 rtx_insn *seq = get_insns ();
3987 end_sequence ();
3988 emit_conversion_insns (seq, insn);
3990 if (dump_file)
3991 fprintf (dump_file,
3992 " Copied r%d to a scalar register r%d for insn %d\n",
3993 regno, REGNO (scopy), INSN_UID (insn));
3997 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3998 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
4000 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
4002 rtx_insn *insn = DF_REF_INSN (ref);
4004 rtx def_set = single_set (insn);
4005 gcc_assert (def_set);
4007 rtx src = SET_SRC (def_set);
4008 rtx dst = SET_DEST (def_set);
4010 if ((GET_CODE (src) == ASHIFT
4011 || GET_CODE (src) == ASHIFTRT
4012 || GET_CODE (src) == LSHIFTRT)
4013 && !CONST_INT_P (XEXP (src, 1))
4014 && reg_or_subregno (XEXP (src, 1)) == regno)
4016 rtx tmp2 = gen_reg_rtx (V2DImode);
4018 start_sequence ();
4020 if (TARGET_SSE4_1)
4021 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4022 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4023 else
4025 rtx vec_cst
4026 = gen_rtx_CONST_VECTOR (V2DImode,
4027 gen_rtvec (2, GEN_INT (0xff),
4028 const0_rtx));
4029 vec_cst
4030 = validize_mem (force_const_mem (V2DImode, vec_cst));
4032 emit_insn (gen_rtx_SET
4033 (tmp2,
4034 gen_rtx_AND (V2DImode,
4035 gen_rtx_SUBREG (V2DImode, reg, 0),
4036 vec_cst)));
4038 rtx_insn *seq = get_insns ();
4039 end_sequence ();
4041 emit_insn_before (seq, insn);
4043 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4045 else if (!MEM_P (dst) || !REG_P (src))
4046 replace_with_subreg_in_insn (insn, reg, reg);
4048 bitmap_clear_bit (conv, INSN_UID (insn));
4051 /* Skip debug insns and uninitialized uses. */
4052 else if (DF_REF_CHAIN (ref)
4053 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4055 gcc_assert (scopy);
4056 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4057 df_insn_rescan (DF_REF_INSN (ref));
4060 BITMAP_FREE (conv);
4063 /* Convert operand OP in INSN. We should handle
4064 memory operands and uninitialized registers.
4065 All other register uses are converted during
4066 register conversion. */
4068 void
4069 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4071 *op = copy_rtx_if_shared (*op);
4073 if (GET_CODE (*op) == NOT)
4075 convert_op (&XEXP (*op, 0), insn);
4076 PUT_MODE (*op, V2DImode);
4078 else if (MEM_P (*op))
4080 rtx tmp = gen_reg_rtx (DImode);
4082 emit_insn_before (gen_move_insn (tmp, *op), insn);
4083 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4085 if (dump_file)
4086 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4087 INSN_UID (insn), REGNO (tmp));
4089 else if (REG_P (*op))
4091 /* We may not have converted this register use in case
4092 the register has no definition. Otherwise it
4093 should have been converted in convert_reg. */
4094 df_ref ref;
4095 FOR_EACH_INSN_USE (ref, insn)
4096 if (DF_REF_REGNO (ref) == REGNO (*op))
4098 gcc_assert (!DF_REF_CHAIN (ref));
4099 break;
4101 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4103 else if (CONST_INT_P (*op))
4105 rtx vec_cst;
4106 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4108 /* Prefer all ones vector in case of -1. */
4109 if (constm1_operand (*op, GET_MODE (*op)))
4110 vec_cst = CONSTM1_RTX (V2DImode);
4111 else
4112 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4113 gen_rtvec (2, *op, const0_rtx));
4115 if (!standard_sse_constant_p (vec_cst, V2DImode))
4117 start_sequence ();
4118 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4119 rtx_insn *seq = get_insns ();
4120 end_sequence ();
4121 emit_insn_before (seq, insn);
4124 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4125 *op = tmp;
4127 else
4129 gcc_assert (SUBREG_P (*op));
4130 gcc_assert (GET_MODE (*op) == V2DImode);
4134 /* Convert INSN to vector mode. */
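/* For example, a chain insn

     (set (reg:DI 90) (plus:DI (reg:DI 91) (reg:DI 92)))

   is rewritten (after its registers have been converted) as

     (set (subreg:V2DI (reg:DI 90) 0)
	  (plus:V2DI (subreg:V2DI (reg:DI 91) 0)
		     (subreg:V2DI (reg:DI 92) 0)))

   so that it matches the V2DImode vector patterns; memory and constant
   operands are preloaded into registers by convert_op.  */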
4136 void
4137 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4139 rtx def_set = single_set (insn);
4140 rtx src = SET_SRC (def_set);
4141 rtx dst = SET_DEST (def_set);
4142 rtx subreg;
4144 if (MEM_P (dst) && !REG_P (src))
4146 /* Vector instructions cannot operate on a memory destination
4147 directly, therefore a temporary register is required. */
4148 rtx tmp = gen_reg_rtx (DImode);
4149 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4150 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4153 switch (GET_CODE (src))
4155 case ASHIFT:
4156 case ASHIFTRT:
4157 case LSHIFTRT:
4158 convert_op (&XEXP (src, 0), insn);
4159 PUT_MODE (src, V2DImode);
4160 break;
4162 case PLUS:
4163 case MINUS:
4164 case IOR:
4165 case XOR:
4166 case AND:
4167 convert_op (&XEXP (src, 0), insn);
4168 convert_op (&XEXP (src, 1), insn);
4169 PUT_MODE (src, V2DImode);
4170 break;
4172 case NEG:
4173 src = XEXP (src, 0);
4174 convert_op (&src, insn);
4175 subreg = gen_reg_rtx (V2DImode);
4176 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4177 src = gen_rtx_MINUS (V2DImode, subreg, src);
4178 break;
4180 case NOT:
4181 src = XEXP (src, 0);
4182 convert_op (&src, insn);
4183 subreg = gen_reg_rtx (V2DImode);
4184 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4185 src = gen_rtx_XOR (V2DImode, src, subreg);
4186 break;
4188 case MEM:
4189 if (!REG_P (dst))
4190 convert_op (&src, insn);
4191 break;
4193 case REG:
4194 if (!MEM_P (dst))
4195 convert_op (&src, insn);
4196 break;
4198 case SUBREG:
4199 gcc_assert (GET_MODE (src) == V2DImode);
4200 break;
4202 case COMPARE:
4203 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4205 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4206 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4208 if (REG_P (src))
4209 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4210 else
4211 subreg = copy_rtx_if_shared (src);
4212 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4213 copy_rtx_if_shared (subreg),
4214 copy_rtx_if_shared (subreg)),
4215 insn);
4216 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4217 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4218 copy_rtx_if_shared (src)),
4219 UNSPEC_PTEST);
4220 break;
4222 case CONST_INT:
4223 convert_op (&src, insn);
4224 break;
4226 default:
4227 gcc_unreachable ();
4230 SET_SRC (def_set) = src;
4231 SET_DEST (def_set) = dst;
4233 /* Drop possible dead definitions. */
4234 PATTERN (insn) = def_set;
4236 INSN_CODE (insn) = -1;
4237 recog_memoized (insn);
4238 df_insn_rescan (insn);
4241 /* Fix uses of converted REG in debug insns. */
4243 void
4244 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4246 if (!flag_var_tracking)
4247 return;
4249 df_ref ref, next;
4250 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4252 rtx_insn *insn = DF_REF_INSN (ref);
4253 /* Make sure the next ref is for a different instruction,
4254 so that we're not affected by the rescan. */
4255 next = DF_REF_NEXT_REG (ref);
4256 while (next && DF_REF_INSN (next) == insn)
4257 next = DF_REF_NEXT_REG (next);
4259 if (DEBUG_INSN_P (insn))
4261 /* It may be a debug insn with a TImode variable in
4262 register. */
4263 bool changed = false;
4264 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4266 rtx *loc = DF_REF_LOC (ref);
4267 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4269 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4270 changed = true;
4273 if (changed)
4274 df_insn_rescan (insn);
4279 /* Convert INSN from TImode to V1TImode. */
4281 void
4282 timode_scalar_chain::convert_insn (rtx_insn *insn)
4284 rtx def_set = single_set (insn);
4285 rtx src = SET_SRC (def_set);
4286 rtx dst = SET_DEST (def_set);
4288 switch (GET_CODE (dst))
4290 case REG:
4292 rtx tmp = find_reg_equal_equiv_note (insn);
4293 if (tmp)
4294 PUT_MODE (XEXP (tmp, 0), V1TImode);
4295 PUT_MODE (dst, V1TImode);
4296 fix_debug_reg_uses (dst);
4298 break;
4299 case MEM:
4300 PUT_MODE (dst, V1TImode);
4301 break;
4303 default:
4304 gcc_unreachable ();
4307 switch (GET_CODE (src))
4309 case REG:
4310 PUT_MODE (src, V1TImode);
4311 /* Call fix_debug_reg_uses only if SRC is never defined. */
4312 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4313 fix_debug_reg_uses (src);
4314 break;
4316 case MEM:
4317 PUT_MODE (src, V1TImode);
4318 break;
4320 case CONST_WIDE_INT:
4321 if (NONDEBUG_INSN_P (insn))
4323 /* Since there are no instructions to store a 128-bit constant,
4324 temporary register usage is required. */
4325 rtx tmp = gen_reg_rtx (V1TImode);
4326 start_sequence ();
4327 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4328 src = validize_mem (force_const_mem (V1TImode, src));
4329 rtx_insn *seq = get_insns ();
4330 end_sequence ();
4331 if (seq)
4332 emit_insn_before (seq, insn);
4333 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4334 dst = tmp;
4336 break;
4338 case CONST_INT:
4339 switch (standard_sse_constant_p (src, TImode))
4341 case 1:
4342 src = CONST0_RTX (GET_MODE (dst));
4343 break;
4344 case 2:
4345 src = CONSTM1_RTX (GET_MODE (dst));
4346 break;
4347 default:
4348 gcc_unreachable ();
4350 if (NONDEBUG_INSN_P (insn))
4352 rtx tmp = gen_reg_rtx (V1TImode);
4353 /* Since there are no instructions to store a standard SSE
4354 constant, temporary register usage is required. */
4355 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4356 dst = tmp;
4358 break;
4360 default:
4361 gcc_unreachable ();
4364 SET_SRC (def_set) = src;
4365 SET_DEST (def_set) = dst;
4367 /* Drop possible dead definitions. */
4368 PATTERN (insn) = def_set;
4370 INSN_CODE (insn) = -1;
4371 recog_memoized (insn);
4372 df_insn_rescan (insn);
4375 void
4376 dimode_scalar_chain::convert_registers ()
4378 bitmap_iterator bi;
4379 unsigned id;
4381 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4382 convert_reg (id);
4384 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4385 make_vector_copies (id);
4388 /* Convert whole chain creating required register
4389 conversions and copies. */
4392 scalar_chain::convert ()
4394 bitmap_iterator bi;
4395 unsigned id;
4396 int converted_insns = 0;
4398 if (!dbg_cnt (stv_conversion))
4399 return 0;
4401 if (dump_file)
4402 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4404 convert_registers ();
4406 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4408 convert_insn (DF_INSN_UID_GET (id)->insn);
4409 converted_insns++;
4412 return converted_insns;
4415 /* Main STV pass function. Find and convert scalar
4416 instructions into vector mode when profitable. */
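/* The pass proceeds roughly as follows:
     1) mark every insn accepted by scalar_to_vector_candidate_p;
     2) prune candidates whose registers also have non-convertible
	defs or uses (remove_non_convertible_regs);
     3) build chains of related candidates and convert each chain only
	if compute_convert_gain reports a positive gain;
     4) if anything was converted, bump the stack alignment to 128 bits
	to accommodate possible vector spills.  */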
4418 static unsigned int
4419 convert_scalars_to_vector ()
4421 basic_block bb;
4422 bitmap candidates;
4423 int converted_insns = 0;
4425 bitmap_obstack_initialize (NULL);
4426 candidates = BITMAP_ALLOC (NULL);
4428 calculate_dominance_info (CDI_DOMINATORS);
4429 df_set_flags (DF_DEFER_INSN_RESCAN);
4430 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4431 df_md_add_problem ();
4432 df_analyze ();
4434 /* Find all instructions we want to convert into vector mode. */
4435 if (dump_file)
4436 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4438 FOR_EACH_BB_FN (bb, cfun)
4440 rtx_insn *insn;
4441 FOR_BB_INSNS (bb, insn)
4442 if (scalar_to_vector_candidate_p (insn))
4444 if (dump_file)
4445 fprintf (dump_file, " insn %d is marked as a candidate\n",
4446 INSN_UID (insn));
4448 bitmap_set_bit (candidates, INSN_UID (insn));
4452 remove_non_convertible_regs (candidates);
4454 if (bitmap_empty_p (candidates))
4455 if (dump_file)
4456 fprintf (dump_file, "There are no candidates for optimization.\n");
4458 while (!bitmap_empty_p (candidates))
4460 unsigned uid = bitmap_first_set_bit (candidates);
4461 scalar_chain *chain;
4463 if (TARGET_64BIT)
4464 chain = new timode_scalar_chain;
4465 else
4466 chain = new dimode_scalar_chain;
4468 /* Find instructions chain we want to convert to vector mode.
4469 Check all uses and definitions to estimate all required
4470 conversions. */
4471 chain->build (candidates, uid);
4473 if (chain->compute_convert_gain () > 0)
4474 converted_insns += chain->convert ();
4475 else
4476 if (dump_file)
4477 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4478 chain->chain_id);
4480 delete chain;
4483 if (dump_file)
4484 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4486 BITMAP_FREE (candidates);
4487 bitmap_obstack_release (NULL);
4488 df_process_deferred_rescans ();
4490 /* Conversion means we may have 128-bit register spills/fills
4491 which require an aligned stack. */
4492 if (converted_insns)
4494 if (crtl->stack_alignment_needed < 128)
4495 crtl->stack_alignment_needed = 128;
4496 if (crtl->stack_alignment_estimated < 128)
4497 crtl->stack_alignment_estimated = 128;
4498 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4499 if (TARGET_64BIT)
4500 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4501 parm; parm = DECL_CHAIN (parm))
4503 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4504 continue;
4505 if (DECL_RTL_SET_P (parm)
4506 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4508 rtx r = DECL_RTL (parm);
4509 if (REG_P (r))
4510 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4512 if (DECL_INCOMING_RTL (parm)
4513 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4515 rtx r = DECL_INCOMING_RTL (parm);
4516 if (REG_P (r))
4517 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4522 return 0;
4525 namespace {
4527 const pass_data pass_data_insert_vzeroupper =
4529 RTL_PASS, /* type */
4530 "vzeroupper", /* name */
4531 OPTGROUP_NONE, /* optinfo_flags */
4532 TV_MACH_DEP, /* tv_id */
4533 0, /* properties_required */
4534 0, /* properties_provided */
4535 0, /* properties_destroyed */
4536 0, /* todo_flags_start */
4537 TODO_df_finish, /* todo_flags_finish */
4540 class pass_insert_vzeroupper : public rtl_opt_pass
4542 public:
4543 pass_insert_vzeroupper(gcc::context *ctxt)
4544 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4547 /* opt_pass methods: */
4548 virtual bool gate (function *)
4550 return TARGET_AVX && !TARGET_AVX512F
4551 && TARGET_VZEROUPPER && flag_expensive_optimizations
4552 && !optimize_size;
4555 virtual unsigned int execute (function *)
4557 return rest_of_handle_insert_vzeroupper ();
4560 }; // class pass_insert_vzeroupper
4562 const pass_data pass_data_stv =
4564 RTL_PASS, /* type */
4565 "stv", /* name */
4566 OPTGROUP_NONE, /* optinfo_flags */
4567 TV_MACH_DEP, /* tv_id */
4568 0, /* properties_required */
4569 0, /* properties_provided */
4570 0, /* properties_destroyed */
4571 0, /* todo_flags_start */
4572 TODO_df_finish, /* todo_flags_finish */
4575 class pass_stv : public rtl_opt_pass
4577 public:
4578 pass_stv (gcc::context *ctxt)
4579 : rtl_opt_pass (pass_data_stv, ctxt),
4580 timode_p (false)
4583 /* opt_pass methods: */
4584 virtual bool gate (function *)
4586 return (timode_p == !!TARGET_64BIT
4587 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4590 virtual unsigned int execute (function *)
4592 return convert_scalars_to_vector ();
4595 opt_pass *clone ()
4597 return new pass_stv (m_ctxt);
4600 void set_pass_param (unsigned int n, bool param)
4602 gcc_assert (n == 0);
4603 timode_p = param;
4606 private:
4607 bool timode_p;
4608 }; // class pass_stv
4610 } // anon namespace
4612 rtl_opt_pass *
4613 make_pass_insert_vzeroupper (gcc::context *ctxt)
4615 return new pass_insert_vzeroupper (ctxt);
4618 rtl_opt_pass *
4619 make_pass_stv (gcc::context *ctxt)
4621 return new pass_stv (ctxt);
4624 /* Return true if a red-zone is in use. */
4626 bool
4627 ix86_using_red_zone (void)
4629 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4632 /* Return a string that documents the current -m options. The caller is
4633 responsible for freeing the string. */
4635 static char *
4636 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4637 int flags, int flags2,
4638 const char *arch, const char *tune,
4639 enum fpmath_unit fpmath, bool add_nl_p)
4641 struct ix86_target_opts
4643 const char *option; /* option string */
4644 HOST_WIDE_INT mask; /* isa mask options */
4647 /* This table is ordered so that options like -msse4.2 that imply other
4648 ISAs come first. Target string will be displayed in the same order. */
4649 static struct ix86_target_opts isa2_opts[] =
4651 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4652 { "-msgx", OPTION_MASK_ISA_SGX },
4653 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4654 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4655 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4657 static struct ix86_target_opts isa_opts[] =
4659 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4660 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4661 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4662 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4663 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4664 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4665 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4666 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4667 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4668 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4669 { "-mfma", OPTION_MASK_ISA_FMA },
4670 { "-mxop", OPTION_MASK_ISA_XOP },
4671 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4672 { "-mf16c", OPTION_MASK_ISA_F16C },
4673 { "-mavx", OPTION_MASK_ISA_AVX },
4674 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4675 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4676 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4677 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4678 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4679 { "-msse3", OPTION_MASK_ISA_SSE3 },
4680 { "-maes", OPTION_MASK_ISA_AES },
4681 { "-msha", OPTION_MASK_ISA_SHA },
4682 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4683 { "-msse2", OPTION_MASK_ISA_SSE2 },
4684 { "-msse", OPTION_MASK_ISA_SSE },
4685 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4686 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4687 { "-mmmx", OPTION_MASK_ISA_MMX },
4688 { "-mrtm", OPTION_MASK_ISA_RTM },
4689 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4690 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4691 { "-madx", OPTION_MASK_ISA_ADX },
4692 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4693 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4694 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4695 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4696 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4697 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4698 { "-mabm", OPTION_MASK_ISA_ABM },
4699 { "-mbmi", OPTION_MASK_ISA_BMI },
4700 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4701 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4702 { "-mtbm", OPTION_MASK_ISA_TBM },
4703 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4704 { "-mcx16", OPTION_MASK_ISA_CX16 },
4705 { "-msahf", OPTION_MASK_ISA_SAHF },
4706 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4707 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4708 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4709 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4710 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4711 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4712 { "-mpku", OPTION_MASK_ISA_PKU },
4713 { "-mlwp", OPTION_MASK_ISA_LWP },
4714 { "-mhle", OPTION_MASK_ISA_HLE },
4715 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4716 { "-mmpx", OPTION_MASK_ISA_MPX },
4717 { "-mclwb", OPTION_MASK_ISA_CLWB }
4720 /* Flag options. */
4721 static struct ix86_target_opts flag_opts[] =
4723 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4724 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4725 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4726 { "-m80387", MASK_80387 },
4727 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4728 { "-malign-double", MASK_ALIGN_DOUBLE },
4729 { "-mcld", MASK_CLD },
4730 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4731 { "-mieee-fp", MASK_IEEE_FP },
4732 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4733 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4734 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4735 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4736 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4737 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4738 { "-mno-red-zone", MASK_NO_RED_ZONE },
4739 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4740 { "-mrecip", MASK_RECIP },
4741 { "-mrtd", MASK_RTD },
4742 { "-msseregparm", MASK_SSEREGPARM },
4743 { "-mstack-arg-probe", MASK_STACK_PROBE },
4744 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4745 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4746 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4747 { "-mvzeroupper", MASK_VZEROUPPER },
4748 { "-mstv", MASK_STV },
4749 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4750 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4751 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4752 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4755 /* Additional flag options. */
4756 static struct ix86_target_opts flag2_opts[] =
4758 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4761 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4762 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4764 char isa_other[40];
4765 char isa2_other[40];
4766 char flags_other[40];
4767 char flags2_other[40];
4768 unsigned num = 0;
4769 unsigned i, j;
4770 char *ret;
4771 char *ptr;
4772 size_t len;
4773 size_t line_len;
4774 size_t sep_len;
4775 const char *abi;
4777 memset (opts, '\0', sizeof (opts));
4779 /* Add -march= option. */
4780 if (arch)
4782 opts[num][0] = "-march=";
4783 opts[num++][1] = arch;
4786 /* Add -mtune= option. */
4787 if (tune)
4789 opts[num][0] = "-mtune=";
4790 opts[num++][1] = tune;
4793 /* Add -m32/-m64/-mx32. */
4794 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4796 if ((isa & OPTION_MASK_ABI_64) != 0)
4797 abi = "-m64";
4798 else
4799 abi = "-mx32";
4800 isa &= ~ (OPTION_MASK_ISA_64BIT
4801 | OPTION_MASK_ABI_64
4802 | OPTION_MASK_ABI_X32);
4804 else
4805 abi = "-m32";
4806 opts[num++][0] = abi;
4808 /* Pick out the options in isa2 options. */
4809 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4811 if ((isa2 & isa2_opts[i].mask) != 0)
4813 opts[num++][0] = isa2_opts[i].option;
4814 isa2 &= ~ isa2_opts[i].mask;
4818 if (isa2 && add_nl_p)
4820 opts[num++][0] = isa2_other;
4821 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4824 /* Pick out the options in isa options. */
4825 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4827 if ((isa & isa_opts[i].mask) != 0)
4829 opts[num++][0] = isa_opts[i].option;
4830 isa &= ~ isa_opts[i].mask;
4834 if (isa && add_nl_p)
4836 opts[num++][0] = isa_other;
4837 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4840 /* Add flag options. */
4841 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4843 if ((flags & flag_opts[i].mask) != 0)
4845 opts[num++][0] = flag_opts[i].option;
4846 flags &= ~ flag_opts[i].mask;
4850 if (flags && add_nl_p)
4852 opts[num++][0] = flags_other;
4853 sprintf (flags_other, "(other flags: %#x)", flags);
4856 /* Add additional flag options. */
4857 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4859 if ((flags2 & flag2_opts[i].mask) != 0)
4861 opts[num++][0] = flag2_opts[i].option;
4862 flags2 &= ~ flag2_opts[i].mask;
4866 if (flags2 && add_nl_p)
4868 opts[num++][0] = flags2_other;
4869 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4872 /* Add -fpmath= option. */
4873 if (fpmath)
4875 opts[num][0] = "-mfpmath=";
4876 switch ((int) fpmath)
4878 case FPMATH_387:
4879 opts[num++][1] = "387";
4880 break;
4882 case FPMATH_SSE:
4883 opts[num++][1] = "sse";
4884 break;
4886 case FPMATH_387 | FPMATH_SSE:
4887 opts[num++][1] = "sse+387";
4888 break;
4890 default:
4891 gcc_unreachable ();
4895 /* Any options? */
4896 if (num == 0)
4897 return NULL;
4899 gcc_assert (num < ARRAY_SIZE (opts));
4901 /* Size the string. */
4902 len = 0;
4903 sep_len = (add_nl_p) ? 3 : 1;
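/* When newlines are added, each separator may need a space plus the
   "\\\n" continuation, hence 3 bytes; the separator counted for the
   first option (which gets none) also covers the terminating NUL.  */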
4904 for (i = 0; i < num; i++)
4906 len += sep_len;
4907 for (j = 0; j < 2; j++)
4908 if (opts[i][j])
4909 len += strlen (opts[i][j]);
4912 /* Build the string. */
4913 ret = ptr = (char *) xmalloc (len);
4914 line_len = 0;
4916 for (i = 0; i < num; i++)
4918 size_t len2[2];
4920 for (j = 0; j < 2; j++)
4921 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4923 if (i != 0)
4925 *ptr++ = ' ';
4926 line_len++;
4928 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4930 *ptr++ = '\\';
4931 *ptr++ = '\n';
4932 line_len = 0;
4936 for (j = 0; j < 2; j++)
4937 if (opts[i][j])
4939 memcpy (ptr, opts[i][j], len2[j]);
4940 ptr += len2[j];
4941 line_len += len2[j];
4945 *ptr = '\0';
4946 gcc_assert (ret + len >= ptr);
4948 return ret;
4951 /* Return true if profiling code should be emitted before the
4952 prologue, false otherwise.
4953 Note: for x86 this is the case only when -mfentry ("hotfix"-style profiling) is used. */
4954 static bool
4955 ix86_profile_before_prologue (void)
4957 return flag_fentry != 0;
4960 /* Function that is callable from the debugger to print the current
4961 options. */
4962 void ATTRIBUTE_UNUSED
4963 ix86_debug_options (void)
4965 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4966 target_flags, ix86_target_flags,
4967 ix86_arch_string,ix86_tune_string,
4968 ix86_fpmath, true);
4970 if (opts)
4972 fprintf (stderr, "%s\n\n", opts);
4973 free (opts);
4975 else
4976 fputs ("<no options>\n\n", stderr);
4978 return;
4981 /* Return true if T is one of the bytes we should avoid with
4982 -fmitigate-rop. */
4984 static bool
4985 ix86_rop_should_change_byte_p (int t)
4987 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4990 static const char *stringop_alg_names[] = {
4991 #define DEF_ENUM
4992 #define DEF_ALG(alg, name) #name,
4993 #include "stringop.def"
4994 #undef DEF_ENUM
4995 #undef DEF_ALG
4998 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4999 The string is of the following form (or comma separated list of it):
5001 strategy_alg:max_size:[align|noalign]
5003 where the full size range for the strategy is either [0, max_size] or
5004 [min_size, max_size], in which min_size is the max_size + 1 of the
5005 preceding range. The last size range must have max_size == -1.
5007 Examples:
5010 -mmemcpy-strategy=libcall:-1:noalign
5012 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5016 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5018 This is to tell the compiler to use the following strategy for memset
5019 1) when the expected size is between [1, 16], use rep_8byte strategy;
5020 2) when the size is between [17, 2048], use vector_loop;
5021 3) when the size is > 2048, use libcall. */
5023 struct stringop_size_range
5025 int max;
5026 stringop_alg alg;
5027 bool noalign;
5030 static void
5031 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5033 const struct stringop_algs *default_algs;
5034 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5035 char *curr_range_str, *next_range_str;
5036 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
5037 int i = 0, n = 0;
5039 if (is_memset)
5040 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5041 else
5042 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5044 curr_range_str = strategy_str;
5048 int maxs;
5049 char alg_name[128];
5050 char align[16];
5051 next_range_str = strchr (curr_range_str, ',');
5052 if (next_range_str)
5053 *next_range_str++ = '\0';
5055 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5056 alg_name, &maxs, align))
5058 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5059 return;
5062 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5064 error ("size ranges of option %qs should be increasing", opt);
5065 return;
5068 for (i = 0; i < last_alg; i++)
5069 if (!strcmp (alg_name, stringop_alg_names[i]))
5070 break;
5072 if (i == last_alg)
5074 error ("wrong strategy name %qs specified for option %qs",
5075 alg_name, opt);
5077 auto_vec <const char *> candidates;
5078 for (i = 0; i < last_alg; i++)
5079 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5080 candidates.safe_push (stringop_alg_names[i]);
5082 char *s;
5083 const char *hint
5084 = candidates_list_and_hint (alg_name, s, candidates);
5085 if (hint)
5086 inform (input_location,
5087 "valid arguments to %qs are: %s; did you mean %qs?",
5088 opt, s, hint);
5089 else
5090 inform (input_location, "valid arguments to %qs are: %s",
5091 opt, s);
5092 XDELETEVEC (s);
5093 return;
5096 if ((stringop_alg) i == rep_prefix_8_byte
5097 && !TARGET_64BIT)
5099 /* rep; movq isn't available in 32-bit code. */
5100 error ("strategy name %qs specified for option %qs "
5101 "not supported for 32-bit code", alg_name, opt);
5102 return;
5105 input_ranges[n].max = maxs;
5106 input_ranges[n].alg = (stringop_alg) i;
5107 if (!strcmp (align, "align"))
5108 input_ranges[n].noalign = false;
5109 else if (!strcmp (align, "noalign"))
5110 input_ranges[n].noalign = true;
5111 else
5113 error ("unknown alignment %qs specified for option %qs", align, opt);
5114 return;
5116 n++;
5117 curr_range_str = next_range_str;
5119 while (curr_range_str);
5121 if (input_ranges[n - 1].max != -1)
5123 error ("the max value for the last size range should be -1"
5124 " for option %qs", opt);
5125 return;
5128 if (n > MAX_STRINGOP_ALGS)
5130 error ("too many size ranges specified in option %qs", opt);
5131 return;
5134 /* Now override the default algs array. */
5135 for (i = 0; i < n; i++)
5137 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5138 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5139 = input_ranges[i].alg;
5140 *const_cast<int *>(&default_algs->size[i].noalign)
5141 = input_ranges[i].noalign;
5146 /* Parse the -mtune-ctrl= option.  When DUMP is true,
5147 print the features that are explicitly set. */
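/* For example, -mtune-ctrl=use_leave,^prologue_using_move explicitly
enables the first feature and disables the second; a leading '^'
negates a feature.  (The two names here are illustrative -- the valid
names are those listed in ix86_tune_feature_names.)  */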
5149 static void
5150 parse_mtune_ctrl_str (bool dump)
5152 if (!ix86_tune_ctrl_string)
5153 return;
5155 char *next_feature_string = NULL;
5156 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5157 char *orig = curr_feature_string;
5158 int i;
5161 bool clear = false;
5163 next_feature_string = strchr (curr_feature_string, ',');
5164 if (next_feature_string)
5165 *next_feature_string++ = '\0';
5166 if (*curr_feature_string == '^')
5168 curr_feature_string++;
5169 clear = true;
5171 for (i = 0; i < X86_TUNE_LAST; i++)
5173 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5175 ix86_tune_features[i] = !clear;
5176 if (dump)
5177 fprintf (stderr, "Explicitly %s feature %s\n",
5178 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5179 break;
5182 if (i == X86_TUNE_LAST)
5183 error ("unknown parameter to option -mtune-ctrl: %s",
5184 clear ? curr_feature_string - 1 : curr_feature_string);
5185 curr_feature_string = next_feature_string;
5187 while (curr_feature_string);
5188 free (orig);
5191 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5192 processor type. */
5194 static void
5195 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5197 unsigned int ix86_tune_mask = 1u << ix86_tune;
5198 int i;
5200 for (i = 0; i < X86_TUNE_LAST; ++i)
5202 if (ix86_tune_no_default)
5203 ix86_tune_features[i] = 0;
5204 else
5205 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5208 if (dump)
5210 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5211 for (i = 0; i < X86_TUNE_LAST; i++)
5212 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5213 ix86_tune_features[i] ? "on" : "off");
5216 parse_mtune_ctrl_str (dump);
5220 /* Default align_* from the processor table. */
5222 static void
5223 ix86_default_align (struct gcc_options *opts)
5225 if (opts->x_align_loops == 0)
5227 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5228 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5230 if (opts->x_align_jumps == 0)
5232 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5233 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5235 if (opts->x_align_functions == 0)
5237 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5241 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5243 static void
5244 ix86_override_options_after_change (void)
5246 ix86_default_align (&global_options);
5249 /* Override various settings based on options. If MAIN_ARGS_P, the
5250 options are from the command line, otherwise they are from
5251 attributes. Return true if there's an error related to march
5252 option. */
5254 static bool
5255 ix86_option_override_internal (bool main_args_p,
5256 struct gcc_options *opts,
5257 struct gcc_options *opts_set)
5259 int i;
5260 unsigned int ix86_arch_mask;
5261 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
5263 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5264 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5265 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5266 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5267 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5268 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5269 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5270 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5271 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5272 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5273 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5274 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5275 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5276 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5277 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5278 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5279 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5280 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5281 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5282 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5283 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5284 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5285 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5286 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5287 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5288 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5289 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5290 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5291 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5292 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5293 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5294 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5295 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5296 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5297 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5298 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5299 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5300 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5301 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5302 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5303 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5304 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5305 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5306 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5307 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5308 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5309 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5310 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5311 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5312 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5313 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5314 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5315 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5316 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5317 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5318 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5319 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5320 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5321 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5322 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5323 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5324 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5325 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5326 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5328 #define PTA_CORE2 \
5329 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5330 | PTA_CX16 | PTA_FXSR)
5331 #define PTA_NEHALEM \
5332 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5333 #define PTA_WESTMERE \
5334 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5335 #define PTA_SANDYBRIDGE \
5336 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5337 #define PTA_IVYBRIDGE \
5338 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5339 #define PTA_HASWELL \
5340 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5341 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5342 #define PTA_BROADWELL \
5343 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5344 #define PTA_SKYLAKE \
5345 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5346 #define PTA_SKYLAKE_AVX512 \
5347 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5348 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5349 #define PTA_KNL \
5350 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5351 #define PTA_BONNELL \
5352 (PTA_CORE2 | PTA_MOVBE)
5353 #define PTA_SILVERMONT \
5354 (PTA_WESTMERE | PTA_MOVBE)
5356 /* If this reaches 64, we need to widen the flags field in struct pta below.  */
5358 static struct pta
5360 const char *const name; /* processor name or nickname. */
5361 const enum processor_type processor;
5362 const enum attr_cpu schedule;
5363 const unsigned HOST_WIDE_INT flags;
5365 const processor_alias_table[] =
5367 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5368 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5369 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5370 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5371 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5372 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5373 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5374 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5375 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5376 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5377 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5378 PTA_MMX | PTA_SSE | PTA_FXSR},
5379 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5380 PTA_MMX | PTA_SSE | PTA_FXSR},
5381 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5382 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5383 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5384 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5385 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5386 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5387 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5388 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5389 PTA_MMX | PTA_SSE | PTA_FXSR},
5390 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5391 PTA_MMX | PTA_SSE | PTA_FXSR},
5392 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5393 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5394 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5395 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
5396 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5397 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5398 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5399 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5400 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5401 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5402 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5403 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5404 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5405 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5406 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5407 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5408 PTA_SANDYBRIDGE},
5409 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5410 PTA_SANDYBRIDGE},
5411 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5412 PTA_IVYBRIDGE},
5413 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5414 PTA_IVYBRIDGE},
5415 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5416 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5417 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5418 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5419 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5420 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5421 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5422 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5423 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5424 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5425 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5426 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5427 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5428 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5429 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5430 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5431 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5432 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5433 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5434 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5435 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5436 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5437 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5438 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5439 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5440 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5441 {"x86-64", PROCESSOR_K8, CPU_K8,
5442 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5443 {"eden-x2", PROCESSOR_K8, CPU_K8,
5444 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5445 {"nano", PROCESSOR_K8, CPU_K8,
5446 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5447 | PTA_SSSE3 | PTA_FXSR},
5448 {"nano-1000", PROCESSOR_K8, CPU_K8,
5449 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5450 | PTA_SSSE3 | PTA_FXSR},
5451 {"nano-2000", PROCESSOR_K8, CPU_K8,
5452 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5453 | PTA_SSSE3 | PTA_FXSR},
5454 {"nano-3000", PROCESSOR_K8, CPU_K8,
5455 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5456 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5457 {"nano-x2", PROCESSOR_K8, CPU_K8,
5458 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5459 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5460 {"eden-x4", PROCESSOR_K8, CPU_K8,
5461 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5462 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5463 {"nano-x4", PROCESSOR_K8, CPU_K8,
5464 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5465 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5466 {"k8", PROCESSOR_K8, CPU_K8,
5467 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5468 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5469 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5470 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5471 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5472 {"opteron", PROCESSOR_K8, CPU_K8,
5473 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5474 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5475 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5476 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5477 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5478 {"athlon64", PROCESSOR_K8, CPU_K8,
5479 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5480 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5481 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5482 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5483 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5484 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5485 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5486 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5487 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5488 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5489 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5490 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5491 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5492 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5493 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5494 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5495 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5496 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5497 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5498 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5499 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5500 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5501 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5502 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5503 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5504 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5505 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5506 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5507 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5508 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5509 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5510 | PTA_XSAVEOPT | PTA_FSGSBASE},
5511 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5512 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5513 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5514 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5515 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5516 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5517 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5518 | PTA_MOVBE | PTA_MWAITX},
5519 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5520 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5521 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5522 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5523 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5524 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5525 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5526 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5527 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5528 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5529 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5530 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
5531 | PTA_FXSR | PTA_XSAVE},
5532 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5533 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5534 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
5535 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5536 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5537 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5539 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5540 PTA_64BIT
5541 | PTA_HLE /* flags are only used for -march switch. */ },
5544 /* -mrecip options. */
5545 static struct
5547 const char *string; /* option name */
5548 unsigned int mask; /* mask bits to set */
5550 const recip_options[] =
5552 { "all", RECIP_MASK_ALL },
5553 { "none", RECIP_MASK_NONE },
5554 { "div", RECIP_MASK_DIV },
5555 { "sqrt", RECIP_MASK_SQRT },
5556 { "vec-div", RECIP_MASK_VEC_DIV },
5557 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5560 int const pta_size = ARRAY_SIZE (processor_alias_table);
5562 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5563 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5564 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5565 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5566 #ifdef TARGET_BI_ARCH
5567 else
5569 #if TARGET_BI_ARCH == 1
5570 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5571 is on and OPTION_MASK_ABI_X32 is off. We turn off
5572 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5573 -mx32. */
5574 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5575 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5576 #else
5577 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5578 on and OPTION_MASK_ABI_64 is off. We turn off
5579 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5580 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5581 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5582 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5583 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5584 #endif
5585 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5586 && TARGET_IAMCU_P (opts->x_target_flags))
5587 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5588 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5590 #endif
5592 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5594 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5595 OPTION_MASK_ABI_64 for TARGET_X32. */
5596 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5597 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5599 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5600 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5601 | OPTION_MASK_ABI_X32
5602 | OPTION_MASK_ABI_64);
5603 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5605 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5606 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5608 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5611 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5612 SUBTARGET_OVERRIDE_OPTIONS;
5613 #endif
5615 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5616 SUBSUBTARGET_OVERRIDE_OPTIONS;
5617 #endif
5619 /* -fPIC is the default for x86_64. */
5620 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5621 opts->x_flag_pic = 2;
5623 /* Need to check -mtune=generic first. */
5624 if (opts->x_ix86_tune_string)
5626 /* As special support for cross compilers we read -mtune=native
5627 as -mtune=generic. With native compilers we won't see the
5628 -mtune=native, as it was changed by the driver. */
5629 if (!strcmp (opts->x_ix86_tune_string, "native"))
5631 opts->x_ix86_tune_string = "generic";
5633 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5634 warning (OPT_Wdeprecated,
5635 main_args_p
5636 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5637 "or %<-mtune=generic%> instead as appropriate")
5638 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5639 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5640 " instead as appropriate"));
5642 else
5644 if (opts->x_ix86_arch_string)
5645 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5646 if (!opts->x_ix86_tune_string)
5648 opts->x_ix86_tune_string
5649 = processor_target_table[TARGET_CPU_DEFAULT].name;
5650 ix86_tune_defaulted = 1;
5653 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5654 or defaulted. We need to use a sensible tune option. */
5655 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5657 opts->x_ix86_tune_string = "generic";
5661 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5662 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5664 /* rep; movq isn't available in 32-bit code. */
5665 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5666 opts->x_ix86_stringop_alg = no_stringop;
5669 if (!opts->x_ix86_arch_string)
5670 opts->x_ix86_arch_string
5671 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5672 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5673 else
5674 ix86_arch_specified = 1;
5676 if (opts_set->x_ix86_pmode)
5678 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5679 && opts->x_ix86_pmode == PMODE_SI)
5680 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5681 && opts->x_ix86_pmode == PMODE_DI))
5682 error ("address mode %qs not supported in the %s bit mode",
5683 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5684 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5686 else
5687 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5688 ? PMODE_DI : PMODE_SI;
5690 if (!opts_set->x_ix86_abi)
5691 opts->x_ix86_abi = DEFAULT_ABI;
5693 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
5694 error ("-mabi=ms not supported with X32 ABI");
5695 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
5697 /* For targets using the MS ABI, enable MS extensions unless they were
5698 explicitly turned off.  For non-MS ABIs we turn this
5699 option off.  */
5700 if (!opts_set->x_flag_ms_extensions)
5701 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
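/* Validate an explicitly given -mcmodel= setting: substitute the PIC
variant of the model when -fpic is in effect, and reject code models
that the selected 32-bit, x32 or 64-bit mode does not support.  */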
5703 if (opts_set->x_ix86_cmodel)
5705 switch (opts->x_ix86_cmodel)
5707 case CM_SMALL:
5708 case CM_SMALL_PIC:
5709 if (opts->x_flag_pic)
5710 opts->x_ix86_cmodel = CM_SMALL_PIC;
5711 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5712 error ("code model %qs not supported in the %s bit mode",
5713 "small", "32");
5714 break;
5716 case CM_MEDIUM:
5717 case CM_MEDIUM_PIC:
5718 if (opts->x_flag_pic)
5719 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5720 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5721 error ("code model %qs not supported in the %s bit mode",
5722 "medium", "32");
5723 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5724 error ("code model %qs not supported in x32 mode",
5725 "medium");
5726 break;
5728 case CM_LARGE:
5729 case CM_LARGE_PIC:
5730 if (opts->x_flag_pic)
5731 opts->x_ix86_cmodel = CM_LARGE_PIC;
5732 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5733 error ("code model %qs not supported in the %s bit mode",
5734 "large", "32");
5735 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5736 error ("code model %qs not supported in x32 mode",
5737 "large");
5738 break;
5740 case CM_32:
5741 if (opts->x_flag_pic)
5742 error ("code model %s does not support PIC mode", "32");
5743 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5744 error ("code model %qs not supported in the %s bit mode",
5745 "32", "64");
5746 break;
5748 case CM_KERNEL:
5749 if (opts->x_flag_pic)
5751 error ("code model %s does not support PIC mode", "kernel");
5752 opts->x_ix86_cmodel = CM_32;
5754 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5755 error ("code model %qs not supported in the %s bit mode",
5756 "kernel", "32");
5757 break;
5759 default:
5760 gcc_unreachable ();
5763 else
5765 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5766 use of rip-relative addressing. This eliminates fixups that
5767 would otherwise be needed if this object is to be placed in a
5768 DLL, and is essentially just as efficient as direct addressing. */
5769 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5770 && (TARGET_RDOS || TARGET_PECOFF))
5771 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5772 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5773 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5774 else
5775 opts->x_ix86_cmodel = CM_32;
5777 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5779 error ("-masm=intel not supported in this configuration");
5780 opts->x_ix86_asm_dialect = ASM_ATT;
5782 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5783 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5784 sorry ("%i-bit mode not compiled in",
5785 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5787 for (i = 0; i < pta_size; i++)
5788 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5790 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5792 error (main_args_p
5793 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5794 "switch")
5795 : G_("%<generic%> CPU can be used only for "
5796 "%<target(\"tune=\")%> attribute"));
5797 return false;
5799 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5801 error (main_args_p
5802 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5803 "switch")
5804 : G_("%<intel%> CPU can be used only for "
5805 "%<target(\"tune=\")%> attribute"));
5806 return false;
5809 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5810 && !(processor_alias_table[i].flags & PTA_64BIT))
5812 error ("CPU you selected does not support x86-64 "
5813 "instruction set");
5814 return false;
5817 ix86_schedule = processor_alias_table[i].schedule;
5818 ix86_arch = processor_alias_table[i].processor;
5819 /* Default cpu tuning to the architecture. */
5820 ix86_tune = ix86_arch;
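/* Enable each ISA extension implied by the selected -march entry,
unless the user explicitly enabled or disabled that extension on the
command line (tracked in the *_explicit masks).  */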
5822 if (processor_alias_table[i].flags & PTA_MMX
5823 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5824 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5825 if (processor_alias_table[i].flags & PTA_3DNOW
5826 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5827 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5828 if (processor_alias_table[i].flags & PTA_3DNOW_A
5829 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5830 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5831 if (processor_alias_table[i].flags & PTA_SSE
5832 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5833 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5834 if (processor_alias_table[i].flags & PTA_SSE2
5835 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5836 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5837 if (processor_alias_table[i].flags & PTA_SSE3
5838 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5839 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5840 if (processor_alias_table[i].flags & PTA_SSSE3
5841 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5842 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5843 if (processor_alias_table[i].flags & PTA_SSE4_1
5844 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5845 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5846 if (processor_alias_table[i].flags & PTA_SSE4_2
5847 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5848 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5849 if (processor_alias_table[i].flags & PTA_AVX
5850 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5851 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5852 if (processor_alias_table[i].flags & PTA_AVX2
5853 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5854 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5855 if (processor_alias_table[i].flags & PTA_FMA
5856 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5857 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5858 if (processor_alias_table[i].flags & PTA_SSE4A
5859 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5860 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5861 if (processor_alias_table[i].flags & PTA_FMA4
5862 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5863 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5864 if (processor_alias_table[i].flags & PTA_XOP
5865 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5866 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5867 if (processor_alias_table[i].flags & PTA_LWP
5868 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5869 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5870 if (processor_alias_table[i].flags & PTA_ABM
5871 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5872 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5873 if (processor_alias_table[i].flags & PTA_BMI
5874 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5875 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5876 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5877 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5878 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5879 if (processor_alias_table[i].flags & PTA_TBM
5880 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5881 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5882 if (processor_alias_table[i].flags & PTA_BMI2
5883 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5884 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5885 if (processor_alias_table[i].flags & PTA_CX16
5886 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5887 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5888 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5889 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5890 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5891 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5892 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5893 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5894 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5895 if (processor_alias_table[i].flags & PTA_MOVBE
5896 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5897 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5898 if (processor_alias_table[i].flags & PTA_AES
5899 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5900 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5901 if (processor_alias_table[i].flags & PTA_SHA
5902 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5903 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5904 if (processor_alias_table[i].flags & PTA_PCLMUL
5905 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5906 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5907 if (processor_alias_table[i].flags & PTA_FSGSBASE
5908 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5909 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5910 if (processor_alias_table[i].flags & PTA_RDRND
5911 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5912 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5913 if (processor_alias_table[i].flags & PTA_F16C
5914 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5915 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5916 if (processor_alias_table[i].flags & PTA_RTM
5917 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5918 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5919 if (processor_alias_table[i].flags & PTA_HLE
5920 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5921 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5922 if (processor_alias_table[i].flags & PTA_PRFCHW
5923 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5924 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5925 if (processor_alias_table[i].flags & PTA_RDSEED
5926 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5927 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5928 if (processor_alias_table[i].flags & PTA_ADX
5929 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5930 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5931 if (processor_alias_table[i].flags & PTA_FXSR
5932 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5933 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5934 if (processor_alias_table[i].flags & PTA_XSAVE
5935 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5937 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5940 if (processor_alias_table[i].flags & PTA_AVX512F
5941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5943 if (processor_alias_table[i].flags & PTA_AVX512ER
5944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5946 if (processor_alias_table[i].flags & PTA_AVX512PF
5947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5949 if (processor_alias_table[i].flags & PTA_AVX512CD
5950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5952 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5955 if (processor_alias_table[i].flags & PTA_CLWB
5956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5958 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5961 if (processor_alias_table[i].flags & PTA_CLZERO
5962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5964 if (processor_alias_table[i].flags & PTA_XSAVEC
5965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5967 if (processor_alias_table[i].flags & PTA_XSAVES
5968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5970 if (processor_alias_table[i].flags & PTA_AVX512DQ
5971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5973 if (processor_alias_table[i].flags & PTA_AVX512BW
5974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5976 if (processor_alias_table[i].flags & PTA_AVX512VL
5977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5979 if (processor_alias_table[i].flags & PTA_MPX
5980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5982 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5985 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5986 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5987 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5989 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5990 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5991 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5992 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5993 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5994 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5995 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5996 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5997 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5998 if (processor_alias_table[i].flags & PTA_SGX
5999 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
6000 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
6002 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
6003 x86_prefetch_sse = true;
6004 if (processor_alias_table[i].flags & PTA_MWAITX
6005 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
6006 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
6007 if (processor_alias_table[i].flags & PTA_PKU
6008 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
6009 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6011 /* Don't enable x87 instructions if only
6012 general registers are allowed. */
6013 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6014 && !(opts_set->x_target_flags & MASK_80387))
6016 if (processor_alias_table[i].flags & PTA_NO_80387)
6017 opts->x_target_flags &= ~MASK_80387;
6018 else
6019 opts->x_target_flags |= MASK_80387;
6021 break;
6024 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6025 error ("Intel MPX does not support x32");
6030 if (i == pta_size)
6032 error (main_args_p
6033 ? G_("bad value (%qs) for %<-march=%> switch")
6034 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6035 opts->x_ix86_arch_string);
6037 auto_vec <const char *> candidates;
6038 for (i = 0; i < pta_size; i++)
6039 if (strcmp (processor_alias_table[i].name, "generic")
6040 && strcmp (processor_alias_table[i].name, "intel")
6041 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6042 || (processor_alias_table[i].flags & PTA_64BIT)))
6043 candidates.safe_push (processor_alias_table[i].name);
6045 char *s;
6046 const char *hint
6047 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6048 if (hint)
6049 inform (input_location,
6050 main_args_p
6051 ? G_("valid arguments to %<-march=%> switch are: "
6052 "%s; did you mean %qs?")
6053 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6054 "%s; did you mean %qs?"), s, hint);
6055 else
6056 inform (input_location,
6057 main_args_p
6058 ? G_("valid arguments to %<-march=%> switch are: %s")
6059 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6060 "are: %s"), s);
6061 XDELETEVEC (s);
6064 ix86_arch_mask = 1u << ix86_arch;
6065 for (i = 0; i < X86_ARCH_LAST; ++i)
6066 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6068 for (i = 0; i < pta_size; i++)
6069 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6071 ix86_schedule = processor_alias_table[i].schedule;
6072 ix86_tune = processor_alias_table[i].processor;
6073 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6075 if (!(processor_alias_table[i].flags & PTA_64BIT))
6077 if (ix86_tune_defaulted)
6079 opts->x_ix86_tune_string = "x86-64";
6080 for (i = 0; i < pta_size; i++)
6081 if (! strcmp (opts->x_ix86_tune_string,
6082 processor_alias_table[i].name))
6083 break;
6084 ix86_schedule = processor_alias_table[i].schedule;
6085 ix86_tune = processor_alias_table[i].processor;
6087 else
6088 error ("CPU you selected does not support x86-64 "
6089 "instruction set");
6092 /* Intel CPUs have always interpreted SSE prefetch instructions as
6093 NOPs; so, we can enable SSE prefetch instructions even when
6094 -mtune (rather than -march) points us to a processor that has them.
6095 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6096 higher processors. */
6097 if (TARGET_CMOV
6098 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6099 x86_prefetch_sse = true;
6100 break;
6103 if (ix86_tune_specified && i == pta_size)
6105 error (main_args_p
6106 ? G_("bad value (%qs) for %<-mtune=%> switch")
6107 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6108 opts->x_ix86_tune_string);
6110 auto_vec <const char *> candidates;
6111 for (i = 0; i < pta_size; i++)
6112 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6113 || (processor_alias_table[i].flags & PTA_64BIT))
6114 candidates.safe_push (processor_alias_table[i].name);
6116 char *s;
6117 const char *hint
6118 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6119 if (hint)
6120 inform (input_location,
6121 main_args_p
6122 ? G_("valid arguments to %<-mtune=%> switch are: "
6123 "%s; did you mean %qs?")
6124 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6125 "%s; did you mean %qs?"), s, hint);
6126 else
6127 inform (input_location,
6128 main_args_p
6129 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6130 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6131 "are: %s"), s);
6132 XDELETEVEC (s);
6135 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6137 #ifndef USE_IX86_FRAME_POINTER
6138 #define USE_IX86_FRAME_POINTER 0
6139 #endif
6141 #ifndef USE_X86_64_FRAME_POINTER
6142 #define USE_X86_64_FRAME_POINTER 0
6143 #endif
6145 /* Set the default values for switches whose default depends on TARGET_64BIT
6146 in case they weren't overwritten by command line options. */
6147 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6149 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6150 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6151 if (opts->x_flag_asynchronous_unwind_tables
6152 && !opts_set->x_flag_unwind_tables
6153 && TARGET_64BIT_MS_ABI)
6154 opts->x_flag_unwind_tables = 1;
6155 if (opts->x_flag_asynchronous_unwind_tables == 2)
6156 opts->x_flag_unwind_tables
6157 = opts->x_flag_asynchronous_unwind_tables = 1;
6158 if (opts->x_flag_pcc_struct_return == 2)
6159 opts->x_flag_pcc_struct_return = 0;
6161 else
6163 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6164 opts->x_flag_omit_frame_pointer
6165 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6166 if (opts->x_flag_asynchronous_unwind_tables == 2)
6167 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6168 if (opts->x_flag_pcc_struct_return == 2)
6170 /* Intel MCU psABI specifies that -freg-struct-return should
6171 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6172 we check -miamcu so that -freg-struct-return is always
6173 turned on if -miamcu is used. */
6174 if (TARGET_IAMCU_P (opts->x_target_flags))
6175 opts->x_flag_pcc_struct_return = 0;
6176 else
6177 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6181 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6182 /* TODO: ix86_cost should be chosen at instruction or function granularity
6183 so that for cold code we use size_cost even in !optimize_size compilations.  */
6184 if (opts->x_optimize_size)
6185 ix86_cost = &ix86_size_cost;
6186 else
6187 ix86_cost = ix86_tune_cost;
6189 /* Arrange to set up i386_stack_locals for all functions. */
6190 init_machine_status = ix86_init_machine_status;
6192 /* Validate -mregparm= value. */
6193 if (opts_set->x_ix86_regparm)
6195 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6196 warning (0, "-mregparm is ignored in 64-bit mode");
6197 else if (TARGET_IAMCU_P (opts->x_target_flags))
6198 warning (0, "-mregparm is ignored for Intel MCU psABI");
6199 if (opts->x_ix86_regparm > REGPARM_MAX)
6201 error ("-mregparm=%d is not between 0 and %d",
6202 opts->x_ix86_regparm, REGPARM_MAX);
6203 opts->x_ix86_regparm = 0;
6206 if (TARGET_IAMCU_P (opts->x_target_flags)
6207 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6208 opts->x_ix86_regparm = REGPARM_MAX;
6210 /* Default align_* from the processor table. */
6211 ix86_default_align (opts);
6213 /* Provide default for -mbranch-cost= value. */
6214 if (!opts_set->x_ix86_branch_cost)
6215 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6217 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6219 opts->x_target_flags
6220 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6222 /* Enable by default the SSE and MMX builtins. Do allow the user to
6223 explicitly disable any of these. In particular, disabling SSE and
6224 MMX for kernel code is extremely useful. */
6225 if (!ix86_arch_specified)
6226 opts->x_ix86_isa_flags
6227 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6228 | TARGET_SUBTARGET64_ISA_DEFAULT)
6229 & ~opts->x_ix86_isa_flags_explicit);
6231 if (TARGET_RTD_P (opts->x_target_flags))
6232 warning (0,
6233 main_args_p
6234 ? G_("%<-mrtd%> is ignored in 64bit mode")
6235 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
6237 else
6239 opts->x_target_flags
6240 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6242 if (!ix86_arch_specified)
6243 opts->x_ix86_isa_flags
6244 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
6246 /* The i386 ABI does not specify a red zone.  It still makes sense to use one
6247 when the programmer takes care to keep the stack from being clobbered.  */
6248 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6249 opts->x_target_flags |= MASK_NO_RED_ZONE;
6252 /* Keep nonleaf frame pointers. */
6253 if (opts->x_flag_omit_frame_pointer)
6254 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6255 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6256 opts->x_flag_omit_frame_pointer = 1;
6258 /* If we're doing fast math, we don't care about comparison order
6259 wrt NaNs. This lets us use a shorter comparison sequence. */
6260 if (opts->x_flag_finite_math_only)
6261 opts->x_target_flags &= ~MASK_IEEE_FP;
6263 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6264 since the insns won't need emulation. */
6265 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6266 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6268 /* Likewise, if the target doesn't have a 387, or we've specified
6269 software floating point, don't use 387 inline intrinsics. */
6270 if (!TARGET_80387_P (opts->x_target_flags))
6271 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6273 /* Turn on MMX builtins for -msse. */
6274 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6275 opts->x_ix86_isa_flags
6276 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6278 /* Enable SSE prefetch. */
6279 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6280 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6281 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6282 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6283 x86_prefetch_sse = true;
6285 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6286 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6287 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6288 opts->x_ix86_isa_flags
6289 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6291 /* Enable lzcnt instruction for -mabm. */
6292 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
6293 opts->x_ix86_isa_flags
6294 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6296 /* Disable BMI, BMI2 and TBM instructions for -m16. */
6297 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
6298 opts->x_ix86_isa_flags
6299 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
6300 & ~opts->x_ix86_isa_flags_explicit);
6302 /* Validate -mpreferred-stack-boundary= value or default it to
6303 PREFERRED_STACK_BOUNDARY_DEFAULT. */
6304 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6305 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6307 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
6308 int max = TARGET_SEH ? 4 : 12;
6310 if (opts->x_ix86_preferred_stack_boundary_arg < min
6311 || opts->x_ix86_preferred_stack_boundary_arg > max)
6313 if (min == max)
6314 error ("-mpreferred-stack-boundary is not supported "
6315 "for this target");
6316 else
6317 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6318 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6320 else
6321 ix86_preferred_stack_boundary
6322 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
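/* For example, -mpreferred-stack-boundary=4 yields (1 << 4) * 8 = 128
bits, i.e. a 16-byte-aligned stack.  */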
6325 /* Set the default value for -mstackrealign. */
6326 if (!opts_set->x_ix86_force_align_arg_pointer)
6327 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6329 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6331 /* Validate -mincoming-stack-boundary= value or default it to
6332 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6333 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6334 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6336 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6338 if (opts->x_ix86_incoming_stack_boundary_arg < min
6339 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6340 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6341 opts->x_ix86_incoming_stack_boundary_arg, min);
6342 else
6344 ix86_user_incoming_stack_boundary
6345 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6346 ix86_incoming_stack_boundary
6347 = ix86_user_incoming_stack_boundary;
6351 #ifndef NO_PROFILE_COUNTERS
6352 if (flag_nop_mcount)
6353 error ("-mnop-mcount is not compatible with this target");
6354 #endif
6355 if (flag_nop_mcount && flag_pic)
6356 error ("-mnop-mcount is not implemented for -fPIC");
6358 /* Accept -msseregparm only if at least SSE support is enabled. */
6359 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6360 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6361 error (main_args_p
6362 ? G_("%<-msseregparm%> used without SSE enabled")
6363 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6365 if (opts_set->x_ix86_fpmath)
6367 if (opts->x_ix86_fpmath & FPMATH_SSE)
6369 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6371 if (TARGET_80387_P (opts->x_target_flags))
6373 warning (0, "SSE instruction set disabled, using 387 arithmetics");
6374 opts->x_ix86_fpmath = FPMATH_387;
6377 else if ((opts->x_ix86_fpmath & FPMATH_387)
6378 && !TARGET_80387_P (opts->x_target_flags))
6380 warning (0, "387 instruction set disabled, using SSE arithmetics");
6381 opts->x_ix86_fpmath = FPMATH_SSE;
6385 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6386 -mfpmath=387.  The latter is nevertheless the default on many targets,
6387 since the extra 80-bit precision of temporaries is considered part of the ABI.
6388 Override the default at least for -ffast-math.
6389 TODO: -mfpmath=both seems to produce similarly performing code with
6390 slightly smaller binaries.  It is, however, not clear whether register
6391 allocation is ready for this setting.
6392 Also, -mfpmath=387 code is overall considerably more compact (about 4-5%)
6393 than SSE code.  We may switch to 387 with -ffast-math for size-optimized
6394 functions.  */
6395 else if (fast_math_flags_set_p (&global_options)
6396 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6397 opts->x_ix86_fpmath = FPMATH_SSE;
6398 else
6399 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6401 /* Use external vectorized library in vectorizing intrinsics. */
6402 if (opts_set->x_ix86_veclibabi_type)
6403 switch (opts->x_ix86_veclibabi_type)
6405 case ix86_veclibabi_type_svml:
6406 ix86_veclib_handler = ix86_veclibabi_svml;
6407 break;
6409 case ix86_veclibabi_type_acml:
6410 ix86_veclib_handler = ix86_veclibabi_acml;
6411 break;
6413 default:
6414 gcc_unreachable ();
6417 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6418 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6419 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6421 /* If stack probes are required, the space used for large function
6422 arguments on the stack must also be probed, so enable
6423 -maccumulate-outgoing-args so this happens in the prologue. */
6424 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6425 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6427 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6428 warning (0,
6429 main_args_p
6430 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6431 "for correctness")
6432 : G_("stack probing requires "
6433 "%<target(\"accumulate-outgoing-args\")%> for "
6434 "correctness"));
6435 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6438 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6439 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6440 if (fixed_regs[BP_REG]
6441 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6443 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6444 warning (0,
6445 main_args_p
6446 ? G_("fixed ebp register requires "
6447 "%<-maccumulate-outgoing-args%>")
6448 : G_("fixed ebp register requires "
6449 "%<target(\"accumulate-outgoing-args\")%>"));
6450 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6453 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6455 char *p;
6456 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6457 p = strchr (internal_label_prefix, 'X');
6458 internal_label_prefix_len = p - internal_label_prefix;
6459 *p = '\0';
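/* On a typical ELF configuration the generated label text is something
like "*.LX0", so the stored prefix ends up as "*.L"; the exact form is
target-dependent.  */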
6462 /* When no scheduling description is available, disable the scheduler pass
6463 so it won't slow down compilation and make x87 code slower.  */
6464 if (!TARGET_SCHEDULE)
6465 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6467 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6468 ix86_tune_cost->simultaneous_prefetches,
6469 opts->x_param_values,
6470 opts_set->x_param_values);
6471 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6472 ix86_tune_cost->prefetch_block,
6473 opts->x_param_values,
6474 opts_set->x_param_values);
6475 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6476 ix86_tune_cost->l1_cache_size,
6477 opts->x_param_values,
6478 opts_set->x_param_values);
6479 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6480 ix86_tune_cost->l2_cache_size,
6481 opts->x_param_values,
6482 opts_set->x_param_values);
6484 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
6485 if (opts->x_flag_prefetch_loop_arrays < 0
6486 && HAVE_prefetch
6487 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6488 && !opts->x_optimize_size
6489 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6490 opts->x_flag_prefetch_loop_arrays = 1;
6492 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6493 can be optimized to ap = __builtin_next_arg (0).  */
6494 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6495 targetm.expand_builtin_va_start = NULL;
6497 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6499 ix86_gen_leave = gen_leave_rex64;
6500 if (Pmode == DImode)
6502 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6503 ix86_gen_tls_local_dynamic_base_64
6504 = gen_tls_local_dynamic_base_64_di;
6506 else
6508 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6509 ix86_gen_tls_local_dynamic_base_64
6510 = gen_tls_local_dynamic_base_64_si;
6513 else
6514 ix86_gen_leave = gen_leave;
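/* Note that x32 defaults to Pmode == SImode (PMODE_SI above) even though
TARGET_64BIT is set, so the SImode expanders below are also used for x32
unless -maddress-mode=long is given.  */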
6516 if (Pmode == DImode)
6518 ix86_gen_add3 = gen_adddi3;
6519 ix86_gen_sub3 = gen_subdi3;
6520 ix86_gen_sub3_carry = gen_subdi3_carry;
6521 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6522 ix86_gen_andsp = gen_anddi3;
6523 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6524 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6525 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6526 ix86_gen_monitor = gen_sse3_monitor_di;
6527 ix86_gen_monitorx = gen_monitorx_di;
6528 ix86_gen_clzero = gen_clzero_di;
6530 else
6532 ix86_gen_add3 = gen_addsi3;
6533 ix86_gen_sub3 = gen_subsi3;
6534 ix86_gen_sub3_carry = gen_subsi3_carry;
6535 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6536 ix86_gen_andsp = gen_andsi3;
6537 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6538 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6539 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6540 ix86_gen_monitor = gen_sse3_monitor_si;
6541 ix86_gen_monitorx = gen_monitorx_si;
6542 ix86_gen_clzero = gen_clzero_si;
6545 #ifdef USE_IX86_CLD
6546 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6547 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6548 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6549 #endif
6551 /* Set the default value for -mfentry. */
6552 if (!opts_set->x_flag_fentry)
6553 opts->x_flag_fentry = TARGET_SEH;
6554 else
6556 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
6557 && opts->x_flag_fentry)
6558 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6559 "with -fpic");
6560 else if (TARGET_SEH && !opts->x_flag_fentry)
6561 sorry ("-mno-fentry isn%'t compatible with SEH");
6564 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6565 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6567 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6568 opts->x_target_flags |= MASK_VZEROUPPER;
6569 if (!(opts_set->x_target_flags & MASK_STV))
6570 opts->x_target_flags |= MASK_STV;
6571 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6572 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
6573 stack realignment would be an extra cost the pass doesn't take into
6574 account, and the pass can't realign the stack. */
6575 if (ix86_preferred_stack_boundary < 128
6576 || ix86_incoming_stack_boundary < 128
6577 || opts->x_ix86_force_align_arg_pointer)
6578 opts->x_target_flags &= ~MASK_STV;
6579 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6580 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6581 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6582 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6583 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6584 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6585 /* Enable 128-bit AVX instruction generation
6586 for the auto-vectorizer. */
6587 if (TARGET_AVX128_OPTIMAL
6588 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6589 opts->x_target_flags |= MASK_PREFER_AVX128;
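/* For example, an option string such as "-mrecip=default,!sqrt" is handled
   by the parsing loop below: "default" selects RECIP_MASK_ALL, and a "!"
   prefix on a recip_options entry (assumed here to be named "sqrt") clears
   the corresponding bits again.  Everything set or cleared explicitly is
   also recorded in x_recip_mask_explicit, so the blanket -mrecip /
   -mno-recip handling further down does not override it.  */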
6591 if (opts->x_ix86_recip_name)
6593 char *p = ASTRDUP (opts->x_ix86_recip_name);
6594 char *q;
6595 unsigned int mask, i;
6596 bool invert;
6598 while ((q = strtok (p, ",")) != NULL)
6600 p = NULL;
6601 if (*q == '!')
6603 invert = true;
6604 q++;
6606 else
6607 invert = false;
6609 if (!strcmp (q, "default"))
6610 mask = RECIP_MASK_ALL;
6611 else
6613 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6614 if (!strcmp (q, recip_options[i].string))
6616 mask = recip_options[i].mask;
6617 break;
6620 if (i == ARRAY_SIZE (recip_options))
6622 error ("unknown option for -mrecip=%s", q);
6623 invert = false;
6624 mask = RECIP_MASK_NONE;
6628 opts->x_recip_mask_explicit |= mask;
6629 if (invert)
6630 opts->x_recip_mask &= ~mask;
6631 else
6632 opts->x_recip_mask |= mask;
6636 if (TARGET_RECIP_P (opts->x_target_flags))
6637 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6638 else if (opts_set->x_target_flags & MASK_RECIP)
6639 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6641 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6642 for 64-bit Bionic. Also default long double to 64-bit for Intel
6643 MCU psABI. */
6644 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6645 && !(opts_set->x_target_flags
6646 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6647 opts->x_target_flags |= (TARGET_64BIT
6648 ? MASK_LONG_DOUBLE_128
6649 : MASK_LONG_DOUBLE_64);
6651 /* Only one of them can be active. */
6652 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6653 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6655 /* Handle stack protector */
6656 if (!opts_set->x_ix86_stack_protector_guard)
6657 opts->x_ix86_stack_protector_guard
6658 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6660 #ifdef TARGET_THREAD_SSP_OFFSET
6661 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
6662 #endif
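/* As an illustration of the parsing below: a build passing, say,
   -mstack-protector-guard-reg=%gs -mstack-protector-guard-offset=40
   (example values only) ends up with ix86_stack_protector_guard_reg set to
   ADDR_SPACE_SEG_GS and ix86_stack_protector_guard_offset set to 40.  The
   "%" prefix on the register is optional, only "fs" and "gs" are accepted,
   and the offset must fit in a signed 32-bit range.  */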
6664 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
6666 char *endp;
6667 const char *str = ix86_stack_protector_guard_offset_str;
6669 errno = 0;
6670 int64_t offset;
6672 #if defined(INT64_T_IS_LONG)
6673 offset = strtol (str, &endp, 0);
6674 #else
6675 offset = strtoll (str, &endp, 0);
6676 #endif
6678 if (!*str || *endp || errno)
6679 error ("%qs is not a valid number "
6680 "in -mstack-protector-guard-offset=", str);
6682 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
6683 HOST_WIDE_INT_C (0x7fffffff)))
6684 error ("%qs is not a valid offset "
6685 "in -mstack-protector-guard-offset=", str);
6687 ix86_stack_protector_guard_offset = offset;
6690 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
6692 /* The kernel uses a different segment register for performance
6693 reasons: this way a system call does not have to trash the userspace
6694 segment register, which would be expensive. */
6695 if (ix86_cmodel == CM_KERNEL)
6696 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
6698 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
6700 const char *str = ix86_stack_protector_guard_reg_str;
6701 addr_space_t seg = ADDR_SPACE_GENERIC;
6703 /* Discard optional register prefix. */
6704 if (str[0] == '%')
6705 str++;
6707 if (strlen (str) == 2 && str[1] == 's')
6709 if (str[0] == 'f')
6710 seg = ADDR_SPACE_SEG_FS;
6711 else if (str[0] == 'g')
6712 seg = ADDR_SPACE_SEG_GS;
6715 if (seg == ADDR_SPACE_GENERIC)
6716 error ("%qs is not a valid base register "
6717 "in -mstack-protector-guard-reg=",
6718 ix86_stack_protector_guard_reg_str);
6720 ix86_stack_protector_guard_reg = seg;
6723 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
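/* The strategy strings are parsed by ix86_parse_stringop_strategy_string.
   Per the option documentation the expected form is a comma-separated list
   of alg:max_size:dest_align triplets, e.g. something like
   "-mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign", which
   would mean: use an unrolled loop for copies up to 256 bytes and call the
   library routine for anything larger (-1 meaning no upper bound).  */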
6724 if (opts->x_ix86_tune_memcpy_strategy)
6726 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6727 ix86_parse_stringop_strategy_string (str, false);
6728 free (str);
6731 if (opts->x_ix86_tune_memset_strategy)
6733 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6734 ix86_parse_stringop_strategy_string (str, true);
6735 free (str);
6738 /* Save the initial options in case the user uses function-specific
6739 options. */
6740 if (main_args_p)
6741 target_option_default_node = target_option_current_node
6742 = build_target_option_node (opts);
6744 return true;
6747 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6749 static void
6750 ix86_option_override (void)
6752 ix86_option_override_internal (true, &global_options, &global_options_set);
6755 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6756 static char *
6757 ix86_offload_options (void)
6759 if (TARGET_LP64)
6760 return xstrdup ("-foffload-abi=lp64");
6761 return xstrdup ("-foffload-abi=ilp32");
6764 /* Update register usage after having seen the compiler flags. */
6766 static void
6767 ix86_conditional_register_usage (void)
6769 int i, c_mask;
6771 /* If there are no caller-saved registers, preserve all registers
6772 except fixed_regs and registers used for the function return value,
6773 since aggregate_value_p checks call_used_regs[regno] on the return
6774 value. */
6775 if (cfun && cfun->machine->no_caller_saved_registers)
6776 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6777 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6778 call_used_regs[i] = 0;
6780 /* For 32-bit targets, squash the REX registers. */
6781 if (! TARGET_64BIT)
6783 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6784 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6785 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6786 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6787 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6788 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6791 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6792 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6794 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6796 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6798 /* Set/reset conditionally defined registers from
6799 CALL_USED_REGISTERS initializer. */
6800 if (call_used_regs[i] > 1)
6801 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6803 /* Calculate registers of CLOBBERED_REGS register set
6804 as call used registers from GENERAL_REGS register set. */
6805 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6806 && call_used_regs[i])
6807 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6810 /* If MMX is disabled, squash the registers. */
6811 if (! TARGET_MMX)
6812 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6813 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6814 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6816 /* If SSE is disabled, squash the registers. */
6817 if (! TARGET_SSE)
6818 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6819 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6820 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6822 /* If the FPU is disabled, squash the registers. */
6823 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6824 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6825 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6826 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6828 /* If AVX512F is disabled, squash the registers. */
6829 if (! TARGET_AVX512F)
6831 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6832 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6834 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6835 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6838 /* If MPX is disabled, squash the registers. */
6839 if (! TARGET_MPX)
6840 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6841 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6845 /* Save the current options */
6847 static void
6848 ix86_function_specific_save (struct cl_target_option *ptr,
6849 struct gcc_options *opts)
6851 ptr->arch = ix86_arch;
6852 ptr->schedule = ix86_schedule;
6853 ptr->prefetch_sse = x86_prefetch_sse;
6854 ptr->tune = ix86_tune;
6855 ptr->branch_cost = ix86_branch_cost;
6856 ptr->tune_defaulted = ix86_tune_defaulted;
6857 ptr->arch_specified = ix86_arch_specified;
6858 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6859 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6860 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6861 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6862 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6863 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6864 ptr->x_ix86_abi = opts->x_ix86_abi;
6865 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6866 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6867 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6868 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6869 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6870 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6871 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6872 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6873 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6874 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6875 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6876 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6877 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6878 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6879 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6880 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6881 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6882 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6883 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6884 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6886 /* The fields are char but the variables are not; make sure the
6887 values fit in the fields. */
6888 gcc_assert (ptr->arch == ix86_arch);
6889 gcc_assert (ptr->schedule == ix86_schedule);
6890 gcc_assert (ptr->tune == ix86_tune);
6891 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6894 /* Restore the current options */
6896 static void
6897 ix86_function_specific_restore (struct gcc_options *opts,
6898 struct cl_target_option *ptr)
6900 enum processor_type old_tune = ix86_tune;
6901 enum processor_type old_arch = ix86_arch;
6902 unsigned int ix86_arch_mask;
6903 int i;
6905 /* We don't change -fPIC. */
6906 opts->x_flag_pic = flag_pic;
6908 ix86_arch = (enum processor_type) ptr->arch;
6909 ix86_schedule = (enum attr_cpu) ptr->schedule;
6910 ix86_tune = (enum processor_type) ptr->tune;
6911 x86_prefetch_sse = ptr->prefetch_sse;
6912 opts->x_ix86_branch_cost = ptr->branch_cost;
6913 ix86_tune_defaulted = ptr->tune_defaulted;
6914 ix86_arch_specified = ptr->arch_specified;
6915 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6916 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6917 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6918 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6919 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6920 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6921 opts->x_ix86_abi = ptr->x_ix86_abi;
6922 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6923 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6924 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6925 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6926 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6927 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6928 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6929 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6930 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6931 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6932 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6933 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6934 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6935 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6936 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6937 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6938 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6939 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6940 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6941 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6942 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6943 /* TODO: ix86_cost should be chosen at instruction or function granularity
6944 so that for cold code we use size_cost even in !optimize_size compilation. */
6945 if (opts->x_optimize_size)
6946 ix86_cost = &ix86_size_cost;
6947 else
6948 ix86_cost = ix86_tune_cost;
6950 /* Recreate the arch feature tests if the arch changed */
6951 if (old_arch != ix86_arch)
6953 ix86_arch_mask = 1u << ix86_arch;
6954 for (i = 0; i < X86_ARCH_LAST; ++i)
6955 ix86_arch_features[i]
6956 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6959 /* Recreate the tune optimization tests */
6960 if (old_tune != ix86_tune)
6961 set_ix86_tune_features (ix86_tune, false);
6964 /* Adjust target options after streaming them in. This is mainly about
6965 reconciling them with global options. */
6967 static void
6968 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6970 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6971 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6972 for PIC, or error out. */
6973 if (flag_pic)
6974 switch (ptr->x_ix86_cmodel)
6976 case CM_SMALL:
6977 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6978 break;
6980 case CM_MEDIUM:
6981 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6982 break;
6984 case CM_LARGE:
6985 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6986 break;
6988 case CM_KERNEL:
6989 error ("code model %s does not support PIC mode", "kernel");
6990 break;
6992 default:
6993 break;
6995 else
6996 switch (ptr->x_ix86_cmodel)
6998 case CM_SMALL_PIC:
6999 ptr->x_ix86_cmodel = CM_SMALL;
7000 break;
7002 case CM_MEDIUM_PIC:
7003 ptr->x_ix86_cmodel = CM_MEDIUM;
7004 break;
7006 case CM_LARGE_PIC:
7007 ptr->x_ix86_cmodel = CM_LARGE;
7008 break;
7010 default:
7011 break;
7015 /* Print the current options */
7017 static void
7018 ix86_function_specific_print (FILE *file, int indent,
7019 struct cl_target_option *ptr)
7021 char *target_string
7022 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
7023 ptr->x_target_flags, ptr->x_ix86_target_flags,
7024 NULL, NULL, ptr->x_ix86_fpmath, false);
7026 gcc_assert (ptr->arch < PROCESSOR_max);
7027 fprintf (file, "%*sarch = %d (%s)\n",
7028 indent, "",
7029 ptr->arch, processor_target_table[ptr->arch].name);
7031 gcc_assert (ptr->tune < PROCESSOR_max);
7032 fprintf (file, "%*stune = %d (%s)\n",
7033 indent, "",
7034 ptr->tune, processor_target_table[ptr->tune].name);
7036 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
7038 if (target_string)
7040 fprintf (file, "%*s%s\n", indent, "", target_string);
7041 free (target_string);
7046 /* Inner function to process the attribute((target(...))), take an argument and
7047 set the current options from the argument. If we have a list, recursively go
7048 over the list. */
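/* For example, a declaration along the lines of

     __attribute__((target("no-sse4.2,arch=core2,fpmath=sse"))) void f (void);

   (an illustrative combination, not a recommendation) is processed here as
   three comma-separated items: "no-sse4.2" toggles an ISA option off via
   the "no-" prefix, "arch=" is one of the string options collected into
   p_strings[], and "fpmath=" is handled as an enum option.  */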
7050 static bool
7051 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
7052 struct gcc_options *opts,
7053 struct gcc_options *opts_set,
7054 struct gcc_options *enum_opts_set)
7056 char *next_optstr;
7057 bool ret = true;
7059 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
7060 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
7061 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
7062 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
7063 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
7065 enum ix86_opt_type
7067 ix86_opt_unknown,
7068 ix86_opt_yes,
7069 ix86_opt_no,
7070 ix86_opt_str,
7071 ix86_opt_enum,
7072 ix86_opt_isa
7075 static const struct
7077 const char *string;
7078 size_t len;
7079 enum ix86_opt_type type;
7080 int opt;
7081 int mask;
7082 } attrs[] = {
7083 /* isa options */
7084 IX86_ATTR_ISA ("sgx", OPT_msgx),
7085 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7086 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7087 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7089 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7090 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7091 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7092 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7093 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7094 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7095 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7096 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7097 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7098 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7099 IX86_ATTR_ISA ("fma", OPT_mfma),
7100 IX86_ATTR_ISA ("xop", OPT_mxop),
7101 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7102 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7103 IX86_ATTR_ISA ("avx", OPT_mavx),
7104 IX86_ATTR_ISA ("sse4", OPT_msse4),
7105 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7106 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7107 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7108 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7109 IX86_ATTR_ISA ("sse3", OPT_msse3),
7110 IX86_ATTR_ISA ("aes", OPT_maes),
7111 IX86_ATTR_ISA ("sha", OPT_msha),
7112 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7113 IX86_ATTR_ISA ("sse2", OPT_msse2),
7114 IX86_ATTR_ISA ("sse", OPT_msse),
7115 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7116 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7117 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7118 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7119 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7120 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7121 IX86_ATTR_ISA ("adx", OPT_madx),
7122 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7123 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7124 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7125 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7126 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7127 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7128 IX86_ATTR_ISA ("abm", OPT_mabm),
7129 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7130 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7131 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7132 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7133 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7134 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7135 IX86_ATTR_ISA ("sahf", OPT_msahf),
7136 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7137 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7138 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7139 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7140 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7141 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7142 IX86_ATTR_ISA ("pku", OPT_mpku),
7143 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7144 IX86_ATTR_ISA ("hle", OPT_mhle),
7145 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7146 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7147 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7148 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7150 /* enum options */
7151 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7153 /* string options */
7154 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7155 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7157 /* flag options */
7158 IX86_ATTR_YES ("cld",
7159 OPT_mcld,
7160 MASK_CLD),
7162 IX86_ATTR_NO ("fancy-math-387",
7163 OPT_mfancy_math_387,
7164 MASK_NO_FANCY_MATH_387),
7166 IX86_ATTR_YES ("ieee-fp",
7167 OPT_mieee_fp,
7168 MASK_IEEE_FP),
7170 IX86_ATTR_YES ("inline-all-stringops",
7171 OPT_minline_all_stringops,
7172 MASK_INLINE_ALL_STRINGOPS),
7174 IX86_ATTR_YES ("inline-stringops-dynamically",
7175 OPT_minline_stringops_dynamically,
7176 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7178 IX86_ATTR_NO ("align-stringops",
7179 OPT_mno_align_stringops,
7180 MASK_NO_ALIGN_STRINGOPS),
7182 IX86_ATTR_YES ("recip",
7183 OPT_mrecip,
7184 MASK_RECIP),
7188 /* If this is a list, recurse to get the options. */
7189 if (TREE_CODE (args) == TREE_LIST)
7191 bool ret = true;
7193 for (; args; args = TREE_CHAIN (args))
7194 if (TREE_VALUE (args)
7195 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7196 p_strings, opts, opts_set,
7197 enum_opts_set))
7198 ret = false;
7200 return ret;
7203 else if (TREE_CODE (args) != STRING_CST)
7205 error ("attribute %<target%> argument not a string");
7206 return false;
7209 /* Handle multiple arguments separated by commas. */
7210 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7212 while (next_optstr && *next_optstr != '\0')
7214 char *p = next_optstr;
7215 char *orig_p = p;
7216 char *comma = strchr (next_optstr, ',');
7217 const char *opt_string;
7218 size_t len, opt_len;
7219 int opt;
7220 bool opt_set_p;
7221 char ch;
7222 unsigned i;
7223 enum ix86_opt_type type = ix86_opt_unknown;
7224 int mask = 0;
7226 if (comma)
7228 *comma = '\0';
7229 len = comma - next_optstr;
7230 next_optstr = comma + 1;
7232 else
7234 len = strlen (p);
7235 next_optstr = NULL;
7238 /* Recognize no-xxx. */
7239 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7241 opt_set_p = false;
7242 p += 3;
7243 len -= 3;
7245 else
7246 opt_set_p = true;
7248 /* Find the option. */
7249 ch = *p;
7250 opt = N_OPTS;
7251 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7253 type = attrs[i].type;
7254 opt_len = attrs[i].len;
7255 if (ch == attrs[i].string[0]
7256 && ((type != ix86_opt_str && type != ix86_opt_enum)
7257 ? len == opt_len
7258 : len > opt_len)
7259 && memcmp (p, attrs[i].string, opt_len) == 0)
7261 opt = attrs[i].opt;
7262 mask = attrs[i].mask;
7263 opt_string = attrs[i].string;
7264 break;
7268 /* Process the option. */
7269 if (opt == N_OPTS)
7271 error ("attribute(target(\"%s\")) is unknown", orig_p);
7272 ret = false;
7275 else if (type == ix86_opt_isa)
7277 struct cl_decoded_option decoded;
7279 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7280 ix86_handle_option (opts, opts_set,
7281 &decoded, input_location);
7284 else if (type == ix86_opt_yes || type == ix86_opt_no)
7286 if (type == ix86_opt_no)
7287 opt_set_p = !opt_set_p;
7289 if (opt_set_p)
7290 opts->x_target_flags |= mask;
7291 else
7292 opts->x_target_flags &= ~mask;
7295 else if (type == ix86_opt_str)
7297 if (p_strings[opt])
7299 error ("option(\"%s\") was already specified", opt_string);
7300 ret = false;
7302 else
7303 p_strings[opt] = xstrdup (p + opt_len);
7306 else if (type == ix86_opt_enum)
7308 bool arg_ok;
7309 int value;
7311 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7312 if (arg_ok)
7313 set_option (opts, enum_opts_set, opt, value,
7314 p + opt_len, DK_UNSPECIFIED, input_location,
7315 global_dc);
7316 else
7318 error ("attribute(target(\"%s\")) is unknown", orig_p);
7319 ret = false;
7323 else
7324 gcc_unreachable ();
7327 return ret;
7330 /* Release allocated strings. */
7331 static void
7332 release_options_strings (char **option_strings)
7334 /* Free up memory allocated to hold the strings */
7335 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7336 free (option_strings[i]);
7339 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
7341 tree
7342 ix86_valid_target_attribute_tree (tree args,
7343 struct gcc_options *opts,
7344 struct gcc_options *opts_set)
7346 const char *orig_arch_string = opts->x_ix86_arch_string;
7347 const char *orig_tune_string = opts->x_ix86_tune_string;
7348 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7349 int orig_tune_defaulted = ix86_tune_defaulted;
7350 int orig_arch_specified = ix86_arch_specified;
7351 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7352 tree t = NULL_TREE;
7353 struct cl_target_option *def
7354 = TREE_TARGET_OPTION (target_option_default_node);
7355 struct gcc_options enum_opts_set;
7357 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7359 /* Process each of the options on the chain. */
7360 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7361 opts_set, &enum_opts_set))
7362 return error_mark_node;
7364 /* If the changed options are different from the default, rerun
7365 ix86_option_override_internal, and then save the options away.
7366 The string options are attribute options, and will be undone
7367 when we copy the save structure. */
7368 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7369 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7370 || opts->x_target_flags != def->x_target_flags
7371 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7372 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7373 || enum_opts_set.x_ix86_fpmath)
7375 /* If we are using the default tune= or arch=, undo the string assigned,
7376 and use the default. */
7377 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7379 opts->x_ix86_arch_string
7380 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7382 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7383 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7384 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7385 | OPTION_MASK_ABI_64
7386 | OPTION_MASK_ABI_X32
7387 | OPTION_MASK_CODE16);
7388 opts->x_ix86_isa_flags2 = 0;
7390 else if (!orig_arch_specified)
7391 opts->x_ix86_arch_string = NULL;
7393 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7394 opts->x_ix86_tune_string
7395 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7396 else if (orig_tune_defaulted)
7397 opts->x_ix86_tune_string = NULL;
7399 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
7400 if (enum_opts_set.x_ix86_fpmath)
7401 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7403 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7404 bool r = ix86_option_override_internal (false, opts, opts_set);
7405 if (!r)
7407 release_options_strings (option_strings);
7408 return error_mark_node;
7411 /* Add any builtin functions with the new isa if any. */
7412 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7414 /* Save the current options unless we are validating options for
7415 #pragma. */
7416 t = build_target_option_node (opts);
7418 opts->x_ix86_arch_string = orig_arch_string;
7419 opts->x_ix86_tune_string = orig_tune_string;
7420 opts_set->x_ix86_fpmath = orig_fpmath_set;
7422 release_options_strings (option_strings);
7425 return t;
7428 /* Hook to validate attribute((target("string"))). */
7430 static bool
7431 ix86_valid_target_attribute_p (tree fndecl,
7432 tree ARG_UNUSED (name),
7433 tree args,
7434 int ARG_UNUSED (flags))
7436 struct gcc_options func_options;
7437 tree new_target, new_optimize;
7438 bool ret = true;
7440 /* attribute((target("default"))) does nothing, beyond
7441 affecting multi-versioning. */
7442 if (TREE_VALUE (args)
7443 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7444 && TREE_CHAIN (args) == NULL_TREE
7445 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7446 return true;
7448 tree old_optimize = build_optimization_node (&global_options);
7450 /* Get the optimization options of the current function. */
7451 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7453 if (!func_optimize)
7454 func_optimize = old_optimize;
7456 /* Init func_options. */
7457 memset (&func_options, 0, sizeof (func_options));
7458 init_options_struct (&func_options, NULL);
7459 lang_hooks.init_options_struct (&func_options);
7461 cl_optimization_restore (&func_options,
7462 TREE_OPTIMIZATION (func_optimize));
7464 /* Initialize func_options to the default before its target options can
7465 be set. */
7466 cl_target_option_restore (&func_options,
7467 TREE_TARGET_OPTION (target_option_default_node));
7469 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7470 &global_options_set);
7472 new_optimize = build_optimization_node (&func_options);
7474 if (new_target == error_mark_node)
7475 ret = false;
7477 else if (fndecl && new_target)
7479 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7481 if (old_optimize != new_optimize)
7482 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7485 finalize_options_struct (&func_options);
7487 return ret;
7491 /* Hook to determine if one function can safely inline another. */
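/* Informally: with the checks below, a caller built with, say, -mavx2 may
   inline a callee declared with target("sse4.2"), because the callee's ISA
   bits are a subset of the caller's, but not the other way round.
   Differences in arch=, tune=, other target flags, branch cost, or, when
   the callee actually contains FP expressions, fpmath also block inlining.
   The example flags are illustrative only.  */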
7493 static bool
7494 ix86_can_inline_p (tree caller, tree callee)
7496 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7497 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7498 if (!callee_tree)
7499 callee_tree = target_option_default_node;
7500 if (!caller_tree)
7501 caller_tree = target_option_default_node;
7502 if (callee_tree == caller_tree)
7503 return true;
7505 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7506 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7507 bool ret = false;
7509 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
7510 function can inline an SSE2 function, but an SSE2 function can't inline
7511 an SSE4 function. */
7512 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7513 != callee_opts->x_ix86_isa_flags)
7514 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7515 != callee_opts->x_ix86_isa_flags2))
7516 ret = false;
7518 /* See if we have the same non-isa options. */
7519 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7520 ret = false;
7522 /* See if arch, tune, etc. are the same. */
7523 else if (caller_opts->arch != callee_opts->arch)
7524 ret = false;
7526 else if (caller_opts->tune != callee_opts->tune)
7527 ret = false;
7529 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
7530 /* If the callee doesn't use FP expressions, differences in
7531 ix86_fpmath can be ignored. We are called from FEs
7532 for multi-versioning call optimization, so beware of
7533 ipa_fn_summaries not being available. */
7534 && (! ipa_fn_summaries
7535 || ipa_fn_summaries->get
7536 (cgraph_node::get (callee))->fp_expressions))
7537 ret = false;
7539 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7540 ret = false;
7542 else
7543 ret = true;
7545 return ret;
7549 /* Remember the last target of ix86_set_current_function. */
7550 static GTY(()) tree ix86_previous_fndecl;
7552 /* Set targets globals to the default (or current #pragma GCC target
7553 if active). Invalidate ix86_previous_fndecl cache. */
7555 void
7556 ix86_reset_previous_fndecl (void)
7558 tree new_tree = target_option_current_node;
7559 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7560 if (TREE_TARGET_GLOBALS (new_tree))
7561 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7562 else if (new_tree == target_option_default_node)
7563 restore_target_globals (&default_target_globals);
7564 else
7565 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7566 ix86_previous_fndecl = NULL_TREE;
7569 /* Set the func_type field from the function FNDECL. */
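/* Schematically, following the documented attribute usage (the parameter
   types shown here are placeholders, not the exact documented ones):

     void __attribute__ ((interrupt)) handler (void *frame);
     void __attribute__ ((interrupt)) fault (void *frame, unsigned long code);

   one argument classifies the function as TYPE_INTERRUPT, two arguments
   (frame plus error code) as TYPE_EXCEPTION; both also imply
   no_caller_saved_registers.  */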
7571 static void
7572 ix86_set_func_type (tree fndecl)
7574 if (cfun->machine->func_type == TYPE_UNKNOWN)
7576 if (lookup_attribute ("interrupt",
7577 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7579 if (ix86_function_naked (fndecl))
7580 error_at (DECL_SOURCE_LOCATION (fndecl),
7581 "interrupt and naked attributes are not compatible");
7583 int nargs = 0;
7584 for (tree arg = DECL_ARGUMENTS (fndecl);
7585 arg;
7586 arg = TREE_CHAIN (arg))
7587 nargs++;
7588 cfun->machine->no_caller_saved_registers = true;
7589 cfun->machine->func_type
7590 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7592 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7594 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7595 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7596 sorry ("Only DWARF debug format is supported for interrupt "
7597 "service routine.");
7599 else
7601 cfun->machine->func_type = TYPE_NORMAL;
7602 if (lookup_attribute ("no_caller_saved_registers",
7603 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7604 cfun->machine->no_caller_saved_registers = true;
7609 /* Establish appropriate back-end context for processing the function
7610 FNDECL. The argument might be NULL to indicate processing at top
7611 level, outside of any function scope. */
7612 static void
7613 ix86_set_current_function (tree fndecl)
7615 /* Only change the context if the function changes. This hook is called
7616 several times in the course of compiling a function, and we don't want to
7617 slow things down too much or call target_reinit when it isn't safe. */
7618 if (fndecl == ix86_previous_fndecl)
7620 /* There may be 2 function bodies for the same function FNDECL,
7621 one is extern inline and one isn't. Call ix86_set_func_type
7622 to set the func_type field. */
7623 if (fndecl != NULL_TREE)
7624 ix86_set_func_type (fndecl);
7625 return;
7628 tree old_tree;
7629 if (ix86_previous_fndecl == NULL_TREE)
7630 old_tree = target_option_current_node;
7631 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7632 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7633 else
7634 old_tree = target_option_default_node;
7636 if (fndecl == NULL_TREE)
7638 if (old_tree != target_option_current_node)
7639 ix86_reset_previous_fndecl ();
7640 return;
7643 ix86_set_func_type (fndecl);
7645 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7646 if (new_tree == NULL_TREE)
7647 new_tree = target_option_default_node;
7649 if (old_tree != new_tree)
7651 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7652 if (TREE_TARGET_GLOBALS (new_tree))
7653 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7654 else if (new_tree == target_option_default_node)
7655 restore_target_globals (&default_target_globals);
7656 else
7657 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7659 ix86_previous_fndecl = fndecl;
7661 static bool prev_no_caller_saved_registers;
7663 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7664 Avoid expensive re-initialization of init_regs each time we switch
7665 function context. */
7666 if (TARGET_64BIT
7667 && (call_used_regs[SI_REG]
7668 == (cfun->machine->call_abi == MS_ABI)))
7669 reinit_regs ();
7670 /* Need to re-initialize init_regs if caller-saved registers are
7671 changed. */
7672 else if (prev_no_caller_saved_registers
7673 != cfun->machine->no_caller_saved_registers)
7674 reinit_regs ();
7676 if (cfun->machine->func_type != TYPE_NORMAL
7677 || cfun->machine->no_caller_saved_registers)
7679 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
7680 may change processor state. */
7681 const char *isa;
7682 if (TARGET_MPX)
7683 isa = "MPX";
7684 else if (TARGET_SSE)
7685 isa = "SSE";
7686 else if (TARGET_MMX)
7687 isa = "MMX/3Dnow";
7688 else if (TARGET_80387)
7689 isa = "80387";
7690 else
7691 isa = NULL;
7692 if (isa != NULL)
7694 if (cfun->machine->func_type != TYPE_NORMAL)
7695 sorry ("%s instructions aren't allowed in %s service routine",
7696 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7697 ? "exception" : "interrupt"));
7698 else
7699 sorry ("%s instructions aren't allowed in function with "
7700 "no_caller_saved_registers attribute", isa);
7701 /* Don't issue the same error twice. */
7702 cfun->machine->func_type = TYPE_NORMAL;
7703 cfun->machine->no_caller_saved_registers = false;
7707 prev_no_caller_saved_registers
7708 = cfun->machine->no_caller_saved_registers;
7712 /* Return true if this goes in large data/bss. */
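/* Roughly: this only triggers for -mcmodel=medium (or medium PIC), where a
   global object whose size exceeds ix86_section_threshold (the value
   behind -mlarge-data-threshold) is placed into the large .ldata/.lbss
   sections handled below, while functions, automatic variables and smaller
   objects stay in the normal sections.  */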
7714 static bool
7715 ix86_in_large_data_p (tree exp)
7717 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7718 return false;
7720 if (exp == NULL_TREE)
7721 return false;
7723 /* Functions are never large data. */
7724 if (TREE_CODE (exp) == FUNCTION_DECL)
7725 return false;
7727 /* Automatic variables are never large data. */
7728 if (VAR_P (exp) && !is_global_var (exp))
7729 return false;
7731 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7733 const char *section = DECL_SECTION_NAME (exp);
7734 if (strcmp (section, ".ldata") == 0
7735 || strcmp (section, ".lbss") == 0)
7736 return true;
7737 return false;
7739 else
7741 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7743 /* If this is an incomplete type with size 0, then we can't put it
7744 in data because it might be too big when completed. Also,
7745 int_size_in_bytes returns -1 if size can vary or is larger than
7746 an integer, in which case it is also safer to assume that it goes in
7747 large data. */
7748 if (size <= 0 || size > ix86_section_threshold)
7749 return true;
7752 return false;
7755 /* i386-specific section flag to mark large sections. */
7756 #define SECTION_LARGE SECTION_MACH_DEP
7758 /* Switch to the appropriate section for output of DECL.
7759 DECL is either a `VAR_DECL' node or a constant of some sort.
7760 RELOC indicates whether forming the initial value of DECL requires
7761 link-time relocations. */
7763 ATTRIBUTE_UNUSED static section *
7764 x86_64_elf_select_section (tree decl, int reloc,
7765 unsigned HOST_WIDE_INT align)
7767 if (ix86_in_large_data_p (decl))
7769 const char *sname = NULL;
7770 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7771 switch (categorize_decl_for_section (decl, reloc))
7773 case SECCAT_DATA:
7774 sname = ".ldata";
7775 break;
7776 case SECCAT_DATA_REL:
7777 sname = ".ldata.rel";
7778 break;
7779 case SECCAT_DATA_REL_LOCAL:
7780 sname = ".ldata.rel.local";
7781 break;
7782 case SECCAT_DATA_REL_RO:
7783 sname = ".ldata.rel.ro";
7784 break;
7785 case SECCAT_DATA_REL_RO_LOCAL:
7786 sname = ".ldata.rel.ro.local";
7787 break;
7788 case SECCAT_BSS:
7789 sname = ".lbss";
7790 flags |= SECTION_BSS;
7791 break;
7792 case SECCAT_RODATA:
7793 case SECCAT_RODATA_MERGE_STR:
7794 case SECCAT_RODATA_MERGE_STR_INIT:
7795 case SECCAT_RODATA_MERGE_CONST:
7796 sname = ".lrodata";
7797 flags &= ~SECTION_WRITE;
7798 break;
7799 case SECCAT_SRODATA:
7800 case SECCAT_SDATA:
7801 case SECCAT_SBSS:
7802 gcc_unreachable ();
7803 case SECCAT_TEXT:
7804 case SECCAT_TDATA:
7805 case SECCAT_TBSS:
7806 /* We don't split these for the medium model. Place them into
7807 default sections and hope for the best. */
7808 break;
7810 if (sname)
7812 /* We might get called with string constants, but get_named_section
7813 doesn't like them as they are not DECLs. Also, we need to set
7814 flags in that case. */
7815 if (!DECL_P (decl))
7816 return get_section (sname, flags, NULL);
7817 return get_named_section (decl, sname, reloc);
7820 return default_elf_select_section (decl, reloc, align);
7823 /* Select a set of attributes for section NAME based on the properties
7824 of DECL and whether or not RELOC indicates that DECL's initializer
7825 might contain runtime relocations. */
7827 static unsigned int ATTRIBUTE_UNUSED
7828 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7830 unsigned int flags = default_section_type_flags (decl, name, reloc);
7832 if (ix86_in_large_data_p (decl))
7833 flags |= SECTION_LARGE;
7835 if (decl == NULL_TREE
7836 && (strcmp (name, ".ldata.rel.ro") == 0
7837 || strcmp (name, ".ldata.rel.ro.local") == 0))
7838 flags |= SECTION_RELRO;
7840 if (strcmp (name, ".lbss") == 0
7841 || strncmp (name, ".lbss.", 5) == 0
7842 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
7843 flags |= SECTION_BSS;
7845 return flags;
7848 /* Build up a unique section name, expressed as a
7849 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7850 RELOC indicates whether the initial value of EXP requires
7851 link-time relocations. */
7853 static void ATTRIBUTE_UNUSED
7854 x86_64_elf_unique_section (tree decl, int reloc)
7856 if (ix86_in_large_data_p (decl))
7858 const char *prefix = NULL;
7859 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7860 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7862 switch (categorize_decl_for_section (decl, reloc))
7864 case SECCAT_DATA:
7865 case SECCAT_DATA_REL:
7866 case SECCAT_DATA_REL_LOCAL:
7867 case SECCAT_DATA_REL_RO:
7868 case SECCAT_DATA_REL_RO_LOCAL:
7869 prefix = one_only ? ".ld" : ".ldata";
7870 break;
7871 case SECCAT_BSS:
7872 prefix = one_only ? ".lb" : ".lbss";
7873 break;
7874 case SECCAT_RODATA:
7875 case SECCAT_RODATA_MERGE_STR:
7876 case SECCAT_RODATA_MERGE_STR_INIT:
7877 case SECCAT_RODATA_MERGE_CONST:
7878 prefix = one_only ? ".lr" : ".lrodata";
7879 break;
7880 case SECCAT_SRODATA:
7881 case SECCAT_SDATA:
7882 case SECCAT_SBSS:
7883 gcc_unreachable ();
7884 case SECCAT_TEXT:
7885 case SECCAT_TDATA:
7886 case SECCAT_TBSS:
7887 /* We don't split these for the medium model. Place them into
7888 default sections and hope for the best. */
7889 break;
7891 if (prefix)
7893 const char *name, *linkonce;
7894 char *string;
7896 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7897 name = targetm.strip_name_encoding (name);
7899 /* If we're using one_only, then there needs to be a .gnu.linkonce
7900 prefix to the section name. */
7901 linkonce = one_only ? ".gnu.linkonce" : "";
7903 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7905 set_decl_section_name (decl, string);
7906 return;
7909 default_unique_section (decl, reloc);
7912 #ifdef COMMON_ASM_OP
7914 #ifndef LARGECOMM_SECTION_ASM_OP
7915 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7916 #endif
7918 /* This says how to output assembler code to declare an
7919 uninitialized external linkage data object.
7921 For the medium model on x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
7922 large objects. */
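/* For a sufficiently large object this is expected to emit output of the
   shape (illustrative only, with a made-up symbol name):

       .largecomm  big_buffer,1048576,32

   i.e. the .lbss section is selected and LARGECOMM_SECTION_ASM_OP is used
   instead of the usual .comm directive, followed by the size and the
   alignment in bytes.  */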
7923 void
7924 x86_elf_aligned_decl_common (FILE *file, tree decl,
7925 const char *name, unsigned HOST_WIDE_INT size,
7926 int align)
7928 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7929 && size > (unsigned int)ix86_section_threshold)
7931 switch_to_section (get_named_section (decl, ".lbss", 0));
7932 fputs (LARGECOMM_SECTION_ASM_OP, file);
7934 else
7935 fputs (COMMON_ASM_OP, file);
7936 assemble_name (file, name);
7937 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7938 size, align / BITS_PER_UNIT);
7940 #endif
7942 /* Utility function for targets to use in implementing
7943 ASM_OUTPUT_ALIGNED_BSS. */
7945 void
7946 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7947 unsigned HOST_WIDE_INT size, int align)
7949 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7950 && size > (unsigned int)ix86_section_threshold)
7951 switch_to_section (get_named_section (decl, ".lbss", 0));
7952 else
7953 switch_to_section (bss_section);
7954 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7955 #ifdef ASM_DECLARE_OBJECT_NAME
7956 last_assemble_variable_decl = decl;
7957 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7958 #else
7959 /* The standard thing is to just output a label for the object. */
7960 ASM_OUTPUT_LABEL (file, name);
7961 #endif /* ASM_DECLARE_OBJECT_NAME */
7962 ASM_OUTPUT_SKIP (file, size ? size : 1);
7965 /* Decide whether we must probe the stack before any space allocation
7966 on this target. It's essentially TARGET_STACK_PROBE except when
7967 -fstack-check causes the stack to be already probed differently. */
7969 bool
7970 ix86_target_stack_probe (void)
7972 /* Do not probe the stack twice if static stack checking is enabled. */
7973 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7974 return false;
7976 return TARGET_STACK_PROBE;
7979 /* Decide whether we can make a sibling call to a function. DECL is the
7980 declaration of the function being targeted by the call and EXP is the
7981 CALL_EXPR representing the call. */
7983 static bool
7984 ix86_function_ok_for_sibcall (tree decl, tree exp)
7986 tree type, decl_or_type;
7987 rtx a, b;
7988 bool bind_global = decl && !targetm.binds_local_p (decl);
7990 if (ix86_function_naked (current_function_decl))
7991 return false;
7993 /* Sibling call isn't OK if there are no caller-saved registers
7994 since all registers must be preserved before return. */
7995 if (cfun->machine->no_caller_saved_registers)
7996 return false;
7998 /* If we are generating position-independent code, we cannot sibcall
7999 optimize direct calls to global functions, as the PLT requires
8000 %ebx be live. (Darwin does not have a PLT.) */
8001 if (!TARGET_MACHO
8002 && !TARGET_64BIT
8003 && flag_pic
8004 && flag_plt
8005 && bind_global)
8006 return false;
8008 /* If we need to align the outgoing stack, then sibcalling would
8009 unalign the stack, which may break the called function. */
8010 if (ix86_minimum_incoming_stack_boundary (true)
8011 < PREFERRED_STACK_BOUNDARY)
8012 return false;
8014 if (decl)
8016 decl_or_type = decl;
8017 type = TREE_TYPE (decl);
8019 else
8021 /* We're looking at the CALL_EXPR, we need the type of the function. */
8022 type = CALL_EXPR_FN (exp); /* pointer expression */
8023 type = TREE_TYPE (type); /* pointer type */
8024 type = TREE_TYPE (type); /* function type */
8025 decl_or_type = type;
8028 /* Check that the return value locations are the same. For example,
8029 if we are returning floats on the 80387 register stack, we cannot
8030 make a sibcall from a function that doesn't return a float to a
8031 function that does or, conversely, from a function that does return
8032 a float to a function that doesn't; the necessary stack adjustment
8033 would not be executed. This is also the place we notice
8034 differences in the return value ABI. Note that it is ok for one
8035 of the functions to have void return type as long as the return
8036 value of the other is passed in a register. */
8037 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
8038 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
8039 cfun->decl, false);
8040 if (STACK_REG_P (a) || STACK_REG_P (b))
8042 if (!rtx_equal_p (a, b))
8043 return false;
8045 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
8047 else if (!rtx_equal_p (a, b))
8048 return false;
8050 if (TARGET_64BIT)
8052 /* The SYSV ABI has more call-clobbered registers;
8053 disallow sibcalls from MS to SYSV. */
8054 if (cfun->machine->call_abi == MS_ABI
8055 && ix86_function_type_abi (type) == SYSV_ABI)
8056 return false;
8058 else
8060 /* If this call is indirect, we'll need to be able to use a
8061 call-clobbered register for the address of the target function.
8062 Make sure that all such registers are not used for passing
8063 parameters. Note that DLLIMPORT functions and calls to global
8064 functions via the GOT slot are indirect. */
8065 if (!decl
8066 || (bind_global && flag_pic && !flag_plt)
8067 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8069 /* Check if regparm >= 3 since arg_reg_available is set to
8070 false if regparm == 0. If regparm is 1 or 2, there is
8071 always a call-clobbered register available.
8073 ??? The symbol indirect call doesn't need a call-clobbered
8074 register. But we don't know if this is a symbol indirect
8075 call or not here. */
8076 if (ix86_function_regparm (type, NULL) >= 3
8077 && !cfun->machine->arg_reg_available)
8078 return false;
8082 /* Otherwise okay. That also includes certain types of indirect calls. */
8083 return true;
8086 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8087 and "sseregparm" calling convention attributes;
8088 arguments as in struct attribute_spec.handler. */
8090 static tree
8091 ix86_handle_cconv_attribute (tree *node, tree name,
8092 tree args,
8093 int,
8094 bool *no_add_attrs)
8096 if (TREE_CODE (*node) != FUNCTION_TYPE
8097 && TREE_CODE (*node) != METHOD_TYPE
8098 && TREE_CODE (*node) != FIELD_DECL
8099 && TREE_CODE (*node) != TYPE_DECL)
8101 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8102 name);
8103 *no_add_attrs = true;
8104 return NULL_TREE;
8107 /* Can combine regparm with all attributes but fastcall and thiscall. */
8108 if (is_attribute_p ("regparm", name))
8110 tree cst;
8112 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8114 error ("fastcall and regparm attributes are not compatible");
8117 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8119 error ("regparm and thiscall attributes are not compatible");
8122 cst = TREE_VALUE (args);
8123 if (TREE_CODE (cst) != INTEGER_CST)
8125 warning (OPT_Wattributes,
8126 "%qE attribute requires an integer constant argument",
8127 name);
8128 *no_add_attrs = true;
8130 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8132 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8133 name, REGPARM_MAX);
8134 *no_add_attrs = true;
8137 return NULL_TREE;
8140 if (TARGET_64BIT)
8142 /* Do not warn when emulating the MS ABI. */
8143 if ((TREE_CODE (*node) != FUNCTION_TYPE
8144 && TREE_CODE (*node) != METHOD_TYPE)
8145 || ix86_function_type_abi (*node) != MS_ABI)
8146 warning (OPT_Wattributes, "%qE attribute ignored",
8147 name);
8148 *no_add_attrs = true;
8149 return NULL_TREE;
8152 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
8153 if (is_attribute_p ("fastcall", name))
8155 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8157 error ("fastcall and cdecl attributes are not compatible");
8159 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8161 error ("fastcall and stdcall attributes are not compatible");
8163 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8165 error ("fastcall and regparm attributes are not compatible");
8167 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8169 error ("fastcall and thiscall attributes are not compatible");
8173 /* Can combine stdcall with fastcall (redundant), regparm and
8174 sseregparm. */
8175 else if (is_attribute_p ("stdcall", name))
8177 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8179 error ("stdcall and cdecl attributes are not compatible");
8181 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8183 error ("stdcall and fastcall attributes are not compatible");
8185 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8187 error ("stdcall and thiscall attributes are not compatible");
8191 /* Can combine cdecl with regparm and sseregparm. */
8192 else if (is_attribute_p ("cdecl", name))
8194 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8196 error ("stdcall and cdecl attributes are not compatible");
8198 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8200 error ("fastcall and cdecl attributes are not compatible");
8202 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8204 error ("cdecl and thiscall attributes are not compatible");
8207 else if (is_attribute_p ("thiscall", name))
8209 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8210 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8211 name);
8212 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8214 error ("stdcall and thiscall attributes are not compatible");
8216 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8218 error ("fastcall and thiscall attributes are not compatible");
8220 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8222 error ("cdecl and thiscall attributes are not compatible");
8226 /* Can combine sseregparm with all attributes. */
8228 return NULL_TREE;
8231 /* The transactional memory builtins are implicitly regparm or fastcall
8232 depending on the ABI. Override the generic do-nothing attribute that
8233 these builtins were declared with, and replace it with one of the two
8234 attributes that we expect elsewhere. */
8236 static tree
8237 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8238 int flags, bool *no_add_attrs)
8240 tree alt;
8242 /* In no case do we want to add the placeholder attribute. */
8243 *no_add_attrs = true;
8245 /* The 64-bit ABI is unchanged for transactional memory. */
8246 if (TARGET_64BIT)
8247 return NULL_TREE;
8249 /* ??? Is there a better way to validate 32-bit Windows? We have
8250 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8251 if (CHECK_STACK_LIMIT > 0)
8252 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8253 else
8255 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8256 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8258 decl_attributes (node, alt, flags);
8260 return NULL_TREE;
8263 /* This function determines the calling convention from TYPE. */
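/* A couple of illustrative mappings (32-bit only, since 64-bit always
   yields IX86_CALLCVT_CDECL): a function type carrying
   __attribute__((stdcall, regparm(2))) comes back as
   IX86_CALLCVT_STDCALL | IX86_CALLCVT_REGPARM, while a plain prototyped
   function compiled with -mrtd and taking no stdarg arguments defaults to
   IX86_CALLCVT_STDCALL.  */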
8265 unsigned int
8266 ix86_get_callcvt (const_tree type)
8268 unsigned int ret = 0;
8269 bool is_stdarg;
8270 tree attrs;
8272 if (TARGET_64BIT)
8273 return IX86_CALLCVT_CDECL;
8275 attrs = TYPE_ATTRIBUTES (type);
8276 if (attrs != NULL_TREE)
8278 if (lookup_attribute ("cdecl", attrs))
8279 ret |= IX86_CALLCVT_CDECL;
8280 else if (lookup_attribute ("stdcall", attrs))
8281 ret |= IX86_CALLCVT_STDCALL;
8282 else if (lookup_attribute ("fastcall", attrs))
8283 ret |= IX86_CALLCVT_FASTCALL;
8284 else if (lookup_attribute ("thiscall", attrs))
8285 ret |= IX86_CALLCVT_THISCALL;
8287 /* Regparm isn't allowed with thiscall or fastcall. */
8288 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8290 if (lookup_attribute ("regparm", attrs))
8291 ret |= IX86_CALLCVT_REGPARM;
8292 if (lookup_attribute ("sseregparm", attrs))
8293 ret |= IX86_CALLCVT_SSEREGPARM;
8296 if (IX86_BASE_CALLCVT(ret) != 0)
8297 return ret;
8300 is_stdarg = stdarg_p (type);
8301 if (TARGET_RTD && !is_stdarg)
8302 return IX86_CALLCVT_STDCALL | ret;
8304 if (ret != 0
8305 || is_stdarg
8306 || TREE_CODE (type) != METHOD_TYPE
8307 || ix86_function_type_abi (type) != MS_ABI)
8308 return IX86_CALLCVT_CDECL | ret;
8310 return IX86_CALLCVT_THISCALL;
8313 /* Return 0 if the attributes for two types are incompatible, 1 if they
8314 are compatible, and 2 if they are nearly compatible (which causes a
8315 warning to be generated). */
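/* In practice this is what makes, for example, assigning the address of a
   __attribute__((stdcall)) function to an unadorned (by default cdecl,
   absent -mrtd) function pointer a type mismatch on 32-bit targets: the
   callcvt values differ, so 0 (incompatible) is returned.  Differing
   regparm counts are rejected the same way; non-function types always
   compare as compatible here.  */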
8317 static int
8318 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8320 unsigned int ccvt1, ccvt2;
8322 if (TREE_CODE (type1) != FUNCTION_TYPE
8323 && TREE_CODE (type1) != METHOD_TYPE)
8324 return 1;
8326 ccvt1 = ix86_get_callcvt (type1);
8327 ccvt2 = ix86_get_callcvt (type2);
8328 if (ccvt1 != ccvt2)
8329 return 0;
8330 if (ix86_function_regparm (type1, NULL)
8331 != ix86_function_regparm (type2, NULL))
8332 return 0;
8334 return 1;
8337 /* Return the regparm value for a function with the indicated TYPE and DECL.
8338 DECL may be NULL when calling function indirectly
8339 or considering a libcall. */
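/* For illustration: on 32-bit,

     __attribute__((regparm(3))) int f (int a, int b, int c);

   makes this return 3, so the three arguments travel in EAX, EDX and ECX;
   fastcall yields 2 and thiscall 1.  For a local, already-optimized callee
   the code below may also raise the value on its own, minus any regparm
   registers claimed by fixed register variables.  */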
8341 static int
8342 ix86_function_regparm (const_tree type, const_tree decl)
8344 tree attr;
8345 int regparm;
8346 unsigned int ccvt;
8348 if (TARGET_64BIT)
8349 return (ix86_function_type_abi (type) == SYSV_ABI
8350 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8351 ccvt = ix86_get_callcvt (type);
8352 regparm = ix86_regparm;
8354 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8356 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8357 if (attr)
8359 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8360 return regparm;
8363 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8364 return 2;
8365 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8366 return 1;
8368 /* Use register calling convention for local functions when possible. */
8369 if (decl
8370 && TREE_CODE (decl) == FUNCTION_DECL)
8372 cgraph_node *target = cgraph_node::get (decl);
8373 if (target)
8374 target = target->function_symbol ();
8376 /* Caller and callee must agree on the calling convention, so
8377 checking just the current value of "optimize" here would mean that
8378 with __attribute__((optimize (...))) the caller could use the regparm
8379 convention and the callee not, or vice versa. Instead look at
8380 whether the callee itself is optimized. */
8381 if (target && opt_for_fn (target->decl, optimize)
8382 && !(profile_flag && !flag_fentry))
8384 cgraph_local_info *i = &target->local;
8385 if (i && i->local && i->can_change_signature)
8387 int local_regparm, globals = 0, regno;
8389 /* Make sure no regparm register is taken by a
8390 fixed register variable. */
8391 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8392 local_regparm++)
8393 if (fixed_regs[local_regparm])
8394 break;
8396 /* We don't want to use regparm(3) for nested functions as
8397 these use a static chain pointer in the third argument. */
8398 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8399 local_regparm = 2;
8401 /* Save a register for the split stack. */
8402 if (flag_split_stack)
8404 if (local_regparm == 3)
8405 local_regparm = 2;
8406 else if (local_regparm == 2
8407 && DECL_STATIC_CHAIN (target->decl))
8408 local_regparm = 1;
8411 /* Each fixed register usage increases register pressure,
8412 so fewer registers should be used for argument passing.
8413 This functionality can be overridden by an explicit
8414 regparm value. */
8415 for (regno = AX_REG; regno <= DI_REG; regno++)
8416 if (fixed_regs[regno])
8417 globals++;
8419 local_regparm
8420 = globals < local_regparm ? local_regparm - globals : 0;
8422 if (local_regparm > regparm)
8423 regparm = local_regparm;
8428 return regparm;
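/* Illustrative summary added for exposition (not part of the original
   source) of the values computed above on a 32-bit target:

     int __attribute__ ((regparm (3))) f (int, int, int);  // regparm == 3
     int __attribute__ ((fastcall))    g (int, int);       // regparm == 2
     // thiscall methods get regparm == 1; a local, optimized function
     // with no explicit attribute may be promoted up to REGPARM_MAX
     // registers, minus any fixed registers in use.

   On 64-bit targets the result is simply the ABI register count
   (X86_64_REGPARM_MAX or X86_64_MS_REGPARM_MAX).  */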
8431 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8432 DFmode (2) arguments in SSE registers for a function with the
8433 indicated TYPE and DECL. DECL may be NULL when calling the function
8434 indirectly or considering a libcall. Return -1 if any FP parameter
8435 should be rejected with an error. This is used in situations where we
8436 imply the SSE calling convention but the function is called from another
8437 function with SSE disabled. Otherwise return 0. */
8439 static int
8440 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8442 gcc_assert (!TARGET_64BIT);
8444 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8445 by the sseregparm attribute. */
8446 if (TARGET_SSEREGPARM
8447 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8449 if (!TARGET_SSE)
8451 if (warn)
8453 if (decl)
8454 error ("calling %qD with attribute sseregparm without "
8455 "SSE/SSE2 enabled", decl);
8456 else
8457 error ("calling %qT with attribute sseregparm without "
8458 "SSE/SSE2 enabled", type);
8460 return 0;
8463 return 2;
8466 if (!decl)
8467 return 0;
8469 cgraph_node *target = cgraph_node::get (decl);
8470 if (target)
8471 target = target->function_symbol ();
8473 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8474 (and DFmode for SSE2) arguments in SSE registers. */
8475 if (target
8476 /* TARGET_SSE_MATH */
8477 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8478 && opt_for_fn (target->decl, optimize)
8479 && !(profile_flag && !flag_fentry))
8481 cgraph_local_info *i = &target->local;
8482 if (i && i->local && i->can_change_signature)
8484 /* Refuse to produce wrong code when a local function with SSE enabled
8485 is called from an SSE-disabled function.
8486 FIXME: We need a way to detect these cases across ltrans partitions
8487 and avoid using SSE calling conventions on local functions called
8488 from functions with SSE disabled. For now at least delay the
8489 warning until we know we are going to produce wrong code.
8490 See PR66047. */
8491 if (!TARGET_SSE && warn)
8492 return -1;
8493 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8494 ->x_ix86_isa_flags) ? 2 : 1;
8498 return 0;
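/* Illustrative example added for exposition (an assumption, not part of
   the original source): with -msse2 on a 32-bit target,

     double __attribute__ ((sseregparm)) scale (double x, float y);

   makes ix86_function_sseregparm return 2, so both the DFmode and the
   SFmode argument travel in SSE registers; without SSE enabled the same
   attribute produces the "calling ... with attribute sseregparm without
   SSE/SSE2 enabled" error above.  */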
8501 /* Return true if EAX is live at the start of the function. Used by
8502 ix86_expand_prologue to determine if we need special help before
8503 calling allocate_stack_worker. */
8505 static bool
8506 ix86_eax_live_at_start_p (void)
8508 /* Cheat. Don't bother working forward from ix86_function_regparm
8509 to the function type to whether an actual argument is located in
8510 eax. Instead just look at cfg info, which is still close enough
8511 to correct at this point. This gives false positives for broken
8512 functions that might use uninitialized data that happens to be
8513 allocated in eax, but who cares? */
8514 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8517 static bool
8518 ix86_keep_aggregate_return_pointer (tree fntype)
8520 tree attr;
8522 if (!TARGET_64BIT)
8524 attr = lookup_attribute ("callee_pop_aggregate_return",
8525 TYPE_ATTRIBUTES (fntype));
8526 if (attr)
8527 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8529 /* For 32-bit MS-ABI the default is to keep aggregate
8530 return pointer. */
8531 if (ix86_function_type_abi (fntype) == MS_ABI)
8532 return true;
8534 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8537 /* Value is the number of bytes of arguments automatically
8538 popped when returning from a subroutine call.
8539 FUNDECL is the declaration node of the function (as a tree),
8540 FUNTYPE is the data type of the function (as a tree),
8541 or for a library call it is an identifier node for the subroutine name.
8542 SIZE is the number of bytes of arguments passed on the stack.
8544 On the 80386, the RTD insn may be used to pop them if the number
8545 of args is fixed, but if the number is variable then the caller
8546 must pop them all. RTD can't be used for library calls now
8547 because the library is compiled with the Unix compiler.
8548 Use of RTD is a selectable option, since it is incompatible with
8549 standard Unix calling sequences. If the option is not selected,
8550 the caller must always pop the args.
8552 The attribute stdcall is equivalent to RTD on a per-module basis. */
8554 static int
8555 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8557 unsigned int ccvt;
8559 /* None of the 64-bit ABIs pop arguments. */
8560 if (TARGET_64BIT)
8561 return 0;
8563 ccvt = ix86_get_callcvt (funtype);
8565 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8566 | IX86_CALLCVT_THISCALL)) != 0
8567 && ! stdarg_p (funtype))
8568 return size;
8570 /* Lose any fake structure return argument if it is passed on the stack. */
8571 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8572 && !ix86_keep_aggregate_return_pointer (funtype))
8574 int nregs = ix86_function_regparm (funtype, fundecl);
8575 if (nregs == 0)
8576 return GET_MODE_SIZE (Pmode);
8579 return 0;
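/* Illustrative example added for exposition (not part of the original
   source): for a 32-bit stdcall function such as

     int __attribute__ ((stdcall)) sum2 (int a, int b);

   ix86_return_pops_args returns 8, i.e. the callee pops both arguments
   (typically with a "ret $8"), whereas a cdecl or variadic function
   returns 0 and leaves the stack cleanup to the caller.  */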
8582 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8584 static bool
8585 ix86_legitimate_combined_insn (rtx_insn *insn)
8587 int i;
8589 /* Check operand constraints in case hard registers were propagated
8590 into insn pattern. This check prevents combine pass from
8591 generating insn patterns with invalid hard register operands.
8592 These invalid insns can eventually confuse reload to error out
8593 with a spill failure. See also PRs 46829 and 46843. */
8595 gcc_assert (INSN_CODE (insn) >= 0);
8597 extract_insn (insn);
8598 preprocess_constraints (insn);
8600 int n_operands = recog_data.n_operands;
8601 int n_alternatives = recog_data.n_alternatives;
8602 for (i = 0; i < n_operands; i++)
8604 rtx op = recog_data.operand[i];
8605 machine_mode mode = GET_MODE (op);
8606 const operand_alternative *op_alt;
8607 int offset = 0;
8608 bool win;
8609 int j;
8611 /* A unary operator may be accepted by the predicate, but it
8612 is irrelevant for matching constraints. */
8613 if (UNARY_P (op))
8614 op = XEXP (op, 0);
8616 if (SUBREG_P (op))
8618 if (REG_P (SUBREG_REG (op))
8619 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8620 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8621 GET_MODE (SUBREG_REG (op)),
8622 SUBREG_BYTE (op),
8623 GET_MODE (op));
8624 op = SUBREG_REG (op);
8627 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8628 continue;
8630 op_alt = recog_op_alt;
8632 /* Operand has no constraints, anything is OK. */
8633 win = !n_alternatives;
8635 alternative_mask preferred = get_preferred_alternatives (insn);
8636 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8638 if (!TEST_BIT (preferred, j))
8639 continue;
8640 if (op_alt[i].anything_ok
8641 || (op_alt[i].matches != -1
8642 && operands_match_p
8643 (recog_data.operand[i],
8644 recog_data.operand[op_alt[i].matches]))
8645 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8647 win = true;
8648 break;
8652 if (!win)
8653 return false;
8656 return true;
8659 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8661 static unsigned HOST_WIDE_INT
8662 ix86_asan_shadow_offset (void)
8664 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8665 : HOST_WIDE_INT_C (0x7fff8000))
8666 : (HOST_WIDE_INT_1 << 29);
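/* Note added for exposition (a sketch of how the offset above is used,
   not part of the original source): AddressSanitizer maps an address to
   its shadow byte roughly as

     shadow = (addr >> 3) + ix86_asan_shadow_offset ();

   so on LP64 Linux-like targets the constant 0x7fff8000 places the
   shadow region, while 32-bit targets use 1 << 29.  */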
8669 /* Argument support functions. */
8671 /* Return true when register may be used to pass function parameters. */
8672 bool
8673 ix86_function_arg_regno_p (int regno)
8675 int i;
8676 enum calling_abi call_abi;
8677 const int *parm_regs;
8679 if (TARGET_MPX && BND_REGNO_P (regno))
8680 return true;
8682 if (!TARGET_64BIT)
8684 if (TARGET_MACHO)
8685 return (regno < REGPARM_MAX
8686 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8687 else
8688 return (regno < REGPARM_MAX
8689 || (TARGET_MMX && MMX_REGNO_P (regno)
8690 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8691 || (TARGET_SSE && SSE_REGNO_P (regno)
8692 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8695 if (TARGET_SSE && SSE_REGNO_P (regno)
8696 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8697 return true;
8699 /* TODO: The function should depend on current function ABI but
8700 builtins.c would need updating then. Therefore we use the
8701 default ABI. */
8702 call_abi = ix86_cfun_abi ();
8704 /* RAX is used as hidden argument to va_arg functions. */
8705 if (call_abi == SYSV_ABI && regno == AX_REG)
8706 return true;
8708 if (call_abi == MS_ABI)
8709 parm_regs = x86_64_ms_abi_int_parameter_registers;
8710 else
8711 parm_regs = x86_64_int_parameter_registers;
8713 for (i = 0; i < (call_abi == MS_ABI
8714 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8715 if (regno == parm_regs[i])
8716 return true;
8717 return false;
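/* Illustrative summary added for exposition (not part of the original
   source): on 64-bit SYSV targets this accepts RDI, RSI, RDX, RCX, R8
   and R9 (plus RAX, which carries the hidden SSE-register count for
   varargs calls) and the SSE argument registers; the MS ABI accepts its
   four integer argument registers instead.  */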
8720 /* Return true if we do not know how to pass TYPE solely in registers. */
8722 static bool
8723 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8725 if (must_pass_in_stack_var_size_or_pad (mode, type))
8726 return true;
8728 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8729 The layout_type routine is crafty and tries to trick us into passing
8730 currently unsupported vector types on the stack by using TImode. */
8731 return (!TARGET_64BIT && mode == TImode
8732 && type && TREE_CODE (type) != VECTOR_TYPE);
8735 /* Return the size, in bytes, of the area reserved for arguments passed
8736 in registers for the function represented by FNDECL, depending on the
8737 ABI used. */
8739 ix86_reg_parm_stack_space (const_tree fndecl)
8741 enum calling_abi call_abi = SYSV_ABI;
8742 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8743 call_abi = ix86_function_abi (fndecl);
8744 else
8745 call_abi = ix86_function_type_abi (fndecl);
8746 if (TARGET_64BIT && call_abi == MS_ABI)
8747 return 32;
8748 return 0;
8751 /* We add this as a workaround in order to use libc_has_function
8752 hook in i386.md. */
8753 bool
8754 ix86_libc_has_function (enum function_class fn_class)
8756 return targetm.libc_has_function (fn_class);
8759 /* Return SYSV_ABI or MS_ABI depending on FNTYPE,
8760 specifying the call ABI used. */
8761 enum calling_abi
8762 ix86_function_type_abi (const_tree fntype)
8764 enum calling_abi abi = ix86_abi;
8766 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8767 return abi;
8769 if (abi == SYSV_ABI
8770 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8772 static int warned;
8773 if (TARGET_X32 && !warned)
8775 error ("X32 does not support ms_abi attribute");
8776 warned = 1;
8779 abi = MS_ABI;
8781 else if (abi == MS_ABI
8782 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8783 abi = SYSV_ABI;
8785 return abi;
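/* Illustrative example added for exposition (not part of the original
   source): on a SYSV-default target,

     int __attribute__ ((ms_abi)) win_cb (void *ctx);

   is reported as MS_ABI by the function above, and a sysv_abi attribute
   performs the opposite switch on an MS-default target; X32 rejects
   ms_abi with the error shown.  */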
8788 static enum calling_abi
8789 ix86_function_abi (const_tree fndecl)
8791 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8794 /* Return SYSV_ABI or MS_ABI depending on cfun,
8795 specifying the call ABI used. */
8796 enum calling_abi
8797 ix86_cfun_abi (void)
8799 return cfun ? cfun->machine->call_abi : ix86_abi;
8802 static bool
8803 ix86_function_ms_hook_prologue (const_tree fn)
8805 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8807 if (decl_function_context (fn) != NULL_TREE)
8808 error_at (DECL_SOURCE_LOCATION (fn),
8809 "ms_hook_prologue is not compatible with nested function");
8810 else
8811 return true;
8813 return false;
8816 static bool
8817 ix86_function_naked (const_tree fn)
8819 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
8820 return true;
8822 return false;
8825 /* Write the extra assembler code needed to declare a function properly. */
8827 void
8828 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8829 tree decl)
8831 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8833 if (is_ms_hook)
8835 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8836 unsigned int filler_cc = 0xcccccccc;
8838 for (i = 0; i < filler_count; i += 4)
8839 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8842 #ifdef SUBTARGET_ASM_UNWIND_INIT
8843 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8844 #endif
8846 ASM_OUTPUT_LABEL (asm_out_file, fname);
8848 /* Output magic byte marker, if hot-patch attribute is set. */
8849 if (is_ms_hook)
8851 if (TARGET_64BIT)
8853 /* leaq [%rsp + 0], %rsp */
8854 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
8855 asm_out_file);
8857 else
8859 /* movl.s %edi, %edi
8860 push %ebp
8861 movl.s %esp, %ebp */
8862 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
8867 /* Implementation of the call ABI switching target hook. The call
8868 register sets specific to FNDECL are set up here. See also
8869 ix86_conditional_register_usage for more details. */
8870 void
8871 ix86_call_abi_override (const_tree fndecl)
8873 cfun->machine->call_abi = ix86_function_abi (fndecl);
8876 /* Return true if a pseudo register should be created and used to hold
8877 the GOT address for PIC code. */
8878 bool
8879 ix86_use_pseudo_pic_reg (void)
8881 if ((TARGET_64BIT
8882 && (ix86_cmodel == CM_SMALL_PIC
8883 || TARGET_PECOFF))
8884 || !flag_pic)
8885 return false;
8886 return true;
8889 /* Initialize large model PIC register. */
8891 static void
8892 ix86_init_large_pic_reg (unsigned int tmp_regno)
8894 rtx_code_label *label;
8895 rtx tmp_reg;
8897 gcc_assert (Pmode == DImode);
8898 label = gen_label_rtx ();
8899 emit_label (label);
8900 LABEL_PRESERVE_P (label) = 1;
8901 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8902 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8903 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8904 label));
8905 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8906 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8907 pic_offset_table_rtx, tmp_reg));
8908 const char *name = LABEL_NAME (label);
8909 PUT_CODE (label, NOTE);
8910 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
8911 NOTE_DELETED_LABEL_NAME (label) = name;
8914 /* Create and initialize PIC register if required. */
8915 static void
8916 ix86_init_pic_reg (void)
8918 edge entry_edge;
8919 rtx_insn *seq;
8921 if (!ix86_use_pseudo_pic_reg ())
8922 return;
8924 start_sequence ();
8926 if (TARGET_64BIT)
8928 if (ix86_cmodel == CM_LARGE_PIC)
8929 ix86_init_large_pic_reg (R11_REG);
8930 else
8931 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8933 else
8935 /* If there is a future mcount call in the function, it is more profitable
8936 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8937 rtx reg = crtl->profile
8938 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8939 : pic_offset_table_rtx;
8940 rtx_insn *insn = emit_insn (gen_set_got (reg));
8941 RTX_FRAME_RELATED_P (insn) = 1;
8942 if (crtl->profile)
8943 emit_move_insn (pic_offset_table_rtx, reg);
8944 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8947 seq = get_insns ();
8948 end_sequence ();
8950 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8951 insert_insn_on_edge (seq, entry_edge);
8952 commit_one_edge_insertion (entry_edge);
8955 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8956 for a call to a function whose data type is FNTYPE.
8957 For a library call, FNTYPE is 0. */
8959 void
8960 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8961 tree fntype, /* tree ptr for function decl */
8962 rtx libname, /* SYMBOL_REF of library name or 0 */
8963 tree fndecl,
8964 int caller)
8966 struct cgraph_local_info *i = NULL;
8967 struct cgraph_node *target = NULL;
8969 memset (cum, 0, sizeof (*cum));
8971 if (fndecl)
8973 target = cgraph_node::get (fndecl);
8974 if (target)
8976 target = target->function_symbol ();
8977 i = cgraph_node::local_info (target->decl);
8978 cum->call_abi = ix86_function_abi (target->decl);
8980 else
8981 cum->call_abi = ix86_function_abi (fndecl);
8983 else
8984 cum->call_abi = ix86_function_type_abi (fntype);
8986 cum->caller = caller;
8988 /* Set up the number of registers to use for passing arguments. */
8989 cum->nregs = ix86_regparm;
8990 if (TARGET_64BIT)
8992 cum->nregs = (cum->call_abi == SYSV_ABI
8993 ? X86_64_REGPARM_MAX
8994 : X86_64_MS_REGPARM_MAX);
8996 if (TARGET_SSE)
8998 cum->sse_nregs = SSE_REGPARM_MAX;
8999 if (TARGET_64BIT)
9001 cum->sse_nregs = (cum->call_abi == SYSV_ABI
9002 ? X86_64_SSE_REGPARM_MAX
9003 : X86_64_MS_SSE_REGPARM_MAX);
9006 if (TARGET_MMX)
9007 cum->mmx_nregs = MMX_REGPARM_MAX;
9008 cum->warn_avx512f = true;
9009 cum->warn_avx = true;
9010 cum->warn_sse = true;
9011 cum->warn_mmx = true;
9013 /* Because the type might mismatch between caller and callee, we need to
9014 use the actual type of the function for local calls.
9015 FIXME: cgraph_analyze can be told to actually record whether a function
9016 uses va_start, so for local functions maybe_vaarg could be made more
9017 aggressive, helping K&R code.
9018 FIXME: once the type system is fixed, we won't need this code anymore. */
9019 if (i && i->local && i->can_change_signature)
9020 fntype = TREE_TYPE (target->decl);
9021 cum->stdarg = stdarg_p (fntype);
9022 cum->maybe_vaarg = (fntype
9023 ? (!prototype_p (fntype) || stdarg_p (fntype))
9024 : !libname);
9026 cum->bnd_regno = FIRST_BND_REG;
9027 cum->bnds_in_bt = 0;
9028 cum->force_bnd_pass = 0;
9029 cum->decl = fndecl;
9031 if (!TARGET_64BIT)
9033 /* If there are variable arguments, then we won't pass anything
9034 in registers in 32-bit mode. */
9035 if (stdarg_p (fntype))
9037 cum->nregs = 0;
9038 /* Since in 32-bit mode variable arguments are always passed on
9039 the stack, there is a scratch register available for an
9040 indirect sibcall. */
9041 cfun->machine->arg_reg_available = true;
9042 cum->sse_nregs = 0;
9043 cum->mmx_nregs = 0;
9044 cum->warn_avx512f = false;
9045 cum->warn_avx = false;
9046 cum->warn_sse = false;
9047 cum->warn_mmx = false;
9048 return;
9051 /* Use ecx and edx registers if function has fastcall attribute,
9052 else look for regparm information. */
9053 if (fntype)
9055 unsigned int ccvt = ix86_get_callcvt (fntype);
9056 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
9058 cum->nregs = 1;
9059 cum->fastcall = 1; /* Same first register as in fastcall. */
9061 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
9063 cum->nregs = 2;
9064 cum->fastcall = 1;
9066 else
9067 cum->nregs = ix86_function_regparm (fntype, fndecl);
9070 /* Set up the number of SSE registers used for passing SFmode
9071 and DFmode arguments. Warn for mismatching ABI. */
9072 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
9075 cfun->machine->arg_reg_available = (cum->nregs > 0);
9078 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
9079 But in the case of vector types, it is some vector mode.
9081 When we have only some of our vector isa extensions enabled, then there
9082 are some modes for which vector_mode_supported_p is false. For these
9083 modes, the generic vector support in gcc will choose some non-vector mode
9084 in order to implement the type. By computing the natural mode, we'll
9085 select the proper ABI location for the operand and not depend on whatever
9086 the middle-end decides to do with these vector types.
9088 The middle-end can't deal with vector types larger than 16 bytes. In
9089 this case, we return the original mode and warn about the ABI change if
9090 CUM isn't NULL.
9092 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
9093 available for the function return value. */
9095 static machine_mode
9096 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9097 bool in_return)
9099 machine_mode mode = TYPE_MODE (type);
9101 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9103 HOST_WIDE_INT size = int_size_in_bytes (type);
9104 if ((size == 8 || size == 16 || size == 32 || size == 64)
9105 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9106 && TYPE_VECTOR_SUBPARTS (type) > 1)
9108 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9110 /* There are no XFmode vector modes. */
9111 if (innermode == XFmode)
9112 return mode;
9114 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9115 mode = MIN_MODE_VECTOR_FLOAT;
9116 else
9117 mode = MIN_MODE_VECTOR_INT;
9119 /* Get the mode which has this inner mode and number of units. */
9120 FOR_EACH_MODE_FROM (mode, mode)
9121 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9122 && GET_MODE_INNER (mode) == innermode)
9124 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9126 static bool warnedavx512f;
9127 static bool warnedavx512f_ret;
9129 if (cum && cum->warn_avx512f && !warnedavx512f)
9131 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9132 "without AVX512F enabled changes the ABI"))
9133 warnedavx512f = true;
9135 else if (in_return && !warnedavx512f_ret)
9137 if (warning (OPT_Wpsabi, "AVX512F vector return "
9138 "without AVX512F enabled changes the ABI"))
9139 warnedavx512f_ret = true;
9142 return TYPE_MODE (type);
9144 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9146 static bool warnedavx;
9147 static bool warnedavx_ret;
9149 if (cum && cum->warn_avx && !warnedavx)
9151 if (warning (OPT_Wpsabi, "AVX vector argument "
9152 "without AVX enabled changes the ABI"))
9153 warnedavx = true;
9155 else if (in_return && !warnedavx_ret)
9157 if (warning (OPT_Wpsabi, "AVX vector return "
9158 "without AVX enabled changes the ABI"))
9159 warnedavx_ret = true;
9162 return TYPE_MODE (type);
9164 else if (((size == 8 && TARGET_64BIT) || size == 16)
9165 && !TARGET_SSE
9166 && !TARGET_IAMCU)
9168 static bool warnedsse;
9169 static bool warnedsse_ret;
9171 if (cum && cum->warn_sse && !warnedsse)
9173 if (warning (OPT_Wpsabi, "SSE vector argument "
9174 "without SSE enabled changes the ABI"))
9175 warnedsse = true;
9177 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9179 if (warning (OPT_Wpsabi, "SSE vector return "
9180 "without SSE enabled changes the ABI"))
9181 warnedsse_ret = true;
9184 else if ((size == 8 && !TARGET_64BIT)
9185 && (!cfun
9186 || cfun->machine->func_type == TYPE_NORMAL)
9187 && !TARGET_MMX
9188 && !TARGET_IAMCU)
9190 static bool warnedmmx;
9191 static bool warnedmmx_ret;
9193 if (cum && cum->warn_mmx && !warnedmmx)
9195 if (warning (OPT_Wpsabi, "MMX vector argument "
9196 "without MMX enabled changes the ABI"))
9197 warnedmmx = true;
9199 else if (in_return && !warnedmmx_ret)
9201 if (warning (OPT_Wpsabi, "MMX vector return "
9202 "without MMX enabled changes the ABI"))
9203 warnedmmx_ret = true;
9206 return mode;
9209 gcc_unreachable ();
9213 return mode;
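/* Illustrative example added for exposition (not part of the original
   source): for a generic vector type such as

     typedef float v4sf __attribute__ ((vector_size (16)));

   type_natural_mode returns V4SFmode even when SSE is disabled, so the
   ABI location is chosen consistently; on a 32-bit -mno-sse compile the
   first use of such an argument also triggers the -Wpsabi note above
   about the ABI changing without SSE.  */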
9216 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9217 this may not agree with the mode that the type system has chosen for the
9218 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9219 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9221 static rtx
9222 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9223 unsigned int regno)
9225 rtx tmp;
9227 if (orig_mode != BLKmode)
9228 tmp = gen_rtx_REG (orig_mode, regno);
9229 else
9231 tmp = gen_rtx_REG (mode, regno);
9232 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9233 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9236 return tmp;
9239 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
9240 of this code is to classify each 8bytes of incoming argument by the register
9241 class and assign registers accordingly. */
9243 /* Return the union class of CLASS1 and CLASS2.
9244 See the x86-64 PS ABI for details. */
9246 static enum x86_64_reg_class
9247 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9249 /* Rule #1: If both classes are equal, this is the resulting class. */
9250 if (class1 == class2)
9251 return class1;
9253 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9254 the other class. */
9255 if (class1 == X86_64_NO_CLASS)
9256 return class2;
9257 if (class2 == X86_64_NO_CLASS)
9258 return class1;
9260 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9261 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9262 return X86_64_MEMORY_CLASS;
9264 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9265 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9266 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9267 return X86_64_INTEGERSI_CLASS;
9268 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9269 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9270 return X86_64_INTEGER_CLASS;
9272 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9273 MEMORY is used. */
9274 if (class1 == X86_64_X87_CLASS
9275 || class1 == X86_64_X87UP_CLASS
9276 || class1 == X86_64_COMPLEX_X87_CLASS
9277 || class2 == X86_64_X87_CLASS
9278 || class2 == X86_64_X87UP_CLASS
9279 || class2 == X86_64_COMPLEX_X87_CLASS)
9280 return X86_64_MEMORY_CLASS;
9282 /* Rule #6: Otherwise class SSE is used. */
9283 return X86_64_SSE_CLASS;
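/* Worked examples added for exposition (not part of the original
   source), following the rules above:

     merge_classes (X86_64_NO_CLASS, X86_64_SSESF_CLASS)        -> SSESF
     merge_classes (X86_64_INTEGERSI_CLASS, X86_64_SSESF_CLASS) -> INTEGERSI
     merge_classes (X86_64_SSE_CLASS, X86_64_X87_CLASS)         -> MEMORY
     merge_classes (X86_64_SSEDF_CLASS, X86_64_SSESF_CLASS)     -> SSE  */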
9286 /* Classify the argument of type TYPE and mode MODE.
9287 CLASSES will be filled by the register class used to pass each word
9288 of the operand. The number of words is returned. In case the parameter
9289 should be passed in memory, 0 is returned. As a special case for zero
9290 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9292 BIT_OFFSET is used internally for handling records and specifies the
9293 offset in bits modulo 512 to avoid overflow cases.
9295 See the x86-64 PS ABI for details.
9298 static int
9299 classify_argument (machine_mode mode, const_tree type,
9300 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9302 HOST_WIDE_INT bytes =
9303 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9304 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9306 /* Variable sized entities are always passed/returned in memory. */
9307 if (bytes < 0)
9308 return 0;
9310 if (mode != VOIDmode
9311 && targetm.calls.must_pass_in_stack (mode, type))
9312 return 0;
9314 if (type && AGGREGATE_TYPE_P (type))
9316 int i;
9317 tree field;
9318 enum x86_64_reg_class subclasses[MAX_CLASSES];
9320 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9321 if (bytes > 64)
9322 return 0;
9324 for (i = 0; i < words; i++)
9325 classes[i] = X86_64_NO_CLASS;
9327 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
9328 signal the memory class, so handle it as a special case. */
9329 if (!words)
9331 classes[0] = X86_64_NO_CLASS;
9332 return 1;
9335 /* Classify each field of record and merge classes. */
9336 switch (TREE_CODE (type))
9338 case RECORD_TYPE:
9339 /* And now merge the fields of structure. */
9340 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9342 if (TREE_CODE (field) == FIELD_DECL)
9344 int num;
9346 if (TREE_TYPE (field) == error_mark_node)
9347 continue;
9349 /* Bitfields are always classified as integer. Handle them
9350 early, since later code would consider them to be
9351 misaligned integers. */
9352 if (DECL_BIT_FIELD (field))
9354 for (i = (int_bit_position (field)
9355 + (bit_offset % 64)) / 8 / 8;
9356 i < ((int_bit_position (field) + (bit_offset % 64))
9357 + tree_to_shwi (DECL_SIZE (field))
9358 + 63) / 8 / 8; i++)
9359 classes[i] =
9360 merge_classes (X86_64_INTEGER_CLASS,
9361 classes[i]);
9363 else
9365 int pos;
9367 type = TREE_TYPE (field);
9369 /* Flexible array member is ignored. */
9370 if (TYPE_MODE (type) == BLKmode
9371 && TREE_CODE (type) == ARRAY_TYPE
9372 && TYPE_SIZE (type) == NULL_TREE
9373 && TYPE_DOMAIN (type) != NULL_TREE
9374 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9375 == NULL_TREE))
9377 static bool warned;
9379 if (!warned && warn_psabi)
9381 warned = true;
9382 inform (input_location,
9383 "the ABI of passing struct with"
9384 " a flexible array member has"
9385 " changed in GCC 4.4");
9387 continue;
9389 num = classify_argument (TYPE_MODE (type), type,
9390 subclasses,
9391 (int_bit_position (field)
9392 + bit_offset) % 512);
9393 if (!num)
9394 return 0;
9395 pos = (int_bit_position (field)
9396 + (bit_offset % 64)) / 8 / 8;
9397 for (i = 0; i < num && (i + pos) < words; i++)
9398 classes[i + pos] =
9399 merge_classes (subclasses[i], classes[i + pos]);
9403 break;
9405 case ARRAY_TYPE:
9406 /* Arrays are handled as small records. */
9408 int num;
9409 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9410 TREE_TYPE (type), subclasses, bit_offset);
9411 if (!num)
9412 return 0;
9414 /* The partial classes are now full classes. */
9415 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9416 subclasses[0] = X86_64_SSE_CLASS;
9417 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9418 && !((bit_offset % 64) == 0 && bytes == 4))
9419 subclasses[0] = X86_64_INTEGER_CLASS;
9421 for (i = 0; i < words; i++)
9422 classes[i] = subclasses[i % num];
9424 break;
9426 case UNION_TYPE:
9427 case QUAL_UNION_TYPE:
9428 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
9430 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9432 if (TREE_CODE (field) == FIELD_DECL)
9434 int num;
9436 if (TREE_TYPE (field) == error_mark_node)
9437 continue;
9439 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9440 TREE_TYPE (field), subclasses,
9441 bit_offset);
9442 if (!num)
9443 return 0;
9444 for (i = 0; i < num && i < words; i++)
9445 classes[i] = merge_classes (subclasses[i], classes[i]);
9448 break;
9450 default:
9451 gcc_unreachable ();
9454 if (words > 2)
9456 /* When the size is > 16 bytes, if the first class isn't
9457 X86_64_SSE_CLASS or any of the remaining classes isn't
9458 X86_64_SSEUP_CLASS, everything should be passed in
9459 memory. */
9460 if (classes[0] != X86_64_SSE_CLASS)
9461 return 0;
9463 for (i = 1; i < words; i++)
9464 if (classes[i] != X86_64_SSEUP_CLASS)
9465 return 0;
9468 /* Final merger cleanup. */
9469 for (i = 0; i < words; i++)
9471 /* If one class is MEMORY, everything should be passed in
9472 memory. */
9473 if (classes[i] == X86_64_MEMORY_CLASS)
9474 return 0;
9476 /* X86_64_SSEUP_CLASS should always be preceded by
9477 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9478 if (classes[i] == X86_64_SSEUP_CLASS
9479 && classes[i - 1] != X86_64_SSE_CLASS
9480 && classes[i - 1] != X86_64_SSEUP_CLASS)
9482 /* The first one should never be X86_64_SSEUP_CLASS. */
9483 gcc_assert (i != 0);
9484 classes[i] = X86_64_SSE_CLASS;
9487 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9488 everything should be passed in memory. */
9489 if (classes[i] == X86_64_X87UP_CLASS
9490 && (classes[i - 1] != X86_64_X87_CLASS))
9492 static bool warned;
9494 /* The first one should never be X86_64_X87UP_CLASS. */
9495 gcc_assert (i != 0);
9496 if (!warned && warn_psabi)
9498 warned = true;
9499 inform (input_location,
9500 "the ABI of passing union with long double"
9501 " has changed in GCC 4.4");
9503 return 0;
9506 return words;
9509 /* Compute the alignment needed. We align all types to natural boundaries
9510 with the exception of XFmode, which is aligned to 64 bits. */
9511 if (mode != VOIDmode && mode != BLKmode)
9513 int mode_alignment = GET_MODE_BITSIZE (mode);
9515 if (mode == XFmode)
9516 mode_alignment = 128;
9517 else if (mode == XCmode)
9518 mode_alignment = 256;
9519 if (COMPLEX_MODE_P (mode))
9520 mode_alignment /= 2;
9521 /* Misaligned fields are always returned in memory. */
9522 if (bit_offset % mode_alignment)
9523 return 0;
9526 /* for V1xx modes, just use the base mode */
9527 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9528 && GET_MODE_UNIT_SIZE (mode) == bytes)
9529 mode = GET_MODE_INNER (mode);
9531 /* Classification of atomic types. */
9532 switch (mode)
9534 case E_SDmode:
9535 case E_DDmode:
9536 classes[0] = X86_64_SSE_CLASS;
9537 return 1;
9538 case E_TDmode:
9539 classes[0] = X86_64_SSE_CLASS;
9540 classes[1] = X86_64_SSEUP_CLASS;
9541 return 2;
9542 case E_DImode:
9543 case E_SImode:
9544 case E_HImode:
9545 case E_QImode:
9546 case E_CSImode:
9547 case E_CHImode:
9548 case E_CQImode:
9550 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9552 /* Analyze last 128 bits only. */
9553 size = (size - 1) & 0x7f;
9555 if (size < 32)
9557 classes[0] = X86_64_INTEGERSI_CLASS;
9558 return 1;
9560 else if (size < 64)
9562 classes[0] = X86_64_INTEGER_CLASS;
9563 return 1;
9565 else if (size < 64+32)
9567 classes[0] = X86_64_INTEGER_CLASS;
9568 classes[1] = X86_64_INTEGERSI_CLASS;
9569 return 2;
9571 else if (size < 64+64)
9573 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9574 return 2;
9576 else
9577 gcc_unreachable ();
9579 case E_CDImode:
9580 case E_TImode:
9581 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9582 return 2;
9583 case E_COImode:
9584 case E_OImode:
9585 /* OImode shouldn't be used directly. */
9586 gcc_unreachable ();
9587 case E_CTImode:
9588 return 0;
9589 case E_SFmode:
9590 if (!(bit_offset % 64))
9591 classes[0] = X86_64_SSESF_CLASS;
9592 else
9593 classes[0] = X86_64_SSE_CLASS;
9594 return 1;
9595 case E_DFmode:
9596 classes[0] = X86_64_SSEDF_CLASS;
9597 return 1;
9598 case E_XFmode:
9599 classes[0] = X86_64_X87_CLASS;
9600 classes[1] = X86_64_X87UP_CLASS;
9601 return 2;
9602 case E_TFmode:
9603 classes[0] = X86_64_SSE_CLASS;
9604 classes[1] = X86_64_SSEUP_CLASS;
9605 return 2;
9606 case E_SCmode:
9607 classes[0] = X86_64_SSE_CLASS;
9608 if (!(bit_offset % 64))
9609 return 1;
9610 else
9612 static bool warned;
9614 if (!warned && warn_psabi)
9616 warned = true;
9617 inform (input_location,
9618 "the ABI of passing structure with complex float"
9619 " member has changed in GCC 4.4");
9621 classes[1] = X86_64_SSESF_CLASS;
9622 return 2;
9624 case E_DCmode:
9625 classes[0] = X86_64_SSEDF_CLASS;
9626 classes[1] = X86_64_SSEDF_CLASS;
9627 return 2;
9628 case E_XCmode:
9629 classes[0] = X86_64_COMPLEX_X87_CLASS;
9630 return 1;
9631 case E_TCmode:
9632 /* This mode is larger than 16 bytes. */
9633 return 0;
9634 case E_V8SFmode:
9635 case E_V8SImode:
9636 case E_V32QImode:
9637 case E_V16HImode:
9638 case E_V4DFmode:
9639 case E_V4DImode:
9640 classes[0] = X86_64_SSE_CLASS;
9641 classes[1] = X86_64_SSEUP_CLASS;
9642 classes[2] = X86_64_SSEUP_CLASS;
9643 classes[3] = X86_64_SSEUP_CLASS;
9644 return 4;
9645 case E_V8DFmode:
9646 case E_V16SFmode:
9647 case E_V8DImode:
9648 case E_V16SImode:
9649 case E_V32HImode:
9650 case E_V64QImode:
9651 classes[0] = X86_64_SSE_CLASS;
9652 classes[1] = X86_64_SSEUP_CLASS;
9653 classes[2] = X86_64_SSEUP_CLASS;
9654 classes[3] = X86_64_SSEUP_CLASS;
9655 classes[4] = X86_64_SSEUP_CLASS;
9656 classes[5] = X86_64_SSEUP_CLASS;
9657 classes[6] = X86_64_SSEUP_CLASS;
9658 classes[7] = X86_64_SSEUP_CLASS;
9659 return 8;
9660 case E_V4SFmode:
9661 case E_V4SImode:
9662 case E_V16QImode:
9663 case E_V8HImode:
9664 case E_V2DFmode:
9665 case E_V2DImode:
9666 classes[0] = X86_64_SSE_CLASS;
9667 classes[1] = X86_64_SSEUP_CLASS;
9668 return 2;
9669 case E_V1TImode:
9670 case E_V1DImode:
9671 case E_V2SFmode:
9672 case E_V2SImode:
9673 case E_V4HImode:
9674 case E_V8QImode:
9675 classes[0] = X86_64_SSE_CLASS;
9676 return 1;
9677 case E_BLKmode:
9678 case E_VOIDmode:
9679 return 0;
9680 default:
9681 gcc_assert (VECTOR_MODE_P (mode));
9683 if (bytes > 16)
9684 return 0;
9686 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9688 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9689 classes[0] = X86_64_INTEGERSI_CLASS;
9690 else
9691 classes[0] = X86_64_INTEGER_CLASS;
9692 classes[1] = X86_64_INTEGER_CLASS;
9693 return 1 + (bytes > 8);
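/* Worked example added for exposition (not part of the original
   source): under the rules above a 16-byte aggregate such as

     struct pair { double d; long i; };

   classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }, so it can
   be passed in one SSE register plus one integer register, whereas

     struct ld { long double x; };

   classifies as { X86_64_X87_CLASS, X86_64_X87UP_CLASS } and is
   therefore passed in memory when used as an argument (x87 classes are
   only allowed for return values).  */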
9697 /* Examine the argument and set the number of registers required in each
9698 class. Return true iff the parameter should be passed in memory. */
9700 static bool
9701 examine_argument (machine_mode mode, const_tree type, int in_return,
9702 int *int_nregs, int *sse_nregs)
9704 enum x86_64_reg_class regclass[MAX_CLASSES];
9705 int n = classify_argument (mode, type, regclass, 0);
9707 *int_nregs = 0;
9708 *sse_nregs = 0;
9710 if (!n)
9711 return true;
9712 for (n--; n >= 0; n--)
9713 switch (regclass[n])
9715 case X86_64_INTEGER_CLASS:
9716 case X86_64_INTEGERSI_CLASS:
9717 (*int_nregs)++;
9718 break;
9719 case X86_64_SSE_CLASS:
9720 case X86_64_SSESF_CLASS:
9721 case X86_64_SSEDF_CLASS:
9722 (*sse_nregs)++;
9723 break;
9724 case X86_64_NO_CLASS:
9725 case X86_64_SSEUP_CLASS:
9726 break;
9727 case X86_64_X87_CLASS:
9728 case X86_64_X87UP_CLASS:
9729 case X86_64_COMPLEX_X87_CLASS:
9730 if (!in_return)
9731 return true;
9732 break;
9733 case X86_64_MEMORY_CLASS:
9734 gcc_unreachable ();
9737 return false;
9740 /* Construct container for the argument used by GCC interface. See
9741 FUNCTION_ARG for the detailed description. */
9743 static rtx
9744 construct_container (machine_mode mode, machine_mode orig_mode,
9745 const_tree type, int in_return, int nintregs, int nsseregs,
9746 const int *intreg, int sse_regno)
9748 /* The following variables hold the static issued_error state. */
9749 static bool issued_sse_arg_error;
9750 static bool issued_sse_ret_error;
9751 static bool issued_x87_ret_error;
9753 machine_mode tmpmode;
9754 int bytes =
9755 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9756 enum x86_64_reg_class regclass[MAX_CLASSES];
9757 int n;
9758 int i;
9759 int nexps = 0;
9760 int needed_sseregs, needed_intregs;
9761 rtx exp[MAX_CLASSES];
9762 rtx ret;
9764 n = classify_argument (mode, type, regclass, 0);
9765 if (!n)
9766 return NULL;
9767 if (examine_argument (mode, type, in_return, &needed_intregs,
9768 &needed_sseregs))
9769 return NULL;
9770 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9771 return NULL;
9773 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9774 some less clueful developer tries to use floating-point anyway. */
9775 if (needed_sseregs && !TARGET_SSE)
9777 if (in_return)
9779 if (!issued_sse_ret_error)
9781 error ("SSE register return with SSE disabled");
9782 issued_sse_ret_error = true;
9785 else if (!issued_sse_arg_error)
9787 error ("SSE register argument with SSE disabled");
9788 issued_sse_arg_error = true;
9790 return NULL;
9793 /* Likewise, error if the ABI requires us to return values in the
9794 x87 registers and the user specified -mno-80387. */
9795 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9796 for (i = 0; i < n; i++)
9797 if (regclass[i] == X86_64_X87_CLASS
9798 || regclass[i] == X86_64_X87UP_CLASS
9799 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9801 if (!issued_x87_ret_error)
9803 error ("x87 register return with x87 disabled");
9804 issued_x87_ret_error = true;
9806 return NULL;
9809 /* First construct simple cases. Avoid SCmode, since we want to use
9810 single register to pass this type. */
9811 if (n == 1 && mode != SCmode)
9812 switch (regclass[0])
9814 case X86_64_INTEGER_CLASS:
9815 case X86_64_INTEGERSI_CLASS:
9816 return gen_rtx_REG (mode, intreg[0]);
9817 case X86_64_SSE_CLASS:
9818 case X86_64_SSESF_CLASS:
9819 case X86_64_SSEDF_CLASS:
9820 if (mode != BLKmode)
9821 return gen_reg_or_parallel (mode, orig_mode,
9822 SSE_REGNO (sse_regno));
9823 break;
9824 case X86_64_X87_CLASS:
9825 case X86_64_COMPLEX_X87_CLASS:
9826 return gen_rtx_REG (mode, FIRST_STACK_REG);
9827 case X86_64_NO_CLASS:
9828 /* Zero sized array, struct or class. */
9829 return NULL;
9830 default:
9831 gcc_unreachable ();
9833 if (n == 2
9834 && regclass[0] == X86_64_SSE_CLASS
9835 && regclass[1] == X86_64_SSEUP_CLASS
9836 && mode != BLKmode)
9837 return gen_reg_or_parallel (mode, orig_mode,
9838 SSE_REGNO (sse_regno));
9839 if (n == 4
9840 && regclass[0] == X86_64_SSE_CLASS
9841 && regclass[1] == X86_64_SSEUP_CLASS
9842 && regclass[2] == X86_64_SSEUP_CLASS
9843 && regclass[3] == X86_64_SSEUP_CLASS
9844 && mode != BLKmode)
9845 return gen_reg_or_parallel (mode, orig_mode,
9846 SSE_REGNO (sse_regno));
9847 if (n == 8
9848 && regclass[0] == X86_64_SSE_CLASS
9849 && regclass[1] == X86_64_SSEUP_CLASS
9850 && regclass[2] == X86_64_SSEUP_CLASS
9851 && regclass[3] == X86_64_SSEUP_CLASS
9852 && regclass[4] == X86_64_SSEUP_CLASS
9853 && regclass[5] == X86_64_SSEUP_CLASS
9854 && regclass[6] == X86_64_SSEUP_CLASS
9855 && regclass[7] == X86_64_SSEUP_CLASS
9856 && mode != BLKmode)
9857 return gen_reg_or_parallel (mode, orig_mode,
9858 SSE_REGNO (sse_regno));
9859 if (n == 2
9860 && regclass[0] == X86_64_X87_CLASS
9861 && regclass[1] == X86_64_X87UP_CLASS)
9862 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9864 if (n == 2
9865 && regclass[0] == X86_64_INTEGER_CLASS
9866 && regclass[1] == X86_64_INTEGER_CLASS
9867 && (mode == CDImode || mode == TImode)
9868 && intreg[0] + 1 == intreg[1])
9869 return gen_rtx_REG (mode, intreg[0]);
9871 /* Otherwise figure out the entries of the PARALLEL. */
9872 for (i = 0; i < n; i++)
9874 int pos;
9876 switch (regclass[i])
9878 case X86_64_NO_CLASS:
9879 break;
9880 case X86_64_INTEGER_CLASS:
9881 case X86_64_INTEGERSI_CLASS:
9882 /* Merge TImodes on aligned occasions here too. */
9883 if (i * 8 + 8 > bytes)
9885 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
9886 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
9887 /* We've requested 24 bytes for which
9888 we don't have a mode. Use DImode. */
9889 tmpmode = DImode;
9891 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9892 tmpmode = SImode;
9893 else
9894 tmpmode = DImode;
9895 exp [nexps++]
9896 = gen_rtx_EXPR_LIST (VOIDmode,
9897 gen_rtx_REG (tmpmode, *intreg),
9898 GEN_INT (i*8));
9899 intreg++;
9900 break;
9901 case X86_64_SSESF_CLASS:
9902 exp [nexps++]
9903 = gen_rtx_EXPR_LIST (VOIDmode,
9904 gen_rtx_REG (SFmode,
9905 SSE_REGNO (sse_regno)),
9906 GEN_INT (i*8));
9907 sse_regno++;
9908 break;
9909 case X86_64_SSEDF_CLASS:
9910 exp [nexps++]
9911 = gen_rtx_EXPR_LIST (VOIDmode,
9912 gen_rtx_REG (DFmode,
9913 SSE_REGNO (sse_regno)),
9914 GEN_INT (i*8));
9915 sse_regno++;
9916 break;
9917 case X86_64_SSE_CLASS:
9918 pos = i;
9919 switch (n)
9921 case 1:
9922 tmpmode = DImode;
9923 break;
9924 case 2:
9925 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9927 tmpmode = TImode;
9928 i++;
9930 else
9931 tmpmode = DImode;
9932 break;
9933 case 4:
9934 gcc_assert (i == 0
9935 && regclass[1] == X86_64_SSEUP_CLASS
9936 && regclass[2] == X86_64_SSEUP_CLASS
9937 && regclass[3] == X86_64_SSEUP_CLASS);
9938 tmpmode = OImode;
9939 i += 3;
9940 break;
9941 case 8:
9942 gcc_assert (i == 0
9943 && regclass[1] == X86_64_SSEUP_CLASS
9944 && regclass[2] == X86_64_SSEUP_CLASS
9945 && regclass[3] == X86_64_SSEUP_CLASS
9946 && regclass[4] == X86_64_SSEUP_CLASS
9947 && regclass[5] == X86_64_SSEUP_CLASS
9948 && regclass[6] == X86_64_SSEUP_CLASS
9949 && regclass[7] == X86_64_SSEUP_CLASS);
9950 tmpmode = XImode;
9951 i += 7;
9952 break;
9953 default:
9954 gcc_unreachable ();
9956 exp [nexps++]
9957 = gen_rtx_EXPR_LIST (VOIDmode,
9958 gen_rtx_REG (tmpmode,
9959 SSE_REGNO (sse_regno)),
9960 GEN_INT (pos*8));
9961 sse_regno++;
9962 break;
9963 default:
9964 gcc_unreachable ();
9968 /* Empty aligned struct, union or class. */
9969 if (nexps == 0)
9970 return NULL;
9972 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9973 for (i = 0; i < nexps; i++)
9974 XVECEXP (ret, 0, i) = exp [i];
9975 return ret;
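/* Illustrative sketch added for exposition (not part of the original
   source): for the struct pair example used earlier (classes SSEDF and
   INTEGER), construct_container builds roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte travels in an SSE register and the second in
   the next free integer argument register; the exact registers depend on
   how many arguments have already been allocated.  */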
9978 /* Update the data in CUM to advance over an argument of mode MODE
9979 and data type TYPE. (TYPE is null for libcalls where that information
9980 may not be available.)
9982 Return the number of integer registers advanced over. */
9984 static int
9985 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9986 const_tree type, HOST_WIDE_INT bytes,
9987 HOST_WIDE_INT words)
9989 int res = 0;
9990 bool error_p = false;
9992 if (TARGET_IAMCU)
9994 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9995 bytes in registers. */
9996 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9997 goto pass_in_reg;
9998 return res;
10001 switch (mode)
10003 default:
10004 break;
10006 case E_BLKmode:
10007 if (bytes < 0)
10008 break;
10009 /* FALLTHRU */
10011 case E_DImode:
10012 case E_SImode:
10013 case E_HImode:
10014 case E_QImode:
10015 pass_in_reg:
10016 cum->words += words;
10017 cum->nregs -= words;
10018 cum->regno += words;
10019 if (cum->nregs >= 0)
10020 res = words;
10021 if (cum->nregs <= 0)
10023 cum->nregs = 0;
10024 cfun->machine->arg_reg_available = false;
10025 cum->regno = 0;
10027 break;
10029 case E_OImode:
10030 /* OImode shouldn't be used directly. */
10031 gcc_unreachable ();
10033 case E_DFmode:
10034 if (cum->float_in_sse == -1)
10035 error_p = true;
10036 if (cum->float_in_sse < 2)
10037 break;
10038 /* FALLTHRU */
10039 case E_SFmode:
10040 if (cum->float_in_sse == -1)
10041 error_p = true;
10042 if (cum->float_in_sse < 1)
10043 break;
10044 /* FALLTHRU */
10046 case E_V8SFmode:
10047 case E_V8SImode:
10048 case E_V64QImode:
10049 case E_V32HImode:
10050 case E_V16SImode:
10051 case E_V8DImode:
10052 case E_V16SFmode:
10053 case E_V8DFmode:
10054 case E_V32QImode:
10055 case E_V16HImode:
10056 case E_V4DFmode:
10057 case E_V4DImode:
10058 case E_TImode:
10059 case E_V16QImode:
10060 case E_V8HImode:
10061 case E_V4SImode:
10062 case E_V2DImode:
10063 case E_V4SFmode:
10064 case E_V2DFmode:
10065 if (!type || !AGGREGATE_TYPE_P (type))
10067 cum->sse_words += words;
10068 cum->sse_nregs -= 1;
10069 cum->sse_regno += 1;
10070 if (cum->sse_nregs <= 0)
10072 cum->sse_nregs = 0;
10073 cum->sse_regno = 0;
10076 break;
10078 case E_V8QImode:
10079 case E_V4HImode:
10080 case E_V2SImode:
10081 case E_V2SFmode:
10082 case E_V1TImode:
10083 case E_V1DImode:
10084 if (!type || !AGGREGATE_TYPE_P (type))
10086 cum->mmx_words += words;
10087 cum->mmx_nregs -= 1;
10088 cum->mmx_regno += 1;
10089 if (cum->mmx_nregs <= 0)
10091 cum->mmx_nregs = 0;
10092 cum->mmx_regno = 0;
10095 break;
10097 if (error_p)
10099 cum->float_in_sse = 0;
10100 error ("calling %qD with SSE calling convention without "
10101 "SSE/SSE2 enabled", cum->decl);
10102 sorry ("this is a GCC bug that can be worked around by adding "
10103 "attribute used to function called");
10106 return res;
10109 static int
10110 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10111 const_tree type, HOST_WIDE_INT words, bool named)
10113 int int_nregs, sse_nregs;
10115 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
10116 if (!named && (VALID_AVX512F_REG_MODE (mode)
10117 || VALID_AVX256_REG_MODE (mode)))
10118 return 0;
10120 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10121 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10123 cum->nregs -= int_nregs;
10124 cum->sse_nregs -= sse_nregs;
10125 cum->regno += int_nregs;
10126 cum->sse_regno += sse_nregs;
10127 return int_nregs;
10129 else
10131 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10132 cum->words = ROUND_UP (cum->words, align);
10133 cum->words += words;
10134 return 0;
10138 static int
10139 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10140 HOST_WIDE_INT words)
10142 /* Otherwise, this should be passed indirectly. */
10143 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10145 cum->words += words;
10146 if (cum->nregs > 0)
10148 cum->nregs -= 1;
10149 cum->regno += 1;
10150 return 1;
10152 return 0;
10155 /* Update the data in CUM to advance over an argument of mode MODE and
10156 data type TYPE. (TYPE is null for libcalls where that information
10157 may not be available.) */
10159 static void
10160 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10161 const_tree type, bool named)
10163 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10164 HOST_WIDE_INT bytes, words;
10165 int nregs;
10167 /* The argument of interrupt handler is a special case and is
10168 handled in ix86_function_arg. */
10169 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10170 return;
10172 if (mode == BLKmode)
10173 bytes = int_size_in_bytes (type);
10174 else
10175 bytes = GET_MODE_SIZE (mode);
10176 words = CEIL (bytes, UNITS_PER_WORD);
10178 if (type)
10179 mode = type_natural_mode (type, NULL, false);
10181 if ((type && POINTER_BOUNDS_TYPE_P (type))
10182 || POINTER_BOUNDS_MODE_P (mode))
10184 /* If we pass bounds in BT then just update the remaining bounds count. */
10185 if (cum->bnds_in_bt)
10187 cum->bnds_in_bt--;
10188 return;
10192 /* Update the remaining number of bounds to force. */
10192 if (cum->force_bnd_pass)
10193 cum->force_bnd_pass--;
10195 cum->bnd_regno++;
10197 return;
10200 /* The first arg not going to Bounds Tables resets this counter. */
10201 cum->bnds_in_bt = 0;
10202 /* For unnamed args we always pass bounds to avoid a bounds mess when
10203 the passed and received types do not match. If bounds do not follow an
10204 unnamed arg, still pretend the required number of bounds were passed. */
10205 if (cum->force_bnd_pass)
10207 cum->bnd_regno += cum->force_bnd_pass;
10208 cum->force_bnd_pass = 0;
10211 if (TARGET_64BIT)
10213 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10215 if (call_abi == MS_ABI)
10216 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10217 else
10218 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10220 else
10221 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10223 /* For stdarg we expect bounds to be passed for each value passed
10224 in register. */
10225 if (cum->stdarg)
10226 cum->force_bnd_pass = nregs;
10227 /* For pointers passed in memory we expect bounds passed in Bounds
10228 Table. */
10229 if (!nregs)
10231 /* Track if there are outgoing arguments on stack. */
10232 if (cum->caller)
10233 cfun->machine->outgoing_args_on_stack = true;
10235 cum->bnds_in_bt = chkp_type_bounds_count (type);
10239 /* Define where to put the arguments to a function.
10240 Value is zero to push the argument on the stack,
10241 or a hard register in which to store the argument.
10243 MODE is the argument's machine mode.
10244 TYPE is the data type of the argument (as a tree).
10245 This is null for libcalls where that information may
10246 not be available.
10247 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10248 the preceding args and about the function being called.
10249 NAMED is nonzero if this argument is a named parameter
10250 (otherwise it is an extra parameter matching an ellipsis). */
10252 static rtx
10253 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10254 machine_mode orig_mode, const_tree type,
10255 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10257 bool error_p = false;
10259 /* Avoid the AL settings for the Unix64 ABI. */
10260 if (mode == VOIDmode)
10261 return constm1_rtx;
10263 if (TARGET_IAMCU)
10265 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10266 bytes in registers. */
10267 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10268 goto pass_in_reg;
10269 return NULL_RTX;
10272 switch (mode)
10274 default:
10275 break;
10277 case E_BLKmode:
10278 if (bytes < 0)
10279 break;
10280 /* FALLTHRU */
10281 case E_DImode:
10282 case E_SImode:
10283 case E_HImode:
10284 case E_QImode:
10285 pass_in_reg:
10286 if (words <= cum->nregs)
10288 int regno = cum->regno;
10290 /* Fastcall allocates the first two DWORD (SImode) or
10291 smaller arguments to ECX and EDX if the argument isn't an
10292 aggregate type. */
10293 if (cum->fastcall)
10295 if (mode == BLKmode
10296 || mode == DImode
10297 || (type && AGGREGATE_TYPE_P (type)))
10298 break;
10300 /* ECX, not EAX, is the first allocated register. */
10301 if (regno == AX_REG)
10302 regno = CX_REG;
10304 return gen_rtx_REG (mode, regno);
10306 break;
10308 case E_DFmode:
10309 if (cum->float_in_sse == -1)
10310 error_p = true;
10311 if (cum->float_in_sse < 2)
10312 break;
10313 /* FALLTHRU */
10314 case E_SFmode:
10315 if (cum->float_in_sse == -1)
10316 error_p = true;
10317 if (cum->float_in_sse < 1)
10318 break;
10319 /* FALLTHRU */
10320 case E_TImode:
10321 /* In 32-bit mode, we pass TImode in xmm registers. */
10322 case E_V16QImode:
10323 case E_V8HImode:
10324 case E_V4SImode:
10325 case E_V2DImode:
10326 case E_V4SFmode:
10327 case E_V2DFmode:
10328 if (!type || !AGGREGATE_TYPE_P (type))
10330 if (cum->sse_nregs)
10331 return gen_reg_or_parallel (mode, orig_mode,
10332 cum->sse_regno + FIRST_SSE_REG);
10334 break;
10336 case E_OImode:
10337 case E_XImode:
10338 /* OImode and XImode shouldn't be used directly. */
10339 gcc_unreachable ();
10341 case E_V64QImode:
10342 case E_V32HImode:
10343 case E_V16SImode:
10344 case E_V8DImode:
10345 case E_V16SFmode:
10346 case E_V8DFmode:
10347 case E_V8SFmode:
10348 case E_V8SImode:
10349 case E_V32QImode:
10350 case E_V16HImode:
10351 case E_V4DFmode:
10352 case E_V4DImode:
10353 if (!type || !AGGREGATE_TYPE_P (type))
10355 if (cum->sse_nregs)
10356 return gen_reg_or_parallel (mode, orig_mode,
10357 cum->sse_regno + FIRST_SSE_REG);
10359 break;
10361 case E_V8QImode:
10362 case E_V4HImode:
10363 case E_V2SImode:
10364 case E_V2SFmode:
10365 case E_V1TImode:
10366 case E_V1DImode:
10367 if (!type || !AGGREGATE_TYPE_P (type))
10369 if (cum->mmx_nregs)
10370 return gen_reg_or_parallel (mode, orig_mode,
10371 cum->mmx_regno + FIRST_MMX_REG);
10373 break;
10375 if (error_p)
10377 cum->float_in_sse = 0;
10378 error ("calling %qD with SSE calling convention without "
10379 "SSE/SSE2 enabled", cum->decl);
10380 sorry ("this is a GCC bug that can be worked around by adding "
10381 "attribute used to function called");
10384 return NULL_RTX;
10387 static rtx
10388 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10389 machine_mode orig_mode, const_tree type, bool named)
10391 /* Handle a hidden AL argument containing number of registers
10392 for varargs x86-64 functions. */
10393 if (mode == VOIDmode)
10394 return GEN_INT (cum->maybe_vaarg
10395 ? (cum->sse_nregs < 0
10396 ? X86_64_SSE_REGPARM_MAX
10397 : cum->sse_regno)
10398 : -1);
10400 switch (mode)
10402 default:
10403 break;
10405 case E_V8SFmode:
10406 case E_V8SImode:
10407 case E_V32QImode:
10408 case E_V16HImode:
10409 case E_V4DFmode:
10410 case E_V4DImode:
10411 case E_V16SFmode:
10412 case E_V16SImode:
10413 case E_V64QImode:
10414 case E_V32HImode:
10415 case E_V8DFmode:
10416 case E_V8DImode:
10417 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
10418 if (!named)
10419 return NULL;
10420 break;
10423 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10424 cum->sse_nregs,
10425 &x86_64_int_parameter_registers [cum->regno],
10426 cum->sse_regno);
10429 static rtx
10430 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10431 machine_mode orig_mode, bool named,
10432 HOST_WIDE_INT bytes)
10434 unsigned int regno;
10436 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
10437 We use the value -2 to specify that the current function call is MSABI. */
10438 if (mode == VOIDmode)
10439 return GEN_INT (-2);
10441 /* If we've run out of registers, it goes on the stack. */
10442 if (cum->nregs == 0)
10443 return NULL_RTX;
10445 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10447 /* Only floating point modes are passed in anything but integer regs. */
10448 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10450 if (named)
10451 regno = cum->regno + FIRST_SSE_REG;
10452 else
10454 rtx t1, t2;
10456 /* Unnamed floating parameters are passed in both the
10457 SSE and integer registers. */
10458 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10459 t2 = gen_rtx_REG (mode, regno);
10460 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10461 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10462 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10465 /* Handle aggregate types passed in registers. */
10466 if (orig_mode == BLKmode)
10468 if (bytes > 0 && bytes <= 8)
10469 mode = (bytes > 4 ? DImode : SImode);
10470 if (mode == BLKmode)
10471 mode = DImode;
10474 return gen_reg_or_parallel (mode, orig_mode, regno);
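/* Illustrative sketch (not part of GCC): for the MS x64 convention handled
   above, a named floating-point argument in slot N lands in XMMn, while an
   unnamed (variadic) one is duplicated into both XMMn and the matching
   integer register so the callee can spill it from either set, e.g.

     int sum (int n, ...);      // hypothetical variadic callee
     sum (1, 2.5);              // 2.5 goes to both XMM1 and RDX

   Aggregates reach this point only when they fit a GPR (1, 2, 4 or 8
   bytes); larger ones have already been turned into by-reference pointers
   by ix86_pass_by_reference.  */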
10477 /* Return where to put the arguments to a function.
10478 Return zero to push the argument on the stack, or a hard register in
10479 which to store the argument.
10480 MODE is the argument's machine mode. TYPE is the data type of the
10481 argument. It is null for libcalls where that information may not be
10482 available. CUM gives information about the preceding args and about
10483 the function being called. NAMED is nonzero if this argument is a
10484 named parameter (otherwise it is an extra parameter matching an
10485 ellipsis). */
10487 static rtx
10488 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10489 const_tree type, bool named)
10491 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10492 machine_mode mode = omode;
10493 HOST_WIDE_INT bytes, words;
10494 rtx arg;
10496 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10498 gcc_assert (type != NULL_TREE);
10499 if (POINTER_TYPE_P (type))
10501 /* This is the pointer argument. */
10502 gcc_assert (TYPE_MODE (type) == Pmode);
10503 /* It is at -WORD(AP) in the current frame in interrupt and
10504 exception handlers. */
10505 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
10507 else
10509 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10510 && TREE_CODE (type) == INTEGER_TYPE
10511 && TYPE_MODE (type) == word_mode);
10512 /* The error code is the word-mode integer argument at
10513 -2 * WORD(AP) in the current frame of the exception
10514 handler. */
10515 arg = gen_rtx_MEM (word_mode,
10516 plus_constant (Pmode,
10517 arg_pointer_rtx,
10518 -2 * UNITS_PER_WORD));
10520 return arg;
10523 /* All pointer bounds arguments are handled separately here. */
10524 if ((type && POINTER_BOUNDS_TYPE_P (type))
10525 || POINTER_BOUNDS_MODE_P (mode))
10527 /* Return NULL if bounds are forced to go in Bounds Table. */
10528 if (cum->bnds_in_bt)
10529 arg = NULL;
10530 /* Return the next available bound reg if any. */
10531 else if (cum->bnd_regno <= LAST_BND_REG)
10532 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10533 /* Return the next special slot number otherwise. */
10534 else
10535 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10537 return arg;
10540 if (mode == BLKmode)
10541 bytes = int_size_in_bytes (type);
10542 else
10543 bytes = GET_MODE_SIZE (mode);
10544 words = CEIL (bytes, UNITS_PER_WORD);
10546 /* To simplify the code below, represent vector types with a vector mode
10547 even if MMX/SSE are not active. */
10548 if (type && TREE_CODE (type) == VECTOR_TYPE)
10549 mode = type_natural_mode (type, cum, false);
10551 if (TARGET_64BIT)
10553 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10555 if (call_abi == MS_ABI)
10556 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10557 else
10558 arg = function_arg_64 (cum, mode, omode, type, named);
10560 else
10561 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10563 /* Track if there are outgoing arguments on stack. */
10564 if (arg == NULL_RTX && cum->caller)
10565 cfun->machine->outgoing_args_on_stack = true;
10567 return arg;
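/* Illustrative sketch (not part of GCC): the interrupt/exception branch at
   the top of ix86_function_arg corresponds to handlers declared roughly as

     struct interrupt_frame;
     typedef unsigned long uword_t;   // word-mode integer; an assumption here
     __attribute__ ((interrupt))
     void handler (struct interrupt_frame *frame, uword_t error_code);

   FRAME is simply the address -WORD(AP) inside the handler's frame and
   ERROR_CODE is loaded from -2*WORD(AP); neither arrives in a register.  */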
10570 /* A C expression that indicates when an argument must be passed by
10571 reference. If nonzero for an argument, a copy of that argument is
10572 made in memory and a pointer to the argument is passed instead of
10573 the argument itself. The pointer is passed in whatever way is
10574 appropriate for passing a pointer to that type. */
10576 static bool
10577 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10578 const_tree type, bool)
10580 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10582 /* Bounds are never passed by reference. */
10583 if ((type && POINTER_BOUNDS_TYPE_P (type))
10584 || POINTER_BOUNDS_MODE_P (mode))
10585 return false;
10587 if (TARGET_64BIT)
10589 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10591 /* See Windows x64 Software Convention. */
10592 if (call_abi == MS_ABI)
10594 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10596 if (type)
10598 /* Arrays are passed by reference. */
10599 if (TREE_CODE (type) == ARRAY_TYPE)
10600 return true;
10602 if (RECORD_OR_UNION_TYPE_P (type))
10604 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10605 are passed by reference. */
10606 msize = int_size_in_bytes (type);
10610 /* __m128 is passed by reference. */
10611 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10613 else if (type && int_size_in_bytes (type) == -1)
10614 return true;
10617 return false;
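/* Illustrative sketch (not part of GCC): the MS-ABI test above reduces to
   "anything whose size is not exactly 1, 2, 4 or 8 bytes is passed by
   reference".  The type names below are hypothetical:

     struct s8  { long long x; };            // 8 bytes: by value, in a GPR
     struct s12 { int a, b, c; };            // 12 bytes: by reference
     typedef float v4sf __attribute__ ((vector_size (16)));
     void f (v4sf v);                         // __m128-sized: by reference  */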
10620 /* Return true when TYPE should be 128bit aligned for 32bit argument
10621 passing ABI. XXX: This function is obsolete and is only used for
10622 checking psABI compatibility with previous versions of GCC. */
10624 static bool
10625 ix86_compat_aligned_value_p (const_tree type)
10627 machine_mode mode = TYPE_MODE (type);
10628 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10629 || mode == TDmode
10630 || mode == TFmode
10631 || mode == TCmode)
10632 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10633 return true;
10634 if (TYPE_ALIGN (type) < 128)
10635 return false;
10637 if (AGGREGATE_TYPE_P (type))
10639 /* Walk the aggregates recursively. */
10640 switch (TREE_CODE (type))
10642 case RECORD_TYPE:
10643 case UNION_TYPE:
10644 case QUAL_UNION_TYPE:
10646 tree field;
10648 /* Walk all the structure fields. */
10649 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10651 if (TREE_CODE (field) == FIELD_DECL
10652 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10653 return true;
10655 break;
10658 case ARRAY_TYPE:
10659 /* Just for use if some languages pass arrays by value. */
10660 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10661 return true;
10662 break;
10664 default:
10665 gcc_unreachable ();
10668 return false;
10671 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10672 XXX: This function is obsolete and is only used for checking psABI
10673 compatibility with previous versions of GCC. */
10675 static unsigned int
10676 ix86_compat_function_arg_boundary (machine_mode mode,
10677 const_tree type, unsigned int align)
10679 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10680 natural boundaries. */
10681 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10683 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10684 make an exception for SSE modes since these require 128bit
10685 alignment.
10687 The handling here differs from field_alignment. ICC aligns MMX
10688 arguments to 4 byte boundaries, while structure fields are aligned
10689 to 8 byte boundaries. */
10690 if (!type)
10692 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10693 align = PARM_BOUNDARY;
10695 else
10697 if (!ix86_compat_aligned_value_p (type))
10698 align = PARM_BOUNDARY;
10701 if (align > BIGGEST_ALIGNMENT)
10702 align = BIGGEST_ALIGNMENT;
10703 return align;
10706 /* Return true when TYPE should be 128bit aligned for 32bit argument
10707 passing ABI. */
10709 static bool
10710 ix86_contains_aligned_value_p (const_tree type)
10712 machine_mode mode = TYPE_MODE (type);
10714 if (mode == XFmode || mode == XCmode)
10715 return false;
10717 if (TYPE_ALIGN (type) < 128)
10718 return false;
10720 if (AGGREGATE_TYPE_P (type))
10722 /* Walk the aggregates recursively. */
10723 switch (TREE_CODE (type))
10725 case RECORD_TYPE:
10726 case UNION_TYPE:
10727 case QUAL_UNION_TYPE:
10729 tree field;
10731 /* Walk all the structure fields. */
10732 for (field = TYPE_FIELDS (type);
10733 field;
10734 field = DECL_CHAIN (field))
10736 if (TREE_CODE (field) == FIELD_DECL
10737 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10738 return true;
10740 break;
10743 case ARRAY_TYPE:
10744 /* Just for use if some languages pass arrays by value. */
10745 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10746 return true;
10747 break;
10749 default:
10750 gcc_unreachable ();
10753 else
10754 return TYPE_ALIGN (type) >= 128;
10756 return false;
10759 /* Gives the alignment boundary, in bits, of an argument with the
10760 specified mode and type. */
10762 static unsigned int
10763 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10765 unsigned int align;
10766 if (type)
10768 /* Since the main variant type is used for call, we convert it to
10769 the main variant type. */
10770 type = TYPE_MAIN_VARIANT (type);
10771 align = TYPE_ALIGN (type);
10773 else
10774 align = GET_MODE_ALIGNMENT (mode);
10775 if (align < PARM_BOUNDARY)
10776 align = PARM_BOUNDARY;
10777 else
10779 static bool warned;
10780 unsigned int saved_align = align;
10782 if (!TARGET_64BIT)
10784 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10785 if (!type)
10787 if (mode == XFmode || mode == XCmode)
10788 align = PARM_BOUNDARY;
10790 else if (!ix86_contains_aligned_value_p (type))
10791 align = PARM_BOUNDARY;
10793 if (align < 128)
10794 align = PARM_BOUNDARY;
10797 if (warn_psabi
10798 && !warned
10799 && align != ix86_compat_function_arg_boundary (mode, type,
10800 saved_align))
10802 warned = true;
10803 inform (input_location,
10804 "The ABI for passing parameters with %d-byte"
10805 " alignment has changed in GCC 4.6",
10806 align / BITS_PER_UNIT);
10810 return align;
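/* Illustrative sketch (not part of GCC): on ia32 the boundary computed
   above stays at 128 bits only for types that actually contain a
   128-bit-aligned value; a merely user-aligned struct of ints drops back
   to PARM_BOUNDARY.  The type names are hypothetical:

     typedef float v4sf __attribute__ ((vector_size (16)));
     struct wrap { v4sf v; };                    // 128-bit argument boundary
     struct padded { int x; } __attribute__ ((aligned (16)));
                                                  // still 32-bit boundary  */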
10813 /* Return true if N is a possible register number of function value. */
10815 static bool
10816 ix86_function_value_regno_p (const unsigned int regno)
10818 switch (regno)
10820 case AX_REG:
10821 return true;
10822 case DX_REG:
10823 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10824 case DI_REG:
10825 case SI_REG:
10826 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10828 case BND0_REG:
10829 case BND1_REG:
10830 return chkp_function_instrumented_p (current_function_decl);
10832 /* Complex values are returned in %st(0)/%st(1) pair. */
10833 case ST0_REG:
10834 case ST1_REG:
10835 /* TODO: The function should depend on current function ABI but
10836 builtins.c would need updating then. Therefore we use the
10837 default ABI. */
10838 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10839 return false;
10840 return TARGET_FLOAT_RETURNS_IN_80387;
10842 /* Complex values are returned in %xmm0/%xmm1 pair. */
10843 case XMM0_REG:
10844 case XMM1_REG:
10845 return TARGET_SSE;
10847 case MM0_REG:
10848 if (TARGET_MACHO || TARGET_64BIT)
10849 return false;
10850 return TARGET_MMX;
10853 return false;
10856 /* Define how to find the value returned by a function.
10857 VALTYPE is the data type of the value (as a tree).
10858 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10859 otherwise, FUNC is 0. */
10861 static rtx
10862 function_value_32 (machine_mode orig_mode, machine_mode mode,
10863 const_tree fntype, const_tree fn)
10865 unsigned int regno;
10867 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10868 we normally prevent this case when mmx is not available. However
10869 some ABIs may require the result to be returned like DImode. */
10870 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10871 regno = FIRST_MMX_REG;
10873 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10874 we prevent this case when sse is not available. However some ABIs
10875 may require the result to be returned like integer TImode. */
10876 else if (mode == TImode
10877 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10878 regno = FIRST_SSE_REG;
10880 /* 32-byte vector modes in %ymm0. */
10881 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10882 regno = FIRST_SSE_REG;
10884 /* 64-byte vector modes in %zmm0. */
10885 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10886 regno = FIRST_SSE_REG;
10888 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10889 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10890 regno = FIRST_FLOAT_REG;
10891 else
10892 /* Most things go in %eax. */
10893 regno = AX_REG;
10895 /* Override FP return register with %xmm0 for local functions when
10896 SSE math is enabled or for functions with sseregparm attribute. */
10897 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10899 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10900 if (sse_level == -1)
10902 error ("calling %qD with SSE calling convention without "
10903 "SSE/SSE2 enabled", fn);
10904 sorry ("this is a GCC bug that can be worked around by adding "
10905 "attribute used to function called");
10907 else if ((sse_level >= 1 && mode == SFmode)
10908 || (sse_level == 2 && mode == DFmode))
10909 regno = FIRST_SSE_REG;
10912 /* OImode shouldn't be used directly. */
10913 gcc_assert (mode != OImode);
10915 return gen_rtx_REG (orig_mode, regno);
10918 static rtx
10919 function_value_64 (machine_mode orig_mode, machine_mode mode,
10920 const_tree valtype)
10922 rtx ret;
10924 /* Handle libcalls, which don't provide a type node. */
10925 if (valtype == NULL)
10927 unsigned int regno;
10929 switch (mode)
10931 case E_SFmode:
10932 case E_SCmode:
10933 case E_DFmode:
10934 case E_DCmode:
10935 case E_TFmode:
10936 case E_SDmode:
10937 case E_DDmode:
10938 case E_TDmode:
10939 regno = FIRST_SSE_REG;
10940 break;
10941 case E_XFmode:
10942 case E_XCmode:
10943 regno = FIRST_FLOAT_REG;
10944 break;
10945 case E_TCmode:
10946 return NULL;
10947 default:
10948 regno = AX_REG;
10951 return gen_rtx_REG (mode, regno);
10953 else if (POINTER_TYPE_P (valtype))
10955 /* Pointers are always returned in word_mode. */
10956 mode = word_mode;
10959 ret = construct_container (mode, orig_mode, valtype, 1,
10960 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10961 x86_64_int_return_registers, 0);
10963 /* For zero sized structures, construct_container returns NULL, but we
10964 need to keep rest of compiler happy by returning meaningful value. */
10965 if (!ret)
10966 ret = gen_rtx_REG (orig_mode, AX_REG);
10968 return ret;
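/* Illustrative sketch (not part of GCC): typical classifications produced
   by the SysV x86-64 return path above (the in-memory case is decided
   earlier, in ix86_return_in_memory); the struct names are hypothetical:

     long   f1 (void);                       // RAX
     double f2 (void);                       // XMM0
     struct p { long a, b; }   f3 (void);    // RAX:RDX
     struct q { double x, y; } f4 (void);    // XMM0:XMM1
     struct r { char c[32]; }  f5 (void);    // memory, via hidden pointer  */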
10971 static rtx
10972 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10973 const_tree valtype)
10975 unsigned int regno = AX_REG;
10977 if (TARGET_SSE)
10979 switch (GET_MODE_SIZE (mode))
10981 case 16:
10982 if (valtype != NULL_TREE
10983 && !VECTOR_INTEGER_TYPE_P (valtype)
10985 && !INTEGRAL_TYPE_P (valtype)
10986 && !VECTOR_FLOAT_TYPE_P (valtype))
10987 break;
10988 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10989 && !COMPLEX_MODE_P (mode))
10990 regno = FIRST_SSE_REG;
10991 break;
10992 case 8:
10993 case 4:
10994 if (mode == SFmode || mode == DFmode)
10995 regno = FIRST_SSE_REG;
10996 break;
10997 default:
10998 break;
11001 return gen_rtx_REG (orig_mode, regno);
11004 static rtx
11005 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
11006 machine_mode orig_mode, machine_mode mode)
11008 const_tree fn, fntype;
11010 fn = NULL_TREE;
11011 if (fntype_or_decl && DECL_P (fntype_or_decl))
11012 fn = fntype_or_decl;
11013 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
11015 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
11016 || POINTER_BOUNDS_MODE_P (mode))
11017 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
11018 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
11019 return function_value_ms_64 (orig_mode, mode, valtype);
11020 else if (TARGET_64BIT)
11021 return function_value_64 (orig_mode, mode, valtype);
11022 else
11023 return function_value_32 (orig_mode, mode, fntype, fn);
11026 static rtx
11027 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
11029 machine_mode mode, orig_mode;
11031 orig_mode = TYPE_MODE (valtype);
11032 mode = type_natural_mode (valtype, NULL, true);
11033 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
11036 /* Return an RTX representing a place where a function returns
11037 or receives pointer bounds, or NULL if no bounds are returned.
11039 VALTYPE is a data type of a value returned by the function.
11041 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
11042 or FUNCTION_TYPE of the function.
11044 If OUTGOING is false, return a place in which the caller will
11045 see the return value. Otherwise, return a place where a
11046 function returns a value. */
11048 static rtx
11049 ix86_function_value_bounds (const_tree valtype,
11050 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
11051 bool outgoing ATTRIBUTE_UNUSED)
11053 rtx res = NULL_RTX;
11055 if (BOUNDED_TYPE_P (valtype))
11056 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
11057 else if (chkp_type_has_pointer (valtype))
11059 bitmap slots;
11060 rtx bounds[2];
11061 bitmap_iterator bi;
11062 unsigned i, bnd_no = 0;
11064 bitmap_obstack_initialize (NULL);
11065 slots = BITMAP_ALLOC (NULL);
11066 chkp_find_bound_slots (valtype, slots);
11068 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
11070 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
11071 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
11072 gcc_assert (bnd_no < 2);
11073 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
11076 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
11078 BITMAP_FREE (slots);
11079 bitmap_obstack_release (NULL);
11081 else
11082 res = NULL_RTX;
11084 return res;
11087 /* Pointer function arguments and return values are promoted to
11088 word_mode for normal functions. */
11090 static machine_mode
11091 ix86_promote_function_mode (const_tree type, machine_mode mode,
11092 int *punsignedp, const_tree fntype,
11093 int for_return)
11095 if (cfun->machine->func_type == TYPE_NORMAL
11096 && type != NULL_TREE
11097 && POINTER_TYPE_P (type))
11099 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11100 return word_mode;
11102 return default_promote_function_mode (type, mode, punsignedp, fntype,
11103 for_return);
11106 /* Return true if a structure, union or array with MODE containing FIELD
11107 should be accessed using BLKmode. */
11109 static bool
11110 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11112 /* Union with XFmode must be in BLKmode. */
11113 return (mode == XFmode
11114 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11115 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11118 static rtx
11119 ix86_libcall_value (machine_mode mode)
11121 return ix86_function_value_1 (NULL, NULL, mode, mode);
11124 /* Return true iff type is returned in memory. */
11126 static bool
11127 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11129 #ifdef SUBTARGET_RETURN_IN_MEMORY
11130 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11131 #else
11132 const machine_mode mode = type_natural_mode (type, NULL, true);
11133 HOST_WIDE_INT size;
11135 if (POINTER_BOUNDS_TYPE_P (type))
11136 return false;
11138 if (TARGET_64BIT)
11140 if (ix86_function_type_abi (fntype) == MS_ABI)
11142 size = int_size_in_bytes (type);
11144 /* __m128 is returned in xmm0. */
11145 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11146 || INTEGRAL_TYPE_P (type)
11147 || VECTOR_FLOAT_TYPE_P (type))
11148 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11149 && !COMPLEX_MODE_P (mode)
11150 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11151 return false;
11153 /* Otherwise, the size must be exactly in [1248]. */
11154 return size != 1 && size != 2 && size != 4 && size != 8;
11156 else
11158 int needed_intregs, needed_sseregs;
11160 return examine_argument (mode, type, 1,
11161 &needed_intregs, &needed_sseregs);
11164 else
11166 size = int_size_in_bytes (type);
11168 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11169 bytes in registers. */
11170 if (TARGET_IAMCU)
11171 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11173 if (mode == BLKmode)
11174 return true;
11176 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11177 return false;
11179 if (VECTOR_MODE_P (mode) || mode == TImode)
11181 /* User-created vectors small enough to fit in EAX. */
11182 if (size < 8)
11183 return false;
11185 /* Unless the ABI prescribes otherwise,
11186 MMX/3dNow values are returned in MM0 if available. */
11188 if (size == 8)
11189 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11191 /* SSE values are returned in XMM0 if available. */
11192 if (size == 16)
11193 return !TARGET_SSE;
11195 /* AVX values are returned in YMM0 if available. */
11196 if (size == 32)
11197 return !TARGET_AVX;
11199 /* AVX512F values are returned in ZMM0 if available. */
11200 if (size == 64)
11201 return !TARGET_AVX512F;
11204 if (mode == XFmode)
11205 return false;
11207 if (size > 12)
11208 return true;
11210 /* OImode shouldn't be used directly. */
11211 gcc_assert (mode != OImode);
11213 return false;
11215 #endif
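/* Illustrative sketch (not part of GCC): consequences of the ia32 branch
   above, assuming SSE is enabled; the struct name is hypothetical:

     typedef float v4sf __attribute__ ((vector_size (16)));
     v4sf ret_vec (void);                        // XMM0 (not in memory)
     long double ret_ld (void);                  // st(0) (not in memory)
     struct big { char c[24]; } ret_big (void);  // BLKmode: in memory  */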
11219 /* Create the va_list data type. */
11221 static tree
11222 ix86_build_builtin_va_list_64 (void)
11224 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11226 record = lang_hooks.types.make_type (RECORD_TYPE);
11227 type_decl = build_decl (BUILTINS_LOCATION,
11228 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11230 f_gpr = build_decl (BUILTINS_LOCATION,
11231 FIELD_DECL, get_identifier ("gp_offset"),
11232 unsigned_type_node);
11233 f_fpr = build_decl (BUILTINS_LOCATION,
11234 FIELD_DECL, get_identifier ("fp_offset"),
11235 unsigned_type_node);
11236 f_ovf = build_decl (BUILTINS_LOCATION,
11237 FIELD_DECL, get_identifier ("overflow_arg_area"),
11238 ptr_type_node);
11239 f_sav = build_decl (BUILTINS_LOCATION,
11240 FIELD_DECL, get_identifier ("reg_save_area"),
11241 ptr_type_node);
11243 va_list_gpr_counter_field = f_gpr;
11244 va_list_fpr_counter_field = f_fpr;
11246 DECL_FIELD_CONTEXT (f_gpr) = record;
11247 DECL_FIELD_CONTEXT (f_fpr) = record;
11248 DECL_FIELD_CONTEXT (f_ovf) = record;
11249 DECL_FIELD_CONTEXT (f_sav) = record;
11251 TYPE_STUB_DECL (record) = type_decl;
11252 TYPE_NAME (record) = type_decl;
11253 TYPE_FIELDS (record) = f_gpr;
11254 DECL_CHAIN (f_gpr) = f_fpr;
11255 DECL_CHAIN (f_fpr) = f_ovf;
11256 DECL_CHAIN (f_ovf) = f_sav;
11258 layout_type (record);
11260 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11261 NULL_TREE, TYPE_ATTRIBUTES (record));
11263 /* The correct type is an array type of one element. */
11264 return build_array_type (record, build_index_type (size_zero_node));
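/* For reference, the record laid out above is the familiar SysV x86-64
   va_list; in user-level C it is equivalent to (a sketch, not code GCC
   itself emits):

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;
     typedef __va_list_tag __builtin_va_list[1];  */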
11267 /* Setup the builtin va_list data type and for 64-bit the additional
11268 calling convention specific va_list data types. */
11270 static tree
11271 ix86_build_builtin_va_list (void)
11273 if (TARGET_64BIT)
11275 /* Initialize ABI specific va_list builtin types.
11277 In lto1, we can encounter two va_list types:
11278 - one as a result of the type-merge across TUs, and
11279 - the one constructed here.
11280 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11281 a type identity check in canonical_va_list_type based on
11282 TYPE_MAIN_VARIANT (which we used to have) will not work.
11283 Instead, we tag each va_list_type_node with its unique attribute, and
11284 look for the attribute in the type identity check in
11285 canonical_va_list_type.
11287 Tagging sysv_va_list_type_node directly with the attribute is
11288 problematic since it's an array of one record, which will decay into a
11289 pointer to that record when used as a parameter (see build_va_arg comments for
11290 an example), dropping the attribute in the process. So we tag the
11291 record instead. */
11293 /* For SYSV_ABI we use an array of one record. */
11294 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11296 /* For MS_ABI we use plain pointer to argument area. */
11297 tree char_ptr_type = build_pointer_type (char_type_node);
11298 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11299 TYPE_ATTRIBUTES (char_ptr_type));
11300 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11302 return ((ix86_abi == MS_ABI)
11303 ? ms_va_list_type_node
11304 : sysv_va_list_type_node);
11306 else
11308 /* For i386 we use plain pointer to argument area. */
11309 return build_pointer_type (char_type_node);
11313 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11315 static void
11316 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11318 rtx save_area, mem;
11319 alias_set_type set;
11320 int i, max;
11322 /* GPR size of varargs save area. */
11323 if (cfun->va_list_gpr_size)
11324 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11325 else
11326 ix86_varargs_gpr_size = 0;
11328 /* FPR size of varargs save area. We don't need it if we don't pass
11329 anything in SSE registers. */
11330 if (TARGET_SSE && cfun->va_list_fpr_size)
11331 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11332 else
11333 ix86_varargs_fpr_size = 0;
11335 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11336 return;
11338 save_area = frame_pointer_rtx;
11339 set = get_varargs_alias_set ();
11341 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11342 if (max > X86_64_REGPARM_MAX)
11343 max = X86_64_REGPARM_MAX;
11345 for (i = cum->regno; i < max; i++)
11347 mem = gen_rtx_MEM (word_mode,
11348 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11349 MEM_NOTRAP_P (mem) = 1;
11350 set_mem_alias_set (mem, set);
11351 emit_move_insn (mem,
11352 gen_rtx_REG (word_mode,
11353 x86_64_int_parameter_registers[i]));
11356 if (ix86_varargs_fpr_size)
11358 machine_mode smode;
11359 rtx_code_label *label;
11360 rtx test;
11362 /* Now emit code to save SSE registers. The AX parameter contains number
11363 of SSE parameter registers used to call this function, though all we
11364 actually check here is the zero/non-zero status. */
11366 label = gen_label_rtx ();
11367 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11368 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11369 label));
11371 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11372 we used movdqa (i.e. TImode) instead? Perhaps even better would
11373 be if we could determine the real mode of the data, via a hook
11374 into pass_stdarg. Ignore all that for now. */
11375 smode = V4SFmode;
11376 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11377 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11379 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11380 if (max > X86_64_SSE_REGPARM_MAX)
11381 max = X86_64_SSE_REGPARM_MAX;
11383 for (i = cum->sse_regno; i < max; ++i)
11385 mem = plus_constant (Pmode, save_area,
11386 i * 16 + ix86_varargs_gpr_size);
11387 mem = gen_rtx_MEM (smode, mem);
11388 MEM_NOTRAP_P (mem) = 1;
11389 set_mem_alias_set (mem, set);
11390 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11392 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11395 emit_label (label);
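/* Illustrative sketch (not part of GCC): the register save area filled in
   above has the fixed layout later assumed by va_arg lowering, namely six
   general-purpose slots followed by eight XMM slots (byte offsets):

       0 ..  47   rdi, rsi, rdx, rcx, r8, r9   (8 bytes each)
      48 .. 175   xmm0 .. xmm7                 (16 bytes each)

   gp_offset and fp_offset in the va_list index into this block.  */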
11399 static void
11400 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11402 alias_set_type set = get_varargs_alias_set ();
11403 int i;
11405 /* Reset to zero, as there might have been a SysV va_arg used
11406 before. */
11407 ix86_varargs_gpr_size = 0;
11408 ix86_varargs_fpr_size = 0;
11410 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11412 rtx reg, mem;
11414 mem = gen_rtx_MEM (Pmode,
11415 plus_constant (Pmode, virtual_incoming_args_rtx,
11416 i * UNITS_PER_WORD));
11417 MEM_NOTRAP_P (mem) = 1;
11418 set_mem_alias_set (mem, set);
11420 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11421 emit_move_insn (mem, reg);
11425 static void
11426 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11427 tree type, int *, int no_rtl)
11429 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11430 CUMULATIVE_ARGS next_cum;
11431 tree fntype;
11433 /* This argument doesn't appear to be used anymore. Which is good,
11434 because the old code here didn't suppress rtl generation. */
11435 gcc_assert (!no_rtl);
11437 if (!TARGET_64BIT)
11438 return;
11440 fntype = TREE_TYPE (current_function_decl);
11442 /* For varargs, we do not want to skip the dummy va_dcl argument.
11443 For stdargs, we do want to skip the last named argument. */
11444 next_cum = *cum;
11445 if (stdarg_p (fntype))
11446 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11447 true);
11449 if (cum->call_abi == MS_ABI)
11450 setup_incoming_varargs_ms_64 (&next_cum);
11451 else
11452 setup_incoming_varargs_64 (&next_cum);
11455 static void
11456 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11457 machine_mode mode,
11458 tree type,
11459 int *pretend_size ATTRIBUTE_UNUSED,
11460 int no_rtl)
11462 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11463 CUMULATIVE_ARGS next_cum;
11464 tree fntype;
11465 rtx save_area;
11466 int bnd_reg, i, max;
11468 gcc_assert (!no_rtl);
11470 /* Do nothing if we use plain pointer to argument area. */
11471 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11472 return;
11474 fntype = TREE_TYPE (current_function_decl);
11476 /* For varargs, we do not want to skip the dummy va_dcl argument.
11477 For stdargs, we do want to skip the last named argument. */
11478 next_cum = *cum;
11479 if (stdarg_p (fntype))
11480 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11481 true);
11482 save_area = frame_pointer_rtx;
11484 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11485 if (max > X86_64_REGPARM_MAX)
11486 max = X86_64_REGPARM_MAX;
11488 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11489 if (chkp_function_instrumented_p (current_function_decl))
11490 for (i = cum->regno; i < max; i++)
11492 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11493 rtx ptr = gen_rtx_REG (Pmode,
11494 x86_64_int_parameter_registers[i]);
11495 rtx bounds;
11497 if (bnd_reg <= LAST_BND_REG)
11498 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11499 else
11501 rtx ldx_addr =
11502 plus_constant (Pmode, arg_pointer_rtx,
11503 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11504 bounds = gen_reg_rtx (BNDmode);
11505 emit_insn (BNDmode == BND64mode
11506 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11507 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11510 emit_insn (BNDmode == BND64mode
11511 ? gen_bnd64_stx (addr, ptr, bounds)
11512 : gen_bnd32_stx (addr, ptr, bounds));
11514 bnd_reg++;
11519 /* Checks if TYPE is of kind va_list char *. */
11521 static bool
11522 is_va_list_char_pointer (tree type)
11524 tree canonic;
11526 /* For 32-bit it is always true. */
11527 if (!TARGET_64BIT)
11528 return true;
11529 canonic = ix86_canonical_va_list_type (type);
11530 return (canonic == ms_va_list_type_node
11531 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11534 /* Implement va_start. */
11536 static void
11537 ix86_va_start (tree valist, rtx nextarg)
11539 HOST_WIDE_INT words, n_gpr, n_fpr;
11540 tree f_gpr, f_fpr, f_ovf, f_sav;
11541 tree gpr, fpr, ovf, sav, t;
11542 tree type;
11543 rtx ovf_rtx;
11545 if (flag_split_stack
11546 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11548 unsigned int scratch_regno;
11550 /* When we are splitting the stack, we can't refer to the stack
11551 arguments using internal_arg_pointer, because they may be on
11552 the old stack. The split stack prologue will arrange to
11553 leave a pointer to the old stack arguments in a scratch
11554 register, which we here copy to a pseudo-register. The split
11555 stack prologue can't set the pseudo-register directly because
11556 it (the prologue) runs before any registers have been saved. */
11558 scratch_regno = split_stack_prologue_scratch_regno ();
11559 if (scratch_regno != INVALID_REGNUM)
11561 rtx reg;
11562 rtx_insn *seq;
11564 reg = gen_reg_rtx (Pmode);
11565 cfun->machine->split_stack_varargs_pointer = reg;
11567 start_sequence ();
11568 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11569 seq = get_insns ();
11570 end_sequence ();
11572 push_topmost_sequence ();
11573 emit_insn_after (seq, entry_of_function ());
11574 pop_topmost_sequence ();
11578 /* Only 64bit target needs something special. */
11579 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11581 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11582 std_expand_builtin_va_start (valist, nextarg);
11583 else
11585 rtx va_r, next;
11587 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11588 next = expand_binop (ptr_mode, add_optab,
11589 cfun->machine->split_stack_varargs_pointer,
11590 crtl->args.arg_offset_rtx,
11591 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11592 convert_move (va_r, next, 0);
11594 /* Store zero bounds for va_list. */
11595 if (chkp_function_instrumented_p (current_function_decl))
11596 chkp_expand_bounds_reset_for_mem (valist,
11597 make_tree (TREE_TYPE (valist),
11598 next));
11601 return;
11604 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11605 f_fpr = DECL_CHAIN (f_gpr);
11606 f_ovf = DECL_CHAIN (f_fpr);
11607 f_sav = DECL_CHAIN (f_ovf);
11609 valist = build_simple_mem_ref (valist);
11610 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11611 /* The following should be folded into the MEM_REF offset. */
11612 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11613 f_gpr, NULL_TREE);
11614 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11615 f_fpr, NULL_TREE);
11616 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11617 f_ovf, NULL_TREE);
11618 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11619 f_sav, NULL_TREE);
11621 /* Count number of gp and fp argument registers used. */
11622 words = crtl->args.info.words;
11623 n_gpr = crtl->args.info.regno;
11624 n_fpr = crtl->args.info.sse_regno;
11626 if (cfun->va_list_gpr_size)
11628 type = TREE_TYPE (gpr);
11629 t = build2 (MODIFY_EXPR, type,
11630 gpr, build_int_cst (type, n_gpr * 8));
11631 TREE_SIDE_EFFECTS (t) = 1;
11632 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11635 if (TARGET_SSE && cfun->va_list_fpr_size)
11637 type = TREE_TYPE (fpr);
11638 t = build2 (MODIFY_EXPR, type, fpr,
11639 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11640 TREE_SIDE_EFFECTS (t) = 1;
11641 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11644 /* Find the overflow area. */
11645 type = TREE_TYPE (ovf);
11646 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11647 ovf_rtx = crtl->args.internal_arg_pointer;
11648 else
11649 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11650 t = make_tree (type, ovf_rtx);
11651 if (words != 0)
11652 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11654 /* Store zero bounds for overflow area pointer. */
11655 if (chkp_function_instrumented_p (current_function_decl))
11656 chkp_expand_bounds_reset_for_mem (ovf, t);
11658 t = build2 (MODIFY_EXPR, type, ovf, t);
11659 TREE_SIDE_EFFECTS (t) = 1;
11660 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11662 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11664 /* Find the register save area.
11665 The function prologue saves it right above the stack frame. */
11666 type = TREE_TYPE (sav);
11667 t = make_tree (type, frame_pointer_rtx);
11668 if (!ix86_varargs_gpr_size)
11669 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11671 /* Store zero bounds for save area pointer. */
11672 if (chkp_function_instrumented_p (current_function_decl))
11673 chkp_expand_bounds_reset_for_mem (sav, t);
11675 t = build2 (MODIFY_EXPR, type, sav, t);
11676 TREE_SIDE_EFFECTS (t) = 1;
11677 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
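/* Illustrative sketch (not part of GCC): for a hypothetical variadic
   function

     int sum (int n, ...)
     {
       va_list ap;
       va_start (ap, n);   // expanded by the code above
     }

   the expansion leaves gp_offset = 8 (one GPR consumed by N), fp_offset =
   48 (no SSE registers consumed, so it starts past the 6 * 8-byte GPR
   block), overflow_arg_area pointing at the first stack argument and
   reg_save_area at the block written by setup_incoming_varargs_64.  */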
11681 /* Implement va_arg. */
11683 static tree
11684 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11685 gimple_seq *post_p)
11687 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11688 tree f_gpr, f_fpr, f_ovf, f_sav;
11689 tree gpr, fpr, ovf, sav, t;
11690 int size, rsize;
11691 tree lab_false, lab_over = NULL_TREE;
11692 tree addr, t2;
11693 rtx container;
11694 int indirect_p = 0;
11695 tree ptrtype;
11696 machine_mode nat_mode;
11697 unsigned int arg_boundary;
11699 /* Only 64bit target needs something special. */
11700 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11701 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11703 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11704 f_fpr = DECL_CHAIN (f_gpr);
11705 f_ovf = DECL_CHAIN (f_fpr);
11706 f_sav = DECL_CHAIN (f_ovf);
11708 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11709 valist, f_gpr, NULL_TREE);
11711 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11712 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11713 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11715 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11716 if (indirect_p)
11717 type = build_pointer_type (type);
11718 size = int_size_in_bytes (type);
11719 rsize = CEIL (size, UNITS_PER_WORD);
11721 nat_mode = type_natural_mode (type, NULL, false);
11722 switch (nat_mode)
11724 case E_V8SFmode:
11725 case E_V8SImode:
11726 case E_V32QImode:
11727 case E_V16HImode:
11728 case E_V4DFmode:
11729 case E_V4DImode:
11730 case E_V16SFmode:
11731 case E_V16SImode:
11732 case E_V64QImode:
11733 case E_V32HImode:
11734 case E_V8DFmode:
11735 case E_V8DImode:
11736 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
11737 if (!TARGET_64BIT_MS_ABI)
11739 container = NULL;
11740 break;
11742 /* FALLTHRU */
11744 default:
11745 container = construct_container (nat_mode, TYPE_MODE (type),
11746 type, 0, X86_64_REGPARM_MAX,
11747 X86_64_SSE_REGPARM_MAX, intreg,
11749 break;
11752 /* Pull the value out of the saved registers. */
11754 addr = create_tmp_var (ptr_type_node, "addr");
11756 if (container)
11758 int needed_intregs, needed_sseregs;
11759 bool need_temp;
11760 tree int_addr, sse_addr;
11762 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11763 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11765 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11767 need_temp = (!REG_P (container)
11768 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11769 || TYPE_ALIGN (type) > 128));
11771 /* In case we are passing a structure, verify that it occupies a consecutive
11772 block in the register save area. If not, we need to do moves. */
11773 if (!need_temp && !REG_P (container))
11775 /* Verify that all registers are strictly consecutive. */
11776 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11778 int i;
11780 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11782 rtx slot = XVECEXP (container, 0, i);
11783 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11784 || INTVAL (XEXP (slot, 1)) != i * 16)
11785 need_temp = true;
11788 else
11790 int i;
11792 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11794 rtx slot = XVECEXP (container, 0, i);
11795 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11796 || INTVAL (XEXP (slot, 1)) != i * 8)
11797 need_temp = true;
11801 if (!need_temp)
11803 int_addr = addr;
11804 sse_addr = addr;
11806 else
11808 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11809 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11812 /* First ensure that we fit completely in registers. */
11813 if (needed_intregs)
11815 t = build_int_cst (TREE_TYPE (gpr),
11816 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11817 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11818 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11819 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11820 gimplify_and_add (t, pre_p);
11822 if (needed_sseregs)
11824 t = build_int_cst (TREE_TYPE (fpr),
11825 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11826 + X86_64_REGPARM_MAX * 8);
11827 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11828 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11829 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11830 gimplify_and_add (t, pre_p);
11833 /* Compute index to start of area used for integer regs. */
11834 if (needed_intregs)
11836 /* int_addr = gpr + sav; */
11837 t = fold_build_pointer_plus (sav, gpr);
11838 gimplify_assign (int_addr, t, pre_p);
11840 if (needed_sseregs)
11842 /* sse_addr = fpr + sav; */
11843 t = fold_build_pointer_plus (sav, fpr);
11844 gimplify_assign (sse_addr, t, pre_p);
11846 if (need_temp)
11848 int i, prev_size = 0;
11849 tree temp = create_tmp_var (type, "va_arg_tmp");
11851 /* addr = &temp; */
11852 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11853 gimplify_assign (addr, t, pre_p);
11855 for (i = 0; i < XVECLEN (container, 0); i++)
11857 rtx slot = XVECEXP (container, 0, i);
11858 rtx reg = XEXP (slot, 0);
11859 machine_mode mode = GET_MODE (reg);
11860 tree piece_type;
11861 tree addr_type;
11862 tree daddr_type;
11863 tree src_addr, src;
11864 int src_offset;
11865 tree dest_addr, dest;
11866 int cur_size = GET_MODE_SIZE (mode);
11868 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11869 prev_size = INTVAL (XEXP (slot, 1));
11870 if (prev_size + cur_size > size)
11872 cur_size = size - prev_size;
11873 unsigned int nbits = cur_size * BITS_PER_UNIT;
11874 if (!int_mode_for_size (nbits, 1).exists (&mode))
11875 mode = QImode;
11877 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11878 if (mode == GET_MODE (reg))
11879 addr_type = build_pointer_type (piece_type);
11880 else
11881 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11882 true);
11883 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11884 true);
11886 if (SSE_REGNO_P (REGNO (reg)))
11888 src_addr = sse_addr;
11889 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11891 else
11893 src_addr = int_addr;
11894 src_offset = REGNO (reg) * 8;
11896 src_addr = fold_convert (addr_type, src_addr);
11897 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11899 dest_addr = fold_convert (daddr_type, addr);
11900 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11901 if (cur_size == GET_MODE_SIZE (mode))
11903 src = build_va_arg_indirect_ref (src_addr);
11904 dest = build_va_arg_indirect_ref (dest_addr);
11906 gimplify_assign (dest, src, pre_p);
11908 else
11910 tree copy
11911 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11912 3, dest_addr, src_addr,
11913 size_int (cur_size));
11914 gimplify_and_add (copy, pre_p);
11916 prev_size += cur_size;
11920 if (needed_intregs)
11922 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11923 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11924 gimplify_assign (gpr, t, pre_p);
11927 if (needed_sseregs)
11929 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11930 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11931 gimplify_assign (unshare_expr (fpr), t, pre_p);
11934 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11936 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11939 /* ... otherwise out of the overflow area. */
11941 /* When we align a parameter on the stack for the caller, any alignment
11942 beyond MAX_SUPPORTED_STACK_ALIGNMENT is capped at
11943 MAX_SUPPORTED_STACK_ALIGNMENT. Match the caller's behavior here in
11944 the callee. */
11945 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11946 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11947 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11949 /* Care for on-stack alignment if needed. */
11950 if (arg_boundary <= 64 || size == 0)
11951 t = ovf;
11952 else
11954 HOST_WIDE_INT align = arg_boundary / 8;
11955 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11956 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11957 build_int_cst (TREE_TYPE (t), -align));
11960 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11961 gimplify_assign (addr, t, pre_p);
11963 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11964 gimplify_assign (unshare_expr (ovf), t, pre_p);
11966 if (container)
11967 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11969 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11970 addr = fold_convert (ptrtype, addr);
11972 if (indirect_p)
11973 addr = build_va_arg_indirect_ref (addr);
11974 return build_va_arg_indirect_ref (addr);
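/* Illustrative sketch (not part of GCC): for a type needing a single
   general-purpose register, the GIMPLE emitted above behaves roughly like
   this hand-written helper (names are only for illustration; alignment and
   multi-register cases are omitted):

     void *
     va_arg_slot (struct __va_list_tag *ap)
     {
       void *addr;
       if (ap->gp_offset < 6 * 8)          // still room in the GPR save area
         {
           addr = (char *) ap->reg_save_area + ap->gp_offset;
           ap->gp_offset += 8;
         }
       else                                 // ... otherwise the overflow area
         {
           addr = ap->overflow_arg_area;
           ap->overflow_arg_area = (char *) ap->overflow_arg_area + 8;
         }
       return addr;
     }  */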
11977 /* Return true if OPNUM's MEM should be matched
11978 in movabs* patterns. */
11980 bool
11981 ix86_check_movabs (rtx insn, int opnum)
11983 rtx set, mem;
11985 set = PATTERN (insn);
11986 if (GET_CODE (set) == PARALLEL)
11987 set = XVECEXP (set, 0, 0);
11988 gcc_assert (GET_CODE (set) == SET);
11989 mem = XEXP (set, opnum);
11990 while (SUBREG_P (mem))
11991 mem = SUBREG_REG (mem);
11992 gcc_assert (MEM_P (mem));
11993 return volatile_ok || !MEM_VOLATILE_P (mem);
11996 /* Return false if INSN contains a MEM with a non-default address space. */
11997 bool
11998 ix86_check_no_addr_space (rtx insn)
12000 subrtx_var_iterator::array_type array;
12001 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
12003 rtx x = *iter;
12004 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
12005 return false;
12007 return true;
12010 /* Initialize the table of extra 80387 mathematical constants. */
12012 static void
12013 init_ext_80387_constants (void)
12015 static const char * cst[5] =
12017 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
12018 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
12019 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
12020 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
12021 "3.1415926535897932385128089594061862044", /* 4: fldpi */
12023 int i;
12025 for (i = 0; i < 5; i++)
12027 real_from_string (&ext_80387_constants_table[i], cst[i]);
12028 /* Ensure each constant is rounded to XFmode precision. */
12029 real_convert (&ext_80387_constants_table[i],
12030 XFmode, &ext_80387_constants_table[i]);
12033 ext_80387_constants_init = 1;
12036 /* Return non-zero if the constant is something that
12037 can be loaded with a special instruction. */
12039 int
12040 standard_80387_constant_p (rtx x)
12042 machine_mode mode = GET_MODE (x);
12044 const REAL_VALUE_TYPE *r;
12046 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
12047 return -1;
12049 if (x == CONST0_RTX (mode))
12050 return 1;
12051 if (x == CONST1_RTX (mode))
12052 return 2;
12054 r = CONST_DOUBLE_REAL_VALUE (x);
12056 /* For XFmode constants, try to find a special 80387 instruction when
12057 optimizing for size or on those CPUs that benefit from them. */
12058 if (mode == XFmode
12059 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
12061 int i;
12063 if (! ext_80387_constants_init)
12064 init_ext_80387_constants ();
12066 for (i = 0; i < 5; i++)
12067 if (real_identical (r, &ext_80387_constants_table[i]))
12068 return i + 3;
12071 /* A load of the constant -0.0 or -1.0 will be split into an
12072 fldz;fchs or fld1;fchs sequence. */
12073 if (real_isnegzero (r))
12074 return 8;
12075 if (real_identical (r, &dconstm1))
12076 return 9;
12078 return 0;
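/* Illustrative sketch (not part of GCC): when optimizing for size, or on
   CPUs that benefit, a long double constant matching one of the five table
   entries above can be materialized with a single x87 load, e.g.

     long double pi (void) { return 3.1415926535897932385L; }

   can be recognized as entry 4 (return value 7) and emitted as fldpi
   instead of a constant-pool load, provided the literal rounds to exactly
   the XFmode value stored in the table.  */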
12081 /* Return the opcode of the special instruction to be used to load
12082 the constant X. */
12084 const char *
12085 standard_80387_constant_opcode (rtx x)
12087 switch (standard_80387_constant_p (x))
12089 case 1:
12090 return "fldz";
12091 case 2:
12092 return "fld1";
12093 case 3:
12094 return "fldlg2";
12095 case 4:
12096 return "fldln2";
12097 case 5:
12098 return "fldl2e";
12099 case 6:
12100 return "fldl2t";
12101 case 7:
12102 return "fldpi";
12103 case 8:
12104 case 9:
12105 return "#";
12106 default:
12107 gcc_unreachable ();
12111 /* Return the CONST_DOUBLE representing the 80387 constant that is
12112 loaded by the specified special instruction. The argument IDX
12113 matches the return value from standard_80387_constant_p. */
12115 rtx
12116 standard_80387_constant_rtx (int idx)
12118 int i;
12120 if (! ext_80387_constants_init)
12121 init_ext_80387_constants ();
12123 switch (idx)
12125 case 3:
12126 case 4:
12127 case 5:
12128 case 6:
12129 case 7:
12130 i = idx - 3;
12131 break;
12133 default:
12134 gcc_unreachable ();
12137 return const_double_from_real_value (ext_80387_constants_table[i],
12138 XFmode);
12141 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
12142 in supported SSE/AVX vector mode. */
12144 int
12145 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12147 machine_mode mode;
12149 if (!TARGET_SSE)
12150 return 0;
12152 mode = GET_MODE (x);
12154 if (x == const0_rtx || const0_operand (x, mode))
12155 return 1;
12157 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12159 /* VOIDmode integer constant, get mode from the predicate. */
12160 if (mode == VOIDmode)
12161 mode = pred_mode;
12163 switch (GET_MODE_SIZE (mode))
12165 case 64:
12166 if (TARGET_AVX512F)
12167 return 2;
12168 break;
12169 case 32:
12170 if (TARGET_AVX2)
12171 return 2;
12172 break;
12173 case 16:
12174 if (TARGET_SSE2)
12175 return 2;
12176 break;
12177 case 0:
12178 /* VOIDmode */
12179 gcc_unreachable ();
12180 default:
12181 break;
12185 return 0;
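/* Illustrative sketch (not part of GCC): the only vector constants
   recognized above are all-zeros and all-ones, e.g.

     typedef int v4si __attribute__ ((vector_size (16)));
     v4si zero (void) { return (v4si) { 0, 0, 0, 0 }; }      // pxor,    1
     v4si ones (void) { return (v4si) { -1, -1, -1, -1 }; }  // pcmpeqd, 2

   where the trailing number is what standard_sse_constant_p returns.  */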
12188 /* Return the opcode of the special instruction to be used to load
12189 the constant X. */
12191 const char *
12192 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12194 machine_mode mode;
12196 gcc_assert (TARGET_SSE);
12198 mode = GET_MODE (x);
12200 if (x == const0_rtx || const0_operand (x, mode))
12202 switch (get_attr_mode (insn))
12204 case MODE_XI:
12205 return "vpxord\t%g0, %g0, %g0";
12206 case MODE_OI:
12207 return (TARGET_AVX512VL
12208 ? "vpxord\t%x0, %x0, %x0"
12209 : "vpxor\t%x0, %x0, %x0");
12210 case MODE_TI:
12211 return (TARGET_AVX512VL
12212 ? "vpxord\t%t0, %t0, %t0"
12213 : "%vpxor\t%0, %d0");
12215 case MODE_V8DF:
12216 return (TARGET_AVX512DQ
12217 ? "vxorpd\t%g0, %g0, %g0"
12218 : "vpxorq\t%g0, %g0, %g0");
12219 case MODE_V4DF:
12220 return "vxorpd\t%x0, %x0, %x0";
12221 case MODE_V2DF:
12222 return "%vxorpd\t%0, %d0";
12224 case MODE_V16SF:
12225 return (TARGET_AVX512DQ
12226 ? "vxorps\t%g0, %g0, %g0"
12227 : "vpxord\t%g0, %g0, %g0");
12228 case MODE_V8SF:
12229 return "vxorps\t%x0, %x0, %x0";
12230 case MODE_V4SF:
12231 return "%vxorps\t%0, %d0";
12233 default:
12234 gcc_unreachable ();
12237 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12239 enum attr_mode insn_mode = get_attr_mode (insn);
12241 switch (insn_mode)
12243 case MODE_XI:
12244 case MODE_V8DF:
12245 case MODE_V16SF:
12246 gcc_assert (TARGET_AVX512F);
12247 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12249 case MODE_OI:
12250 case MODE_V4DF:
12251 case MODE_V8SF:
12252 gcc_assert (TARGET_AVX2);
12253 /* FALLTHRU */
12254 case MODE_TI:
12255 case MODE_V2DF:
12256 case MODE_V4SF:
12257 gcc_assert (TARGET_SSE2);
12258 return (TARGET_AVX
12259 ? "vpcmpeqd\t%0, %0, %0"
12260 : "pcmpeqd\t%0, %0");
12262 default:
12263 gcc_unreachable ();
12267 gcc_unreachable ();
12270 /* Returns true if INSN can be transformed from a memory load
12271 to a supported FP constant load. */
12273 bool
12274 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12276 rtx src = find_constant_src (insn);
12278 gcc_assert (REG_P (dst));
12280 if (src == NULL
12281 || (SSE_REGNO_P (REGNO (dst))
12282 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12283 || (STACK_REGNO_P (REGNO (dst))
12284 && standard_80387_constant_p (src) < 1))
12285 return false;
12287 return true;
12290 /* Returns true if OP contains a symbol reference */
12292 bool
12293 symbolic_reference_mentioned_p (rtx op)
12295 const char *fmt;
12296 int i;
12298 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12299 return true;
12301 fmt = GET_RTX_FORMAT (GET_CODE (op));
12302 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12304 if (fmt[i] == 'E')
12306 int j;
12308 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12309 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12310 return true;
12313 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12314 return true;
12317 return false;
12320 /* Return true if it is appropriate to emit `ret' instructions in the
12321 body of a function. Do this only if the epilogue is simple, needing a
12322 couple of insns. Prior to reloading, we can't tell how many registers
12323 must be saved, so return false then. Return false if there is no frame
12324 marker to de-allocate. */
12326 bool
12327 ix86_can_use_return_insn_p (void)
12329 struct ix86_frame frame;
12331 if (ix86_function_naked (current_function_decl))
12332 return false;
12334 /* Don't use `ret' instruction in interrupt handler. */
12335 if (! reload_completed
12336 || frame_pointer_needed
12337 || cfun->machine->func_type != TYPE_NORMAL)
12338 return 0;
12340 /* Don't allow more than 32k pop, since that's all we can do
12341 with one instruction. */
12342 if (crtl->args.pops_args && crtl->args.size >= 32768)
12343 return 0;
12345 frame = cfun->machine->frame;
12346 return (frame.stack_pointer_offset == UNITS_PER_WORD
12347 && (frame.nregs + frame.nsseregs) == 0);
12350 /* Value should be nonzero if functions must have frame pointers.
12351 Zero means the frame pointer need not be set up (and parms may
12352 be accessed via the stack pointer) in functions that seem suitable. */
12354 static bool
12355 ix86_frame_pointer_required (void)
12357 /* If we accessed previous frames, then the generated code expects
12358 to be able to access the saved ebp value in our frame. */
12359 if (cfun->machine->accesses_prev_frame)
12360 return true;
12362 /* Several x86 os'es need a frame pointer for other reasons,
12363 usually pertaining to setjmp. */
12364 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12365 return true;
12367 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
12368 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12369 return true;
12371 /* Win64 SEH, very large frames need a frame-pointer as maximum stack
12372 allocation is 4GB. */
12373 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12374 return true;
12376 /* SSE saves require frame-pointer when stack is misaligned. */
12377 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12378 return true;
12380 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12381 turns off the frame pointer by default. Turn it back on now if
12382 we've not got a leaf function. */
12383 if (TARGET_OMIT_LEAF_FRAME_POINTER
12384 && (!crtl->is_leaf
12385 || ix86_current_function_calls_tls_descriptor))
12386 return true;
12388 if (crtl->profile && !flag_fentry)
12389 return true;
12391 return false;
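/* Illustrative sketch (not part of GCC): the accesses_prev_frame case above
   is triggered, for instance, by

     void *caller_frame (void) { return __builtin_frame_address (1); }

   whose expansion calls ix86_setup_frame_addresses below, so the function
   keeps a frame pointer even under -fomit-frame-pointer.  */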
12394 /* Record that the current function accesses previous call frames. */
12396 void
12397 ix86_setup_frame_addresses (void)
12399 cfun->machine->accesses_prev_frame = 1;
12402 #ifndef USE_HIDDEN_LINKONCE
12403 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12404 # define USE_HIDDEN_LINKONCE 1
12405 # else
12406 # define USE_HIDDEN_LINKONCE 0
12407 # endif
12408 #endif
12410 static int pic_labels_used;
12412 /* Fills in the label name that should be used for a pc thunk for
12413 the given register. */
12415 static void
12416 get_pc_thunk_name (char name[32], unsigned int regno)
12418 gcc_assert (!TARGET_64BIT);
12420 if (USE_HIDDEN_LINKONCE)
12421 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12422 else
12423 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
12427 /* This function generates code for -fpic that loads %ebx with
12428 the return address of the caller and then returns. */
12430 static void
12431 ix86_code_end (void)
12433 rtx xops[2];
12434 int regno;
12436 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12438 char name[32];
12439 tree decl;
12441 if (!(pic_labels_used & (1 << regno)))
12442 continue;
12444 get_pc_thunk_name (name, regno);
12446 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12447 get_identifier (name),
12448 build_function_type_list (void_type_node, NULL_TREE));
12449 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12450 NULL_TREE, void_type_node);
12451 TREE_PUBLIC (decl) = 1;
12452 TREE_STATIC (decl) = 1;
12453 DECL_IGNORED_P (decl) = 1;
12455 #if TARGET_MACHO
12456 if (TARGET_MACHO)
12458 switch_to_section (darwin_sections[picbase_thunk_section]);
12459 fputs ("\t.weak_definition\t", asm_out_file);
12460 assemble_name (asm_out_file, name);
12461 fputs ("\n\t.private_extern\t", asm_out_file);
12462 assemble_name (asm_out_file, name);
12463 putc ('\n', asm_out_file);
12464 ASM_OUTPUT_LABEL (asm_out_file, name);
12465 DECL_WEAK (decl) = 1;
12467 else
12468 #endif
12469 if (USE_HIDDEN_LINKONCE)
12471 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12473 targetm.asm_out.unique_section (decl, 0);
12474 switch_to_section (get_named_section (decl, NULL, 0));
12476 targetm.asm_out.globalize_label (asm_out_file, name);
12477 fputs ("\t.hidden\t", asm_out_file);
12478 assemble_name (asm_out_file, name);
12479 putc ('\n', asm_out_file);
12480 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12482 else
12484 switch_to_section (text_section);
12485 ASM_OUTPUT_LABEL (asm_out_file, name);
12488 DECL_INITIAL (decl) = make_node (BLOCK);
12489 current_function_decl = decl;
12490 allocate_struct_function (decl, false);
12491 init_function_start (decl);
12492 /* We're about to hide the function body from callees of final_* by
12493 emitting it directly; tell them we're a thunk, if they care. */
12494 cfun->is_thunk = true;
12495 first_function_block_is_cold = false;
12496 /* Make sure unwind info is emitted for the thunk if needed. */
12497 final_start_function (emit_barrier (), asm_out_file, 1);
12499 /* Pad stack IP move with 4 instructions (two NOPs count
12500 as one instruction). */
12501 if (TARGET_PAD_SHORT_FUNCTION)
12503 int i = 8;
12505 while (i--)
12506 fputs ("\tnop\n", asm_out_file);
12509 xops[0] = gen_rtx_REG (Pmode, regno);
12510 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12511 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12512 output_asm_insn ("%!ret", NULL);
12513 final_end_function ();
12514 init_insn_lengths ();
12515 free_after_compilation (cfun);
12516 set_cfun (NULL);
12517 current_function_decl = NULL;
12520 if (flag_split_stack)
12521 file_end_indicate_split_stack ();
12524 /* Emit code for the SET_GOT patterns. */
12526 const char *
12527 output_set_got (rtx dest, rtx label)
12529 rtx xops[3];
12531 xops[0] = dest;
12533 if (TARGET_VXWORKS_RTP && flag_pic)
12535 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12536 xops[2] = gen_rtx_MEM (Pmode,
12537 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12538 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12540 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12541 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12542 an unadorned address. */
12543 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12544 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12545 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12546 return "";
12549 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12551 if (flag_pic)
12553 char name[32];
12554 get_pc_thunk_name (name, REGNO (dest));
12555 pic_labels_used |= 1 << REGNO (dest);
12557 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12558 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12559 output_asm_insn ("%!call\t%X2", xops);
12561 #if TARGET_MACHO
12562 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12563 This is what will be referenced by the Mach-O PIC subsystem. */
12564 if (machopic_should_output_picbase_label () || !label)
12565 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12567 /* When we are restoring the pic base at the site of a nonlocal label,
12568 and we decided to emit the pic base above, we will still output a
12569 local label used for calculating the correction offset (even though
12570 the offset will be 0 in that case). */
12571 if (label)
12572 targetm.asm_out.internal_label (asm_out_file, "L",
12573 CODE_LABEL_NUMBER (label));
12574 #endif
12576 else
12578 if (TARGET_MACHO)
12579 /* We don't need a pic base, we're not producing pic. */
12580 gcc_unreachable ();
12582 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12583 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12584 targetm.asm_out.internal_label (asm_out_file, "L",
12585 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12588 if (!TARGET_MACHO)
12589 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12591 return "";
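/* Illustrative sketch (an assumption, not taken from the source): for a
   typical 32-bit ELF -fpic function using %ebx as the PIC register, the
   code emitted above roughly corresponds to

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   i.e. a call to the pc-thunk generated by ix86_code_end followed by the
   add of GOT_SYMBOL_NAME printed on the non-Mach-O path.  */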
12594 /* Generate an "push" pattern for input ARG. */
12596 static rtx
12597 gen_push (rtx arg)
12599 struct machine_function *m = cfun->machine;
12601 if (m->fs.cfa_reg == stack_pointer_rtx)
12602 m->fs.cfa_offset += UNITS_PER_WORD;
12603 m->fs.sp_offset += UNITS_PER_WORD;
12605 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12606 arg = gen_rtx_REG (word_mode, REGNO (arg));
12608 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12609 gen_rtx_PRE_DEC (Pmode,
12610 stack_pointer_rtx)),
12611 arg);
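/* As a rough example, on x86-64 gen_push (gen_rtx_REG (DImode, BX_REG))
   yields the RTL
	(set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))
   i.e. a single word-sized push; the bookkeeping above also advances the
   tracked sp offset (and the CFA offset while the CFA is still the stack
   pointer) by UNITS_PER_WORD.  */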
12614 /* Generate an "pop" pattern for input ARG. */
12616 static rtx
12617 gen_pop (rtx arg)
12619 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12620 arg = gen_rtx_REG (word_mode, REGNO (arg));
12622 return gen_rtx_SET (arg,
12623 gen_rtx_MEM (word_mode,
12624 gen_rtx_POST_INC (Pmode,
12625 stack_pointer_rtx)));
12628 /* Return >= 0 if there is an unused call-clobbered register available
12629 for the entire function. */
12631 static unsigned int
12632 ix86_select_alt_pic_regnum (void)
12634 if (ix86_use_pseudo_pic_reg ())
12635 return INVALID_REGNUM;
12637 if (crtl->is_leaf
12638 && !crtl->profile
12639 && !ix86_current_function_calls_tls_descriptor)
12641 int i, drap;
12642 /* Can't use the same register for both PIC and DRAP. */
12643 if (crtl->drap_reg)
12644 drap = REGNO (crtl->drap_reg);
12645 else
12646 drap = -1;
12647 for (i = 2; i >= 0; --i)
12648 if (i != drap && !df_regs_ever_live_p (i))
12649 return i;
12652 return INVALID_REGNUM;
12655 /* Return true if REGNO is used by the epilogue. */
12657 bool
12658 ix86_epilogue_uses (int regno)
12660 /* If there are no caller-saved registers, we preserve all registers,
12661 except for MMX and x87 registers which aren't supported when saving
12662 and restoring registers. Don't explicitly save SP register since
12663 it is always preserved. */
12664 return (epilogue_completed
12665 && cfun->machine->no_caller_saved_registers
12666 && !fixed_regs[regno]
12667 && !STACK_REGNO_P (regno)
12668 && !MMX_REGNO_P (regno));
12671 /* Return nonzero if register REGNO can be used as a scratch register
12672 in peephole2. */
12674 static bool
12675 ix86_hard_regno_scratch_ok (unsigned int regno)
12677 /* If there are no caller-saved registers, we can't use any register
12678 as a scratch register after epilogue and use REGNO as scratch
12679 register only if it has been used before to avoid saving and
12680 restoring it. */
12681 return (!cfun->machine->no_caller_saved_registers
12682 || (!epilogue_completed
12683 && df_regs_ever_live_p (regno)));
12686 /* Return true if register class CL should be an additional allocno
12687 class. */
12689 static bool
12690 ix86_additional_allocno_class_p (reg_class_t cl)
12692 return cl == MOD4_SSE_REGS;
12695 /* Return TRUE if we need to save REGNO. */
12697 static bool
12698 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12700 /* If there are no caller-saved registers, we preserve all registers,
12701 except for MMX and x87 registers which aren't supported when saving
12702 and restoring registers. Don't explicitly save SP register since
12703 it is always preserved. */
12704 if (cfun->machine->no_caller_saved_registers)
12706 /* Don't preserve registers used for function return value. */
12707 rtx reg = crtl->return_rtx;
12708 if (reg)
12710 unsigned int i = REGNO (reg);
12711 unsigned int nregs = REG_NREGS (reg);
12712 while (nregs-- > 0)
12713 if ((i + nregs) == regno)
12714 return false;
12716 reg = crtl->return_bnd;
12717 if (reg)
12719 i = REGNO (reg);
12720 nregs = REG_NREGS (reg);
12721 while (nregs-- > 0)
12722 if ((i + nregs) == regno)
12723 return false;
12727 return (df_regs_ever_live_p (regno)
12728 && !fixed_regs[regno]
12729 && !STACK_REGNO_P (regno)
12730 && !MMX_REGNO_P (regno)
12731 && (regno != HARD_FRAME_POINTER_REGNUM
12732 || !frame_pointer_needed));
12735 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12736 && pic_offset_table_rtx)
12738 if (ix86_use_pseudo_pic_reg ())
12740 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12741 _mcount in prologue. */
12742 if (!TARGET_64BIT && flag_pic && crtl->profile)
12743 return true;
12745 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12746 || crtl->profile
12747 || crtl->calls_eh_return
12748 || crtl->uses_const_pool
12749 || cfun->has_nonlocal_label)
12750 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12753 if (crtl->calls_eh_return && maybe_eh_return)
12755 unsigned i;
12756 for (i = 0; ; i++)
12758 unsigned test = EH_RETURN_DATA_REGNO (i);
12759 if (test == INVALID_REGNUM)
12760 break;
12761 if (test == regno)
12762 return true;
12766 if (ignore_outlined && cfun->machine->call_ms2sysv)
12768 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12769 + xlogue_layout::MIN_REGS;
12770 if (xlogue_layout::is_stub_managed_reg (regno, count))
12771 return false;
12774 if (crtl->drap_reg
12775 && regno == REGNO (crtl->drap_reg)
12776 && !cfun->machine->no_drap_save_restore)
12777 return true;
12779 return (df_regs_ever_live_p (regno)
12780 && !call_used_regs[regno]
12781 && !fixed_regs[regno]
12782 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12785 /* Return the number of saved general purpose registers. */
12787 static int
12788 ix86_nsaved_regs (void)
12790 int nregs = 0;
12791 int regno;
12793 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12794 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12795 nregs ++;
12796 return nregs;
12799 /* Return number of saved SSE registers. */
12801 static int
12802 ix86_nsaved_sseregs (void)
12804 int nregs = 0;
12805 int regno;
12807 if (!TARGET_64BIT_MS_ABI)
12808 return 0;
12809 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12810 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12811 nregs ++;
12812 return nregs;
12815 /* Given FROM and TO register numbers, say whether this elimination is
12816 allowed. If stack alignment is needed, we can only replace argument
12817 pointer with hard frame pointer, or replace frame pointer with stack
12818 pointer. Otherwise, frame pointer elimination is automatically
12819 handled and all other eliminations are valid. */
12821 static bool
12822 ix86_can_eliminate (const int from, const int to)
12824 if (stack_realign_fp)
12825 return ((from == ARG_POINTER_REGNUM
12826 && to == HARD_FRAME_POINTER_REGNUM)
12827 || (from == FRAME_POINTER_REGNUM
12828 && to == STACK_POINTER_REGNUM));
12829 else
12830 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12833 /* Return the offset between two registers, one to be eliminated, and the other
12834 its replacement, at the start of a routine. */
12836 HOST_WIDE_INT
12837 ix86_initial_elimination_offset (int from, int to)
12839 struct ix86_frame frame = cfun->machine->frame;
12841 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12842 return frame.hard_frame_pointer_offset;
12843 else if (from == FRAME_POINTER_REGNUM
12844 && to == HARD_FRAME_POINTER_REGNUM)
12845 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12846 else
12848 gcc_assert (to == STACK_POINTER_REGNUM);
12850 if (from == ARG_POINTER_REGNUM)
12851 return frame.stack_pointer_offset;
12853 gcc_assert (from == FRAME_POINTER_REGNUM);
12854 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12858 /* In a dynamically-aligned function, we can't know the offset from
12859 stack pointer to frame pointer, so we must ensure that setjmp
12860 eliminates fp against the hard fp (%ebp) rather than trying to
12861 index from %esp up to the top of the frame across a gap that is
12862 of unknown (at compile-time) size. */
12863 static rtx
12864 ix86_builtin_setjmp_frame_value (void)
12866 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12869 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
12870 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12872 static bool warned_once = false;
12873 if (!warned_once)
12875 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12876 feature);
12877 warned_once = true;
12881 /* When using -fsplit-stack, the allocation routines set a field in
12882 the TCB to the bottom of the stack plus this much space, measured
12883 in bytes. */
12885 #define SPLIT_STACK_AVAILABLE 256
12887 /* Fill the ix86_frame structure with information about the frame of the current function. */
12889 static void
12890 ix86_compute_frame_layout (void)
12892 struct ix86_frame *frame = &cfun->machine->frame;
12893 struct machine_function *m = cfun->machine;
12894 unsigned HOST_WIDE_INT stack_alignment_needed;
12895 HOST_WIDE_INT offset;
12896 unsigned HOST_WIDE_INT preferred_alignment;
12897 HOST_WIDE_INT size = get_frame_size ();
12898 HOST_WIDE_INT to_allocate;
12900 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12901 * ms_abi functions that call a sysv function. We now need to prune away
12902 * cases where it should be disabled. */
12903 if (TARGET_64BIT && m->call_ms2sysv)
12905 gcc_assert (TARGET_64BIT_MS_ABI);
12906 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12907 gcc_assert (!TARGET_SEH);
12908 gcc_assert (TARGET_SSE);
12909 gcc_assert (!ix86_using_red_zone ());
12911 if (crtl->calls_eh_return)
12913 gcc_assert (!reload_completed);
12914 m->call_ms2sysv = false;
12915 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12918 else if (ix86_static_chain_on_stack)
12920 gcc_assert (!reload_completed);
12921 m->call_ms2sysv = false;
12922 warn_once_call_ms2sysv_xlogues ("static call chains");
12925 /* Finally, compute which registers the stub will manage. */
12926 else
12928 unsigned count = xlogue_layout::count_stub_managed_regs ();
12929 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12930 m->call_ms2sysv_pad_in = 0;
12934 frame->nregs = ix86_nsaved_regs ();
12935 frame->nsseregs = ix86_nsaved_sseregs ();
12937 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12938 except for function prologues, leaf functions and when the default
12939 incoming stack boundary is overridden at the command line or via the
12940 force_align_arg_pointer attribute. */
12941 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12942 && (!crtl->is_leaf || cfun->calls_alloca != 0
12943 || ix86_current_function_calls_tls_descriptor
12944 || ix86_incoming_stack_boundary < 128))
12946 crtl->preferred_stack_boundary = 128;
12947 crtl->stack_alignment_needed = 128;
12950 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12951 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12953 gcc_assert (!size || stack_alignment_needed);
12954 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12955 gcc_assert (preferred_alignment <= stack_alignment_needed);
12957 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
12958 gcc_assert (TARGET_64BIT || !frame->nsseregs);
12959 if (TARGET_64BIT && m->call_ms2sysv)
12961 gcc_assert (stack_alignment_needed >= 16);
12962 gcc_assert (!frame->nsseregs);
12965 /* For SEH we have to limit the amount of code movement into the prologue.
12966 At present we do this via a BLOCKAGE, at which point there's very little
12967 scheduling that can be done, which means that there's very little point
12968 in doing anything except PUSHs. */
12969 if (TARGET_SEH)
12970 m->use_fast_prologue_epilogue = false;
12971 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12973 int count = frame->nregs;
12974 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12976 /* The fast prologue uses move instead of push to save registers. This
12977 is significantly longer, but also executes faster as modern hardware
12978 can execute the moves in parallel, but can't do that for push/pop.
12980 Be careful about choosing what prologue to emit: When function takes
12981 many instructions to execute we may use slow version as well as in
12982 case function is known to be outside hot spot (this is known with
12983 feedback only). Weight the size of function by number of registers
12984 to save as it is cheap to use one or two push instructions but very
12985 slow to use many of them. */
12986 if (count)
12987 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12988 if (node->frequency < NODE_FREQUENCY_NORMAL
12989 || (flag_branch_probabilities
12990 && node->frequency < NODE_FREQUENCY_HOT))
12991 m->use_fast_prologue_epilogue = false;
12992 else
12993 m->use_fast_prologue_epilogue
12994 = !expensive_function_p (count);
12997 frame->save_regs_using_mov
12998 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12999 /* If static stack checking is enabled and done with probes,
13000 the registers need to be saved before allocating the frame. */
13001 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
13003 /* Skip return address and error code in exception handler. */
13004 offset = INCOMING_FRAME_SP_OFFSET;
13006 /* Skip pushed static chain. */
13007 if (ix86_static_chain_on_stack)
13008 offset += UNITS_PER_WORD;
13010 /* Skip saved base pointer. */
13011 if (frame_pointer_needed)
13012 offset += UNITS_PER_WORD;
13013 frame->hfp_save_offset = offset;
13015 /* The traditional frame pointer location is at the top of the frame. */
13016 frame->hard_frame_pointer_offset = offset;
13018 /* Register save area */
13019 offset += frame->nregs * UNITS_PER_WORD;
13020 frame->reg_save_offset = offset;
13022 /* On SEH targets, registers are pushed just before the frame pointer
13023 location. */
13024 if (TARGET_SEH)
13025 frame->hard_frame_pointer_offset = offset;
13027 /* Calculate the size of the va-arg area (not including padding, if any). */
13028 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
13030 if (stack_realign_fp)
13032 /* We may need a 16-byte aligned stack for the remainder of the
13033 register save area, but the stack frame for the local function
13034 may require a greater alignment if using AVX/2/512. In order
13035 to avoid wasting space, we first calculate the space needed for
13036 the rest of the register saves, add that to the stack pointer,
13037 and then realign the stack to the boundary of the start of the
13038 frame for the local function. */
13039 HOST_WIDE_INT space_needed = 0;
13040 HOST_WIDE_INT sse_reg_space_needed = 0;
13042 if (TARGET_64BIT)
13044 if (m->call_ms2sysv)
13046 m->call_ms2sysv_pad_in = 0;
13047 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
13050 else if (frame->nsseregs)
13051 /* The only ABI that has saved SSE registers (Win64) also has a
13052 16-byte aligned default stack. However, many programs violate
13053 the ABI, and Wine64 forces stack realignment to compensate. */
13054 space_needed = frame->nsseregs * 16;
13056 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
13058 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
13059 we round to be pedantic. */
13060 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
13062 else
13063 space_needed = frame->va_arg_size;
13065 /* Record the allocation size required prior to the realignment AND. */
13066 frame->stack_realign_allocate = space_needed;
13068 /* The re-aligned stack starts at frame->stack_realign_offset. Values
13069 before this point are not directly comparable with values below
13070 this point. Use sp_valid_at to determine if the stack pointer is
13071 valid for a given offset, fp_valid_at for the frame pointer, or
13072 choose_baseaddr to have a base register chosen for you.
13074 Note that the result of (frame->stack_realign_offset
13075 & (stack_alignment_needed - 1)) may not equal zero. */
13076 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
13077 frame->stack_realign_offset = offset - space_needed;
13078 frame->sse_reg_save_offset = frame->stack_realign_offset
13079 + sse_reg_space_needed;
13081 else
13083 frame->stack_realign_offset = offset;
13085 if (TARGET_64BIT && m->call_ms2sysv)
13087 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
13088 offset += xlogue_layout::get_instance ().get_stack_space_used ();
13091 /* Align and set SSE register save area. */
13092 else if (frame->nsseregs)
13094 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
13095 required and the DRAP re-alignment boundary is at least 16 bytes,
13096 then we want the SSE register save area properly aligned. */
13097 if (ix86_incoming_stack_boundary >= 128
13098 || (stack_realign_drap && stack_alignment_needed >= 16))
13099 offset = ROUND_UP (offset, 16);
13100 offset += frame->nsseregs * 16;
13102 frame->sse_reg_save_offset = offset;
13103 offset += frame->va_arg_size;
13106 /* Align start of frame for local function. */
13107 if (m->call_ms2sysv
13108 || frame->va_arg_size != 0
13109 || size != 0
13110 || !crtl->is_leaf
13111 || cfun->calls_alloca
13112 || ix86_current_function_calls_tls_descriptor)
13113 offset = ROUND_UP (offset, stack_alignment_needed);
13115 /* Frame pointer points here. */
13116 frame->frame_pointer_offset = offset;
13118 offset += size;
13120 /* Add the outgoing arguments area. It can be skipped if we eliminated
13121 all the function calls as dead code.
13122 Skipping is however impossible when the function calls alloca. The alloca
13123 expander assumes that the last crtl->outgoing_args_size bytes
13124 of the stack frame are unused. */
13125 if (ACCUMULATE_OUTGOING_ARGS
13126 && (!crtl->is_leaf || cfun->calls_alloca
13127 || ix86_current_function_calls_tls_descriptor))
13129 offset += crtl->outgoing_args_size;
13130 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13132 else
13133 frame->outgoing_arguments_size = 0;
13135 /* Align stack boundary. Only needed if we're calling another function
13136 or using alloca. */
13137 if (!crtl->is_leaf || cfun->calls_alloca
13138 || ix86_current_function_calls_tls_descriptor)
13139 offset = ROUND_UP (offset, preferred_alignment);
13141 /* We've reached end of stack frame. */
13142 frame->stack_pointer_offset = offset;
13144 /* Size the prologue needs to allocate. */
13145 to_allocate = offset - frame->sse_reg_save_offset;
13147 if ((!to_allocate && frame->nregs <= 1)
13148 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13149 frame->save_regs_using_mov = false;
13151 if (ix86_using_red_zone ()
13152 && crtl->sp_is_unchanging
13153 && crtl->is_leaf
13154 && !ix86_pc_thunk_call_expanded
13155 && !ix86_current_function_calls_tls_descriptor)
13157 frame->red_zone_size = to_allocate;
13158 if (frame->save_regs_using_mov)
13159 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13160 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13161 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13163 else
13164 frame->red_zone_size = 0;
13165 frame->stack_pointer_offset -= frame->red_zone_size;
13167 /* The SEH frame pointer location is near the bottom of the frame.
13168 This is enforced by the fact that the difference between the
13169 stack pointer and the frame pointer is limited to 240 bytes in
13170 the unwind data structure. */
13171 if (TARGET_SEH)
13173 HOST_WIDE_INT diff;
13175 /* If we can leave the frame pointer where it is, do so. Also, returns
13176 the establisher frame for __builtin_frame_address (0). */
13177 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13178 if (diff <= SEH_MAX_FRAME_SIZE
13179 && (diff > 240 || (diff & 15) != 0)
13180 && !crtl->accesses_prior_frames)
13182 /* Ideally we'd determine what portion of the local stack frame
13183 (within the constraint of the lowest 240) is most heavily used.
13184 But without that complication, simply bias the frame pointer
13185 by 128 bytes so as to maximize the amount of the local stack
13186 frame that is addressable with 8-bit offsets. */
13187 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
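/* Rough sketch of the resulting layout (an editor's illustration, offsets
   measured from the CFA toward lower addresses): the return address /
   incoming SP offset, then the pushed static chain and saved frame pointer
   if present (hfp_save_offset / hard_frame_pointer_offset), the general
   register save area (reg_save_offset), the possibly realigned SSE save
   area and va_arg area (sse_reg_save_offset), the aligned local frame
   (frame_pointer_offset), and finally the outgoing argument area ending at
   stack_pointer_offset, less any red zone.  */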
13192 /* This is semi-inlined memory_address_length, but simplified
13193 since we know that we're always dealing with reg+offset, and
13194 to avoid having to create and discard all that rtl. */
13196 static inline int
13197 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13199 int len = 4;
13201 if (offset == 0)
13203 /* EBP and R13 cannot be encoded without an offset. */
13204 len = (regno == BP_REG || regno == R13_REG);
13206 else if (IN_RANGE (offset, -128, 127))
13207 len = 1;
13209 /* ESP and R12 must be encoded with a SIB byte. */
13210 if (regno == SP_REG || regno == R12_REG)
13211 len++;
13213 return len;
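/* For illustration, the returned value is the number of extra address
   bytes (displacement plus any SIB byte) needed beyond the ModRM byte,
   e.g. on x86-64:
	(%rax)       -> 0	16(%rbp)     -> 1
	(%rbp)       -> 1	256(%rax)    -> 4
	(%rsp)       -> 1	256(%rsp)    -> 5   (SIB byte required).  */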
13216 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
13217 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13219 static bool
13220 sp_valid_at (HOST_WIDE_INT cfa_offset)
13222 const struct machine_frame_state &fs = cfun->machine->fs;
13223 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
13225 /* Validate that the cfa_offset isn't in a "no-man's land". */
13226 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
13227 return false;
13229 return fs.sp_valid;
13232 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
13233 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13235 static inline bool
13236 fp_valid_at (HOST_WIDE_INT cfa_offset)
13238 const struct machine_frame_state &fs = cfun->machine->fs;
13239 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
13241 /* Validate that the cfa_offset isn't in a "no-man's land". */
13242 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
13243 return false;
13245 return fs.fp_valid;
13248 /* Choose a base register based upon alignment requested, speed and/or
13249 size. */
13251 static void
13252 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13253 HOST_WIDE_INT &base_offset,
13254 unsigned int align_requested, unsigned int *align)
13256 const struct machine_function *m = cfun->machine;
13257 unsigned int hfp_align;
13258 unsigned int drap_align;
13259 unsigned int sp_align;
13260 bool hfp_ok = fp_valid_at (cfa_offset);
13261 bool drap_ok = m->fs.drap_valid;
13262 bool sp_ok = sp_valid_at (cfa_offset);
13264 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13266 /* Filter out any registers that don't meet the requested alignment
13267 criteria. */
13268 if (align_requested)
13270 if (m->fs.realigned)
13271 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13272 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
13273 notes (which we would need in order to use a realigned stack pointer),
13274 so disable on SEH targets. */
13275 else if (m->fs.sp_realigned)
13276 sp_align = crtl->stack_alignment_needed;
13278 hfp_ok = hfp_ok && hfp_align >= align_requested;
13279 drap_ok = drap_ok && drap_align >= align_requested;
13280 sp_ok = sp_ok && sp_align >= align_requested;
13283 if (m->use_fast_prologue_epilogue)
13285 /* Choose the base register most likely to allow the most scheduling
13286 opportunities. Generally FP is valid throughout the function,
13287 while DRAP must be reloaded within the epilogue. But choose either
13288 over the SP due to increased encoding size. */
13290 if (hfp_ok)
13292 base_reg = hard_frame_pointer_rtx;
13293 base_offset = m->fs.fp_offset - cfa_offset;
13295 else if (drap_ok)
13297 base_reg = crtl->drap_reg;
13298 base_offset = 0 - cfa_offset;
13300 else if (sp_ok)
13302 base_reg = stack_pointer_rtx;
13303 base_offset = m->fs.sp_offset - cfa_offset;
13306 else
13308 HOST_WIDE_INT toffset;
13309 int len = 16, tlen;
13311 /* Choose the base register with the smallest address encoding.
13312 With a tie, choose FP > DRAP > SP. */
13313 if (sp_ok)
13315 base_reg = stack_pointer_rtx;
13316 base_offset = m->fs.sp_offset - cfa_offset;
13317 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13319 if (drap_ok)
13321 toffset = 0 - cfa_offset;
13322 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13323 if (tlen <= len)
13325 base_reg = crtl->drap_reg;
13326 base_offset = toffset;
13327 len = tlen;
13330 if (hfp_ok)
13332 toffset = m->fs.fp_offset - cfa_offset;
13333 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13334 if (tlen <= len)
13336 base_reg = hard_frame_pointer_rtx;
13337 base_offset = toffset;
13338 len = tlen;
13343 /* Set the align return value. */
13344 if (align)
13346 if (base_reg == stack_pointer_rtx)
13347 *align = sp_align;
13348 else if (base_reg == crtl->drap_reg)
13349 *align = drap_align;
13350 else if (base_reg == hard_frame_pointer_rtx)
13351 *align = hfp_align;
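/* A small worked example (hypothetical numbers): when optimizing for size,
   if the slot is 8 bytes below the current SP but 256 bytes below the saved
   FP, the SP-based address costs one displacement byte plus a SIB byte (2)
   while the FP-based one costs a 4-byte displacement, so the stack pointer
   is chosen; with equal costs the tie-break above prefers FP over DRAP
   over SP.  */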
13355 /* Return an RTX that points to CFA_OFFSET within the stack frame and
13356 the alignment of the address. If ALIGN is non-null, it should point to
13357 an alignment value (in bits) that is preferred or zero and will
13358 receive the alignment of the base register that was selected,
13359 irrespective of whether or not CFA_OFFSET is a multiple of that
13360 alignment value.
13362 The valid base registers are taken from CFUN->MACHINE->FS. */
13364 static rtx
13365 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13367 rtx base_reg = NULL;
13368 HOST_WIDE_INT base_offset = 0;
13370 /* If a specific alignment is requested, try to get a base register
13371 with that alignment first. */
13372 if (align && *align)
13373 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13375 if (!base_reg)
13376 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13378 gcc_assert (base_reg != NULL);
13379 return plus_constant (Pmode, base_reg, base_offset);
13382 /* Emit code to save registers in the prologue. */
13384 static void
13385 ix86_emit_save_regs (void)
13387 unsigned int regno;
13388 rtx_insn *insn;
13390 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13391 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13393 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13394 RTX_FRAME_RELATED_P (insn) = 1;
13398 /* Emit a single register save at CFA - CFA_OFFSET. */
13400 static void
13401 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13402 HOST_WIDE_INT cfa_offset)
13404 struct machine_function *m = cfun->machine;
13405 rtx reg = gen_rtx_REG (mode, regno);
13406 rtx mem, addr, base, insn;
13407 unsigned int align = GET_MODE_ALIGNMENT (mode);
13409 addr = choose_baseaddr (cfa_offset, &align);
13410 mem = gen_frame_mem (mode, addr);
13412 /* The location alignment depends upon the base register. */
13413 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13414 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13415 set_mem_align (mem, align);
13417 insn = emit_insn (gen_rtx_SET (mem, reg));
13418 RTX_FRAME_RELATED_P (insn) = 1;
13420 base = addr;
13421 if (GET_CODE (base) == PLUS)
13422 base = XEXP (base, 0);
13423 gcc_checking_assert (REG_P (base));
13425 /* When saving registers into a re-aligned local stack frame, avoid
13426 any tricky guessing by dwarf2out. */
13427 if (m->fs.realigned)
13429 gcc_checking_assert (stack_realign_drap);
13431 if (regno == REGNO (crtl->drap_reg))
13433 /* A bit of a hack. We force the DRAP register to be saved in
13434 the re-aligned stack frame, which provides us with a copy
13435 of the CFA that will last past the prologue. Install it. */
13436 gcc_checking_assert (cfun->machine->fs.fp_valid);
13437 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13438 cfun->machine->fs.fp_offset - cfa_offset);
13439 mem = gen_rtx_MEM (mode, addr);
13440 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13442 else
13444 /* The frame pointer is a stable reference within the
13445 aligned frame. Use it. */
13446 gcc_checking_assert (cfun->machine->fs.fp_valid);
13447 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13448 cfun->machine->fs.fp_offset - cfa_offset);
13449 mem = gen_rtx_MEM (mode, addr);
13450 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13454 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13455 && cfa_offset >= m->fs.sp_realigned_offset)
13457 gcc_checking_assert (stack_realign_fp);
13458 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13461 /* The memory may not be relative to the current CFA register,
13462 which means that we may need to generate a new pattern for
13463 use by the unwind info. */
13464 else if (base != m->fs.cfa_reg)
13466 addr = plus_constant (Pmode, m->fs.cfa_reg,
13467 m->fs.cfa_offset - cfa_offset);
13468 mem = gen_rtx_MEM (mode, addr);
13469 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13473 /* Emit code to save registers using MOV insns.
13474 First register is stored at CFA - CFA_OFFSET. */
13475 static void
13476 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13478 unsigned int regno;
13480 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13481 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13483 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13484 cfa_offset -= UNITS_PER_WORD;
13488 /* Emit code to save SSE registers using MOV insns.
13489 First register is stored at CFA - CFA_OFFSET. */
13490 static void
13491 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13493 unsigned int regno;
13495 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13496 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13498 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13499 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13503 static GTY(()) rtx queued_cfa_restores;
13505 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
13506 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13507 Don't add the note if the previously saved value will be left untouched
13508 within the stack red zone until return, as unwinders can find the same value
13509 in the register and on the stack. */
13511 static void
13512 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13514 if (!crtl->shrink_wrapped
13515 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13516 return;
13518 if (insn)
13520 add_reg_note (insn, REG_CFA_RESTORE, reg);
13521 RTX_FRAME_RELATED_P (insn) = 1;
13523 else
13524 queued_cfa_restores
13525 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13528 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13530 static void
13531 ix86_add_queued_cfa_restore_notes (rtx insn)
13533 rtx last;
13534 if (!queued_cfa_restores)
13535 return;
13536 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13538 XEXP (last, 1) = REG_NOTES (insn);
13539 REG_NOTES (insn) = queued_cfa_restores;
13540 queued_cfa_restores = NULL_RTX;
13541 RTX_FRAME_RELATED_P (insn) = 1;
13544 /* Expand prologue or epilogue stack adjustment.
13545 The pattern exists to put a dependency on all ebp-based memory accesses.
13546 STYLE should be negative if instructions should be marked as frame related,
13547 zero if %r11 register is live and cannot be freely used and positive
13548 otherwise. */
13550 static rtx
13551 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13552 int style, bool set_cfa)
13554 struct machine_function *m = cfun->machine;
13555 rtx insn;
13556 bool add_frame_related_expr = false;
13558 if (Pmode == SImode)
13559 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13560 else if (x86_64_immediate_operand (offset, DImode))
13561 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13562 else
13564 rtx tmp;
13565 /* r11 is used by indirect sibcall return as well, set before the
13566 epilogue and used after the epilogue. */
13567 if (style)
13568 tmp = gen_rtx_REG (DImode, R11_REG);
13569 else
13571 gcc_assert (src != hard_frame_pointer_rtx
13572 && dest != hard_frame_pointer_rtx);
13573 tmp = hard_frame_pointer_rtx;
13575 insn = emit_insn (gen_rtx_SET (tmp, offset));
13576 if (style < 0)
13577 add_frame_related_expr = true;
13579 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13582 insn = emit_insn (insn);
13583 if (style >= 0)
13584 ix86_add_queued_cfa_restore_notes (insn);
13586 if (set_cfa)
13588 rtx r;
13590 gcc_assert (m->fs.cfa_reg == src);
13591 m->fs.cfa_offset += INTVAL (offset);
13592 m->fs.cfa_reg = dest;
13594 r = gen_rtx_PLUS (Pmode, src, offset);
13595 r = gen_rtx_SET (dest, r);
13596 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13597 RTX_FRAME_RELATED_P (insn) = 1;
13599 else if (style < 0)
13601 RTX_FRAME_RELATED_P (insn) = 1;
13602 if (add_frame_related_expr)
13604 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13605 r = gen_rtx_SET (dest, r);
13606 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13610 if (dest == stack_pointer_rtx)
13612 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13613 bool valid = m->fs.sp_valid;
13614 bool realigned = m->fs.sp_realigned;
13616 if (src == hard_frame_pointer_rtx)
13618 valid = m->fs.fp_valid;
13619 realigned = false;
13620 ooffset = m->fs.fp_offset;
13622 else if (src == crtl->drap_reg)
13624 valid = m->fs.drap_valid;
13625 realigned = false;
13626 ooffset = 0;
13628 else
13630 /* Else there are two possibilities: SP itself, which we set
13631 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
13632 taken care of by hand along the eh_return path. */
13633 gcc_checking_assert (src == stack_pointer_rtx
13634 || offset == const0_rtx);
13637 m->fs.sp_offset = ooffset - INTVAL (offset);
13638 m->fs.sp_valid = valid;
13639 m->fs.sp_realigned = realigned;
13641 return insn;
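/* Typical use (see e.g. ix86_adjust_stack_and_probe_stack_clash below):
	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				   GEN_INT (-size), -1,
				   m->fs.cfa_reg == stack_pointer_rtx);
   allocates SIZE bytes in the prologue, marks the insn frame-related
   (STYLE == -1) and updates the CFA while it is still the stack pointer.  */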
13644 /* Find an available register to be used as the dynamic realign argument
13645 pointer register. Such a register will be written in the prologue and
13646 used at the beginning of the body, so it must not be
13647 1. a parameter passing register.
13648 2. the GOT pointer.
13649 We reuse static-chain register if it is available. Otherwise, we
13650 use DI for i386 and R13 for x86-64. We chose R13 since it has
13651 shorter encoding.
13653 Return: the regno of chosen register. */
13655 static unsigned int
13656 find_drap_reg (void)
13658 tree decl = cfun->decl;
13660 /* Always use callee-saved register if there are no caller-saved
13661 registers. */
13662 if (TARGET_64BIT)
13664 /* Use R13 for nested functions or functions that need a static chain.
13665 Since a function with a tail call may use any caller-saved
13666 register in the epilogue, DRAP must not use a caller-saved
13667 register in that case. */
13668 if (DECL_STATIC_CHAIN (decl)
13669 || cfun->machine->no_caller_saved_registers
13670 || crtl->tail_call_emit)
13671 return R13_REG;
13673 return R10_REG;
13675 else
13677 /* Use DI for nested functions or functions that need a static chain.
13678 Since a function with a tail call may use any caller-saved
13679 register in the epilogue, DRAP must not use a caller-saved
13680 register in that case. */
13681 if (DECL_STATIC_CHAIN (decl)
13682 || cfun->machine->no_caller_saved_registers
13683 || crtl->tail_call_emit)
13684 return DI_REG;
13686 /* Reuse static chain register if it isn't used for parameter
13687 passing. */
13688 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13690 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13691 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13692 return CX_REG;
13694 return DI_REG;
13698 /* Handle a "force_align_arg_pointer" attribute. */
13700 static tree
13701 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13702 tree, int, bool *no_add_attrs)
13704 if (TREE_CODE (*node) != FUNCTION_TYPE
13705 && TREE_CODE (*node) != METHOD_TYPE
13706 && TREE_CODE (*node) != FIELD_DECL
13707 && TREE_CODE (*node) != TYPE_DECL)
13709 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13710 name);
13711 *no_add_attrs = true;
13714 return NULL_TREE;
13717 /* Return minimum incoming stack alignment. */
13719 static unsigned int
13720 ix86_minimum_incoming_stack_boundary (bool sibcall)
13722 unsigned int incoming_stack_boundary;
13724 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
13725 if (cfun->machine->func_type != TYPE_NORMAL)
13726 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13727 /* Prefer the one specified at command line. */
13728 else if (ix86_user_incoming_stack_boundary)
13729 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13730 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13731 if -mstackrealign is used, this isn't a sibcall check, and the
13732 estimated stack alignment is 128 bits. */
13733 else if (!sibcall
13734 && ix86_force_align_arg_pointer
13735 && crtl->stack_alignment_estimated == 128)
13736 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13737 else
13738 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13740 /* Incoming stack alignment can be changed on individual functions
13741 via force_align_arg_pointer attribute. We use the smallest
13742 incoming stack boundary. */
13743 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13744 && lookup_attribute (ix86_force_align_arg_pointer_string,
13745 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13746 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13748 /* The incoming stack frame has to be aligned at least at
13749 parm_stack_boundary. */
13750 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13751 incoming_stack_boundary = crtl->parm_stack_boundary;
13753 /* The stack at the entrance of main is aligned by the runtime. We use
13754 the smallest incoming stack boundary. */
13755 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13756 && DECL_NAME (current_function_decl)
13757 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13758 && DECL_FILE_SCOPE_P (current_function_decl))
13759 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13761 return incoming_stack_boundary;
13764 /* Update incoming stack boundary and estimated stack alignment. */
13766 static void
13767 ix86_update_stack_boundary (void)
13769 ix86_incoming_stack_boundary
13770 = ix86_minimum_incoming_stack_boundary (false);
13772 /* x86_64 varargs need 16-byte stack alignment for the register save
13773 area. */
13774 if (TARGET_64BIT
13775 && cfun->stdarg
13776 && crtl->stack_alignment_estimated < 128)
13777 crtl->stack_alignment_estimated = 128;
13779 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13780 if (ix86_tls_descriptor_calls_expanded_in_cfun
13781 && crtl->preferred_stack_boundary < 128)
13782 crtl->preferred_stack_boundary = 128;
13785 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13786 needed or an rtx for DRAP otherwise. */
13788 static rtx
13789 ix86_get_drap_rtx (void)
13791 /* We must use DRAP if there are outgoing arguments on stack and
13792 ACCUMULATE_OUTGOING_ARGS is false. */
13793 if (ix86_force_drap
13794 || (cfun->machine->outgoing_args_on_stack
13795 && !ACCUMULATE_OUTGOING_ARGS))
13796 crtl->need_drap = true;
13798 if (stack_realign_drap)
13800 /* Assign DRAP to vDRAP and return vDRAP. */
13801 unsigned int regno = find_drap_reg ();
13802 rtx drap_vreg;
13803 rtx arg_ptr;
13804 rtx_insn *seq, *insn;
13806 arg_ptr = gen_rtx_REG (Pmode, regno);
13807 crtl->drap_reg = arg_ptr;
13809 start_sequence ();
13810 drap_vreg = copy_to_reg (arg_ptr);
13811 seq = get_insns ();
13812 end_sequence ();
13814 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13815 if (!optimize)
13817 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13818 RTX_FRAME_RELATED_P (insn) = 1;
13820 return drap_vreg;
13822 else
13823 return NULL;
13826 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13828 static rtx
13829 ix86_internal_arg_pointer (void)
13831 return virtual_incoming_args_rtx;
13834 struct scratch_reg {
13835 rtx reg;
13836 bool saved;
13839 /* Return a short-lived scratch register for use on function entry.
13840 In 32-bit mode, it is valid only after the registers are saved
13841 in the prologue. This register must be released by means of
13842 release_scratch_register_on_entry once it is dead. */
13844 static void
13845 get_scratch_register_on_entry (struct scratch_reg *sr)
13847 int regno;
13849 sr->saved = false;
13851 if (TARGET_64BIT)
13853 /* We always use R11 in 64-bit mode. */
13854 regno = R11_REG;
13856 else
13858 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13859 bool fastcall_p
13860 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13861 bool thiscall_p
13862 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13863 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13864 int regparm = ix86_function_regparm (fntype, decl);
13865 int drap_regno
13866 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13868 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13869 for the static chain register. */
13870 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13871 && drap_regno != AX_REG)
13872 regno = AX_REG;
13873 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13874 for the static chain register. */
13875 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13876 regno = AX_REG;
13877 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13878 regno = DX_REG;
13879 /* ecx is the static chain register. */
13880 else if (regparm < 3 && !fastcall_p && !thiscall_p
13881 && !static_chain_p
13882 && drap_regno != CX_REG)
13883 regno = CX_REG;
13884 else if (ix86_save_reg (BX_REG, true, false))
13885 regno = BX_REG;
13886 /* esi is the static chain register. */
13887 else if (!(regparm == 3 && static_chain_p)
13888 && ix86_save_reg (SI_REG, true, false))
13889 regno = SI_REG;
13890 else if (ix86_save_reg (DI_REG, true, false))
13891 regno = DI_REG;
13892 else
13894 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13895 sr->saved = true;
13899 sr->reg = gen_rtx_REG (Pmode, regno);
13900 if (sr->saved)
13902 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13903 RTX_FRAME_RELATED_P (insn) = 1;
13907 /* Release a scratch register obtained from the preceding function. */
13909 static void
13910 release_scratch_register_on_entry (struct scratch_reg *sr)
13912 if (sr->saved)
13914 struct machine_function *m = cfun->machine;
13915 rtx x, insn = emit_insn (gen_pop (sr->reg));
13917 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13918 RTX_FRAME_RELATED_P (insn) = 1;
13919 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13920 x = gen_rtx_SET (stack_pointer_rtx, x);
13921 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13922 m->fs.sp_offset -= UNITS_PER_WORD;
13926 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13928 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
13930 This differs from the next routine in that it tries hard to prevent
13931 attacks that jump the stack guard. Thus it is never allowed to allocate
13932 more than PROBE_INTERVAL bytes of stack space without a suitable
13933 probe. */
13935 static void
13936 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
13938 struct machine_function *m = cfun->machine;
13940 /* If this function does not statically allocate stack space, then
13941 no probes are needed. */
13942 if (!size)
13944 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
13945 return;
13948 /* If we are a noreturn function, then we have to consider the
13949 possibility that we're called via a jump rather than a call.
13951 Thus we don't have the implicit probe generated by saving the
13952 return address into the stack at the call. Thus, the stack
13953 pointer could be anywhere in the guard page. The safe thing
13954 to do is emit a probe now.
13956 ?!? This should be revamped to work like aarch64 and s390 where
13957 we track the offset from the most recent probe. Normally that
13958 offset would be zero. For a non-return function we would reset
13959 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
13960 we just probe when we cross PROBE_INTERVAL. */
13961 if (TREE_THIS_VOLATILE (cfun->decl))
13963 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13964 -GET_MODE_SIZE (word_mode)));
13965 emit_insn (gen_blockage ());
13968 /* If we allocate less than the size of the guard statically,
13969 then no probing is necessary, but we do need to allocate
13970 the stack. */
13971 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
13973 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13974 GEN_INT (-size), -1,
13975 m->fs.cfa_reg == stack_pointer_rtx);
13976 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
13977 return;
13980 /* We're allocating a large enough stack frame that we need to
13981 emit probes. Either emit them inline or in a loop depending
13982 on the size. */
13983 HOST_WIDE_INT probe_interval
13984 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
13985 if (size <= 4 * probe_interval)
13987 HOST_WIDE_INT i;
13988 for (i = probe_interval; i <= size; i += probe_interval)
13990 /* Allocate PROBE_INTERVAL bytes. */
13991 rtx insn
13992 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13993 GEN_INT (-PROBE_INTERVAL), -1,
13994 m->fs.cfa_reg == stack_pointer_rtx);
13995 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
13997 /* And probe at *sp. */
13998 emit_stack_probe (stack_pointer_rtx);
13999 emit_insn (gen_blockage ());
14002 /* We need to allocate space for the residual, but we do not need
14003 to probe the residual. */
14004 HOST_WIDE_INT residual = (i - probe_interval - size);
14005 if (residual)
14006 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14007 GEN_INT (residual), -1,
14008 m->fs.cfa_reg == stack_pointer_rtx);
14009 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
14011 else
14013 struct scratch_reg sr;
14014 get_scratch_register_on_entry (&sr);
14016 /* Step 1: round SIZE down to a multiple of the interval. */
14017 HOST_WIDE_INT rounded_size = size & -probe_interval;
14019 /* Step 2: compute final value of the loop counter. Use lea if
14020 possible. */
14021 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
14022 rtx insn;
14023 if (address_no_seg_operand (addr, Pmode))
14024 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
14025 else
14027 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
14028 insn = emit_insn (gen_rtx_SET (sr.reg,
14029 gen_rtx_PLUS (Pmode, sr.reg,
14030 stack_pointer_rtx)));
14032 if (m->fs.cfa_reg == stack_pointer_rtx)
14034 add_reg_note (insn, REG_CFA_DEF_CFA,
14035 plus_constant (Pmode, sr.reg,
14036 m->fs.cfa_offset + rounded_size));
14037 RTX_FRAME_RELATED_P (insn) = 1;
14040 /* Step 3: the loop. */
14041 rtx size_rtx = GEN_INT (rounded_size);
14042 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
14043 size_rtx));
14044 if (m->fs.cfa_reg == stack_pointer_rtx)
14046 m->fs.cfa_offset += rounded_size;
14047 add_reg_note (insn, REG_CFA_DEF_CFA,
14048 plus_constant (Pmode, stack_pointer_rtx,
14049 m->fs.cfa_offset));
14050 RTX_FRAME_RELATED_P (insn) = 1;
14052 m->fs.sp_offset += rounded_size;
14053 emit_insn (gen_blockage ());
14055 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
14056 is equal to ROUNDED_SIZE. */
14058 if (size != rounded_size)
14059 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14060 GEN_INT (rounded_size - size), -1,
14061 m->fs.cfa_reg == stack_pointer_rtx);
14062 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
14064 release_scratch_register_on_entry (&sr);
14067 /* Make sure nothing is scheduled before we are done. */
14068 emit_insn (gen_blockage ());
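/* Illustrative behaviour (assuming the common 4KiB guard-size and
   probe-interval parameters): a 2KiB frame is simply allocated with no
   probe; a 16KiB frame (at most 4 intervals) gets four inline
   allocate-and-probe pairs; anything larger rounds the size down to a
   multiple of the interval, probes in the loop above, and then allocates
   the unprobed residual separately.  */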
14071 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
14073 static void
14074 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
14076 /* We skip the probe for the first interval + a small dope of 4 words and
14077 probe that many bytes past the specified size to maintain a protection
14078 area at the bottom of the stack. */
14079 const int dope = 4 * UNITS_PER_WORD;
14080 rtx size_rtx = GEN_INT (size), last;
14082 /* See if we have a constant small number of probes to generate. If so,
14083 that's the easy case. The run-time loop is made up of 9 insns in the
14084 generic case while the compile-time loop is made up of 3+2*(n-1) insns
14085 for n # of intervals. */
14086 if (size <= 4 * PROBE_INTERVAL)
14088 HOST_WIDE_INT i, adjust;
14089 bool first_probe = true;
14091 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
14092 values of N from 1 until it exceeds SIZE. If only one probe is
14093 needed, this will not generate any code. Then adjust and probe
14094 to PROBE_INTERVAL + SIZE. */
14095 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14097 if (first_probe)
14099 adjust = 2 * PROBE_INTERVAL + dope;
14100 first_probe = false;
14102 else
14103 adjust = PROBE_INTERVAL;
14105 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14106 plus_constant (Pmode, stack_pointer_rtx,
14107 -adjust)));
14108 emit_stack_probe (stack_pointer_rtx);
14111 if (first_probe)
14112 adjust = size + PROBE_INTERVAL + dope;
14113 else
14114 adjust = size + PROBE_INTERVAL - i;
14116 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14117 plus_constant (Pmode, stack_pointer_rtx,
14118 -adjust)));
14119 emit_stack_probe (stack_pointer_rtx);
14121 /* Adjust back to account for the additional first interval. */
14122 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
14123 plus_constant (Pmode, stack_pointer_rtx,
14124 PROBE_INTERVAL + dope)));
14127 /* Otherwise, do the same as above, but in a loop. Note that we must be
14128 extra careful with variables wrapping around because we might be at
14129 the very top (or the very bottom) of the address space and we have
14130 to be able to handle this case properly; in particular, we use an
14131 equality test for the loop condition. */
14132 else
14134 HOST_WIDE_INT rounded_size;
14135 struct scratch_reg sr;
14137 get_scratch_register_on_entry (&sr);
14140 /* Step 1: round SIZE to the previous multiple of the interval. */
14142 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14145 /* Step 2: compute initial and final value of the loop counter. */
14147 /* SP = SP_0 + PROBE_INTERVAL. */
14148 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14149 plus_constant (Pmode, stack_pointer_rtx,
14150 - (PROBE_INTERVAL + dope))));
14152 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
14153 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
14154 emit_insn (gen_rtx_SET (sr.reg,
14155 plus_constant (Pmode, stack_pointer_rtx,
14156 -rounded_size)));
14157 else
14159 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
14160 emit_insn (gen_rtx_SET (sr.reg,
14161 gen_rtx_PLUS (Pmode, sr.reg,
14162 stack_pointer_rtx)));
14166 /* Step 3: the loop
14170 SP = SP + PROBE_INTERVAL
14171 probe at SP
14173 while (SP != LAST_ADDR)
14175 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
14176 values of N from 1 until it is equal to ROUNDED_SIZE. */
14178 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
14181 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
14182 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
14184 if (size != rounded_size)
14186 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14187 plus_constant (Pmode, stack_pointer_rtx,
14188 rounded_size - size)));
14189 emit_stack_probe (stack_pointer_rtx);
14192 /* Adjust back to account for the additional first interval. */
14193 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
14194 plus_constant (Pmode, stack_pointer_rtx,
14195 PROBE_INTERVAL + dope)));
14197 release_scratch_register_on_entry (&sr);
14200 /* Even if the stack pointer isn't the CFA register, we need to correctly
14201 describe the adjustments made to it, in particular differentiate the
14202 frame-related ones from the frame-unrelated ones. */
14203 if (size > 0)
14205 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
14206 XVECEXP (expr, 0, 0)
14207 = gen_rtx_SET (stack_pointer_rtx,
14208 plus_constant (Pmode, stack_pointer_rtx, -size));
14209 XVECEXP (expr, 0, 1)
14210 = gen_rtx_SET (stack_pointer_rtx,
14211 plus_constant (Pmode, stack_pointer_rtx,
14212 PROBE_INTERVAL + dope + size));
14213 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
14214 RTX_FRAME_RELATED_P (last) = 1;
14216 cfun->machine->fs.sp_offset += size;
14219 /* Make sure nothing is scheduled before we are done. */
14220 emit_insn (gen_blockage ());
14223 /* Adjust the stack pointer up to REG while probing it. */
14225 const char *
14226 output_adjust_stack_and_probe (rtx reg)
14228 static int labelno = 0;
14229 char loop_lab[32];
14230 rtx xops[2];
14232 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14234 /* Loop. */
14235 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14237 /* SP = SP + PROBE_INTERVAL. */
14238 xops[0] = stack_pointer_rtx;
14239 xops[1] = GEN_INT (PROBE_INTERVAL);
14240 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14242 /* Probe at SP. */
14243 xops[1] = const0_rtx;
14244 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
14246 /* Test if SP == LAST_ADDR. */
14247 xops[0] = stack_pointer_rtx;
14248 xops[1] = reg;
14249 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14251 /* Branch. */
14252 fputs ("\tjne\t", asm_out_file);
14253 assemble_name_raw (asm_out_file, loop_lab);
14254 fputc ('\n', asm_out_file);
14256 return "";
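/* For reference, a sketch of what the loop above assembles to in 64-bit
   AT&T syntax, assuming the default 4 KiB PROBE_INTERVAL and using %r11 as
   a stand-in for the scratch register holding LAST_ADDR (the exact register
   and label names vary):

	.LPSRL0:
		subq	$4096, %rsp		# advance SP by one probe interval
		orq	$0, (%rsp)		# probe at SP
		cmpq	%r11, %rsp		# SP == LAST_ADDR?
		jne	.LPSRL0

   The or with zero is a cheap read-modify-write that touches the page
   without changing its contents.  */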
14259 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
14260 inclusive. These are offsets from the current stack pointer. */
14262 static void
14263 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
14265 /* See if we have a constant small number of probes to generate. If so,
14266 that's the easy case. The run-time loop is made up of 6 insns in the
14267 generic case while the compile-time loop is made up of n insns for n #
14268 of intervals. */
14269 if (size <= 6 * PROBE_INTERVAL)
14271 HOST_WIDE_INT i;
14273 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
14274 it exceeds SIZE. If only one probe is needed, this will not
14275 generate any code. Then probe at FIRST + SIZE. */
14276 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14277 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14278 -(first + i)));
14280 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14281 -(first + size)));
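/* For example, with FIRST == 0x1000, SIZE == 0x2500 and the default 4 KiB
   probe interval, the loop emits probes at sp - 0x2000 and sp - 0x3000, and
   the final probe above lands at sp - 0x3500.  */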
14284 /* Otherwise, do the same as above, but in a loop. Note that we must be
14285 extra careful with variables wrapping around because we might be at
14286 the very top (or the very bottom) of the address space and we have
14287 to be able to handle this case properly; in particular, we use an
14288 equality test for the loop condition. */
14289 else
14291 HOST_WIDE_INT rounded_size, last;
14292 struct scratch_reg sr;
14294 get_scratch_register_on_entry (&sr);
14297 /* Step 1: round SIZE to the previous multiple of the interval. */
14299 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14302 /* Step 2: compute initial and final value of the loop counter. */
14304 /* TEST_OFFSET = FIRST. */
14305 emit_move_insn (sr.reg, GEN_INT (-first));
14307 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14308 last = first + rounded_size;
14311 /* Step 3: the loop
14315 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14316 probe at TEST_ADDR
14318 while (TEST_ADDR != LAST_ADDR)
14320 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14321 until it is equal to ROUNDED_SIZE. */
14323 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14326 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14327 that SIZE is equal to ROUNDED_SIZE. */
14329 if (size != rounded_size)
14330 emit_stack_probe (plus_constant (Pmode,
14331 gen_rtx_PLUS (Pmode,
14332 stack_pointer_rtx,
14333 sr.reg),
14334 rounded_size - size));
14336 release_scratch_register_on_entry (&sr);
14339 /* Make sure nothing is scheduled before we are done. */
14340 emit_insn (gen_blockage ());
14343 /* Probe a range of stack addresses from REG to END, inclusive. These are
14344 offsets from the current stack pointer. */
14346 const char *
14347 output_probe_stack_range (rtx reg, rtx end)
14349 static int labelno = 0;
14350 char loop_lab[32];
14351 rtx xops[3];
14353 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14355 /* Loop. */
14356 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14358 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14359 xops[0] = reg;
14360 xops[1] = GEN_INT (PROBE_INTERVAL);
14361 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14363 /* Probe at TEST_ADDR. */
14364 xops[0] = stack_pointer_rtx;
14365 xops[1] = reg;
14366 xops[2] = const0_rtx;
14367 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14369 /* Test if TEST_ADDR == LAST_ADDR. */
14370 xops[0] = reg;
14371 xops[1] = end;
14372 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14374 /* Branch. */
14375 fputs ("\tjne\t", asm_out_file);
14376 assemble_name_raw (asm_out_file, loop_lab);
14377 fputc ('\n', asm_out_file);
14379 return "";
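/* A similar sketch of the loop above, where REG holds a (negative) offset
   from SP and the comparison operand may be a register or an immediate;
   %r11 and %rbx are stand-ins and 4096 assumes the default PROBE_INTERVAL:

	.LPSRL1:
		subq	$4096, %r11		# advance TEST_ADDR by one interval
		orq	$0, (%rsp,%r11)		# probe at SP + TEST_ADDR
		cmpq	%rbx, %r11		# TEST_ADDR == LAST_ADDR?
		jne	.LPSRL1
 */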
14382 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
14383 will guide prologue/epilogue to be generated in correct form. */
14385 static void
14386 ix86_finalize_stack_frame_flags (void)
14388 /* Check if stack realignment is really needed after reload, and
14389 store the result in cfun. */
14390 unsigned int incoming_stack_boundary
14391 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14392 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14393 unsigned int stack_alignment
14394 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14395 ? crtl->max_used_stack_slot_alignment
14396 : crtl->stack_alignment_needed);
14397 unsigned int stack_realign
14398 = (incoming_stack_boundary < stack_alignment);
14399 bool recompute_frame_layout_p = false;
14401 if (crtl->stack_realign_finalized)
14403 /* After stack_realign_needed is finalized, we can no longer
14404 change it. */
14405 gcc_assert (crtl->stack_realign_needed == stack_realign);
14406 return;
14409 /* If the only reason for frame_pointer_needed is that we conservatively
14410 assumed stack realignment might be needed or -fno-omit-frame-pointer
14411 is used, but in the end nothing that needed the stack alignment was
14412 spilled and there is no such stack access, clear frame_pointer_needed
14413 and say we don't need stack realignment. */
14414 if ((stack_realign || !flag_omit_frame_pointer)
14415 && frame_pointer_needed
14416 && crtl->is_leaf
14417 && crtl->sp_is_unchanging
14418 && !ix86_current_function_calls_tls_descriptor
14419 && !crtl->accesses_prior_frames
14420 && !cfun->calls_alloca
14421 && !crtl->calls_eh_return
14422 /* See ira_setup_eliminable_regset for the rationale. */
14423 && !(STACK_CHECK_MOVING_SP
14424 && flag_stack_check
14425 && flag_exceptions
14426 && cfun->can_throw_non_call_exceptions)
14427 && !ix86_frame_pointer_required ()
14428 && get_frame_size () == 0
14429 && ix86_nsaved_sseregs () == 0
14430 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14432 HARD_REG_SET set_up_by_prologue, prologue_used;
14433 basic_block bb;
14435 CLEAR_HARD_REG_SET (prologue_used);
14436 CLEAR_HARD_REG_SET (set_up_by_prologue);
14437 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14438 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14439 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14440 HARD_FRAME_POINTER_REGNUM);
14442 /* The preferred stack alignment is the minimum stack alignment. */
14443 if (stack_alignment > crtl->preferred_stack_boundary)
14444 stack_alignment = crtl->preferred_stack_boundary;
14446 bool require_stack_frame = false;
14448 FOR_EACH_BB_FN (bb, cfun)
14450 rtx_insn *insn;
14451 FOR_BB_INSNS (bb, insn)
14452 if (NONDEBUG_INSN_P (insn)
14453 && requires_stack_frame_p (insn, prologue_used,
14454 set_up_by_prologue))
14456 require_stack_frame = true;
14458 if (stack_realign)
14460 /* Find the maximum stack alignment. */
14461 subrtx_iterator::array_type array;
14462 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
14463 if (MEM_P (*iter)
14464 && (reg_mentioned_p (stack_pointer_rtx,
14465 *iter)
14466 || reg_mentioned_p (frame_pointer_rtx,
14467 *iter)))
14469 unsigned int alignment = MEM_ALIGN (*iter);
14470 if (alignment > stack_alignment)
14471 stack_alignment = alignment;
14477 if (require_stack_frame)
14479 /* Stack frame is required. If stack alignment needed is less
14480 than incoming stack boundary, don't realign stack. */
14481 stack_realign = incoming_stack_boundary < stack_alignment;
14482 if (!stack_realign)
14484 crtl->max_used_stack_slot_alignment
14485 = incoming_stack_boundary;
14486 crtl->stack_alignment_needed
14487 = incoming_stack_boundary;
14488 /* Also update preferred_stack_boundary for leaf
14489 functions. */
14490 crtl->preferred_stack_boundary
14491 = incoming_stack_boundary;
14494 else
14496 /* If drap has been set, but it actually isn't live at the
14497 start of the function, there is no reason to set it up. */
14498 if (crtl->drap_reg)
14500 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14501 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
14502 REGNO (crtl->drap_reg)))
14504 crtl->drap_reg = NULL_RTX;
14505 crtl->need_drap = false;
14508 else
14509 cfun->machine->no_drap_save_restore = true;
14511 frame_pointer_needed = false;
14512 stack_realign = false;
14513 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14514 crtl->stack_alignment_needed = incoming_stack_boundary;
14515 crtl->stack_alignment_estimated = incoming_stack_boundary;
14516 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14517 crtl->preferred_stack_boundary = incoming_stack_boundary;
14518 df_finish_pass (true);
14519 df_scan_alloc (NULL);
14520 df_scan_blocks ();
14521 df_compute_regs_ever_live (true);
14522 df_analyze ();
14524 if (flag_var_tracking)
14526 /* Since frame pointer is no longer available, replace it with
14527 stack pointer - UNITS_PER_WORD in debug insns. */
14528 df_ref ref, next;
14529 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
14530 ref; ref = next)
14532 rtx_insn *insn = DF_REF_INSN (ref);
14533 /* Make sure the next ref is for a different instruction,
14534 so that we're not affected by the rescan. */
14535 next = DF_REF_NEXT_REG (ref);
14536 while (next && DF_REF_INSN (next) == insn)
14537 next = DF_REF_NEXT_REG (next);
14539 if (DEBUG_INSN_P (insn))
14541 bool changed = false;
14542 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
14544 rtx *loc = DF_REF_LOC (ref);
14545 if (*loc == hard_frame_pointer_rtx)
14547 *loc = plus_constant (Pmode,
14548 stack_pointer_rtx,
14549 -UNITS_PER_WORD);
14550 changed = true;
14553 if (changed)
14554 df_insn_rescan (insn);
14559 recompute_frame_layout_p = true;
14563 if (crtl->stack_realign_needed != stack_realign)
14564 recompute_frame_layout_p = true;
14565 crtl->stack_realign_needed = stack_realign;
14566 crtl->stack_realign_finalized = true;
14567 if (recompute_frame_layout_p)
14568 ix86_compute_frame_layout ();
14571 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14573 static void
14574 ix86_elim_entry_set_got (rtx reg)
14576 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14577 rtx_insn *c_insn = BB_HEAD (bb);
14578 if (!NONDEBUG_INSN_P (c_insn))
14579 c_insn = next_nonnote_nondebug_insn (c_insn);
14580 if (c_insn && NONJUMP_INSN_P (c_insn))
14582 rtx pat = PATTERN (c_insn);
14583 if (GET_CODE (pat) == PARALLEL)
14585 rtx vec = XVECEXP (pat, 0, 0);
14586 if (GET_CODE (vec) == SET
14587 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14588 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14589 delete_insn (c_insn);
14594 static rtx
14595 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14597 rtx addr, mem;
14599 if (offset)
14600 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14601 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14602 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14605 static inline rtx
14606 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14608 return gen_frame_set (reg, frame_reg, offset, false);
14611 static inline rtx
14612 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14614 return gen_frame_set (reg, frame_reg, offset, true);
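/* These helpers just wrap gen_frame_mem: for example, on 64-bit,
   gen_frame_store (reg, rax, -32) for a V4SFmode register produces
   (set (mem:V4SF (plus:DI (reg:DI ax) (const_int -32))) (reg:V4SF ...)),
   and gen_frame_load swaps the SET operands; a zero offset uses the frame
   register itself as the address.  */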
14617 static void
14618 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14620 struct machine_function *m = cfun->machine;
14621 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14622 + m->call_ms2sysv_extra_regs;
14623 rtvec v = rtvec_alloc (ncregs + 1);
14624 unsigned int align, i, vi = 0;
14625 rtx_insn *insn;
14626 rtx sym, addr;
14627 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14628 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14629 HOST_WIDE_INT allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14631 /* AL should only be live with sysv_abi. */
14632 gcc_assert (!ix86_eax_live_at_start_p ());
14634 /* Set up RAX as the stub's base pointer. We use stack_realign_offset
14635 regardless of whether we've actually realigned the stack or not. */
14636 align = GET_MODE_ALIGNMENT (V4SFmode);
14637 addr = choose_baseaddr (frame.stack_realign_offset
14638 + xlogue.get_stub_ptr_offset (), &align);
14639 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14640 emit_insn (gen_rtx_SET (rax, addr));
14642 /* Allocate stack if not already done. */
14643 if (allocate > 0)
14644 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14645 GEN_INT (-allocate), -1, false);
14647 /* Get the stub symbol. */
14648 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14649 : XLOGUE_STUB_SAVE);
14650 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14652 for (i = 0; i < ncregs; ++i)
14654 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14655 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14656 r.regno);
14657 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14660 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14662 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14663 RTX_FRAME_RELATED_P (insn) = true;
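/* The insn just emitted is a single PARALLEL, roughly of the shape

	(parallel [(use (symbol_ref <save stub>))
		   (set (mem:V4SF (plus (reg rax) (const_int -<off0>))) (reg xmm))
		   (set (mem:DI (plus (reg rax) (const_int -<off1>))) (reg gpr))
		   ...])

   with one frame store per clobbered MS-ABI register, all addressed relative
   to RAX; the offsets come from the xlogue layout.  */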
14666 /* Expand the prologue into a bunch of separate insns. */
14668 void
14669 ix86_expand_prologue (void)
14671 struct machine_function *m = cfun->machine;
14672 rtx insn, t;
14673 struct ix86_frame frame;
14674 HOST_WIDE_INT allocate;
14675 bool int_registers_saved;
14676 bool sse_registers_saved;
14677 rtx static_chain = NULL_RTX;
14679 if (ix86_function_naked (current_function_decl))
14680 return;
14682 ix86_finalize_stack_frame_flags ();
14684 /* DRAP should not coexist with stack_realign_fp */
14685 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14687 memset (&m->fs, 0, sizeof (m->fs));
14689 /* Initialize CFA state for before the prologue. */
14690 m->fs.cfa_reg = stack_pointer_rtx;
14691 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14693 /* Track SP offset to the CFA. We continue tracking this after we've
14694 swapped the CFA register away from SP. In the case of re-alignment
14695 this is fudged; we're interested in offsets within the local frame. */
14696 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14697 m->fs.sp_valid = true;
14698 m->fs.sp_realigned = false;
14700 frame = m->frame;
14702 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14704 /* We should have already generated an error for any use of
14705 ms_hook on a nested function. */
14706 gcc_checking_assert (!ix86_static_chain_on_stack);
14708 /* Check if profiling is active and we shall use the profiling-before-
14709 prologue variant. If so, sorry. */
14710 if (crtl->profile && flag_fentry != 0)
14711 sorry ("ms_hook_prologue attribute isn%'t compatible "
14712 "with -mfentry for 32-bit");
14714 /* In ix86_asm_output_function_label we emitted:
14715 8b ff movl.s %edi,%edi
14716 55 push %ebp
14717 8b ec movl.s %esp,%ebp
14719 This matches the hookable function prologue in Win32 API
14720 functions in Microsoft Windows XP Service Pack 2 and newer.
14721 Wine uses this to enable Windows apps to hook the Win32 API
14722 functions provided by Wine.
14724 What that means is that we've already set up the frame pointer. */
14726 if (frame_pointer_needed
14727 && !(crtl->drap_reg && crtl->stack_realign_needed))
14729 rtx push, mov;
14731 /* We've decided to use the frame pointer already set up.
14732 Describe this to the unwinder by pretending that both
14733 push and mov insns happen right here.
14735 Putting the unwind info here at the end of the ms_hook
14736 is done so that we can make absolutely certain we get
14737 the required byte sequence at the start of the function,
14738 rather than relying on an assembler that can produce
14739 the exact encoding required.
14741 However it does mean (in the unpatched case) that we have
14742 a 1 insn window where the asynchronous unwind info is
14743 incorrect. However, if we placed the unwind info at
14744 its correct location we would have incorrect unwind info
14745 in the patched case. Which is probably all moot since
14746 I don't expect Wine generates dwarf2 unwind info for the
14747 system libraries that use this feature. */
14749 insn = emit_insn (gen_blockage ());
14751 push = gen_push (hard_frame_pointer_rtx);
14752 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14753 stack_pointer_rtx);
14754 RTX_FRAME_RELATED_P (push) = 1;
14755 RTX_FRAME_RELATED_P (mov) = 1;
14757 RTX_FRAME_RELATED_P (insn) = 1;
14758 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14759 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14761 /* Note that gen_push incremented m->fs.cfa_offset, even
14762 though we didn't emit the push insn here. */
14763 m->fs.cfa_reg = hard_frame_pointer_rtx;
14764 m->fs.fp_offset = m->fs.cfa_offset;
14765 m->fs.fp_valid = true;
14767 else
14769 /* The frame pointer is not needed so pop %ebp again.
14770 This leaves us with a pristine state. */
14771 emit_insn (gen_pop (hard_frame_pointer_rtx));
14775 /* The first insn of a function that accepts its static chain on the
14776 stack is to push the register that would be filled in by a direct
14777 call. This insn will be skipped by the trampoline. */
14778 else if (ix86_static_chain_on_stack)
14780 static_chain = ix86_static_chain (cfun->decl, false);
14781 insn = emit_insn (gen_push (static_chain));
14782 emit_insn (gen_blockage ());
14784 /* We don't want to interpret this push insn as a register save,
14785 only as a stack adjustment. The real copy of the register as
14786 a save will be done later, if needed. */
14787 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14788 t = gen_rtx_SET (stack_pointer_rtx, t);
14789 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14790 RTX_FRAME_RELATED_P (insn) = 1;
14793 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
14794 DRAP is needed and stack realignment is really needed after reload. */
14795 if (stack_realign_drap)
14797 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14799 /* Can't use DRAP in interrupt function. */
14800 if (cfun->machine->func_type != TYPE_NORMAL)
14801 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14802 "in interrupt service routine. This may be worked "
14803 "around by avoiding functions with aggregate return.");
14805 /* Only need to push parameter pointer reg if it is caller saved. */
14806 if (!call_used_regs[REGNO (crtl->drap_reg)])
14808 /* Push arg pointer reg */
14809 insn = emit_insn (gen_push (crtl->drap_reg));
14810 RTX_FRAME_RELATED_P (insn) = 1;
14813 /* Grab the argument pointer. */
14814 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14815 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14816 RTX_FRAME_RELATED_P (insn) = 1;
14817 m->fs.cfa_reg = crtl->drap_reg;
14818 m->fs.cfa_offset = 0;
14820 /* Align the stack. */
14821 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14822 stack_pointer_rtx,
14823 GEN_INT (-align_bytes)));
14824 RTX_FRAME_RELATED_P (insn) = 1;
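/* For example, when crtl->stack_alignment_needed is 256 bits, align_bytes is
   32 and the insn above is effectively "and $-32, %rsp" (or %esp), clearing
   the low five bits so the stack pointer drops to the next 32-byte
   boundary.  */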
14826 /* Replicate the return address on the stack so that return
14827 address can be reached via (argp - 1) slot. This is needed
14828 to implement macro RETURN_ADDR_RTX and intrinsic function
14829 expand_builtin_return_addr etc. */
14830 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14831 t = gen_frame_mem (word_mode, t);
14832 insn = emit_insn (gen_push (t));
14833 RTX_FRAME_RELATED_P (insn) = 1;
14835 /* For the purposes of frame and register save area addressing,
14836 we've started over with a new frame. */
14837 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14838 m->fs.realigned = true;
14840 if (static_chain)
14842 /* Replicate static chain on the stack so that static chain
14843 can be reached via (argp - 2) slot. This is needed for
14844 nested function with stack realignment. */
14845 insn = emit_insn (gen_push (static_chain));
14846 RTX_FRAME_RELATED_P (insn) = 1;
14850 int_registers_saved = (frame.nregs == 0);
14851 sse_registers_saved = (frame.nsseregs == 0);
14853 if (frame_pointer_needed && !m->fs.fp_valid)
14855 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14856 slower on all targets. Also sdb doesn't like it. */
14857 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14858 RTX_FRAME_RELATED_P (insn) = 1;
14860 /* Push registers now, before setting the frame pointer
14861 on SEH target. */
14862 if (!int_registers_saved
14863 && TARGET_SEH
14864 && !frame.save_regs_using_mov)
14866 ix86_emit_save_regs ();
14867 int_registers_saved = true;
14868 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14871 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14873 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14874 RTX_FRAME_RELATED_P (insn) = 1;
14876 if (m->fs.cfa_reg == stack_pointer_rtx)
14877 m->fs.cfa_reg = hard_frame_pointer_rtx;
14878 m->fs.fp_offset = m->fs.sp_offset;
14879 m->fs.fp_valid = true;
14883 if (!int_registers_saved)
14885 /* If saving registers via PUSH, do so now. */
14886 if (!frame.save_regs_using_mov)
14888 ix86_emit_save_regs ();
14889 int_registers_saved = true;
14890 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14893 /* When using red zone we may start register saving before allocating
14894 the stack frame saving one cycle of the prologue. However, avoid
14895 doing this if we have to probe the stack; at least on x86_64 the
14896 stack probe can turn into a call that clobbers a red zone location. */
14897 else if (ix86_using_red_zone ()
14898 && (! TARGET_STACK_PROBE
14899 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14901 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14902 int_registers_saved = true;
14906 if (stack_realign_fp)
14908 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14909 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14911 /* Record last valid frame pointer offset. */
14912 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
14914 /* The computation of the size of the re-aligned stack frame means
14915 that we must allocate the size of the register save area before
14916 performing the actual alignment. Otherwise we cannot guarantee
14917 that there's enough storage above the realignment point. */
14918 allocate = frame.reg_save_offset - m->fs.sp_offset
14919 + frame.stack_realign_allocate;
14920 if (allocate)
14921 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14922 GEN_INT (-allocate), -1, false);
14924 /* Align the stack. */
14925 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14926 stack_pointer_rtx,
14927 GEN_INT (-align_bytes)));
14928 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14929 m->fs.sp_realigned_offset = m->fs.sp_offset
14930 - frame.stack_realign_allocate;
14931 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
14932 Beyond this point, stack access should be done via choose_baseaddr or
14933 by using sp_valid_at and fp_valid_at to determine the correct base
14934 register. Henceforth, any CFA offset should be thought of as logical
14935 and not physical. */
14936 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
14937 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
14938 m->fs.sp_realigned = true;
14940 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14941 is needed to describe where a register is saved using a realigned
14942 stack pointer, so we need to invalidate the stack pointer for that
14943 target. */
14944 if (TARGET_SEH)
14945 m->fs.sp_valid = false;
14948 if (m->call_ms2sysv)
14949 ix86_emit_outlined_ms2sysv_save (frame);
14951 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14953 if (flag_stack_usage_info)
14955 /* We start to count from ARG_POINTER. */
14956 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14958 /* If it was realigned, take into account the fake frame. */
14959 if (stack_realign_drap)
14961 if (ix86_static_chain_on_stack)
14962 stack_size += UNITS_PER_WORD;
14964 if (!call_used_regs[REGNO (crtl->drap_reg)])
14965 stack_size += UNITS_PER_WORD;
14967 /* This over-estimates by 1 minimal-stack-alignment-unit but
14968 mitigates that by counting in the new return address slot. */
14969 current_function_dynamic_stack_size
14970 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14973 current_function_static_stack_size = stack_size;
14976 /* On SEH target with very large frame size, allocate an area to save
14977 SSE registers (as the very large allocation won't be described). */
14978 if (TARGET_SEH
14979 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14980 && !sse_registers_saved)
14982 HOST_WIDE_INT sse_size =
14983 frame.sse_reg_save_offset - frame.reg_save_offset;
14985 gcc_assert (int_registers_saved);
14987 /* No need to do stack checking as the area will be immediately
14988 written. */
14989 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14990 GEN_INT (-sse_size), -1,
14991 m->fs.cfa_reg == stack_pointer_rtx);
14992 allocate -= sse_size;
14993 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14994 sse_registers_saved = true;
14997 /* The stack has already been decremented by the instruction calling us
14998 so probe if the size is non-negative to preserve the protection area. */
14999 if (allocate >= 0
15000 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
15001 || flag_stack_clash_protection))
15003 /* We expect the GP registers to be saved when probes are used. */
15004 gcc_assert (int_registers_saved);
15006 if (flag_stack_clash_protection)
15008 ix86_adjust_stack_and_probe_stack_clash (allocate);
15009 allocate = 0;
15011 else if (STACK_CHECK_MOVING_SP)
15013 if (!(crtl->is_leaf && !cfun->calls_alloca
15014 && allocate <= PROBE_INTERVAL))
15016 ix86_adjust_stack_and_probe (allocate);
15017 allocate = 0;
15020 else
15022 HOST_WIDE_INT size = allocate;
15024 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
15025 size = 0x80000000 - get_stack_check_protect () - 1;
15027 if (TARGET_STACK_PROBE)
15029 if (crtl->is_leaf && !cfun->calls_alloca)
15031 if (size > PROBE_INTERVAL)
15032 ix86_emit_probe_stack_range (0, size);
15034 else
15035 ix86_emit_probe_stack_range (0,
15036 size + get_stack_check_protect ());
15038 else
15040 if (crtl->is_leaf && !cfun->calls_alloca)
15042 if (size > PROBE_INTERVAL
15043 && size > get_stack_check_protect ())
15044 ix86_emit_probe_stack_range (get_stack_check_protect (),
15045 size - get_stack_check_protect ());
15047 else
15048 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
15053 if (allocate == 0)
15055 else if (!ix86_target_stack_probe ()
15056 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
15058 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15059 GEN_INT (-allocate), -1,
15060 m->fs.cfa_reg == stack_pointer_rtx);
15062 else
15064 rtx eax = gen_rtx_REG (Pmode, AX_REG);
15065 rtx r10 = NULL;
15066 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
15067 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
15068 bool eax_live = ix86_eax_live_at_start_p ();
15069 bool r10_live = false;
15071 if (TARGET_64BIT)
15072 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
15074 if (eax_live)
15076 insn = emit_insn (gen_push (eax));
15077 allocate -= UNITS_PER_WORD;
15078 /* Note that SEH directives need to continue tracking the stack
15079 pointer even after the frame pointer has been set up. */
15080 if (sp_is_cfa_reg || TARGET_SEH)
15082 if (sp_is_cfa_reg)
15083 m->fs.cfa_offset += UNITS_PER_WORD;
15084 RTX_FRAME_RELATED_P (insn) = 1;
15085 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
15086 gen_rtx_SET (stack_pointer_rtx,
15087 plus_constant (Pmode, stack_pointer_rtx,
15088 -UNITS_PER_WORD)));
15092 if (r10_live)
15094 r10 = gen_rtx_REG (Pmode, R10_REG);
15095 insn = emit_insn (gen_push (r10));
15096 allocate -= UNITS_PER_WORD;
15097 if (sp_is_cfa_reg || TARGET_SEH)
15099 if (sp_is_cfa_reg)
15100 m->fs.cfa_offset += UNITS_PER_WORD;
15101 RTX_FRAME_RELATED_P (insn) = 1;
15102 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
15103 gen_rtx_SET (stack_pointer_rtx,
15104 plus_constant (Pmode, stack_pointer_rtx,
15105 -UNITS_PER_WORD)));
15109 emit_move_insn (eax, GEN_INT (allocate));
15110 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
15112 /* Use the fact that AX still contains ALLOCATE. */
15113 adjust_stack_insn = (Pmode == DImode
15114 ? gen_pro_epilogue_adjust_stack_di_sub
15115 : gen_pro_epilogue_adjust_stack_si_sub);
15117 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
15118 stack_pointer_rtx, eax));
15120 if (sp_is_cfa_reg || TARGET_SEH)
15122 if (sp_is_cfa_reg)
15123 m->fs.cfa_offset += allocate;
15124 RTX_FRAME_RELATED_P (insn) = 1;
15125 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
15126 gen_rtx_SET (stack_pointer_rtx,
15127 plus_constant (Pmode, stack_pointer_rtx,
15128 -allocate)));
15130 m->fs.sp_offset += allocate;
15132 /* Use stack_pointer_rtx for relative addressing so that code
15133 works for realigned stack, too. */
15134 if (r10_live && eax_live)
15136 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
15137 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15138 gen_frame_mem (word_mode, t));
15139 t = plus_constant (Pmode, t, UNITS_PER_WORD);
15140 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
15141 gen_frame_mem (word_mode, t));
15143 else if (eax_live || r10_live)
15145 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
15146 emit_move_insn (gen_rtx_REG (word_mode,
15147 (eax_live ? AX_REG : R10_REG)),
15148 gen_frame_mem (word_mode, t));
15151 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
15153 /* If we haven't already set up the frame pointer, do so now. */
15154 if (frame_pointer_needed && !m->fs.fp_valid)
15156 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
15157 GEN_INT (frame.stack_pointer_offset
15158 - frame.hard_frame_pointer_offset));
15159 insn = emit_insn (insn);
15160 RTX_FRAME_RELATED_P (insn) = 1;
15161 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
15163 if (m->fs.cfa_reg == stack_pointer_rtx)
15164 m->fs.cfa_reg = hard_frame_pointer_rtx;
15165 m->fs.fp_offset = frame.hard_frame_pointer_offset;
15166 m->fs.fp_valid = true;
15169 if (!int_registers_saved)
15170 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
15171 if (!sse_registers_saved)
15172 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
15174 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
15175 in PROLOGUE. */
15176 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
15178 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
15179 insn = emit_insn (gen_set_got (pic));
15180 RTX_FRAME_RELATED_P (insn) = 1;
15181 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
15182 emit_insn (gen_prologue_use (pic));
15183 /* Delete an already emitted SET_GOT if it exists and is allocated to
15184 REAL_PIC_OFFSET_TABLE_REGNUM. */
15185 ix86_elim_entry_set_got (pic);
15188 if (crtl->drap_reg && !crtl->stack_realign_needed)
15190 /* vDRAP is set up, but after reload it turns out stack realignment
15191 isn't necessary; here we emit the prologue to set up DRAP
15192 without the stack realignment adjustment. */
15193 t = choose_baseaddr (0, NULL);
15194 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
15197 /* Prevent instructions from being scheduled into register save push
15198 sequence when access to the redzone area is done through frame pointer.
15199 The offset between the frame pointer and the stack pointer is calculated
15200 relative to the value of the stack pointer at the end of the function
15201 prologue, and moving instructions that access redzone area via frame
15202 pointer inside push sequence violates this assumption. */
15203 if (frame_pointer_needed && frame.red_zone_size)
15204 emit_insn (gen_memory_blockage ());
15206 /* SEH requires that the prologue end within 256 bytes of the start of
15207 the function. Prevent instruction schedules that would extend that.
15208 Further, prevent alloca modifications to the stack pointer from being
15209 combined with prologue modifications. */
15210 if (TARGET_SEH)
15211 emit_insn (gen_prologue_use (stack_pointer_rtx));
15214 /* Emit code to restore REG using a POP insn. */
15216 static void
15217 ix86_emit_restore_reg_using_pop (rtx reg)
15219 struct machine_function *m = cfun->machine;
15220 rtx_insn *insn = emit_insn (gen_pop (reg));
15222 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
15223 m->fs.sp_offset -= UNITS_PER_WORD;
15225 if (m->fs.cfa_reg == crtl->drap_reg
15226 && REGNO (reg) == REGNO (crtl->drap_reg))
15228 /* Previously we'd represented the CFA as an expression
15229 like *(%ebp - 8). We've just popped that value from
15230 the stack, which means we need to reset the CFA to
15231 the drap register. This will remain until we restore
15232 the stack pointer. */
15233 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15234 RTX_FRAME_RELATED_P (insn) = 1;
15236 /* This means that the DRAP register is valid for addressing too. */
15237 m->fs.drap_valid = true;
15238 return;
15241 if (m->fs.cfa_reg == stack_pointer_rtx)
15243 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15244 x = gen_rtx_SET (stack_pointer_rtx, x);
15245 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15246 RTX_FRAME_RELATED_P (insn) = 1;
15248 m->fs.cfa_offset -= UNITS_PER_WORD;
15251 /* When the frame pointer is the CFA, and we pop it, we are
15252 swapping back to the stack pointer as the CFA. This happens
15253 for stack frames that don't allocate other data, so we assume
15254 the stack pointer is now pointing at the return address, i.e.
15255 the function entry state, which makes the offset be 1 word. */
15256 if (reg == hard_frame_pointer_rtx)
15258 m->fs.fp_valid = false;
15259 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15261 m->fs.cfa_reg = stack_pointer_rtx;
15262 m->fs.cfa_offset -= UNITS_PER_WORD;
15264 add_reg_note (insn, REG_CFA_DEF_CFA,
15265 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15266 GEN_INT (m->fs.cfa_offset)));
15267 RTX_FRAME_RELATED_P (insn) = 1;
15272 /* Emit code to restore saved registers using POP insns. */
15274 static void
15275 ix86_emit_restore_regs_using_pop (void)
15277 unsigned int regno;
15279 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15280 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
15281 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
15284 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
15285 omits the emit and only attaches the notes. */
15287 static void
15288 ix86_emit_leave (rtx_insn *insn)
15290 struct machine_function *m = cfun->machine;
15291 if (!insn)
15292 insn = emit_insn (ix86_gen_leave ());
15294 ix86_add_queued_cfa_restore_notes (insn);
15296 gcc_assert (m->fs.fp_valid);
15297 m->fs.sp_valid = true;
15298 m->fs.sp_realigned = false;
15299 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
15300 m->fs.fp_valid = false;
15302 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15304 m->fs.cfa_reg = stack_pointer_rtx;
15305 m->fs.cfa_offset = m->fs.sp_offset;
15307 add_reg_note (insn, REG_CFA_DEF_CFA,
15308 plus_constant (Pmode, stack_pointer_rtx,
15309 m->fs.sp_offset));
15310 RTX_FRAME_RELATED_P (insn) = 1;
15312 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
15313 m->fs.fp_offset);
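/* Recall that leave is equivalent to "mov %rbp, %rsp; pop %rbp"
   ("mov %ebp, %esp; pop %ebp" in 32-bit mode), which is why the code above
   sets sp_offset to fp_offset - UNITS_PER_WORD, i.e. one word above the slot
   the frame pointer addressed, and marks the frame pointer as no longer
   valid.  */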
15316 /* Emit code to restore saved registers using MOV insns.
15317 First register is restored from CFA - CFA_OFFSET. */
15318 static void
15319 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
15320 bool maybe_eh_return)
15322 struct machine_function *m = cfun->machine;
15323 unsigned int regno;
15325 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15326 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15328 rtx reg = gen_rtx_REG (word_mode, regno);
15329 rtx mem;
15330 rtx_insn *insn;
15332 mem = choose_baseaddr (cfa_offset, NULL);
15333 mem = gen_frame_mem (word_mode, mem);
15334 insn = emit_move_insn (reg, mem);
15336 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
15338 /* Previously we'd represented the CFA as an expression
15339 like *(%ebp - 8). We've just popped that value from
15340 the stack, which means we need to reset the CFA to
15341 the drap register. This will remain until we restore
15342 the stack pointer. */
15343 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15344 RTX_FRAME_RELATED_P (insn) = 1;
15346 /* This means that the DRAP register is valid for addressing. */
15347 m->fs.drap_valid = true;
15349 else
15350 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15352 cfa_offset -= UNITS_PER_WORD;
15356 /* Emit code to restore saved SSE registers using MOV insns.
15357 First register is restored from CFA - CFA_OFFSET. */
15358 static void
15359 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
15360 bool maybe_eh_return)
15362 unsigned int regno;
15364 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15365 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15367 rtx reg = gen_rtx_REG (V4SFmode, regno);
15368 rtx mem;
15369 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
15371 mem = choose_baseaddr (cfa_offset, &align);
15372 mem = gen_rtx_MEM (V4SFmode, mem);
15374 /* The location alignment depends upon the base register. */
15375 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
15376 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
15377 set_mem_align (mem, align);
15378 emit_insn (gen_rtx_SET (reg, mem));
15380 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15382 cfa_offset -= GET_MODE_SIZE (V4SFmode);
15386 static void
15387 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15388 bool use_call, int style)
15390 struct machine_function *m = cfun->machine;
15391 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15392 + m->call_ms2sysv_extra_regs;
15393 rtvec v;
15394 unsigned int elems_needed, align, i, vi = 0;
15395 rtx_insn *insn;
15396 rtx sym, tmp;
15397 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15398 rtx r10 = NULL_RTX;
15399 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15400 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15401 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15402 rtx rsi_frame_load = NULL_RTX;
15403 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15404 enum xlogue_stub stub;
15406 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15408 /* If using a realigned stack, we should never start with padding. */
15409 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15411 /* Setup RSI as the stub's base pointer. */
15412 align = GET_MODE_ALIGNMENT (V4SFmode);
15413 tmp = choose_baseaddr (rsi_offset, &align);
15414 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15415 emit_insn (gen_rtx_SET (rsi, tmp));
15417 /* Get a symbol for the stub. */
15418 if (frame_pointer_needed)
15419 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15420 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15421 else
15422 stub = use_call ? XLOGUE_STUB_RESTORE
15423 : XLOGUE_STUB_RESTORE_TAIL;
15424 sym = xlogue.get_stub_rtx (stub);
15426 elems_needed = ncregs;
15427 if (use_call)
15428 elems_needed += 1;
15429 else
15430 elems_needed += frame_pointer_needed ? 5 : 3;
15431 v = rtvec_alloc (elems_needed);
15433 /* We call the epilogue stub when we need to pop incoming args or we are
15434 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
15435 epilogue stub and it is the tail-call. */
15436 if (use_call)
15437 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15438 else
15440 RTVEC_ELT (v, vi++) = ret_rtx;
15441 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15442 if (frame_pointer_needed)
15444 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15445 gcc_assert (m->fs.fp_valid);
15446 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15448 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15449 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15450 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15451 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15452 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15454 else
15456 /* If no hard frame pointer, we set R10 to the SP restore value. */
15457 gcc_assert (!m->fs.fp_valid);
15458 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15459 gcc_assert (m->fs.sp_valid);
15461 r10 = gen_rtx_REG (DImode, R10_REG);
15462 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15463 emit_insn (gen_rtx_SET (r10, tmp));
15465 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15469 /* Generate frame load insns and restore notes. */
15470 for (i = 0; i < ncregs; ++i)
15472 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15473 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15474 rtx reg, frame_load;
15476 reg = gen_rtx_REG (mode, r.regno);
15477 frame_load = gen_frame_load (reg, rsi, r.offset);
15479 /* Save RSI frame load insn & note to add last. */
15480 if (r.regno == SI_REG)
15482 gcc_assert (!rsi_frame_load);
15483 rsi_frame_load = frame_load;
15484 rsi_restore_offset = r.offset;
15486 else
15488 RTVEC_ELT (v, vi++) = frame_load;
15489 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15493 /* Add RSI frame load & restore note at the end. */
15494 gcc_assert (rsi_frame_load);
15495 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15496 RTVEC_ELT (v, vi++) = rsi_frame_load;
15497 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15498 rsi_restore_offset);
15500 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15501 if (!use_call && !frame_pointer_needed)
15503 gcc_assert (m->fs.sp_valid);
15504 gcc_assert (!m->fs.sp_realigned);
15506 /* At this point, R10 should point to frame.stack_realign_offset. */
15507 if (m->fs.cfa_reg == stack_pointer_rtx)
15508 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15509 m->fs.sp_offset = frame.stack_realign_offset;
15512 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15513 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15514 if (use_call)
15515 insn = emit_insn (tmp);
15516 else
15518 insn = emit_jump_insn (tmp);
15519 JUMP_LABEL (insn) = ret_rtx;
15521 if (frame_pointer_needed)
15522 ix86_emit_leave (insn);
15523 else
15525 /* Need CFA adjust note. */
15526 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15527 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15531 RTX_FRAME_RELATED_P (insn) = true;
15532 ix86_add_queued_cfa_restore_notes (insn);
15534 /* If we're not doing a tail-call, we need to adjust the stack. */
15535 if (use_call && m->fs.sp_valid)
15537 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15538 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15539 GEN_INT (dealloc), style,
15540 m->fs.cfa_reg == stack_pointer_rtx);
15544 /* Restore function stack, frame, and registers. */
15546 void
15547 ix86_expand_epilogue (int style)
15549 struct machine_function *m = cfun->machine;
15550 struct machine_frame_state frame_state_save = m->fs;
15551 struct ix86_frame frame;
15552 bool restore_regs_via_mov;
15553 bool using_drap;
15554 bool restore_stub_is_tail = false;
15556 if (ix86_function_naked (current_function_decl))
15558 /* The program should not reach this point. */
15559 emit_insn (gen_ud2 ());
15560 return;
15563 ix86_finalize_stack_frame_flags ();
15564 frame = m->frame;
15566 m->fs.sp_realigned = stack_realign_fp;
15567 m->fs.sp_valid = stack_realign_fp
15568 || !frame_pointer_needed
15569 || crtl->sp_is_unchanging;
15570 gcc_assert (!m->fs.sp_valid
15571 || m->fs.sp_offset == frame.stack_pointer_offset);
15573 /* The FP must be valid if the frame pointer is present. */
15574 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15575 gcc_assert (!m->fs.fp_valid
15576 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15578 /* We must have *some* valid pointer to the stack frame. */
15579 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15581 /* The DRAP is never valid at this point. */
15582 gcc_assert (!m->fs.drap_valid);
15584 /* See the comment about red zone and frame
15585 pointer usage in ix86_expand_prologue. */
15586 if (frame_pointer_needed && frame.red_zone_size)
15587 emit_insn (gen_memory_blockage ());
15589 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15590 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15592 /* Determine the CFA offset of the end of the red-zone. */
15593 m->fs.red_zone_offset = 0;
15594 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15596 /* The red-zone begins below return address and error code in
15597 exception handler. */
15598 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
15600 /* When the register save area is in the aligned portion of
15601 the stack, determine the maximum runtime displacement that
15602 matches up with the aligned frame. */
15603 if (stack_realign_drap)
15604 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15605 + UNITS_PER_WORD);
15608 /* Special care must be taken for the normal return case of a function
15609 using eh_return: the eax and edx registers are marked as saved, but
15610 not restored along this path. Adjust the save location to match. */
15611 if (crtl->calls_eh_return && style != 2)
15612 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15614 /* EH_RETURN requires the use of moves to function properly. */
15615 if (crtl->calls_eh_return)
15616 restore_regs_via_mov = true;
15617 /* SEH requires the use of pops to identify the epilogue. */
15618 else if (TARGET_SEH)
15619 restore_regs_via_mov = false;
15620 /* If we're only restoring one register and sp cannot be used then
15621 use a move instruction to restore the register, since it's
15622 less work than reloading sp and popping the register. */
15623 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15624 restore_regs_via_mov = true;
15625 else if (TARGET_EPILOGUE_USING_MOVE
15626 && cfun->machine->use_fast_prologue_epilogue
15627 && (frame.nregs > 1
15628 || m->fs.sp_offset != frame.reg_save_offset))
15629 restore_regs_via_mov = true;
15630 else if (frame_pointer_needed
15631 && !frame.nregs
15632 && m->fs.sp_offset != frame.reg_save_offset)
15633 restore_regs_via_mov = true;
15634 else if (frame_pointer_needed
15635 && TARGET_USE_LEAVE
15636 && cfun->machine->use_fast_prologue_epilogue
15637 && frame.nregs == 1)
15638 restore_regs_via_mov = true;
15639 else
15640 restore_regs_via_mov = false;
15642 if (restore_regs_via_mov || frame.nsseregs)
15644 /* Ensure that the entire register save area is addressable via
15645 the stack pointer, if we will restore SSE regs via sp. */
15646 if (TARGET_64BIT
15647 && m->fs.sp_offset > 0x7fffffff
15648 && sp_valid_at (frame.stack_realign_offset)
15649 && (frame.nsseregs + frame.nregs) != 0)
15651 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15652 GEN_INT (m->fs.sp_offset
15653 - frame.sse_reg_save_offset),
15654 style,
15655 m->fs.cfa_reg == stack_pointer_rtx);
15659 /* If there are any SSE registers to restore, then we have to do it
15660 via moves, since there's obviously no pop for SSE regs. */
15661 if (frame.nsseregs)
15662 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15663 style == 2);
15665 if (m->call_ms2sysv)
15667 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15669 /* We cannot use a tail-call for the stub if:
15670 1. We have to pop incoming args,
15671 2. We have additional int regs to restore, or
15672 3. A sibling call will be the tail-call, or
15673 4. We are emitting an eh_return_internal epilogue.
15675 TODO: Item 4 has not yet been tested!
15677 If any of the above are true, we will call the stub rather than
15678 jump to it. */
15679 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15680 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15683 /* If using an out-of-line stub that is a tail-call, then...*/
15684 if (m->call_ms2sysv && restore_stub_is_tail)
15686 /* TODO: paranoid tests (remove eventually). */
15687 gcc_assert (m->fs.sp_valid);
15688 gcc_assert (!m->fs.sp_realigned);
15689 gcc_assert (!m->fs.fp_valid);
15690 gcc_assert (!m->fs.realigned);
15691 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15692 gcc_assert (!crtl->drap_reg);
15693 gcc_assert (!frame.nregs);
15695 else if (restore_regs_via_mov)
15697 rtx t;
15699 if (frame.nregs)
15700 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15702 /* eh_return epilogues need %ecx added to the stack pointer. */
15703 if (style == 2)
15705 rtx sa = EH_RETURN_STACKADJ_RTX;
15706 rtx_insn *insn;
15708 /* %ecx can't be used for both DRAP register and eh_return. */
15709 if (crtl->drap_reg)
15710 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15712 /* regparm nested functions don't work with eh_return. */
15713 gcc_assert (!ix86_static_chain_on_stack);
15715 if (frame_pointer_needed)
15717 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15718 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15719 emit_insn (gen_rtx_SET (sa, t));
15721 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15722 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15724 /* Note that we use SA as a temporary CFA, as the return
15725 address is at the proper place relative to it. We
15726 pretend this happens at the FP restore insn because
15727 prior to this insn the FP would be stored at the wrong
15728 offset relative to SA, and after this insn we have no
15729 other reasonable register to use for the CFA. We don't
15730 bother resetting the CFA to the SP for the duration of
15731 the return insn. */
15732 add_reg_note (insn, REG_CFA_DEF_CFA,
15733 plus_constant (Pmode, sa, UNITS_PER_WORD));
15734 ix86_add_queued_cfa_restore_notes (insn);
15735 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15736 RTX_FRAME_RELATED_P (insn) = 1;
15738 m->fs.cfa_reg = sa;
15739 m->fs.cfa_offset = UNITS_PER_WORD;
15740 m->fs.fp_valid = false;
15742 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15743 const0_rtx, style, false);
15745 else
15747 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15748 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15749 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15750 ix86_add_queued_cfa_restore_notes (insn);
15752 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15753 if (m->fs.cfa_offset != UNITS_PER_WORD)
15755 m->fs.cfa_offset = UNITS_PER_WORD;
15756 add_reg_note (insn, REG_CFA_DEF_CFA,
15757 plus_constant (Pmode, stack_pointer_rtx,
15758 UNITS_PER_WORD));
15759 RTX_FRAME_RELATED_P (insn) = 1;
15762 m->fs.sp_offset = UNITS_PER_WORD;
15763 m->fs.sp_valid = true;
15764 m->fs.sp_realigned = false;
15767 else
15769 /* SEH requires that the function end with (1) a stack adjustment
15770 if necessary, (2) a sequence of pops, and (3) a return or
15771 jump instruction. Prevent insns from the function body from
15772 being scheduled into this sequence. */
15773 if (TARGET_SEH)
15775 /* Prevent a catch region from being adjacent to the standard
15776 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
15777 several other flags that would be interesting to test are
15778 yet set up. */
15779 if (flag_non_call_exceptions)
15780 emit_insn (gen_nops (const1_rtx));
15781 else
15782 emit_insn (gen_blockage ());
15785 /* First step is to deallocate the stack frame so that we can
15786 pop the registers. If the stack pointer was realigned, it needs
15787 to be restored now. Also do it on SEH target for very large
15788 frame as the emitted instructions aren't allowed by the ABI
15789 in epilogues. */
15790 if (!m->fs.sp_valid || m->fs.sp_realigned
15791 || (TARGET_SEH
15792 && (m->fs.sp_offset - frame.reg_save_offset
15793 >= SEH_MAX_FRAME_SIZE)))
15795 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15796 GEN_INT (m->fs.fp_offset
15797 - frame.reg_save_offset),
15798 style, false);
15800 else if (m->fs.sp_offset != frame.reg_save_offset)
15802 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15803 GEN_INT (m->fs.sp_offset
15804 - frame.reg_save_offset),
15805 style,
15806 m->fs.cfa_reg == stack_pointer_rtx);
15809 ix86_emit_restore_regs_using_pop ();
15812 /* If we used a frame pointer and haven't already got rid of it,
15813 then do so now. */
15814 if (m->fs.fp_valid)
15816 /* If the stack pointer is valid and pointing at the frame
15817 pointer store address, then we only need a pop. */
15818 if (sp_valid_at (frame.hfp_save_offset)
15819 && m->fs.sp_offset == frame.hfp_save_offset)
15820 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15821 /* Leave results in shorter dependency chains on CPUs that are
15822 able to grok it fast. */
15823 else if (TARGET_USE_LEAVE
15824 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15825 || !cfun->machine->use_fast_prologue_epilogue)
15826 ix86_emit_leave (NULL);
15827 else
15829 pro_epilogue_adjust_stack (stack_pointer_rtx,
15830 hard_frame_pointer_rtx,
15831 const0_rtx, style, !using_drap);
15832 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15836 if (using_drap)
15838 int param_ptr_offset = UNITS_PER_WORD;
15839 rtx_insn *insn;
15841 gcc_assert (stack_realign_drap);
15843 if (ix86_static_chain_on_stack)
15844 param_ptr_offset += UNITS_PER_WORD;
15845 if (!call_used_regs[REGNO (crtl->drap_reg)])
15846 param_ptr_offset += UNITS_PER_WORD;
15848 insn = emit_insn (gen_rtx_SET
15849 (stack_pointer_rtx,
15850 gen_rtx_PLUS (Pmode,
15851 crtl->drap_reg,
15852 GEN_INT (-param_ptr_offset))));
15853 m->fs.cfa_reg = stack_pointer_rtx;
15854 m->fs.cfa_offset = param_ptr_offset;
15855 m->fs.sp_offset = param_ptr_offset;
15856 m->fs.realigned = false;
15858 add_reg_note (insn, REG_CFA_DEF_CFA,
15859 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15860 GEN_INT (param_ptr_offset)));
15861 RTX_FRAME_RELATED_P (insn) = 1;
15863 if (!call_used_regs[REGNO (crtl->drap_reg)])
15864 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15867 /* At this point the stack pointer must be valid, and we must have
15868 restored all of the registers. We may not have deallocated the
15869 entire stack frame. We've delayed this until now because it may
15870 be possible to merge the local stack deallocation with the
15871 deallocation forced by ix86_static_chain_on_stack. */
15872 gcc_assert (m->fs.sp_valid);
15873 gcc_assert (!m->fs.sp_realigned);
15874 gcc_assert (!m->fs.fp_valid);
15875 gcc_assert (!m->fs.realigned);
15876 if (m->fs.sp_offset != UNITS_PER_WORD)
15878 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15879 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15880 style, true);
15882 else
15883 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15885 /* Sibcall epilogues don't want a return instruction. */
15886 if (style == 0)
15888 m->fs = frame_state_save;
15889 return;
15892 if (cfun->machine->func_type != TYPE_NORMAL)
15893 emit_jump_insn (gen_interrupt_return ());
15894 else if (crtl->args.pops_args && crtl->args.size)
15896 rtx popc = GEN_INT (crtl->args.pops_args);
15898 /* i386 can only pop 64K bytes. If asked to pop more, pop return
15899 address, do explicit add, and jump indirectly to the caller. */
15901 if (crtl->args.pops_args >= 65536)
15903 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15904 rtx_insn *insn;
15906 /* There is no "pascal" calling convention in any 64bit ABI. */
15907 gcc_assert (!TARGET_64BIT);
15909 insn = emit_insn (gen_pop (ecx));
15910 m->fs.cfa_offset -= UNITS_PER_WORD;
15911 m->fs.sp_offset -= UNITS_PER_WORD;
15913 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15914 x = gen_rtx_SET (stack_pointer_rtx, x);
15915 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15916 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15917 RTX_FRAME_RELATED_P (insn) = 1;
15919 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15920 popc, -1, true);
15921 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
15923 else
15924 emit_jump_insn (gen_simple_return_pop_internal (popc));
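/* In the >= 64K case the emitted sequence is essentially the following
   (32-bit only, since the assert above rules out 64-bit; POPC stands for
   crtl->args.pops_args):

	popl	%ecx		# return address -> %ecx
	addl	$POPC, %esp	# drop the incoming arguments
	jmp	*%ecx		# return to the caller
 */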
15926 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15927 emit_jump_insn (gen_simple_return_internal ());
15929 /* Restore the state back to the state from the prologue,
15930 so that it's correct for the next epilogue. */
15931 m->fs = frame_state_save;
15934 /* Reset from the function's potential modifications. */
15936 static void
15937 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
15939 if (pic_offset_table_rtx
15940 && !ix86_use_pseudo_pic_reg ())
15941 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15943 if (TARGET_MACHO)
15945 rtx_insn *insn = get_last_insn ();
15946 rtx_insn *deleted_debug_label = NULL;
15948 /* Mach-O doesn't support labels at the end of objects, so if
15949 it looks like we might want one, take special action.
15950 First, collect any sequence of deleted debug labels. */
15951 while (insn
15952 && NOTE_P (insn)
15953 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15955 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
15956 notes; instead set their CODE_LABEL_NUMBER to -1,
15957 otherwise there would be code generation differences
15958 between -g and -g0. */
15959 if (NOTE_P (insn) && NOTE_KIND (insn)
15960 == NOTE_INSN_DELETED_DEBUG_LABEL)
15961 deleted_debug_label = insn;
15962 insn = PREV_INSN (insn);
15965 /* If we have:
15966 label:
15967 barrier
15968 then this needs to be detected, so skip past the barrier. */
15970 if (insn && BARRIER_P (insn))
15971 insn = PREV_INSN (insn);
15973 /* Up to now we've only seen notes or barriers. */
15974 if (insn)
15976 if (LABEL_P (insn)
15977 || (NOTE_P (insn)
15978 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15979 /* Trailing label. */
15980 fputs ("\tnop\n", file);
15981 else if (cfun && ! cfun->is_thunk)
15983 /* See if we have a completely empty function body, skipping
15984 the special case of the picbase thunk emitted as asm. */
15985 while (insn && ! INSN_P (insn))
15986 insn = PREV_INSN (insn);
15987 /* If we don't find any insns, we've got an empty function body;
15988 i.e. completely empty - without a return or branch. This is
15989 taken as the case where a function body has been removed
15990 because it contains an inline __builtin_unreachable(). GCC
15991 declares that reaching __builtin_unreachable() means UB, so
15992 we're not obliged to do anything special; however, we want
15993 non-zero-sized function bodies. To meet this, and help the
15994 user out, let's trap the case. */
15995 if (insn == NULL)
15996 fputs ("\tud2\n", file);
15999 else if (deleted_debug_label)
16000 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
16001 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
16002 CODE_LABEL_NUMBER (insn) = -1;
16006 /* Return a scratch register to use in the split stack prologue. The
16007 split stack prologue is used for -fsplit-stack. It is the first
16008 instructions in the function, even before the regular prologue.
16009 The scratch register can be any caller-saved register which is not
16010 used for parameters or for the static chain. */
16012 static unsigned int
16013 split_stack_prologue_scratch_regno (void)
16015 if (TARGET_64BIT)
16016 return R11_REG;
16017 else
16019 bool is_fastcall, is_thiscall;
16020 int regparm;
16022 is_fastcall = (lookup_attribute ("fastcall",
16023 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
16024 != NULL);
16025 is_thiscall = (lookup_attribute ("thiscall",
16026 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
16027 != NULL);
16028 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
16030 if (is_fastcall)
16032 if (DECL_STATIC_CHAIN (cfun->decl))
16034 sorry ("-fsplit-stack does not support fastcall with "
16035 "nested function");
16036 return INVALID_REGNUM;
16038 return AX_REG;
16040 else if (is_thiscall)
16042 if (!DECL_STATIC_CHAIN (cfun->decl))
16043 return DX_REG;
16044 return AX_REG;
16046 else if (regparm < 3)
16048 if (!DECL_STATIC_CHAIN (cfun->decl))
16049 return CX_REG;
16050 else
16052 if (regparm >= 2)
16054 sorry ("-fsplit-stack does not support 2 register "
16055 "parameters for a nested function");
16056 return INVALID_REGNUM;
16058 return DX_REG;
16061 else
16063 /* FIXME: We could make this work by pushing a register
16064 around the addition and comparison. */
16065 sorry ("-fsplit-stack does not support 3 register parameters");
16066 return INVALID_REGNUM;
16071 /* A SYMBOL_REF for the function which allocates new stack space for
16072 -fsplit-stack. */
16074 static GTY(()) rtx split_stack_fn;
16076 /* A SYMBOL_REF for the __morestack function to use when using the large
16077 code model. */
16079 static GTY(()) rtx split_stack_fn_large;
16081 /* Return location of the stack guard value in the TLS block. */
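/* (Illustrative note, not part of the original source: on GNU/Linux targets
   TARGET_THREAD_SPLIT_STACK_OFFSET is typically 0x70 in 64-bit mode and 0x30
   in 32-bit mode, so the guard is read as %fs:0x70 or %gs:0x30 respectively;
   the exact offset is whatever the OS-specific header defines.) */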
16084 ix86_split_stack_guard (void)
16086 int offset;
16087 addr_space_t as = DEFAULT_TLS_SEG_REG;
16088 rtx r;
16090 gcc_assert (flag_split_stack);
16092 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
16093 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
16094 #else
16095 gcc_unreachable ();
16096 #endif
16098 r = GEN_INT (offset);
16099 r = gen_const_mem (Pmode, r);
16100 set_mem_addr_space (r, as);
16102 return r;
16105 /* Handle -fsplit-stack. These are the first instructions in the
16106 function, even before the regular prologue. */
16108 void
16109 ix86_expand_split_stack_prologue (void)
16111 struct ix86_frame frame;
16112 HOST_WIDE_INT allocate;
16113 unsigned HOST_WIDE_INT args_size;
16114 rtx_code_label *label;
16115 rtx limit, current, allocate_rtx, call_insn, call_fusage;
16116 rtx scratch_reg = NULL_RTX;
16117 rtx_code_label *varargs_label = NULL;
16118 rtx fn;
16120 gcc_assert (flag_split_stack && reload_completed);
16122 ix86_finalize_stack_frame_flags ();
16123 frame = cfun->machine->frame;
16124 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
16126 /* This is the label we will branch to if we have enough stack
16127 space. We expect the basic block reordering pass to reverse this
16128 branch if optimizing, so that we branch in the unlikely case. */
16129 label = gen_label_rtx ();
16131 /* We need to compare the stack pointer minus the frame size with
16132 the stack boundary in the TCB. The stack boundary always gives
16133 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
16134 can compare directly. Otherwise we need to do an addition. */
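/* (Sketch of the emitted check, for illustration only: with a small frame
   this amounts to roughly "cmp %fs:OFFSET, %sp; jae .Lhave_enough", while
   with a large frame we first compute sp - frame_size into the scratch
   register chosen below and compare that against the guard.) */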
16136 limit = ix86_split_stack_guard ();
16138 if (allocate < SPLIT_STACK_AVAILABLE)
16139 current = stack_pointer_rtx;
16140 else
16142 unsigned int scratch_regno;
16143 rtx offset;
16145 /* We need a scratch register to hold the stack pointer minus
16146 the required frame size. Since this is the very start of the
16147 function, the scratch register can be any caller-saved
16148 register which is not used for parameters. */
16149 offset = GEN_INT (- allocate);
16150 scratch_regno = split_stack_prologue_scratch_regno ();
16151 if (scratch_regno == INVALID_REGNUM)
16152 return;
16153 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
16154 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
16156 /* We don't use ix86_gen_add3 in this case because it will
16157 want to split to lea, but when not optimizing the insn
16158 will not be split after this point. */
16159 emit_insn (gen_rtx_SET (scratch_reg,
16160 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
16161 offset)));
16163 else
16165 emit_move_insn (scratch_reg, offset);
16166 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
16167 stack_pointer_rtx));
16169 current = scratch_reg;
16172 ix86_expand_branch (GEU, current, limit, label);
16173 rtx_insn *jump_insn = get_last_insn ();
16174 JUMP_LABEL (jump_insn) = label;
16176 /* Mark the jump as very likely to be taken. */
16177 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
16179 if (split_stack_fn == NULL_RTX)
16181 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
16182 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
16184 fn = split_stack_fn;
16186 /* Get more stack space. We pass in the desired stack space and the
16187 size of the arguments to copy to the new stack. In 32-bit mode
16188 we push the parameters; __morestack will return on a new stack
16189 anyhow. In 64-bit mode we pass the parameters in r10 and
16190 r11. */
16191 allocate_rtx = GEN_INT (allocate);
16192 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
16193 call_fusage = NULL_RTX;
16194 rtx pop = NULL_RTX;
16195 if (TARGET_64BIT)
16197 rtx reg10, reg11;
16199 reg10 = gen_rtx_REG (Pmode, R10_REG);
16200 reg11 = gen_rtx_REG (Pmode, R11_REG);
16202 /* If this function uses a static chain, it will be in %r10.
16203 Preserve it across the call to __morestack. */
16204 if (DECL_STATIC_CHAIN (cfun->decl))
16206 rtx rax;
16208 rax = gen_rtx_REG (word_mode, AX_REG);
16209 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
16210 use_reg (&call_fusage, rax);
16213 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
16214 && !TARGET_PECOFF)
16216 HOST_WIDE_INT argval;
16218 gcc_assert (Pmode == DImode);
16219 /* When using the large model we need to load the address
16220 into a register, and we've run out of registers. So we
16221 switch to a different calling convention, and we call a
16222 different function: __morestack_large_model. We pass the
16223 argument size in the upper 32 bits of r10 and pass the
16224 frame size in the lower 32 bits. */
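/* (In other words, as a sketch: r10 ends up holding
   (args_size << 32) | frame_size; the shift below is written as two
   16-bit shifts, presumably to sidestep host issues with a single
   shift by 32.) */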
16225 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
16226 gcc_assert ((args_size & 0xffffffff) == args_size);
16228 if (split_stack_fn_large == NULL_RTX)
16230 split_stack_fn_large =
16231 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
16232 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
16234 if (ix86_cmodel == CM_LARGE_PIC)
16236 rtx_code_label *label;
16237 rtx x;
16239 label = gen_label_rtx ();
16240 emit_label (label);
16241 LABEL_PRESERVE_P (label) = 1;
16242 emit_insn (gen_set_rip_rex64 (reg10, label));
16243 emit_insn (gen_set_got_offset_rex64 (reg11, label));
16244 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
16245 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
16246 UNSPEC_GOT);
16247 x = gen_rtx_CONST (Pmode, x);
16248 emit_move_insn (reg11, x);
16249 x = gen_rtx_PLUS (Pmode, reg10, reg11);
16250 x = gen_const_mem (Pmode, x);
16251 emit_move_insn (reg11, x);
16253 else
16254 emit_move_insn (reg11, split_stack_fn_large);
16256 fn = reg11;
16258 argval = ((args_size << 16) << 16) + allocate;
16259 emit_move_insn (reg10, GEN_INT (argval));
16261 else
16263 emit_move_insn (reg10, allocate_rtx);
16264 emit_move_insn (reg11, GEN_INT (args_size));
16265 use_reg (&call_fusage, reg11);
16268 use_reg (&call_fusage, reg10);
16270 else
16272 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
16273 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
16274 insn = emit_insn (gen_push (allocate_rtx));
16275 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
16276 pop = GEN_INT (2 * UNITS_PER_WORD);
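/* (Illustrative 32-bit sequence, not from the original source: the two
   pushes above amount to roughly
	pushl $args_size
	pushl $frame_size
	call __morestack
   with __morestack popping both arguments on return, which is why
   2 * UNITS_PER_WORD is recorded as the callee-pop amount for the call.) */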
16278 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
16279 GEN_INT (UNITS_PER_WORD), constm1_rtx,
16280 pop, false);
16281 add_function_usage_to (call_insn, call_fusage);
16282 if (!TARGET_64BIT)
16283 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
16284 /* Indicate that this function can't jump to non-local gotos. */
16285 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
16287 /* In order to make call/return prediction work right, we now need
16288 to execute a return instruction. See
16289 libgcc/config/i386/morestack.S for the details on how this works.
16291 For flow purposes gcc must not see this as a return
16292 instruction--we need control flow to continue at the subsequent
16293 label. Therefore, we use an unspec. */
16294 gcc_assert (crtl->args.pops_args < 65536);
16295 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
16297 /* If we are in 64-bit mode and this function uses a static chain,
16298 we saved %r10 in %rax before calling __morestack. */
16299 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
16300 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
16301 gen_rtx_REG (word_mode, AX_REG));
16303 /* If this function calls va_start, we need to store a pointer to
16304 the arguments on the old stack, because they may not have been
16305 all copied to the new stack. At this point the old stack can be
16306 found at the frame pointer value used by __morestack, because
16307 __morestack has set that up before calling back to us. Here we
16308 store that pointer in a scratch register, and in
16309 ix86_expand_prologue we store the scratch register in a stack
16310 slot. */
16311 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16313 unsigned int scratch_regno;
16314 rtx frame_reg;
16315 int words;
16317 scratch_regno = split_stack_prologue_scratch_regno ();
16318 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
16319 frame_reg = gen_rtx_REG (Pmode, BP_REG);
16321 /* 64-bit:
16322 fp -> old fp value
16323 return address within this function
16324 return address of caller of this function
16325 stack arguments
16326 So we add three words to get to the stack arguments.
16328 32-bit:
16329 fp -> old fp value
16330 return address within this function
16331 first argument to __morestack
16332 second argument to __morestack
16333 return address of caller of this function
16334 stack arguments
16335 So we add five words to get to the stack arguments. */
16337 words = TARGET_64BIT ? 3 : 5;
16338 emit_insn (gen_rtx_SET (scratch_reg,
16339 gen_rtx_PLUS (Pmode, frame_reg,
16340 GEN_INT (words * UNITS_PER_WORD))));
16342 varargs_label = gen_label_rtx ();
16343 emit_jump_insn (gen_jump (varargs_label));
16344 JUMP_LABEL (get_last_insn ()) = varargs_label;
16346 emit_barrier ();
16349 emit_label (label);
16350 LABEL_NUSES (label) = 1;
16352 /* If this function calls va_start, we now have to set the scratch
16353 register for the case where we do not call __morestack. In this
16354 case we need to set it based on the stack pointer. */
16355 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16357 emit_insn (gen_rtx_SET (scratch_reg,
16358 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
16359 GEN_INT (UNITS_PER_WORD))));
16361 emit_label (varargs_label);
16362 LABEL_NUSES (varargs_label) = 1;
16366 /* We may have to tell the dataflow pass that the split stack prologue
16367 is initializing a scratch register. */
16369 static void
16370 ix86_live_on_entry (bitmap regs)
16372 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16374 gcc_assert (flag_split_stack);
16375 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
16379 /* Extract the parts of an RTL expression that is a valid memory address
16380 for an instruction. Return 0 if the structure of the address is
16381 grossly off. Return -1 if the address contains ASHIFT, so it is not
16382 strictly valid, but still used for computing the length of the lea instruction. */
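/* (Worked example, for illustration: the address RTX
	(plus (plus (mult (reg B) (const_int 4)) (reg A)) (const_int 8))
   decomposes into base = A, index = B, scale = 4, disp = 8, i.e. the
   operand 8(%A,%B,4) in AT&T syntax.) */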
16385 ix86_decompose_address (rtx addr, struct ix86_address *out)
16387 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
16388 rtx base_reg, index_reg;
16389 HOST_WIDE_INT scale = 1;
16390 rtx scale_rtx = NULL_RTX;
16391 rtx tmp;
16392 int retval = 1;
16393 addr_space_t seg = ADDR_SPACE_GENERIC;
16395 /* Allow zero-extended SImode addresses;
16396 they will be emitted with the addr32 prefix. */
16397 if (TARGET_64BIT && GET_MODE (addr) == DImode)
16399 if (GET_CODE (addr) == ZERO_EXTEND
16400 && GET_MODE (XEXP (addr, 0)) == SImode)
16402 addr = XEXP (addr, 0);
16403 if (CONST_INT_P (addr))
16404 return 0;
16406 else if (GET_CODE (addr) == AND
16407 && const_32bit_mask (XEXP (addr, 1), DImode))
16409 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16410 if (addr == NULL_RTX)
16411 return 0;
16413 if (CONST_INT_P (addr))
16414 return 0;
16418 /* Allow SImode subregs of DImode addresses;
16419 they will be emitted with the addr32 prefix. */
16420 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16422 if (SUBREG_P (addr)
16423 && GET_MODE (SUBREG_REG (addr)) == DImode)
16425 addr = SUBREG_REG (addr);
16426 if (CONST_INT_P (addr))
16427 return 0;
16431 if (REG_P (addr))
16432 base = addr;
16433 else if (SUBREG_P (addr))
16435 if (REG_P (SUBREG_REG (addr)))
16436 base = addr;
16437 else
16438 return 0;
16440 else if (GET_CODE (addr) == PLUS)
16442 rtx addends[4], op;
16443 int n = 0, i;
16445 op = addr;
16448 if (n >= 4)
16449 return 0;
16450 addends[n++] = XEXP (op, 1);
16451 op = XEXP (op, 0);
16453 while (GET_CODE (op) == PLUS);
16454 if (n >= 4)
16455 return 0;
16456 addends[n] = op;
16458 for (i = n; i >= 0; --i)
16460 op = addends[i];
16461 switch (GET_CODE (op))
16463 case MULT:
16464 if (index)
16465 return 0;
16466 index = XEXP (op, 0);
16467 scale_rtx = XEXP (op, 1);
16468 break;
16470 case ASHIFT:
16471 if (index)
16472 return 0;
16473 index = XEXP (op, 0);
16474 tmp = XEXP (op, 1);
16475 if (!CONST_INT_P (tmp))
16476 return 0;
16477 scale = INTVAL (tmp);
16478 if ((unsigned HOST_WIDE_INT) scale > 3)
16479 return 0;
16480 scale = 1 << scale;
16481 break;
16483 case ZERO_EXTEND:
16484 op = XEXP (op, 0);
16485 if (GET_CODE (op) != UNSPEC)
16486 return 0;
16487 /* FALLTHRU */
16489 case UNSPEC:
16490 if (XINT (op, 1) == UNSPEC_TP
16491 && TARGET_TLS_DIRECT_SEG_REFS
16492 && seg == ADDR_SPACE_GENERIC)
16493 seg = DEFAULT_TLS_SEG_REG;
16494 else
16495 return 0;
16496 break;
16498 case SUBREG:
16499 if (!REG_P (SUBREG_REG (op)))
16500 return 0;
16501 /* FALLTHRU */
16503 case REG:
16504 if (!base)
16505 base = op;
16506 else if (!index)
16507 index = op;
16508 else
16509 return 0;
16510 break;
16512 case CONST:
16513 case CONST_INT:
16514 case SYMBOL_REF:
16515 case LABEL_REF:
16516 if (disp)
16517 return 0;
16518 disp = op;
16519 break;
16521 default:
16522 return 0;
16526 else if (GET_CODE (addr) == MULT)
16528 index = XEXP (addr, 0); /* index*scale */
16529 scale_rtx = XEXP (addr, 1);
16531 else if (GET_CODE (addr) == ASHIFT)
16533 /* We're called for lea too, which implements ashift on occasion. */
16534 index = XEXP (addr, 0);
16535 tmp = XEXP (addr, 1);
16536 if (!CONST_INT_P (tmp))
16537 return 0;
16538 scale = INTVAL (tmp);
16539 if ((unsigned HOST_WIDE_INT) scale > 3)
16540 return 0;
16541 scale = 1 << scale;
16542 retval = -1;
16544 else
16545 disp = addr; /* displacement */
16547 if (index)
16549 if (REG_P (index))
16551 else if (SUBREG_P (index)
16552 && REG_P (SUBREG_REG (index)))
16554 else
16555 return 0;
16558 /* Extract the integral value of scale. */
16559 if (scale_rtx)
16561 if (!CONST_INT_P (scale_rtx))
16562 return 0;
16563 scale = INTVAL (scale_rtx);
16566 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16567 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16569 /* Avoid useless 0 displacement. */
16570 if (disp == const0_rtx && (base || index))
16571 disp = NULL_RTX;
16573 /* Allow arg pointer and stack pointer as index if there is no scaling. */
16574 if (base_reg && index_reg && scale == 1
16575 && (REGNO (index_reg) == ARG_POINTER_REGNUM
16576 || REGNO (index_reg) == FRAME_POINTER_REGNUM
16577 || REGNO (index_reg) == SP_REG))
16579 std::swap (base, index);
16580 std::swap (base_reg, index_reg);
16583 /* Special case: %ebp cannot be encoded as a base without a displacement.
16584 Similarly %r13. */
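/* (Encoding note, added for clarity: with mod = 00 in the ModR/M byte, a
   base encoding of 101 means "disp32, no base" -- or RIP-relative in 64-bit
   mode -- so a plain [%ebp] or [%r13] can only be expressed as the base
   register plus an 8-bit displacement of zero.) */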
16585 if (!disp && base_reg
16586 && (REGNO (base_reg) == ARG_POINTER_REGNUM
16587 || REGNO (base_reg) == FRAME_POINTER_REGNUM
16588 || REGNO (base_reg) == BP_REG
16589 || REGNO (base_reg) == R13_REG))
16590 disp = const0_rtx;
16592 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
16593 Avoid this by transforming it to [%esi+0].
16594 Reload calls address legitimization without cfun defined, so we need
16595 to test cfun for being non-NULL. */
16596 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16597 && base_reg && !index_reg && !disp
16598 && REGNO (base_reg) == SI_REG)
16599 disp = const0_rtx;
16601 /* Special case: encode reg+reg instead of reg*2. */
16602 if (!base && index && scale == 2)
16603 base = index, base_reg = index_reg, scale = 1;
16605 /* Special case: scaling cannot be encoded without base or displacement. */
16606 if (!base && !disp && index && scale != 1)
16607 disp = const0_rtx;
16609 out->base = base;
16610 out->index = index;
16611 out->disp = disp;
16612 out->scale = scale;
16613 out->seg = seg;
16615 return retval;
16618 /* Return cost of the memory address x.
16619 For i386, it is better to use a complex address than let gcc copy
16620 the address into a reg and make a new pseudo. But not if the address
16621 requires two regs - that would mean more pseudos with longer
16622 lifetimes. */
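/* (Illustrative costs under the scheme below: an address that is a single
   pseudo register costs 2, a base + index address where both are pseudos
   costs 3, while an address built only from hard registers or constants
   stays at the base cost of 1, before any K6 penalty.) */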
16623 static int
16624 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16626 struct ix86_address parts;
16627 int cost = 1;
16628 int ok = ix86_decompose_address (x, &parts);
16630 gcc_assert (ok);
16632 if (parts.base && SUBREG_P (parts.base))
16633 parts.base = SUBREG_REG (parts.base);
16634 if (parts.index && SUBREG_P (parts.index))
16635 parts.index = SUBREG_REG (parts.index);
16637 /* Attempt to minimize the number of registers in the address by increasing
16638 the address cost for each register used. We don't increase the address cost
16639 for "pic_offset_table_rtx". When a memory operand with "pic_offset_table_rtx"
16640 is not invariant itself, it most likely means that the base or index is not
16641 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
16642 which is not profitable for x86. */
16643 if (parts.base
16644 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16645 && (current_pass->type == GIMPLE_PASS
16646 || !pic_offset_table_rtx
16647 || !REG_P (parts.base)
16648 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16649 cost++;
16651 if (parts.index
16652 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16653 && (current_pass->type == GIMPLE_PASS
16654 || !pic_offset_table_rtx
16655 || !REG_P (parts.index)
16656 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16657 cost++;
16659 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16660 since its predecode logic can't detect the length of such instructions
16661 and they degenerate to vector decoding. Increase the cost of such
16662 addresses here. The penalty is at least 2 cycles. It may be worthwhile
16663 to split such addresses or even refuse them altogether.
16665 The following addressing modes are affected:
16666 [base+scale*index]
16667 [scale*index+disp]
16668 [base+index]
16670 The first and last cases may be avoidable by explicitly coding the zero in
16671 the memory address, but I don't have an AMD-K6 machine handy to check this
16672 theory. */
16674 if (TARGET_K6
16675 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16676 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16677 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16678 cost += 10;
16680 return cost;
16683 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
16684 this is used to form addresses to local data when -fPIC is in
16685 use. */
16687 static bool
16688 darwin_local_data_pic (rtx disp)
16690 return (GET_CODE (disp) == UNSPEC
16691 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16694 /* True if operand X should be loaded from GOT. */
16696 bool
16697 ix86_force_load_from_GOT_p (rtx x)
16699 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16700 && !TARGET_PECOFF && !TARGET_MACHO
16701 && !flag_plt && !flag_pic
16702 && ix86_cmodel != CM_LARGE
16703 && GET_CODE (x) == SYMBOL_REF
16704 && SYMBOL_REF_FUNCTION_P (x)
16705 && !SYMBOL_REF_LOCAL_P (x));
16708 /* Determine if a given RTX is a valid constant. We already know this
16709 satisfies CONSTANT_P. */
16711 static bool
16712 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16714 /* Pointer bounds constants are not valid. */
16715 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16716 return false;
16718 switch (GET_CODE (x))
16720 case CONST:
16721 x = XEXP (x, 0);
16723 if (GET_CODE (x) == PLUS)
16725 if (!CONST_INT_P (XEXP (x, 1)))
16726 return false;
16727 x = XEXP (x, 0);
16730 if (TARGET_MACHO && darwin_local_data_pic (x))
16731 return true;
16733 /* Only some unspecs are valid as "constants". */
16734 if (GET_CODE (x) == UNSPEC)
16735 switch (XINT (x, 1))
16737 case UNSPEC_GOT:
16738 case UNSPEC_GOTOFF:
16739 case UNSPEC_PLTOFF:
16740 return TARGET_64BIT;
16741 case UNSPEC_TPOFF:
16742 case UNSPEC_NTPOFF:
16743 x = XVECEXP (x, 0, 0);
16744 return (GET_CODE (x) == SYMBOL_REF
16745 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16746 case UNSPEC_DTPOFF:
16747 x = XVECEXP (x, 0, 0);
16748 return (GET_CODE (x) == SYMBOL_REF
16749 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16750 default:
16751 return false;
16754 /* We must have drilled down to a symbol. */
16755 if (GET_CODE (x) == LABEL_REF)
16756 return true;
16757 if (GET_CODE (x) != SYMBOL_REF)
16758 return false;
16759 /* FALLTHRU */
16761 case SYMBOL_REF:
16762 /* TLS symbols are never valid. */
16763 if (SYMBOL_REF_TLS_MODEL (x))
16764 return false;
16766 /* DLLIMPORT symbols are never valid. */
16767 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16768 && SYMBOL_REF_DLLIMPORT_P (x))
16769 return false;
16771 #if TARGET_MACHO
16772 /* mdynamic-no-pic */
16773 if (MACHO_DYNAMIC_NO_PIC_P)
16774 return machopic_symbol_defined_p (x);
16775 #endif
16777 /* External function addresses should be loaded
16778 via the GOT slot to avoid the PLT. */
16779 if (ix86_force_load_from_GOT_p (x))
16780 return false;
16782 break;
16784 CASE_CONST_SCALAR_INT:
16785 switch (mode)
16787 case E_TImode:
16788 if (TARGET_64BIT)
16789 return true;
16790 /* FALLTHRU */
16791 case E_OImode:
16792 case E_XImode:
16793 if (!standard_sse_constant_p (x, mode))
16794 return false;
16795 default:
16796 break;
16798 break;
16800 case CONST_VECTOR:
16801 if (!standard_sse_constant_p (x, mode))
16802 return false;
16804 default:
16805 break;
16808 /* Otherwise we handle everything else in the move patterns. */
16809 return true;
16812 /* Determine if it's legal to put X into the constant pool. This
16813 is not possible for the address of thread-local symbols, which
16814 is checked above. */
16816 static bool
16817 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16819 /* We can put any immediate constant in memory. */
16820 switch (GET_CODE (x))
16822 CASE_CONST_ANY:
16823 return false;
16825 default:
16826 break;
16829 return !ix86_legitimate_constant_p (mode, x);
16832 /* Nonzero if the symbol is marked as dllimport, or as a stub variable;
16833 otherwise zero. */
16835 static bool
16836 is_imported_p (rtx x)
16838 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16839 || GET_CODE (x) != SYMBOL_REF)
16840 return false;
16842 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16846 /* Nonzero if the constant value X is a legitimate general operand
16847 when generating PIC code. It is given that flag_pic is on and
16848 that X satisfies CONSTANT_P. */
16850 bool
16851 legitimate_pic_operand_p (rtx x)
16853 rtx inner;
16855 switch (GET_CODE (x))
16857 case CONST:
16858 inner = XEXP (x, 0);
16859 if (GET_CODE (inner) == PLUS
16860 && CONST_INT_P (XEXP (inner, 1)))
16861 inner = XEXP (inner, 0);
16863 /* Only some unspecs are valid as "constants". */
16864 if (GET_CODE (inner) == UNSPEC)
16865 switch (XINT (inner, 1))
16867 case UNSPEC_GOT:
16868 case UNSPEC_GOTOFF:
16869 case UNSPEC_PLTOFF:
16870 return TARGET_64BIT;
16871 case UNSPEC_TPOFF:
16872 x = XVECEXP (inner, 0, 0);
16873 return (GET_CODE (x) == SYMBOL_REF
16874 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16875 case UNSPEC_MACHOPIC_OFFSET:
16876 return legitimate_pic_address_disp_p (x);
16877 default:
16878 return false;
16880 /* FALLTHRU */
16882 case SYMBOL_REF:
16883 case LABEL_REF:
16884 return legitimate_pic_address_disp_p (x);
16886 default:
16887 return true;
16891 /* Determine if a given CONST RTX is a valid memory displacement
16892 in PIC mode. */
16894 bool
16895 legitimate_pic_address_disp_p (rtx disp)
16897 bool saw_plus;
16899 /* In 64bit mode we can allow direct addresses of symbols and labels
16900 when they are not dynamic symbols. */
16901 if (TARGET_64BIT)
16903 rtx op0 = disp, op1;
16905 switch (GET_CODE (disp))
16907 case LABEL_REF:
16908 return true;
16910 case CONST:
16911 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16912 break;
16913 op0 = XEXP (XEXP (disp, 0), 0);
16914 op1 = XEXP (XEXP (disp, 0), 1);
16915 if (!CONST_INT_P (op1)
16916 || INTVAL (op1) >= 16*1024*1024
16917 || INTVAL (op1) < -16*1024*1024)
16918 break;
16919 if (GET_CODE (op0) == LABEL_REF)
16920 return true;
16921 if (GET_CODE (op0) == CONST
16922 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16923 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16924 return true;
16925 if (GET_CODE (op0) == UNSPEC
16926 && XINT (op0, 1) == UNSPEC_PCREL)
16927 return true;
16928 if (GET_CODE (op0) != SYMBOL_REF)
16929 break;
16930 /* FALLTHRU */
16932 case SYMBOL_REF:
16933 /* TLS references should always be enclosed in UNSPEC.
16934 A dllimported symbol always needs to be resolved. */
16935 if (SYMBOL_REF_TLS_MODEL (op0)
16936 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16937 return false;
16939 if (TARGET_PECOFF)
16941 if (is_imported_p (op0))
16942 return true;
16944 if (SYMBOL_REF_FAR_ADDR_P (op0)
16945 || !SYMBOL_REF_LOCAL_P (op0))
16946 break;
16948 /* Function symbols need to be resolved only for
16949 the large model.
16950 For the small model we don't need to resolve anything
16951 here. */
16952 if ((ix86_cmodel != CM_LARGE_PIC
16953 && SYMBOL_REF_FUNCTION_P (op0))
16954 || ix86_cmodel == CM_SMALL_PIC)
16955 return true;
16956 /* Non-external symbols don't need to be resolved for
16957 the large and medium models. */
16958 if ((ix86_cmodel == CM_LARGE_PIC
16959 || ix86_cmodel == CM_MEDIUM_PIC)
16960 && !SYMBOL_REF_EXTERNAL_P (op0))
16961 return true;
16963 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16964 && (SYMBOL_REF_LOCAL_P (op0)
16965 || (HAVE_LD_PIE_COPYRELOC
16966 && flag_pie
16967 && !SYMBOL_REF_WEAK (op0)
16968 && !SYMBOL_REF_FUNCTION_P (op0)))
16969 && ix86_cmodel != CM_LARGE_PIC)
16970 return true;
16971 break;
16973 default:
16974 break;
16977 if (GET_CODE (disp) != CONST)
16978 return false;
16979 disp = XEXP (disp, 0);
16981 if (TARGET_64BIT)
16983 /* It is unsafe to allow PLUS expressions. This limits the allowed
16984 distance of GOT tables. We should not need these anyway. */
16985 if (GET_CODE (disp) != UNSPEC
16986 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16987 && XINT (disp, 1) != UNSPEC_GOTOFF
16988 && XINT (disp, 1) != UNSPEC_PCREL
16989 && XINT (disp, 1) != UNSPEC_PLTOFF))
16990 return false;
16992 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16993 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16994 return false;
16995 return true;
16998 saw_plus = false;
16999 if (GET_CODE (disp) == PLUS)
17001 if (!CONST_INT_P (XEXP (disp, 1)))
17002 return false;
17003 disp = XEXP (disp, 0);
17004 saw_plus = true;
17007 if (TARGET_MACHO && darwin_local_data_pic (disp))
17008 return true;
17010 if (GET_CODE (disp) != UNSPEC)
17011 return false;
17013 switch (XINT (disp, 1))
17015 case UNSPEC_GOT:
17016 if (saw_plus)
17017 return false;
17018 /* We need to check for both symbols and labels because VxWorks loads
17019 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
17020 details. */
17021 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
17022 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
17023 case UNSPEC_GOTOFF:
17024 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
17025 While the ABI also specifies a 32bit relocation, we don't produce it in
17026 the small PIC model at all. */
17027 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
17028 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
17029 && !TARGET_64BIT)
17030 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
17031 return false;
17032 case UNSPEC_GOTTPOFF:
17033 case UNSPEC_GOTNTPOFF:
17034 case UNSPEC_INDNTPOFF:
17035 if (saw_plus)
17036 return false;
17037 disp = XVECEXP (disp, 0, 0);
17038 return (GET_CODE (disp) == SYMBOL_REF
17039 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
17040 case UNSPEC_NTPOFF:
17041 disp = XVECEXP (disp, 0, 0);
17042 return (GET_CODE (disp) == SYMBOL_REF
17043 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
17044 case UNSPEC_DTPOFF:
17045 disp = XVECEXP (disp, 0, 0);
17046 return (GET_CODE (disp) == SYMBOL_REF
17047 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
17050 return false;
17053 /* Determine if OP is a suitable RTX for an address register.
17054 Return the naked register if a register or a register subreg is
17055 found, otherwise return NULL_RTX. */
17057 static rtx
17058 ix86_validate_address_register (rtx op)
17060 machine_mode mode = GET_MODE (op);
17062 /* Only SImode or DImode registers can form the address. */
17063 if (mode != SImode && mode != DImode)
17064 return NULL_RTX;
17066 if (REG_P (op))
17067 return op;
17068 else if (SUBREG_P (op))
17070 rtx reg = SUBREG_REG (op);
17072 if (!REG_P (reg))
17073 return NULL_RTX;
17075 mode = GET_MODE (reg);
17077 /* Don't allow SUBREGs that span more than a word. It can
17078 lead to spill failures when the register is one word out
17079 of a two word structure. */
17080 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
17081 return NULL_RTX;
17083 /* Allow only SUBREGs of non-eliminable hard registers. */
17084 if (register_no_elim_operand (reg, mode))
17085 return reg;
17088 /* Op is not a register. */
17089 return NULL_RTX;
17092 /* Recognizes RTL expressions that are valid memory addresses for an
17093 instruction. The MODE argument is the machine mode for the MEM
17094 expression that wants to use this address.
17096 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
17097 convert common non-canonical forms to canonical form so that they will
17098 be recognized. */
17100 static bool
17101 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
17103 struct ix86_address parts;
17104 rtx base, index, disp;
17105 HOST_WIDE_INT scale;
17106 addr_space_t seg;
17108 if (ix86_decompose_address (addr, &parts) <= 0)
17109 /* Decomposition failed. */
17110 return false;
17112 base = parts.base;
17113 index = parts.index;
17114 disp = parts.disp;
17115 scale = parts.scale;
17116 seg = parts.seg;
17118 /* Validate base register. */
17119 if (base)
17121 rtx reg = ix86_validate_address_register (base);
17123 if (reg == NULL_RTX)
17124 return false;
17126 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
17127 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
17128 /* Base is not valid. */
17129 return false;
17132 /* Validate index register. */
17133 if (index)
17135 rtx reg = ix86_validate_address_register (index);
17137 if (reg == NULL_RTX)
17138 return false;
17140 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
17141 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
17142 /* Index is not valid. */
17143 return false;
17146 /* Index and base should have the same mode. */
17147 if (base && index
17148 && GET_MODE (base) != GET_MODE (index))
17149 return false;
17151 /* Address override works only on the (%reg) part of %fs:(%reg). */
17152 if (seg != ADDR_SPACE_GENERIC
17153 && ((base && GET_MODE (base) != word_mode)
17154 || (index && GET_MODE (index) != word_mode)))
17155 return false;
17157 /* Validate scale factor. */
17158 if (scale != 1)
17160 if (!index)
17161 /* Scale without index. */
17162 return false;
17164 if (scale != 2 && scale != 4 && scale != 8)
17165 /* Scale is not a valid multiplier. */
17166 return false;
17169 /* Validate displacement. */
17170 if (disp)
17172 if (GET_CODE (disp) == CONST
17173 && GET_CODE (XEXP (disp, 0)) == UNSPEC
17174 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
17175 switch (XINT (XEXP (disp, 0), 1))
17177 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
17178 when used. While the ABI also specifies 32bit relocations, we
17179 don't produce them at all and use IP-relative addressing instead.
17180 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
17181 should be loaded via the GOT. */
17182 case UNSPEC_GOT:
17183 if (!TARGET_64BIT
17184 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
17185 goto is_legitimate_pic;
17186 /* FALLTHRU */
17187 case UNSPEC_GOTOFF:
17188 gcc_assert (flag_pic);
17189 if (!TARGET_64BIT)
17190 goto is_legitimate_pic;
17192 /* 64bit address unspec. */
17193 return false;
17195 case UNSPEC_GOTPCREL:
17196 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
17197 goto is_legitimate_pic;
17198 /* FALLTHRU */
17199 case UNSPEC_PCREL:
17200 gcc_assert (flag_pic);
17201 goto is_legitimate_pic;
17203 case UNSPEC_GOTTPOFF:
17204 case UNSPEC_GOTNTPOFF:
17205 case UNSPEC_INDNTPOFF:
17206 case UNSPEC_NTPOFF:
17207 case UNSPEC_DTPOFF:
17208 break;
17210 default:
17211 /* Invalid address unspec. */
17212 return false;
17215 else if (SYMBOLIC_CONST (disp)
17216 && (flag_pic
17217 || (TARGET_MACHO
17218 #if TARGET_MACHO
17219 && MACHOPIC_INDIRECT
17220 && !machopic_operand_p (disp)
17221 #endif
17225 is_legitimate_pic:
17226 if (TARGET_64BIT && (index || base))
17228 /* foo@dtpoff(%rX) is ok. */
17229 if (GET_CODE (disp) != CONST
17230 || GET_CODE (XEXP (disp, 0)) != PLUS
17231 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
17232 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
17233 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
17234 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
17235 /* Non-constant pic memory reference. */
17236 return false;
17238 else if ((!TARGET_MACHO || flag_pic)
17239 && ! legitimate_pic_address_disp_p (disp))
17240 /* Displacement is an invalid pic construct. */
17241 return false;
17242 #if TARGET_MACHO
17243 else if (MACHO_DYNAMIC_NO_PIC_P
17244 && !ix86_legitimate_constant_p (Pmode, disp))
17245 /* Displacement must be referenced via a non_lazy_pointer. */
17246 return false;
17247 #endif
17249 /* This code used to verify that a symbolic pic displacement
17250 includes the pic_offset_table_rtx register.
17252 While this is a good idea, unfortunately these constructs may
17253 be created by the "adds using lea" optimization for incorrect
17254 code like:
17256 int a;
17257 int foo(int i)
17259 return *(&a+i);
17262 This code is nonsensical, but results in addressing the
17263 GOT table with a pic_offset_table_rtx base. We can't
17264 just refuse it easily, since it gets matched by the
17265 "addsi3" pattern, which later gets split to lea when the
17266 output register differs from the input. While this
17267 could be handled by a separate addsi pattern for this case
17268 that never results in lea, disabling this test seems to be
17269 the easier and correct fix for the crash. */
17271 else if (GET_CODE (disp) != LABEL_REF
17272 && !CONST_INT_P (disp)
17273 && (GET_CODE (disp) != CONST
17274 || !ix86_legitimate_constant_p (Pmode, disp))
17275 && (GET_CODE (disp) != SYMBOL_REF
17276 || !ix86_legitimate_constant_p (Pmode, disp)))
17277 /* Displacement is not constant. */
17278 return false;
17279 else if (TARGET_64BIT
17280 && !x86_64_immediate_operand (disp, VOIDmode))
17281 /* Displacement is out of range. */
17282 return false;
17283 /* In x32 mode, constant addresses are sign extended to 64bit, so
17284 we have to prevent addresses from 0x80000000 to 0xffffffff. */
17285 else if (TARGET_X32 && !(index || base)
17286 && CONST_INT_P (disp)
17287 && val_signbit_known_set_p (SImode, INTVAL (disp)))
17288 return false;
17291 /* Everything looks valid. */
17292 return true;
17295 /* Determine if a given RTX is a valid constant address. */
17297 bool
17298 constant_address_p (rtx x)
17300 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
17303 /* Return a unique alias set for the GOT. */
17305 static alias_set_type
17306 ix86_GOT_alias_set (void)
17308 static alias_set_type set = -1;
17309 if (set == -1)
17310 set = new_alias_set ();
17311 return set;
17314 /* Return a legitimate reference for ORIG (an address) using the
17315 register REG. If REG is 0, a new pseudo is generated.
17317 There are two types of references that must be handled:
17319 1. Global data references must load the address from the GOT, via
17320 the PIC reg. An insn is emitted to do this load, and the reg is
17321 returned.
17323 2. Static data references, constant pool addresses, and code labels
17324 compute the address as an offset from the GOT, whose base is in
17325 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
17326 differentiate them from global data objects. The returned
17327 address is the PIC reg + an unspec constant.
17329 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
17330 reg also appears in the address. */
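/* (Illustrative 32-bit examples, assuming %ebx holds the PIC base:
   a global object is reached as  movl sym@GOT(%ebx), %reg  -- case 1, a
   load of its address from the GOT -- while a local/static object is
   reached as  leal sym@GOTOFF(%ebx), %reg  -- case 2, an offset from the
   GOT base.) */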
17332 static rtx
17333 legitimize_pic_address (rtx orig, rtx reg)
17335 rtx addr = orig;
17336 rtx new_rtx = orig;
17338 #if TARGET_MACHO
17339 if (TARGET_MACHO && !TARGET_64BIT)
17341 if (reg == 0)
17342 reg = gen_reg_rtx (Pmode);
17343 /* Use the generic Mach-O PIC machinery. */
17344 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
17346 #endif
17348 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17350 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17351 if (tmp)
17352 return tmp;
17355 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
17356 new_rtx = addr;
17357 else if ((!TARGET_64BIT
17358 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
17359 && !TARGET_PECOFF
17360 && gotoff_operand (addr, Pmode))
17362 /* This symbol may be referenced via a displacement
17363 from the PIC base address (@GOTOFF). */
17364 if (GET_CODE (addr) == CONST)
17365 addr = XEXP (addr, 0);
17367 if (GET_CODE (addr) == PLUS)
17369 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
17370 UNSPEC_GOTOFF);
17371 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
17373 else
17374 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
17376 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17378 if (TARGET_64BIT)
17379 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17381 if (reg != 0)
17383 gcc_assert (REG_P (reg));
17384 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
17385 new_rtx, reg, 1, OPTAB_DIRECT);
17387 else
17388 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17390 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
17391 /* We can't use @GOTOFF for text labels
17392 on VxWorks, see gotoff_operand. */
17393 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
17395 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17396 if (tmp)
17397 return tmp;
17399 /* For x64 PE-COFF there is no GOT table,
17400 so we use the address directly. */
17401 if (TARGET_64BIT && TARGET_PECOFF)
17403 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17404 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17406 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17408 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17409 UNSPEC_GOTPCREL);
17410 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17411 new_rtx = gen_const_mem (Pmode, new_rtx);
17412 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17414 else
17416 /* This symbol must be referenced via a load
17417 from the Global Offset Table (@GOT). */
17418 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17419 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17420 if (TARGET_64BIT)
17421 new_rtx = force_reg (Pmode, new_rtx);
17422 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17423 new_rtx = gen_const_mem (Pmode, new_rtx);
17424 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17427 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17429 else
17431 if (CONST_INT_P (addr)
17432 && !x86_64_immediate_operand (addr, VOIDmode))
17433 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17434 else if (GET_CODE (addr) == CONST)
17436 addr = XEXP (addr, 0);
17438 /* We must match stuff we generate before. Assume the only
17439 unspecs that can get here are ours. Not that we could do
17440 anything with them anyway.... */
17441 if (GET_CODE (addr) == UNSPEC
17442 || (GET_CODE (addr) == PLUS
17443 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17444 return orig;
17445 gcc_assert (GET_CODE (addr) == PLUS);
17448 if (GET_CODE (addr) == PLUS)
17450 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17452 /* Check first to see if this is a constant
17453 offset from a @GOTOFF symbol reference. */
17454 if (!TARGET_PECOFF
17455 && gotoff_operand (op0, Pmode)
17456 && CONST_INT_P (op1))
17458 if (!TARGET_64BIT)
17460 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17461 UNSPEC_GOTOFF);
17462 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17463 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17465 if (reg != 0)
17467 gcc_assert (REG_P (reg));
17468 new_rtx = expand_simple_binop (Pmode, PLUS,
17469 pic_offset_table_rtx,
17470 new_rtx, reg, 1,
17471 OPTAB_DIRECT);
17473 else
17474 new_rtx
17475 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17477 else
17479 if (INTVAL (op1) < -16*1024*1024
17480 || INTVAL (op1) >= 16*1024*1024)
17482 if (!x86_64_immediate_operand (op1, Pmode))
17483 op1 = force_reg (Pmode, op1);
17485 new_rtx
17486 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17490 else
17492 rtx base = legitimize_pic_address (op0, reg);
17493 machine_mode mode = GET_MODE (base);
17494 new_rtx
17495 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17497 if (CONST_INT_P (new_rtx))
17499 if (INTVAL (new_rtx) < -16*1024*1024
17500 || INTVAL (new_rtx) >= 16*1024*1024)
17502 if (!x86_64_immediate_operand (new_rtx, mode))
17503 new_rtx = force_reg (mode, new_rtx);
17505 new_rtx
17506 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17508 else
17509 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17511 else
17513 /* For %rip addressing, we have to use
17514 just disp32, not base nor index. */
17515 if (TARGET_64BIT
17516 && (GET_CODE (base) == SYMBOL_REF
17517 || GET_CODE (base) == LABEL_REF))
17518 base = force_reg (mode, base);
17519 if (GET_CODE (new_rtx) == PLUS
17520 && CONSTANT_P (XEXP (new_rtx, 1)))
17522 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17523 new_rtx = XEXP (new_rtx, 1);
17525 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17530 return new_rtx;
17533 /* Load the thread pointer. If TO_REG is true, force it into a register. */
17535 static rtx
17536 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17538 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17540 if (GET_MODE (tp) != tp_mode)
17542 gcc_assert (GET_MODE (tp) == SImode);
17543 gcc_assert (tp_mode == DImode);
17545 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17548 if (to_reg)
17549 tp = copy_to_mode_reg (tp_mode, tp);
17551 return tp;
17554 /* Construct the SYMBOL_REF for the tls_get_addr function. */
17556 static GTY(()) rtx ix86_tls_symbol;
17558 static rtx
17559 ix86_tls_get_addr (void)
17561 if (!ix86_tls_symbol)
17563 const char *sym
17564 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17565 ? "___tls_get_addr" : "__tls_get_addr");
17567 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17570 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17572 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17573 UNSPEC_PLTOFF);
17574 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17575 gen_rtx_CONST (Pmode, unspec));
17578 return ix86_tls_symbol;
17581 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
17583 static GTY(()) rtx ix86_tls_module_base_symbol;
17586 ix86_tls_module_base (void)
17588 if (!ix86_tls_module_base_symbol)
17590 ix86_tls_module_base_symbol
17591 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17593 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17594 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17597 return ix86_tls_module_base_symbol;
17600 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17601 false if we expect this to be used for a memory address and true if
17602 we expect to load the address into a register. */
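/* (Rough summary of the models handled below, added for orientation:
   global-dynamic and local-dynamic call __tls_get_addr (or, with
   -mtls-dialect=gnu2, use the TLSDESC-style sequences), initial-exec loads
   the @gottpoff offset from the GOT and adds the thread pointer, and
   local-exec applies the constant @tpoff/@ntpoff offset directly against
   the %fs/%gs thread pointer.) */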
17604 static rtx
17605 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17607 rtx dest, base, off;
17608 rtx pic = NULL_RTX, tp = NULL_RTX;
17609 machine_mode tp_mode = Pmode;
17610 int type;
17612 /* Fall back to the global dynamic model if the toolchain cannot support
17613 local dynamic. */
17614 if (TARGET_SUN_TLS && !TARGET_64BIT
17615 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17616 && model == TLS_MODEL_LOCAL_DYNAMIC)
17617 model = TLS_MODEL_GLOBAL_DYNAMIC;
17619 switch (model)
17621 case TLS_MODEL_GLOBAL_DYNAMIC:
17622 dest = gen_reg_rtx (Pmode);
17624 if (!TARGET_64BIT)
17626 if (flag_pic && !TARGET_PECOFF)
17627 pic = pic_offset_table_rtx;
17628 else
17630 pic = gen_reg_rtx (Pmode);
17631 emit_insn (gen_set_got (pic));
17635 if (TARGET_GNU2_TLS)
17637 if (TARGET_64BIT)
17638 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17639 else
17640 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17642 tp = get_thread_pointer (Pmode, true);
17643 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17645 if (GET_MODE (x) != Pmode)
17646 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17648 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17650 else
17652 rtx caddr = ix86_tls_get_addr ();
17654 if (TARGET_64BIT)
17656 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17657 rtx_insn *insns;
17659 start_sequence ();
17660 emit_call_insn
17661 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17662 insns = get_insns ();
17663 end_sequence ();
17665 if (GET_MODE (x) != Pmode)
17666 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17668 RTL_CONST_CALL_P (insns) = 1;
17669 emit_libcall_block (insns, dest, rax, x);
17671 else
17672 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17674 break;
17676 case TLS_MODEL_LOCAL_DYNAMIC:
17677 base = gen_reg_rtx (Pmode);
17679 if (!TARGET_64BIT)
17681 if (flag_pic)
17682 pic = pic_offset_table_rtx;
17683 else
17685 pic = gen_reg_rtx (Pmode);
17686 emit_insn (gen_set_got (pic));
17690 if (TARGET_GNU2_TLS)
17692 rtx tmp = ix86_tls_module_base ();
17694 if (TARGET_64BIT)
17695 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17696 else
17697 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17699 tp = get_thread_pointer (Pmode, true);
17700 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17701 gen_rtx_MINUS (Pmode, tmp, tp));
17703 else
17705 rtx caddr = ix86_tls_get_addr ();
17707 if (TARGET_64BIT)
17709 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17710 rtx_insn *insns;
17711 rtx eqv;
17713 start_sequence ();
17714 emit_call_insn
17715 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17716 insns = get_insns ();
17717 end_sequence ();
17719 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17720 share the LD_BASE result with other LD model accesses. */
17721 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17722 UNSPEC_TLS_LD_BASE);
17724 RTL_CONST_CALL_P (insns) = 1;
17725 emit_libcall_block (insns, base, rax, eqv);
17727 else
17728 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17731 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17732 off = gen_rtx_CONST (Pmode, off);
17734 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17736 if (TARGET_GNU2_TLS)
17738 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17740 if (GET_MODE (x) != Pmode)
17741 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17743 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17745 break;
17747 case TLS_MODEL_INITIAL_EXEC:
17748 if (TARGET_64BIT)
17750 if (TARGET_SUN_TLS && !TARGET_X32)
17752 /* The Sun linker took the AMD64 TLS spec literally
17753 and can only handle %rax as destination of the
17754 initial executable code sequence. */
17756 dest = gen_reg_rtx (DImode);
17757 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17758 return dest;
17761 /* Generate DImode references to avoid %fs:(%reg32)
17762 problems and linker IE->LE relaxation bug. */
17763 tp_mode = DImode;
17764 pic = NULL;
17765 type = UNSPEC_GOTNTPOFF;
17767 else if (flag_pic)
17769 pic = pic_offset_table_rtx;
17770 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17772 else if (!TARGET_ANY_GNU_TLS)
17774 pic = gen_reg_rtx (Pmode);
17775 emit_insn (gen_set_got (pic));
17776 type = UNSPEC_GOTTPOFF;
17778 else
17780 pic = NULL;
17781 type = UNSPEC_INDNTPOFF;
17784 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17785 off = gen_rtx_CONST (tp_mode, off);
17786 if (pic)
17787 off = gen_rtx_PLUS (tp_mode, pic, off);
17788 off = gen_const_mem (tp_mode, off);
17789 set_mem_alias_set (off, ix86_GOT_alias_set ());
17791 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17793 base = get_thread_pointer (tp_mode,
17794 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17795 off = force_reg (tp_mode, off);
17796 dest = gen_rtx_PLUS (tp_mode, base, off);
17797 if (tp_mode != Pmode)
17798 dest = convert_to_mode (Pmode, dest, 1);
17800 else
17802 base = get_thread_pointer (Pmode, true);
17803 dest = gen_reg_rtx (Pmode);
17804 emit_insn (ix86_gen_sub3 (dest, base, off));
17806 break;
17808 case TLS_MODEL_LOCAL_EXEC:
17809 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17810 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17811 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17812 off = gen_rtx_CONST (Pmode, off);
17814 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17816 base = get_thread_pointer (Pmode,
17817 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17818 return gen_rtx_PLUS (Pmode, base, off);
17820 else
17822 base = get_thread_pointer (Pmode, true);
17823 dest = gen_reg_rtx (Pmode);
17824 emit_insn (ix86_gen_sub3 (dest, base, off));
17826 break;
17828 default:
17829 gcc_unreachable ();
17832 return dest;
17835 /* Return true if OP refers to a TLS address. */
17836 bool
17837 ix86_tls_address_pattern_p (rtx op)
17839 subrtx_var_iterator::array_type array;
17840 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
17842 rtx op = *iter;
17843 if (MEM_P (op))
17845 rtx *x = &XEXP (op, 0);
17846 while (GET_CODE (*x) == PLUS)
17848 int i;
17849 for (i = 0; i < 2; i++)
17851 rtx u = XEXP (*x, i);
17852 if (GET_CODE (u) == ZERO_EXTEND)
17853 u = XEXP (u, 0);
17854 if (GET_CODE (u) == UNSPEC
17855 && XINT (u, 1) == UNSPEC_TP)
17856 return true;
17858 x = &XEXP (*x, 0);
17861 iter.skip_subrtxes ();
17865 return false;
17868 /* Rewrite *LOC so that it refers to a default TLS address space. */
17869 void
17870 ix86_rewrite_tls_address_1 (rtx *loc)
17872 subrtx_ptr_iterator::array_type array;
17873 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
17875 rtx *loc = *iter;
17876 if (MEM_P (*loc))
17878 rtx addr = XEXP (*loc, 0);
17879 rtx *x = &addr;
17880 while (GET_CODE (*x) == PLUS)
17882 int i;
17883 for (i = 0; i < 2; i++)
17885 rtx u = XEXP (*x, i);
17886 if (GET_CODE (u) == ZERO_EXTEND)
17887 u = XEXP (u, 0);
17888 if (GET_CODE (u) == UNSPEC
17889 && XINT (u, 1) == UNSPEC_TP)
17891 addr_space_t as = DEFAULT_TLS_SEG_REG;
17893 *x = XEXP (*x, 1 - i);
17895 *loc = replace_equiv_address_nv (*loc, addr, true);
17896 set_mem_addr_space (*loc, as);
17897 return;
17900 x = &XEXP (*x, 0);
17903 iter.skip_subrtxes ();
17908 /* Rewrite an instruction pattern involving a TLS address
17909 so that it refers to the default TLS address space. */
17911 ix86_rewrite_tls_address (rtx pattern)
17913 pattern = copy_insn (pattern);
17914 ix86_rewrite_tls_address_1 (&pattern);
17915 return pattern;
17918 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17919 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17920 unique refptr-DECL symbol corresponding to symbol DECL. */
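/* (Roughly speaking: a reference to a dllimported symbol "foo" is turned
   into a load through the import-table slot "__imp_foo" (modulo the user
   label prefix), while the non-import case creates a "refptr." indirection
   cell instead.) */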
17922 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17924 static inline hashval_t hash (tree_map *m) { return m->hash; }
17925 static inline bool
17926 equal (tree_map *a, tree_map *b)
17928 return a->base.from == b->base.from;
17931 static int
17932 keep_cache_entry (tree_map *&m)
17934 return ggc_marked_p (m->base.from);
17938 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17940 static tree
17941 get_dllimport_decl (tree decl, bool beimport)
17943 struct tree_map *h, in;
17944 const char *name;
17945 const char *prefix;
17946 size_t namelen, prefixlen;
17947 char *imp_name;
17948 tree to;
17949 rtx rtl;
17951 if (!dllimport_map)
17952 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17954 in.hash = htab_hash_pointer (decl);
17955 in.base.from = decl;
17956 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17957 h = *loc;
17958 if (h)
17959 return h->to;
17961 *loc = h = ggc_alloc<tree_map> ();
17962 h->hash = in.hash;
17963 h->base.from = decl;
17964 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17965 VAR_DECL, NULL, ptr_type_node);
17966 DECL_ARTIFICIAL (to) = 1;
17967 DECL_IGNORED_P (to) = 1;
17968 DECL_EXTERNAL (to) = 1;
17969 TREE_READONLY (to) = 1;
17971 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17972 name = targetm.strip_name_encoding (name);
17973 if (beimport)
17974 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17975 ? "*__imp_" : "*__imp__";
17976 else
17977 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17978 namelen = strlen (name);
17979 prefixlen = strlen (prefix);
17980 imp_name = (char *) alloca (namelen + prefixlen + 1);
17981 memcpy (imp_name, prefix, prefixlen);
17982 memcpy (imp_name + prefixlen, name, namelen + 1);
17984 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17985 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17986 SET_SYMBOL_REF_DECL (rtl, to);
17987 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17988 if (!beimport)
17990 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17991 #ifdef SUB_TARGET_RECORD_STUB
17992 SUB_TARGET_RECORD_STUB (name);
17993 #endif
17996 rtl = gen_const_mem (Pmode, rtl);
17997 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17999 SET_DECL_RTL (to, rtl);
18000 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
18002 return to;
18005 /* Expand SYMBOL into its corresponding far-address symbol.
18006 WANT_REG is true if we require the result be a register. */
18008 static rtx
18009 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
18011 tree imp_decl;
18012 rtx x;
18014 gcc_assert (SYMBOL_REF_DECL (symbol));
18015 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
18017 x = DECL_RTL (imp_decl);
18018 if (want_reg)
18019 x = force_reg (Pmode, x);
18020 return x;
18023 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
18024 true if we require the result be a register. */
18026 static rtx
18027 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
18029 tree imp_decl;
18030 rtx x;
18032 gcc_assert (SYMBOL_REF_DECL (symbol));
18033 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
18035 x = DECL_RTL (imp_decl);
18036 if (want_reg)
18037 x = force_reg (Pmode, x);
18038 return x;
18041 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
18042 is true if we require the result be a register. */
18044 static rtx
18045 legitimize_pe_coff_symbol (rtx addr, bool inreg)
18047 if (!TARGET_PECOFF)
18048 return NULL_RTX;
18050 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
18052 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
18053 return legitimize_dllimport_symbol (addr, inreg);
18054 if (GET_CODE (addr) == CONST
18055 && GET_CODE (XEXP (addr, 0)) == PLUS
18056 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
18057 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
18059 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
18060 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
18064 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
18065 return NULL_RTX;
18066 if (GET_CODE (addr) == SYMBOL_REF
18067 && !is_imported_p (addr)
18068 && SYMBOL_REF_EXTERNAL_P (addr)
18069 && SYMBOL_REF_DECL (addr))
18070 return legitimize_pe_coff_extern_decl (addr, inreg);
18072 if (GET_CODE (addr) == CONST
18073 && GET_CODE (XEXP (addr, 0)) == PLUS
18074 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
18075 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
18076 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
18077 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
18079 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
18080 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
18082 return NULL_RTX;
18085 /* Try machine-dependent ways of modifying an illegitimate address
18086 to be legitimate. If we find one, return the new, valid address.
18087 This macro is used in only one place: `memory_address' in explow.c.
18089 OLDX is the address as it was before break_out_memory_refs was called.
18090 In some cases it is useful to look at this to decide what needs to be done.
18092 It is always safe for this macro to do nothing. It exists to recognize
18093 opportunities to optimize the output.
18095 For the 80386, we handle X+REG by loading X into a register R and
18096 using R+REG. R will go in a general reg and indexing will be used.
18097 However, if REG is a broken-out memory address or multiplication,
18098 nothing needs to be done because REG can certainly go in a general reg.
18100 When -fpic is used, special handling is needed for symbolic references.
18101 See comments by legitimize_pic_address in i386.c for details. */
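/* A small illustrative sketch (not from the original source) of the
   canonicalizations performed below: an address such as
       (plus:SI (ashift:SI (reg:SI A) (const_int 2)) (reg:SI B))
   is rewritten as
       (plus:SI (mult:SI (reg:SI A) (const_int 4)) (reg:SI B))
   so that it matches the base + index*scale addressing forms, and operands
   that cannot serve as base or index terms are copied into fresh
   registers.  */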
18103 static rtx
18104 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
18106 bool changed = false;
18107 unsigned log;
18109 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
18110 if (log)
18111 return legitimize_tls_address (x, (enum tls_model) log, false);
18112 if (GET_CODE (x) == CONST
18113 && GET_CODE (XEXP (x, 0)) == PLUS
18114 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
18115 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
18117 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
18118 (enum tls_model) log, false);
18119 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
18122 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
18124 rtx tmp = legitimize_pe_coff_symbol (x, true);
18125 if (tmp)
18126 return tmp;
18129 if (flag_pic && SYMBOLIC_CONST (x))
18130 return legitimize_pic_address (x, 0);
18132 #if TARGET_MACHO
18133 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
18134 return machopic_indirect_data_reference (x, 0);
18135 #endif
18137 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
18138 if (GET_CODE (x) == ASHIFT
18139 && CONST_INT_P (XEXP (x, 1))
18140 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
18142 changed = true;
18143 log = INTVAL (XEXP (x, 1));
18144 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
18145 GEN_INT (1 << log));
18148 if (GET_CODE (x) == PLUS)
18150 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
18152 if (GET_CODE (XEXP (x, 0)) == ASHIFT
18153 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18154 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
18156 changed = true;
18157 log = INTVAL (XEXP (XEXP (x, 0), 1));
18158 XEXP (x, 0) = gen_rtx_MULT (Pmode,
18159 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
18160 GEN_INT (1 << log));
18163 if (GET_CODE (XEXP (x, 1)) == ASHIFT
18164 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
18165 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
18167 changed = true;
18168 log = INTVAL (XEXP (XEXP (x, 1), 1));
18169 XEXP (x, 1) = gen_rtx_MULT (Pmode,
18170 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
18171 GEN_INT (1 << log));
18174 /* Put multiply first if it isn't already. */
18175 if (GET_CODE (XEXP (x, 1)) == MULT)
18177 std::swap (XEXP (x, 0), XEXP (x, 1));
18178 changed = true;
18181 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
18182 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
18183 created by virtual register instantiation, register elimination, and
18184 similar optimizations. */
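/* For instance (illustrative only):
       (plus (mult (reg A) (const_int 4)) (plus (reg B) (const_int 16)))
   becomes
       (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 16)).  */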
18185 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
18187 changed = true;
18188 x = gen_rtx_PLUS (Pmode,
18189 gen_rtx_PLUS (Pmode, XEXP (x, 0),
18190 XEXP (XEXP (x, 1), 0)),
18191 XEXP (XEXP (x, 1), 1));
18194 /* Canonicalize
18195 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
18196 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
18197 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
18198 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
18199 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
18200 && CONSTANT_P (XEXP (x, 1)))
18202 rtx constant;
18203 rtx other = NULL_RTX;
18205 if (CONST_INT_P (XEXP (x, 1)))
18207 constant = XEXP (x, 1);
18208 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
18210 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
18212 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
18213 other = XEXP (x, 1);
18215 else
18216 constant = 0;
18218 if (constant)
18220 changed = true;
18221 x = gen_rtx_PLUS (Pmode,
18222 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
18223 XEXP (XEXP (XEXP (x, 0), 1), 0)),
18224 plus_constant (Pmode, other,
18225 INTVAL (constant)));
18229 if (changed && ix86_legitimate_address_p (mode, x, false))
18230 return x;
18232 if (GET_CODE (XEXP (x, 0)) == MULT)
18234 changed = true;
18235 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
18238 if (GET_CODE (XEXP (x, 1)) == MULT)
18240 changed = true;
18241 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
18244 if (changed
18245 && REG_P (XEXP (x, 1))
18246 && REG_P (XEXP (x, 0)))
18247 return x;
18249 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
18251 changed = true;
18252 x = legitimize_pic_address (x, 0);
18255 if (changed && ix86_legitimate_address_p (mode, x, false))
18256 return x;
18258 if (REG_P (XEXP (x, 0)))
18260 rtx temp = gen_reg_rtx (Pmode);
18261 rtx val = force_operand (XEXP (x, 1), temp);
18262 if (val != temp)
18264 val = convert_to_mode (Pmode, val, 1);
18265 emit_move_insn (temp, val);
18268 XEXP (x, 1) = temp;
18269 return x;
18272 else if (REG_P (XEXP (x, 1)))
18274 rtx temp = gen_reg_rtx (Pmode);
18275 rtx val = force_operand (XEXP (x, 0), temp);
18276 if (val != temp)
18278 val = convert_to_mode (Pmode, val, 1);
18279 emit_move_insn (temp, val);
18282 XEXP (x, 0) = temp;
18283 return x;
18287 return x;
18290 /* Print an integer constant expression in assembler syntax. Addition
18291 and subtraction are the only arithmetic that may appear in these
18292 expressions. FILE is the stdio stream to write to, X is the rtx, and
18293 CODE is the operand print code from the output string. */
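/* Illustrative example (not from the original source): for the rtx
       (const:SI (plus:SI (symbol_ref:SI "foo") (const_int 4)))
   this routine emits "4+foo" (the integer constant is printed first, as
   required by some assemblers), and suffixes such as "@GOT", "@GOTOFF"
   or "@PLT" are appended when the operand carries the matching UNSPEC or
   print code.  */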
18295 static void
18296 output_pic_addr_const (FILE *file, rtx x, int code)
18298 char buf[256];
18300 switch (GET_CODE (x))
18302 case PC:
18303 gcc_assert (flag_pic);
18304 putc ('.', file);
18305 break;
18307 case SYMBOL_REF:
18308 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
18309 output_addr_const (file, x);
18310 else
18312 const char *name = XSTR (x, 0);
18314 /* Mark the decl as referenced so that cgraph will
18315 output the function. */
18316 if (SYMBOL_REF_DECL (x))
18317 mark_decl_referenced (SYMBOL_REF_DECL (x));
18319 #if TARGET_MACHO
18320 if (MACHOPIC_INDIRECT
18321 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
18322 name = machopic_indirection_name (x, /*stub_p=*/true);
18323 #endif
18324 assemble_name (file, name);
18326 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
18327 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
18328 fputs ("@PLT", file);
18329 break;
18331 case LABEL_REF:
18332 x = XEXP (x, 0);
18333 /* FALLTHRU */
18334 case CODE_LABEL:
18335 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
18336 assemble_name (asm_out_file, buf);
18337 break;
18339 case CONST_INT:
18340 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18341 break;
18343 case CONST:
18344 /* This used to output parentheses around the expression,
18345 but that does not work on the 386 (either ATT or BSD assembler). */
18346 output_pic_addr_const (file, XEXP (x, 0), code);
18347 break;
18349 case CONST_DOUBLE:
18350 /* We can't handle floating point constants;
18351 TARGET_PRINT_OPERAND must handle them. */
18352 output_operand_lossage ("floating constant misused");
18353 break;
18355 case PLUS:
18356 /* Some assemblers need integer constants to appear first. */
18357 if (CONST_INT_P (XEXP (x, 0)))
18359 output_pic_addr_const (file, XEXP (x, 0), code);
18360 putc ('+', file);
18361 output_pic_addr_const (file, XEXP (x, 1), code);
18363 else
18365 gcc_assert (CONST_INT_P (XEXP (x, 1)));
18366 output_pic_addr_const (file, XEXP (x, 1), code);
18367 putc ('+', file);
18368 output_pic_addr_const (file, XEXP (x, 0), code);
18370 break;
18372 case MINUS:
18373 if (!TARGET_MACHO)
18374 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
18375 output_pic_addr_const (file, XEXP (x, 0), code);
18376 putc ('-', file);
18377 output_pic_addr_const (file, XEXP (x, 1), code);
18378 if (!TARGET_MACHO)
18379 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
18380 break;
18382 case UNSPEC:
18383 gcc_assert (XVECLEN (x, 0) == 1);
18384 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
18385 switch (XINT (x, 1))
18387 case UNSPEC_GOT:
18388 fputs ("@GOT", file);
18389 break;
18390 case UNSPEC_GOTOFF:
18391 fputs ("@GOTOFF", file);
18392 break;
18393 case UNSPEC_PLTOFF:
18394 fputs ("@PLTOFF", file);
18395 break;
18396 case UNSPEC_PCREL:
18397 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18398 "(%rip)" : "[rip]", file);
18399 break;
18400 case UNSPEC_GOTPCREL:
18401 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18402 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
18403 break;
18404 case UNSPEC_GOTTPOFF:
18405 /* FIXME: This might be @TPOFF in Sun ld too. */
18406 fputs ("@gottpoff", file);
18407 break;
18408 case UNSPEC_TPOFF:
18409 fputs ("@tpoff", file);
18410 break;
18411 case UNSPEC_NTPOFF:
18412 if (TARGET_64BIT)
18413 fputs ("@tpoff", file);
18414 else
18415 fputs ("@ntpoff", file);
18416 break;
18417 case UNSPEC_DTPOFF:
18418 fputs ("@dtpoff", file);
18419 break;
18420 case UNSPEC_GOTNTPOFF:
18421 if (TARGET_64BIT)
18422 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18423 "@gottpoff(%rip)": "@gottpoff[rip]", file);
18424 else
18425 fputs ("@gotntpoff", file);
18426 break;
18427 case UNSPEC_INDNTPOFF:
18428 fputs ("@indntpoff", file);
18429 break;
18430 #if TARGET_MACHO
18431 case UNSPEC_MACHOPIC_OFFSET:
18432 putc ('-', file);
18433 machopic_output_function_base_name (file);
18434 break;
18435 #endif
18436 default:
18437 output_operand_lossage ("invalid UNSPEC as operand");
18438 break;
18440 break;
18442 default:
18443 output_operand_lossage ("invalid expression as operand");
18447 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
18448 We need to emit DTP-relative relocations. */
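/* Illustrative output (assuming ASM_LONG expands to ".long"; not from the
   original source): for a 4-byte request on symbol foo this emits
       .long foo@dtpoff
   and for the 8-byte case it appends ", 0" as the upper half.  */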
18450 static void ATTRIBUTE_UNUSED
18451 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
18453 fputs (ASM_LONG, file);
18454 output_addr_const (file, x);
18455 fputs ("@dtpoff", file);
18456 switch (size)
18458 case 4:
18459 break;
18460 case 8:
18461 fputs (", 0", file);
18462 break;
18463 default:
18464 gcc_unreachable ();
18468 /* Return true if X is a representation of the PIC register. This copes
18469 with calls from ix86_find_base_term, where the register might have
18470 been replaced by a cselib value. */
18472 static bool
18473 ix86_pic_register_p (rtx x)
18475 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18476 return (pic_offset_table_rtx
18477 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18478 else if (!REG_P (x))
18479 return false;
18480 else if (pic_offset_table_rtx)
18482 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18483 return true;
18484 if (HARD_REGISTER_P (x)
18485 && !HARD_REGISTER_P (pic_offset_table_rtx)
18486 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18487 return true;
18488 return false;
18490 else
18491 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18494 /* Helper function for ix86_delegitimize_address.
18495 Attempt to delegitimize TLS local-exec accesses. */
18497 static rtx
18498 ix86_delegitimize_tls_address (rtx orig_x)
18500 rtx x = orig_x, unspec;
18501 struct ix86_address addr;
18503 if (!TARGET_TLS_DIRECT_SEG_REFS)
18504 return orig_x;
18505 if (MEM_P (x))
18506 x = XEXP (x, 0);
18507 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18508 return orig_x;
18509 if (ix86_decompose_address (x, &addr) == 0
18510 || addr.seg != DEFAULT_TLS_SEG_REG
18511 || addr.disp == NULL_RTX
18512 || GET_CODE (addr.disp) != CONST)
18513 return orig_x;
18514 unspec = XEXP (addr.disp, 0);
18515 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18516 unspec = XEXP (unspec, 0);
18517 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18518 return orig_x;
18519 x = XVECEXP (unspec, 0, 0);
18520 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18521 if (unspec != XEXP (addr.disp, 0))
18522 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18523 if (addr.index)
18525 rtx idx = addr.index;
18526 if (addr.scale != 1)
18527 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18528 x = gen_rtx_PLUS (Pmode, idx, x);
18530 if (addr.base)
18531 x = gen_rtx_PLUS (Pmode, addr.base, x);
18532 if (MEM_P (orig_x))
18533 x = replace_equiv_address_nv (orig_x, x);
18534 return x;
18537 /* In the name of slightly smaller debug output, and to cater to
18538 general assembler lossage, recognize PIC+GOTOFF and turn it back
18539 into a direct symbol reference.
18541 On Darwin, this is necessary to avoid a crash, because Darwin
18542 has a different PIC label for each routine but the DWARF debugging
18543 information is not associated with any particular routine, so it's
18544 necessary to remove references to the PIC label from RTL stored by
18545 the DWARF output code.
18547 This helper is used in the normal ix86_delegitimize_address
18548 entrypoint (e.g. used in the target delegitimization hook) and
18549 in ix86_find_base_term. As a compile-time memory optimization, we
18550 avoid allocating rtxes that would not change the outcome for
18551 the callers (find_base_value and find_base_term). */
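/* Illustrative example (not from the original source): a -m32 PIC access
   represented as
       (plus (reg:SI ebx)
             (const:SI (unspec:SI [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   is turned back into a reference to (symbol_ref "foo"), with any extra
   register or constant addends re-applied around it.  */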
18553 static inline rtx
18554 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18556 rtx orig_x = delegitimize_mem_from_attrs (x);
18557 /* addend is NULL or some rtx if x is something+GOTOFF where
18558 something doesn't include the PIC register. */
18559 rtx addend = NULL_RTX;
18560 /* reg_addend is NULL or a multiple of some register. */
18561 rtx reg_addend = NULL_RTX;
18562 /* const_addend is NULL or a const_int. */
18563 rtx const_addend = NULL_RTX;
18564 /* This is the result, or NULL. */
18565 rtx result = NULL_RTX;
18567 x = orig_x;
18569 if (MEM_P (x))
18570 x = XEXP (x, 0);
18572 if (TARGET_64BIT)
18574 if (GET_CODE (x) == CONST
18575 && GET_CODE (XEXP (x, 0)) == PLUS
18576 && GET_MODE (XEXP (x, 0)) == Pmode
18577 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18578 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18579 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18581 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18582 base. A CONST can't be arg_pointer_rtx based. */
18583 if (base_term_p && MEM_P (orig_x))
18584 return orig_x;
18585 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18586 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18587 if (MEM_P (orig_x))
18588 x = replace_equiv_address_nv (orig_x, x);
18589 return x;
18592 if (GET_CODE (x) == CONST
18593 && GET_CODE (XEXP (x, 0)) == UNSPEC
18594 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18595 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18596 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18598 x = XVECEXP (XEXP (x, 0), 0, 0);
18599 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18601 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18602 if (x == NULL_RTX)
18603 return orig_x;
18605 return x;
18608 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18609 return ix86_delegitimize_tls_address (orig_x);
18611 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18612 and -mcmodel=medium -fpic. */
18615 if (GET_CODE (x) != PLUS
18616 || GET_CODE (XEXP (x, 1)) != CONST)
18617 return ix86_delegitimize_tls_address (orig_x);
18619 if (ix86_pic_register_p (XEXP (x, 0)))
18620 /* %ebx + GOT/GOTOFF */
18622 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18624 /* %ebx + %reg * scale + GOT/GOTOFF */
18625 reg_addend = XEXP (x, 0);
18626 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18627 reg_addend = XEXP (reg_addend, 1);
18628 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18629 reg_addend = XEXP (reg_addend, 0);
18630 else
18632 reg_addend = NULL_RTX;
18633 addend = XEXP (x, 0);
18636 else
18637 addend = XEXP (x, 0);
18639 x = XEXP (XEXP (x, 1), 0);
18640 if (GET_CODE (x) == PLUS
18641 && CONST_INT_P (XEXP (x, 1)))
18643 const_addend = XEXP (x, 1);
18644 x = XEXP (x, 0);
18647 if (GET_CODE (x) == UNSPEC
18648 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18649 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18650 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18651 && !MEM_P (orig_x) && !addend)))
18652 result = XVECEXP (x, 0, 0);
18654 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18655 && !MEM_P (orig_x))
18656 result = XVECEXP (x, 0, 0);
18658 if (! result)
18659 return ix86_delegitimize_tls_address (orig_x);
18661 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18662 recurse on the first operand. */
18663 if (const_addend && !base_term_p)
18664 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18665 if (reg_addend)
18666 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18667 if (addend)
18669 /* If the rest of the original X doesn't involve the PIC register, add
18670 addend and subtract pic_offset_table_rtx. This can happen e.g.
18671 for code like:
18672 leal (%ebx, %ecx, 4), %ecx
18674 movl foo@GOTOFF(%ecx), %edx
18675 in which case we return (%ecx - %ebx) + foo
18676 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18677 and reload has completed. */
18678 if (pic_offset_table_rtx
18679 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18680 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18681 pic_offset_table_rtx),
18682 result);
18683 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18685 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18686 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18687 result = gen_rtx_PLUS (Pmode, tmp, result);
18689 else
18690 return orig_x;
18692 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18694 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18695 if (result == NULL_RTX)
18696 return orig_x;
18698 return result;
18701 /* The normal instantiation of the above template. */
18703 static rtx
18704 ix86_delegitimize_address (rtx x)
18706 return ix86_delegitimize_address_1 (x, false);
18709 /* If X is a machine specific address (i.e. a symbol or label being
18710 referenced as a displacement from the GOT implemented using an
18711 UNSPEC), then return the base term. Otherwise return X. */
18714 ix86_find_base_term (rtx x)
18716 rtx term;
18718 if (TARGET_64BIT)
18720 if (GET_CODE (x) != CONST)
18721 return x;
18722 term = XEXP (x, 0);
18723 if (GET_CODE (term) == PLUS
18724 && CONST_INT_P (XEXP (term, 1)))
18725 term = XEXP (term, 0);
18726 if (GET_CODE (term) != UNSPEC
18727 || (XINT (term, 1) != UNSPEC_GOTPCREL
18728 && XINT (term, 1) != UNSPEC_PCREL))
18729 return x;
18731 return XVECEXP (term, 0, 0);
18734 return ix86_delegitimize_address_1 (x, true);
18737 static void
18738 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18739 bool fp, FILE *file)
18741 const char *suffix;
18743 if (mode == CCFPmode || mode == CCFPUmode)
18745 code = ix86_fp_compare_code_to_integer (code);
18746 mode = CCmode;
18748 if (reverse)
18749 code = reverse_condition (code);
18751 switch (code)
18753 case EQ:
18754 switch (mode)
18756 case E_CCAmode:
18757 suffix = "a";
18758 break;
18759 case E_CCCmode:
18760 suffix = "c";
18761 break;
18762 case E_CCOmode:
18763 suffix = "o";
18764 break;
18765 case E_CCPmode:
18766 suffix = "p";
18767 break;
18768 case E_CCSmode:
18769 suffix = "s";
18770 break;
18771 default:
18772 suffix = "e";
18773 break;
18775 break;
18776 case NE:
18777 switch (mode)
18779 case E_CCAmode:
18780 suffix = "na";
18781 break;
18782 case E_CCCmode:
18783 suffix = "nc";
18784 break;
18785 case E_CCOmode:
18786 suffix = "no";
18787 break;
18788 case E_CCPmode:
18789 suffix = "np";
18790 break;
18791 case E_CCSmode:
18792 suffix = "ns";
18793 break;
18794 default:
18795 suffix = "ne";
18796 break;
18798 break;
18799 case GT:
18800 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18801 suffix = "g";
18802 break;
18803 case GTU:
18804 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18805 Those same assemblers have the same but opposite lossage on cmov. */
18806 if (mode == CCmode)
18807 suffix = fp ? "nbe" : "a";
18808 else
18809 gcc_unreachable ();
18810 break;
18811 case LT:
18812 switch (mode)
18814 case E_CCNOmode:
18815 case E_CCGOCmode:
18816 suffix = "s";
18817 break;
18819 case E_CCmode:
18820 case E_CCGCmode:
18821 suffix = "l";
18822 break;
18824 default:
18825 gcc_unreachable ();
18827 break;
18828 case LTU:
18829 if (mode == CCmode)
18830 suffix = "b";
18831 else if (mode == CCCmode)
18832 suffix = fp ? "b" : "c";
18833 else
18834 gcc_unreachable ();
18835 break;
18836 case GE:
18837 switch (mode)
18839 case E_CCNOmode:
18840 case E_CCGOCmode:
18841 suffix = "ns";
18842 break;
18844 case E_CCmode:
18845 case E_CCGCmode:
18846 suffix = "ge";
18847 break;
18849 default:
18850 gcc_unreachable ();
18852 break;
18853 case GEU:
18854 if (mode == CCmode)
18855 suffix = "nb";
18856 else if (mode == CCCmode)
18857 suffix = fp ? "nb" : "nc";
18858 else
18859 gcc_unreachable ();
18860 break;
18861 case LE:
18862 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18863 suffix = "le";
18864 break;
18865 case LEU:
18866 if (mode == CCmode)
18867 suffix = "be";
18868 else
18869 gcc_unreachable ();
18870 break;
18871 case UNORDERED:
18872 suffix = fp ? "u" : "p";
18873 break;
18874 case ORDERED:
18875 suffix = fp ? "nu" : "np";
18876 break;
18877 default:
18878 gcc_unreachable ();
18880 fputs (suffix, file);
18883 /* Print the name of register X to FILE based on its machine mode and number.
18884 If CODE is 'w', pretend the mode is HImode.
18885 If CODE is 'b', pretend the mode is QImode.
18886 If CODE is 'k', pretend the mode is SImode.
18887 If CODE is 'q', pretend the mode is DImode.
18888 If CODE is 'x', pretend the mode is V4SFmode.
18889 If CODE is 't', pretend the mode is V8SFmode.
18890 If CODE is 'g', pretend the mode is V16SFmode.
18891 If CODE is 'h', pretend the reg is the 'high' byte register.
18892 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
18893 If CODE is 'd', duplicate the operand for AVX instruction.
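/* Illustrative examples (assuming AT&T syntax; not from the original
   source): for the DImode register ax, code 'b' prints "%al", 'w' prints
   "%ax", 'k' prints "%eax" and 'q' prints "%rax"; for an SSE register,
   't' and 'g' select the "%ymm" and "%zmm" forms respectively.  */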
18896 void
18897 print_reg (rtx x, int code, FILE *file)
18899 const char *reg;
18900 int msize;
18901 unsigned int regno;
18902 bool duplicated;
18904 if (ASSEMBLER_DIALECT == ASM_ATT)
18905 putc ('%', file);
18907 if (x == pc_rtx)
18909 gcc_assert (TARGET_64BIT);
18910 fputs ("rip", file);
18911 return;
18914 if (code == 'y' && STACK_TOP_P (x))
18916 fputs ("st(0)", file);
18917 return;
18920 if (code == 'w')
18921 msize = 2;
18922 else if (code == 'b')
18923 msize = 1;
18924 else if (code == 'k')
18925 msize = 4;
18926 else if (code == 'q')
18927 msize = 8;
18928 else if (code == 'h')
18929 msize = 0;
18930 else if (code == 'x')
18931 msize = 16;
18932 else if (code == 't')
18933 msize = 32;
18934 else if (code == 'g')
18935 msize = 64;
18936 else
18937 msize = GET_MODE_SIZE (GET_MODE (x));
18939 regno = REGNO (x);
18941 if (regno == ARG_POINTER_REGNUM
18942 || regno == FRAME_POINTER_REGNUM
18943 || regno == FPSR_REG
18944 || regno == FPCR_REG)
18946 output_operand_lossage
18947 ("invalid use of register '%s'", reg_names[regno]);
18948 return;
18950 else if (regno == FLAGS_REG)
18952 output_operand_lossage ("invalid use of asm flag output");
18953 return;
18956 duplicated = code == 'd' && TARGET_AVX;
18958 switch (msize)
18960 case 16:
18961 case 12:
18962 case 8:
18963 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18964 warning (0, "unsupported size for integer register");
18965 /* FALLTHRU */
18966 case 4:
18967 if (LEGACY_INT_REGNO_P (regno))
18968 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18969 /* FALLTHRU */
18970 case 2:
18971 normal:
18972 reg = hi_reg_name[regno];
18973 break;
18974 case 1:
18975 if (regno >= ARRAY_SIZE (qi_reg_name))
18976 goto normal;
18977 if (!ANY_QI_REGNO_P (regno))
18978 error ("unsupported size for integer register");
18979 reg = qi_reg_name[regno];
18980 break;
18981 case 0:
18982 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18983 goto normal;
18984 reg = qi_high_reg_name[regno];
18985 break;
18986 case 32:
18987 case 64:
18988 if (SSE_REGNO_P (regno))
18990 gcc_assert (!duplicated);
18991 putc (msize == 32 ? 'y' : 'z', file);
18992 reg = hi_reg_name[regno] + 1;
18993 break;
18995 goto normal;
18996 default:
18997 gcc_unreachable ();
19000 fputs (reg, file);
19002 /* Irritatingly, AMD extended registers use a
19003 different naming convention: "r%d[bwd]". */
19004 if (REX_INT_REGNO_P (regno))
19006 gcc_assert (TARGET_64BIT);
19007 switch (msize)
19009 case 0:
19010 error ("extended registers have no high halves");
19011 break;
19012 case 1:
19013 putc ('b', file);
19014 break;
19015 case 2:
19016 putc ('w', file);
19017 break;
19018 case 4:
19019 putc ('d', file);
19020 break;
19021 case 8:
19022 /* no suffix */
19023 break;
19024 default:
19025 error ("unsupported operand size for extended register");
19026 break;
19028 return;
19031 if (duplicated)
19033 if (ASSEMBLER_DIALECT == ASM_ATT)
19034 fprintf (file, ", %%%s", reg);
19035 else
19036 fprintf (file, ", %s", reg);
19040 /* Meaning of CODE:
19041 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
19042 C -- print opcode suffix for set/cmov insn.
19043 c -- like C, but print reversed condition
19044 F,f -- likewise, but for floating-point.
19045 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
19046 otherwise nothing
19047 R -- print embedded rounding and sae.
19048 r -- print only sae.
19049 z -- print the opcode suffix for the size of the current operand.
19050 Z -- likewise, with special suffixes for x87 instructions.
19051 * -- print a star (in certain assembler syntax)
19052 A -- print an absolute memory reference.
19053 E -- print address with DImode register names if TARGET_64BIT.
19054 w -- print the operand as if it's a "word" (HImode) even if it isn't.
19055 s -- print a shift double count, followed by the assembler's argument
19056 delimiter.
19057 b -- print the QImode name of the register for the indicated operand.
19058 %b0 would print %al if operands[0] is reg 0.
19059 w -- likewise, print the HImode name of the register.
19060 k -- likewise, print the SImode name of the register.
19061 q -- likewise, print the DImode name of the register.
19062 x -- likewise, print the V4SFmode name of the register.
19063 t -- likewise, print the V8SFmode name of the register.
19064 g -- likewise, print the V16SFmode name of the register.
19065 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
19066 y -- print "st(0)" instead of "st" as a register.
19067 d -- print duplicated register operand for AVX instruction.
19068 D -- print condition for SSE cmp instruction.
19069 P -- if PIC, print an @PLT suffix.
19070 p -- print raw symbol name.
19071 X -- don't print any sort of PIC '@' suffix for a symbol.
19072 & -- print some in-use local-dynamic symbol name.
19073 H -- print a memory address offset by 8; used for sse high-parts
19074 Y -- print condition for XOP pcom* instruction.
19075 + -- print a branch hint as 'cs' or 'ds' prefix
19076 ; -- print a semicolon (after prefixes due to bug in older gas).
19077 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
19078 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
19079 ! -- print MPX prefix for jxx/call/ret instructions if required.
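/* Illustrative example (not from the original source): in an insn template
   such as
       "mov%z0\t{%1, %0|%0, %1}"
   the %z0 part expands to the b/w/l/q suffix matching operand 0's size in
   AT&T syntax and to nothing for Intel syntax, while the {att|intel}
   braces are resolved by the generic output machinery according to
   ASSEMBLER_DIALECT.  */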
19082 void
19083 ix86_print_operand (FILE *file, rtx x, int code)
19085 if (code)
19087 switch (code)
19089 case 'A':
19090 switch (ASSEMBLER_DIALECT)
19092 case ASM_ATT:
19093 putc ('*', file);
19094 break;
19096 case ASM_INTEL:
19097 /* Intel syntax. For absolute addresses, registers should not
19098 be surrounded by braces. */
19099 if (!REG_P (x))
19101 putc ('[', file);
19102 ix86_print_operand (file, x, 0);
19103 putc (']', file);
19104 return;
19106 break;
19108 default:
19109 gcc_unreachable ();
19112 ix86_print_operand (file, x, 0);
19113 return;
19115 case 'E':
19116 /* Wrap address in an UNSPEC to declare special handling. */
19117 if (TARGET_64BIT)
19118 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
19120 output_address (VOIDmode, x);
19121 return;
19123 case 'L':
19124 if (ASSEMBLER_DIALECT == ASM_ATT)
19125 putc ('l', file);
19126 return;
19128 case 'W':
19129 if (ASSEMBLER_DIALECT == ASM_ATT)
19130 putc ('w', file);
19131 return;
19133 case 'B':
19134 if (ASSEMBLER_DIALECT == ASM_ATT)
19135 putc ('b', file);
19136 return;
19138 case 'Q':
19139 if (ASSEMBLER_DIALECT == ASM_ATT)
19140 putc ('l', file);
19141 return;
19143 case 'S':
19144 if (ASSEMBLER_DIALECT == ASM_ATT)
19145 putc ('s', file);
19146 return;
19148 case 'T':
19149 if (ASSEMBLER_DIALECT == ASM_ATT)
19150 putc ('t', file);
19151 return;
19153 case 'O':
19154 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
19155 if (ASSEMBLER_DIALECT != ASM_ATT)
19156 return;
19158 switch (GET_MODE_SIZE (GET_MODE (x)))
19160 case 2:
19161 putc ('w', file);
19162 break;
19164 case 4:
19165 putc ('l', file);
19166 break;
19168 case 8:
19169 putc ('q', file);
19170 break;
19172 default:
19173 output_operand_lossage ("invalid operand size for operand "
19174 "code 'O'");
19175 return;
19178 putc ('.', file);
19179 #endif
19180 return;
19182 case 'z':
19183 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
19185 /* Opcodes don't get size suffixes if using Intel opcodes. */
19186 if (ASSEMBLER_DIALECT == ASM_INTEL)
19187 return;
19189 switch (GET_MODE_SIZE (GET_MODE (x)))
19191 case 1:
19192 putc ('b', file);
19193 return;
19195 case 2:
19196 putc ('w', file);
19197 return;
19199 case 4:
19200 putc ('l', file);
19201 return;
19203 case 8:
19204 putc ('q', file);
19205 return;
19207 default:
19208 output_operand_lossage ("invalid operand size for operand "
19209 "code 'z'");
19210 return;
19214 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
19215 warning (0, "non-integer operand used with operand code 'z'");
19216 /* FALLTHRU */
19218 case 'Z':
19219 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
19220 if (ASSEMBLER_DIALECT == ASM_INTEL)
19221 return;
19223 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
19225 switch (GET_MODE_SIZE (GET_MODE (x)))
19227 case 2:
19228 #ifdef HAVE_AS_IX86_FILDS
19229 putc ('s', file);
19230 #endif
19231 return;
19233 case 4:
19234 putc ('l', file);
19235 return;
19237 case 8:
19238 #ifdef HAVE_AS_IX86_FILDQ
19239 putc ('q', file);
19240 #else
19241 fputs ("ll", file);
19242 #endif
19243 return;
19245 default:
19246 break;
19249 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
19251 /* 387 opcodes don't get size suffixes
19252 if the operands are registers. */
19253 if (STACK_REG_P (x))
19254 return;
19256 switch (GET_MODE_SIZE (GET_MODE (x)))
19258 case 4:
19259 putc ('s', file);
19260 return;
19262 case 8:
19263 putc ('l', file);
19264 return;
19266 case 12:
19267 case 16:
19268 putc ('t', file);
19269 return;
19271 default:
19272 break;
19275 else
19277 output_operand_lossage ("invalid operand type used with "
19278 "operand code 'Z'");
19279 return;
19282 output_operand_lossage ("invalid operand size for operand code 'Z'");
19283 return;
19285 case 'd':
19286 case 'b':
19287 case 'w':
19288 case 'k':
19289 case 'q':
19290 case 'h':
19291 case 't':
19292 case 'g':
19293 case 'y':
19294 case 'x':
19295 case 'X':
19296 case 'P':
19297 case 'p':
19298 break;
19300 case 's':
19301 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
19303 ix86_print_operand (file, x, 0);
19304 fputs (", ", file);
19306 return;
19308 case 'Y':
19309 switch (GET_CODE (x))
19311 case NE:
19312 fputs ("neq", file);
19313 break;
19314 case EQ:
19315 fputs ("eq", file);
19316 break;
19317 case GE:
19318 case GEU:
19319 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
19320 break;
19321 case GT:
19322 case GTU:
19323 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
19324 break;
19325 case LE:
19326 case LEU:
19327 fputs ("le", file);
19328 break;
19329 case LT:
19330 case LTU:
19331 fputs ("lt", file);
19332 break;
19333 case UNORDERED:
19334 fputs ("unord", file);
19335 break;
19336 case ORDERED:
19337 fputs ("ord", file);
19338 break;
19339 case UNEQ:
19340 fputs ("ueq", file);
19341 break;
19342 case UNGE:
19343 fputs ("nlt", file);
19344 break;
19345 case UNGT:
19346 fputs ("nle", file);
19347 break;
19348 case UNLE:
19349 fputs ("ule", file);
19350 break;
19351 case UNLT:
19352 fputs ("ult", file);
19353 break;
19354 case LTGT:
19355 fputs ("une", file);
19356 break;
19357 default:
19358 output_operand_lossage ("operand is not a condition code, "
19359 "invalid operand code 'Y'");
19360 return;
19362 return;
19364 case 'D':
19365 /* A little bit of brain damage here. The SSE compare instructions
19366 use completely different names for the comparisons than the
19367 fp conditional moves do. */
19368 switch (GET_CODE (x))
19370 case UNEQ:
19371 if (TARGET_AVX)
19373 fputs ("eq_us", file);
19374 break;
19376 /* FALLTHRU */
19377 case EQ:
19378 fputs ("eq", file);
19379 break;
19380 case UNLT:
19381 if (TARGET_AVX)
19383 fputs ("nge", file);
19384 break;
19386 /* FALLTHRU */
19387 case LT:
19388 fputs ("lt", file);
19389 break;
19390 case UNLE:
19391 if (TARGET_AVX)
19393 fputs ("ngt", file);
19394 break;
19396 /* FALLTHRU */
19397 case LE:
19398 fputs ("le", file);
19399 break;
19400 case UNORDERED:
19401 fputs ("unord", file);
19402 break;
19403 case LTGT:
19404 if (TARGET_AVX)
19406 fputs ("neq_oq", file);
19407 break;
19409 /* FALLTHRU */
19410 case NE:
19411 fputs ("neq", file);
19412 break;
19413 case GE:
19414 if (TARGET_AVX)
19416 fputs ("ge", file);
19417 break;
19419 /* FALLTHRU */
19420 case UNGE:
19421 fputs ("nlt", file);
19422 break;
19423 case GT:
19424 if (TARGET_AVX)
19426 fputs ("gt", file);
19427 break;
19429 /* FALLTHRU */
19430 case UNGT:
19431 fputs ("nle", file);
19432 break;
19433 case ORDERED:
19434 fputs ("ord", file);
19435 break;
19436 default:
19437 output_operand_lossage ("operand is not a condition code, "
19438 "invalid operand code 'D'");
19439 return;
19441 return;
19443 case 'F':
19444 case 'f':
19445 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
19446 if (ASSEMBLER_DIALECT == ASM_ATT)
19447 putc ('.', file);
19448 gcc_fallthrough ();
19449 #endif
19451 case 'C':
19452 case 'c':
19453 if (!COMPARISON_P (x))
19455 output_operand_lossage ("operand is not a condition code, "
19456 "invalid operand code '%c'", code);
19457 return;
19459 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
19460 code == 'c' || code == 'f',
19461 code == 'F' || code == 'f',
19462 file);
19463 return;
19465 case 'H':
19466 if (!offsettable_memref_p (x))
19468 output_operand_lossage ("operand is not an offsettable memory "
19469 "reference, invalid operand code 'H'");
19470 return;
19472 /* It doesn't actually matter what mode we use here, as we're
19473 only going to use this for printing. */
19474 x = adjust_address_nv (x, DImode, 8);
19475 /* Output 'qword ptr' for intel assembler dialect. */
19476 if (ASSEMBLER_DIALECT == ASM_INTEL)
19477 code = 'q';
19478 break;
19480 case 'K':
19481 if (!CONST_INT_P (x))
19483 output_operand_lossage ("operand is not an integer, invalid "
19484 "operand code 'K'");
19485 return;
19488 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19489 #ifdef HAVE_AS_IX86_HLE
19490 fputs ("xacquire ", file);
19491 #else
19492 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19493 #endif
19494 else if (INTVAL (x) & IX86_HLE_RELEASE)
19495 #ifdef HAVE_AS_IX86_HLE
19496 fputs ("xrelease ", file);
19497 #else
19498 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19499 #endif
19500 /* We do not want to print the value of the operand. */
19501 return;
19503 case 'N':
19504 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19505 fputs ("{z}", file);
19506 return;
19508 case 'r':
19509 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19511 output_operand_lossage ("operand is not a specific integer, "
19512 "invalid operand code 'r'");
19513 return;
19516 if (ASSEMBLER_DIALECT == ASM_INTEL)
19517 fputs (", ", file);
19519 fputs ("{sae}", file);
19521 if (ASSEMBLER_DIALECT == ASM_ATT)
19522 fputs (", ", file);
19524 return;
19526 case 'R':
19527 if (!CONST_INT_P (x))
19529 output_operand_lossage ("operand is not an integer, invalid "
19530 "operand code 'R'");
19531 return;
19534 if (ASSEMBLER_DIALECT == ASM_INTEL)
19535 fputs (", ", file);
19537 switch (INTVAL (x))
19539 case ROUND_NEAREST_INT | ROUND_SAE:
19540 fputs ("{rn-sae}", file);
19541 break;
19542 case ROUND_NEG_INF | ROUND_SAE:
19543 fputs ("{rd-sae}", file);
19544 break;
19545 case ROUND_POS_INF | ROUND_SAE:
19546 fputs ("{ru-sae}", file);
19547 break;
19548 case ROUND_ZERO | ROUND_SAE:
19549 fputs ("{rz-sae}", file);
19550 break;
19551 default:
19552 output_operand_lossage ("operand is not a specific integer, "
19553 "invalid operand code 'R'");
19556 if (ASSEMBLER_DIALECT == ASM_ATT)
19557 fputs (", ", file);
19559 return;
19561 case '*':
19562 if (ASSEMBLER_DIALECT == ASM_ATT)
19563 putc ('*', file);
19564 return;
19566 case '&':
19568 const char *name = get_some_local_dynamic_name ();
19569 if (name == NULL)
19570 output_operand_lossage ("'%%&' used without any "
19571 "local dynamic TLS references");
19572 else
19573 assemble_name (file, name);
19574 return;
19577 case '+':
19579 rtx x;
19581 if (!optimize
19582 || optimize_function_for_size_p (cfun)
19583 || !TARGET_BRANCH_PREDICTION_HINTS)
19584 return;
19586 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19587 if (x)
19589 int pred_val = profile_probability::from_reg_br_prob_note
19590 (XINT (x, 0)).to_reg_br_prob_base ();
19592 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19593 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19595 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19596 bool cputaken
19597 = final_forward_branch_p (current_output_insn) == 0;
19599 /* Emit hints only in cases where the default branch prediction
19600 heuristics would fail. */
19601 if (taken != cputaken)
19603 /* We use 3e (DS) prefix for taken branches and
19604 2e (CS) prefix for not taken branches. */
19605 if (taken)
19606 fputs ("ds ; ", file);
19607 else
19608 fputs ("cs ; ", file);
19612 return;
19615 case ';':
19616 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19617 putc (';', file);
19618 #endif
19619 return;
19621 case '~':
19622 putc (TARGET_AVX2 ? 'i' : 'f', file);
19623 return;
19625 case '^':
19626 if (TARGET_64BIT && Pmode != word_mode)
19627 fputs ("addr32 ", file);
19628 return;
19630 case '!':
19631 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19632 fputs ("bnd ", file);
19633 return;
19635 default:
19636 output_operand_lossage ("invalid operand code '%c'", code);
19640 if (REG_P (x))
19641 print_reg (x, code, file);
19643 else if (MEM_P (x))
19645 rtx addr = XEXP (x, 0);
19647 /* No `byte ptr' prefix for call instructions ... */
19648 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19650 machine_mode mode = GET_MODE (x);
19651 const char *size;
19653 /* Check for explicit size override codes. */
19654 if (code == 'b')
19655 size = "BYTE";
19656 else if (code == 'w')
19657 size = "WORD";
19658 else if (code == 'k')
19659 size = "DWORD";
19660 else if (code == 'q')
19661 size = "QWORD";
19662 else if (code == 'x')
19663 size = "XMMWORD";
19664 else if (code == 't')
19665 size = "YMMWORD";
19666 else if (code == 'g')
19667 size = "ZMMWORD";
19668 else if (mode == BLKmode)
19669 /* ... or BLKmode operands, when not overridden. */
19670 size = NULL;
19671 else
19672 switch (GET_MODE_SIZE (mode))
19674 case 1: size = "BYTE"; break;
19675 case 2: size = "WORD"; break;
19676 case 4: size = "DWORD"; break;
19677 case 8: size = "QWORD"; break;
19678 case 12: size = "TBYTE"; break;
19679 case 16:
19680 if (mode == XFmode)
19681 size = "TBYTE";
19682 else
19683 size = "XMMWORD";
19684 break;
19685 case 32: size = "YMMWORD"; break;
19686 case 64: size = "ZMMWORD"; break;
19687 default:
19688 gcc_unreachable ();
19690 if (size)
19692 fputs (size, file);
19693 fputs (" PTR ", file);
19697 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19698 output_operand_lossage ("invalid constraints for operand");
19699 else
19700 ix86_print_operand_address_as
19701 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19704 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19706 long l;
19708 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19710 if (ASSEMBLER_DIALECT == ASM_ATT)
19711 putc ('$', file);
19712 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19713 if (code == 'q')
19714 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19715 (unsigned long long) (int) l);
19716 else
19717 fprintf (file, "0x%08x", (unsigned int) l);
19720 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19722 long l[2];
19724 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19726 if (ASSEMBLER_DIALECT == ASM_ATT)
19727 putc ('$', file);
19728 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19731 /* These float cases don't actually occur as immediate operands. */
19732 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19734 char dstr[30];
19736 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19737 fputs (dstr, file);
19740 else
19742 /* We have patterns that allow zero sets of memory, for instance.
19743 In 64-bit mode, we should probably support all 8-byte vectors,
19744 since we can in fact encode that into an immediate. */
19745 if (GET_CODE (x) == CONST_VECTOR)
19747 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19748 x = const0_rtx;
19751 if (code != 'P' && code != 'p')
19753 if (CONST_INT_P (x))
19755 if (ASSEMBLER_DIALECT == ASM_ATT)
19756 putc ('$', file);
19758 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19759 || GET_CODE (x) == LABEL_REF)
19761 if (ASSEMBLER_DIALECT == ASM_ATT)
19762 putc ('$', file);
19763 else
19764 fputs ("OFFSET FLAT:", file);
19767 if (CONST_INT_P (x))
19768 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19769 else if (flag_pic || MACHOPIC_INDIRECT)
19770 output_pic_addr_const (file, x, code);
19771 else
19772 output_addr_const (file, x);
19776 static bool
19777 ix86_print_operand_punct_valid_p (unsigned char code)
19779 return (code == '*' || code == '+' || code == '&' || code == ';'
19780 || code == '~' || code == '^' || code == '!');
19783 /* Print a memory operand whose address is ADDR. */
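/* Illustrative output (not from the original source): for an address that
   decomposes into base %ebx, index %ecx, scale 4 and displacement 8, the
   AT&T dialect prints "8(%ebx,%ecx,4)" while the Intel dialect prints
   "[ebx+8+ecx*4]"; a "%fs:"/"fs:" or "%gs:"/"gs:" override is emitted
   first when the address uses a segment address space.  */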
19785 static void
19786 ix86_print_operand_address_as (FILE *file, rtx addr,
19787 addr_space_t as, bool no_rip)
19789 struct ix86_address parts;
19790 rtx base, index, disp;
19791 int scale;
19792 int ok;
19793 bool vsib = false;
19794 int code = 0;
19796 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19798 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19799 gcc_assert (parts.index == NULL_RTX);
19800 parts.index = XVECEXP (addr, 0, 1);
19801 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19802 addr = XVECEXP (addr, 0, 0);
19803 vsib = true;
19805 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19807 gcc_assert (TARGET_64BIT);
19808 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19809 code = 'q';
19811 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19813 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19814 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19815 if (parts.base != NULL_RTX)
19817 parts.index = parts.base;
19818 parts.scale = 1;
19820 parts.base = XVECEXP (addr, 0, 0);
19821 addr = XVECEXP (addr, 0, 0);
19823 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19825 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19826 gcc_assert (parts.index == NULL_RTX);
19827 parts.index = XVECEXP (addr, 0, 1);
19828 addr = XVECEXP (addr, 0, 0);
19830 else
19831 ok = ix86_decompose_address (addr, &parts);
19833 gcc_assert (ok);
19835 base = parts.base;
19836 index = parts.index;
19837 disp = parts.disp;
19838 scale = parts.scale;
19840 if (ADDR_SPACE_GENERIC_P (as))
19841 as = parts.seg;
19842 else
19843 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19845 if (!ADDR_SPACE_GENERIC_P (as))
19847 const char *string;
19849 if (as == ADDR_SPACE_SEG_FS)
19850 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19851 else if (as == ADDR_SPACE_SEG_GS)
19852 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19853 else
19854 gcc_unreachable ();
19855 fputs (string, file);
19858 /* Use one byte shorter RIP relative addressing for 64bit mode. */
19859 if (TARGET_64BIT && !base && !index && !no_rip)
19861 rtx symbol = disp;
19863 if (GET_CODE (disp) == CONST
19864 && GET_CODE (XEXP (disp, 0)) == PLUS
19865 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19866 symbol = XEXP (XEXP (disp, 0), 0);
19868 if (GET_CODE (symbol) == LABEL_REF
19869 || (GET_CODE (symbol) == SYMBOL_REF
19870 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19871 base = pc_rtx;
19874 if (!base && !index)
19877 /* A displacement-only address requires special attention. */
19877 if (CONST_INT_P (disp))
19879 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
19880 fputs ("ds:", file);
19881 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19883 /* Load the external function address via the GOT slot to avoid PLT. */
19884 else if (GET_CODE (disp) == CONST
19885 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19886 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19887 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19888 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19889 output_pic_addr_const (file, disp, 0);
19890 else if (flag_pic)
19891 output_pic_addr_const (file, disp, 0);
19892 else
19893 output_addr_const (file, disp);
19895 else
19897 /* Print SImode register names to force addr32 prefix. */
19898 if (SImode_address_operand (addr, VOIDmode))
19900 if (flag_checking)
19902 gcc_assert (TARGET_64BIT);
19903 switch (GET_CODE (addr))
19905 case SUBREG:
19906 gcc_assert (GET_MODE (addr) == SImode);
19907 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19908 break;
19909 case ZERO_EXTEND:
19910 case AND:
19911 gcc_assert (GET_MODE (addr) == DImode);
19912 break;
19913 default:
19914 gcc_unreachable ();
19917 gcc_assert (!code);
19918 code = 'k';
19920 else if (code == 0
19921 && TARGET_X32
19922 && disp
19923 && CONST_INT_P (disp)
19924 && INTVAL (disp) < -16*1024*1024)
19926 /* X32 runs in 64-bit mode, where displacement, DISP, in
19927 address DISP(%r64), is encoded as 32-bit immediate sign-
19928 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19929 address is %r64 + 0xffffffffbffffd00. When %r64 <
19930 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19931 which is invalid for x32. The correct address is %r64
19932 - 0x40000300 == 0xf7ffdd64. To properly encode
19933 -0x40000300(%r64) for x32, we zero-extend negative
19934 displacement by forcing addr32 prefix which truncates
19935 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19936 zero-extend all negative displacements, including -1(%rsp).
19937 However, for small negative displacements, sign-extension
19938 won't cause overflow. We only zero-extend negative
19939 displacements if they are < -16*1024*1024, which is also used
19940 to check legitimate address displacements for PIC. */
19941 code = 'k';
19944 if (ASSEMBLER_DIALECT == ASM_ATT)
19946 if (disp)
19948 if (flag_pic)
19949 output_pic_addr_const (file, disp, 0);
19950 else if (GET_CODE (disp) == LABEL_REF)
19951 output_asm_label (disp);
19952 else
19953 output_addr_const (file, disp);
19956 putc ('(', file);
19957 if (base)
19958 print_reg (base, code, file);
19959 if (index)
19961 putc (',', file);
19962 print_reg (index, vsib ? 0 : code, file);
19963 if (scale != 1 || vsib)
19964 fprintf (file, ",%d", scale);
19966 putc (')', file);
19968 else
19970 rtx offset = NULL_RTX;
19972 if (disp)
19974 /* Pull out the offset of a symbol; print any symbol itself. */
19975 if (GET_CODE (disp) == CONST
19976 && GET_CODE (XEXP (disp, 0)) == PLUS
19977 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19979 offset = XEXP (XEXP (disp, 0), 1);
19980 disp = gen_rtx_CONST (VOIDmode,
19981 XEXP (XEXP (disp, 0), 0));
19984 if (flag_pic)
19985 output_pic_addr_const (file, disp, 0);
19986 else if (GET_CODE (disp) == LABEL_REF)
19987 output_asm_label (disp);
19988 else if (CONST_INT_P (disp))
19989 offset = disp;
19990 else
19991 output_addr_const (file, disp);
19994 putc ('[', file);
19995 if (base)
19997 print_reg (base, code, file);
19998 if (offset)
20000 if (INTVAL (offset) >= 0)
20001 putc ('+', file);
20002 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
20005 else if (offset)
20006 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
20007 else
20008 putc ('0', file);
20010 if (index)
20012 putc ('+', file);
20013 print_reg (index, vsib ? 0 : code, file);
20014 if (scale != 1 || vsib)
20015 fprintf (file, "*%d", scale);
20017 putc (']', file);
20022 static void
20023 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
20025 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
20028 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
20030 static bool
20031 i386_asm_output_addr_const_extra (FILE *file, rtx x)
20033 rtx op;
20035 if (GET_CODE (x) != UNSPEC)
20036 return false;
20038 op = XVECEXP (x, 0, 0);
20039 switch (XINT (x, 1))
20041 case UNSPEC_GOTTPOFF:
20042 output_addr_const (file, op);
20043 /* FIXME: This might be @TPOFF in Sun ld. */
20044 fputs ("@gottpoff", file);
20045 break;
20046 case UNSPEC_TPOFF:
20047 output_addr_const (file, op);
20048 fputs ("@tpoff", file);
20049 break;
20050 case UNSPEC_NTPOFF:
20051 output_addr_const (file, op);
20052 if (TARGET_64BIT)
20053 fputs ("@tpoff", file);
20054 else
20055 fputs ("@ntpoff", file);
20056 break;
20057 case UNSPEC_DTPOFF:
20058 output_addr_const (file, op);
20059 fputs ("@dtpoff", file);
20060 break;
20061 case UNSPEC_GOTNTPOFF:
20062 output_addr_const (file, op);
20063 if (TARGET_64BIT)
20064 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
20065 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
20066 else
20067 fputs ("@gotntpoff", file);
20068 break;
20069 case UNSPEC_INDNTPOFF:
20070 output_addr_const (file, op);
20071 fputs ("@indntpoff", file);
20072 break;
20073 #if TARGET_MACHO
20074 case UNSPEC_MACHOPIC_OFFSET:
20075 output_addr_const (file, op);
20076 putc ('-', file);
20077 machopic_output_function_base_name (file);
20078 break;
20079 #endif
20081 default:
20082 return false;
20085 return true;
20088 /* Split one or more double-mode RTL references into pairs of half-mode
20089 references. The RTL can be REG, offsettable MEM, integer constant, or
20090 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
20091 split and "num" is its length. lo_half and hi_half are output arrays
20092 that parallel "operands". */
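/* Illustrative example (not from the original source): splitting a DImode
   register operand yields its two SImode word halves, roughly
       operands[0] = (reg:DI 100)
       lo_half[0]  = (subreg:SI (reg:DI 100) 0)
       hi_half[0]  = (subreg:SI (reg:DI 100) 4)
   while an offsettable MEM is split with adjust_address instead.  */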
20094 void
20095 split_double_mode (machine_mode mode, rtx operands[],
20096 int num, rtx lo_half[], rtx hi_half[])
20098 machine_mode half_mode;
20099 unsigned int byte;
20101 switch (mode)
20103 case E_TImode:
20104 half_mode = DImode;
20105 break;
20106 case E_DImode:
20107 half_mode = SImode;
20108 break;
20109 default:
20110 gcc_unreachable ();
20113 byte = GET_MODE_SIZE (half_mode);
20115 while (num--)
20117 rtx op = operands[num];
20119 /* simplify_subreg refuses to split volatile memory addresses,
20120 but we still have to handle them. */
20121 if (MEM_P (op))
20123 lo_half[num] = adjust_address (op, half_mode, 0);
20124 hi_half[num] = adjust_address (op, half_mode, byte);
20126 else
20128 lo_half[num] = simplify_gen_subreg (half_mode, op,
20129 GET_MODE (op) == VOIDmode
20130 ? mode : GET_MODE (op), 0);
20131 hi_half[num] = simplify_gen_subreg (half_mode, op,
20132 GET_MODE (op) == VOIDmode
20133 ? mode : GET_MODE (op), byte);
20138 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
20139 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
20140 is the expression of the binary operation. The output may either be
20141 emitted here, or returned to the caller, like all output_* functions.
20143 There is no guarantee that the operands are the same mode, as they
20144 might be within FLOAT or FLOAT_EXTEND expressions. */
20146 #ifndef SYSV386_COMPAT
20147 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
20148 wants to fix the assemblers because that causes incompatibility
20149 with gcc. No-one wants to fix gcc because that causes
20150 incompatibility with assemblers... You can use the option of
20151 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
20152 #define SYSV386_COMPAT 1
20153 #endif
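/* Illustrative results (not from the original source): for an SFmode SSE
   addition with AVX enabled, the routine below returns the template
   "vaddss\t{%2, %1, %0|%0, %1, %2}"; the plain x87 PLUS case with a
   memory operands[2] ends up as roughly "fadd%Z2\t%2".  */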
20155 const char *
20156 output_387_binary_op (rtx_insn *insn, rtx *operands)
20158 static char buf[40];
20159 const char *p;
20160 const char *ssep;
20161 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
20163 /* Even if we do not want to check the inputs, this documents the input
20164 constraints, which helps in understanding the following code. */
20165 if (flag_checking)
20167 if (STACK_REG_P (operands[0])
20168 && ((REG_P (operands[1])
20169 && REGNO (operands[0]) == REGNO (operands[1])
20170 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
20171 || (REG_P (operands[2])
20172 && REGNO (operands[0]) == REGNO (operands[2])
20173 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
20174 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
20175 ; /* ok */
20176 else
20177 gcc_assert (is_sse);
20180 switch (GET_CODE (operands[3]))
20182 case PLUS:
20183 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20184 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20185 p = "fiadd";
20186 else
20187 p = "fadd";
20188 ssep = "vadd";
20189 break;
20191 case MINUS:
20192 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20193 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20194 p = "fisub";
20195 else
20196 p = "fsub";
20197 ssep = "vsub";
20198 break;
20200 case MULT:
20201 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20202 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20203 p = "fimul";
20204 else
20205 p = "fmul";
20206 ssep = "vmul";
20207 break;
20209 case DIV:
20210 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
20211 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
20212 p = "fidiv";
20213 else
20214 p = "fdiv";
20215 ssep = "vdiv";
20216 break;
20218 default:
20219 gcc_unreachable ();
20222 if (is_sse)
20224 if (TARGET_AVX)
20226 strcpy (buf, ssep);
20227 if (GET_MODE (operands[0]) == SFmode)
20228 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
20229 else
20230 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
20232 else
20234 strcpy (buf, ssep + 1);
20235 if (GET_MODE (operands[0]) == SFmode)
20236 strcat (buf, "ss\t{%2, %0|%0, %2}");
20237 else
20238 strcat (buf, "sd\t{%2, %0|%0, %2}");
20240 return buf;
20242 strcpy (buf, p);
20244 switch (GET_CODE (operands[3]))
20246 case MULT:
20247 case PLUS:
20248 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
20249 std::swap (operands[1], operands[2]);
20251 /* We now know operands[0] == operands[1]. */
20253 if (MEM_P (operands[2]))
20255 p = "%Z2\t%2";
20256 break;
20259 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
20261 if (STACK_TOP_P (operands[0]))
20262 /* How is it that we are storing to a dead operand[2]?
20263 Well, presumably operands[1] is dead too. We can't
20264 store the result to st(0) as st(0) gets popped on this
20265 instruction. Instead store to operands[2] (which I
20266 think has to be st(1)). st(1) will be popped later.
20267 gcc <= 2.8.1 didn't have this check and generated
20268 assembly code that the Unixware assembler rejected. */
20269 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
20270 else
20271 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
20272 break;
20275 if (STACK_TOP_P (operands[0]))
20276 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
20277 else
20278 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
20279 break;
20281 case MINUS:
20282 case DIV:
20283 if (MEM_P (operands[1]))
20285 p = "r%Z1\t%1";
20286 break;
20289 if (MEM_P (operands[2]))
20291 p = "%Z2\t%2";
20292 break;
20295 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
20297 #if SYSV386_COMPAT
20298 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
20299 derived assemblers, confusingly reverse the direction of
20300 the operation for fsub{r} and fdiv{r} when the
20301 destination register is not st(0). The Intel assembler
20302 doesn't have this brain damage. Read !SYSV386_COMPAT to
20303 figure out what the hardware really does. */
20304 if (STACK_TOP_P (operands[0]))
20305 p = "{p\t%0, %2|rp\t%2, %0}";
20306 else
20307 p = "{rp\t%2, %0|p\t%0, %2}";
20308 #else
20309 if (STACK_TOP_P (operands[0]))
20310 /* As above for fmul/fadd, we can't store to st(0). */
20311 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
20312 else
20313 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
20314 #endif
20315 break;
20318 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
20320 #if SYSV386_COMPAT
20321 if (STACK_TOP_P (operands[0]))
20322 p = "{rp\t%0, %1|p\t%1, %0}";
20323 else
20324 p = "{p\t%1, %0|rp\t%0, %1}";
20325 #else
20326 if (STACK_TOP_P (operands[0]))
20327 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
20328 else
20329 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
20330 #endif
20331 break;
20334 if (STACK_TOP_P (operands[0]))
20336 if (STACK_TOP_P (operands[1]))
20337 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
20338 else
20339 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
20340 break;
20342 else if (STACK_TOP_P (operands[1]))
20344 #if SYSV386_COMPAT
20345 p = "{\t%1, %0|r\t%0, %1}";
20346 #else
20347 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
20348 #endif
20350 else
20352 #if SYSV386_COMPAT
20353 p = "{r\t%2, %0|\t%0, %2}";
20354 #else
20355 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
20356 #endif
20358 break;
20360 default:
20361 gcc_unreachable ();
20364 strcat (buf, p);
20365 return buf;
20368 /* Return needed mode for entity in optimize_mode_switching pass. */
20370 static int
20371 ix86_dirflag_mode_needed (rtx_insn *insn)
20373 if (CALL_P (insn))
20375 if (cfun->machine->func_type == TYPE_NORMAL)
20376 return X86_DIRFLAG_ANY;
20377 else
20378 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
20379 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
20382 if (recog_memoized (insn) < 0)
20383 return X86_DIRFLAG_ANY;
20385 if (get_attr_type (insn) == TYPE_STR)
20387 /* Emit cld instruction if stringops are used in the function. */
20388 if (cfun->machine->func_type == TYPE_NORMAL)
20389 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
20390 else
20391 return X86_DIRFLAG_RESET;
20394 return X86_DIRFLAG_ANY;
20397 /* Check if a 256bit AVX register is referenced inside of EXP. */
20399 static bool
20400 ix86_check_avx256_register (const_rtx exp)
20402 if (SUBREG_P (exp))
20403 exp = SUBREG_REG (exp);
20405 return (REG_P (exp)
20406 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
20409 /* Return needed mode for entity in optimize_mode_switching pass. */
20411 static int
20412 ix86_avx_u128_mode_needed (rtx_insn *insn)
20414 if (CALL_P (insn))
20416 rtx link;
20418 /* Needed mode is set to AVX_U128_CLEAN if there are
20419 no 256bit modes used in function arguments. */
20420 for (link = CALL_INSN_FUNCTION_USAGE (insn);
20421 link;
20422 link = XEXP (link, 1))
20424 if (GET_CODE (XEXP (link, 0)) == USE)
20426 rtx arg = XEXP (XEXP (link, 0), 0);
20428 if (ix86_check_avx256_register (arg))
20429 return AVX_U128_DIRTY;
20433 return AVX_U128_CLEAN;
20436 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
20437 changes state only when a 256bit register is written to, but we need
20438 to prevent the compiler from moving the optimal insertion point above
20439 an eventual read from a 256bit register. */
20440 subrtx_iterator::array_type array;
20441 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
20442 if (ix86_check_avx256_register (*iter))
20443 return AVX_U128_DIRTY;
20445 return AVX_U128_ANY;
20448 /* Return mode that i387 must be switched into
20449 prior to the execution of insn. */
20451 static int
20452 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20454 enum attr_i387_cw mode;
20456 /* The mode UNINITIALIZED is used to store the control word after a
20457 function call or ASM pattern. The mode ANY specifies that the function
20458 has no requirements on the control word and makes no changes in the
20459 bits we are interested in. */
20461 if (CALL_P (insn)
20462 || (NONJUMP_INSN_P (insn)
20463 && (asm_noperands (PATTERN (insn)) >= 0
20464 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20465 return I387_CW_UNINITIALIZED;
20467 if (recog_memoized (insn) < 0)
20468 return I387_CW_ANY;
20470 mode = get_attr_i387_cw (insn);
20472 switch (entity)
20474 case I387_TRUNC:
20475 if (mode == I387_CW_TRUNC)
20476 return mode;
20477 break;
20479 case I387_FLOOR:
20480 if (mode == I387_CW_FLOOR)
20481 return mode;
20482 break;
20484 case I387_CEIL:
20485 if (mode == I387_CW_CEIL)
20486 return mode;
20487 break;
20489 case I387_MASK_PM:
20490 if (mode == I387_CW_MASK_PM)
20491 return mode;
20492 break;
20494 default:
20495 gcc_unreachable ();
20498 return I387_CW_ANY;
20501 /* Return mode that entity must be switched into
20502 prior to the execution of insn. */
20504 static int
20505 ix86_mode_needed (int entity, rtx_insn *insn)
20507 switch (entity)
20509 case X86_DIRFLAG:
20510 return ix86_dirflag_mode_needed (insn);
20511 case AVX_U128:
20512 return ix86_avx_u128_mode_needed (insn);
20513 case I387_TRUNC:
20514 case I387_FLOOR:
20515 case I387_CEIL:
20516 case I387_MASK_PM:
20517 return ix86_i387_mode_needed (entity, insn);
20518 default:
20519 gcc_unreachable ();
20521 return 0;
20524 /* Check if a 256bit AVX register is referenced in stores. */
20526 static void
20527 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20529 if (ix86_check_avx256_register (dest))
20531 bool *used = (bool *) data;
20532 *used = true;
20536 /* Calculate mode of upper 128bit AVX registers after the insn. */
20538 static int
20539 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20541 rtx pat = PATTERN (insn);
20543 if (vzeroupper_operation (pat, VOIDmode)
20544 || vzeroall_operation (pat, VOIDmode))
20545 return AVX_U128_CLEAN;
20547 /* We know that the state is clean after a CALL insn if the function
20548 return value does not use a 256bit register. */
20549 if (CALL_P (insn))
20551 bool avx_reg256_found = false;
20552 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20554 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20557 /* Otherwise, return current mode. Remember that if insn
20558 references AVX 256bit registers, the mode was already changed
20559 to DIRTY from MODE_NEEDED. */
20560 return mode;
20563 /* Return the mode that an insn results in. */
20565 static int
20566 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20568 switch (entity)
20570 case X86_DIRFLAG:
20571 return mode;
20572 case AVX_U128:
20573 return ix86_avx_u128_mode_after (mode, insn);
20574 case I387_TRUNC:
20575 case I387_FLOOR:
20576 case I387_CEIL:
20577 case I387_MASK_PM:
20578 return mode;
20579 default:
20580 gcc_unreachable ();
20584 static int
20585 ix86_dirflag_mode_entry (void)
20587 /* For TARGET_CLD or in the interrupt handler we can't assume
20588 direction flag state at function entry. */
20589 if (TARGET_CLD
20590 || cfun->machine->func_type != TYPE_NORMAL)
20591 return X86_DIRFLAG_ANY;
20593 return X86_DIRFLAG_RESET;
20596 static int
20597 ix86_avx_u128_mode_entry (void)
20599 tree arg;
20601 /* Entry mode is set to AVX_U128_DIRTY if there are
20602 256bit modes used in function arguments. */
20603 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20604 arg = TREE_CHAIN (arg))
20606 rtx incoming = DECL_INCOMING_RTL (arg);
20608 if (incoming && ix86_check_avx256_register (incoming))
20609 return AVX_U128_DIRTY;
20612 return AVX_U128_CLEAN;
20615 /* Return a mode that ENTITY is assumed to be
20616 switched to at function entry. */
20618 static int
20619 ix86_mode_entry (int entity)
20621 switch (entity)
20623 case X86_DIRFLAG:
20624 return ix86_dirflag_mode_entry ();
20625 case AVX_U128:
20626 return ix86_avx_u128_mode_entry ();
20627 case I387_TRUNC:
20628 case I387_FLOOR:
20629 case I387_CEIL:
20630 case I387_MASK_PM:
20631 return I387_CW_ANY;
20632 default:
20633 gcc_unreachable ();
20637 static int
20638 ix86_avx_u128_mode_exit (void)
20640 rtx reg = crtl->return_rtx;
20642 /* Exit mode is set to AVX_U128_DIRTY if there are
20643 256bit modes used in the function return register. */
20644 if (reg && ix86_check_avx256_register (reg))
20645 return AVX_U128_DIRTY;
20647 return AVX_U128_CLEAN;
20650 /* Return a mode that ENTITY is assumed to be
20651 switched to at function exit. */
20653 static int
20654 ix86_mode_exit (int entity)
20656 switch (entity)
20658 case X86_DIRFLAG:
20659 return X86_DIRFLAG_ANY;
20660 case AVX_U128:
20661 return ix86_avx_u128_mode_exit ();
20662 case I387_TRUNC:
20663 case I387_FLOOR:
20664 case I387_CEIL:
20665 case I387_MASK_PM:
20666 return I387_CW_ANY;
20667 default:
20668 gcc_unreachable ();
20672 static int
20673 ix86_mode_priority (int, int n)
20675 return n;
20678 /* Output code to initialize the control word copies used by the trunc?f?i
20679 and rounding patterns. The current control word is saved to a stack slot,
20680 and a copy modified for MODE is stored in the stack slot for MODE. */
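/* The masks used below follow the x87 control word layout: bits 10-11 form
   the rounding control field (00 = to nearest, 01 = down, 10 = up,
   11 = truncate, hence 0x0400, 0x0800 and 0x0c00) and bit 5 (0x0020) masks
   the precision exception.  */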
20682 static void
20683 emit_i387_cw_initialization (int mode)
20685 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20686 rtx new_mode;
20688 enum ix86_stack_slot slot;
20690 rtx reg = gen_reg_rtx (HImode);
20692 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20693 emit_move_insn (reg, copy_rtx (stored_mode));
20695 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20696 || optimize_insn_for_size_p ())
20698 switch (mode)
20700 case I387_CW_TRUNC:
20701 /* round toward zero (truncate) */
20702 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20703 slot = SLOT_CW_TRUNC;
20704 break;
20706 case I387_CW_FLOOR:
20707 /* round down toward -oo */
20708 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20709 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20710 slot = SLOT_CW_FLOOR;
20711 break;
20713 case I387_CW_CEIL:
20714 /* round up toward +oo */
20715 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20716 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20717 slot = SLOT_CW_CEIL;
20718 break;
20720 case I387_CW_MASK_PM:
20721 /* mask precision exception for nearbyint() */
20722 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20723 slot = SLOT_CW_MASK_PM;
20724 break;
20726 default:
20727 gcc_unreachable ();
20730 else
20732 switch (mode)
20734 case I387_CW_TRUNC:
20735 /* round toward zero (truncate) */
20736 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20737 slot = SLOT_CW_TRUNC;
20738 break;
20740 case I387_CW_FLOOR:
20741 /* round down toward -oo */
20742 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20743 slot = SLOT_CW_FLOOR;
20744 break;
20746 case I387_CW_CEIL:
20747 /* round up toward +oo */
20748 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20749 slot = SLOT_CW_CEIL;
20750 break;
20752 case I387_CW_MASK_PM:
20753 /* mask precision exception for nearbyint() */
20754 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20755 slot = SLOT_CW_MASK_PM;
20756 break;
20758 default:
20759 gcc_unreachable ();
20763 gcc_assert (slot < MAX_386_STACK_LOCALS);
20765 new_mode = assign_386_stack_local (HImode, slot);
20766 emit_move_insn (new_mode, reg);
20769 /* Emit vzeroupper. */
20771 void
20772 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20774 int i;
20776 /* Cancel automatic vzeroupper insertion if there are
20777 live call-saved SSE registers at the insertion point. */
20779 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20780 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20781 return;
20783 if (TARGET_64BIT)
20784 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20785 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20786 return;
20788 emit_insn (gen_avx_vzeroupper ());
20793 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
20794 is the set of hard registers live at the point where the insn(s)
20795 are to be inserted. */
20797 static void
20798 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20799 HARD_REG_SET regs_live)
20801 switch (entity)
20803 case X86_DIRFLAG:
20804 if (mode == X86_DIRFLAG_RESET)
20805 emit_insn (gen_cld ());
20806 break;
20807 case AVX_U128:
20808 if (mode == AVX_U128_CLEAN)
20809 ix86_avx_emit_vzeroupper (regs_live);
20810 break;
20811 case I387_TRUNC:
20812 case I387_FLOOR:
20813 case I387_CEIL:
20814 case I387_MASK_PM:
20815 if (mode != I387_CW_ANY
20816 && mode != I387_CW_UNINITIALIZED)
20817 emit_i387_cw_initialization (mode);
20818 break;
20819 default:
20820 gcc_unreachable ();
20824 /* Output code for INSN to convert a float to a signed int. OPERANDS
20825 are the insn operands. The output may be [HSD]Imode and the input
20826 operand may be [SDX]Fmode. */
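/* For instance (illustrative), converting st(0) to an SImode memory
   destination without fisttp, with a truncating control word in operands[3],
   emits roughly:

	fldcw	%3
	fistpl	%0
	fldcw	%2

   restoring the original control word afterwards.  */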
20828 const char *
20829 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20831 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20832 int dimode_p = GET_MODE (operands[0]) == DImode;
20833 int round_mode = get_attr_i387_cw (insn);
20835 /* Jump through a hoop or two for DImode, since the hardware has no
20836 non-popping instruction. We used to do this a different way, but
20837 that was somewhat fragile and broke with post-reload splitters. */
20838 if ((dimode_p || fisttp) && !stack_top_dies)
20839 output_asm_insn ("fld\t%y1", operands);
20841 gcc_assert (STACK_TOP_P (operands[1]));
20842 gcc_assert (MEM_P (operands[0]));
20843 gcc_assert (GET_MODE (operands[1]) != TFmode);
20845 if (fisttp)
20846 output_asm_insn ("fisttp%Z0\t%0", operands);
20847 else
20849 if (round_mode != I387_CW_ANY)
20850 output_asm_insn ("fldcw\t%3", operands);
20851 if (stack_top_dies || dimode_p)
20852 output_asm_insn ("fistp%Z0\t%0", operands);
20853 else
20854 output_asm_insn ("fist%Z0\t%0", operands);
20855 if (round_mode != I387_CW_ANY)
20856 output_asm_insn ("fldcw\t%2", operands);
20859 return "";
20862 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20863 have the values zero or one, indicates the ffreep insn's operand
20864 from the OPERANDS array. */
20866 static const char *
20867 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20869 if (TARGET_USE_FFREEP)
20870 #ifdef HAVE_AS_IX86_FFREEP
20871 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20872 #else
20874 static char retval[32];
20875 int regno = REGNO (operands[opno]);
20877 gcc_assert (STACK_REGNO_P (regno));
20879 regno -= FIRST_STACK_REG;
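/* ffreep %st(N) encodes as 0xdf 0xc0+N; ASM_SHORT stores the 16-bit value
   little-endian, so e.g. N == 0 emits the value 0xc0df, i.e. the bytes
   0xdf 0xc0.  */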
20881 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20882 return retval;
20884 #endif
20886 return opno ? "fstp\t%y1" : "fstp\t%y0";
20890 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
20891 should be used. UNORDERED_P is true when fucom should be used. */
20893 const char *
20894 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20896 int stack_top_dies;
20897 rtx cmp_op0, cmp_op1;
20898 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20900 if (eflags_p)
20902 cmp_op0 = operands[0];
20903 cmp_op1 = operands[1];
20905 else
20907 cmp_op0 = operands[1];
20908 cmp_op1 = operands[2];
20911 if (is_sse)
20913 if (GET_MODE (operands[0]) == SFmode)
20914 if (unordered_p)
20915 return "%vucomiss\t{%1, %0|%0, %1}";
20916 else
20917 return "%vcomiss\t{%1, %0|%0, %1}";
20918 else
20919 if (unordered_p)
20920 return "%vucomisd\t{%1, %0|%0, %1}";
20921 else
20922 return "%vcomisd\t{%1, %0|%0, %1}";
20925 gcc_assert (STACK_TOP_P (cmp_op0));
20927 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20929 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20931 if (stack_top_dies)
20933 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20934 return output_387_ffreep (operands, 1);
20936 else
20937 return "ftst\n\tfnstsw\t%0";
20940 if (STACK_REG_P (cmp_op1)
20941 && stack_top_dies
20942 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20943 && REGNO (cmp_op1) != FIRST_STACK_REG)
20945 /* If the top of the 387 stack dies, and the other operand
20946 is also a stack register that dies, then this must be a
20947 `fcompp' float compare. */
20949 if (eflags_p)
20951 /* There is no double popping fcomi variant. Fortunately,
20952 eflags is immune from the fstp's cc clobbering. */
20953 if (unordered_p)
20954 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20955 else
20956 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20957 return output_387_ffreep (operands, 0);
20959 else
20961 if (unordered_p)
20962 return "fucompp\n\tfnstsw\t%0";
20963 else
20964 return "fcompp\n\tfnstsw\t%0";
20967 else
20969 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20971 static const char * const alt[16] =
20973 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20974 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20975 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20976 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20978 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20979 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20980 NULL,
20981 NULL,
20983 "fcomi\t{%y1, %0|%0, %y1}",
20984 "fcomip\t{%y1, %0|%0, %y1}",
20985 "fucomi\t{%y1, %0|%0, %y1}",
20986 "fucomip\t{%y1, %0|%0, %y1}",
20988 NULL,
20989 NULL,
20990 NULL,
20991 NULL
20994 int mask;
20995 const char *ret;
20997 mask = eflags_p << 3;
20998 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20999 mask |= unordered_p << 1;
21000 mask |= stack_top_dies;
21002 gcc_assert (mask < 16);
21003 ret = alt[mask];
21004 gcc_assert (ret);
21006 return ret;
21010 void
21011 ix86_output_addr_vec_elt (FILE *file, int value)
21013 const char *directive = ASM_LONG;
21015 #ifdef ASM_QUAD
21016 if (TARGET_LP64)
21017 directive = ASM_QUAD;
21018 #else
21019 gcc_assert (!TARGET_64BIT);
21020 #endif
21022 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
21025 void
21026 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
21028 const char *directive = ASM_LONG;
21030 #ifdef ASM_QUAD
21031 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
21032 directive = ASM_QUAD;
21033 #else
21034 gcc_assert (!TARGET_64BIT);
21035 #endif
21036 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
21037 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
21038 fprintf (file, "%s%s%d-%s%d\n",
21039 directive, LPREFIX, value, LPREFIX, rel);
21040 else if (HAVE_AS_GOTOFF_IN_DATA)
21041 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
21042 #if TARGET_MACHO
21043 else if (TARGET_MACHO)
21045 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
21046 machopic_output_function_base_name (file);
21047 putc ('\n', file);
21049 #endif
21050 else
21051 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
21052 GOT_SYMBOL_NAME, LPREFIX, value);
21055 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
21056 for the target. */
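/* Rough sketch of the choice made below: the xor form is smaller but sets
   the flags, so when it is used a (clobber (reg:CC flags)) is attached to
   the SET; the "mov $0" form is kept only when TARGET_USE_MOV0 is set and
   we are not optimizing this insn for size.  */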
21058 void
21059 ix86_expand_clear (rtx dest)
21061 rtx tmp;
21063 /* We play register width games, which are only valid after reload. */
21064 gcc_assert (reload_completed);
21066 /* Avoid HImode and its attendant prefix byte. */
21067 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
21068 dest = gen_rtx_REG (SImode, REGNO (dest));
21069 tmp = gen_rtx_SET (dest, const0_rtx);
21071 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
21073 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21074 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
21077 emit_insn (tmp);
21080 /* X is an unchanging MEM. If it is a constant pool reference, return
21081 the constant pool rtx, else NULL. */
21084 maybe_get_pool_constant (rtx x)
21086 x = ix86_delegitimize_address (XEXP (x, 0));
21088 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
21089 return get_pool_constant (x);
21091 return NULL_RTX;
21094 void
21095 ix86_expand_move (machine_mode mode, rtx operands[])
21097 rtx op0, op1;
21098 rtx tmp, addend = NULL_RTX;
21099 enum tls_model model;
21101 op0 = operands[0];
21102 op1 = operands[1];
21104 switch (GET_CODE (op1))
21106 case CONST:
21107 tmp = XEXP (op1, 0);
21109 if (GET_CODE (tmp) != PLUS
21110 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
21111 break;
21113 op1 = XEXP (tmp, 0);
21114 addend = XEXP (tmp, 1);
21115 /* FALLTHRU */
21117 case SYMBOL_REF:
21118 model = SYMBOL_REF_TLS_MODEL (op1);
21120 if (model)
21121 op1 = legitimize_tls_address (op1, model, true);
21122 else if (ix86_force_load_from_GOT_p (op1))
21124 /* Load the external function address via GOT slot to avoid PLT. */
21125 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
21126 (TARGET_64BIT
21127 ? UNSPEC_GOTPCREL
21128 : UNSPEC_GOT));
21129 op1 = gen_rtx_CONST (Pmode, op1);
21130 op1 = gen_const_mem (Pmode, op1);
21131 set_mem_alias_set (op1, ix86_GOT_alias_set ());
21133 else
21135 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
21136 if (tmp)
21138 op1 = tmp;
21139 if (!addend)
21140 break;
21142 else
21144 op1 = operands[1];
21145 break;
21149 if (addend)
21151 op1 = force_operand (op1, NULL_RTX);
21152 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
21153 op0, 1, OPTAB_DIRECT);
21155 else
21156 op1 = force_operand (op1, op0);
21158 if (op1 == op0)
21159 return;
21161 op1 = convert_to_mode (mode, op1, 1);
21163 default:
21164 break;
21167 if ((flag_pic || MACHOPIC_INDIRECT)
21168 && symbolic_operand (op1, mode))
21170 if (TARGET_MACHO && !TARGET_64BIT)
21172 #if TARGET_MACHO
21173 /* dynamic-no-pic */
21174 if (MACHOPIC_INDIRECT)
21176 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
21177 ? op0 : gen_reg_rtx (Pmode);
21178 op1 = machopic_indirect_data_reference (op1, temp);
21179 if (MACHOPIC_PURE)
21180 op1 = machopic_legitimize_pic_address (op1, mode,
21181 temp == op1 ? 0 : temp);
21183 if (op0 != op1 && GET_CODE (op0) != MEM)
21185 rtx insn = gen_rtx_SET (op0, op1);
21186 emit_insn (insn);
21187 return;
21189 if (GET_CODE (op0) == MEM)
21190 op1 = force_reg (Pmode, op1);
21191 else
21193 rtx temp = op0;
21194 if (GET_CODE (temp) != REG)
21195 temp = gen_reg_rtx (Pmode);
21196 temp = legitimize_pic_address (op1, temp);
21197 if (temp == op0)
21198 return;
21199 op1 = temp;
21201 /* dynamic-no-pic */
21202 #endif
21204 else
21206 if (MEM_P (op0))
21207 op1 = force_reg (mode, op1);
21208 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
21210 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
21211 op1 = legitimize_pic_address (op1, reg);
21212 if (op0 == op1)
21213 return;
21214 op1 = convert_to_mode (mode, op1, 1);
21218 else
21220 if (MEM_P (op0)
21221 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
21222 || !push_operand (op0, mode))
21223 && MEM_P (op1))
21224 op1 = force_reg (mode, op1);
21226 if (push_operand (op0, mode)
21227 && ! general_no_elim_operand (op1, mode))
21228 op1 = copy_to_mode_reg (mode, op1);
21230 /* Force large constants in 64bit compilation into register
21231 to get them CSEed. */
21232 if (can_create_pseudo_p ()
21233 && (mode == DImode) && TARGET_64BIT
21234 && immediate_operand (op1, mode)
21235 && !x86_64_zext_immediate_operand (op1, VOIDmode)
21236 && !register_operand (op0, mode)
21237 && optimize)
21238 op1 = copy_to_mode_reg (mode, op1);
21240 if (can_create_pseudo_p ()
21241 && CONST_DOUBLE_P (op1))
21243 /* If we are loading a floating point constant to a register,
21244 force the value to memory now, since we'll get better code
21245 out of the back end. */
21247 op1 = validize_mem (force_const_mem (mode, op1));
21248 if (!register_operand (op0, mode))
21250 rtx temp = gen_reg_rtx (mode);
21251 emit_insn (gen_rtx_SET (temp, op1));
21252 emit_move_insn (op0, temp);
21253 return;
21258 emit_insn (gen_rtx_SET (op0, op1));
21261 void
21262 ix86_expand_vector_move (machine_mode mode, rtx operands[])
21264 rtx op0 = operands[0], op1 = operands[1];
21265 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
21266 psABI, since the biggest alignment there is 4 bytes. */
21267 unsigned int align = (TARGET_IAMCU
21268 ? GET_MODE_BITSIZE (mode)
21269 : GET_MODE_ALIGNMENT (mode));
21271 if (push_operand (op0, VOIDmode))
21272 op0 = emit_move_resolve_push (mode, op0);
21274 /* Force constants other than zero into memory. We do not know how
21275 the instructions used to build constants modify the upper 64 bits
21276 of the register; once we have that information we may be able
21277 to handle some of them more efficiently. */
21278 if (can_create_pseudo_p ()
21279 && (CONSTANT_P (op1)
21280 || (SUBREG_P (op1)
21281 && CONSTANT_P (SUBREG_REG (op1))))
21282 && ((register_operand (op0, mode)
21283 && !standard_sse_constant_p (op1, mode))
21284 /* ix86_expand_vector_move_misalign() does not like constants. */
21285 || (SSE_REG_MODE_P (mode)
21286 && MEM_P (op0)
21287 && MEM_ALIGN (op0) < align)))
21289 if (SUBREG_P (op1))
21291 machine_mode imode = GET_MODE (SUBREG_REG (op1));
21292 rtx r = force_const_mem (imode, SUBREG_REG (op1));
21293 if (r)
21294 r = validize_mem (r);
21295 else
21296 r = force_reg (imode, SUBREG_REG (op1));
21297 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
21299 else
21300 op1 = validize_mem (force_const_mem (mode, op1));
21303 /* We need to check memory alignment for SSE modes since an attribute
21304 can make operands unaligned. */
21305 if (can_create_pseudo_p ()
21306 && SSE_REG_MODE_P (mode)
21307 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
21308 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
21310 rtx tmp[2];
21312 /* ix86_expand_vector_move_misalign() does not like both
21313 arguments in memory. */
21314 if (!register_operand (op0, mode)
21315 && !register_operand (op1, mode))
21316 op1 = force_reg (mode, op1);
21318 tmp[0] = op0; tmp[1] = op1;
21319 ix86_expand_vector_move_misalign (mode, tmp);
21320 return;
21323 /* Make operand1 a register if it isn't already. */
21324 if (can_create_pseudo_p ()
21325 && !register_operand (op0, mode)
21326 && !register_operand (op1, mode))
21328 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
21329 return;
21332 emit_insn (gen_rtx_SET (op0, op1));
21335 /* Split 32-byte AVX unaligned load and store if needed. */
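/* Illustrative summary: a misaligned 32-byte load becomes a 16-byte load of
   the low half followed by a VEC_CONCAT with the high half (typically a
   vinsertf128), and a misaligned 32-byte store becomes two vextractf128
   stores of the two halves.  */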
21337 static void
21338 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
21340 rtx m;
21341 rtx (*extract) (rtx, rtx, rtx);
21342 machine_mode mode;
21344 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
21345 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
21347 emit_insn (gen_rtx_SET (op0, op1));
21348 return;
21351 rtx orig_op0 = NULL_RTX;
21352 mode = GET_MODE (op0);
21353 switch (GET_MODE_CLASS (mode))
21355 case MODE_VECTOR_INT:
21356 case MODE_INT:
21357 if (mode != V32QImode)
21359 if (!MEM_P (op0))
21361 orig_op0 = op0;
21362 op0 = gen_reg_rtx (V32QImode);
21364 else
21365 op0 = gen_lowpart (V32QImode, op0);
21366 op1 = gen_lowpart (V32QImode, op1);
21367 mode = V32QImode;
21369 break;
21370 case MODE_VECTOR_FLOAT:
21371 break;
21372 default:
21373 gcc_unreachable ();
21376 switch (mode)
21378 default:
21379 gcc_unreachable ();
21380 case E_V32QImode:
21381 extract = gen_avx_vextractf128v32qi;
21382 mode = V16QImode;
21383 break;
21384 case E_V8SFmode:
21385 extract = gen_avx_vextractf128v8sf;
21386 mode = V4SFmode;
21387 break;
21388 case E_V4DFmode:
21389 extract = gen_avx_vextractf128v4df;
21390 mode = V2DFmode;
21391 break;
21394 if (MEM_P (op1))
21396 rtx r = gen_reg_rtx (mode);
21397 m = adjust_address (op1, mode, 0);
21398 emit_move_insn (r, m);
21399 m = adjust_address (op1, mode, 16);
21400 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
21401 emit_move_insn (op0, r);
21403 else if (MEM_P (op0))
21405 m = adjust_address (op0, mode, 0);
21406 emit_insn (extract (m, op1, const0_rtx));
21407 m = adjust_address (op0, mode, 16);
21408 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
21410 else
21411 gcc_unreachable ();
21413 if (orig_op0)
21414 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
21417 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
21418 straight to ix86_expand_vector_move. */
21419 /* Code generation for scalar reg-reg moves of single and double precision data:
21420 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
21421 movaps reg, reg
21422 else
21423 movss reg, reg
21424 if (x86_sse_partial_reg_dependency == true)
21425 movapd reg, reg
21426 else
21427 movsd reg, reg
21429 Code generation for scalar loads of double precision data:
21430 if (x86_sse_split_regs == true)
21431 movlpd mem, reg (gas syntax)
21432 else
21433 movsd mem, reg
21435 Code generation for unaligned packed loads of single precision data
21436 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
21437 if (x86_sse_unaligned_move_optimal)
21438 movups mem, reg
21440 if (x86_sse_partial_reg_dependency == true)
21442 xorps reg, reg
21443 movlps mem, reg
21444 movhps mem+8, reg
21446 else
21448 movlps mem, reg
21449 movhps mem+8, reg
21452 Code generation for unaligned packed loads of double precision data
21453 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21454 if (x86_sse_unaligned_move_optimal)
21455 movupd mem, reg
21457 if (x86_sse_split_regs == true)
21459 movlpd mem, reg
21460 movhpd mem+8, reg
21462 else
21464 movsd mem, reg
21465 movhpd mem+8, reg
21469 void
21470 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21472 rtx op0, op1, m;
21474 op0 = operands[0];
21475 op1 = operands[1];
21477 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21478 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21480 emit_insn (gen_rtx_SET (op0, op1));
21481 return;
21484 if (TARGET_AVX)
21486 if (GET_MODE_SIZE (mode) == 32)
21487 ix86_avx256_split_vector_move_misalign (op0, op1);
21488 else
21489 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21490 emit_insn (gen_rtx_SET (op0, op1));
21491 return;
21494 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21495 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21497 emit_insn (gen_rtx_SET (op0, op1));
21498 return;
21501 /* ??? If we have typed data, then it would appear that using
21502 movdqu is the only way to get unaligned data loaded with
21503 integer type. */
21504 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21506 emit_insn (gen_rtx_SET (op0, op1));
21507 return;
21510 if (MEM_P (op1))
21512 if (TARGET_SSE2 && mode == V2DFmode)
21514 rtx zero;
21516 /* When SSE registers are split into halves, we can avoid
21517 writing to the top half twice. */
21518 if (TARGET_SSE_SPLIT_REGS)
21520 emit_clobber (op0);
21521 zero = op0;
21523 else
21525 /* ??? Not sure about the best option for the Intel chips.
21526 The following would seem to satisfy; the register is
21527 entirely cleared, breaking the dependency chain. We
21528 then store to the upper half, with a dependency depth
21529 of one. A rumor has it that Intel recommends two movsd
21530 followed by an unpacklpd, but this is unconfirmed. And
21531 given that the dependency depth of the unpacklpd would
21532 still be one, I'm not sure why this would be better. */
21533 zero = CONST0_RTX (V2DFmode);
21536 m = adjust_address (op1, DFmode, 0);
21537 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21538 m = adjust_address (op1, DFmode, 8);
21539 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21541 else
21543 rtx t;
21545 if (mode != V4SFmode)
21546 t = gen_reg_rtx (V4SFmode);
21547 else
21548 t = op0;
21550 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21551 emit_move_insn (t, CONST0_RTX (V4SFmode));
21552 else
21553 emit_clobber (t);
21555 m = adjust_address (op1, V2SFmode, 0);
21556 emit_insn (gen_sse_loadlps (t, t, m));
21557 m = adjust_address (op1, V2SFmode, 8);
21558 emit_insn (gen_sse_loadhps (t, t, m));
21559 if (mode != V4SFmode)
21560 emit_move_insn (op0, gen_lowpart (mode, t));
21563 else if (MEM_P (op0))
21565 if (TARGET_SSE2 && mode == V2DFmode)
21567 m = adjust_address (op0, DFmode, 0);
21568 emit_insn (gen_sse2_storelpd (m, op1));
21569 m = adjust_address (op0, DFmode, 8);
21570 emit_insn (gen_sse2_storehpd (m, op1));
21572 else
21574 if (mode != V4SFmode)
21575 op1 = gen_lowpart (V4SFmode, op1);
21577 m = adjust_address (op0, V2SFmode, 0);
21578 emit_insn (gen_sse_storelps (m, op1));
21579 m = adjust_address (op0, V2SFmode, 8);
21580 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21583 else
21584 gcc_unreachable ();
21587 /* Helper function of ix86_fixup_binary_operands to canonicalize
21588 operand order. Returns true if the operands should be swapped. */
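/* For example (hypothetical operands): for a commutative PLUS where
   operands[0] and operands[2] are the same register and operands[1] is a
   MEM, this returns true, so the register source ends up matching the
   destination and the memory reference becomes the second source.  */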
21590 static bool
21591 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21592 rtx operands[])
21594 rtx dst = operands[0];
21595 rtx src1 = operands[1];
21596 rtx src2 = operands[2];
21598 /* If the operation is not commutative, we can't do anything. */
21599 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21600 return false;
21602 /* Highest priority is that src1 should match dst. */
21603 if (rtx_equal_p (dst, src1))
21604 return false;
21605 if (rtx_equal_p (dst, src2))
21606 return true;
21608 /* Next highest priority is that immediate constants come second. */
21609 if (immediate_operand (src2, mode))
21610 return false;
21611 if (immediate_operand (src1, mode))
21612 return true;
21614 /* Lowest priority is that memory references should come second. */
21615 if (MEM_P (src2))
21616 return false;
21617 if (MEM_P (src1))
21618 return true;
21620 return false;
21624 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21625 destination to use for the operation. If different from the true
21626 destination in operands[0], a copy operation will be required. */
21629 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21630 rtx operands[])
21632 rtx dst = operands[0];
21633 rtx src1 = operands[1];
21634 rtx src2 = operands[2];
21636 /* Canonicalize operand order. */
21637 if (ix86_swap_binary_operands_p (code, mode, operands))
21639 /* It is invalid to swap operands of different modes. */
21640 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21642 std::swap (src1, src2);
21645 /* Both source operands cannot be in memory. */
21646 if (MEM_P (src1) && MEM_P (src2))
21648 /* Optimization: Only read from memory once. */
21649 if (rtx_equal_p (src1, src2))
21651 src2 = force_reg (mode, src2);
21652 src1 = src2;
21654 else if (rtx_equal_p (dst, src1))
21655 src2 = force_reg (mode, src2);
21656 else
21657 src1 = force_reg (mode, src1);
21660 /* If the destination is memory, and we do not have matching source
21661 operands, do things in registers. */
21662 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21663 dst = gen_reg_rtx (mode);
21665 /* Source 1 cannot be a constant. */
21666 if (CONSTANT_P (src1))
21667 src1 = force_reg (mode, src1);
21669 /* Source 1 cannot be a non-matching memory. */
21670 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21671 src1 = force_reg (mode, src1);
21673 /* Improve address combine. */
21674 if (code == PLUS
21675 && GET_MODE_CLASS (mode) == MODE_INT
21676 && MEM_P (src2))
21677 src2 = force_reg (mode, src2);
21679 operands[1] = src1;
21680 operands[2] = src2;
21681 return dst;
21684 /* Similarly, but assume that the destination has already been
21685 set up properly. */
21687 void
21688 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21689 machine_mode mode, rtx operands[])
21691 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21692 gcc_assert (dst == operands[0]);
21695 /* Attempt to expand a binary operator. Make the expansion closer to the
21696 actual machine than just general_operand, which would allow 3 separate
21697 memory references (one output, two input) in a single insn. */
21699 void
21700 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21701 rtx operands[])
21703 rtx src1, src2, dst, op, clob;
21705 dst = ix86_fixup_binary_operands (code, mode, operands);
21706 src1 = operands[1];
21707 src2 = operands[2];
21709 /* Emit the instruction. */
21711 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21713 if (reload_completed
21714 && code == PLUS
21715 && !rtx_equal_p (dst, src1))
21717 /* This is going to be an LEA; avoid splitting it later. */
21718 emit_insn (op);
21720 else
21722 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21723 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21726 /* Fix up the destination if needed. */
21727 if (dst != operands[0])
21728 emit_move_insn (operands[0], dst);
21731 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21732 the given OPERANDS. */
21734 void
21735 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21736 rtx operands[])
21738 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21739 if (SUBREG_P (operands[1]))
21741 op1 = operands[1];
21742 op2 = operands[2];
21744 else if (SUBREG_P (operands[2]))
21746 op1 = operands[2];
21747 op2 = operands[1];
21749 /* Optimize (__m128i) d | (__m128i) e and similar code
21750 when d and e are float vectors into a float vector logical
21751 insn. In C/C++ without using intrinsics there is no other way
21752 to express a vector logical operation on float vectors than
21753 to cast them temporarily to integer vectors. */
21754 if (op1
21755 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21756 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21757 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21758 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21759 && SUBREG_BYTE (op1) == 0
21760 && (GET_CODE (op2) == CONST_VECTOR
21761 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21762 && SUBREG_BYTE (op2) == 0))
21763 && can_create_pseudo_p ())
21765 rtx dst;
21766 switch (GET_MODE (SUBREG_REG (op1)))
21768 case E_V4SFmode:
21769 case E_V8SFmode:
21770 case E_V16SFmode:
21771 case E_V2DFmode:
21772 case E_V4DFmode:
21773 case E_V8DFmode:
21774 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21775 if (GET_CODE (op2) == CONST_VECTOR)
21777 op2 = gen_lowpart (GET_MODE (dst), op2);
21778 op2 = force_reg (GET_MODE (dst), op2);
21780 else
21782 op1 = operands[1];
21783 op2 = SUBREG_REG (operands[2]);
21784 if (!vector_operand (op2, GET_MODE (dst)))
21785 op2 = force_reg (GET_MODE (dst), op2);
21787 op1 = SUBREG_REG (op1);
21788 if (!vector_operand (op1, GET_MODE (dst)))
21789 op1 = force_reg (GET_MODE (dst), op1);
21790 emit_insn (gen_rtx_SET (dst,
21791 gen_rtx_fmt_ee (code, GET_MODE (dst),
21792 op1, op2)));
21793 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21794 return;
21795 default:
21796 break;
21799 if (!vector_operand (operands[1], mode))
21800 operands[1] = force_reg (mode, operands[1]);
21801 if (!vector_operand (operands[2], mode))
21802 operands[2] = force_reg (mode, operands[2]);
21803 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21804 emit_insn (gen_rtx_SET (operands[0],
21805 gen_rtx_fmt_ee (code, mode, operands[1],
21806 operands[2])));
21809 /* Return TRUE or FALSE depending on whether the binary operator meets the
21810 appropriate constraints. */
21812 bool
21813 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21814 rtx operands[3])
21816 rtx dst = operands[0];
21817 rtx src1 = operands[1];
21818 rtx src2 = operands[2];
21820 /* Both source operands cannot be in memory. */
21821 if (MEM_P (src1) && MEM_P (src2))
21822 return false;
21824 /* Canonicalize operand order for commutative operators. */
21825 if (ix86_swap_binary_operands_p (code, mode, operands))
21826 std::swap (src1, src2);
21828 /* If the destination is memory, we must have a matching source operand. */
21829 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21830 return false;
21832 /* Source 1 cannot be a constant. */
21833 if (CONSTANT_P (src1))
21834 return false;
21836 /* Source 1 cannot be a non-matching memory. */
21837 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21838 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21839 return (code == AND
21840 && (mode == HImode
21841 || mode == SImode
21842 || (TARGET_64BIT && mode == DImode))
21843 && satisfies_constraint_L (src2));
21845 return true;
21848 /* Attempt to expand a unary operator. Make the expansion closer to the
21849 actual machine than just general_operand, which would allow 2 separate
21850 memory references (one output, one input) in a single insn. */
21852 void
21853 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21854 rtx operands[])
21856 bool matching_memory = false;
21857 rtx src, dst, op, clob;
21859 dst = operands[0];
21860 src = operands[1];
21862 /* If the destination is memory, and we do not have matching source
21863 operands, do things in registers. */
21864 if (MEM_P (dst))
21866 if (rtx_equal_p (dst, src))
21867 matching_memory = true;
21868 else
21869 dst = gen_reg_rtx (mode);
21872 /* When source operand is memory, destination must match. */
21873 if (MEM_P (src) && !matching_memory)
21874 src = force_reg (mode, src);
21876 /* Emit the instruction. */
21878 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21880 if (code == NOT)
21881 emit_insn (op);
21882 else
21884 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21885 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21888 /* Fix up the destination if needed. */
21889 if (dst != operands[0])
21890 emit_move_insn (operands[0], dst);
21893 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
21894 divisor are within the range [0-255]. */
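/* Roughly (an illustrative sketch of the expansion below, SImode unsigned
   case):

	mov	op2, scratch
	or	op3, scratch
	test	$-0x100, scratch
	je	.Lqimode
	<full 32-bit divide>
	jmp	.Lend
     .Lqimode:
	<8-bit divb via udivmodhiqi3; remainder taken from AH,
	 quotient zero-extended from AL>
     .Lend:  */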
21896 void
21897 ix86_split_idivmod (machine_mode mode, rtx operands[],
21898 bool signed_p)
21900 rtx_code_label *end_label, *qimode_label;
21901 rtx div, mod;
21902 rtx_insn *insn;
21903 rtx scratch, tmp0, tmp1, tmp2;
21904 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21905 rtx (*gen_zero_extend) (rtx, rtx);
21906 rtx (*gen_test_ccno_1) (rtx, rtx);
21908 switch (mode)
21910 case E_SImode:
21911 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21912 gen_test_ccno_1 = gen_testsi_ccno_1;
21913 gen_zero_extend = gen_zero_extendqisi2;
21914 break;
21915 case E_DImode:
21916 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21917 gen_test_ccno_1 = gen_testdi_ccno_1;
21918 gen_zero_extend = gen_zero_extendqidi2;
21919 break;
21920 default:
21921 gcc_unreachable ();
21924 end_label = gen_label_rtx ();
21925 qimode_label = gen_label_rtx ();
21927 scratch = gen_reg_rtx (mode);
21929 /* Use 8bit unsigned divmod if dividend and divisor are within
21930 the range [0-255]. */
21931 emit_move_insn (scratch, operands[2]);
21932 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21933 scratch, 1, OPTAB_DIRECT);
21934 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
21935 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21936 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21937 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21938 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21939 pc_rtx);
21940 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21941 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21942 JUMP_LABEL (insn) = qimode_label;
21944 /* Generate the original signed/unsigned divmod. */
21945 div = gen_divmod4_1 (operands[0], operands[1],
21946 operands[2], operands[3]);
21947 emit_insn (div);
21949 /* Branch to the end. */
21950 emit_jump_insn (gen_jump (end_label));
21951 emit_barrier ();
21953 /* Generate 8bit unsigned divide. */
21954 emit_label (qimode_label);
21955 /* Don't use operands[0] for result of 8bit divide since not all
21956 registers support QImode ZERO_EXTRACT. */
21957 tmp0 = lowpart_subreg (HImode, scratch, mode);
21958 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21959 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21960 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21962 if (signed_p)
21964 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21965 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21967 else
21969 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21970 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21973 /* Extract remainder from AH. */
21974 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21975 if (REG_P (operands[1]))
21976 insn = emit_move_insn (operands[1], tmp1);
21977 else
21979 /* Need a new scratch register since the old one has result
21980 of 8bit divide. */
21981 scratch = gen_reg_rtx (mode);
21982 emit_move_insn (scratch, tmp1);
21983 insn = emit_move_insn (operands[1], scratch);
21985 set_unique_reg_note (insn, REG_EQUAL, mod);
21987 /* Zero extend quotient from AL. */
21988 tmp1 = gen_lowpart (QImode, tmp0);
21989 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21990 set_unique_reg_note (insn, REG_EQUAL, div);
21992 emit_label (end_label);
21995 #define LEA_MAX_STALL (3)
21996 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
21998 /* Increase the given DISTANCE in half-cycles according to
21999 dependencies between the PREV and NEXT instructions.
22000 Add 1 half-cycle if there is no dependency and
22001 go to the next cycle if there is some dependency. */
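/* E.g. with DISTANCE == 1 and a true dependency from PREV to NEXT the result
   is 1 + (1 & 1) + 2 == 4 half-cycles, while an independent pair simply
   becomes 2.  */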
22003 static unsigned int
22004 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
22006 df_ref def, use;
22008 if (!prev || !next)
22009 return distance + (distance & 1) + 2;
22011 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
22012 return distance + 1;
22014 FOR_EACH_INSN_USE (use, next)
22015 FOR_EACH_INSN_DEF (def, prev)
22016 if (!DF_REF_IS_ARTIFICIAL (def)
22017 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
22018 return distance + (distance & 1) + 2;
22020 return distance + 1;
22023 /* Function checks if instruction INSN defines register number
22024 REGNO1 or REGNO2. */
22026 static bool
22027 insn_defines_reg (unsigned int regno1, unsigned int regno2,
22028 rtx_insn *insn)
22030 df_ref def;
22032 FOR_EACH_INSN_DEF (def, insn)
22033 if (DF_REF_REG_DEF_P (def)
22034 && !DF_REF_IS_ARTIFICIAL (def)
22035 && (regno1 == DF_REF_REGNO (def)
22036 || regno2 == DF_REF_REGNO (def)))
22037 return true;
22039 return false;
22042 /* Function checks if instruction INSN uses register number
22043 REGNO as a part of address expression. */
22045 static bool
22046 insn_uses_reg_mem (unsigned int regno, rtx insn)
22048 df_ref use;
22050 FOR_EACH_INSN_USE (use, insn)
22051 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
22052 return true;
22054 return false;
22057 /* Search backward for non-agu definition of register number REGNO1
22058 or register number REGNO2 in basic block starting from instruction
22059 START up to head of basic block or instruction INSN.
22061 The function puts true into *FOUND if a definition was found
22062 and false otherwise.
22064 Distance in half-cycles between START and found instruction or head
22065 of BB is added to DISTANCE and returned. */
22067 static int
22068 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
22069 rtx_insn *insn, int distance,
22070 rtx_insn *start, bool *found)
22072 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
22073 rtx_insn *prev = start;
22074 rtx_insn *next = NULL;
22076 *found = false;
22078 while (prev
22079 && prev != insn
22080 && distance < LEA_SEARCH_THRESHOLD)
22082 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
22084 distance = increase_distance (prev, next, distance);
22085 if (insn_defines_reg (regno1, regno2, prev))
22087 if (recog_memoized (prev) < 0
22088 || get_attr_type (prev) != TYPE_LEA)
22090 *found = true;
22091 return distance;
22095 next = prev;
22097 if (prev == BB_HEAD (bb))
22098 break;
22100 prev = PREV_INSN (prev);
22103 return distance;
22106 /* Search backward for non-agu definition of register number REGNO1
22107 or register number REGNO2 in INSN's basic block until
22108 1. Pass LEA_SEARCH_THRESHOLD instructions, or
22109 2. Reach neighbor BBs boundary, or
22110 3. Reach agu definition.
22111 Returns the distance between the non-agu definition point and INSN.
22112 If no definition point, returns -1. */
22114 static int
22115 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
22116 rtx_insn *insn)
22118 basic_block bb = BLOCK_FOR_INSN (insn);
22119 int distance = 0;
22120 bool found = false;
22122 if (insn != BB_HEAD (bb))
22123 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
22124 distance, PREV_INSN (insn),
22125 &found);
22127 if (!found && distance < LEA_SEARCH_THRESHOLD)
22129 edge e;
22130 edge_iterator ei;
22131 bool simple_loop = false;
22133 FOR_EACH_EDGE (e, ei, bb->preds)
22134 if (e->src == bb)
22136 simple_loop = true;
22137 break;
22140 if (simple_loop)
22141 distance = distance_non_agu_define_in_bb (regno1, regno2,
22142 insn, distance,
22143 BB_END (bb), &found);
22144 else
22146 int shortest_dist = -1;
22147 bool found_in_bb = false;
22149 FOR_EACH_EDGE (e, ei, bb->preds)
22151 int bb_dist
22152 = distance_non_agu_define_in_bb (regno1, regno2,
22153 insn, distance,
22154 BB_END (e->src),
22155 &found_in_bb);
22156 if (found_in_bb)
22158 if (shortest_dist < 0)
22159 shortest_dist = bb_dist;
22160 else if (bb_dist > 0)
22161 shortest_dist = MIN (bb_dist, shortest_dist);
22163 found = true;
22167 distance = shortest_dist;
22171 /* get_attr_type may modify recog data. We want to make sure
22172 that recog data is valid for instruction INSN, on which
22173 distance_non_agu_define is called. INSN is unchanged here. */
22174 extract_insn_cached (insn);
22176 if (!found)
22177 return -1;
22179 return distance >> 1;
22182 /* Return the distance in half-cycles between INSN and the next
22183 insn that uses register number REGNO in a memory address, added
22184 to DISTANCE. Return -1 if REGNO is set.
22186 Put true value into *FOUND if register usage was found and
22187 false otherwise.
22188 Put true value into *REDEFINED if register redefinition was
22189 found and false otherwise. */
22191 static int
22192 distance_agu_use_in_bb (unsigned int regno,
22193 rtx_insn *insn, int distance, rtx_insn *start,
22194 bool *found, bool *redefined)
22196 basic_block bb = NULL;
22197 rtx_insn *next = start;
22198 rtx_insn *prev = NULL;
22200 *found = false;
22201 *redefined = false;
22203 if (start != NULL_RTX)
22205 bb = BLOCK_FOR_INSN (start);
22206 if (start != BB_HEAD (bb))
22207 /* If insn and start belong to the same bb, set prev to insn,
22208 so the call to increase_distance will increase the distance
22209 between insns by 1. */
22210 prev = insn;
22213 while (next
22214 && next != insn
22215 && distance < LEA_SEARCH_THRESHOLD)
22217 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
22219 distance = increase_distance(prev, next, distance);
22220 if (insn_uses_reg_mem (regno, next))
22222 /* Return DISTANCE if OP0 is used in memory
22223 address in NEXT. */
22224 *found = true;
22225 return distance;
22228 if (insn_defines_reg (regno, INVALID_REGNUM, next))
22230 /* Return -1 if OP0 is set in NEXT. */
22231 *redefined = true;
22232 return -1;
22235 prev = next;
22238 if (next == BB_END (bb))
22239 break;
22241 next = NEXT_INSN (next);
22244 return distance;
22247 /* Return the distance between INSN and the next insn that uses
22248 register number REGNO0 in a memory address. Return -1 if no such
22249 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
22251 static int
22252 distance_agu_use (unsigned int regno0, rtx_insn *insn)
22254 basic_block bb = BLOCK_FOR_INSN (insn);
22255 int distance = 0;
22256 bool found = false;
22257 bool redefined = false;
22259 if (insn != BB_END (bb))
22260 distance = distance_agu_use_in_bb (regno0, insn, distance,
22261 NEXT_INSN (insn),
22262 &found, &redefined);
22264 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
22266 edge e;
22267 edge_iterator ei;
22268 bool simple_loop = false;
22270 FOR_EACH_EDGE (e, ei, bb->succs)
22271 if (e->dest == bb)
22273 simple_loop = true;
22274 break;
22277 if (simple_loop)
22278 distance = distance_agu_use_in_bb (regno0, insn,
22279 distance, BB_HEAD (bb),
22280 &found, &redefined);
22281 else
22283 int shortest_dist = -1;
22284 bool found_in_bb = false;
22285 bool redefined_in_bb = false;
22287 FOR_EACH_EDGE (e, ei, bb->succs)
22289 int bb_dist
22290 = distance_agu_use_in_bb (regno0, insn,
22291 distance, BB_HEAD (e->dest),
22292 &found_in_bb, &redefined_in_bb);
22293 if (found_in_bb)
22295 if (shortest_dist < 0)
22296 shortest_dist = bb_dist;
22297 else if (bb_dist > 0)
22298 shortest_dist = MIN (bb_dist, shortest_dist);
22300 found = true;
22304 distance = shortest_dist;
22308 if (!found || redefined)
22309 return -1;
22311 return distance >> 1;
22314 /* Define this macro to tune LEA priority vs ADD; it takes effect when
22315 there is a dilemma of choosing LEA or ADD.
22316 Negative value: ADD is preferred over LEA
22317 Zero: Neutral
22318 Positive value: LEA is preferred over ADD. */
22319 #define IX86_LEA_PRIORITY 0
22321 /* Return true if use of the lea INSN has a performance advantage
22322 over a sequence of instructions. The instruction sequence has
22323 SPLIT_COST cycles higher latency than the lea latency. */
22325 static bool
22326 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
22327 unsigned int regno2, int split_cost, bool has_scale)
22329 int dist_define, dist_use;
22331 /* For Silvermont, if a 2-source or 3-source LEA is used for a
22332 non-destructive destination, or because the ability to use
22333 SCALE is wanted, the use of LEA is justified. */
22334 if (TARGET_SILVERMONT || TARGET_INTEL)
22336 if (has_scale)
22337 return true;
22338 if (split_cost < 1)
22339 return false;
22340 if (regno0 == regno1 || regno0 == regno2)
22341 return false;
22342 return true;
22345 dist_define = distance_non_agu_define (regno1, regno2, insn);
22346 dist_use = distance_agu_use (regno0, insn);
22348 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
22350 /* If there is no non-AGU operand definition, no AGU
22351 operand usage and the split cost is 0, then both the lea
22352 and non-lea variants have the same priority. Currently
22353 we prefer lea for 64 bit code and non-lea for 32 bit
22354 code. */
22355 if (dist_use < 0 && split_cost == 0)
22356 return TARGET_64BIT || IX86_LEA_PRIORITY;
22357 else
22358 return true;
22361 /* With a longer definition distance, lea is preferable.
22362 Here we adjust it to take into account the splitting cost and
22363 the lea priority. */
22364 dist_define += split_cost + IX86_LEA_PRIORITY;
22366 /* If there is no use in a memory address then we just check
22367 that the split cost exceeds the AGU stall. */
22368 if (dist_use < 0)
22369 return dist_define > LEA_MAX_STALL;
22371 /* If this insn has both a backward non-AGU dependence and a forward
22372 AGU dependence, the one with the shorter distance takes effect. */
22373 return dist_define >= dist_use;
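/* Illustrative reading of the heuristic above (the numbers are
   hypothetical): with IX86_LEA_PRIORITY == 0 and split_cost == 1, a lea
   whose inputs were defined at distance 3 and whose result feeds an
   address at distance 2 gives dist_define == 3 + 1 + 0 == 4 >= 2, so the
   lea form is kept; had the address use been further away than the
   adjusted definition distance, the split sequence would win instead.  */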
22376 /* Return true if it is legal to clobber flags by INSN and
22377 false otherwise. */
22379 static bool
22380 ix86_ok_to_clobber_flags (rtx_insn *insn)
22382 basic_block bb = BLOCK_FOR_INSN (insn);
22383 df_ref use;
22384 bitmap live;
22386 while (insn)
22388 if (NONDEBUG_INSN_P (insn))
22390 FOR_EACH_INSN_USE (use, insn)
22391 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
22392 return false;
22394 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
22395 return true;
22398 if (insn == BB_END (bb))
22399 break;
22401 insn = NEXT_INSN (insn);
22404 live = df_get_live_out (bb);
22405 return !REGNO_REG_SET_P (live, FLAGS_REG);
22408 /* Return true if we need to split op0 = op1 + op2 into a sequence of
22409 move and add to avoid AGU stalls. */
22411 bool
22412 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
22414 unsigned int regno0, regno1, regno2;
22416 /* Check if we need to optimize. */
22417 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22418 return false;
22420 /* Check it is correct to split here. */
22421 if (!ix86_ok_to_clobber_flags (insn))
22422 return false;
22424 regno0 = true_regnum (operands[0]);
22425 regno1 = true_regnum (operands[1]);
22426 regno2 = true_regnum (operands[2]);
22428 /* We need to split only adds with a non-destructive
22429 destination operand. */
22430 if (regno0 == regno1 || regno0 == regno2)
22431 return false;
22432 else
22433 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
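/* Illustrative sketch (hypothetical registers): when this function
   returns true for a three-operand add such as r0 = r1 + r2, the
   pattern is split into the ALU sequence

       mov r1, r0
       add r2, r0

   whose extra mov is the split_cost of 1 passed to
   ix86_lea_outperforms above.  */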
22436 /* Return true if we should emit lea instruction instead of mov
22437 instruction. */
22439 bool
22440 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
22442 unsigned int regno0, regno1;
22444 /* Check if we need to optimize. */
22445 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22446 return false;
22448 /* Use lea for reg to reg moves only. */
22449 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22450 return false;
22452 regno0 = true_regnum (operands[0]);
22453 regno1 = true_regnum (operands[1]);
22455 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22458 /* Return true if we need to split lea into a sequence of
22459 instructions to avoid AGU stalls. */
22461 bool
22462 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22464 unsigned int regno0, regno1, regno2;
22465 int split_cost;
22466 struct ix86_address parts;
22467 int ok;
22469 /* Check we need to optimize. */
22470 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22471 return false;
22473 /* The "at least two components" test below might not catch simple
22474 move or zero extension insns if parts.base is non-NULL and parts.disp
22475 is const0_rtx as the only components in the address, e.g. if the
22476 register is %rbp or %r13. As this test is much cheaper and moves or
22477 zero extensions are the common case, do this check first. */
22478 if (REG_P (operands[1])
22479 || (SImode_address_operand (operands[1], VOIDmode)
22480 && REG_P (XEXP (operands[1], 0))))
22481 return false;
22483 /* Check if it is OK to split here. */
22484 if (!ix86_ok_to_clobber_flags (insn))
22485 return false;
22487 ok = ix86_decompose_address (operands[1], &parts);
22488 gcc_assert (ok);
22490 /* There should be at least two components in the address. */
22491 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22492 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22493 return false;
22495 /* We should not split into an add if a non-legitimate PIC
22496 operand is used as the displacement. */
22497 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22498 return false;
22500 regno0 = true_regnum (operands[0]);
22501 regno1 = INVALID_REGNUM;
22502 regno2 = INVALID_REGNUM;
22504 if (parts.base)
22505 regno1 = true_regnum (parts.base);
22506 if (parts.index)
22507 regno2 = true_regnum (parts.index);
22509 split_cost = 0;
22511 /* Compute how many cycles we will add to the execution time
22512 if we split the lea into a sequence of instructions. */
22513 if (parts.base || parts.index)
22515 /* Have to use a mov instruction if the non-destructive
22516 destination form is used. */
22517 if (regno1 != regno0 && regno2 != regno0)
22518 split_cost += 1;
22520 /* Have to add index to base if both exist. */
22521 if (parts.base && parts.index)
22522 split_cost += 1;
22524 /* Have to use shift and adds if scale is 2 or greater. */
22525 if (parts.scale > 1)
22527 if (regno0 != regno1)
22528 split_cost += 1;
22529 else if (regno2 == regno0)
22530 split_cost += 4;
22531 else
22532 split_cost += parts.scale;
22535 /* Have to use an add instruction with an immediate if
22536 disp is nonzero. */
22537 if (parts.disp && parts.disp != const0_rtx)
22538 split_cost += 1;
22540 /* Subtract the price of lea. */
22541 split_cost -= 1;
22544 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22545 parts.scale > 1);
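/* Worked cost example (hypothetical operands): for a lea with a base, an
   index, no scale and a nonzero displacement, written to a register
   distinct from both inputs, the accounting above gives
   split_cost = 1 (mov) + 1 (add index) + 1 (add disp) - 1 (the lea itself)
   = 2, i.e. the split sequence is two cycles longer than the lea.  */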
22548 /* Emit x86 binary operator CODE in mode MODE, where the first operand
22549 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
22551 static void
22552 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22553 rtx dst, rtx src)
22555 rtx op, clob;
22557 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22558 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22560 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
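/* For illustration, ix86_emit_binop (PLUS, SImode, dst, src) emits a
   parallel of the shape

       (parallel [(set (reg:SI dst) (plus:SI (reg:SI dst) (reg:SI src)))
                  (clobber (reg:CC flags))])

   i.e. the two-address form with the flags clobber that the arithmetic
   patterns in the machine description expect.  */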
22563 /* Return true if the definition of regno1 is nearer to INSN than that of regno2. */
22565 static bool
22566 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22568 rtx_insn *prev = insn;
22569 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22571 if (insn == start)
22572 return false;
22573 while (prev && prev != start)
22575 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22577 prev = PREV_INSN (prev);
22578 continue;
22580 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22581 return true;
22582 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22583 return false;
22584 prev = PREV_INSN (prev);
22587 /* None of the regs is defined in the bb. */
22588 return false;
22591 /* Split lea instructions into a sequence of instructions
22592 which are executed on the ALU to avoid AGU stalls.
22593 It is assumed that it is allowed to clobber the flags register
22594 at the lea position. */
22596 void
22597 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22599 unsigned int regno0, regno1, regno2;
22600 struct ix86_address parts;
22601 rtx target, tmp;
22602 int ok, adds;
22604 ok = ix86_decompose_address (operands[1], &parts);
22605 gcc_assert (ok);
22607 target = gen_lowpart (mode, operands[0]);
22609 regno0 = true_regnum (target);
22610 regno1 = INVALID_REGNUM;
22611 regno2 = INVALID_REGNUM;
22613 if (parts.base)
22615 parts.base = gen_lowpart (mode, parts.base);
22616 regno1 = true_regnum (parts.base);
22619 if (parts.index)
22621 parts.index = gen_lowpart (mode, parts.index);
22622 regno2 = true_regnum (parts.index);
22625 if (parts.disp)
22626 parts.disp = gen_lowpart (mode, parts.disp);
22628 if (parts.scale > 1)
22630 /* Case r1 = r1 + ... */
22631 if (regno1 == regno0)
22633 /* If we have a case r1 = r1 + C * r2 then we
22634 would have to use multiplication which is very
22635 expensive. Assume the cost model is wrong if we
22636 have such a case here. */
22637 gcc_assert (regno2 != regno0);
22639 for (adds = parts.scale; adds > 0; adds--)
22640 ix86_emit_binop (PLUS, mode, target, parts.index);
22642 else
22644 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22645 if (regno0 != regno2)
22646 emit_insn (gen_rtx_SET (target, parts.index));
22648 /* Use shift for scaling. */
22649 ix86_emit_binop (ASHIFT, mode, target,
22650 GEN_INT (exact_log2 (parts.scale)));
22652 if (parts.base)
22653 ix86_emit_binop (PLUS, mode, target, parts.base);
22655 if (parts.disp && parts.disp != const0_rtx)
22656 ix86_emit_binop (PLUS, mode, target, parts.disp);
22659 else if (!parts.base && !parts.index)
22661 gcc_assert (parts.disp);
22662 emit_insn (gen_rtx_SET (target, parts.disp));
22664 else
22666 if (!parts.base)
22668 if (regno0 != regno2)
22669 emit_insn (gen_rtx_SET (target, parts.index));
22671 else if (!parts.index)
22673 if (regno0 != regno1)
22674 emit_insn (gen_rtx_SET (target, parts.base));
22676 else
22678 if (regno0 == regno1)
22679 tmp = parts.index;
22680 else if (regno0 == regno2)
22681 tmp = parts.base;
22682 else
22684 rtx tmp1;
22686 /* Find better operand for SET instruction, depending
22687 on which definition is farther from the insn. */
22688 if (find_nearest_reg_def (insn, regno1, regno2))
22689 tmp = parts.index, tmp1 = parts.base;
22690 else
22691 tmp = parts.base, tmp1 = parts.index;
22693 emit_insn (gen_rtx_SET (target, tmp));
22695 if (parts.disp && parts.disp != const0_rtx)
22696 ix86_emit_binop (PLUS, mode, target, parts.disp);
22698 ix86_emit_binop (PLUS, mode, target, tmp1);
22699 return;
22702 ix86_emit_binop (PLUS, mode, target, tmp);
22705 if (parts.disp && parts.disp != const0_rtx)
22706 ix86_emit_binop (PLUS, mode, target, parts.disp);
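/* Worked example of the splitting above (hypothetical registers): with a
   base, an index, scale 4, displacement 4 and a destination distinct
   from both inputs,

       lea 0x4(%ebx,%ecx,4), %eax

   becomes, in effect,

       mov %ecx, %eax   ; copy the index into the destination
       shl $2, %eax     ; scale by 4 via shift
       add %ebx, %eax   ; add the base
       add $0x4, %eax   ; add the displacement

   all executed on the ALU rather than the AGU.  */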
22710 /* Return true if it is ok to optimize an ADD operation to a LEA
22711 operation to avoid flag register consumption. For most processors,
22712 ADD is faster than LEA. For processors like BONNELL, if the
22713 destination register of the LEA holds an actual address which will be
22714 used soon, LEA is better; otherwise ADD is better. */
22716 bool
22717 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22719 unsigned int regno0 = true_regnum (operands[0]);
22720 unsigned int regno1 = true_regnum (operands[1]);
22721 unsigned int regno2 = true_regnum (operands[2]);
22723 /* If a = b + c, (a!=b && a!=c), we must use the lea form. */
22724 if (regno0 != regno1 && regno0 != regno2)
22725 return true;
22727 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22728 return false;
22730 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
22733 /* Return true if destination reg of SET_BODY is shift count of
22734 USE_BODY. */
22736 static bool
22737 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22739 rtx set_dest;
22740 rtx shift_rtx;
22741 int i;
22743 /* Retrieve destination of SET_BODY. */
22744 switch (GET_CODE (set_body))
22746 case SET:
22747 set_dest = SET_DEST (set_body);
22748 if (!set_dest || !REG_P (set_dest))
22749 return false;
22750 break;
22751 case PARALLEL:
22752 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22753 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22754 use_body))
22755 return true;
22756 /* FALLTHROUGH */
22757 default:
22758 return false;
22761 /* Retrieve shift count of USE_BODY. */
22762 switch (GET_CODE (use_body))
22764 case SET:
22765 shift_rtx = XEXP (use_body, 1);
22766 break;
22767 case PARALLEL:
22768 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22769 if (ix86_dep_by_shift_count_body (set_body,
22770 XVECEXP (use_body, 0, i)))
22771 return true;
22772 /* FALLTHROUGH */
22773 default:
22774 return false;
22777 if (shift_rtx
22778 && (GET_CODE (shift_rtx) == ASHIFT
22779 || GET_CODE (shift_rtx) == LSHIFTRT
22780 || GET_CODE (shift_rtx) == ASHIFTRT
22781 || GET_CODE (shift_rtx) == ROTATE
22782 || GET_CODE (shift_rtx) == ROTATERT))
22784 rtx shift_count = XEXP (shift_rtx, 1);
22786 /* Return true if shift count is dest of SET_BODY. */
22787 if (REG_P (shift_count))
22789 /* Add a check since this can be invoked before register
22790 allocation by the pre-reload scheduler. */
22791 if (reload_completed
22792 && true_regnum (set_dest) == true_regnum (shift_count))
22793 return true;
22794 else if (REGNO (set_dest) == REGNO (shift_count))
22795 return true;
22799 return false;
22802 /* Return true if destination reg of SET_INSN is shift count of
22803 USE_INSN. */
22805 bool
22806 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22808 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22809 PATTERN (use_insn));
22812 /* Return TRUE or FALSE depending on whether the unary operator meets the
22813 appropriate constraints. */
22815 bool
22816 ix86_unary_operator_ok (enum rtx_code,
22817 machine_mode,
22818 rtx operands[2])
22820 /* If one of operands is memory, source and destination must match. */
22821 if ((MEM_P (operands[0])
22822 || MEM_P (operands[1]))
22823 && ! rtx_equal_p (operands[0], operands[1]))
22824 return false;
22825 return true;
22828 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22829 are ok, keeping in mind the possible movddup alternative. */
22831 bool
22832 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22834 if (MEM_P (operands[0]))
22835 return rtx_equal_p (operands[0], operands[1 + high]);
22836 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22837 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22838 return true;
22841 /* Post-reload splitter for converting an SF or DFmode value in an
22842 SSE register into an unsigned SImode. */
22844 void
22845 ix86_split_convert_uns_si_sse (rtx operands[])
22847 machine_mode vecmode;
22848 rtx value, large, zero_or_two31, input, two31, x;
22850 large = operands[1];
22851 zero_or_two31 = operands[2];
22852 input = operands[3];
22853 two31 = operands[4];
22854 vecmode = GET_MODE (large);
22855 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22857 /* Load up the value into the low element. We must ensure that the other
22858 elements are valid floats -- zero is the easiest such value. */
22859 if (MEM_P (input))
22861 if (vecmode == V4SFmode)
22862 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22863 else
22864 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22866 else
22868 input = gen_rtx_REG (vecmode, REGNO (input));
22869 emit_move_insn (value, CONST0_RTX (vecmode));
22870 if (vecmode == V4SFmode)
22871 emit_insn (gen_sse_movss (value, value, input));
22872 else
22873 emit_insn (gen_sse2_movsd (value, value, input));
22876 emit_move_insn (large, two31);
22877 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22879 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22880 emit_insn (gen_rtx_SET (large, x));
22882 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22883 emit_insn (gen_rtx_SET (zero_or_two31, x));
22885 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22886 emit_insn (gen_rtx_SET (value, x));
22888 large = gen_rtx_REG (V4SImode, REGNO (large));
22889 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22891 x = gen_rtx_REG (V4SImode, REGNO (value));
22892 if (vecmode == V4SFmode)
22893 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22894 else
22895 emit_insn (gen_sse2_cvttpd2dq (x, value));
22896 value = x;
22898 emit_insn (gen_xorv4si3 (value, value, large));
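/* Worked example (hypothetical input): converting 3500000000.0, which
   exceeds INT_MAX.  The LE compare sets the lane mask since 2^31 <= value,
   so zero_or_two31 becomes 2^31 and the subtraction leaves 1352516352.0,
   which the signed truncating conversion handles exactly; the mask shifted
   left by 31 is 0x80000000, and the final xor yields
   1352516352 ^ 0x80000000 == 3500000000, the correct unsigned result.
   Inputs below 2^31 leave the mask zero, making both the subtraction and
   the xor no-ops.  */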
22901 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22902 Expects the 64-bit DImode to be supplied in a pair of integral
22903 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22904 -mfpmath=sse, !optimize_size only. */
22906 void
22907 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22909 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22910 rtx int_xmm, fp_xmm;
22911 rtx biases, exponents;
22912 rtx x;
22914 int_xmm = gen_reg_rtx (V4SImode);
22915 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22916 emit_insn (gen_movdi_to_sse (int_xmm, input));
22917 else if (TARGET_SSE_SPLIT_REGS)
22919 emit_clobber (int_xmm);
22920 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22922 else
22924 x = gen_reg_rtx (V2DImode);
22925 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22926 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22929 x = gen_rtx_CONST_VECTOR (V4SImode,
22930 gen_rtvec (4, GEN_INT (0x43300000UL),
22931 GEN_INT (0x45300000UL),
22932 const0_rtx, const0_rtx));
22933 exponents = validize_mem (force_const_mem (V4SImode, x));
22935 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22936 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22938 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
22939 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22940 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22941 (0x1.0p84 + double(fp_value_hi_xmm)).
22942 Note these exponents differ by 32. */
22944 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22946 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22947 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22948 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22949 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22950 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22951 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22952 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22953 biases = validize_mem (force_const_mem (V2DFmode, biases));
22954 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22956 /* Add the upper and lower DFmode values together. */
22957 if (TARGET_SSE3)
22958 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22959 else
22961 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22962 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22963 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22966 ix86_expand_vector_extract (false, target, fp_xmm, 0);
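/* Illustrative arithmetic for the bias trick above: for an input
   u = hi * 2^32 + lo, the interleave forms the two doubles

       d_lo = 0x1.0p52 + (double) lo          (words 0x43300000 : lo)
       d_hi = 0x1.0p84 + (double) hi * 2^32   (words 0x45300000 : hi)

   Subtracting the 0x1.0p52 and 0x1.0p84 biases is exact and leaves
   (double) lo and (double) hi * 2^32; the final addition of the two
   halves then produces (double) u with a single rounding.  */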
22969 /* Not used, but eases macroization of patterns. */
22970 void
22971 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22973 gcc_unreachable ();
22976 /* Convert an unsigned SImode value into a DFmode. Currently only used
22977 for SSE, but applicable anywhere. */
22979 void
22980 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22982 REAL_VALUE_TYPE TWO31r;
22983 rtx x, fp;
22985 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22986 NULL, 1, OPTAB_DIRECT);
22988 fp = gen_reg_rtx (DFmode);
22989 emit_insn (gen_floatsidf2 (fp, x));
22991 real_ldexp (&TWO31r, &dconst1, 31);
22992 x = const_double_from_real_value (TWO31r, DFmode);
22994 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22995 if (x != target)
22996 emit_move_insn (target, x);
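/* Worked example (hypothetical input): for 0xffffffff the PLUS of
   -2147483648 wraps to 0x7fffffff, the signed conversion gives
   2147483647.0, and adding 2^31 back yields 4294967295.0, the correct
   unsigned value; both steps are exact since DFmode has a 53-bit
   mantissa.  */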
22999 /* Convert a signed DImode value into a DFmode. Only used for SSE in
23000 32-bit mode; otherwise we have a direct convert instruction. */
23002 void
23003 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
23005 REAL_VALUE_TYPE TWO32r;
23006 rtx fp_lo, fp_hi, x;
23008 fp_lo = gen_reg_rtx (DFmode);
23009 fp_hi = gen_reg_rtx (DFmode);
23011 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
23013 real_ldexp (&TWO32r, &dconst1, 32);
23014 x = const_double_from_real_value (TWO32r, DFmode);
23015 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
23017 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
23019 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
23020 0, OPTAB_DIRECT);
23021 if (x != target)
23022 emit_move_insn (target, x);
23025 /* Convert an unsigned SImode value into a SFmode, using only SSE.
23026 For x86_32, -mfpmath=sse, !optimize_size only. */
23027 void
23028 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
23030 REAL_VALUE_TYPE ONE16r;
23031 rtx fp_hi, fp_lo, int_hi, int_lo, x;
23033 real_ldexp (&ONE16r, &dconst1, 16);
23034 x = const_double_from_real_value (ONE16r, SFmode);
23035 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
23036 NULL, 0, OPTAB_DIRECT);
23037 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
23038 NULL, 0, OPTAB_DIRECT);
23039 fp_hi = gen_reg_rtx (SFmode);
23040 fp_lo = gen_reg_rtx (SFmode);
23041 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
23042 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
23043 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
23044 0, OPTAB_DIRECT);
23045 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
23046 0, OPTAB_DIRECT);
23047 if (!rtx_equal_p (target, fp_hi))
23048 emit_move_insn (target, fp_hi);
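/* Worked example (hypothetical input): for 0xdeadbeef the halves 0xdead
   and 0xbeef both fit in SFmode's 24-bit mantissa, so the two conversions
   and the multiply by 65536.0 are exact; the single rounding happens in
   the final addition, giving the correctly rounded SFmode value of the
   unsigned input without needing an unsigned convert instruction.  */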
23051 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
23052 a vector of unsigned ints VAL to vector of floats TARGET. */
23054 void
23055 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
23057 rtx tmp[8];
23058 REAL_VALUE_TYPE TWO16r;
23059 machine_mode intmode = GET_MODE (val);
23060 machine_mode fltmode = GET_MODE (target);
23061 rtx (*cvt) (rtx, rtx);
23063 if (intmode == V4SImode)
23064 cvt = gen_floatv4siv4sf2;
23065 else
23066 cvt = gen_floatv8siv8sf2;
23067 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
23068 tmp[0] = force_reg (intmode, tmp[0]);
23069 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
23070 OPTAB_DIRECT);
23071 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
23072 NULL_RTX, 1, OPTAB_DIRECT);
23073 tmp[3] = gen_reg_rtx (fltmode);
23074 emit_insn (cvt (tmp[3], tmp[1]));
23075 tmp[4] = gen_reg_rtx (fltmode);
23076 emit_insn (cvt (tmp[4], tmp[2]));
23077 real_ldexp (&TWO16r, &dconst1, 16);
23078 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
23079 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
23080 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
23081 OPTAB_DIRECT);
23082 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
23083 OPTAB_DIRECT);
23084 if (tmp[7] != target)
23085 emit_move_insn (target, tmp[7]);
23088 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
23089 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
23090 This is done by doing just signed conversion if < 0x1p31, and otherwise by
23091 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
23094 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
23096 REAL_VALUE_TYPE TWO31r;
23097 rtx two31r, tmp[4];
23098 machine_mode mode = GET_MODE (val);
23099 machine_mode scalarmode = GET_MODE_INNER (mode);
23100 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
23101 rtx (*cmp) (rtx, rtx, rtx, rtx);
23102 int i;
23104 for (i = 0; i < 3; i++)
23105 tmp[i] = gen_reg_rtx (mode);
23106 real_ldexp (&TWO31r, &dconst1, 31);
23107 two31r = const_double_from_real_value (TWO31r, scalarmode);
23108 two31r = ix86_build_const_vector (mode, 1, two31r);
23109 two31r = force_reg (mode, two31r);
23110 switch (mode)
23112 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
23113 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
23114 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
23115 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
23116 default: gcc_unreachable ();
23118 tmp[3] = gen_rtx_LE (mode, two31r, val);
23119 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
23120 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
23121 0, OPTAB_DIRECT);
23122 if (intmode == V4SImode || TARGET_AVX2)
23123 *xorp = expand_simple_binop (intmode, ASHIFT,
23124 gen_lowpart (intmode, tmp[0]),
23125 GEN_INT (31), NULL_RTX, 0,
23126 OPTAB_DIRECT);
23127 else
23129 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
23130 two31 = ix86_build_const_vector (intmode, 1, two31);
23131 *xorp = expand_simple_binop (intmode, AND,
23132 gen_lowpart (intmode, tmp[0]),
23133 two31, NULL_RTX, 0,
23134 OPTAB_DIRECT);
23136 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
23137 0, OPTAB_DIRECT);
23140 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
23141 then replicate the value for all elements of the vector
23142 register. */
23145 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
23147 int i, n_elt;
23148 rtvec v;
23149 machine_mode scalar_mode;
23151 switch (mode)
23153 case E_V64QImode:
23154 case E_V32QImode:
23155 case E_V16QImode:
23156 case E_V32HImode:
23157 case E_V16HImode:
23158 case E_V8HImode:
23159 case E_V16SImode:
23160 case E_V8SImode:
23161 case E_V4SImode:
23162 case E_V8DImode:
23163 case E_V4DImode:
23164 case E_V2DImode:
23165 gcc_assert (vect);
23166 /* FALLTHRU */
23167 case E_V16SFmode:
23168 case E_V8SFmode:
23169 case E_V4SFmode:
23170 case E_V8DFmode:
23171 case E_V4DFmode:
23172 case E_V2DFmode:
23173 n_elt = GET_MODE_NUNITS (mode);
23174 v = rtvec_alloc (n_elt);
23175 scalar_mode = GET_MODE_INNER (mode);
23177 RTVEC_ELT (v, 0) = value;
23179 for (i = 1; i < n_elt; ++i)
23180 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
23182 return gen_rtx_CONST_VECTOR (mode, v);
23184 default:
23185 gcc_unreachable ();
23189 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
23190 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
23191 for an SSE register. If VECT is true, then replicate the mask for
23192 all elements of the vector register. If INVERT is true, then create
23193 a mask excluding the sign bit. */
23196 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
23198 machine_mode vec_mode, imode;
23199 wide_int w;
23200 rtx mask, v;
23202 switch (mode)
23204 case E_V16SImode:
23205 case E_V16SFmode:
23206 case E_V8SImode:
23207 case E_V4SImode:
23208 case E_V8SFmode:
23209 case E_V4SFmode:
23210 vec_mode = mode;
23211 imode = SImode;
23212 break;
23214 case E_V8DImode:
23215 case E_V4DImode:
23216 case E_V2DImode:
23217 case E_V8DFmode:
23218 case E_V4DFmode:
23219 case E_V2DFmode:
23220 vec_mode = mode;
23221 imode = DImode;
23222 break;
23224 case E_TImode:
23225 case E_TFmode:
23226 vec_mode = VOIDmode;
23227 imode = TImode;
23228 break;
23230 default:
23231 gcc_unreachable ();
23234 machine_mode inner_mode = GET_MODE_INNER (mode);
23235 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
23236 GET_MODE_BITSIZE (inner_mode));
23237 if (invert)
23238 w = wi::bit_not (w);
23240 /* Force this value into the low part of a fp vector constant. */
23241 mask = immed_wide_int_const (w, imode);
23242 mask = gen_lowpart (inner_mode, mask);
23244 if (vec_mode == VOIDmode)
23245 return force_reg (inner_mode, mask);
23247 v = ix86_build_const_vector (vec_mode, vect, mask);
23248 return force_reg (vec_mode, v);
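/* Example: ix86_build_signbit_mask (V4SFmode, true, false) yields the
   V4SF constant whose elements all carry the bit pattern 0x80000000,
   while invert == true yields the complementary 0x7fffffff pattern used
   to clear the sign bit.  */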
23251 /* Generate code for floating point ABS or NEG. */
23253 void
23254 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
23255 rtx operands[])
23257 rtx mask, set, dst, src;
23258 bool use_sse = false;
23259 bool vector_mode = VECTOR_MODE_P (mode);
23260 machine_mode vmode = mode;
23262 if (vector_mode)
23263 use_sse = true;
23264 else if (mode == TFmode)
23265 use_sse = true;
23266 else if (TARGET_SSE_MATH)
23268 use_sse = SSE_FLOAT_MODE_P (mode);
23269 if (mode == SFmode)
23270 vmode = V4SFmode;
23271 else if (mode == DFmode)
23272 vmode = V2DFmode;
23275 /* NEG and ABS performed with SSE use bitwise mask operations.
23276 Create the appropriate mask now. */
23277 if (use_sse)
23278 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
23279 else
23280 mask = NULL_RTX;
23282 dst = operands[0];
23283 src = operands[1];
23285 set = gen_rtx_fmt_e (code, mode, src);
23286 set = gen_rtx_SET (dst, set);
23288 if (mask)
23290 rtx use, clob;
23291 rtvec par;
23293 use = gen_rtx_USE (VOIDmode, mask);
23294 if (vector_mode)
23295 par = gen_rtvec (2, set, use);
23296 else
23298 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
23299 par = gen_rtvec (3, set, use, clob);
23301 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
23303 else
23304 emit_insn (set);
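/* Illustrative note: with SSE math on DFmode, NEG is therefore emitted as
   an XOR with the replicated 0x8000000000000000 sign-bit mask and ABS as
   an AND with its 0x7fffffffffffffff complement; the USE of the mask in
   the parallel keeps the mask operand available when the pattern is
   later split.  */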
23307 /* Expand a copysign operation. Special case operand 0 being a constant. */
23309 void
23310 ix86_expand_copysign (rtx operands[])
23312 machine_mode mode, vmode;
23313 rtx dest, op0, op1, mask, nmask;
23315 dest = operands[0];
23316 op0 = operands[1];
23317 op1 = operands[2];
23319 mode = GET_MODE (dest);
23321 if (mode == SFmode)
23322 vmode = V4SFmode;
23323 else if (mode == DFmode)
23324 vmode = V2DFmode;
23325 else
23326 vmode = mode;
23328 if (CONST_DOUBLE_P (op0))
23330 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
23332 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
23333 op0 = simplify_unary_operation (ABS, mode, op0, mode);
23335 if (mode == SFmode || mode == DFmode)
23337 if (op0 == CONST0_RTX (mode))
23338 op0 = CONST0_RTX (vmode);
23339 else
23341 rtx v = ix86_build_const_vector (vmode, false, op0);
23343 op0 = force_reg (vmode, v);
23346 else if (op0 != CONST0_RTX (mode))
23347 op0 = force_reg (mode, op0);
23349 mask = ix86_build_signbit_mask (vmode, 0, 0);
23351 if (mode == SFmode)
23352 copysign_insn = gen_copysignsf3_const;
23353 else if (mode == DFmode)
23354 copysign_insn = gen_copysigndf3_const;
23355 else
23356 copysign_insn = gen_copysigntf3_const;
23358 emit_insn (copysign_insn (dest, op0, op1, mask));
23360 else
23362 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
23364 nmask = ix86_build_signbit_mask (vmode, 0, 1);
23365 mask = ix86_build_signbit_mask (vmode, 0, 0);
23367 if (mode == SFmode)
23368 copysign_insn = gen_copysignsf3_var;
23369 else if (mode == DFmode)
23370 copysign_insn = gen_copysigndf3_var;
23371 else
23372 copysign_insn = gen_copysigntf3_var;
23374 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
23378 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
23379 be a constant, and so has already been expanded into a vector constant. */
23381 void
23382 ix86_split_copysign_const (rtx operands[])
23384 machine_mode mode, vmode;
23385 rtx dest, op0, mask, x;
23387 dest = operands[0];
23388 op0 = operands[1];
23389 mask = operands[3];
23391 mode = GET_MODE (dest);
23392 vmode = GET_MODE (mask);
23394 dest = lowpart_subreg (vmode, dest, mode);
23395 x = gen_rtx_AND (vmode, dest, mask);
23396 emit_insn (gen_rtx_SET (dest, x));
23398 if (op0 != CONST0_RTX (vmode))
23400 x = gen_rtx_IOR (vmode, dest, op0);
23401 emit_insn (gen_rtx_SET (dest, x));
23405 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
23406 so we have to do two masks. */
23408 void
23409 ix86_split_copysign_var (rtx operands[])
23411 machine_mode mode, vmode;
23412 rtx dest, scratch, op0, op1, mask, nmask, x;
23414 dest = operands[0];
23415 scratch = operands[1];
23416 op0 = operands[2];
23417 op1 = operands[3];
23418 nmask = operands[4];
23419 mask = operands[5];
23421 mode = GET_MODE (dest);
23422 vmode = GET_MODE (mask);
23424 if (rtx_equal_p (op0, op1))
23426 /* Shouldn't happen often (it's useless, obviously), but when it does
23427 we'd generate incorrect code if we continue below. */
23428 emit_move_insn (dest, op0);
23429 return;
23432 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
23434 gcc_assert (REGNO (op1) == REGNO (scratch));
23436 x = gen_rtx_AND (vmode, scratch, mask);
23437 emit_insn (gen_rtx_SET (scratch, x));
23439 dest = mask;
23440 op0 = lowpart_subreg (vmode, op0, mode);
23441 x = gen_rtx_NOT (vmode, dest);
23442 x = gen_rtx_AND (vmode, x, op0);
23443 emit_insn (gen_rtx_SET (dest, x));
23445 else
23447 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23449 x = gen_rtx_AND (vmode, scratch, mask);
23451 else /* alternative 2,4 */
23453 gcc_assert (REGNO (mask) == REGNO (scratch));
23454 op1 = lowpart_subreg (vmode, op1, mode);
23455 x = gen_rtx_AND (vmode, scratch, op1);
23457 emit_insn (gen_rtx_SET (scratch, x));
23459 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23461 dest = lowpart_subreg (vmode, op0, mode);
23462 x = gen_rtx_AND (vmode, dest, nmask);
23464 else /* alternative 3,4 */
23466 gcc_assert (REGNO (nmask) == REGNO (dest));
23467 dest = nmask;
23468 op0 = lowpart_subreg (vmode, op0, mode);
23469 x = gen_rtx_AND (vmode, dest, op0);
23471 emit_insn (gen_rtx_SET (dest, x));
23474 x = gen_rtx_IOR (vmode, dest, scratch);
23475 emit_insn (gen_rtx_SET (dest, x));
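/* Illustrative note: whichever register alternative is taken, the net
   effect of the two AND operations and the final IOR is the classic
   bit-level copysign

       result = (op0 & ~signmask) | (op1 & signmask)

   i.e. the magnitude bits of op0 combined with the sign bit of op1.  */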
23478 /* Return TRUE or FALSE depending on whether the first SET in INSN
23479 has source and destination with matching CC modes, and whether the
23480 CC mode is at least as constrained as REQ_MODE. */
23482 bool
23483 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23485 rtx set;
23486 machine_mode set_mode;
23488 set = PATTERN (insn);
23489 if (GET_CODE (set) == PARALLEL)
23490 set = XVECEXP (set, 0, 0);
23491 gcc_assert (GET_CODE (set) == SET);
23492 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23494 set_mode = GET_MODE (SET_DEST (set));
23495 switch (set_mode)
23497 case E_CCNOmode:
23498 if (req_mode != CCNOmode
23499 && (req_mode != CCmode
23500 || XEXP (SET_SRC (set), 1) != const0_rtx))
23501 return false;
23502 break;
23503 case E_CCmode:
23504 if (req_mode == CCGCmode)
23505 return false;
23506 /* FALLTHRU */
23507 case E_CCGCmode:
23508 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23509 return false;
23510 /* FALLTHRU */
23511 case E_CCGOCmode:
23512 if (req_mode == CCZmode)
23513 return false;
23514 /* FALLTHRU */
23515 case E_CCZmode:
23516 break;
23518 case E_CCAmode:
23519 case E_CCCmode:
23520 case E_CCOmode:
23521 case E_CCPmode:
23522 case E_CCSmode:
23523 if (set_mode != req_mode)
23524 return false;
23525 break;
23527 default:
23528 gcc_unreachable ();
23531 return GET_MODE (SET_SRC (set)) == set_mode;
23534 /* Generate insn patterns to do an integer compare of OPERANDS. */
23536 static rtx
23537 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23539 machine_mode cmpmode;
23540 rtx tmp, flags;
23542 cmpmode = SELECT_CC_MODE (code, op0, op1);
23543 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23545 /* This is very simple, but making the interface the same as in the
23546 FP case makes the rest of the code easier. */
23547 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23548 emit_insn (gen_rtx_SET (flags, tmp));
23550 /* Return the test that should be put into the flags user, i.e.
23551 the bcc, scc, or cmov instruction. */
23552 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23555 /* Figure out whether to use ordered or unordered fp comparisons.
23556 Return the appropriate mode to use. */
23558 machine_mode
23559 ix86_fp_compare_mode (enum rtx_code)
23561 /* ??? In order to make all comparisons reversible, we do all comparisons
23562 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23563 all forms of trapping and nontrapping comparisons, we can make inequality
23564 comparisons trapping again, since it results in better code when using
23565 FCOM based compares. */
23566 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23569 machine_mode
23570 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23572 machine_mode mode = GET_MODE (op0);
23574 if (SCALAR_FLOAT_MODE_P (mode))
23576 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23577 return ix86_fp_compare_mode (code);
23580 switch (code)
23582 /* Only zero flag is needed. */
23583 case EQ: /* ZF=0 */
23584 case NE: /* ZF!=0 */
23585 return CCZmode;
23586 /* Codes needing carry flag. */
23587 case GEU: /* CF=0 */
23588 case LTU: /* CF=1 */
23589 /* Detect overflow checks. They need just the carry flag. */
23590 if (GET_CODE (op0) == PLUS
23591 && (rtx_equal_p (op1, XEXP (op0, 0))
23592 || rtx_equal_p (op1, XEXP (op0, 1))))
23593 return CCCmode;
23594 else
23595 return CCmode;
23596 case GTU: /* CF=0 & ZF=0 */
23597 case LEU: /* CF=1 | ZF=1 */
23598 return CCmode;
23599 /* Codes possibly doable only with sign flag when
23600 comparing against zero. */
23601 case GE: /* SF=OF or SF=0 */
23602 case LT: /* SF<>OF or SF=1 */
23603 if (op1 == const0_rtx)
23604 return CCGOCmode;
23605 else
23606 /* For other cases the carry flag is not required. */
23607 return CCGCmode;
23608 /* Codes doable only with the sign flag when comparing
23609 against zero, but for which we lack a jump instruction,
23610 so we need to use relational tests against overflow,
23611 which thus needs to be zero. */
23612 case GT: /* ZF=0 & SF=OF */
23613 case LE: /* ZF=1 | SF<>OF */
23614 if (op1 == const0_rtx)
23615 return CCNOmode;
23616 else
23617 return CCGCmode;
23618 /* The strcmp pattern does (use flags) and combine may ask us for a
23619 proper mode. */
23620 case USE:
23621 return CCmode;
23622 default:
23623 gcc_unreachable ();
23627 /* Return the fixed registers used for condition codes. */
23629 static bool
23630 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23632 *p1 = FLAGS_REG;
23633 *p2 = FPSR_REG;
23634 return true;
23637 /* If two condition code modes are compatible, return a condition code
23638 mode which is compatible with both. Otherwise, return
23639 VOIDmode. */
23641 static machine_mode
23642 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23644 if (m1 == m2)
23645 return m1;
23647 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23648 return VOIDmode;
23650 if ((m1 == CCGCmode && m2 == CCGOCmode)
23651 || (m1 == CCGOCmode && m2 == CCGCmode))
23652 return CCGCmode;
23654 if ((m1 == CCNOmode && m2 == CCGOCmode)
23655 || (m1 == CCGOCmode && m2 == CCNOmode))
23656 return CCNOmode;
23658 if (m1 == CCZmode
23659 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23660 return m2;
23661 else if (m2 == CCZmode
23662 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23663 return m1;
23665 switch (m1)
23667 default:
23668 gcc_unreachable ();
23670 case E_CCmode:
23671 case E_CCGCmode:
23672 case E_CCGOCmode:
23673 case E_CCNOmode:
23674 case E_CCAmode:
23675 case E_CCCmode:
23676 case E_CCOmode:
23677 case E_CCPmode:
23678 case E_CCSmode:
23679 case E_CCZmode:
23680 switch (m2)
23682 default:
23683 return VOIDmode;
23685 case E_CCmode:
23686 case E_CCGCmode:
23687 case E_CCGOCmode:
23688 case E_CCNOmode:
23689 case E_CCAmode:
23690 case E_CCCmode:
23691 case E_CCOmode:
23692 case E_CCPmode:
23693 case E_CCSmode:
23694 case E_CCZmode:
23695 return CCmode;
23698 case E_CCFPmode:
23699 case E_CCFPUmode:
23700 /* These are only compatible with themselves, which we already
23701 checked above. */
23702 return VOIDmode;
23707 /* Return a comparison we can do that is equivalent to
23708 swap_condition (code), apart possibly from orderedness.
23709 But never change orderedness if TARGET_IEEE_FP, returning
23710 UNKNOWN in that case if necessary. */
23712 static enum rtx_code
23713 ix86_fp_swap_condition (enum rtx_code code)
23715 switch (code)
23717 case GT: /* GTU - CF=0 & ZF=0 */
23718 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23719 case GE: /* GEU - CF=0 */
23720 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23721 case UNLT: /* LTU - CF=1 */
23722 return TARGET_IEEE_FP ? UNKNOWN : GT;
23723 case UNLE: /* LEU - CF=1 | ZF=1 */
23724 return TARGET_IEEE_FP ? UNKNOWN : GE;
23725 default:
23726 return swap_condition (code);
23730 /* Return the cost of comparison CODE using the best strategy for performance.
23731 All the following functions use the number of instructions as a cost metric.
23732 In the future this should be tweaked to compute bytes for optimize_size and
23733 take into account the performance of various instructions on various CPUs. */
23735 static int
23736 ix86_fp_comparison_cost (enum rtx_code code)
23738 int arith_cost;
23740 /* The cost of code using bit-twiddling on %ah. */
23741 switch (code)
23743 case UNLE:
23744 case UNLT:
23745 case LTGT:
23746 case GT:
23747 case GE:
23748 case UNORDERED:
23749 case ORDERED:
23750 case UNEQ:
23751 arith_cost = 4;
23752 break;
23753 case LT:
23754 case NE:
23755 case EQ:
23756 case UNGE:
23757 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23758 break;
23759 case LE:
23760 case UNGT:
23761 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23762 break;
23763 default:
23764 gcc_unreachable ();
23767 switch (ix86_fp_comparison_strategy (code))
23769 case IX86_FPCMP_COMI:
23770 return arith_cost > 4 ? 3 : 2;
23771 case IX86_FPCMP_SAHF:
23772 return arith_cost > 4 ? 4 : 3;
23773 default:
23774 return arith_cost;
23778 /* Return the strategy to use for floating-point comparisons. We assume that
23779 fcomi is always preferable where available, since that is also true when
23780 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23782 enum ix86_fpcmp_strategy
23783 ix86_fp_comparison_strategy (enum rtx_code)
23785 /* Do fcomi/sahf based test when profitable. */
23787 if (TARGET_CMOVE)
23788 return IX86_FPCMP_COMI;
23790 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23791 return IX86_FPCMP_SAHF;
23793 return IX86_FPCMP_ARITH;
23796 /* Swap, force into registers, or otherwise massage the two operands
23797 to a fp comparison. The operands are updated in place; the new
23798 comparison code is returned. */
23800 static enum rtx_code
23801 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23803 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23804 rtx op0 = *pop0, op1 = *pop1;
23805 machine_mode op_mode = GET_MODE (op0);
23806 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23808 /* All of the unordered compare instructions only work on registers.
23809 The same is true of the fcomi compare instructions. The XFmode
23810 compare instructions require registers except when comparing
23811 against zero or when converting operand 1 from fixed point to
23812 floating point. */
23814 if (!is_sse
23815 && (fpcmp_mode == CCFPUmode
23816 || (op_mode == XFmode
23817 && ! (standard_80387_constant_p (op0) == 1
23818 || standard_80387_constant_p (op1) == 1)
23819 && GET_CODE (op1) != FLOAT)
23820 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23822 op0 = force_reg (op_mode, op0);
23823 op1 = force_reg (op_mode, op1);
23825 else
23827 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23828 things around if they appear profitable, otherwise force op0
23829 into a register. */
23831 if (standard_80387_constant_p (op0) == 0
23832 || (MEM_P (op0)
23833 && ! (standard_80387_constant_p (op1) == 0
23834 || MEM_P (op1))))
23836 enum rtx_code new_code = ix86_fp_swap_condition (code);
23837 if (new_code != UNKNOWN)
23839 std::swap (op0, op1);
23840 code = new_code;
23844 if (!REG_P (op0))
23845 op0 = force_reg (op_mode, op0);
23847 if (CONSTANT_P (op1))
23849 int tmp = standard_80387_constant_p (op1);
23850 if (tmp == 0)
23851 op1 = validize_mem (force_const_mem (op_mode, op1));
23852 else if (tmp == 1)
23854 if (TARGET_CMOVE)
23855 op1 = force_reg (op_mode, op1);
23857 else
23858 op1 = force_reg (op_mode, op1);
23862 /* Try to rearrange the comparison to make it cheaper. */
23863 if (ix86_fp_comparison_cost (code)
23864 > ix86_fp_comparison_cost (swap_condition (code))
23865 && (REG_P (op1) || can_create_pseudo_p ()))
23867 std::swap (op0, op1);
23868 code = swap_condition (code);
23869 if (!REG_P (op0))
23870 op0 = force_reg (op_mode, op0);
23873 *pop0 = op0;
23874 *pop1 = op1;
23875 return code;
23878 /* Convert comparison codes we use to represent FP comparison to integer
23879 code that will result in proper branch. Return UNKNOWN if no such code
23880 is available. */
23882 enum rtx_code
23883 ix86_fp_compare_code_to_integer (enum rtx_code code)
23885 switch (code)
23887 case GT:
23888 return GTU;
23889 case GE:
23890 return GEU;
23891 case ORDERED:
23892 case UNORDERED:
23893 return code;
23894 case UNEQ:
23895 return EQ;
23896 case UNLT:
23897 return LTU;
23898 case UNLE:
23899 return LEU;
23900 case LTGT:
23901 return NE;
23902 default:
23903 return UNKNOWN;
23907 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23909 static rtx
23910 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23912 machine_mode fpcmp_mode, intcmp_mode;
23913 rtx tmp, tmp2;
23915 fpcmp_mode = ix86_fp_compare_mode (code);
23916 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23918 /* Do fcomi/sahf based test when profitable. */
23919 switch (ix86_fp_comparison_strategy (code))
23921 case IX86_FPCMP_COMI:
23922 intcmp_mode = fpcmp_mode;
23923 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23924 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23925 emit_insn (tmp);
23926 break;
23928 case IX86_FPCMP_SAHF:
23929 intcmp_mode = fpcmp_mode;
23930 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23931 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23933 if (!scratch)
23934 scratch = gen_reg_rtx (HImode);
23935 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23936 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23937 break;
23939 case IX86_FPCMP_ARITH:
23940 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23941 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23942 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23943 if (!scratch)
23944 scratch = gen_reg_rtx (HImode);
23945 emit_insn (gen_rtx_SET (scratch, tmp2));
23947 /* In the unordered case, we have to check C2 for NaN's, which
23948 doesn't happen to work out to anything nice combination-wise.
23949 So do some bit twiddling on the value we've got in AH to come
23950 up with an appropriate set of condition codes. */
23952 intcmp_mode = CCNOmode;
23953 switch (code)
23955 case GT:
23956 case UNGT:
23957 if (code == GT || !TARGET_IEEE_FP)
23959 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23960 code = EQ;
23962 else
23964 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23965 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23966 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23967 intcmp_mode = CCmode;
23968 code = GEU;
23970 break;
23971 case LT:
23972 case UNLT:
23973 if (code == LT && TARGET_IEEE_FP)
23975 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23976 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23977 intcmp_mode = CCmode;
23978 code = EQ;
23980 else
23982 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23983 code = NE;
23985 break;
23986 case GE:
23987 case UNGE:
23988 if (code == GE || !TARGET_IEEE_FP)
23990 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23991 code = EQ;
23993 else
23995 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23996 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23997 code = NE;
23999 break;
24000 case LE:
24001 case UNLE:
24002 if (code == LE && TARGET_IEEE_FP)
24004 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
24005 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
24006 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
24007 intcmp_mode = CCmode;
24008 code = LTU;
24010 else
24012 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
24013 code = NE;
24015 break;
24016 case EQ:
24017 case UNEQ:
24018 if (code == EQ && TARGET_IEEE_FP)
24020 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
24021 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
24022 intcmp_mode = CCmode;
24023 code = EQ;
24025 else
24027 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
24028 code = NE;
24030 break;
24031 case NE:
24032 case LTGT:
24033 if (code == NE && TARGET_IEEE_FP)
24035 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
24036 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
24037 GEN_INT (0x40)));
24038 code = NE;
24040 else
24042 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
24043 code = EQ;
24045 break;
24047 case UNORDERED:
24048 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
24049 code = NE;
24050 break;
24051 case ORDERED:
24052 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
24053 code = EQ;
24054 break;
24056 default:
24057 gcc_unreachable ();
24059 break;
24061 default:
24062 gcc_unreachable ();
24065 /* Return the test that should be put into the flags user, i.e.
24066 the bcc, scc, or cmov instruction. */
24067 return gen_rtx_fmt_ee (code, VOIDmode,
24068 gen_rtx_REG (intcmp_mode, FLAGS_REG),
24069 const0_rtx);
24072 static rtx
24073 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
24075 rtx ret;
24077 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
24078 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
24080 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
24082 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
24083 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
24085 else
24086 ret = ix86_expand_int_compare (code, op0, op1);
24088 return ret;
24091 void
24092 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
24094 machine_mode mode = GET_MODE (op0);
24095 rtx tmp;
24097 /* Handle the special case of a vector comparison with a boolean result;
24098 transform it using the ptest instruction. */
24099 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
24101 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
24102 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
24104 gcc_assert (code == EQ || code == NE);
24105 /* Generate an XOR since we can't check that one operand is a zero vector. */
24106 tmp = gen_reg_rtx (mode);
24107 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
24108 tmp = gen_lowpart (p_mode, tmp);
24109 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
24110 gen_rtx_UNSPEC (CCmode,
24111 gen_rtvec (2, tmp, tmp),
24112 UNSPEC_PTEST)));
24113 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
24114 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24115 gen_rtx_LABEL_REF (VOIDmode, label),
24116 pc_rtx);
24117 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
24118 return;
24121 switch (mode)
24123 case E_SFmode:
24124 case E_DFmode:
24125 case E_XFmode:
24126 case E_QImode:
24127 case E_HImode:
24128 case E_SImode:
24129 simple:
24130 tmp = ix86_expand_compare (code, op0, op1);
24131 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24132 gen_rtx_LABEL_REF (VOIDmode, label),
24133 pc_rtx);
24134 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
24135 return;
24137 case E_DImode:
24138 if (TARGET_64BIT)
24139 goto simple;
24140 /* For a 32-bit target, a DI comparison may be performed in
24141 SSE registers. To allow this we should avoid splitting
24142 to SI mode, which is achieved by doing the xor in DI mode
24143 and then comparing against zero (which is recognized by
24144 the STV pass). We don't compare using xor when optimizing
24145 for size. */
24146 if (!optimize_insn_for_size_p ()
24147 && TARGET_STV
24148 && (code == EQ || code == NE))
24150 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
24151 op1 = const0_rtx;
24153 /* FALLTHRU */
24154 case E_TImode:
24155 /* Expand DImode branch into multiple compare+branch. */
24157 rtx lo[2], hi[2];
24158 rtx_code_label *label2;
24159 enum rtx_code code1, code2, code3;
24160 machine_mode submode;
24162 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
24164 std::swap (op0, op1);
24165 code = swap_condition (code);
24168 split_double_mode (mode, &op0, 1, lo+0, hi+0);
24169 split_double_mode (mode, &op1, 1, lo+1, hi+1);
24171 submode = mode == DImode ? SImode : DImode;
24173 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
24174 avoid two branches. This costs one extra insn, so disable when
24175 optimizing for size. */
24177 if ((code == EQ || code == NE)
24178 && (!optimize_insn_for_size_p ()
24179 || hi[1] == const0_rtx || lo[1] == const0_rtx))
24181 rtx xor0, xor1;
24183 xor1 = hi[0];
24184 if (hi[1] != const0_rtx)
24185 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
24186 NULL_RTX, 0, OPTAB_WIDEN);
24188 xor0 = lo[0];
24189 if (lo[1] != const0_rtx)
24190 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
24191 NULL_RTX, 0, OPTAB_WIDEN);
24193 tmp = expand_binop (submode, ior_optab, xor1, xor0,
24194 NULL_RTX, 0, OPTAB_WIDEN);
24196 ix86_expand_branch (code, tmp, const0_rtx, label);
24197 return;
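/* Illustrative sketch (hypothetical registers): for an EQ branch on two
   DImode values in 32-bit mode this emits, in effect,

       xor  lo(b), lo(a)
       xor  hi(b), hi(a)
       or   hi(a), lo(a)
       je   label

   so one conditional branch replaces the two or three branches the
   generic double-word expansion below would need.  */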
24200 /* Otherwise, if we are doing less-than or greater-or-equal-than,
24201 op1 is a constant and the low word is zero, then we can just
24202 examine the high word. Similarly for low word -1 and
24203 less-or-equal-than or greater-than. */
24205 if (CONST_INT_P (hi[1]))
24206 switch (code)
24208 case LT: case LTU: case GE: case GEU:
24209 if (lo[1] == const0_rtx)
24211 ix86_expand_branch (code, hi[0], hi[1], label);
24212 return;
24214 break;
24215 case LE: case LEU: case GT: case GTU:
24216 if (lo[1] == constm1_rtx)
24218 ix86_expand_branch (code, hi[0], hi[1], label);
24219 return;
24221 break;
24222 default:
24223 break;
24226 /* Otherwise, we need two or three jumps. */
24228 label2 = gen_label_rtx ();
24230 code1 = code;
24231 code2 = swap_condition (code);
24232 code3 = unsigned_condition (code);
24234 switch (code)
24236 case LT: case GT: case LTU: case GTU:
24237 break;
24239 case LE: code1 = LT; code2 = GT; break;
24240 case GE: code1 = GT; code2 = LT; break;
24241 case LEU: code1 = LTU; code2 = GTU; break;
24242 case GEU: code1 = GTU; code2 = LTU; break;
24244 case EQ: code1 = UNKNOWN; code2 = NE; break;
24245 case NE: code2 = UNKNOWN; break;
24247 default:
24248 gcc_unreachable ();
24252 * a < b =>
24253 * if (hi(a) < hi(b)) goto true;
24254 * if (hi(a) > hi(b)) goto false;
24255 * if (lo(a) < lo(b)) goto true;
24256 * false:
24259 if (code1 != UNKNOWN)
24260 ix86_expand_branch (code1, hi[0], hi[1], label);
24261 if (code2 != UNKNOWN)
24262 ix86_expand_branch (code2, hi[0], hi[1], label2);
24264 ix86_expand_branch (code3, lo[0], lo[1], label);
24266 if (code2 != UNKNOWN)
24267 emit_label (label2);
24268 return;
24271 default:
24272 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
24273 goto simple;
24277 /* Split branch based on floating point condition. */
24278 void
24279 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
24280 rtx target1, rtx target2, rtx tmp)
24282 rtx condition;
24283 rtx_insn *i;
24285 if (target2 != pc_rtx)
24287 std::swap (target1, target2);
24288 code = reverse_condition_maybe_unordered (code);
24291 condition = ix86_expand_fp_compare (code, op1, op2,
24292 tmp);
24294 i = emit_jump_insn (gen_rtx_SET
24295 (pc_rtx,
24296 gen_rtx_IF_THEN_ELSE (VOIDmode,
24297 condition, target1, target2)));
24298 if (split_branch_probability.initialized_p ())
24299 add_reg_br_prob_note (i, split_branch_probability);
24302 void
24303 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
24305 rtx ret;
24307 gcc_assert (GET_MODE (dest) == QImode);
24309 ret = ix86_expand_compare (code, op0, op1);
24310 PUT_MODE (ret, QImode);
24311 emit_insn (gen_rtx_SET (dest, ret));
24314 /* Expand a comparison setting or clearing the carry flag. Return true when
24315 successful and set *POP for the operation. */
24316 static bool
24317 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
24319 machine_mode mode =
24320 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
24322 /* Do not handle double-mode compares that go through special path. */
24323 if (mode == (TARGET_64BIT ? TImode : DImode))
24324 return false;
24326 if (SCALAR_FLOAT_MODE_P (mode))
24328 rtx compare_op;
24329 rtx_insn *compare_seq;
24331 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
24333 /* Shortcut: the following common codes never translate
24334 into carry-flag compares. */
24335 if (code == EQ || code == NE || code == UNEQ || code == LTGT
24336 || code == ORDERED || code == UNORDERED)
24337 return false;
24339 /* These comparisons require the zero flag; swap operands so they won't. */
24340 if ((code == GT || code == UNLE || code == LE || code == UNGT)
24341 && !TARGET_IEEE_FP)
24343 std::swap (op0, op1);
24344 code = swap_condition (code);
24347 /* Try to expand the comparison and verify that we end up with
24348 a carry-flag-based comparison. This fails to be true only when
24349 we decide to expand the comparison using arithmetic, which is not
24350 a common scenario. */
24351 start_sequence ();
24352 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
24353 compare_seq = get_insns ();
24354 end_sequence ();
24356 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
24357 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
24358 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
24359 else
24360 code = GET_CODE (compare_op);
24362 if (code != LTU && code != GEU)
24363 return false;
24365 emit_insn (compare_seq);
24366 *pop = compare_op;
24367 return true;
24370 if (!INTEGRAL_MODE_P (mode))
24371 return false;
24373 switch (code)
24375 case LTU:
24376 case GEU:
24377 break;
24379 /* Convert a==0 into (unsigned)a<1. */
24380 case EQ:
24381 case NE:
24382 if (op1 != const0_rtx)
24383 return false;
24384 op1 = const1_rtx;
24385 code = (code == EQ ? LTU : GEU);
24386 break;
24388 /* Convert a>b into b<a or a>=b+1. */
24389 case GTU:
24390 case LEU:
24391 if (CONST_INT_P (op1))
24393 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
24394 /* Bail out on overflow. We can still swap the operands but that
24395 would force loading the constant into a register. */
24396 if (op1 == const0_rtx
24397 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
24398 return false;
24399 code = (code == GTU ? GEU : LTU);
24401 else
24403 std::swap (op0, op1);
24404 code = (code == GTU ? LTU : GEU);
24406 break;
24408 /* Convert a>=0 into (unsigned)a<0x80000000. */
24409 case LT:
24410 case GE:
24411 if (mode == DImode || op1 != const0_rtx)
24412 return false;
24413 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24414 code = (code == LT ? GEU : LTU);
24415 break;
24416 case LE:
24417 case GT:
24418 if (mode == DImode || op1 != constm1_rtx)
24419 return false;
24420 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24421 code = (code == LE ? GEU : LTU);
24422 break;
24424 default:
24425 return false;
24427 /* Swapping operands may cause a constant to appear as the first operand. */
24428 if (!nonimmediate_operand (op0, VOIDmode))
24430 if (!can_create_pseudo_p ())
24431 return false;
24432 op0 = force_reg (mode, op0);
24434 *pop = ix86_expand_compare (code, op0, op1);
24435 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
24436 return true;
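/* As a rough scalar illustration of the rewrites above (a sketch assuming
   32-bit int, with made-up helper names; not part of the compiler itself):

     int carry_eq  (unsigned a)             { return a < 1u; }
     int carry_ne  (unsigned a)             { return a >= 1u; }
     int carry_ge0 (int a)                  { return (unsigned) a < 0x80000000u; }
     int carry_gtu (unsigned a, unsigned b) { return b < a; }

   carry_eq matches a == 0, carry_ne matches a != 0, carry_ge0 matches
   a >= 0 and carry_gtu matches a > b; each body is a single unsigned
   compare whose result can live in the carry flag.  */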
24439 bool
24440 ix86_expand_int_movcc (rtx operands[])
24442 enum rtx_code code = GET_CODE (operands[1]), compare_code;
24443 rtx_insn *compare_seq;
24444 rtx compare_op;
24445 machine_mode mode = GET_MODE (operands[0]);
24446 bool sign_bit_compare_p = false;
24447 rtx op0 = XEXP (operands[1], 0);
24448 rtx op1 = XEXP (operands[1], 1);
24450 if (GET_MODE (op0) == TImode
24451 || (GET_MODE (op0) == DImode
24452 && !TARGET_64BIT))
24453 return false;
24455 start_sequence ();
24456 compare_op = ix86_expand_compare (code, op0, op1);
24457 compare_seq = get_insns ();
24458 end_sequence ();
24460 compare_code = GET_CODE (compare_op);
24462 if ((op1 == const0_rtx && (code == GE || code == LT))
24463 || (op1 == constm1_rtx && (code == GT || code == LE)))
24464 sign_bit_compare_p = true;
24466 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24467 HImode insns, we'd be swallowed in word prefix ops. */
24469 if ((mode != HImode || TARGET_FAST_PREFIX)
24470 && (mode != (TARGET_64BIT ? TImode : DImode))
24471 && CONST_INT_P (operands[2])
24472 && CONST_INT_P (operands[3]))
24474 rtx out = operands[0];
24475 HOST_WIDE_INT ct = INTVAL (operands[2]);
24476 HOST_WIDE_INT cf = INTVAL (operands[3]);
24477 HOST_WIDE_INT diff;
24479 diff = ct - cf;
24480 /* Sign-bit compares are better done using shifts than by using
24481 sbb. */
24482 if (sign_bit_compare_p
24483 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24485 /* Detect overlap between destination and compare sources. */
24486 rtx tmp = out;
24488 if (!sign_bit_compare_p)
24490 rtx flags;
24491 bool fpcmp = false;
24493 compare_code = GET_CODE (compare_op);
24495 flags = XEXP (compare_op, 0);
24497 if (GET_MODE (flags) == CCFPmode
24498 || GET_MODE (flags) == CCFPUmode)
24500 fpcmp = true;
24501 compare_code
24502 = ix86_fp_compare_code_to_integer (compare_code);
24505 /* To simplify the rest of the code, restrict to the GEU case. */
24506 if (compare_code == LTU)
24508 std::swap (ct, cf);
24509 compare_code = reverse_condition (compare_code);
24510 code = reverse_condition (code);
24512 else
24514 if (fpcmp)
24515 PUT_CODE (compare_op,
24516 reverse_condition_maybe_unordered
24517 (GET_CODE (compare_op)));
24518 else
24519 PUT_CODE (compare_op,
24520 reverse_condition (GET_CODE (compare_op)));
24522 diff = ct - cf;
24524 if (reg_overlap_mentioned_p (out, op0)
24525 || reg_overlap_mentioned_p (out, op1))
24526 tmp = gen_reg_rtx (mode);
24528 if (mode == DImode)
24529 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24530 else
24531 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24532 flags, compare_op));
24534 else
24536 if (code == GT || code == GE)
24537 code = reverse_condition (code);
24538 else
24540 std::swap (ct, cf);
24541 diff = ct - cf;
24543 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
24546 if (diff == 1)
24549 * cmpl op0,op1
24550 * sbbl dest,dest
24551 * [addl dest, ct]
24553 * Size 5 - 8.
24555 if (ct)
24556 tmp = expand_simple_binop (mode, PLUS,
24557 tmp, GEN_INT (ct),
24558 copy_rtx (tmp), 1, OPTAB_DIRECT);
24560 else if (cf == -1)
24563 * cmpl op0,op1
24564 * sbbl dest,dest
24565 * orl $ct, dest
24567 * Size 8.
24569 tmp = expand_simple_binop (mode, IOR,
24570 tmp, GEN_INT (ct),
24571 copy_rtx (tmp), 1, OPTAB_DIRECT);
24573 else if (diff == -1 && ct)
24576 * cmpl op0,op1
24577 * sbbl dest,dest
24578 * notl dest
24579 * [addl dest, cf]
24581 * Size 8 - 11.
24583 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24584 if (cf)
24585 tmp = expand_simple_binop (mode, PLUS,
24586 copy_rtx (tmp), GEN_INT (cf),
24587 copy_rtx (tmp), 1, OPTAB_DIRECT);
24589 else
24592 * cmpl op0,op1
24593 * sbbl dest,dest
24594 * [notl dest]
24595 * andl cf - ct, dest
24596 * [addl dest, ct]
24598 * Size 8 - 11.
24601 if (cf == 0)
24603 cf = ct;
24604 ct = 0;
24605 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24608 tmp = expand_simple_binop (mode, AND,
24609 copy_rtx (tmp),
24610 gen_int_mode (cf - ct, mode),
24611 copy_rtx (tmp), 1, OPTAB_DIRECT);
24612 if (ct)
24613 tmp = expand_simple_binop (mode, PLUS,
24614 copy_rtx (tmp), GEN_INT (ct),
24615 copy_rtx (tmp), 1, OPTAB_DIRECT);
24618 if (!rtx_equal_p (tmp, out))
24619 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24621 return true;
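/* In C terms the sbb-based selects above compute roughly (a sketch only,
   assuming 32-bit operands and made-up names):

     int sbb_select (unsigned a, unsigned b, int ct, int cf)
     {
       int mask = -(int) (a < b);        /* cmp + sbb: 0 or -1            */
       return (mask & (cf - ct)) + ct;   /* and + add: cf if a<b, else ct */
     }

   The diff == 1, cf == -1 and diff == -1 special cases replace the
   and/add pair with a single add, or, or not as shown in the asm
   comments above.  */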
24624 if (diff < 0)
24626 machine_mode cmp_mode = GET_MODE (op0);
24627 enum rtx_code new_code;
24629 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24631 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24633 /* We may be reversing an unordered compare to a normal compare, which
24634 is not valid in general (we may convert a non-trapping condition
24635 to a trapping one); however, on i386 we currently emit all
24636 comparisons unordered. */
24637 new_code = reverse_condition_maybe_unordered (code);
24639 else
24640 new_code = ix86_reverse_condition (code, cmp_mode);
24641 if (new_code != UNKNOWN)
24643 std::swap (ct, cf);
24644 diff = -diff;
24645 code = new_code;
24649 compare_code = UNKNOWN;
24650 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24651 && CONST_INT_P (op1))
24653 if (op1 == const0_rtx
24654 && (code == LT || code == GE))
24655 compare_code = code;
24656 else if (op1 == constm1_rtx)
24658 if (code == LE)
24659 compare_code = LT;
24660 else if (code == GT)
24661 compare_code = GE;
24665 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24666 if (compare_code != UNKNOWN
24667 && GET_MODE (op0) == GET_MODE (out)
24668 && (cf == -1 || ct == -1))
24670 /* If the lea code below could be used, only optimize
24671 if it results in a two-insn sequence. */
24673 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24674 || diff == 3 || diff == 5 || diff == 9)
24675 || (compare_code == LT && ct == -1)
24676 || (compare_code == GE && cf == -1))
24679 * notl op1 (if necessary)
24680 * sarl $31, op1
24681 * orl cf, op1
24683 if (ct != -1)
24685 cf = ct;
24686 ct = -1;
24687 code = reverse_condition (code);
24690 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24692 out = expand_simple_binop (mode, IOR,
24693 out, GEN_INT (cf),
24694 out, 1, OPTAB_DIRECT);
24695 if (out != operands[0])
24696 emit_move_insn (operands[0], out);
24698 return true;
24703 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24704 || diff == 3 || diff == 5 || diff == 9)
24705 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24706 && (mode != DImode
24707 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24710 * xorl dest,dest
24711 * cmpl op1,op2
24712 * setcc dest
24713 * lea cf(dest*(ct-cf)),dest
24715 * Size 14.
24717 * This also catches the degenerate setcc-only case.
24720 rtx tmp;
24721 int nops;
24723 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24725 nops = 0;
24726 /* On x86_64 the lea instruction operates on Pmode, so we need
24727 to do the arithmetic in the proper mode to match. */
24728 if (diff == 1)
24729 tmp = copy_rtx (out);
24730 else
24732 rtx out1;
24733 out1 = copy_rtx (out);
24734 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24735 nops++;
24736 if (diff & 1)
24738 tmp = gen_rtx_PLUS (mode, tmp, out1);
24739 nops++;
24742 if (cf != 0)
24744 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24745 nops++;
24747 if (!rtx_equal_p (tmp, out))
24749 if (nops == 1)
24750 out = force_operand (tmp, copy_rtx (out));
24751 else
24752 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24754 if (!rtx_equal_p (out, operands[0]))
24755 emit_move_insn (operands[0], copy_rtx (out));
24757 return true;
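/* The lea-based form above is, in scalar terms (a sketch with a made-up
   name):

     int lea_select (int flag /* setcc result: 0 or 1 */, int ct, int cf)
     {
       return flag * (ct - cf) + cf;   /* one lea when ct-cf is 1, 2, 3,
                                          4, 5, 8 or 9 */
     }

   flag == 1 yields ct and flag == 0 yields cf, so the whole conditional
   move collapses to setcc plus an address calculation.  */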
24761 * General case: Jumpful:
24762 * xorl dest,dest cmpl op1, op2
24763 * cmpl op1, op2 movl ct, dest
24764 * setcc dest jcc 1f
24765 * decl dest movl cf, dest
24766 * andl (cf-ct),dest 1:
24767 * addl ct,dest
24769 * Size 20. Size 14.
24771 * This is reasonably steep, but branch mispredict costs are
24772 * high on modern CPUs, so consider failing only when optimizing
24773 * for space.
24776 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24777 && BRANCH_COST (optimize_insn_for_speed_p (),
24778 false) >= 2)
24780 if (cf == 0)
24782 machine_mode cmp_mode = GET_MODE (op0);
24783 enum rtx_code new_code;
24785 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24787 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24789 /* We may be reversing an unordered compare to a normal compare,
24790 which is not valid in general (we may convert a non-trapping
24791 condition to a trapping one); however, on i386 we currently
24792 emit all comparisons unordered. */
24793 new_code = reverse_condition_maybe_unordered (code);
24795 else
24797 new_code = ix86_reverse_condition (code, cmp_mode);
24798 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24799 compare_code = reverse_condition (compare_code);
24802 if (new_code != UNKNOWN)
24804 cf = ct;
24805 ct = 0;
24806 code = new_code;
24810 if (compare_code != UNKNOWN)
24812 /* notl op1 (if needed)
24813 sarl $31, op1
24814 andl (cf-ct), op1
24815 addl ct, op1
24817 For x < 0 (resp. x <= -1) there will be no notl,
24818 so if possible swap the constants to get rid of the
24819 complement.
24820 True/false will be -1/0 while code below (store flag
24821 followed by decrement) is 0/-1, so the constants need
24822 to be exchanged once more. */
24824 if (compare_code == GE || !cf)
24826 code = reverse_condition (code);
24827 compare_code = LT;
24829 else
24830 std::swap (ct, cf);
24832 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24834 else
24836 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24838 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24839 constm1_rtx,
24840 copy_rtx (out), 1, OPTAB_DIRECT);
24843 out = expand_simple_binop (mode, AND, copy_rtx (out),
24844 gen_int_mode (cf - ct, mode),
24845 copy_rtx (out), 1, OPTAB_DIRECT);
24846 if (ct)
24847 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24848 copy_rtx (out), 1, OPTAB_DIRECT);
24849 if (!rtx_equal_p (out, operands[0]))
24850 emit_move_insn (operands[0], copy_rtx (out));
24852 return true;
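/* The branchless general case above corresponds to (a sketch, 32-bit,
   made-up name):

     int mask_select (int flag /* setcc: 1 if condition true, else 0 */,
                      int ct, int cf)
     {
       return ((flag - 1) & (cf - ct)) + ct;   /* dec, and, add */
     }

   A true condition gives ct, a false one gives cf, with no branch; the
   "Jumpful" column in the comment above is the mov/jcc alternative this
   sequence is weighed against.  */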
24856 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24858 /* Try a few more things with specific constants and a variable. */
24860 optab op;
24861 rtx var, orig_out, out, tmp;
24863 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24864 return false;
24866 /* If one of the two operands is an interesting constant, load a
24867 constant with the above and mask it in with a logical operation. */
24869 if (CONST_INT_P (operands[2]))
24871 var = operands[3];
24872 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24873 operands[3] = constm1_rtx, op = and_optab;
24874 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24875 operands[3] = const0_rtx, op = ior_optab;
24876 else
24877 return false;
24879 else if (CONST_INT_P (operands[3]))
24881 var = operands[2];
24882 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24883 operands[2] = constm1_rtx, op = and_optab;
24884 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
24885 operands[2] = const0_rtx, op = ior_optab;
24886 else
24887 return false;
24889 else
24890 return false;
24892 orig_out = operands[0];
24893 tmp = gen_reg_rtx (mode);
24894 operands[0] = tmp;
24896 /* Recurse to get the constant loaded. */
24897 if (!ix86_expand_int_movcc (operands))
24898 return false;
24900 /* Mask in the interesting variable. */
24901 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24902 OPTAB_WIDEN);
24903 if (!rtx_equal_p (out, orig_out))
24904 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24906 return true;
24910 * For comparison with above,
24912 * movl cf,dest
24913 * movl ct,tmp
24914 * cmpl op1,op2
24915 * cmovcc tmp,dest
24917 * Size 15.
24920 if (! nonimmediate_operand (operands[2], mode))
24921 operands[2] = force_reg (mode, operands[2]);
24922 if (! nonimmediate_operand (operands[3], mode))
24923 operands[3] = force_reg (mode, operands[3]);
24925 if (! register_operand (operands[2], VOIDmode)
24926 && (mode == QImode
24927 || ! register_operand (operands[3], VOIDmode)))
24928 operands[2] = force_reg (mode, operands[2]);
24930 if (mode == QImode
24931 && ! register_operand (operands[3], VOIDmode))
24932 operands[3] = force_reg (mode, operands[3]);
24934 emit_insn (compare_seq);
24935 emit_insn (gen_rtx_SET (operands[0],
24936 gen_rtx_IF_THEN_ELSE (mode,
24937 compare_op, operands[2],
24938 operands[3])));
24939 return true;
24942 /* Swap, force into registers, or otherwise massage the two operands
24943 to an sse comparison with a mask result. Thus we differ a bit from
24944 ix86_prepare_fp_compare_args which expects to produce a flags result.
24946 The DEST operand exists to help determine whether to commute commutative
24947 operators. The POP0/POP1 operands are updated in place. The new
24948 comparison code is returned, or UNKNOWN if not implementable. */
24950 static enum rtx_code
24951 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24952 rtx *pop0, rtx *pop1)
24954 switch (code)
24956 case LTGT:
24957 case UNEQ:
24958 /* AVX supports all the needed comparisons. */
24959 if (TARGET_AVX)
24960 break;
24961 /* We have no LTGT as an operator. We could implement it with
24962 NE & ORDERED, but this requires an extra temporary. It's
24963 not clear that it's worth it. */
24964 return UNKNOWN;
24966 case LT:
24967 case LE:
24968 case UNGT:
24969 case UNGE:
24970 /* These are supported directly. */
24971 break;
24973 case EQ:
24974 case NE:
24975 case UNORDERED:
24976 case ORDERED:
24977 /* AVX has 3 operand comparisons, no need to swap anything. */
24978 if (TARGET_AVX)
24979 break;
24980 /* For commutative operators, try to canonicalize the destination
24981 operand to be first in the comparison - this helps reload to
24982 avoid extra moves. */
24983 if (!dest || !rtx_equal_p (dest, *pop1))
24984 break;
24985 /* FALLTHRU */
24987 case GE:
24988 case GT:
24989 case UNLE:
24990 case UNLT:
24991 /* These are not supported directly before AVX, and furthermore
24992 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24993 comparison operands to transform into something that is
24994 supported. */
24995 std::swap (*pop0, *pop1);
24996 code = swap_condition (code);
24997 break;
24999 default:
25000 gcc_unreachable ();
25003 return code;
25006 /* Detect conditional moves that exactly match min/max operational
25007 semantics. Note that this is IEEE safe, as long as we don't
25008 interchange the operands.
25010 Returns FALSE if this conditional move doesn't match a MIN/MAX,
25011 and TRUE if the operation is successful and instructions are emitted. */
25013 static bool
25014 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
25015 rtx cmp_op1, rtx if_true, rtx if_false)
25017 machine_mode mode;
25018 bool is_min;
25019 rtx tmp;
25021 if (code == LT)
25023 else if (code == UNGE)
25024 std::swap (if_true, if_false);
25025 else
25026 return false;
25028 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
25029 is_min = true;
25030 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
25031 is_min = false;
25032 else
25033 return false;
25035 mode = GET_MODE (dest);
25037 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
25038 but MODE may be a vector mode and thus not appropriate. */
25039 if (!flag_finite_math_only || flag_signed_zeros)
25041 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
25042 rtvec v;
25044 if_true = force_reg (mode, if_true);
25045 v = gen_rtvec (2, if_true, if_false);
25046 tmp = gen_rtx_UNSPEC (mode, v, u);
25048 else
25050 code = is_min ? SMIN : SMAX;
25051 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
25054 emit_insn (gen_rtx_SET (dest, tmp));
25055 return true;
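/* The two shapes recognized above are, in source form (sketch only):

     r = a < b ? a : b;        /* min, maps to minss/minsd */
     r = a < b ? b : a;        /* max, maps to maxss/maxsd */

   The operand order matters for IEEE semantics: when the comparison is
   unordered (a NaN is involved) or both inputs are zeros, the scalar
   min/max instructions return their second source operand, which is why
   the operands must not be interchanged, as noted above.  */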
25058 /* Expand an sse vector comparison. Return the register with the result. */
25060 static rtx
25061 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
25062 rtx op_true, rtx op_false)
25064 machine_mode mode = GET_MODE (dest);
25065 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
25067 /* In the general case the result of the comparison can differ from the operands' type. */
25068 machine_mode cmp_mode;
25070 /* In AVX512F the result of comparison is an integer mask. */
25071 bool maskcmp = false;
25072 rtx x;
25074 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
25076 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
25077 cmp_mode = int_mode_for_size (nbits, 0).require ();
25078 maskcmp = true;
25080 else
25081 cmp_mode = cmp_ops_mode;
25084 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
25085 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
25086 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
25088 if (optimize
25089 || (maskcmp && cmp_mode != mode)
25090 || (op_true && reg_overlap_mentioned_p (dest, op_true))
25091 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
25092 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
25094 /* Compare patterns for int modes are unspec in AVX512F only. */
25095 if (maskcmp && (code == GT || code == EQ))
25097 rtx (*gen)(rtx, rtx, rtx);
25099 switch (cmp_ops_mode)
25101 case E_V64QImode:
25102 gcc_assert (TARGET_AVX512BW);
25103 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
25104 break;
25105 case E_V32HImode:
25106 gcc_assert (TARGET_AVX512BW);
25107 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
25108 break;
25109 case E_V16SImode:
25110 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
25111 break;
25112 case E_V8DImode:
25113 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
25114 break;
25115 default:
25116 gen = NULL;
25119 if (gen)
25121 emit_insn (gen (dest, cmp_op0, cmp_op1));
25122 return dest;
25125 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
25127 if (cmp_mode != mode && !maskcmp)
25129 x = force_reg (cmp_ops_mode, x);
25130 convert_move (dest, x, false);
25132 else
25133 emit_insn (gen_rtx_SET (dest, x));
25135 return dest;
25138 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
25139 operations. This is used for both scalar and vector conditional moves. */
25141 void
25142 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
25144 machine_mode mode = GET_MODE (dest);
25145 machine_mode cmpmode = GET_MODE (cmp);
25147 /* In AVX512F the result of comparison is an integer mask. */
25148 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
25150 rtx t2, t3, x;
25152 /* If we have an integer mask and an FP value then we need
25153 to cast the mask to FP mode. */
25154 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
25156 cmp = force_reg (cmpmode, cmp);
25157 cmp = gen_rtx_SUBREG (mode, cmp, 0);
25160 if (vector_all_ones_operand (op_true, mode)
25161 && rtx_equal_p (op_false, CONST0_RTX (mode))
25162 && !maskcmp)
25164 emit_insn (gen_rtx_SET (dest, cmp));
25166 else if (op_false == CONST0_RTX (mode)
25167 && !maskcmp)
25169 op_true = force_reg (mode, op_true);
25170 x = gen_rtx_AND (mode, cmp, op_true);
25171 emit_insn (gen_rtx_SET (dest, x));
25173 else if (op_true == CONST0_RTX (mode)
25174 && !maskcmp)
25176 op_false = force_reg (mode, op_false);
25177 x = gen_rtx_NOT (mode, cmp);
25178 x = gen_rtx_AND (mode, x, op_false);
25179 emit_insn (gen_rtx_SET (dest, x));
25181 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
25182 && !maskcmp)
25184 op_false = force_reg (mode, op_false);
25185 x = gen_rtx_IOR (mode, cmp, op_false);
25186 emit_insn (gen_rtx_SET (dest, x));
25188 else if (TARGET_XOP
25189 && !maskcmp)
25191 op_true = force_reg (mode, op_true);
25193 if (!nonimmediate_operand (op_false, mode))
25194 op_false = force_reg (mode, op_false);
25196 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
25197 op_true,
25198 op_false)));
25200 else
25202 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25203 rtx d = dest;
25205 if (!nonimmediate_operand (op_true, mode))
25206 op_true = force_reg (mode, op_true);
25208 op_false = force_reg (mode, op_false);
25210 switch (mode)
25212 case E_V4SFmode:
25213 if (TARGET_SSE4_1)
25214 gen = gen_sse4_1_blendvps;
25215 break;
25216 case E_V2DFmode:
25217 if (TARGET_SSE4_1)
25218 gen = gen_sse4_1_blendvpd;
25219 break;
25220 case E_V16QImode:
25221 case E_V8HImode:
25222 case E_V4SImode:
25223 case E_V2DImode:
25224 if (TARGET_SSE4_1)
25226 gen = gen_sse4_1_pblendvb;
25227 if (mode != V16QImode)
25228 d = gen_reg_rtx (V16QImode);
25229 op_false = gen_lowpart (V16QImode, op_false);
25230 op_true = gen_lowpart (V16QImode, op_true);
25231 cmp = gen_lowpart (V16QImode, cmp);
25233 break;
25234 case E_V8SFmode:
25235 if (TARGET_AVX)
25236 gen = gen_avx_blendvps256;
25237 break;
25238 case E_V4DFmode:
25239 if (TARGET_AVX)
25240 gen = gen_avx_blendvpd256;
25241 break;
25242 case E_V32QImode:
25243 case E_V16HImode:
25244 case E_V8SImode:
25245 case E_V4DImode:
25246 if (TARGET_AVX2)
25248 gen = gen_avx2_pblendvb;
25249 if (mode != V32QImode)
25250 d = gen_reg_rtx (V32QImode);
25251 op_false = gen_lowpart (V32QImode, op_false);
25252 op_true = gen_lowpart (V32QImode, op_true);
25253 cmp = gen_lowpart (V32QImode, cmp);
25255 break;
25257 case E_V64QImode:
25258 gen = gen_avx512bw_blendmv64qi;
25259 break;
25260 case E_V32HImode:
25261 gen = gen_avx512bw_blendmv32hi;
25262 break;
25263 case E_V16SImode:
25264 gen = gen_avx512f_blendmv16si;
25265 break;
25266 case E_V8DImode:
25267 gen = gen_avx512f_blendmv8di;
25268 break;
25269 case E_V8DFmode:
25270 gen = gen_avx512f_blendmv8df;
25271 break;
25272 case E_V16SFmode:
25273 gen = gen_avx512f_blendmv16sf;
25274 break;
25276 default:
25277 break;
25280 if (gen != NULL)
25282 emit_insn (gen (d, op_false, op_true, cmp));
25283 if (d != dest)
25284 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
25286 else
25288 op_true = force_reg (mode, op_true);
25290 t2 = gen_reg_rtx (mode);
25291 if (optimize)
25292 t3 = gen_reg_rtx (mode);
25293 else
25294 t3 = dest;
25296 x = gen_rtx_AND (mode, op_true, cmp);
25297 emit_insn (gen_rtx_SET (t2, x));
25299 x = gen_rtx_NOT (mode, cmp);
25300 x = gen_rtx_AND (mode, x, op_false);
25301 emit_insn (gen_rtx_SET (t3, x));
25303 x = gen_rtx_IOR (mode, t3, t2);
25304 emit_insn (gen_rtx_SET (dest, x));
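/* When no blend instruction applies, the fallback above selects with
   plain bitwise operations; roughly (a sketch):

     t2   = cmp & op_true;
     t3   = ~cmp & op_false;
     dest = t2 | t3;

   This relies on the comparison result CMP being all-ones or all-zeros
   in each element, which the SSE compare instructions guarantee.  */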
25309 /* Expand a floating-point conditional move. Return true if successful. */
25311 bool
25312 ix86_expand_fp_movcc (rtx operands[])
25314 machine_mode mode = GET_MODE (operands[0]);
25315 enum rtx_code code = GET_CODE (operands[1]);
25316 rtx tmp, compare_op;
25317 rtx op0 = XEXP (operands[1], 0);
25318 rtx op1 = XEXP (operands[1], 1);
25320 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
25322 machine_mode cmode;
25324 /* Since we've no cmove for sse registers, don't force bad register
25325 allocation just to gain access to it. Deny movcc when the
25326 comparison mode doesn't match the move mode. */
25327 cmode = GET_MODE (op0);
25328 if (cmode == VOIDmode)
25329 cmode = GET_MODE (op1);
25330 if (cmode != mode)
25331 return false;
25333 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
25334 if (code == UNKNOWN)
25335 return false;
25337 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
25338 operands[2], operands[3]))
25339 return true;
25341 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
25342 operands[2], operands[3]);
25343 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
25344 return true;
25347 if (GET_MODE (op0) == TImode
25348 || (GET_MODE (op0) == DImode
25349 && !TARGET_64BIT))
25350 return false;
25352 /* The floating point conditional move instructions don't directly
25353 support conditions resulting from a signed integer comparison. */
25355 compare_op = ix86_expand_compare (code, op0, op1);
25356 if (!fcmov_comparison_operator (compare_op, VOIDmode))
25358 tmp = gen_reg_rtx (QImode);
25359 ix86_expand_setcc (tmp, code, op0, op1);
25361 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
25364 emit_insn (gen_rtx_SET (operands[0],
25365 gen_rtx_IF_THEN_ELSE (mode, compare_op,
25366 operands[2], operands[3])));
25368 return true;
25371 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
25373 static int
25374 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
25376 switch (code)
25378 case EQ:
25379 return 0;
25380 case LT:
25381 case LTU:
25382 return 1;
25383 case LE:
25384 case LEU:
25385 return 2;
25386 case NE:
25387 return 4;
25388 case GE:
25389 case GEU:
25390 return 5;
25391 case GT:
25392 case GTU:
25393 return 6;
25394 default:
25395 gcc_unreachable ();
25399 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
25401 static int
25402 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
25404 switch (code)
25406 case EQ:
25407 return 0x00;
25408 case NE:
25409 return 0x04;
25410 case GT:
25411 return 0x0e;
25412 case LE:
25413 return 0x02;
25414 case GE:
25415 return 0x0d;
25416 case LT:
25417 return 0x01;
25418 case UNLE:
25419 return 0x0a;
25420 case UNLT:
25421 return 0x09;
25422 case UNGE:
25423 return 0x05;
25424 case UNGT:
25425 return 0x06;
25426 case UNEQ:
25427 return 0x18;
25428 case LTGT:
25429 return 0x0c;
25430 case ORDERED:
25431 return 0x07;
25432 case UNORDERED:
25433 return 0x03;
25434 default:
25435 gcc_unreachable ();
25439 /* Return immediate value to be used in UNSPEC_PCMP
25440 for comparison CODE in MODE. */
25442 static int
25443 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25445 if (FLOAT_MODE_P (mode))
25446 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25447 return ix86_int_cmp_code_to_pcmp_immediate (code);
25450 /* Expand AVX-512 vector comparison. */
25452 bool
25453 ix86_expand_mask_vec_cmp (rtx operands[])
25455 machine_mode mask_mode = GET_MODE (operands[0]);
25456 machine_mode cmp_mode = GET_MODE (operands[2]);
25457 enum rtx_code code = GET_CODE (operands[1]);
25458 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25459 int unspec_code;
25460 rtx unspec;
25462 switch (code)
25464 case LEU:
25465 case GTU:
25466 case GEU:
25467 case LTU:
25468 unspec_code = UNSPEC_UNSIGNED_PCMP;
25469 break;
25471 default:
25472 unspec_code = UNSPEC_PCMP;
25475 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25476 operands[3], imm),
25477 unspec_code);
25478 emit_insn (gen_rtx_SET (operands[0], unspec));
25480 return true;
25483 /* Expand fp vector comparison. */
25485 bool
25486 ix86_expand_fp_vec_cmp (rtx operands[])
25488 enum rtx_code code = GET_CODE (operands[1]);
25489 rtx cmp;
25491 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25492 &operands[2], &operands[3]);
25493 if (code == UNKNOWN)
25495 rtx temp;
25496 switch (GET_CODE (operands[1]))
25498 case LTGT:
25499 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25500 operands[3], NULL, NULL);
25501 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25502 operands[3], NULL, NULL);
25503 code = AND;
25504 break;
25505 case UNEQ:
25506 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25507 operands[3], NULL, NULL);
25508 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25509 operands[3], NULL, NULL);
25510 code = IOR;
25511 break;
25512 default:
25513 gcc_unreachable ();
25515 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25516 OPTAB_DIRECT);
25518 else
25519 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25520 operands[1], operands[2]);
25522 if (operands[0] != cmp)
25523 emit_move_insn (operands[0], cmp);
25525 return true;
25528 static rtx
25529 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25530 rtx op_true, rtx op_false, bool *negate)
25532 machine_mode data_mode = GET_MODE (dest);
25533 machine_mode mode = GET_MODE (cop0);
25534 rtx x;
25536 *negate = false;
25538 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25539 if (TARGET_XOP
25540 && (mode == V16QImode || mode == V8HImode
25541 || mode == V4SImode || mode == V2DImode))
25543 else
25545 /* Canonicalize the comparison to EQ, GT, GTU. */
25546 switch (code)
25548 case EQ:
25549 case GT:
25550 case GTU:
25551 break;
25553 case NE:
25554 case LE:
25555 case LEU:
25556 code = reverse_condition (code);
25557 *negate = true;
25558 break;
25560 case GE:
25561 case GEU:
25562 code = reverse_condition (code);
25563 *negate = true;
25564 /* FALLTHRU */
25566 case LT:
25567 case LTU:
25568 std::swap (cop0, cop1);
25569 code = swap_condition (code);
25570 break;
25572 default:
25573 gcc_unreachable ();
25576 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25577 if (mode == V2DImode)
25579 switch (code)
25581 case EQ:
25582 /* SSE4.1 supports EQ. */
25583 if (!TARGET_SSE4_1)
25584 return NULL;
25585 break;
25587 case GT:
25588 case GTU:
25589 /* SSE4.2 supports GT/GTU. */
25590 if (!TARGET_SSE4_2)
25591 return NULL;
25592 break;
25594 default:
25595 gcc_unreachable ();
25599 /* Unsigned parallel compare is not supported by the hardware.
25600 Play some tricks to turn this into a signed comparison
25601 against 0. */
25602 if (code == GTU)
25604 cop0 = force_reg (mode, cop0);
25606 switch (mode)
25608 case E_V16SImode:
25609 case E_V8DImode:
25610 case E_V8SImode:
25611 case E_V4DImode:
25612 case E_V4SImode:
25613 case E_V2DImode:
25615 rtx t1, t2, mask;
25616 rtx (*gen_sub3) (rtx, rtx, rtx);
25618 switch (mode)
25620 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
25621 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
25622 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
25623 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
25624 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
25625 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
25626 default:
25627 gcc_unreachable ();
25629 /* Subtract (-(INT MAX) - 1) from both operands to make
25630 them signed. */
25631 mask = ix86_build_signbit_mask (mode, true, false);
25632 t1 = gen_reg_rtx (mode);
25633 emit_insn (gen_sub3 (t1, cop0, mask));
25635 t2 = gen_reg_rtx (mode);
25636 emit_insn (gen_sub3 (t2, cop1, mask));
25638 cop0 = t1;
25639 cop1 = t2;
25640 code = GT;
25642 break;
25644 case E_V64QImode:
25645 case E_V32HImode:
25646 case E_V32QImode:
25647 case E_V16HImode:
25648 case E_V16QImode:
25649 case E_V8HImode:
25650 /* Perform a parallel unsigned saturating subtraction. */
25651 x = gen_reg_rtx (mode);
25652 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25653 cop1)));
25655 cop0 = x;
25656 cop1 = CONST0_RTX (mode);
25657 code = EQ;
25658 *negate = !*negate;
25659 break;
25661 default:
25662 gcc_unreachable ();
25667 if (*negate)
25668 std::swap (op_true, op_false);
25670 /* Allow the comparison to be done in one mode, but the movcc to
25671 happen in another mode. */
25672 if (data_mode == mode)
25674 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25675 op_true, op_false);
25677 else
25679 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25680 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25681 op_true, op_false);
25682 if (GET_MODE (x) == mode)
25683 x = gen_lowpart (data_mode, x);
25686 return x;
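/* Two of the rewrites above in scalar form (a sketch, 32-bit unsigned
   elements):

     a > b  (unsigned)  <==>  (int) (a ^ 0x80000000u) > (int) (b ^ 0x80000000u)

   i.e. flipping the sign bit of both operands turns the unsigned compare
   into the signed pcmpgt the hardware provides (the code uses a
   subtraction of the sign-bit mask, which is the same thing mod 2^32).
   For the narrow element modes the saturating form is used instead:

     a > b  (unsigned)  <==>  sat_sub (a, b) != 0

   where sat_sub is unsigned saturating subtraction (psubus*), followed by
   an equality test against zero and a negation of the resulting mask.  */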
25689 /* Expand integer vector comparison. */
25691 bool
25692 ix86_expand_int_vec_cmp (rtx operands[])
25694 rtx_code code = GET_CODE (operands[1]);
25695 bool negate = false;
25696 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25697 operands[3], NULL, NULL, &negate);
25699 if (!cmp)
25700 return false;
25702 if (negate)
25703 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25704 CONST0_RTX (GET_MODE (cmp)),
25705 NULL, NULL, &negate);
25707 gcc_assert (!negate);
25709 if (operands[0] != cmp)
25710 emit_move_insn (operands[0], cmp);
25712 return true;
25715 /* Expand a floating-point vector conditional move; a vcond operation
25716 rather than a movcc operation. */
25718 bool
25719 ix86_expand_fp_vcond (rtx operands[])
25721 enum rtx_code code = GET_CODE (operands[3]);
25722 rtx cmp;
25724 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25725 &operands[4], &operands[5]);
25726 if (code == UNKNOWN)
25728 rtx temp;
25729 switch (GET_CODE (operands[3]))
25731 case LTGT:
25732 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25733 operands[5], operands[0], operands[0]);
25734 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25735 operands[5], operands[1], operands[2]);
25736 code = AND;
25737 break;
25738 case UNEQ:
25739 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25740 operands[5], operands[0], operands[0]);
25741 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25742 operands[5], operands[1], operands[2]);
25743 code = IOR;
25744 break;
25745 default:
25746 gcc_unreachable ();
25748 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25749 OPTAB_DIRECT);
25750 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25751 return true;
25754 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25755 operands[5], operands[1], operands[2]))
25756 return true;
25758 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25759 operands[1], operands[2]);
25760 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25761 return true;
25764 /* Expand a signed/unsigned integral vector conditional move. */
25766 bool
25767 ix86_expand_int_vcond (rtx operands[])
25769 machine_mode data_mode = GET_MODE (operands[0]);
25770 machine_mode mode = GET_MODE (operands[4]);
25771 enum rtx_code code = GET_CODE (operands[3]);
25772 bool negate = false;
25773 rtx x, cop0, cop1;
25775 cop0 = operands[4];
25776 cop1 = operands[5];
25778 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25779 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
25780 if ((code == LT || code == GE)
25781 && data_mode == mode
25782 && cop1 == CONST0_RTX (mode)
25783 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25784 && GET_MODE_UNIT_SIZE (data_mode) > 1
25785 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25786 && (GET_MODE_SIZE (data_mode) == 16
25787 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25789 rtx negop = operands[2 - (code == LT)];
25790 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25791 if (negop == CONST1_RTX (data_mode))
25793 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25794 operands[0], 1, OPTAB_DIRECT);
25795 if (res != operands[0])
25796 emit_move_insn (operands[0], res);
25797 return true;
25799 else if (GET_MODE_INNER (data_mode) != DImode
25800 && vector_all_ones_operand (negop, data_mode))
25802 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25803 operands[0], 0, OPTAB_DIRECT);
25804 if (res != operands[0])
25805 emit_move_insn (operands[0], res);
25806 return true;
25810 if (!nonimmediate_operand (cop1, mode))
25811 cop1 = force_reg (mode, cop1);
25812 if (!general_operand (operands[1], data_mode))
25813 operands[1] = force_reg (data_mode, operands[1]);
25814 if (!general_operand (operands[2], data_mode))
25815 operands[2] = force_reg (data_mode, operands[2]);
25817 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25818 operands[1], operands[2], &negate);
25820 if (!x)
25821 return false;
25823 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25824 operands[2-negate]);
25825 return true;
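/* The early-out above handles the common vectorizer idioms directly,
   per 32-bit element (a sketch):

     x < 0 ? -1 : 0    ==>   x >> 31              (arithmetic shift)
     x < 0 ?  1 : 0    ==>   (unsigned) x >> 31   (logical shift)

   so these forms need neither a compare nor a blend.  */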
25828 /* AVX512F does support 64-byte integer vector operations,
25829 thus the longest vector we are faced with is V64QImode. */
25830 #define MAX_VECT_LEN 64
25832 struct expand_vec_perm_d
25834 rtx target, op0, op1;
25835 unsigned char perm[MAX_VECT_LEN];
25836 machine_mode vmode;
25837 unsigned char nelt;
25838 bool one_operand_p;
25839 bool testing_p;
25842 static bool
25843 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25844 struct expand_vec_perm_d *d)
25846 /* ix86_expand_vec_perm_vpermi2 is called from both the const and the
25847 non-const expander, so the args are either in d, or in op0, op1 etc. */
25848 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25849 machine_mode maskmode = mode;
25850 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25852 switch (mode)
25854 case E_V8HImode:
25855 if (TARGET_AVX512VL && TARGET_AVX512BW)
25856 gen = gen_avx512vl_vpermi2varv8hi3;
25857 break;
25858 case E_V16HImode:
25859 if (TARGET_AVX512VL && TARGET_AVX512BW)
25860 gen = gen_avx512vl_vpermi2varv16hi3;
25861 break;
25862 case E_V64QImode:
25863 if (TARGET_AVX512VBMI)
25864 gen = gen_avx512bw_vpermi2varv64qi3;
25865 break;
25866 case E_V32HImode:
25867 if (TARGET_AVX512BW)
25868 gen = gen_avx512bw_vpermi2varv32hi3;
25869 break;
25870 case E_V4SImode:
25871 if (TARGET_AVX512VL)
25872 gen = gen_avx512vl_vpermi2varv4si3;
25873 break;
25874 case E_V8SImode:
25875 if (TARGET_AVX512VL)
25876 gen = gen_avx512vl_vpermi2varv8si3;
25877 break;
25878 case E_V16SImode:
25879 if (TARGET_AVX512F)
25880 gen = gen_avx512f_vpermi2varv16si3;
25881 break;
25882 case E_V4SFmode:
25883 if (TARGET_AVX512VL)
25885 gen = gen_avx512vl_vpermi2varv4sf3;
25886 maskmode = V4SImode;
25888 break;
25889 case E_V8SFmode:
25890 if (TARGET_AVX512VL)
25892 gen = gen_avx512vl_vpermi2varv8sf3;
25893 maskmode = V8SImode;
25895 break;
25896 case E_V16SFmode:
25897 if (TARGET_AVX512F)
25899 gen = gen_avx512f_vpermi2varv16sf3;
25900 maskmode = V16SImode;
25902 break;
25903 case E_V2DImode:
25904 if (TARGET_AVX512VL)
25905 gen = gen_avx512vl_vpermi2varv2di3;
25906 break;
25907 case E_V4DImode:
25908 if (TARGET_AVX512VL)
25909 gen = gen_avx512vl_vpermi2varv4di3;
25910 break;
25911 case E_V8DImode:
25912 if (TARGET_AVX512F)
25913 gen = gen_avx512f_vpermi2varv8di3;
25914 break;
25915 case E_V2DFmode:
25916 if (TARGET_AVX512VL)
25918 gen = gen_avx512vl_vpermi2varv2df3;
25919 maskmode = V2DImode;
25921 break;
25922 case E_V4DFmode:
25923 if (TARGET_AVX512VL)
25925 gen = gen_avx512vl_vpermi2varv4df3;
25926 maskmode = V4DImode;
25928 break;
25929 case E_V8DFmode:
25930 if (TARGET_AVX512F)
25932 gen = gen_avx512f_vpermi2varv8df3;
25933 maskmode = V8DImode;
25935 break;
25936 default:
25937 break;
25940 if (gen == NULL)
25941 return false;
25943 /* ix86_expand_vec_perm_vpermi2 is called from both the const and the
25944 non-const expander, so the args are either in d, or in op0, op1 etc. */
25945 if (d)
25947 rtx vec[64];
25948 target = d->target;
25949 op0 = d->op0;
25950 op1 = d->op1;
25951 for (int i = 0; i < d->nelt; ++i)
25952 vec[i] = GEN_INT (d->perm[i]);
25953 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25956 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25957 return true;
25960 /* Expand a variable vector permutation. */
25962 void
25963 ix86_expand_vec_perm (rtx operands[])
25965 rtx target = operands[0];
25966 rtx op0 = operands[1];
25967 rtx op1 = operands[2];
25968 rtx mask = operands[3];
25969 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25970 machine_mode mode = GET_MODE (op0);
25971 machine_mode maskmode = GET_MODE (mask);
25972 int w, e, i;
25973 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25975 /* Number of elements in the vector. */
25976 w = GET_MODE_NUNITS (mode);
25977 e = GET_MODE_UNIT_SIZE (mode);
25978 gcc_assert (w <= 64);
25980 if (TARGET_AVX512F && one_operand_shuffle)
25982 rtx (*gen) (rtx, rtx, rtx) = NULL;
25983 switch (mode)
25985 case E_V16SImode:
25986 gen = gen_avx512f_permvarv16si;
25987 break;
25988 case E_V16SFmode:
25989 gen = gen_avx512f_permvarv16sf;
25990 break;
25991 case E_V8DImode:
25992 gen = gen_avx512f_permvarv8di;
25993 break;
25994 case E_V8DFmode:
25995 gen = gen_avx512f_permvarv8df;
25996 break;
25997 default:
25998 break;
26000 if (gen != NULL)
26002 emit_insn (gen (target, op0, mask));
26003 return;
26007 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
26008 return;
26010 if (TARGET_AVX2)
26012 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
26014 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
26015 a constant shuffle operand. With a tiny bit of effort we can
26016 use VPERMD instead. A re-interpretation stall for V4DFmode is
26017 unfortunate but there's no avoiding it.
26018 Similarly for V16HImode we don't have instructions for variable
26019 shuffling, while for V32QImode we can use vpshufb; vpshufb;
26020 vpermq; vpor after preparing suitable masks. */
26022 if (mode == V16HImode)
26024 maskmode = mode = V32QImode;
26025 w = 32;
26026 e = 1;
26028 else
26030 maskmode = mode = V8SImode;
26031 w = 8;
26032 e = 4;
26034 t1 = gen_reg_rtx (maskmode);
26036 /* Replicate the low bits of the V4DImode mask into V8SImode:
26037 mask = { A B C D }
26038 t1 = { A A B B C C D D }. */
26039 for (i = 0; i < w / 2; ++i)
26040 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
26041 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26042 vt = force_reg (maskmode, vt);
26043 mask = gen_lowpart (maskmode, mask);
26044 if (maskmode == V8SImode)
26045 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
26046 else
26047 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
26049 /* Multiply the shuffle indices by two. */
26050 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
26051 OPTAB_DIRECT);
26053 /* Add one to the odd shuffle indices:
26054 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
26055 for (i = 0; i < w / 2; ++i)
26057 vec[i * 2] = const0_rtx;
26058 vec[i * 2 + 1] = const1_rtx;
26060 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26061 vt = validize_mem (force_const_mem (maskmode, vt));
26062 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
26063 OPTAB_DIRECT);
26065 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
26066 operands[3] = mask = t1;
26067 target = gen_reg_rtx (mode);
26068 op0 = gen_lowpart (mode, op0);
26069 op1 = gen_lowpart (mode, op1);
26072 switch (mode)
26074 case E_V8SImode:
26075 /* The VPERMD and VPERMPS instructions already properly ignore
26076 the high bits of the shuffle elements. No need for us to
26077 perform an AND ourselves. */
26078 if (one_operand_shuffle)
26080 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
26081 if (target != operands[0])
26082 emit_move_insn (operands[0],
26083 gen_lowpart (GET_MODE (operands[0]), target));
26085 else
26087 t1 = gen_reg_rtx (V8SImode);
26088 t2 = gen_reg_rtx (V8SImode);
26089 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
26090 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
26091 goto merge_two;
26093 return;
26095 case E_V8SFmode:
26096 mask = gen_lowpart (V8SImode, mask);
26097 if (one_operand_shuffle)
26098 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
26099 else
26101 t1 = gen_reg_rtx (V8SFmode);
26102 t2 = gen_reg_rtx (V8SFmode);
26103 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
26104 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
26105 goto merge_two;
26107 return;
26109 case E_V4SImode:
26110 /* By combining the two 128-bit input vectors into one 256-bit
26111 input vector, we can use VPERMD and VPERMPS for the full
26112 two-operand shuffle. */
26113 t1 = gen_reg_rtx (V8SImode);
26114 t2 = gen_reg_rtx (V8SImode);
26115 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
26116 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
26117 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
26118 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
26119 return;
26121 case E_V4SFmode:
26122 t1 = gen_reg_rtx (V8SFmode);
26123 t2 = gen_reg_rtx (V8SImode);
26124 mask = gen_lowpart (V4SImode, mask);
26125 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
26126 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
26127 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
26128 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
26129 return;
26131 case E_V32QImode:
26132 t1 = gen_reg_rtx (V32QImode);
26133 t2 = gen_reg_rtx (V32QImode);
26134 t3 = gen_reg_rtx (V32QImode);
26135 vt2 = GEN_INT (-128);
26136 for (i = 0; i < 32; i++)
26137 vec[i] = vt2;
26138 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
26139 vt = force_reg (V32QImode, vt);
26140 for (i = 0; i < 32; i++)
26141 vec[i] = i < 16 ? vt2 : const0_rtx;
26142 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
26143 vt2 = force_reg (V32QImode, vt2);
26144 /* From mask create two adjusted masks, which contain the same
26145 bits as mask in the low 7 bits of each vector element.
26146 The first mask will have the most significant bit clear
26147 if it requests element from the same 128-bit lane
26148 and MSB set if it requests element from the other 128-bit lane.
26149 The second mask will have the opposite values of the MSB,
26150 and additionally will have its 128-bit lanes swapped.
26151 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
26152 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
26153 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
26154 stands for other 12 bytes. */
26155 /* The bit that tells whether an element comes from the same lane or
26156 from the other lane is bit 4, so shift it up by 3 to the MSB position. */
26157 t5 = gen_reg_rtx (V4DImode);
26158 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
26159 GEN_INT (3)));
26160 /* Clear MSB bits from the mask just in case it had them set. */
26161 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
26162 /* After this t1 will have MSB set for elements from other lane. */
26163 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
26164 /* Clear bits other than MSB. */
26165 emit_insn (gen_andv32qi3 (t1, t1, vt));
26166 /* Or in the lower bits from mask into t3. */
26167 emit_insn (gen_iorv32qi3 (t3, t1, t2));
26168 /* And invert MSB bits in t1, so MSB is set for elements from the same
26169 lane. */
26170 emit_insn (gen_xorv32qi3 (t1, t1, vt));
26171 /* Swap 128-bit lanes in t3. */
26172 t6 = gen_reg_rtx (V4DImode);
26173 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
26174 const2_rtx, GEN_INT (3),
26175 const0_rtx, const1_rtx));
26176 /* And or in the lower bits from mask into t1. */
26177 emit_insn (gen_iorv32qi3 (t1, t1, t2));
26178 if (one_operand_shuffle)
26180 /* Each of these shuffles will put 0s in places where an
26181 element from the other 128-bit lane is needed; otherwise it
26182 will shuffle in the requested value. */
26183 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
26184 gen_lowpart (V32QImode, t6)));
26185 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
26186 /* For t3 the 128-bit lanes are swapped again. */
26187 t7 = gen_reg_rtx (V4DImode);
26188 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
26189 const2_rtx, GEN_INT (3),
26190 const0_rtx, const1_rtx));
26191 /* And oring both together leads to the result. */
26192 emit_insn (gen_iorv32qi3 (target, t1,
26193 gen_lowpart (V32QImode, t7)));
26194 if (target != operands[0])
26195 emit_move_insn (operands[0],
26196 gen_lowpart (GET_MODE (operands[0]), target));
26197 return;
26200 t4 = gen_reg_rtx (V32QImode);
26201 /* Similar to the one_operand_shuffle code above, just
26202 repeated twice, once for each operand. The merge_two:
26203 code will merge the two results together. */
26204 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
26205 gen_lowpart (V32QImode, t6)));
26206 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
26207 gen_lowpart (V32QImode, t6)));
26208 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
26209 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
26210 t7 = gen_reg_rtx (V4DImode);
26211 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
26212 const2_rtx, GEN_INT (3),
26213 const0_rtx, const1_rtx));
26214 t8 = gen_reg_rtx (V4DImode);
26215 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
26216 const2_rtx, GEN_INT (3),
26217 const0_rtx, const1_rtx));
26218 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
26219 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
26220 t1 = t4;
26221 t2 = t3;
26222 goto merge_two;
26224 default:
26225 gcc_assert (GET_MODE_SIZE (mode) <= 16);
26226 break;
26230 if (TARGET_XOP)
26232 /* The XOP VPPERM insn supports three inputs. By ignoring the
26233 one_operand_shuffle special case, we avoid creating another
26234 set of constant vectors in memory. */
26235 one_operand_shuffle = false;
26237 /* mask = mask & {2*w-1, ...} */
26238 vt = GEN_INT (2*w - 1);
26240 else
26242 /* mask = mask & {w-1, ...} */
26243 vt = GEN_INT (w - 1);
26246 for (i = 0; i < w; i++)
26247 vec[i] = vt;
26248 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26249 mask = expand_simple_binop (maskmode, AND, mask, vt,
26250 NULL_RTX, 0, OPTAB_DIRECT);
26252 /* For non-QImode operations, convert the word permutation control
26253 into a byte permutation control. */
26254 if (mode != V16QImode)
26256 mask = expand_simple_binop (maskmode, ASHIFT, mask,
26257 GEN_INT (exact_log2 (e)),
26258 NULL_RTX, 0, OPTAB_DIRECT);
26260 /* Convert mask to vector of chars. */
26261 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
26263 /* Replicate each of the input bytes into byte positions:
26264 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
26265 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
26266 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
26267 for (i = 0; i < 16; ++i)
26268 vec[i] = GEN_INT (i/e * e);
26269 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
26270 vt = validize_mem (force_const_mem (V16QImode, vt));
26271 if (TARGET_XOP)
26272 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
26273 else
26274 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
26276 /* Convert it into the byte positions by doing
26277 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
26278 for (i = 0; i < 16; ++i)
26279 vec[i] = GEN_INT (i % e);
26280 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
26281 vt = validize_mem (force_const_mem (V16QImode, vt));
26282 emit_insn (gen_addv16qi3 (mask, mask, vt));
26285 /* The actual shuffle operations all operate on V16QImode. */
26286 op0 = gen_lowpart (V16QImode, op0);
26287 op1 = gen_lowpart (V16QImode, op1);
26289 if (TARGET_XOP)
26291 if (GET_MODE (target) != V16QImode)
26292 target = gen_reg_rtx (V16QImode);
26293 emit_insn (gen_xop_pperm (target, op0, op1, mask));
26294 if (target != operands[0])
26295 emit_move_insn (operands[0],
26296 gen_lowpart (GET_MODE (operands[0]), target));
26298 else if (one_operand_shuffle)
26300 if (GET_MODE (target) != V16QImode)
26301 target = gen_reg_rtx (V16QImode);
26302 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
26303 if (target != operands[0])
26304 emit_move_insn (operands[0],
26305 gen_lowpart (GET_MODE (operands[0]), target));
26307 else
26309 rtx xops[6];
26310 bool ok;
26312 /* Shuffle the two input vectors independently. */
26313 t1 = gen_reg_rtx (V16QImode);
26314 t2 = gen_reg_rtx (V16QImode);
26315 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
26316 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
26318 merge_two:
26319 /* Then merge them together. The key is whether any given control
26320 element contained a bit set that indicates the second word. */
26321 mask = operands[3];
26322 vt = GEN_INT (w);
26323 if (maskmode == V2DImode && !TARGET_SSE4_1)
26325 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
26326 more shuffle to convert the V2DI input mask into a V4SI
26327 input mask, at which point the masking that expand_int_vcond
26328 performs will work as desired. */
26329 rtx t3 = gen_reg_rtx (V4SImode);
26330 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
26331 const0_rtx, const0_rtx,
26332 const2_rtx, const2_rtx));
26333 mask = t3;
26334 maskmode = V4SImode;
26335 e = w = 4;
26338 for (i = 0; i < w; i++)
26339 vec[i] = vt;
26340 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26341 vt = force_reg (maskmode, vt);
26342 mask = expand_simple_binop (maskmode, AND, mask, vt,
26343 NULL_RTX, 0, OPTAB_DIRECT);
26345 if (GET_MODE (target) != mode)
26346 target = gen_reg_rtx (mode);
26347 xops[0] = target;
26348 xops[1] = gen_lowpart (mode, t2);
26349 xops[2] = gen_lowpart (mode, t1);
26350 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
26351 xops[4] = mask;
26352 xops[5] = vt;
26353 ok = ix86_expand_int_vcond (xops);
26354 gcc_assert (ok);
26355 if (target != operands[0])
26356 emit_move_insn (operands[0],
26357 gen_lowpart (GET_MODE (operands[0]), target));
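/* For the V16QImode fallback above, a word permutation control is turned
   into a byte permutation control roughly as follows (a sketch, element
   size E bytes): each index is multiplied by E, replicated into its E
   byte slots, and then {0, 1, ..., E-1} is added within each group.
   E.g. a V4SI index vector { 3, 0, 2, 1 } becomes the V16QI pshufb
   control { 12,13,14,15, 0,1,2,3, 8,9,10,11, 4,5,6,7 }.  */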
26361 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
26362 true if we should do zero extension, else sign extension. HIGH_P is
26363 true if we want the N/2 high elements, else the low elements. */
26365 void
26366 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
26368 machine_mode imode = GET_MODE (src);
26369 rtx tmp;
26371 if (TARGET_SSE4_1)
26373 rtx (*unpack)(rtx, rtx);
26374 rtx (*extract)(rtx, rtx) = NULL;
26375 machine_mode halfmode = BLKmode;
26377 switch (imode)
26379 case E_V64QImode:
26380 if (unsigned_p)
26381 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
26382 else
26383 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
26384 halfmode = V32QImode;
26385 extract
26386 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
26387 break;
26388 case E_V32QImode:
26389 if (unsigned_p)
26390 unpack = gen_avx2_zero_extendv16qiv16hi2;
26391 else
26392 unpack = gen_avx2_sign_extendv16qiv16hi2;
26393 halfmode = V16QImode;
26394 extract
26395 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
26396 break;
26397 case E_V32HImode:
26398 if (unsigned_p)
26399 unpack = gen_avx512f_zero_extendv16hiv16si2;
26400 else
26401 unpack = gen_avx512f_sign_extendv16hiv16si2;
26402 halfmode = V16HImode;
26403 extract
26404 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
26405 break;
26406 case E_V16HImode:
26407 if (unsigned_p)
26408 unpack = gen_avx2_zero_extendv8hiv8si2;
26409 else
26410 unpack = gen_avx2_sign_extendv8hiv8si2;
26411 halfmode = V8HImode;
26412 extract
26413 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
26414 break;
26415 case E_V16SImode:
26416 if (unsigned_p)
26417 unpack = gen_avx512f_zero_extendv8siv8di2;
26418 else
26419 unpack = gen_avx512f_sign_extendv8siv8di2;
26420 halfmode = V8SImode;
26421 extract
26422 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
26423 break;
26424 case E_V8SImode:
26425 if (unsigned_p)
26426 unpack = gen_avx2_zero_extendv4siv4di2;
26427 else
26428 unpack = gen_avx2_sign_extendv4siv4di2;
26429 halfmode = V4SImode;
26430 extract
26431 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
26432 break;
26433 case E_V16QImode:
26434 if (unsigned_p)
26435 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
26436 else
26437 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
26438 break;
26439 case E_V8HImode:
26440 if (unsigned_p)
26441 unpack = gen_sse4_1_zero_extendv4hiv4si2;
26442 else
26443 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26444 break;
26445 case E_V4SImode:
26446 if (unsigned_p)
26447 unpack = gen_sse4_1_zero_extendv2siv2di2;
26448 else
26449 unpack = gen_sse4_1_sign_extendv2siv2di2;
26450 break;
26451 default:
26452 gcc_unreachable ();
26455 if (GET_MODE_SIZE (imode) >= 32)
26457 tmp = gen_reg_rtx (halfmode);
26458 emit_insn (extract (tmp, src));
26460 else if (high_p)
26462 /* Shift higher 8 bytes to lower 8 bytes. */
26463 tmp = gen_reg_rtx (V1TImode);
26464 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26465 GEN_INT (64)));
26466 tmp = gen_lowpart (imode, tmp);
26468 else
26469 tmp = src;
26471 emit_insn (unpack (dest, tmp));
26473 else
26475 rtx (*unpack)(rtx, rtx, rtx);
26477 switch (imode)
26479 case E_V16QImode:
26480 if (high_p)
26481 unpack = gen_vec_interleave_highv16qi;
26482 else
26483 unpack = gen_vec_interleave_lowv16qi;
26484 break;
26485 case E_V8HImode:
26486 if (high_p)
26487 unpack = gen_vec_interleave_highv8hi;
26488 else
26489 unpack = gen_vec_interleave_lowv8hi;
26490 break;
26491 case E_V4SImode:
26492 if (high_p)
26493 unpack = gen_vec_interleave_highv4si;
26494 else
26495 unpack = gen_vec_interleave_lowv4si;
26496 break;
26497 default:
26498 gcc_unreachable ();
26501 if (unsigned_p)
26502 tmp = force_reg (imode, CONST0_RTX (imode));
26503 else
26504 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26505 src, pc_rtx, pc_rtx);
26507 rtx tmp2 = gen_reg_rtx (imode);
26508 emit_insn (unpack (tmp2, src, tmp));
26509 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
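/* Without SSE4.1 the widening above is done by interleaving; as a sketch
   for the sign-extending case:

     sign = (0 > src);                    /* all-ones in negative elements */
     dest = interleave_low (src, sign);   /* or interleave_high for HIGH_P */

   Zero extension interleaves with a zero vector instead, while the
   SSE4.1+ path uses pmovsx/pmovzx directly, shifting the high half down
   first when the high elements are wanted.  */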
26513 /* Expand conditional increment or decrement using adb/sbb instructions.
26514 The default case using setcc followed by the conditional move can be
26515 done by generic code. */
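/* Purely illustrative sketch (the register assignment is arbitrary): for
   unsigned A and B, a source fragment such as

     c += (a < b);

   can be expanded via the carry flag as

     cmpl  %esi, %edi        # CF = (a < b), a in %edi, b in %esi
     adcl  $0, %eax          # c += CF

   and the decrement case uses sbb the same way.  */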
26516 bool
26517 ix86_expand_int_addcc (rtx operands[])
26519 enum rtx_code code = GET_CODE (operands[1]);
26520 rtx flags;
26521 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26522 rtx compare_op;
26523 rtx val = const0_rtx;
26524 bool fpcmp = false;
26525 machine_mode mode;
26526 rtx op0 = XEXP (operands[1], 0);
26527 rtx op1 = XEXP (operands[1], 1);
26529 if (operands[3] != const1_rtx
26530 && operands[3] != constm1_rtx)
26531 return false;
26532 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26533 return false;
26534 code = GET_CODE (compare_op);
26536 flags = XEXP (compare_op, 0);
26538 if (GET_MODE (flags) == CCFPmode
26539 || GET_MODE (flags) == CCFPUmode)
26541 fpcmp = true;
26542 code = ix86_fp_compare_code_to_integer (code);
26545 if (code != LTU)
26547 val = constm1_rtx;
26548 if (fpcmp)
26549 PUT_CODE (compare_op,
26550 reverse_condition_maybe_unordered
26551 (GET_CODE (compare_op)));
26552 else
26553 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26556 mode = GET_MODE (operands[0]);
26558 /* Construct either adc or sbb insn. */
26559 if ((code == LTU) == (operands[3] == constm1_rtx))
26561 switch (mode)
26563 case E_QImode:
26564 insn = gen_subqi3_carry;
26565 break;
26566 case E_HImode:
26567 insn = gen_subhi3_carry;
26568 break;
26569 case E_SImode:
26570 insn = gen_subsi3_carry;
26571 break;
26572 case E_DImode:
26573 insn = gen_subdi3_carry;
26574 break;
26575 default:
26576 gcc_unreachable ();
26579 else
26581 switch (mode)
26583 case E_QImode:
26584 insn = gen_addqi3_carry;
26585 break;
26586 case E_HImode:
26587 insn = gen_addhi3_carry;
26588 break;
26589 case E_SImode:
26590 insn = gen_addsi3_carry;
26591 break;
26592 case E_DImode:
26593 insn = gen_adddi3_carry;
26594 break;
26595 default:
26596 gcc_unreachable ();
26599 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
26601 return true;
26605 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
26606 but works for floating-point values and non-offsettable memories.
26607 For pushes, it returns just stack offsets; the values will be saved
26608 in the right order.  At most four parts are generated.  */
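/* Worked example, purely for illustration: on a 32-bit target a DFmode
   constant such as 1.0 (IEEE-754 image 0x3ff0000000000000) is split into
   two SImode immediates

     parts[0] = 0x00000000   (low 32 bits)
     parts[1] = 0x3ff00000   (high 32 bits)

   while an XFmode value yields three SImode parts.  */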
26610 static int
26611 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26613 int size;
26615 if (!TARGET_64BIT)
26616 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26617 else
26618 size = (GET_MODE_SIZE (mode) + 4) / 8;
26620 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26621 gcc_assert (size >= 2 && size <= 4);
26623 /* Optimize constant pool references into immediates.  This is used by fp
26624 moves, which force all constants to memory to allow combining.  */
26625 if (MEM_P (operand) && MEM_READONLY_P (operand))
26627 rtx tmp = maybe_get_pool_constant (operand);
26628 if (tmp)
26629 operand = tmp;
26632 if (MEM_P (operand) && !offsettable_memref_p (operand))
26634 /* The only non-offsettable memories we handle are pushes.  */
26635 int ok = push_operand (operand, VOIDmode);
26637 gcc_assert (ok);
26639 operand = copy_rtx (operand);
26640 PUT_MODE (operand, word_mode);
26641 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26642 return size;
26645 if (GET_CODE (operand) == CONST_VECTOR)
26647 scalar_int_mode imode = int_mode_for_mode (mode).require ();
26648 /* Caution: if we looked through a constant pool memory above,
26649 the operand may actually have a different mode now. That's
26650 ok, since we want to pun this all the way back to an integer. */
26651 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26652 gcc_assert (operand != NULL);
26653 mode = imode;
26656 if (!TARGET_64BIT)
26658 if (mode == DImode)
26659 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26660 else
26662 int i;
26664 if (REG_P (operand))
26666 gcc_assert (reload_completed);
26667 for (i = 0; i < size; i++)
26668 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26670 else if (offsettable_memref_p (operand))
26672 operand = adjust_address (operand, SImode, 0);
26673 parts[0] = operand;
26674 for (i = 1; i < size; i++)
26675 parts[i] = adjust_address (operand, SImode, 4 * i);
26677 else if (CONST_DOUBLE_P (operand))
26679 const REAL_VALUE_TYPE *r;
26680 long l[4];
26682 r = CONST_DOUBLE_REAL_VALUE (operand);
26683 switch (mode)
26685 case E_TFmode:
26686 real_to_target (l, r, mode);
26687 parts[3] = gen_int_mode (l[3], SImode);
26688 parts[2] = gen_int_mode (l[2], SImode);
26689 break;
26690 case E_XFmode:
26691 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26692 long double may not be 80-bit. */
26693 real_to_target (l, r, mode);
26694 parts[2] = gen_int_mode (l[2], SImode);
26695 break;
26696 case E_DFmode:
26697 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26698 break;
26699 default:
26700 gcc_unreachable ();
26702 parts[1] = gen_int_mode (l[1], SImode);
26703 parts[0] = gen_int_mode (l[0], SImode);
26705 else
26706 gcc_unreachable ();
26709 else
26711 if (mode == TImode)
26712 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26713 if (mode == XFmode || mode == TFmode)
26715 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26716 if (REG_P (operand))
26718 gcc_assert (reload_completed);
26719 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26720 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26722 else if (offsettable_memref_p (operand))
26724 operand = adjust_address (operand, DImode, 0);
26725 parts[0] = operand;
26726 parts[1] = adjust_address (operand, upper_mode, 8);
26728 else if (CONST_DOUBLE_P (operand))
26730 long l[4];
26732 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26734 /* real_to_target puts 32-bit pieces in each long. */
26735 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26736 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26737 << 32), DImode);
26739 if (upper_mode == SImode)
26740 parts[1] = gen_int_mode (l[2], SImode);
26741 else
26742 parts[1]
26743 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26744 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26745 << 32), DImode);
26747 else
26748 gcc_unreachable ();
26752 return size;
26755 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26756 All the required insns are emitted here directly.  Operands 2-4
26757 contain the input values in the correct order; operands 5-7 contain
26758 the output values.  */
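/* Purely illustrative sketch (register and address choices are arbitrary):
   on a 32-bit target a DImode store of %edx:%eax into an 8-byte memory
   slot is split into two word-sized moves, roughly

     movl  %eax, (%ecx)      # low part
     movl  %edx, 4(%ecx)     # high part

   with the ordering chosen so that the source is not clobbered first.  */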
26760 void
26761 ix86_split_long_move (rtx operands[])
26763 rtx part[2][4];
26764 int nparts, i, j;
26765 int push = 0;
26766 int collisions = 0;
26767 machine_mode mode = GET_MODE (operands[0]);
26768 bool collisionparts[4];
26770 /* The DFmode expanders may ask us to move a double.
26771 For a 64-bit target this is a single move.  By hiding that fact
26772 here we simplify the i386.md splitters.  */
26773 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26775 /* Optimize constant pool references into immediates.  This is used by
26776 fp moves, which force all constants to memory to allow combining.  */
26778 if (MEM_P (operands[1])
26779 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26780 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26781 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26782 if (push_operand (operands[0], VOIDmode))
26784 operands[0] = copy_rtx (operands[0]);
26785 PUT_MODE (operands[0], word_mode);
26787 else
26788 operands[0] = gen_lowpart (DImode, operands[0]);
26789 operands[1] = gen_lowpart (DImode, operands[1]);
26790 emit_move_insn (operands[0], operands[1]);
26791 return;
26794 /* The only non-offsettable memory we handle is push. */
26795 if (push_operand (operands[0], VOIDmode))
26796 push = 1;
26797 else
26798 gcc_assert (!MEM_P (operands[0])
26799 || offsettable_memref_p (operands[0]));
26801 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26802 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26804 /* When emitting push, take care for source operands on the stack. */
26805 if (push && MEM_P (operands[1])
26806 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26808 rtx src_base = XEXP (part[1][nparts - 1], 0);
26810 /* Compensate for the stack decrement by 4. */
26811 if (!TARGET_64BIT && nparts == 3
26812 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26813 src_base = plus_constant (Pmode, src_base, 4);
26815 /* src_base refers to the stack pointer and is
26816 automatically decreased by emitted push. */
26817 for (i = 0; i < nparts; i++)
26818 part[1][i] = change_address (part[1][i],
26819 GET_MODE (part[1][i]), src_base);
26822 /* We need to do the copy in the right order in case an address register
26823 of the source overlaps the destination.  */
26824 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26826 rtx tmp;
26828 for (i = 0; i < nparts; i++)
26830 collisionparts[i]
26831 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26832 if (collisionparts[i])
26833 collisions++;
26836 /* Collision in the middle part can be handled by reordering. */
26837 if (collisions == 1 && nparts == 3 && collisionparts [1])
26839 std::swap (part[0][1], part[0][2]);
26840 std::swap (part[1][1], part[1][2]);
26842 else if (collisions == 1
26843 && nparts == 4
26844 && (collisionparts [1] || collisionparts [2]))
26846 if (collisionparts [1])
26848 std::swap (part[0][1], part[0][2]);
26849 std::swap (part[1][1], part[1][2]);
26851 else
26853 std::swap (part[0][2], part[0][3]);
26854 std::swap (part[1][2], part[1][3]);
26858 /* If there are more collisions, we can't handle it by reordering.
26859 Do an lea to the last part and use only one colliding move. */
26860 else if (collisions > 1)
26862 rtx base, addr;
26864 collisions = 1;
26866 base = part[0][nparts - 1];
26868 /* Handle the case when the last part isn't valid for lea.
26869 Happens in 64-bit mode storing the 12-byte XFmode. */
26870 if (GET_MODE (base) != Pmode)
26871 base = gen_rtx_REG (Pmode, REGNO (base));
26873 addr = XEXP (part[1][0], 0);
26874 if (TARGET_TLS_DIRECT_SEG_REFS)
26876 struct ix86_address parts;
26877 int ok = ix86_decompose_address (addr, &parts);
26878 gcc_assert (ok);
26879 /* It is not valid to use %gs: or %fs: in lea. */
26880 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
26882 emit_insn (gen_rtx_SET (base, addr));
26883 part[1][0] = replace_equiv_address (part[1][0], base);
26884 for (i = 1; i < nparts; i++)
26886 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26887 part[1][i] = replace_equiv_address (part[1][i], tmp);
26892 if (push)
26894 if (!TARGET_64BIT)
26896 if (nparts == 3)
26898 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26899 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26900 stack_pointer_rtx, GEN_INT (-4)));
26901 emit_move_insn (part[0][2], part[1][2]);
26903 else if (nparts == 4)
26905 emit_move_insn (part[0][3], part[1][3]);
26906 emit_move_insn (part[0][2], part[1][2]);
26909 else
26911 /* In 64-bit mode we don't have a 32-bit push available.  If the operand
26912 is a register, that is fine - we just use the larger counterpart.  We
26913 also retype memory - this comes from an attempt to avoid the REX prefix
26914 when moving the second half of a TFmode value.  */
26915 if (GET_MODE (part[1][1]) == SImode)
26917 switch (GET_CODE (part[1][1]))
26919 case MEM:
26920 part[1][1] = adjust_address (part[1][1], DImode, 0);
26921 break;
26923 case REG:
26924 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26925 break;
26927 default:
26928 gcc_unreachable ();
26931 if (GET_MODE (part[1][0]) == SImode)
26932 part[1][0] = part[1][1];
26935 emit_move_insn (part[0][1], part[1][1]);
26936 emit_move_insn (part[0][0], part[1][0]);
26937 return;
26940 /* Choose correct order to not overwrite the source before it is copied. */
26941 if ((REG_P (part[0][0])
26942 && REG_P (part[1][1])
26943 && (REGNO (part[0][0]) == REGNO (part[1][1])
26944 || (nparts == 3
26945 && REGNO (part[0][0]) == REGNO (part[1][2]))
26946 || (nparts == 4
26947 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26948 || (collisions > 0
26949 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26951 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26953 operands[2 + i] = part[0][j];
26954 operands[6 + i] = part[1][j];
26957 else
26959 for (i = 0; i < nparts; i++)
26961 operands[2 + i] = part[0][i];
26962 operands[6 + i] = part[1][i];
26966 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26967 if (optimize_insn_for_size_p ())
26969 for (j = 0; j < nparts - 1; j++)
26970 if (CONST_INT_P (operands[6 + j])
26971 && operands[6 + j] != const0_rtx
26972 && REG_P (operands[2 + j]))
26973 for (i = j; i < nparts - 1; i++)
26974 if (CONST_INT_P (operands[7 + i])
26975 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26976 operands[7 + i] = operands[2 + j];
26979 for (i = 0; i < nparts; i++)
26980 emit_move_insn (operands[2 + i], operands[6 + i]);
26982 return;
26985 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26986 left shift by a constant, either using a single shift or
26987 a sequence of add instructions. */
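/* Purely illustrative sketch: when two additions are no more expensive
   than a shift by a constant, a shift such as x << 2 may be emitted as

     addl  %eax, %eax        # x <<= 1
     addl  %eax, %eax        # x <<= 1 again

   instead of a single "shll $2, %eax".  */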
26989 static void
26990 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26992 rtx (*insn)(rtx, rtx, rtx);
26994 if (count == 1
26995 || (count * ix86_cost->add <= ix86_cost->shift_const
26996 && !optimize_insn_for_size_p ()))
26998 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26999 while (count-- > 0)
27000 emit_insn (insn (operand, operand, operand));
27002 else
27004 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
27005 emit_insn (insn (operand, operand, GEN_INT (count)));
27009 void
27010 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
27012 rtx (*gen_ashl3)(rtx, rtx, rtx);
27013 rtx (*gen_shld)(rtx, rtx, rtx);
27014 int half_width = GET_MODE_BITSIZE (mode) >> 1;
27016 rtx low[2], high[2];
27017 int count;
27019 if (CONST_INT_P (operands[2]))
27021 split_double_mode (mode, operands, 2, low, high);
27022 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
27024 if (count >= half_width)
27026 emit_move_insn (high[0], low[1]);
27027 emit_move_insn (low[0], const0_rtx);
27029 if (count > half_width)
27030 ix86_expand_ashl_const (high[0], count - half_width, mode);
27032 else
27034 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
27036 if (!rtx_equal_p (operands[0], operands[1]))
27037 emit_move_insn (operands[0], operands[1]);
27039 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
27040 ix86_expand_ashl_const (low[0], count, mode);
27042 return;
27045 split_double_mode (mode, operands, 1, low, high);
27047 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
27049 if (operands[1] == const1_rtx)
27051 /* Assuming we've chosen QImode-capable registers, 1 << N
27052 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
27053 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
27055 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
27057 ix86_expand_clear (low[0]);
27058 ix86_expand_clear (high[0]);
27059 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
27061 d = gen_lowpart (QImode, low[0]);
27062 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
27063 s = gen_rtx_EQ (QImode, flags, const0_rtx);
27064 emit_insn (gen_rtx_SET (d, s));
27066 d = gen_lowpart (QImode, high[0]);
27067 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
27068 s = gen_rtx_NE (QImode, flags, const0_rtx);
27069 emit_insn (gen_rtx_SET (d, s));
27072 /* Otherwise, we can get the same results by manually performing
27073 a bit extract operation on bit 5/6, and then performing the two
27074 shifts. The two methods of getting 0/1 into low/high are exactly
27075 the same size. Avoiding the shift in the bit extract case helps
27076 pentium4 a bit; no one else seems to care much either way. */
27077 else
27079 machine_mode half_mode;
27080 rtx (*gen_lshr3)(rtx, rtx, rtx);
27081 rtx (*gen_and3)(rtx, rtx, rtx);
27082 rtx (*gen_xor3)(rtx, rtx, rtx);
27083 HOST_WIDE_INT bits;
27084 rtx x;
27086 if (mode == DImode)
27088 half_mode = SImode;
27089 gen_lshr3 = gen_lshrsi3;
27090 gen_and3 = gen_andsi3;
27091 gen_xor3 = gen_xorsi3;
27092 bits = 5;
27094 else
27096 half_mode = DImode;
27097 gen_lshr3 = gen_lshrdi3;
27098 gen_and3 = gen_anddi3;
27099 gen_xor3 = gen_xordi3;
27100 bits = 6;
27103 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
27104 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
27105 else
27106 x = gen_lowpart (half_mode, operands[2]);
27107 emit_insn (gen_rtx_SET (high[0], x));
27109 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
27110 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
27111 emit_move_insn (low[0], high[0]);
27112 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
27115 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
27116 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
27117 return;
27120 if (operands[1] == constm1_rtx)
27122 /* For -1 << N, we can avoid the shld instruction, because we
27123 know that we're shifting 0...31/63 ones into a -1. */
27124 emit_move_insn (low[0], constm1_rtx);
27125 if (optimize_insn_for_size_p ())
27126 emit_move_insn (high[0], low[0]);
27127 else
27128 emit_move_insn (high[0], constm1_rtx);
27130 else
27132 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
27134 if (!rtx_equal_p (operands[0], operands[1]))
27135 emit_move_insn (operands[0], operands[1]);
27137 split_double_mode (mode, operands, 1, low, high);
27138 emit_insn (gen_shld (high[0], low[0], operands[2]));
27141 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
27143 if (TARGET_CMOVE && scratch)
27145 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27146 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27148 ix86_expand_clear (scratch);
27149 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
27151 else
27153 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
27154 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
27156 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
27160 void
27161 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
27163 rtx (*gen_ashr3)(rtx, rtx, rtx)
27164 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
27165 rtx (*gen_shrd)(rtx, rtx, rtx);
27166 int half_width = GET_MODE_BITSIZE (mode) >> 1;
27168 rtx low[2], high[2];
27169 int count;
27171 if (CONST_INT_P (operands[2]))
27173 split_double_mode (mode, operands, 2, low, high);
27174 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
27176 if (count == GET_MODE_BITSIZE (mode) - 1)
27178 emit_move_insn (high[0], high[1]);
27179 emit_insn (gen_ashr3 (high[0], high[0],
27180 GEN_INT (half_width - 1)));
27181 emit_move_insn (low[0], high[0]);
27184 else if (count >= half_width)
27186 emit_move_insn (low[0], high[1]);
27187 emit_move_insn (high[0], low[0]);
27188 emit_insn (gen_ashr3 (high[0], high[0],
27189 GEN_INT (half_width - 1)));
27191 if (count > half_width)
27192 emit_insn (gen_ashr3 (low[0], low[0],
27193 GEN_INT (count - half_width)));
27195 else
27197 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27199 if (!rtx_equal_p (operands[0], operands[1]))
27200 emit_move_insn (operands[0], operands[1]);
27202 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
27203 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
27206 else
27208 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27210 if (!rtx_equal_p (operands[0], operands[1]))
27211 emit_move_insn (operands[0], operands[1]);
27213 split_double_mode (mode, operands, 1, low, high);
27215 emit_insn (gen_shrd (low[0], high[0], operands[2]));
27216 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
27218 if (TARGET_CMOVE && scratch)
27220 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27221 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27223 emit_move_insn (scratch, high[0]);
27224 emit_insn (gen_ashr3 (scratch, scratch,
27225 GEN_INT (half_width - 1)));
27226 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
27227 scratch));
27229 else
27231 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
27232 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
27234 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
27239 void
27240 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
27242 rtx (*gen_lshr3)(rtx, rtx, rtx)
27243 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
27244 rtx (*gen_shrd)(rtx, rtx, rtx);
27245 int half_width = GET_MODE_BITSIZE (mode) >> 1;
27247 rtx low[2], high[2];
27248 int count;
27250 if (CONST_INT_P (operands[2]))
27252 split_double_mode (mode, operands, 2, low, high);
27253 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
27255 if (count >= half_width)
27257 emit_move_insn (low[0], high[1]);
27258 ix86_expand_clear (high[0]);
27260 if (count > half_width)
27261 emit_insn (gen_lshr3 (low[0], low[0],
27262 GEN_INT (count - half_width)));
27264 else
27266 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27268 if (!rtx_equal_p (operands[0], operands[1]))
27269 emit_move_insn (operands[0], operands[1]);
27271 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
27272 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
27275 else
27277 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27279 if (!rtx_equal_p (operands[0], operands[1]))
27280 emit_move_insn (operands[0], operands[1]);
27282 split_double_mode (mode, operands, 1, low, high);
27284 emit_insn (gen_shrd (low[0], high[0], operands[2]));
27285 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
27287 if (TARGET_CMOVE && scratch)
27289 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27290 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27292 ix86_expand_clear (scratch);
27293 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
27294 scratch));
27296 else
27298 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
27299 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
27301 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
27306 /* Predict just emitted jump instruction to be taken with probability PROB. */
27307 static void
27308 predict_jump (int prob)
27310 rtx_insn *insn = get_last_insn ();
27311 gcc_assert (JUMP_P (insn));
27312 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
27315 /* Helper function for the string operations below.  Test whether VARIABLE
27316 is aligned to VALUE bytes.  If so, jump to the label.  */
27317 static rtx_code_label *
27318 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
27320 rtx_code_label *label = gen_label_rtx ();
27321 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
27322 if (GET_MODE (variable) == DImode)
27323 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
27324 else
27325 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
27326 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
27327 1, label);
27328 if (epilogue)
27329 predict_jump (REG_BR_PROB_BASE * 50 / 100);
27330 else
27331 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27332 return label;
27335 /* Decrease COUNTREG by VALUE.  */
27336 static void
27337 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
27339 rtx (*gen_add)(rtx, rtx, rtx)
27340 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
27342 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
27345 /* Zero extend possibly SImode EXP to Pmode register. */
27347 ix86_zero_extend_to_Pmode (rtx exp)
27349 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
27352 /* Divide COUNTREG by SCALE. */
27353 static rtx
27354 scale_counter (rtx countreg, int scale)
27356 rtx sc;
27358 if (scale == 1)
27359 return countreg;
27360 if (CONST_INT_P (countreg))
27361 return GEN_INT (INTVAL (countreg) / scale);
27362 gcc_assert (REG_P (countreg));
27364 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
27365 GEN_INT (exact_log2 (scale)),
27366 NULL, 1, OPTAB_DIRECT);
27367 return sc;
27370 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
27371 DImode for constant loop counts. */
27373 static machine_mode
27374 counter_mode (rtx count_exp)
27376 if (GET_MODE (count_exp) != VOIDmode)
27377 return GET_MODE (count_exp);
27378 if (!CONST_INT_P (count_exp))
27379 return Pmode;
27380 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
27381 return DImode;
27382 return SImode;
27385 /* Copy the address to a Pmode register. This is used for x32 to
27386 truncate DImode TLS address to a SImode register. */
27388 static rtx
27389 ix86_copy_addr_to_reg (rtx addr)
27391 rtx reg;
27392 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
27394 reg = copy_addr_to_reg (addr);
27395 REG_POINTER (reg) = 1;
27396 return reg;
27398 else
27400 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
27401 reg = copy_to_mode_reg (DImode, addr);
27402 REG_POINTER (reg) = 1;
27403 return gen_rtx_SUBREG (SImode, reg, 0);
27407 /* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
27408 to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
27409 COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
27410 loop to set memory to VALUE (supposed to be in MODE).
27412 The size is rounded down to a whole number of chunks moved at once.
27413 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.  */
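/* Rough shape of the emitted loop, purely for illustration, for the move
   case with chunk size S = GET_MODE_SIZE (MODE) and unroll factor U:

     size = count & ~(S * U - 1);
     iter = 0;
   top:
     copy U chunks of S bytes from src + iter to dest + iter;
     iter += S * U;
     if (iter < size) goto top;
     dest += iter;  src += iter;

   The setmem variant stores VALUE instead of loading from src.  */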
27416 static void
27417 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27418 rtx destptr, rtx srcptr, rtx value,
27419 rtx count, machine_mode mode, int unroll,
27420 int expected_size, bool issetmem)
27422 rtx_code_label *out_label, *top_label;
27423 rtx iter, tmp;
27424 machine_mode iter_mode = counter_mode (count);
27425 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27426 rtx piece_size = GEN_INT (piece_size_n);
27427 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27428 rtx size;
27429 int i;
27431 top_label = gen_label_rtx ();
27432 out_label = gen_label_rtx ();
27433 iter = gen_reg_rtx (iter_mode);
27435 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27436 NULL, 1, OPTAB_DIRECT);
27437 /* Those two should combine. */
27438 if (piece_size == const1_rtx)
27440 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27441 true, out_label);
27442 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27444 emit_move_insn (iter, const0_rtx);
27446 emit_label (top_label);
27448 tmp = convert_modes (Pmode, iter_mode, iter, true);
27450 /* This assert could be relaxed - in that case we'll need to compute
27451 the smallest power of two containing PIECE_SIZE_N and pass it to
27452 offset_address.  */
27453 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27454 destmem = offset_address (destmem, tmp, piece_size_n);
27455 destmem = adjust_address (destmem, mode, 0);
27457 if (!issetmem)
27459 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27460 srcmem = adjust_address (srcmem, mode, 0);
27462 /* When unrolling for chips that reorder memory reads and writes,
27463 we can save registers by using a single temporary.
27464 Also, using 4 temporaries is overkill in 32-bit mode.  */
27465 if (!TARGET_64BIT && 0)
27467 for (i = 0; i < unroll; i++)
27469 if (i)
27471 destmem =
27472 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27473 srcmem =
27474 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27476 emit_move_insn (destmem, srcmem);
27479 else
27481 rtx tmpreg[4];
27482 gcc_assert (unroll <= 4);
27483 for (i = 0; i < unroll; i++)
27485 tmpreg[i] = gen_reg_rtx (mode);
27486 if (i)
27488 srcmem =
27489 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27491 emit_move_insn (tmpreg[i], srcmem);
27493 for (i = 0; i < unroll; i++)
27495 if (i)
27497 destmem =
27498 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27500 emit_move_insn (destmem, tmpreg[i]);
27504 else
27505 for (i = 0; i < unroll; i++)
27507 if (i)
27508 destmem =
27509 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27510 emit_move_insn (destmem, value);
27513 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27514 true, OPTAB_LIB_WIDEN);
27515 if (tmp != iter)
27516 emit_move_insn (iter, tmp);
27518 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27519 true, top_label);
27520 if (expected_size != -1)
27522 expected_size /= GET_MODE_SIZE (mode) * unroll;
27523 if (expected_size == 0)
27524 predict_jump (0);
27525 else if (expected_size > REG_BR_PROB_BASE)
27526 predict_jump (REG_BR_PROB_BASE - 1);
27527 else
27528 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27530 else
27531 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27532 iter = ix86_zero_extend_to_Pmode (iter);
27533 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27534 true, OPTAB_LIB_WIDEN);
27535 if (tmp != destptr)
27536 emit_move_insn (destptr, tmp);
27537 if (!issetmem)
27539 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27540 true, OPTAB_LIB_WIDEN);
27541 if (tmp != srcptr)
27542 emit_move_insn (srcptr, tmp);
27544 emit_label (out_label);
27547 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
27548 argument.  When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27549 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27550 In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27551 ORIG_VALUE is the original value passed to memset to fill the memory with.
27552 Other arguments have the same meaning as for the previous function.  */
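/* Purely illustrative sketch: for a copy whose byte count is a known
   multiple of 4 this boils down to something like

     movl  $count/4, %ecx
     rep movsl               # copy %ecx SImode chunks from %esi to %edi

   and the memset case similarly uses "rep stosl" with the promoted value
   in %eax.  */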
27554 static void
27555 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27556 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27557 rtx count,
27558 machine_mode mode, bool issetmem)
27560 rtx destexp;
27561 rtx srcexp;
27562 rtx countreg;
27563 HOST_WIDE_INT rounded_count;
27565 /* If possible, it is shorter to use rep movs.
27566 TODO: Maybe it is better to move this logic to decide_alg. */
27567 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27568 && (!issetmem || orig_value == const0_rtx))
27569 mode = SImode;
27571 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27572 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27574 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27575 GET_MODE_SIZE (mode)));
27576 if (mode != QImode)
27578 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27579 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27580 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27582 else
27583 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
27584 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27586 rounded_count
27587 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27588 destmem = shallow_copy_rtx (destmem);
27589 set_mem_size (destmem, rounded_count);
27591 else if (MEM_SIZE_KNOWN_P (destmem))
27592 clear_mem_size (destmem);
27594 if (issetmem)
27596 value = force_reg (mode, gen_lowpart (mode, value));
27597 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27599 else
27601 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27602 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27603 if (mode != QImode)
27605 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27606 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27607 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27609 else
27610 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27611 if (CONST_INT_P (count))
27613 rounded_count
27614 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27615 srcmem = shallow_copy_rtx (srcmem);
27616 set_mem_size (srcmem, rounded_count);
27618 else
27620 if (MEM_SIZE_KNOWN_P (srcmem))
27621 clear_mem_size (srcmem);
27623 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27624 destexp, srcexp));
27628 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27629 DESTMEM.
27630 SRCMEM is passed by pointer so it can be updated on return.
27631 The return value is the updated DESTMEM.  */
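/* Purely illustrative sketch: for SIZE_TO_MOVE == 8 on a 64-bit target
   this emits a single DImode load/store through a temporary, roughly

     movq  (%rsi), %rax
     movq  %rax, (%rdi)

   and then advances both pointers by 8.  */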
27632 static rtx
27633 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27634 HOST_WIDE_INT size_to_move)
27636 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27637 enum insn_code code;
27638 machine_mode move_mode;
27639 int piece_size, i;
27641 /* Find the widest mode in which we could perform moves.
27642 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
27643 it until move of such size is supported. */
27644 piece_size = 1 << floor_log2 (size_to_move);
27645 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
27646 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
27648 gcc_assert (piece_size > 1);
27649 piece_size >>= 1;
27652 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27653 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27654 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27656 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27657 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27658 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
27660 move_mode = word_mode;
27661 piece_size = GET_MODE_SIZE (move_mode);
27662 code = optab_handler (mov_optab, move_mode);
27665 gcc_assert (code != CODE_FOR_nothing);
27667 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27668 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27670 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27671 gcc_assert (size_to_move % piece_size == 0);
27672 adjust = GEN_INT (piece_size);
27673 for (i = 0; i < size_to_move; i += piece_size)
27675 /* We move from memory to memory, so we'll need to do it via
27676 a temporary register. */
27677 tempreg = gen_reg_rtx (move_mode);
27678 emit_insn (GEN_FCN (code) (tempreg, src));
27679 emit_insn (GEN_FCN (code) (dst, tempreg));
27681 emit_move_insn (destptr,
27682 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27683 emit_move_insn (srcptr,
27684 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27686 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27687 piece_size);
27688 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27689 piece_size);
27692 /* Update DST and SRC rtx. */
27693 *srcmem = src;
27694 return dst;
27697 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
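/* Purely illustrative example: for a constant count with
   count % max_size == 7 the epilogue degenerates into one 4-byte, one
   2-byte and one 1-byte move - one move per set bit of the residual
   size.  */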
27698 static void
27699 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27700 rtx destptr, rtx srcptr, rtx count, int max_size)
27702 rtx src, dest;
27703 if (CONST_INT_P (count))
27705 HOST_WIDE_INT countval = INTVAL (count);
27706 HOST_WIDE_INT epilogue_size = countval % max_size;
27707 int i;
27709 /* For now MAX_SIZE should be a power of 2. This assert could be
27710 relaxed, but it'll require a bit more complicated epilogue
27711 expanding. */
27712 gcc_assert ((max_size & (max_size - 1)) == 0);
27713 for (i = max_size; i >= 1; i >>= 1)
27715 if (epilogue_size & i)
27716 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27718 return;
27720 if (max_size > 8)
27722 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27723 count, 1, OPTAB_DIRECT);
27724 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27725 count, QImode, 1, 4, false);
27726 return;
27729 /* When single stringops are available, we can cheaply increase the dest
27730 and src pointers.  Otherwise we save code size by maintaining an offset
27731 (zero is readily available from the preceding rep operation) and using x86 addressing modes.  */
27733 if (TARGET_SINGLE_STRINGOP)
27735 if (max_size > 4)
27737 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27738 src = change_address (srcmem, SImode, srcptr);
27739 dest = change_address (destmem, SImode, destptr);
27740 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27741 emit_label (label);
27742 LABEL_NUSES (label) = 1;
27744 if (max_size > 2)
27746 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27747 src = change_address (srcmem, HImode, srcptr);
27748 dest = change_address (destmem, HImode, destptr);
27749 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27750 emit_label (label);
27751 LABEL_NUSES (label) = 1;
27753 if (max_size > 1)
27755 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27756 src = change_address (srcmem, QImode, srcptr);
27757 dest = change_address (destmem, QImode, destptr);
27758 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27759 emit_label (label);
27760 LABEL_NUSES (label) = 1;
27763 else
27765 rtx offset = force_reg (Pmode, const0_rtx);
27766 rtx tmp;
27768 if (max_size > 4)
27770 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27771 src = change_address (srcmem, SImode, srcptr);
27772 dest = change_address (destmem, SImode, destptr);
27773 emit_move_insn (dest, src);
27774 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27775 true, OPTAB_LIB_WIDEN);
27776 if (tmp != offset)
27777 emit_move_insn (offset, tmp);
27778 emit_label (label);
27779 LABEL_NUSES (label) = 1;
27781 if (max_size > 2)
27783 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27784 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27785 src = change_address (srcmem, HImode, tmp);
27786 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27787 dest = change_address (destmem, HImode, tmp);
27788 emit_move_insn (dest, src);
27789 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27790 true, OPTAB_LIB_WIDEN);
27791 if (tmp != offset)
27792 emit_move_insn (offset, tmp);
27793 emit_label (label);
27794 LABEL_NUSES (label) = 1;
27796 if (max_size > 1)
27798 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27799 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27800 src = change_address (srcmem, QImode, tmp);
27801 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27802 dest = change_address (destmem, QImode, tmp);
27803 emit_move_insn (dest, src);
27804 emit_label (label);
27805 LABEL_NUSES (label) = 1;
27810 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
27811 with value PROMOTED_VAL.
27812 DESTPTR is advanced as the stores are emitted.
27813 The return value is the updated DESTMEM.  */
27814 static rtx
27815 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27816 HOST_WIDE_INT size_to_move)
27818 rtx dst = destmem, adjust;
27819 enum insn_code code;
27820 machine_mode move_mode;
27821 int piece_size, i;
27823 /* Find the widest mode in which we could perform moves.
27824 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
27825 it until move of such size is supported. */
27826 move_mode = GET_MODE (promoted_val);
27827 if (move_mode == VOIDmode)
27828 move_mode = QImode;
27829 if (size_to_move < GET_MODE_SIZE (move_mode))
27831 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
27832 move_mode = int_mode_for_size (move_bits, 0).require ();
27833 promoted_val = gen_lowpart (move_mode, promoted_val);
27835 piece_size = GET_MODE_SIZE (move_mode);
27836 code = optab_handler (mov_optab, move_mode);
27837 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27839 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27841 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27842 gcc_assert (size_to_move % piece_size == 0);
27843 adjust = GEN_INT (piece_size);
27844 for (i = 0; i < size_to_move; i += piece_size)
27846 if (piece_size <= GET_MODE_SIZE (word_mode))
27848 emit_insn (gen_strset (destptr, dst, promoted_val));
27849 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27850 piece_size);
27851 continue;
27854 emit_insn (GEN_FCN (code) (dst, promoted_val));
27856 emit_move_insn (destptr,
27857 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27859 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27860 piece_size);
27863 /* Update DST rtx. */
27864 return dst;
27866 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
27867 static void
27868 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27869 rtx count, int max_size)
27871 count =
27872 expand_simple_binop (counter_mode (count), AND, count,
27873 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27874 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27875 gen_lowpart (QImode, value), count, QImode,
27876 1, max_size / 2, true);
27879 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
27880 static void
27881 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27882 rtx count, int max_size)
27884 rtx dest;
27886 if (CONST_INT_P (count))
27888 HOST_WIDE_INT countval = INTVAL (count);
27889 HOST_WIDE_INT epilogue_size = countval % max_size;
27890 int i;
27892 /* For now MAX_SIZE should be a power of 2. This assert could be
27893 relaxed, but it'll require a bit more complicated epilogue
27894 expanding. */
27895 gcc_assert ((max_size & (max_size - 1)) == 0);
27896 for (i = max_size; i >= 1; i >>= 1)
27898 if (epilogue_size & i)
27900 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27901 destmem = emit_memset (destmem, destptr, vec_value, i);
27902 else
27903 destmem = emit_memset (destmem, destptr, value, i);
27906 return;
27908 if (max_size > 32)
27910 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27911 return;
27913 if (max_size > 16)
27915 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27916 if (TARGET_64BIT)
27918 dest = change_address (destmem, DImode, destptr);
27919 emit_insn (gen_strset (destptr, dest, value));
27920 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27921 emit_insn (gen_strset (destptr, dest, value));
27923 else
27925 dest = change_address (destmem, SImode, destptr);
27926 emit_insn (gen_strset (destptr, dest, value));
27927 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27928 emit_insn (gen_strset (destptr, dest, value));
27929 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27930 emit_insn (gen_strset (destptr, dest, value));
27931 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27932 emit_insn (gen_strset (destptr, dest, value));
27934 emit_label (label);
27935 LABEL_NUSES (label) = 1;
27937 if (max_size > 8)
27939 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27940 if (TARGET_64BIT)
27942 dest = change_address (destmem, DImode, destptr);
27943 emit_insn (gen_strset (destptr, dest, value));
27945 else
27947 dest = change_address (destmem, SImode, destptr);
27948 emit_insn (gen_strset (destptr, dest, value));
27949 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27950 emit_insn (gen_strset (destptr, dest, value));
27952 emit_label (label);
27953 LABEL_NUSES (label) = 1;
27955 if (max_size > 4)
27957 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27958 dest = change_address (destmem, SImode, destptr);
27959 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27960 emit_label (label);
27961 LABEL_NUSES (label) = 1;
27963 if (max_size > 2)
27965 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27966 dest = change_address (destmem, HImode, destptr);
27967 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27968 emit_label (label);
27969 LABEL_NUSES (label) = 1;
27971 if (max_size > 1)
27973 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27974 dest = change_address (destmem, QImode, destptr);
27975 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27976 emit_label (label);
27977 LABEL_NUSES (label) = 1;
27981 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
27982 enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT.  The original
27983 alignment is ALIGN.  Depending on ISSETMEM, either arguments SRCMEM/SRCPTR
27984 or VALUE/VEC_VALUE are ignored.
27985 The return value is the updated DESTMEM.  */
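/* Rough shape of the emitted prologue, purely for illustration, for
   ALIGN == 1 and DESIRED_ALIGNMENT == 4:

     if (dest & 1)  { copy or set 1 byte;  count -= 1; }
     if (dest & 2)  { copy or set 2 bytes; count -= 2; }

   after which the destination is 4-byte aligned for the main loop.  */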
27986 static rtx
27987 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27988 rtx destptr, rtx srcptr, rtx value,
27989 rtx vec_value, rtx count, int align,
27990 int desired_alignment, bool issetmem)
27992 int i;
27993 for (i = 1; i < desired_alignment; i <<= 1)
27995 if (align <= i)
27997 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27998 if (issetmem)
28000 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
28001 destmem = emit_memset (destmem, destptr, vec_value, i);
28002 else
28003 destmem = emit_memset (destmem, destptr, value, i);
28005 else
28006 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
28007 ix86_adjust_counter (count, i);
28008 emit_label (label);
28009 LABEL_NUSES (label) = 1;
28010 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
28013 return destmem;
28016 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
28017 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
28018 and jump to DONE_LABEL.  */
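/* Worked example, purely for illustration: with SIZE == 4 and a runtime
   count of 6, the emitted sequence is effectively

     dest[0..3] = src[0..3];        # first 4 bytes
     dest[2..5] = src[2..5];        # last 4 bytes, ending at dest + count

   The two moves overlap but together cover all 6 bytes, so no byte loop
   is needed.  */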
28019 static void
28020 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
28021 rtx destptr, rtx srcptr,
28022 rtx value, rtx vec_value,
28023 rtx count, int size,
28024 rtx done_label, bool issetmem)
28026 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
28027 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
28028 rtx modesize;
28029 int n;
28031 /* If we do not have vector value to copy, we must reduce size. */
28032 if (issetmem)
28034 if (!vec_value)
28036 if (GET_MODE (value) == VOIDmode && size > 8)
28037 mode = Pmode;
28038 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
28039 mode = GET_MODE (value);
28041 else
28042 mode = GET_MODE (vec_value), value = vec_value;
28044 else
28046 /* Choose appropriate vector mode. */
28047 if (size >= 32)
28048 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
28049 else if (size >= 16)
28050 mode = TARGET_SSE ? V16QImode : DImode;
28051 srcmem = change_address (srcmem, mode, srcptr);
28053 destmem = change_address (destmem, mode, destptr);
28054 modesize = GEN_INT (GET_MODE_SIZE (mode));
28055 gcc_assert (GET_MODE_SIZE (mode) <= size);
28056 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
28058 if (issetmem)
28059 emit_move_insn (destmem, gen_lowpart (mode, value));
28060 else
28062 emit_move_insn (destmem, srcmem);
28063 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
28065 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
28068 destmem = offset_address (destmem, count, 1);
28069 destmem = offset_address (destmem, GEN_INT (-2 * size),
28070 GET_MODE_SIZE (mode));
28071 if (!issetmem)
28073 srcmem = offset_address (srcmem, count, 1);
28074 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
28075 GET_MODE_SIZE (mode));
28077 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
28079 if (issetmem)
28080 emit_move_insn (destmem, gen_lowpart (mode, value));
28081 else
28083 emit_move_insn (destmem, srcmem);
28084 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
28086 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
28088 emit_jump_insn (gen_jump (done_label));
28089 emit_barrier ();
28091 emit_label (label);
28092 LABEL_NUSES (label) = 1;
28095 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
28096 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
28097 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
28098 proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
28099 DONE_LABEL is a label after the whole copying sequence.  The label is created
28100 on demand if *DONE_LABEL is NULL.
28101 MIN_SIZE is the minimal size of the block copied.  This value gets adjusted for new
28102 bounds after the initial copies.
28104 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
28105 DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
28106 we will dispatch to a library call for large blocks.
28108 In pseudocode we do:
28110 if (COUNT < SIZE)
28112 Assume that SIZE is 4. Bigger sizes are handled analogously
28113 if (COUNT & 4)
28115 copy 4 bytes from SRCPTR to DESTPTR
28116 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
28117 goto done_label
28119 if (!COUNT)
28120 goto done_label;
28121 copy 1 byte from SRCPTR to DESTPTR
28122 if (COUNT & 2)
28124 copy 2 bytes from SRCPTR to DESTPTR
28125 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
28128 else
28130 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
28131 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
28133 OLD_DESTPTR = DESTPTR;
28134 Align DESTPTR up to DESIRED_ALIGN
28135 SRCPTR += DESTPTR - OLD_DESTPTR
28136 COUNT -= DESTPTR - OLD_DESTPTR
28137 if (DYNAMIC_CHECK)
28138 Round COUNT down to multiple of SIZE
28139 << optional caller supplied zero size guard is here >>
28140 << optional caller supplied dynamic check is here >>
28141 << caller supplied main copy loop is here >>
28143 done_label:  */
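/* Concrete instance of the above, purely for illustration: with SIZE == 16,
   ALIGN == 1, DESIRED_ALIGN == 16 and a runtime COUNT of 100, the code
   copies bytes 0..15 and bytes 84..99 up front, rounds DESTPTR up to a
   16-byte boundary, adjusts SRCPTR and COUNT by the bytes skipped, and
   lets the main loop handle the middle in aligned 16-byte chunks; any
   tail the loop rounds off was already covered by the trailing copy.  */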
28145 static void
28146 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
28147 rtx *destptr, rtx *srcptr,
28148 machine_mode mode,
28149 rtx value, rtx vec_value,
28150 rtx *count,
28151 rtx_code_label **done_label,
28152 int size,
28153 int desired_align,
28154 int align,
28155 unsigned HOST_WIDE_INT *min_size,
28156 bool dynamic_check,
28157 bool issetmem)
28159 rtx_code_label *loop_label = NULL, *label;
28160 int n;
28161 rtx modesize;
28162 int prolog_size = 0;
28163 rtx mode_value;
28165 /* Choose the proper value to copy.  */
28166 if (issetmem && VECTOR_MODE_P (mode))
28167 mode_value = vec_value;
28168 else
28169 mode_value = value;
28170 gcc_assert (GET_MODE_SIZE (mode) <= size);
28172 /* See if block is big or small, handle small blocks. */
28173 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
28175 int size2 = size;
28176 loop_label = gen_label_rtx ();
28178 if (!*done_label)
28179 *done_label = gen_label_rtx ();
28181 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
28182 1, loop_label);
28183 size2 >>= 1;
28185 /* Handle sizes > 3. */
28186 for (;size2 > 2; size2 >>= 1)
28187 expand_small_movmem_or_setmem (destmem, srcmem,
28188 *destptr, *srcptr,
28189 value, vec_value,
28190 *count,
28191 size2, *done_label, issetmem);
28192 /* Nothing to copy? Jump to DONE_LABEL if so */
28193 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
28194 1, *done_label);
28196 /* Do a byte copy. */
28197 destmem = change_address (destmem, QImode, *destptr);
28198 if (issetmem)
28199 emit_move_insn (destmem, gen_lowpart (QImode, value));
28200 else
28202 srcmem = change_address (srcmem, QImode, *srcptr);
28203 emit_move_insn (destmem, srcmem);
28206 /* Handle sizes 2 and 3. */
28207 label = ix86_expand_aligntest (*count, 2, false);
28208 destmem = change_address (destmem, HImode, *destptr);
28209 destmem = offset_address (destmem, *count, 1);
28210 destmem = offset_address (destmem, GEN_INT (-2), 2);
28211 if (issetmem)
28212 emit_move_insn (destmem, gen_lowpart (HImode, value));
28213 else
28215 srcmem = change_address (srcmem, HImode, *srcptr);
28216 srcmem = offset_address (srcmem, *count, 1);
28217 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
28218 emit_move_insn (destmem, srcmem);
28221 emit_label (label);
28222 LABEL_NUSES (label) = 1;
28223 emit_jump_insn (gen_jump (*done_label));
28224 emit_barrier ();
28226 else
28227 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
28228 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
28230 /* Start memcpy for COUNT >= SIZE. */
28231 if (loop_label)
28233 emit_label (loop_label);
28234 LABEL_NUSES (loop_label) = 1;
28237 /* Copy first desired_align bytes. */
28238 if (!issetmem)
28239 srcmem = change_address (srcmem, mode, *srcptr);
28240 destmem = change_address (destmem, mode, *destptr);
28241 modesize = GEN_INT (GET_MODE_SIZE (mode));
28242 for (n = 0; prolog_size < desired_align - align; n++)
28244 if (issetmem)
28245 emit_move_insn (destmem, mode_value);
28246 else
28248 emit_move_insn (destmem, srcmem);
28249 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
28251 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
28252 prolog_size += GET_MODE_SIZE (mode);
28256 /* Copy last SIZE bytes. */
28257 destmem = offset_address (destmem, *count, 1);
28258 destmem = offset_address (destmem,
28259 GEN_INT (-size - prolog_size),
28261 if (issetmem)
28262 emit_move_insn (destmem, mode_value);
28263 else
28265 srcmem = offset_address (srcmem, *count, 1);
28266 srcmem = offset_address (srcmem,
28267 GEN_INT (-size - prolog_size),
28269 emit_move_insn (destmem, srcmem);
28271 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
28273 destmem = offset_address (destmem, modesize, 1);
28274 if (issetmem)
28275 emit_move_insn (destmem, mode_value);
28276 else
28278 srcmem = offset_address (srcmem, modesize, 1);
28279 emit_move_insn (destmem, srcmem);
28283 /* Align destination. */
28284 if (desired_align > 1 && desired_align > align)
28286 rtx saveddest = *destptr;
28288 gcc_assert (desired_align <= size);
28289 /* Align destptr up, place it to new register. */
28290 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
28291 GEN_INT (prolog_size),
28292 NULL_RTX, 1, OPTAB_DIRECT);
28293 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
28294 REG_POINTER (*destptr) = 1;
28295 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
28296 GEN_INT (-desired_align),
28297 *destptr, 1, OPTAB_DIRECT);
28298 /* See how many bytes we skipped. */
28299 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
28300 *destptr,
28301 saveddest, 1, OPTAB_DIRECT);
28302 /* Adjust srcptr and count. */
28303 if (!issetmem)
28304 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
28305 saveddest, *srcptr, 1, OPTAB_DIRECT);
28306 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28307 saveddest, *count, 1, OPTAB_DIRECT);
28308 /* We copied at most size + prolog_size. */
28309 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
28310 *min_size
28311 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
28312 else
28313 *min_size = 0;
28315 /* Our loops always round down the block size, but for dispatch to
28316 library we need precise value. */
28317 if (dynamic_check)
28318 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
28319 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
28321 else
28323 gcc_assert (prolog_size == 0);
28324 /* Decrease count, so we won't end up copying last word twice. */
28325 if (!CONST_INT_P (*count))
28326 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28327 constm1_rtx, *count, 1, OPTAB_DIRECT);
28328 else
28329 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
28330 (unsigned HOST_WIDE_INT)size));
28331 if (*min_size)
28332 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
28337 /* This function is like the previous one, except here we know how many bytes
28338 need to be copied. That allows us to update alignment not only of DST, which
28339 is returned, but also of SRC, which is passed as a pointer for that
28340 reason. */
28341 static rtx
28342 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
28343 rtx srcreg, rtx value, rtx vec_value,
28344 int desired_align, int align_bytes,
28345 bool issetmem)
28347 rtx src = NULL;
28348 rtx orig_dst = dst;
28349 rtx orig_src = NULL;
28350 int piece_size = 1;
28351 int copied_bytes = 0;
28353 if (!issetmem)
28355 gcc_assert (srcp != NULL);
28356 src = *srcp;
28357 orig_src = src;
28360 for (piece_size = 1;
28361 piece_size <= desired_align && copied_bytes < align_bytes;
28362 piece_size <<= 1)
28364 if (align_bytes & piece_size)
28366 if (issetmem)
28368 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
28369 dst = emit_memset (dst, destreg, vec_value, piece_size);
28370 else
28371 dst = emit_memset (dst, destreg, value, piece_size);
28373 else
28374 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
28375 copied_bytes += piece_size;
28378 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
28379 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28380 if (MEM_SIZE_KNOWN_P (orig_dst))
28381 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
28383 if (!issetmem)
28385 int src_align_bytes = get_mem_align_offset (src, desired_align
28386 * BITS_PER_UNIT);
28387 if (src_align_bytes >= 0)
28388 src_align_bytes = desired_align - src_align_bytes;
28389 if (src_align_bytes >= 0)
28391 unsigned int src_align;
28392 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
28394 if ((src_align_bytes & (src_align - 1))
28395 == (align_bytes & (src_align - 1)))
28396 break;
28398 if (src_align > (unsigned int) desired_align)
28399 src_align = desired_align;
28400 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
28401 set_mem_align (src, src_align * BITS_PER_UNIT);
28403 if (MEM_SIZE_KNOWN_P (orig_src))
28404 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
28405 *srcp = src;
28408 return dst;
28411 /* Return true if ALG can be used in current context.
28412 Assume we expand memset if MEMSET is true. */
28413 static bool
28414 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28416 if (alg == no_stringop)
28417 return false;
28418 if (alg == vector_loop)
28419 return TARGET_SSE || TARGET_AVX;
28420 /* Algorithms using the rep prefix want at least edi and ecx;
28421 additionally, memset wants eax and memcpy wants esi. Don't
28422 consider such algorithms if the user has appropriated those
28423 registers for their own purposes, or if we have a non-default
28424 address space, since some string insns cannot override the segment. */
28425 if (alg == rep_prefix_1_byte
28426 || alg == rep_prefix_4_byte
28427 || alg == rep_prefix_8_byte)
28429 if (have_as)
28430 return false;
28431 if (fixed_regs[CX_REG]
28432 || fixed_regs[DI_REG]
28433 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28434 return false;
28436 return true;
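/* For instance (an illustrative scenario, not taken from the code above):
   if the user appropriates %ecx with a global register variable such as
     register unsigned *hot_ptr asm ("ecx");
   then fixed_regs[CX_REG] is set, the rep_prefix_* algorithms are rejected
   here, and decide_alg below has to fall back to the loop variants or to a
   library call.  */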
28439 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
28440 static enum stringop_alg
28441 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28442 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28443 bool memset, bool zero_memset, bool have_as,
28444 int *dynamic_check, bool *noalign, bool recur)
28446 const struct stringop_algs *algs;
28447 bool optimize_for_speed;
28448 int max = 0;
28449 const struct processor_costs *cost;
28450 int i;
28451 bool any_alg_usable_p = false;
28453 *noalign = false;
28454 *dynamic_check = -1;
28456 /* Even if the string operation call is cold, we still might spend a lot
28457 of time processing large blocks. */
28458 if (optimize_function_for_size_p (cfun)
28459 || (optimize_insn_for_size_p ()
28460 && (max_size < 256
28461 || (expected_size != -1 && expected_size < 256))))
28462 optimize_for_speed = false;
28463 else
28464 optimize_for_speed = true;
28466 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28467 if (memset)
28468 algs = &cost->memset[TARGET_64BIT != 0];
28469 else
28470 algs = &cost->memcpy[TARGET_64BIT != 0];
28472 /* See maximal size for user defined algorithm. */
28473 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28475 enum stringop_alg candidate = algs->size[i].alg;
28476 bool usable = alg_usable_p (candidate, memset, have_as);
28477 any_alg_usable_p |= usable;
28479 if (candidate != libcall && candidate && usable)
28480 max = algs->size[i].max;
28483 /* If the expected size is not known but the max size is small enough
28484 that the inline version is a win, set the expected size into
28485 the range. */
28486 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28487 && expected_size == -1)
28488 expected_size = min_size / 2 + max_size / 2;
28490 /* If user specified the algorithm, honor it if possible. */
28491 if (ix86_stringop_alg != no_stringop
28492 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28493 return ix86_stringop_alg;
28494 /* rep; movq or rep; movl is the smallest variant. */
28495 else if (!optimize_for_speed)
28497 *noalign = true;
28498 if (!count || (count & 3) || (memset && !zero_memset))
28499 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28500 ? rep_prefix_1_byte : loop_1_byte;
28501 else
28502 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28503 ? rep_prefix_4_byte : loop;
28505 /* Very tiny blocks are best handled via the loop, since REP is expensive to
28506 set up. */
28507 else if (expected_size != -1 && expected_size < 4)
28508 return loop_1_byte;
28509 else if (expected_size != -1)
28511 enum stringop_alg alg = libcall;
28512 bool alg_noalign = false;
28513 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28515 /* We get here if the algorithms that were not libcall-based
28516 were rep-prefix based and we are unable to use rep prefixes
28517 based on global register usage. Break out of the loop and
28518 use the heuristic below. */
28519 if (algs->size[i].max == 0)
28520 break;
28521 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28523 enum stringop_alg candidate = algs->size[i].alg;
28525 if (candidate != libcall
28526 && alg_usable_p (candidate, memset, have_as))
28528 alg = candidate;
28529 alg_noalign = algs->size[i].noalign;
28531 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28532 last non-libcall inline algorithm. */
28533 if (TARGET_INLINE_ALL_STRINGOPS)
28535 /* When the current size is best copied by a libcall,
28536 but we are still forced to inline, run the heuristic below
28537 that will pick code for medium-sized blocks. */
28538 if (alg != libcall)
28540 *noalign = alg_noalign;
28541 return alg;
28543 else if (!any_alg_usable_p)
28544 break;
28546 else if (alg_usable_p (candidate, memset, have_as))
28548 *noalign = algs->size[i].noalign;
28549 return candidate;
28554 /* When asked to inline the call anyway, try to pick a meaningful choice.
28555 We look for the maximal block size that is faster to copy by hand and
28556 take blocks of at most that size, guessing that the average size will
28557 be roughly half of the block.
28559 If this turns out to be bad, we might simply specify the preferred
28560 choice in ix86_costs. */
28561 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28562 && (algs->unknown_size == libcall
28563 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28565 enum stringop_alg alg;
28566 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28568 /* If there aren't any usable algorithms or if recursing already,
28569 then recursing on smaller sizes or same size isn't going to
28570 find anything. Just return the simple byte-at-a-time copy loop. */
28571 if (!any_alg_usable_p || recur)
28573 /* Pick something reasonable. */
28574 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28575 *dynamic_check = 128;
28576 return loop_1_byte;
28578 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28579 zero_memset, have_as, dynamic_check, noalign, true);
28580 gcc_assert (*dynamic_check == -1);
28581 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28582 *dynamic_check = max;
28583 else
28584 gcc_assert (alg != libcall);
28585 return alg;
28587 return (alg_usable_p (algs->unknown_size, memset, have_as)
28588 ? algs->unknown_size : libcall);
28591 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28592 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
28593 static int
28594 decide_alignment (int align,
28595 enum stringop_alg alg,
28596 int expected_size,
28597 machine_mode move_mode)
28599 int desired_align = 0;
28601 gcc_assert (alg != no_stringop);
28603 if (alg == libcall)
28604 return 0;
28605 if (move_mode == VOIDmode)
28606 return 0;
28608 desired_align = GET_MODE_SIZE (move_mode);
28609 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
28610 copying a whole cache line at once. */
28611 if (TARGET_PENTIUMPRO
28612 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28613 desired_align = 8;
28615 if (optimize_size)
28616 desired_align = 1;
28617 if (desired_align < align)
28618 desired_align = align;
28619 if (expected_size != -1 && expected_size < 4)
28620 desired_align = align;
28622 return desired_align;
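/* Example of the rules above (informal): a vector_loop moving 16-byte SSE
   chunks asks for desired_align == 16 and rep_prefix_8_byte asks for 8,
   while optimizing for size or expecting a block smaller than 4 bytes
   collapses the request back to the incoming ALIGN.  */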
28626 /* Helper function for memset. For a QImode value 0xXY produce
28627 0xXYXYXYXY of the width specified by MODE. This is essentially
28628 the value multiplied by 0x01010101, but we can do slightly better than
28629 synth_mult by unwinding the sequence by hand on CPUs with a
28630 slow multiply. */
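/* Worked example (illustrative): for VAL == 0x41 and DImode the constant
   path below computes 0x41 -> 0x4141 -> 0x41414141 -> 0x4141414141414141.
   The non-constant path builds the same value with the equivalent of
     reg |= reg << 8;  reg |= reg << 16;  reg |= reg << 32;
   (the first step may use an insv pattern instead, and a multiply by the
   promoted 0x01...01 constant is used when the cost tables say that is
   cheaper).  */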
28631 static rtx
28632 promote_duplicated_reg (machine_mode mode, rtx val)
28634 machine_mode valmode = GET_MODE (val);
28635 rtx tmp;
28636 int nops = mode == DImode ? 3 : 2;
28638 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28639 if (val == const0_rtx)
28640 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28641 if (CONST_INT_P (val))
28643 HOST_WIDE_INT v = INTVAL (val) & 255;
28645 v |= v << 8;
28646 v |= v << 16;
28647 if (mode == DImode)
28648 v |= (v << 16) << 16;
28649 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28652 if (valmode == VOIDmode)
28653 valmode = QImode;
28654 if (valmode != QImode)
28655 val = gen_lowpart (QImode, val);
28656 if (mode == QImode)
28657 return val;
28658 if (!TARGET_PARTIAL_REG_STALL)
28659 nops--;
28660 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28661 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28662 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28663 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28665 rtx reg = convert_modes (mode, QImode, val, true);
28666 tmp = promote_duplicated_reg (mode, const1_rtx);
28667 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28668 OPTAB_DIRECT);
28670 else
28672 rtx reg = convert_modes (mode, QImode, val, true);
28674 if (!TARGET_PARTIAL_REG_STALL)
28675 if (mode == SImode)
28676 emit_insn (gen_insvsi_1 (reg, reg));
28677 else
28678 emit_insn (gen_insvdi_1 (reg, reg));
28679 else
28681 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28682 NULL, 1, OPTAB_DIRECT);
28683 reg =
28684 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28686 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28687 NULL, 1, OPTAB_DIRECT);
28688 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28689 if (mode == SImode)
28690 return reg;
28691 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28692 NULL, 1, OPTAB_DIRECT);
28693 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28694 return reg;
28698 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
28699 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
28700 raising alignment from ALIGN to DESIRED_ALIGN. */
28701 static rtx
28702 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28703 int align)
28705 rtx promoted_val;
28707 if (TARGET_64BIT
28708 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28709 promoted_val = promote_duplicated_reg (DImode, val);
28710 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28711 promoted_val = promote_duplicated_reg (SImode, val);
28712 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28713 promoted_val = promote_duplicated_reg (HImode, val);
28714 else
28715 promoted_val = val;
28717 return promoted_val;
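/* Example of the size selection above (informal): a 64-bit memset whose
   main loop stores 8-byte words (SIZE_NEEDED == 8) gets VAL broadcast to
   DImode, a loop storing 2-byte chunks with no extra alignment work needs
   only HImode, and a pure byte loop keeps VAL unchanged.  */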
28720 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
28721 operations when profitable. The code depends upon architecture, block size
28722 and alignment, but always has one of the following overall structures:
28724 Aligned move sequence:
28726 1) Prologue guard: Conditional that jumps up to epilogues for small
28727 blocks that can be handled by epilogue alone. This is faster
28728 but also needed for correctness, since the prologue assumes the block
28729 is larger than the desired alignment.
28731 Optional dynamic check for size and libcall for large
28732 blocks is emitted here too, with -minline-stringops-dynamically.
28734 2) Prologue: copy first few bytes in order to get destination
28735 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28736 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28737 copied. We emit either a jump tree on power of two sized
28738 blocks, or a byte loop.
28740 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28741 with specified algorithm.
28743 4) Epilogue: code copying tail of the block that is too small to be
28744 handled by main body (or up to size guarded by prologue guard).
28746 Misaligned move sequence
28748 1) Misaligned move prologue/epilogue containing:
28749 a) Prologue handling small memory blocks and jumping to done_label
28750 (skipped if blocks are known to be large enough)
28751 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
28752 needed by single possibly misaligned move
28753 (skipped if alignment is not needed)
28754 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
28756 2) Zero size guard dispatching to done_label, if needed
28758 3) Dispatch to library call, if needed,
28760 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28761 with the specified algorithm. */
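/* As a rough illustration only (the code below handles many more cases),
   the aligned sequence for a memcpy that ends up using rep_prefix_8_byte
   amounts to:

     if (count < epilogue_size_needed) goto epilogue;   (1) prologue guard
     copy a few bytes until dst is 8-byte aligned;      (2) prologue
     rep movsq for the bulk of the block;               (3) main body
     copy the remaining count % 8 bytes;                (4) epilogue  */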
28762 bool
28763 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28764 rtx align_exp, rtx expected_align_exp,
28765 rtx expected_size_exp, rtx min_size_exp,
28766 rtx max_size_exp, rtx probable_max_size_exp,
28767 bool issetmem)
28769 rtx destreg;
28770 rtx srcreg = NULL;
28771 rtx_code_label *label = NULL;
28772 rtx tmp;
28773 rtx_code_label *jump_around_label = NULL;
28774 HOST_WIDE_INT align = 1;
28775 unsigned HOST_WIDE_INT count = 0;
28776 HOST_WIDE_INT expected_size = -1;
28777 int size_needed = 0, epilogue_size_needed;
28778 int desired_align = 0, align_bytes = 0;
28779 enum stringop_alg alg;
28780 rtx promoted_val = NULL;
28781 rtx vec_promoted_val = NULL;
28782 bool force_loopy_epilogue = false;
28783 int dynamic_check;
28784 bool need_zero_guard = false;
28785 bool noalign;
28786 machine_mode move_mode = VOIDmode;
28787 machine_mode wider_mode;
28788 int unroll_factor = 1;
28789 /* TODO: Once value ranges are available, fill in proper data. */
28790 unsigned HOST_WIDE_INT min_size = 0;
28791 unsigned HOST_WIDE_INT max_size = -1;
28792 unsigned HOST_WIDE_INT probable_max_size = -1;
28793 bool misaligned_prologue_used = false;
28794 bool have_as;
28796 if (CONST_INT_P (align_exp))
28797 align = INTVAL (align_exp);
28798 /* i386 can do misaligned access at a reasonable extra cost. */
28799 if (CONST_INT_P (expected_align_exp)
28800 && INTVAL (expected_align_exp) > align)
28801 align = INTVAL (expected_align_exp);
28802 /* ALIGN is the minimum of destination and source alignment, but we care here
28803 just about destination alignment. */
28804 else if (!issetmem
28805 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28806 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28808 if (CONST_INT_P (count_exp))
28810 min_size = max_size = probable_max_size = count = expected_size
28811 = INTVAL (count_exp);
28812 /* When COUNT is 0, there is nothing to do. */
28813 if (!count)
28814 return true;
28816 else
28818 if (min_size_exp)
28819 min_size = INTVAL (min_size_exp);
28820 if (max_size_exp)
28821 max_size = INTVAL (max_size_exp);
28822 if (probable_max_size_exp)
28823 probable_max_size = INTVAL (probable_max_size_exp);
28824 if (CONST_INT_P (expected_size_exp))
28825 expected_size = INTVAL (expected_size_exp);
28828 /* Make sure we don't need to care about overflow later on. */
28829 if (count > (HOST_WIDE_INT_1U << 30))
28830 return false;
28832 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28833 if (!issetmem)
28834 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28836 /* Step 0: Decide on preferred algorithm, desired alignment and
28837 size of chunks to be copied by main loop. */
28838 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28839 issetmem,
28840 issetmem && val_exp == const0_rtx, have_as,
28841 &dynamic_check, &noalign, false);
28842 if (alg == libcall)
28843 return false;
28844 gcc_assert (alg != no_stringop);
28846 /* For now the vector version of memset is generated only for memory zeroing, as
28847 creating the promoted vector value is very cheap in this case. */
28848 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28849 alg = unrolled_loop;
28851 if (!count)
28852 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28853 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28854 if (!issetmem)
28855 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28857 unroll_factor = 1;
28858 move_mode = word_mode;
28859 switch (alg)
28861 case libcall:
28862 case no_stringop:
28863 case last_alg:
28864 gcc_unreachable ();
28865 case loop_1_byte:
28866 need_zero_guard = true;
28867 move_mode = QImode;
28868 break;
28869 case loop:
28870 need_zero_guard = true;
28871 break;
28872 case unrolled_loop:
28873 need_zero_guard = true;
28874 unroll_factor = (TARGET_64BIT ? 4 : 2);
28875 break;
28876 case vector_loop:
28877 need_zero_guard = true;
28878 unroll_factor = 4;
28879 /* Find the widest supported mode. */
28880 move_mode = word_mode;
28881 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
28882 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
28883 move_mode = wider_mode;
28885 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28886 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28887 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28889 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28890 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
28891 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28892 move_mode = word_mode;
28894 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28895 break;
28896 case rep_prefix_8_byte:
28897 move_mode = DImode;
28898 break;
28899 case rep_prefix_4_byte:
28900 move_mode = SImode;
28901 break;
28902 case rep_prefix_1_byte:
28903 move_mode = QImode;
28904 break;
28906 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
28907 epilogue_size_needed = size_needed;
28909 /* If we are going to make any library calls conditionally, make sure any
28910 pending stack adjustments happen before the first conditional branch,
28911 otherwise they will be emitted only before the library call and won't
28912 happen on the other branches. */
28913 if (dynamic_check != -1)
28914 do_pending_stack_adjust ();
28916 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28917 if (!TARGET_ALIGN_STRINGOPS || noalign)
28918 align = desired_align;
28920 /* Step 1: Prologue guard. */
28922 /* Alignment code needs count to be in register. */
28923 if (CONST_INT_P (count_exp) && desired_align > align)
28925 if (INTVAL (count_exp) > desired_align
28926 && INTVAL (count_exp) > size_needed)
28928 align_bytes
28929 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28930 if (align_bytes <= 0)
28931 align_bytes = 0;
28932 else
28933 align_bytes = desired_align - align_bytes;
28935 if (align_bytes == 0)
28936 count_exp = force_reg (counter_mode (count_exp), count_exp);
28938 gcc_assert (desired_align >= 1 && align >= 1);
28940 /* Misaligned move sequences handle both prologue and epilogue at once.
28941 Default code generation results in smaller code for large alignments
28942 and also avoids redundant work when sizes are known precisely. */
28943 misaligned_prologue_used
28944 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28945 && MAX (desired_align, epilogue_size_needed) <= 32
28946 && desired_align <= epilogue_size_needed
28947 && ((desired_align > align && !align_bytes)
28948 || (!count && epilogue_size_needed > 1)));
28950 /* Do the cheap promotion to allow better CSE across the
28951 main loop and epilogue (i.e. one load of the big constant in
28952 front of all the code).
28953 For now the misaligned move sequences do not have a fast path
28954 without broadcasting. */
28955 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28957 if (alg == vector_loop)
28959 gcc_assert (val_exp == const0_rtx);
28960 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28961 promoted_val = promote_duplicated_reg_to_size (val_exp,
28962 GET_MODE_SIZE (word_mode),
28963 desired_align, align);
28965 else
28967 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28968 desired_align, align);
28971 /* Misaligned move sequences handle both prologues and epilogues at once.
28972 Default code generation results in smaller code for large alignments and
28973 also avoids redundant work when sizes are known precisely. */
28974 if (misaligned_prologue_used)
28976 /* The misaligned move prologue has already handled small blocks by itself. */
28977 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28978 (dst, src, &destreg, &srcreg,
28979 move_mode, promoted_val, vec_promoted_val,
28980 &count_exp,
28981 &jump_around_label,
28982 desired_align < align
28983 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28984 desired_align, align, &min_size, dynamic_check, issetmem);
28985 if (!issetmem)
28986 src = change_address (src, BLKmode, srcreg);
28987 dst = change_address (dst, BLKmode, destreg);
28988 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28989 epilogue_size_needed = 0;
28990 if (need_zero_guard
28991 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28993 /* It is possible that we copied enough so the main loop will not
28994 execute. */
28995 gcc_assert (size_needed > 1);
28996 if (jump_around_label == NULL_RTX)
28997 jump_around_label = gen_label_rtx ();
28998 emit_cmp_and_jump_insns (count_exp,
28999 GEN_INT (size_needed),
29000 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
29001 if (expected_size == -1
29002 || expected_size < (desired_align - align) / 2 + size_needed)
29003 predict_jump (REG_BR_PROB_BASE * 20 / 100);
29004 else
29005 predict_jump (REG_BR_PROB_BASE * 60 / 100);
29008 /* Ensure that alignment prologue won't copy past end of block. */
29009 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
29011 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
29012 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
29013 Make sure it is power of 2. */
29014 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
29016 /* To improve performance for small blocks, we jump around the VAL
29017 promoting code. This means that if the promoted VAL is not a constant,
29018 we might not use it in the epilogue and have to use the byte
29019 loop variant. */
29020 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
29021 force_loopy_epilogue = true;
29022 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
29023 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
29025 /* If main algorithm works on QImode, no epilogue is needed.
29026 For small sizes just don't align anything. */
29027 if (size_needed == 1)
29028 desired_align = align;
29029 else
29030 goto epilogue;
29032 else if (!count
29033 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
29035 label = gen_label_rtx ();
29036 emit_cmp_and_jump_insns (count_exp,
29037 GEN_INT (epilogue_size_needed),
29038 LTU, 0, counter_mode (count_exp), 1, label);
29039 if (expected_size == -1 || expected_size < epilogue_size_needed)
29040 predict_jump (REG_BR_PROB_BASE * 60 / 100);
29041 else
29042 predict_jump (REG_BR_PROB_BASE * 20 / 100);
29046 /* Emit code to decide at runtime whether a library call or inline code should be
29047 used. */
29048 if (dynamic_check != -1)
29050 if (!issetmem && CONST_INT_P (count_exp))
29052 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
29054 emit_block_copy_via_libcall (dst, src, count_exp);
29055 count_exp = const0_rtx;
29056 goto epilogue;
29059 else
29061 rtx_code_label *hot_label = gen_label_rtx ();
29062 if (jump_around_label == NULL_RTX)
29063 jump_around_label = gen_label_rtx ();
29064 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
29065 LEU, 0, counter_mode (count_exp),
29066 1, hot_label);
29067 predict_jump (REG_BR_PROB_BASE * 90 / 100);
29068 if (issetmem)
29069 set_storage_via_libcall (dst, count_exp, val_exp);
29070 else
29071 emit_block_copy_via_libcall (dst, src, count_exp);
29072 emit_jump (jump_around_label);
29073 emit_label (hot_label);
29077 /* Step 2: Alignment prologue. */
29078 /* Do the expensive promotion once we branched off the small blocks. */
29079 if (issetmem && !promoted_val)
29080 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
29081 desired_align, align);
29083 if (desired_align > align && !misaligned_prologue_used)
29085 if (align_bytes == 0)
29087 /* Except for the first move in the prologue, we no longer know
29088 the constant offset in the aliasing info. It doesn't seem worth
29089 the pain to maintain it for the first move, so throw away
29090 the info early. */
29091 dst = change_address (dst, BLKmode, destreg);
29092 if (!issetmem)
29093 src = change_address (src, BLKmode, srcreg);
29094 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
29095 promoted_val, vec_promoted_val,
29096 count_exp, align, desired_align,
29097 issetmem);
29098 /* At most desired_align - align bytes are copied. */
29099 if (min_size < (unsigned)(desired_align - align))
29100 min_size = 0;
29101 else
29102 min_size -= desired_align - align;
29104 else
29106 /* If we know how many bytes need to be stored before dst is
29107 sufficiently aligned, maintain aliasing info accurately. */
29108 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
29109 srcreg,
29110 promoted_val,
29111 vec_promoted_val,
29112 desired_align,
29113 align_bytes,
29114 issetmem);
29116 count_exp = plus_constant (counter_mode (count_exp),
29117 count_exp, -align_bytes);
29118 count -= align_bytes;
29119 min_size -= align_bytes;
29120 max_size -= align_bytes;
29122 if (need_zero_guard
29123 && min_size < (unsigned HOST_WIDE_INT) size_needed
29124 && (count < (unsigned HOST_WIDE_INT) size_needed
29125 || (align_bytes == 0
29126 && count < ((unsigned HOST_WIDE_INT) size_needed
29127 + desired_align - align))))
29129 /* It is possible that we copied enough so the main loop will not
29130 execute. */
29131 gcc_assert (size_needed > 1);
29132 if (label == NULL_RTX)
29133 label = gen_label_rtx ();
29134 emit_cmp_and_jump_insns (count_exp,
29135 GEN_INT (size_needed),
29136 LTU, 0, counter_mode (count_exp), 1, label);
29137 if (expected_size == -1
29138 || expected_size < (desired_align - align) / 2 + size_needed)
29139 predict_jump (REG_BR_PROB_BASE * 20 / 100);
29140 else
29141 predict_jump (REG_BR_PROB_BASE * 60 / 100);
29144 if (label && size_needed == 1)
29146 emit_label (label);
29147 LABEL_NUSES (label) = 1;
29148 label = NULL;
29149 epilogue_size_needed = 1;
29150 if (issetmem)
29151 promoted_val = val_exp;
29153 else if (label == NULL_RTX && !misaligned_prologue_used)
29154 epilogue_size_needed = size_needed;
29156 /* Step 3: Main loop. */
29158 switch (alg)
29160 case libcall:
29161 case no_stringop:
29162 case last_alg:
29163 gcc_unreachable ();
29164 case loop_1_byte:
29165 case loop:
29166 case unrolled_loop:
29167 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
29168 count_exp, move_mode, unroll_factor,
29169 expected_size, issetmem);
29170 break;
29171 case vector_loop:
29172 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
29173 vec_promoted_val, count_exp, move_mode,
29174 unroll_factor, expected_size, issetmem);
29175 break;
29176 case rep_prefix_8_byte:
29177 case rep_prefix_4_byte:
29178 case rep_prefix_1_byte:
29179 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
29180 val_exp, count_exp, move_mode, issetmem);
29181 break;
29183 /* Properly adjust the offset of src and dest memory for aliasing. */
29184 if (CONST_INT_P (count_exp))
29186 if (!issetmem)
29187 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
29188 (count / size_needed) * size_needed);
29189 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
29190 (count / size_needed) * size_needed);
29192 else
29194 if (!issetmem)
29195 src = change_address (src, BLKmode, srcreg);
29196 dst = change_address (dst, BLKmode, destreg);
29199 /* Step 4: Epilogue to copy the remaining bytes. */
29200 epilogue:
29201 if (label)
29203 /* When the main loop is done, COUNT_EXP might hold original count,
29204 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
29205 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
29206 bytes. Compensate if needed. */
29208 if (size_needed < epilogue_size_needed)
29210 tmp =
29211 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
29212 GEN_INT (size_needed - 1), count_exp, 1,
29213 OPTAB_DIRECT);
29214 if (tmp != count_exp)
29215 emit_move_insn (count_exp, tmp);
29217 emit_label (label);
29218 LABEL_NUSES (label) = 1;
29221 if (count_exp != const0_rtx && epilogue_size_needed > 1)
29223 if (force_loopy_epilogue)
29224 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
29225 epilogue_size_needed);
29226 else
29228 if (issetmem)
29229 expand_setmem_epilogue (dst, destreg, promoted_val,
29230 vec_promoted_val, count_exp,
29231 epilogue_size_needed);
29232 else
29233 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
29234 epilogue_size_needed);
29237 if (jump_around_label)
29238 emit_label (jump_around_label);
29239 return true;
29243 /* Expand the appropriate insns for doing strlen if not just doing
29244 repnz; scasb
29246 out = result, initialized with the start address
29247 align_rtx = alignment of the address.
29248 scratch = scratch register, initialized with the start address when
29249 not aligned, otherwise undefined
29251 This is just the body. It needs the initializations mentioned above and
29252 some address computing at the end. These things are done in i386.md. */
29254 static void
29255 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
29257 int align;
29258 rtx tmp;
29259 rtx_code_label *align_2_label = NULL;
29260 rtx_code_label *align_3_label = NULL;
29261 rtx_code_label *align_4_label = gen_label_rtx ();
29262 rtx_code_label *end_0_label = gen_label_rtx ();
29263 rtx mem;
29264 rtx tmpreg = gen_reg_rtx (SImode);
29265 rtx scratch = gen_reg_rtx (SImode);
29266 rtx cmp;
29268 align = 0;
29269 if (CONST_INT_P (align_rtx))
29270 align = INTVAL (align_rtx);
29272 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
29274 /* Is there a known alignment and is it less than 4? */
29275 if (align < 4)
29277 rtx scratch1 = gen_reg_rtx (Pmode);
29278 emit_move_insn (scratch1, out);
29279 /* Is there a known alignment and is it not 2? */
29280 if (align != 2)
29282 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
29283 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
29285 /* Leave just the 3 lower bits. */
29286 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
29287 NULL_RTX, 0, OPTAB_WIDEN);
29289 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29290 Pmode, 1, align_4_label);
29291 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
29292 Pmode, 1, align_2_label);
29293 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
29294 Pmode, 1, align_3_label);
29296 else
29298 /* Since the alignment is 2, we have to check 2 or 0 bytes;
29299 check if it is aligned to a 4-byte boundary. */
29301 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
29302 NULL_RTX, 0, OPTAB_WIDEN);
29304 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29305 Pmode, 1, align_4_label);
29308 mem = change_address (src, QImode, out);
29310 /* Now compare the bytes. */
29312 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
29313 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
29314 QImode, 1, end_0_label);
29316 /* Increment the address. */
29317 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29319 /* Not needed with an alignment of 2 */
29320 if (align != 2)
29322 emit_label (align_2_label);
29324 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29325 end_0_label);
29327 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29329 emit_label (align_3_label);
29332 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29333 end_0_label);
29335 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29338 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
29339 align this loop: it only makes the program bigger and does not help to
29340 speed it up. */
29341 emit_label (align_4_label);
29343 mem = change_address (src, SImode, out);
29344 emit_move_insn (scratch, mem);
29345 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
29347 /* This formula yields a nonzero result iff one of the bytes is zero.
29348 This saves three branches inside the loop and many cycles. */
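/* The sequence below computes (x - 0x01010101) & ~x & 0x80808080.
   Worked example: x = 0x11223300 gives x - 0x01010101 = 0x102131ff,
   ~x = 0xeeddccff, their AND = 0x000100ff, and after masking with
   0x80808080 the result is 0x00000080 != 0, flagging the zero byte;
   x = 0x11223344 instead gives 0x00010003 & 0x80808080 == 0.  */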
29350 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
29351 emit_insn (gen_one_cmplsi2 (scratch, scratch));
29352 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
29353 emit_insn (gen_andsi3 (tmpreg, tmpreg,
29354 gen_int_mode (0x80808080, SImode)));
29355 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
29356 align_4_label);
29358 if (TARGET_CMOVE)
29360 rtx reg = gen_reg_rtx (SImode);
29361 rtx reg2 = gen_reg_rtx (Pmode);
29362 emit_move_insn (reg, tmpreg);
29363 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
29365 /* If zero is not in the first two bytes, move two bytes forward. */
29366 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29367 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29368 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29369 emit_insn (gen_rtx_SET (tmpreg,
29370 gen_rtx_IF_THEN_ELSE (SImode, tmp,
29371 reg,
29372 tmpreg)));
29373 /* Emit lea manually to avoid clobbering of flags. */
29374 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
29376 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29377 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29378 emit_insn (gen_rtx_SET (out,
29379 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
29380 reg2,
29381 out)));
29383 else
29385 rtx_code_label *end_2_label = gen_label_rtx ();
29386 /* Is zero in the first two bytes? */
29388 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29389 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29390 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
29391 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
29392 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
29393 pc_rtx);
29394 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
29395 JUMP_LABEL (tmp) = end_2_label;
29397 /* Not in the first two. Move two bytes forward. */
29398 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
29399 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
29401 emit_label (end_2_label);
29405 /* Avoid branch in fixing the byte. */
29406 tmpreg = gen_lowpart (QImode, tmpreg);
29407 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
29408 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
29409 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
29410 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29412 emit_label (end_0_label);
29415 /* Expand strlen. */
29417 bool
29418 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29420 rtx addr, scratch1, scratch2, scratch3, scratch4;
29422 /* The generic case of the strlen expander is long. Avoid
29423 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
29425 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29426 && !TARGET_INLINE_ALL_STRINGOPS
29427 && !optimize_insn_for_size_p ()
29428 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29429 return false;
29431 addr = force_reg (Pmode, XEXP (src, 0));
29432 scratch1 = gen_reg_rtx (Pmode);
29434 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29435 && !optimize_insn_for_size_p ())
29437 /* Well it seems that some optimizer does not combine a call like
29438 foo(strlen(bar), strlen(bar));
29439 when the move and the subtraction are done here. It does calculate
29440 the length just once when these instructions are done inside of
29441 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
29442 often used and I use one fewer register for the lifetime of
29443 output_strlen_unroll() this is better. */
29445 emit_move_insn (out, addr);
29447 ix86_expand_strlensi_unroll_1 (out, src, align);
29449 /* strlensi_unroll_1 returns the address of the zero at the end of
29450 the string, like memchr(), so compute the length by subtracting
29451 the start address. */
29452 emit_insn (ix86_gen_sub3 (out, out, addr));
29454 else
29456 rtx unspec;
29458 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29459 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29460 return false;
29461 /* Can't use this for non-default address spaces. */
29462 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29463 return false;
29465 scratch2 = gen_reg_rtx (Pmode);
29466 scratch3 = gen_reg_rtx (Pmode);
29467 scratch4 = force_reg (Pmode, constm1_rtx);
29469 emit_move_insn (scratch3, addr);
29470 eoschar = force_reg (QImode, eoschar);
29472 src = replace_equiv_address_nv (src, scratch3);
29474 /* If .md starts supporting :P, this can be done in .md. */
29475 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29476 scratch4), UNSPEC_SCAS);
29477 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29478 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29479 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29481 return true;
29484 /* For a given symbol (function), construct code to compute the address of its
29485 PLT entry in the large x86-64 PIC model. */
29486 static rtx
29487 construct_plt_address (rtx symbol)
29489 rtx tmp, unspec;
29491 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29492 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29493 gcc_assert (Pmode == DImode);
29495 tmp = gen_reg_rtx (Pmode);
29496 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29498 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29499 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29500 return tmp;
29504 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29505 rtx callarg2,
29506 rtx pop, bool sibcall)
29508 rtx vec[3];
29509 rtx use = NULL, call;
29510 unsigned int vec_len = 0;
29511 tree fndecl;
29513 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29515 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29516 if (fndecl
29517 && (lookup_attribute ("interrupt",
29518 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29519 error ("interrupt service routine can't be called directly");
29521 else
29522 fndecl = NULL_TREE;
29524 if (pop == const0_rtx)
29525 pop = NULL;
29526 gcc_assert (!TARGET_64BIT || !pop);
29528 if (TARGET_MACHO && !TARGET_64BIT)
29530 #if TARGET_MACHO
29531 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29532 fnaddr = machopic_indirect_call_target (fnaddr);
29533 #endif
29535 else
29537 /* Static functions and indirect calls don't need the pic register. Also,
29538 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
29539 it an indirect call. */
29540 rtx addr = XEXP (fnaddr, 0);
29541 if (flag_pic
29542 && GET_CODE (addr) == SYMBOL_REF
29543 && !SYMBOL_REF_LOCAL_P (addr))
29545 if (flag_plt
29546 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29547 || !lookup_attribute ("noplt",
29548 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29550 if (!TARGET_64BIT
29551 || (ix86_cmodel == CM_LARGE_PIC
29552 && DEFAULT_ABI != MS_ABI))
29554 use_reg (&use, gen_rtx_REG (Pmode,
29555 REAL_PIC_OFFSET_TABLE_REGNUM));
29556 if (ix86_use_pseudo_pic_reg ())
29557 emit_move_insn (gen_rtx_REG (Pmode,
29558 REAL_PIC_OFFSET_TABLE_REGNUM),
29559 pic_offset_table_rtx);
29562 else if (!TARGET_PECOFF && !TARGET_MACHO)
29564 if (TARGET_64BIT)
29566 fnaddr = gen_rtx_UNSPEC (Pmode,
29567 gen_rtvec (1, addr),
29568 UNSPEC_GOTPCREL);
29569 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29571 else
29573 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29574 UNSPEC_GOT);
29575 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29576 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29577 fnaddr);
29579 fnaddr = gen_const_mem (Pmode, fnaddr);
29580 /* Pmode may not be the same as word_mode for x32, which
29581 doesn't support indirect branch via 32-bit memory slot.
29582 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29583 indirect branch via x32 GOT slot is OK. */
29584 if (GET_MODE (fnaddr) != word_mode)
29585 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29586 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29591 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29592 parameters passed in vector registers. */
29593 if (TARGET_64BIT
29594 && (INTVAL (callarg2) > 0
29595 || (INTVAL (callarg2) == 0
29596 && (TARGET_SSE || !flag_skip_rax_setup))))
29598 rtx al = gen_rtx_REG (QImode, AX_REG);
29599 emit_move_insn (al, callarg2);
29600 use_reg (&use, al);
29603 if (ix86_cmodel == CM_LARGE_PIC
29604 && !TARGET_PECOFF
29605 && MEM_P (fnaddr)
29606 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29607 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29608 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29609 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29610 branch via x32 GOT slot is OK. */
29611 else if (!(TARGET_X32
29612 && MEM_P (fnaddr)
29613 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29614 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29615 && (sibcall
29616 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29617 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29619 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29620 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29623 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29625 if (retval)
29627 /* We should add the bounds registers as destinations in case
29628 a pointer with bounds may be returned. */
29629 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29631 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29632 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29633 if (GET_CODE (retval) == PARALLEL)
29635 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29636 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29637 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29638 retval = chkp_join_splitted_slot (retval, par);
29640 else
29642 retval = gen_rtx_PARALLEL (VOIDmode,
29643 gen_rtvec (3, retval, b0, b1));
29644 chkp_put_regs_to_expr_list (retval);
29648 call = gen_rtx_SET (retval, call);
29650 vec[vec_len++] = call;
29652 if (pop)
29654 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29655 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29656 vec[vec_len++] = pop;
29659 if (cfun->machine->no_caller_saved_registers
29660 && (!fndecl
29661 || (!TREE_THIS_VOLATILE (fndecl)
29662 && !lookup_attribute ("no_caller_saved_registers",
29663 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29665 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29666 bool is_64bit_ms_abi = (TARGET_64BIT
29667 && ix86_function_abi (fndecl) == MS_ABI);
29668 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29670 /* If there are no caller-saved registers, add all registers
29671 that are clobbered by the call which returns. */
29672 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29673 if (!fixed_regs[i]
29674 && (ix86_call_used_regs[i] == 1
29675 || (ix86_call_used_regs[i] & c_mask))
29676 && !STACK_REGNO_P (i)
29677 && !MMX_REGNO_P (i))
29678 clobber_reg (&use,
29679 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29681 else if (TARGET_64BIT_MS_ABI
29682 && (!callarg2 || INTVAL (callarg2) != -2))
29684 unsigned i;
29686 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29688 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29689 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29691 clobber_reg (&use, gen_rtx_REG (mode, regno));
29694 /* Set here, but it may get cleared later. */
29695 if (TARGET_CALL_MS2SYSV_XLOGUES)
29697 if (!TARGET_SSE)
29700 /* Don't break hot-patched functions. */
29701 else if (ix86_function_ms_hook_prologue (current_function_decl))
29704 /* TODO: Cases not yet examined. */
29705 else if (flag_split_stack)
29706 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29708 else
29710 gcc_assert (!reload_completed);
29711 cfun->machine->call_ms2sysv = true;
29716 if (vec_len > 1)
29717 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29718 call = emit_call_insn (call);
29719 if (use)
29720 CALL_INSN_FUNCTION_USAGE (call) = use;
29722 return call;
29725 /* Return true if the function being called was marked with attribute
29726 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
29727 to handle the non-PIC case in the backend because there is no easy
29728 interface for the front-end to force non-PLT calls to use the GOT.
29729 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29730 to call the function marked "noplt" indirectly. */
29732 static bool
29733 ix86_nopic_noplt_attribute_p (rtx call_op)
29735 if (flag_pic || ix86_cmodel == CM_LARGE
29736 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29737 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29738 || SYMBOL_REF_LOCAL_P (call_op))
29739 return false;
29741 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29743 if (!flag_plt
29744 || (symbol_decl != NULL_TREE
29745 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29746 return true;
29748 return false;
29751 /* Output the assembly for a call instruction. */
29753 const char *
29754 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29756 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29757 bool seh_nop_p = false;
29758 const char *xasm;
29760 if (SIBLING_CALL_P (insn))
29762 if (direct_p)
29764 if (ix86_nopic_noplt_attribute_p (call_op))
29766 if (TARGET_64BIT)
29767 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29768 else
29769 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29771 else
29772 xasm = "%!jmp\t%P0";
29774 /* SEH epilogue detection requires the indirect branch case
29775 to include REX.W. */
29776 else if (TARGET_SEH)
29777 xasm = "%!rex.W jmp\t%A0";
29778 else
29779 xasm = "%!jmp\t%A0";
29781 output_asm_insn (xasm, &call_op);
29782 return "";
29785 /* SEH unwinding can require an extra nop to be emitted in several
29786 circumstances. Determine if we have one of those. */
29787 if (TARGET_SEH)
29789 rtx_insn *i;
29791 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29793 /* If we get to another real insn, we don't need the nop. */
29794 if (INSN_P (i))
29795 break;
29797 /* If we get to the epilogue note, prevent a catch region from
29798 being adjacent to the standard epilogue sequence. If non-
29799 call-exceptions, we'll have done this during epilogue emission. */
29800 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29801 && !flag_non_call_exceptions
29802 && !can_throw_internal (insn))
29804 seh_nop_p = true;
29805 break;
29809 /* If we didn't find a real insn following the call, prevent the
29810 unwinder from looking into the next function. */
29811 if (i == NULL)
29812 seh_nop_p = true;
29815 if (direct_p)
29817 if (ix86_nopic_noplt_attribute_p (call_op))
29819 if (TARGET_64BIT)
29820 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29821 else
29822 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29824 else
29825 xasm = "%!call\t%P0";
29827 else
29828 xasm = "%!call\t%A0";
29830 output_asm_insn (xasm, &call_op);
29832 if (seh_nop_p)
29833 return "nop";
29835 return "";
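/* Example of the output chosen above (AT&T syntax, illustrative): a 64-bit
   call to an external function foo compiled with -fno-plt -fno-pic takes
   the GOTPCREL template and prints
     call *foo@GOTPCREL(%rip)
   while the corresponding sibcall prints jmp *foo@GOTPCREL(%rip).  */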
29838 /* Clear stack slot assignments remembered from previous functions.
29839 This is called from INIT_EXPANDERS once before RTL is emitted for each
29840 function. */
29842 static struct machine_function *
29843 ix86_init_machine_status (void)
29845 struct machine_function *f;
29847 f = ggc_cleared_alloc<machine_function> ();
29848 f->call_abi = ix86_abi;
29850 return f;
29853 /* Return a MEM corresponding to a stack slot with mode MODE.
29854 Allocate a new slot if necessary.
29856 The RTL for a function can have several slots available: N is
29857 which slot to use. */
29860 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29862 struct stack_local_entry *s;
29864 gcc_assert (n < MAX_386_STACK_LOCALS);
29866 for (s = ix86_stack_locals; s; s = s->next)
29867 if (s->mode == mode && s->n == n)
29868 return validize_mem (copy_rtx (s->rtl));
29870 s = ggc_alloc<stack_local_entry> ();
29871 s->n = n;
29872 s->mode = mode;
29873 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29875 s->next = ix86_stack_locals;
29876 ix86_stack_locals = s;
29877 return validize_mem (copy_rtx (s->rtl));
29880 static void
29881 ix86_instantiate_decls (void)
29883 struct stack_local_entry *s;
29885 for (s = ix86_stack_locals; s; s = s->next)
29886 if (s->rtl != NULL_RTX)
29887 instantiate_decl_rtl (s->rtl);
29890 /* Return the number used for encoding REG, in the range 0..7. */
29892 static int
29893 reg_encoded_number (rtx reg)
29895 unsigned regno = REGNO (reg);
29896 switch (regno)
29898 case AX_REG:
29899 return 0;
29900 case CX_REG:
29901 return 1;
29902 case DX_REG:
29903 return 2;
29904 case BX_REG:
29905 return 3;
29906 case SP_REG:
29907 return 4;
29908 case BP_REG:
29909 return 5;
29910 case SI_REG:
29911 return 6;
29912 case DI_REG:
29913 return 7;
29914 default:
29915 break;
29917 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29918 return regno - FIRST_STACK_REG;
29919 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29920 return regno - FIRST_SSE_REG;
29921 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29922 return regno - FIRST_MMX_REG;
29923 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29924 return regno - FIRST_REX_SSE_REG;
29925 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29926 return regno - FIRST_REX_INT_REG;
29927 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29928 return regno - FIRST_MASK_REG;
29929 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29930 return regno - FIRST_BND_REG;
29931 return -1;
29934 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29935 in its encoding if it could be relevant for ROP mitigation, otherwise
29936 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29937 used for calculating it into them. */
29939 static int
29940 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29941 int *popno0 = 0, int *popno1 = 0)
29943 if (asm_noperands (PATTERN (insn)) >= 0)
29944 return -1;
29945 int has_modrm = get_attr_modrm (insn);
29946 if (!has_modrm)
29947 return -1;
29948 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29949 rtx op0, op1;
29950 switch (cls)
29952 case MODRM_CLASS_OP02:
29953 gcc_assert (noperands >= 3);
29954 if (popno0)
29956 *popno0 = 0;
29957 *popno1 = 2;
29959 op0 = operands[0];
29960 op1 = operands[2];
29961 break;
29962 case MODRM_CLASS_OP01:
29963 gcc_assert (noperands >= 2);
29964 if (popno0)
29966 *popno0 = 0;
29967 *popno1 = 1;
29969 op0 = operands[0];
29970 op1 = operands[1];
29971 break;
29972 default:
29973 return -1;
29975 if (REG_P (op0) && REG_P (op1))
29977 int enc0 = reg_encoded_number (op0);
29978 int enc1 = reg_encoded_number (op1);
29979 return 0xc0 + (enc1 << 3) + enc0;
29981 return -1;
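/* Illustration of the register-register case above (assuming addl is
   classified as MODRM_CLASS_OP02): for addl %edx, %ecx the pattern's
   operands are (%ecx, %ecx, %edx), so op0 = %ecx and op1 = operands[2]
   = %edx, giving enc0 == 1, enc1 == 2, and the function returns
   0xc0 + (2 << 3) + 1 == 0xd1.  */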
29984 /* Check whether x86 address PARTS is a pc-relative address. */
29986 static bool
29987 rip_relative_addr_p (struct ix86_address *parts)
29989 rtx base, index, disp;
29991 base = parts->base;
29992 index = parts->index;
29993 disp = parts->disp;
29995 if (disp && !base && !index)
29997 if (TARGET_64BIT)
29999 rtx symbol = disp;
30001 if (GET_CODE (disp) == CONST)
30002 symbol = XEXP (disp, 0);
30003 if (GET_CODE (symbol) == PLUS
30004 && CONST_INT_P (XEXP (symbol, 1)))
30005 symbol = XEXP (symbol, 0);
30007 if (GET_CODE (symbol) == LABEL_REF
30008 || (GET_CODE (symbol) == SYMBOL_REF
30009 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
30010 || (GET_CODE (symbol) == UNSPEC
30011 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
30012 || XINT (symbol, 1) == UNSPEC_PCREL
30013 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
30014 return true;
30017 return false;
30020 /* Calculate the length of the memory address in the instruction encoding.
30021 Includes the addr32 prefix, but does not include the one-byte modrm, opcode,
30022 or other prefixes. We never generate an addr32 prefix for the LEA insn. */
30025 memory_address_length (rtx addr, bool lea)
30027 struct ix86_address parts;
30028 rtx base, index, disp;
30029 int len;
30030 int ok;
30032 if (GET_CODE (addr) == PRE_DEC
30033 || GET_CODE (addr) == POST_INC
30034 || GET_CODE (addr) == PRE_MODIFY
30035 || GET_CODE (addr) == POST_MODIFY)
30036 return 0;
30038 ok = ix86_decompose_address (addr, &parts);
30039 gcc_assert (ok);
30041 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
30043 /* If this is not LEA instruction, add the length of addr32 prefix. */
30044 if (TARGET_64BIT && !lea
30045 && (SImode_address_operand (addr, VOIDmode)
30046 || (parts.base && GET_MODE (parts.base) == SImode)
30047 || (parts.index && GET_MODE (parts.index) == SImode)))
30048 len++;
30050 base = parts.base;
30051 index = parts.index;
30052 disp = parts.disp;
30054 if (base && SUBREG_P (base))
30055 base = SUBREG_REG (base);
30056 if (index && SUBREG_P (index))
30057 index = SUBREG_REG (index);
30059 gcc_assert (base == NULL_RTX || REG_P (base));
30060 gcc_assert (index == NULL_RTX || REG_P (index));
30062 /* Rule of thumb:
30063 - esp as the base always wants an index,
30064 - ebp as the base always wants a displacement,
30065 - r12 as the base always wants an index,
30066 - r13 as the base always wants a displacement. */
30068 /* Register Indirect. */
30069 if (base && !index && !disp)
30071 /* esp (for its index) and ebp (for its displacement) need
30072 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
30073 code. */
30074 if (base == arg_pointer_rtx
30075 || base == frame_pointer_rtx
30076 || REGNO (base) == SP_REG
30077 || REGNO (base) == BP_REG
30078 || REGNO (base) == R12_REG
30079 || REGNO (base) == R13_REG)
30080 len++;
30083 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
30084 is not disp32, but disp32(%rip), so for disp32
30085 SIB byte is needed, unless print_operand_address
30086 optimizes it into disp32(%rip) or (%rip) is implied
30087 by UNSPEC. */
30088 else if (disp && !base && !index)
30090 len += 4;
30091 if (!rip_relative_addr_p (&parts))
30092 len++;
30094 else
30096 /* Find the length of the displacement constant. */
30097 if (disp)
30099 if (base && satisfies_constraint_K (disp))
30100 len += 1;
30101 else
30102 len += 4;
30104 /* ebp always wants a displacement. Similarly r13. */
30105 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
30106 len++;
30108 /* An index requires the two-byte modrm form.... */
30109 if (index
30110 /* ...like esp (or r12), which always wants an index. */
30111 || base == arg_pointer_rtx
30112 || base == frame_pointer_rtx
30113 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
30114 len++;
30117 return len;
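/* A few informal examples of the lengths computed above (AT&T syntax,
   generic address space, no addr32 prefix):
     (%rax)        -> 0  plain register indirect,
     (%rsp)        -> 1  SP as the base forces a SIB byte,
     8(%rbp)       -> 1  one-byte displacement,
     foo(%rip)     -> 4  four-byte displacement with implied RIP,
     4(%rax,%rbx)  -> 2  one-byte displacement plus a SIB byte.  */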
30120 /* Compute the default value for the "length_immediate" attribute. When SHORTFORM
30121 is set, expect that the insn has an 8-bit immediate alternative. */
30123 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
30125 int len = 0;
30126 int i;
30127 extract_insn_cached (insn);
30128 for (i = recog_data.n_operands - 1; i >= 0; --i)
30129 if (CONSTANT_P (recog_data.operand[i]))
30131 enum attr_mode mode = get_attr_mode (insn);
30133 gcc_assert (!len);
30134 if (shortform && CONST_INT_P (recog_data.operand[i]))
30136 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
30137 switch (mode)
30139 case MODE_QI:
30140 len = 1;
30141 continue;
30142 case MODE_HI:
30143 ival = trunc_int_for_mode (ival, HImode);
30144 break;
30145 case MODE_SI:
30146 ival = trunc_int_for_mode (ival, SImode);
30147 break;
30148 default:
30149 break;
30151 if (IN_RANGE (ival, -128, 127))
30153 len = 1;
30154 continue;
30157 switch (mode)
30159 case MODE_QI:
30160 len = 1;
30161 break;
30162 case MODE_HI:
30163 len = 2;
30164 break;
30165 case MODE_SI:
30166 len = 4;
30167 break;
30168 /* Immediates for DImode instructions are encoded
30169 as 32bit sign extended values. */
30170 case MODE_DI:
30171 len = 4;
30172 break;
30173 default:
30174 fatal_insn ("unknown insn mode", insn);
30177 return len;
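/* Informal examples of the above: with SHORTFORM set, addl $100, %eax has
   an imm8 alternative, so the immediate contributes 1 byte; addl $1000, %eax
   needs the full imm32 and contributes 4; a DImode add of a constant also
   contributes at most 4, since such immediates are 32-bit sign-extended.  */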
30180 /* Compute default value for "length_address" attribute. */
30182 ix86_attr_length_address_default (rtx_insn *insn)
30184 int i;
30186 if (get_attr_type (insn) == TYPE_LEA)
30188 rtx set = PATTERN (insn), addr;
30190 if (GET_CODE (set) == PARALLEL)
30191 set = XVECEXP (set, 0, 0);
30193 gcc_assert (GET_CODE (set) == SET);
30195 addr = SET_SRC (set);
30197 return memory_address_length (addr, true);
30200 extract_insn_cached (insn);
30201 for (i = recog_data.n_operands - 1; i >= 0; --i)
30203 rtx op = recog_data.operand[i];
30204 if (MEM_P (op))
30206 constrain_operands_cached (insn, reload_completed);
30207 if (which_alternative != -1)
30209 const char *constraints = recog_data.constraints[i];
30210 int alt = which_alternative;
30212 while (*constraints == '=' || *constraints == '+')
30213 constraints++;
30214 while (alt-- > 0)
30215 while (*constraints++ != ',')
30217 /* Skip ignored operands. */
30218 if (*constraints == 'X')
30219 continue;
30222 int len = memory_address_length (XEXP (op, 0), false);
30224 /* Account for segment prefix for non-default addr spaces. */
30225 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
30226 len++;
30228 return len;
30231 return 0;
30234 /* Compute default value for "length_vex" attribute. It includes
30235 2 or 3 byte VEX prefix and 1 opcode byte. */
30238 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
30239 bool has_vex_w)
30241 int i;
30243 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX W
30244 bit requires the 3-byte VEX prefix. */
30245 if (!has_0f_opcode || has_vex_w)
30246 return 3 + 1;
30248 /* We can always use the 2-byte VEX prefix in 32-bit code. */
30249 if (!TARGET_64BIT)
30250 return 2 + 1;
30252 extract_insn_cached (insn);
30254 for (i = recog_data.n_operands - 1; i >= 0; --i)
30255 if (REG_P (recog_data.operand[i]))
30257 /* REX.W bit uses 3 byte VEX prefix. */
30258 if (GET_MODE (recog_data.operand[i]) == DImode
30259 && GENERAL_REG_P (recog_data.operand[i]))
30260 return 3 + 1;
30262 else
30264 /* REX.X or REX.B bits use 3 byte VEX prefix. */
30265 if (MEM_P (recog_data.operand[i])
30266 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
30267 return 3 + 1;
30270 return 2 + 1;
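/* For illustration, a hedged sketch of typical results; the mnemonics are
   examples chosen here, not taken from the insn patterns in this file:

	vaddps %xmm1, %xmm2, %xmm3	0f map, no W, no extended regs -> 2 + 1
	vpaddd (%r8), %xmm1, %xmm2	extended reg in address (REX.B) -> 3 + 1
	vcvtsi2sd %rax, %xmm1, %xmm2	DImode GPR operand (VEX.W)	-> 3 + 1  */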
30273 /* Return the maximum number of instructions a cpu can issue. */
30275 static int
30276 ix86_issue_rate (void)
30278 switch (ix86_tune)
30280 case PROCESSOR_PENTIUM:
30281 case PROCESSOR_LAKEMONT:
30282 case PROCESSOR_BONNELL:
30283 case PROCESSOR_SILVERMONT:
30284 case PROCESSOR_KNL:
30285 case PROCESSOR_INTEL:
30286 case PROCESSOR_K6:
30287 case PROCESSOR_BTVER2:
30288 case PROCESSOR_PENTIUM4:
30289 case PROCESSOR_NOCONA:
30290 return 2;
30292 case PROCESSOR_PENTIUMPRO:
30293 case PROCESSOR_ATHLON:
30294 case PROCESSOR_K8:
30295 case PROCESSOR_AMDFAM10:
30296 case PROCESSOR_GENERIC:
30297 case PROCESSOR_BTVER1:
30298 return 3;
30300 case PROCESSOR_BDVER1:
30301 case PROCESSOR_BDVER2:
30302 case PROCESSOR_BDVER3:
30303 case PROCESSOR_BDVER4:
30304 case PROCESSOR_ZNVER1:
30305 case PROCESSOR_CORE2:
30306 case PROCESSOR_NEHALEM:
30307 case PROCESSOR_SANDYBRIDGE:
30308 case PROCESSOR_HASWELL:
30309 return 4;
30311 default:
30312 return 1;
30316 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
30317 by DEP_INSN and nothing else set by DEP_INSN. */
30319 static bool
30320 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
30322 rtx set, set2;
30324 /* Simplify the test for uninteresting insns. */
30325 if (insn_type != TYPE_SETCC
30326 && insn_type != TYPE_ICMOV
30327 && insn_type != TYPE_FCMOV
30328 && insn_type != TYPE_IBR)
30329 return false;
30331 if ((set = single_set (dep_insn)) != 0)
30333 set = SET_DEST (set);
30334 set2 = NULL_RTX;
30336 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
30337 && XVECLEN (PATTERN (dep_insn), 0) == 2
30338 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
30339 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
30341 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
30342 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
30344 else
30345 return false;
30347 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
30348 return false;
30350 /* This test is true if the dependent insn reads the flags but
30351 not any other potentially set register. */
30352 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
30353 return false;
30355 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
30356 return false;
30358 return true;
30361 /* Return true iff USE_INSN has a memory address with operands set by
30362 SET_INSN. */
30364 bool
30365 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
30367 int i;
30368 extract_insn_cached (use_insn);
30369 for (i = recog_data.n_operands - 1; i >= 0; --i)
30370 if (MEM_P (recog_data.operand[i]))
30372 rtx addr = XEXP (recog_data.operand[i], 0);
30373 if (modified_in_p (addr, set_insn) != 0)
30375 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
30376 has SP based memory (unless index reg is modified in a pop). */
30377 rtx set = single_set (set_insn);
30378 if (set
30379 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
30380 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
30382 struct ix86_address parts;
30383 if (ix86_decompose_address (addr, &parts)
30384 && parts.base == stack_pointer_rtx
30385 && (parts.index == NULL_RTX
30386 || MEM_P (SET_DEST (set))
30387 || !modified_in_p (parts.index, set_insn)))
30388 return false;
30390 return true;
30392 return false;
30394 return false;
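/* For illustration, a hedged sketch (the operands are invented, not taken
   from real insn streams):

	SET_INSN:  movl $buf, %ebx
	USE_INSN:  movl (%ebx), %eax	address written by SET_INSN -> true

	SET_INSN:  pushl %edx		only %esp is modified
	USE_INSN:  movl 8(%esp), %eax	%esp-based address	    -> false  */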
30397 /* Helper function for exact_store_load_dependency.
30398 Return true if addr is found in insn. */
30399 static bool
30400 exact_dependency_1 (rtx addr, rtx insn)
30402 enum rtx_code code;
30403 const char *format_ptr;
30404 int i, j;
30406 code = GET_CODE (insn);
30407 switch (code)
30409 case MEM:
30410 if (rtx_equal_p (addr, insn))
30411 return true;
30412 break;
30413 case REG:
30414 CASE_CONST_ANY:
30415 case SYMBOL_REF:
30416 case CODE_LABEL:
30417 case PC:
30418 case CC0:
30419 case EXPR_LIST:
30420 return false;
30421 default:
30422 break;
30425 format_ptr = GET_RTX_FORMAT (code);
30426 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30428 switch (*format_ptr++)
30430 case 'e':
30431 if (exact_dependency_1 (addr, XEXP (insn, i)))
30432 return true;
30433 break;
30434 case 'E':
30435 for (j = 0; j < XVECLEN (insn, i); j++)
30436 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30437 return true;
30438 break;
30441 return false;
30444 /* Return true if there exists exact dependency for store & load, i.e.
30445 the same memory address is used in them. */
30446 static bool
30447 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30449 rtx set1, set2;
30451 set1 = single_set (store);
30452 if (!set1)
30453 return false;
30454 if (!MEM_P (SET_DEST (set1)))
30455 return false;
30456 set2 = single_set (load);
30457 if (!set2)
30458 return false;
30459 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30460 return true;
30461 return false;
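/* For illustration, a hedged sketch with invented operands:

	STORE:  movl %eax, 12(%esp)
	LOAD:   movl 12(%esp), %ecx	same address RTX -> exact dependency

   A load from a different offset does not match here, even though the
   scheduler may still record a conservative memory dependence.  */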
30464 static int
30465 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30466 unsigned int)
30468 enum attr_type insn_type, dep_insn_type;
30469 enum attr_memory memory;
30470 rtx set, set2;
30471 int dep_insn_code_number;
30473 /* Anti and output dependencies have zero cost on all CPUs. */
30474 if (dep_type != 0)
30475 return 0;
30477 dep_insn_code_number = recog_memoized (dep_insn);
30479 /* If we can't recognize the insns, we can't really do anything. */
30480 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30481 return cost;
30483 insn_type = get_attr_type (insn);
30484 dep_insn_type = get_attr_type (dep_insn);
30486 switch (ix86_tune)
30488 case PROCESSOR_PENTIUM:
30489 case PROCESSOR_LAKEMONT:
30490 /* Address Generation Interlock adds a cycle of latency. */
30491 if (insn_type == TYPE_LEA)
30493 rtx addr = PATTERN (insn);
30495 if (GET_CODE (addr) == PARALLEL)
30496 addr = XVECEXP (addr, 0, 0);
30498 gcc_assert (GET_CODE (addr) == SET);
30500 addr = SET_SRC (addr);
30501 if (modified_in_p (addr, dep_insn))
30502 cost += 1;
30504 else if (ix86_agi_dependent (dep_insn, insn))
30505 cost += 1;
30507 /* ??? Compares pair with jump/setcc. */
30508 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30509 cost = 0;
30511 /* Floating point stores require value to be ready one cycle earlier. */
30512 if (insn_type == TYPE_FMOV
30513 && get_attr_memory (insn) == MEMORY_STORE
30514 && !ix86_agi_dependent (dep_insn, insn))
30515 cost += 1;
30516 break;
30518 case PROCESSOR_PENTIUMPRO:
30519 /* INT->FP conversion is expensive. */
30520 if (get_attr_fp_int_src (dep_insn))
30521 cost += 5;
30523 /* There is one cycle extra latency between an FP op and a store. */
30524 if (insn_type == TYPE_FMOV
30525 && (set = single_set (dep_insn)) != NULL_RTX
30526 && (set2 = single_set (insn)) != NULL_RTX
30527 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30528 && MEM_P (SET_DEST (set2)))
30529 cost += 1;
30531 memory = get_attr_memory (insn);
30533 /* Model the ability of the reorder buffer to hide the latency of a load
30534 by executing it in parallel with the previous instruction, when the
30535 previous instruction is not needed to compute the address. */
30536 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30537 && !ix86_agi_dependent (dep_insn, insn))
30539 /* Claim moves take one cycle, as the core can issue one load
30540 at a time and the next load can start a cycle later. */
30541 if (dep_insn_type == TYPE_IMOV
30542 || dep_insn_type == TYPE_FMOV)
30543 cost = 1;
30544 else if (cost > 1)
30545 cost--;
30547 break;
30549 case PROCESSOR_K6:
30550 /* The esp dependency is resolved before
30551 the instruction is really finished. */
30552 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30553 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30554 return 1;
30556 /* INT->FP conversion is expensive. */
30557 if (get_attr_fp_int_src (dep_insn))
30558 cost += 5;
30560 memory = get_attr_memory (insn);
30562 /* Model the ability of the reorder buffer to hide the latency of a load
30563 by executing it in parallel with the previous instruction, when the
30564 previous instruction is not needed to compute the address. */
30565 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30566 && !ix86_agi_dependent (dep_insn, insn))
30568 /* Claim moves take one cycle, as the core can issue one load
30569 at a time and the next load can start a cycle later. */
30570 if (dep_insn_type == TYPE_IMOV
30571 || dep_insn_type == TYPE_FMOV)
30572 cost = 1;
30573 else if (cost > 2)
30574 cost -= 2;
30575 else
30576 cost = 1;
30578 break;
30580 case PROCESSOR_AMDFAM10:
30581 case PROCESSOR_BDVER1:
30582 case PROCESSOR_BDVER2:
30583 case PROCESSOR_BDVER3:
30584 case PROCESSOR_BDVER4:
30585 case PROCESSOR_ZNVER1:
30586 case PROCESSOR_BTVER1:
30587 case PROCESSOR_BTVER2:
30588 case PROCESSOR_GENERIC:
30589 /* The stack engine allows push&pop instructions to execute in parallel. */
30590 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30591 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30592 return 0;
30593 /* FALLTHRU */
30595 case PROCESSOR_ATHLON:
30596 case PROCESSOR_K8:
30597 memory = get_attr_memory (insn);
30599 /* Model the ability of the reorder buffer to hide the latency of a load
30600 by executing it in parallel with the previous instruction, when the
30601 previous instruction is not needed to compute the address. */
30602 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30603 && !ix86_agi_dependent (dep_insn, insn))
30605 enum attr_unit unit = get_attr_unit (insn);
30606 int loadcost = 3;
30608 /* Because of the difference between the length of integer and
30609 floating unit pipeline preparation stages, the memory operands
30610 for floating point are cheaper.
30612 ??? For Athlon the difference is most probably 2. */
30613 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30614 loadcost = 3;
30615 else
30616 loadcost = TARGET_ATHLON ? 2 : 0;
30618 if (cost >= loadcost)
30619 cost -= loadcost;
30620 else
30621 cost = 0;
30623 break;
30625 case PROCESSOR_CORE2:
30626 case PROCESSOR_NEHALEM:
30627 case PROCESSOR_SANDYBRIDGE:
30628 case PROCESSOR_HASWELL:
30629 /* The stack engine allows push&pop instructions to execute in parallel. */
30630 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30631 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30632 return 0;
30634 memory = get_attr_memory (insn);
30636 /* Model the ability of the reorder buffer to hide the latency of a load
30637 by executing it in parallel with the previous instruction, when the
30638 previous instruction is not needed to compute the address. */
30639 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30640 && !ix86_agi_dependent (dep_insn, insn))
30642 if (cost >= 4)
30643 cost -= 4;
30644 else
30645 cost = 0;
30647 break;
30649 case PROCESSOR_SILVERMONT:
30650 case PROCESSOR_KNL:
30651 case PROCESSOR_INTEL:
30652 if (!reload_completed)
30653 return cost;
30655 /* Increase cost of integer loads. */
30656 memory = get_attr_memory (dep_insn);
30657 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30659 enum attr_unit unit = get_attr_unit (dep_insn);
30660 if (unit == UNIT_INTEGER && cost == 1)
30662 if (memory == MEMORY_LOAD)
30663 cost = 3;
30664 else
30666 /* Increase cost of ld/st for short int types only
30667 because of store forwarding issue. */
30668 rtx set = single_set (dep_insn);
30669 if (set && (GET_MODE (SET_DEST (set)) == QImode
30670 || GET_MODE (SET_DEST (set)) == HImode))
30672 /* Increase cost of store/load insn if exact
30673 dependence exists and it is load insn. */
30674 enum attr_memory insn_memory = get_attr_memory (insn);
30675 if (insn_memory == MEMORY_LOAD
30676 && exact_store_load_dependency (dep_insn, insn))
30677 cost = 3;
30683 default:
30684 break;
30687 return cost;
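/* A worked example, as a hedged sketch: on PROCESSOR_HASWELL, if INSN is a
   load whose address does not depend on DEP_INSN (no AGI dependence), a
   dependence cost of 5 is reduced to 1, modelling the reorder buffer
   hiding most of the load latency.  */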
30690 /* How many alternative schedules to try. This should be as wide as the
30691 scheduling freedom in the DFA, but no wider. Making this value too
30692 large results in extra work for the scheduler. */
30694 static int
30695 ia32_multipass_dfa_lookahead (void)
30697 switch (ix86_tune)
30699 case PROCESSOR_PENTIUM:
30700 case PROCESSOR_LAKEMONT:
30701 return 2;
30703 case PROCESSOR_PENTIUMPRO:
30704 case PROCESSOR_K6:
30705 return 1;
30707 case PROCESSOR_BDVER1:
30708 case PROCESSOR_BDVER2:
30709 case PROCESSOR_BDVER3:
30710 case PROCESSOR_BDVER4:
30711 /* We use lookahead value 4 for BD both before and after reload
30712 schedules. Plan is to have value 8 included for O3. */
30713 return 4;
30715 case PROCESSOR_CORE2:
30716 case PROCESSOR_NEHALEM:
30717 case PROCESSOR_SANDYBRIDGE:
30718 case PROCESSOR_HASWELL:
30719 case PROCESSOR_BONNELL:
30720 case PROCESSOR_SILVERMONT:
30721 case PROCESSOR_KNL:
30722 case PROCESSOR_INTEL:
30723 /* Generally, we want haifa-sched:max_issue() to look ahead as far
30724 as the number of instructions that can be executed in one cycle,
30725 i.e., issue_rate. I wonder why tuning for many CPUs does not do this. */
30726 if (reload_completed)
30727 return ix86_issue_rate ();
30728 /* Don't use lookahead for pre-reload schedule to save compile time. */
30729 return 0;
30731 default:
30732 return 0;
30736 /* Return true if target platform supports macro-fusion. */
30738 static bool
30739 ix86_macro_fusion_p ()
30741 return TARGET_FUSE_CMP_AND_BRANCH;
30744 /* Check whether the current microarchitecture supports macro fusion
30745 for insn pair "CONDGEN + CONDJMP". Refer to
30746 "Intel Architectures Optimization Reference Manual". */
30748 static bool
30749 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30751 rtx src, dest;
30752 enum rtx_code ccode;
30753 rtx compare_set = NULL_RTX, test_if, cond;
30754 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30756 if (!any_condjump_p (condjmp))
30757 return false;
30759 unsigned int condreg1, condreg2;
30760 rtx cc_reg_1;
30761 ix86_fixed_condition_code_regs (&condreg1, &condreg2);
30762 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
30763 if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
30764 || !condgen
30765 || !modified_in_p (cc_reg_1, condgen))
30766 return false;
30768 if (get_attr_type (condgen) != TYPE_TEST
30769 && get_attr_type (condgen) != TYPE_ICMP
30770 && get_attr_type (condgen) != TYPE_INCDEC
30771 && get_attr_type (condgen) != TYPE_ALU)
30772 return false;
30774 compare_set = single_set (condgen);
30775 if (compare_set == NULL_RTX
30776 && !TARGET_FUSE_ALU_AND_BRANCH)
30777 return false;
30779 if (compare_set == NULL_RTX)
30781 int i;
30782 rtx pat = PATTERN (condgen);
30783 for (i = 0; i < XVECLEN (pat, 0); i++)
30784 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30786 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30787 if (GET_CODE (set_src) == COMPARE)
30788 compare_set = XVECEXP (pat, 0, i);
30789 else
30790 alu_set = XVECEXP (pat, 0, i);
30793 if (compare_set == NULL_RTX)
30794 return false;
30795 src = SET_SRC (compare_set);
30796 if (GET_CODE (src) != COMPARE)
30797 return false;
30799 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30800 supported. */
30801 if ((MEM_P (XEXP (src, 0))
30802 && CONST_INT_P (XEXP (src, 1)))
30803 || (MEM_P (XEXP (src, 1))
30804 && CONST_INT_P (XEXP (src, 0))))
30805 return false;
30807 /* No fusion for RIP-relative address. */
30808 if (MEM_P (XEXP (src, 0)))
30809 addr = XEXP (XEXP (src, 0), 0);
30810 else if (MEM_P (XEXP (src, 1)))
30811 addr = XEXP (XEXP (src, 1), 0);
30813 if (addr) {
30814 ix86_address parts;
30815 int ok = ix86_decompose_address (addr, &parts);
30816 gcc_assert (ok);
30818 if (rip_relative_addr_p (&parts))
30819 return false;
30822 test_if = SET_SRC (pc_set (condjmp));
30823 cond = XEXP (test_if, 0);
30824 ccode = GET_CODE (cond);
30825 /* Check whether the conditional jump uses the Sign or Overflow flags. */
30826 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30827 && (ccode == GE
30828 || ccode == GT
30829 || ccode == LE
30830 || ccode == LT))
30831 return false;
30833 /* Return true for TYPE_TEST and TYPE_ICMP. */
30834 if (get_attr_type (condgen) == TYPE_TEST
30835 || get_attr_type (condgen) == TYPE_ICMP)
30836 return true;
30838 /* The following handles the case of macro-fusion for alu + jmp. */
30839 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30840 return false;
30842 /* No fusion for alu op with memory destination operand. */
30843 dest = SET_DEST (alu_set);
30844 if (MEM_P (dest))
30845 return false;
30847 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30848 supported. */
30849 if (get_attr_type (condgen) == TYPE_INCDEC
30850 && (ccode == GEU
30851 || ccode == GTU
30852 || ccode == LEU
30853 || ccode == LTU))
30854 return false;
30856 return true;
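/* For illustration, a hedged sketch with invented operands; fusibility
   also depends on the active TARGET_FUSE_* tuning flags:

	cmpl %eax, %ebx ; jne .L1	TYPE_ICMP + jcc		-> fusible
	cmpl $1, (%ecx) ; jne .L1	MEM-IMM compare		-> not fusible
	decl %eax       ; jb  .L1	inc/dec + unsigned jcc	-> not fusible  */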
30859 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
30860 execution. It is applied if
30861 (1) an IMUL instruction is on the top of the list;
30862 (2) there is exactly one producer of an independent IMUL instruction in
30863 the ready list.
30864 Return the index of the IMUL producer if it was found and -1 otherwise. */
30865 static int
30866 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30868 rtx_insn *insn;
30869 rtx set, insn1, insn2;
30870 sd_iterator_def sd_it;
30871 dep_t dep;
30872 int index = -1;
30873 int i;
30875 if (!TARGET_BONNELL)
30876 return index;
30878 /* Check that IMUL instruction is on the top of ready list. */
30879 insn = ready[n_ready - 1];
30880 set = single_set (insn);
30881 if (!set)
30882 return index;
30883 if (!(GET_CODE (SET_SRC (set)) == MULT
30884 && GET_MODE (SET_SRC (set)) == SImode))
30885 return index;
30887 /* Search for producer of independent IMUL instruction. */
30888 for (i = n_ready - 2; i >= 0; i--)
30890 insn = ready[i];
30891 if (!NONDEBUG_INSN_P (insn))
30892 continue;
30893 /* Skip IMUL instruction. */
30894 insn2 = PATTERN (insn);
30895 if (GET_CODE (insn2) == PARALLEL)
30896 insn2 = XVECEXP (insn2, 0, 0);
30897 if (GET_CODE (insn2) == SET
30898 && GET_CODE (SET_SRC (insn2)) == MULT
30899 && GET_MODE (SET_SRC (insn2)) == SImode)
30900 continue;
30902 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30904 rtx con;
30905 con = DEP_CON (dep);
30906 if (!NONDEBUG_INSN_P (con))
30907 continue;
30908 insn1 = PATTERN (con);
30909 if (GET_CODE (insn1) == PARALLEL)
30910 insn1 = XVECEXP (insn1, 0, 0);
30912 if (GET_CODE (insn1) == SET
30913 && GET_CODE (SET_SRC (insn1)) == MULT
30914 && GET_MODE (SET_SRC (insn1)) == SImode)
30916 sd_iterator_def sd_it1;
30917 dep_t dep1;
30918 /* Check that this is the only producer of the IMUL. */
30919 index = i;
30920 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30922 rtx pro;
30923 pro = DEP_PRO (dep1);
30924 if (!NONDEBUG_INSN_P (pro))
30925 continue;
30926 if (pro != insn)
30927 index = -1;
30929 if (index >= 0)
30930 break;
30933 if (index >= 0)
30934 break;
30936 return index;
30939 /* Try to find the best candidate on the top of ready list if two insns
30940 have the same priority - candidate is best if its dependees were
30941 scheduled earlier. Applied for Silvermont only.
30942 Return true if top 2 insns must be interchanged. */
30943 static bool
30944 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30946 rtx_insn *top = ready[n_ready - 1];
30947 rtx_insn *next = ready[n_ready - 2];
30948 rtx set;
30949 sd_iterator_def sd_it;
30950 dep_t dep;
30951 int clock1 = -1;
30952 int clock2 = -1;
30953 #define INSN_TICK(INSN) (HID (INSN)->tick)
30955 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30956 return false;
30958 if (!NONDEBUG_INSN_P (top))
30959 return false;
30960 if (!NONJUMP_INSN_P (top))
30961 return false;
30962 if (!NONDEBUG_INSN_P (next))
30963 return false;
30964 if (!NONJUMP_INSN_P (next))
30965 return false;
30966 set = single_set (top);
30967 if (!set)
30968 return false;
30969 set = single_set (next);
30970 if (!set)
30971 return false;
30973 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30975 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30976 return false;
30977 /* Determine the winner more precisely. */
30978 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30980 rtx pro;
30981 pro = DEP_PRO (dep);
30982 if (!NONDEBUG_INSN_P (pro))
30983 continue;
30984 if (INSN_TICK (pro) > clock1)
30985 clock1 = INSN_TICK (pro);
30987 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30989 rtx pro;
30990 pro = DEP_PRO (dep);
30991 if (!NONDEBUG_INSN_P (pro))
30992 continue;
30993 if (INSN_TICK (pro) > clock2)
30994 clock2 = INSN_TICK (pro);
30997 if (clock1 == clock2)
30999 /* Determine winner - load must win. */
31000 enum attr_memory memory1, memory2;
31001 memory1 = get_attr_memory (top);
31002 memory2 = get_attr_memory (next);
31003 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
31004 return true;
31006 return (bool) (clock2 < clock1);
31008 return false;
31009 #undef INSN_TICK
31012 /* Perform possible reordering of the ready list for Atom/Silvermont only.
31013 Return issue rate. */
31014 static int
31015 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
31016 int *pn_ready, int clock_var)
31018 int issue_rate = -1;
31019 int n_ready = *pn_ready;
31020 int i;
31021 rtx_insn *insn;
31022 int index = -1;
31024 /* Set up issue rate. */
31025 issue_rate = ix86_issue_rate ();
31027 /* Do reordering for BONNELL/SILVERMONT only. */
31028 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
31029 return issue_rate;
31031 /* Nothing to do if ready list contains only 1 instruction. */
31032 if (n_ready <= 1)
31033 return issue_rate;
31035 /* Do reordering for the post-reload scheduler only. */
31036 if (!reload_completed)
31037 return issue_rate;
31039 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
31041 if (sched_verbose > 1)
31042 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
31043 INSN_UID (ready[index]));
31045 /* Put IMUL producer (ready[index]) at the top of ready list. */
31046 insn = ready[index];
31047 for (i = index; i < n_ready - 1; i++)
31048 ready[i] = ready[i + 1];
31049 ready[n_ready - 1] = insn;
31050 return issue_rate;
31053 /* Skip selective scheduling since HID is not populated in it. */
31054 if (clock_var != 0
31055 && !sel_sched_p ()
31056 && swap_top_of_ready_list (ready, n_ready))
31058 if (sched_verbose > 1)
31059 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
31060 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
31061 /* Swap 2 top elements of ready list. */
31062 insn = ready[n_ready - 1];
31063 ready[n_ready - 1] = ready[n_ready - 2];
31064 ready[n_ready - 2] = insn;
31066 return issue_rate;
31069 static bool
31070 ix86_class_likely_spilled_p (reg_class_t);
31072 /* Return true if the lhs of insn is a HW function argument register and set
31073 is_spilled to true if it is a likely-spilled HW register. */
31074 static bool
31075 insn_is_function_arg (rtx insn, bool* is_spilled)
31077 rtx dst;
31079 if (!NONDEBUG_INSN_P (insn))
31080 return false;
31081 /* Call instructions are not movable; ignore them. */
31082 if (CALL_P (insn))
31083 return false;
31084 insn = PATTERN (insn);
31085 if (GET_CODE (insn) == PARALLEL)
31086 insn = XVECEXP (insn, 0, 0);
31087 if (GET_CODE (insn) != SET)
31088 return false;
31089 dst = SET_DEST (insn);
31090 if (REG_P (dst) && HARD_REGISTER_P (dst)
31091 && ix86_function_arg_regno_p (REGNO (dst)))
31093 /* Is it likely spilled HW register? */
31094 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
31095 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
31096 *is_spilled = true;
31097 return true;
31099 return false;
31102 /* Add output dependencies for a chain of adjacent function arguments, but
31103 only if there is a move to a likely-spilled HW register. Return the first
31104 argument if at least one dependence was added or NULL otherwise. */
31105 static rtx_insn *
31106 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
31108 rtx_insn *insn;
31109 rtx_insn *last = call;
31110 rtx_insn *first_arg = NULL;
31111 bool is_spilled = false;
31113 head = PREV_INSN (head);
31115 /* Find the argument-passing instruction nearest to the call. */
31116 while (true)
31118 last = PREV_INSN (last);
31119 if (last == head)
31120 return NULL;
31121 if (!NONDEBUG_INSN_P (last))
31122 continue;
31123 if (insn_is_function_arg (last, &is_spilled))
31124 break;
31125 return NULL;
31128 first_arg = last;
31129 while (true)
31131 insn = PREV_INSN (last);
31132 if (!INSN_P (insn))
31133 break;
31134 if (insn == head)
31135 break;
31136 if (!NONDEBUG_INSN_P (insn))
31138 last = insn;
31139 continue;
31141 if (insn_is_function_arg (insn, &is_spilled))
31143 /* Add an output dependence between two function arguments if the chain
31144 of output arguments contains likely-spilled HW registers. */
31145 if (is_spilled)
31146 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
31147 first_arg = last = insn;
31149 else
31150 break;
31152 if (!is_spilled)
31153 return NULL;
31154 return first_arg;
31157 /* Add output or anti dependency from insn to first_arg to restrict its code
31158 motion. */
31159 static void
31160 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
31162 rtx set;
31163 rtx tmp;
31165 /* Add anti dependencies for bounds stores. */
31166 if (INSN_P (insn)
31167 && GET_CODE (PATTERN (insn)) == PARALLEL
31168 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
31169 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
31171 add_dependence (first_arg, insn, REG_DEP_ANTI);
31172 return;
31175 set = single_set (insn);
31176 if (!set)
31177 return;
31178 tmp = SET_DEST (set);
31179 if (REG_P (tmp))
31181 /* Add output dependency to the first function argument. */
31182 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
31183 return;
31185 /* Add anti dependency. */
31186 add_dependence (first_arg, insn, REG_DEP_ANTI);
31189 /* Avoid cross-block motion of a function argument by adding a dependency
31190 from the first non-jump instruction in bb. */
31191 static void
31192 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
31194 rtx_insn *insn = BB_END (bb);
31196 while (insn)
31198 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
31200 rtx set = single_set (insn);
31201 if (set)
31203 avoid_func_arg_motion (arg, insn);
31204 return;
31207 if (insn == BB_HEAD (bb))
31208 return;
31209 insn = PREV_INSN (insn);
31213 /* Hook for pre-reload schedule - avoid motion of function arguments
31214 passed in likely spilled HW registers. */
31215 static void
31216 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
31218 rtx_insn *insn;
31219 rtx_insn *first_arg = NULL;
31220 if (reload_completed)
31221 return;
31222 while (head != tail && DEBUG_INSN_P (head))
31223 head = NEXT_INSN (head);
31224 for (insn = tail; insn != head; insn = PREV_INSN (insn))
31225 if (INSN_P (insn) && CALL_P (insn))
31227 first_arg = add_parameter_dependencies (insn, head);
31228 if (first_arg)
31230 /* Add a dependee for the first argument to predecessors, but only if the
31231 region contains more than one block. */
31232 basic_block bb = BLOCK_FOR_INSN (insn);
31233 int rgn = CONTAINING_RGN (bb->index);
31234 int nr_blks = RGN_NR_BLOCKS (rgn);
31235 /* Skip trivial regions and region head blocks that can have
31236 predecessors outside of region. */
31237 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
31239 edge e;
31240 edge_iterator ei;
31242 /* Regions are SCCs with the exception of selective
31243 scheduling with pipelining of outer blocks enabled.
31244 So also check that immediate predecessors of a non-head
31245 block are in the same region. */
31246 FOR_EACH_EDGE (e, ei, bb->preds)
31248 /* Avoid creating loop-carried dependencies by using the
31249 topological ordering of the region. */
31250 if (rgn == CONTAINING_RGN (e->src->index)
31251 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
31252 add_dependee_for_func_arg (first_arg, e->src);
31255 insn = first_arg;
31256 if (insn == head)
31257 break;
31260 else if (first_arg)
31261 avoid_func_arg_motion (first_arg, insn);
31264 /* Hook for pre-reload schedule - set priority of moves from likely spilled
31265 HW registers to maximum, to schedule them as soon as possible. These are
31266 moves from function argument registers at the top of the function entry
31267 and moves from function return value registers after call. */
31268 static int
31269 ix86_adjust_priority (rtx_insn *insn, int priority)
31271 rtx set;
31273 if (reload_completed)
31274 return priority;
31276 if (!NONDEBUG_INSN_P (insn))
31277 return priority;
31279 set = single_set (insn);
31280 if (set)
31282 rtx tmp = SET_SRC (set);
31283 if (REG_P (tmp)
31284 && HARD_REGISTER_P (tmp)
31285 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
31286 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
31287 return current_sched_info->sched_max_insns_priority;
31290 return priority;
31293 /* Model decoder of Core 2/i7.
31294 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
31295 track the instruction fetch block boundaries and make sure that long
31296 (9+ bytes) instructions are assigned to D0. */
31298 /* Maximum length of an insn that can be handled by
31299 a secondary decoder unit. '8' for Core 2/i7. */
31300 static int core2i7_secondary_decoder_max_insn_size;
31302 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
31303 '16' for Core 2/i7. */
31304 static int core2i7_ifetch_block_size;
31306 /* Maximum number of instructions decoder can handle per cycle.
31307 '6' for Core 2/i7. */
31308 static int core2i7_ifetch_block_max_insns;
31310 typedef struct ix86_first_cycle_multipass_data_ *
31311 ix86_first_cycle_multipass_data_t;
31312 typedef const struct ix86_first_cycle_multipass_data_ *
31313 const_ix86_first_cycle_multipass_data_t;
31315 /* A variable to store target state across calls to max_issue within
31316 one cycle. */
31317 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
31318 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
31320 /* Initialize DATA. */
31321 static void
31322 core2i7_first_cycle_multipass_init (void *_data)
31324 ix86_first_cycle_multipass_data_t data
31325 = (ix86_first_cycle_multipass_data_t) _data;
31327 data->ifetch_block_len = 0;
31328 data->ifetch_block_n_insns = 0;
31329 data->ready_try_change = NULL;
31330 data->ready_try_change_size = 0;
31333 /* Advancing the cycle; reset ifetch block counts. */
31334 static void
31335 core2i7_dfa_post_advance_cycle (void)
31337 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
31339 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31341 data->ifetch_block_len = 0;
31342 data->ifetch_block_n_insns = 0;
31345 static int min_insn_size (rtx_insn *);
31347 /* Filter out insns from ready_try that the core will not be able to issue
31348 on current cycle due to decoder. */
31349 static void
31350 core2i7_first_cycle_multipass_filter_ready_try
31351 (const_ix86_first_cycle_multipass_data_t data,
31352 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
31354 while (n_ready--)
31356 rtx_insn *insn;
31357 int insn_size;
31359 if (ready_try[n_ready])
31360 continue;
31362 insn = get_ready_element (n_ready);
31363 insn_size = min_insn_size (insn);
31365 if (/* If this is too long an insn for a secondary decoder ... */
31366 (!first_cycle_insn_p
31367 && insn_size > core2i7_secondary_decoder_max_insn_size)
31368 /* ... or it would not fit into the ifetch block ... */
31369 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
31370 /* ... or the decoder is full already ... */
31371 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
31372 /* ... mask the insn out. */
31374 ready_try[n_ready] = 1;
31376 if (data->ready_try_change)
31377 bitmap_set_bit (data->ready_try_change, n_ready);
31382 /* Prepare for a new round of multipass lookahead scheduling. */
31383 static void
31384 core2i7_first_cycle_multipass_begin (void *_data,
31385 signed char *ready_try, int n_ready,
31386 bool first_cycle_insn_p)
31388 ix86_first_cycle_multipass_data_t data
31389 = (ix86_first_cycle_multipass_data_t) _data;
31390 const_ix86_first_cycle_multipass_data_t prev_data
31391 = ix86_first_cycle_multipass_data;
31393 /* Restore the state from the end of the previous round. */
31394 data->ifetch_block_len = prev_data->ifetch_block_len;
31395 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
31397 /* Filter instructions that cannot be issued on current cycle due to
31398 decoder restrictions. */
31399 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31400 first_cycle_insn_p);
31403 /* INSN is being issued in current solution. Account for its impact on
31404 the decoder model. */
31405 static void
31406 core2i7_first_cycle_multipass_issue (void *_data,
31407 signed char *ready_try, int n_ready,
31408 rtx_insn *insn, const void *_prev_data)
31410 ix86_first_cycle_multipass_data_t data
31411 = (ix86_first_cycle_multipass_data_t) _data;
31412 const_ix86_first_cycle_multipass_data_t prev_data
31413 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
31415 int insn_size = min_insn_size (insn);
31417 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
31418 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
31419 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31420 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31422 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31423 if (!data->ready_try_change)
31425 data->ready_try_change = sbitmap_alloc (n_ready);
31426 data->ready_try_change_size = n_ready;
31428 else if (data->ready_try_change_size < n_ready)
31430 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31431 n_ready, 0);
31432 data->ready_try_change_size = n_ready;
31434 bitmap_clear (data->ready_try_change);
31436 /* Filter out insns from ready_try that the core will not be able to issue
31437 on current cycle due to decoder. */
31438 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31439 false);
31442 /* Revert the effect on ready_try. */
31443 static void
31444 core2i7_first_cycle_multipass_backtrack (const void *_data,
31445 signed char *ready_try,
31446 int n_ready ATTRIBUTE_UNUSED)
31448 const_ix86_first_cycle_multipass_data_t data
31449 = (const_ix86_first_cycle_multipass_data_t) _data;
31450 unsigned int i = 0;
31451 sbitmap_iterator sbi;
31453 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31454 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31456 ready_try[i] = 0;
31460 /* Save the result of multipass lookahead scheduling for the next round. */
31461 static void
31462 core2i7_first_cycle_multipass_end (const void *_data)
31464 const_ix86_first_cycle_multipass_data_t data
31465 = (const_ix86_first_cycle_multipass_data_t) _data;
31466 ix86_first_cycle_multipass_data_t next_data
31467 = ix86_first_cycle_multipass_data;
31469 if (data != NULL)
31471 next_data->ifetch_block_len = data->ifetch_block_len;
31472 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31476 /* Deallocate target data. */
31477 static void
31478 core2i7_first_cycle_multipass_fini (void *_data)
31480 ix86_first_cycle_multipass_data_t data
31481 = (ix86_first_cycle_multipass_data_t) _data;
31483 if (data->ready_try_change)
31485 sbitmap_free (data->ready_try_change);
31486 data->ready_try_change = NULL;
31487 data->ready_try_change_size = 0;
31491 /* Prepare for scheduling pass. */
31492 static void
31493 ix86_sched_init_global (FILE *, int, int)
31495 /* Install scheduling hooks for current CPU. Some of these hooks are used
31496 in time-critical parts of the scheduler, so we only set them up when
31497 they are actually used. */
31498 switch (ix86_tune)
31500 case PROCESSOR_CORE2:
31501 case PROCESSOR_NEHALEM:
31502 case PROCESSOR_SANDYBRIDGE:
31503 case PROCESSOR_HASWELL:
31504 /* Do not perform multipass scheduling for pre-reload schedule
31505 to save compile time. */
31506 if (reload_completed)
31508 targetm.sched.dfa_post_advance_cycle
31509 = core2i7_dfa_post_advance_cycle;
31510 targetm.sched.first_cycle_multipass_init
31511 = core2i7_first_cycle_multipass_init;
31512 targetm.sched.first_cycle_multipass_begin
31513 = core2i7_first_cycle_multipass_begin;
31514 targetm.sched.first_cycle_multipass_issue
31515 = core2i7_first_cycle_multipass_issue;
31516 targetm.sched.first_cycle_multipass_backtrack
31517 = core2i7_first_cycle_multipass_backtrack;
31518 targetm.sched.first_cycle_multipass_end
31519 = core2i7_first_cycle_multipass_end;
31520 targetm.sched.first_cycle_multipass_fini
31521 = core2i7_first_cycle_multipass_fini;
31523 /* Set decoder parameters. */
31524 core2i7_secondary_decoder_max_insn_size = 8;
31525 core2i7_ifetch_block_size = 16;
31526 core2i7_ifetch_block_max_insns = 6;
31527 break;
31529 /* Fall through. */
31530 default:
31531 targetm.sched.dfa_post_advance_cycle = NULL;
31532 targetm.sched.first_cycle_multipass_init = NULL;
31533 targetm.sched.first_cycle_multipass_begin = NULL;
31534 targetm.sched.first_cycle_multipass_issue = NULL;
31535 targetm.sched.first_cycle_multipass_backtrack = NULL;
31536 targetm.sched.first_cycle_multipass_end = NULL;
31537 targetm.sched.first_cycle_multipass_fini = NULL;
31538 break;
31543 /* Compute the alignment given to a constant that is being placed in memory.
31544 EXP is the constant and ALIGN is the alignment that the object would
31545 ordinarily have.
31546 The value of this function is used instead of that alignment to align
31547 the object. */
31550 ix86_constant_alignment (tree exp, int align)
31552 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31553 || TREE_CODE (exp) == INTEGER_CST)
31555 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31556 return 64;
31557 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31558 return 128;
31560 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31561 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31562 return BITS_PER_WORD;
31564 return align;
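/* For illustration, a hedged sketch of the effect of the rules above: a
   DFmode constant is bumped to 64-bit alignment, a 128-bit vector constant
   such as V4SFmode to 128-bit alignment, and a long string constant to
   word alignment so that string operations on it are not penalized.  */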
31567 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31568 the data type, and ALIGN is the alignment that the object would
31569 ordinarily have. */
31571 static int
31572 iamcu_alignment (tree type, int align)
31574 machine_mode mode;
31576 if (align < 32 || TYPE_USER_ALIGN (type))
31577 return align;
31579 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
31580 bytes. */
31581 mode = TYPE_MODE (strip_array_types (type));
31582 switch (GET_MODE_CLASS (mode))
31584 case MODE_INT:
31585 case MODE_COMPLEX_INT:
31586 case MODE_COMPLEX_FLOAT:
31587 case MODE_FLOAT:
31588 case MODE_DECIMAL_FLOAT:
31589 return 32;
31590 default:
31591 return align;
31595 /* Compute the alignment for a static variable.
31596 TYPE is the data type, and ALIGN is the alignment that
31597 the object would ordinarily have. The value of this function is used
31598 instead of that alignment to align the object. */
31601 ix86_data_alignment (tree type, int align, bool opt)
31603 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31604 for symbols from other compilation units or symbols that don't need
31605 to bind locally. In order to preserve some ABI compatibility with
31606 those compilers, ensure we don't decrease alignment from what we
31607 used to assume. */
31609 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31611 /* A data structure equal to or greater than the size of a cache line
31612 (64 bytes in the Pentium 4 and other recent Intel processors, including
31613 processors based on the Intel Core microarchitecture) should be aligned
31614 so that its base address is a multiple of the cache line size. */
31616 int max_align
31617 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31619 if (max_align < BITS_PER_WORD)
31620 max_align = BITS_PER_WORD;
31622 switch (ix86_align_data_type)
31624 case ix86_align_data_type_abi: opt = false; break;
31625 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31626 case ix86_align_data_type_cacheline: break;
31629 if (TARGET_IAMCU)
31630 align = iamcu_alignment (type, align);
31632 if (opt
31633 && AGGREGATE_TYPE_P (type)
31634 && TYPE_SIZE (type)
31635 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31637 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31638 && align < max_align_compat)
31639 align = max_align_compat;
31640 if (wi::geu_p (TYPE_SIZE (type), max_align)
31641 && align < max_align)
31642 align = max_align;
31645 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
31646 to a 16-byte boundary. */
31647 if (TARGET_64BIT)
31649 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31650 && TYPE_SIZE (type)
31651 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31652 && wi::geu_p (TYPE_SIZE (type), 128)
31653 && align < 128)
31654 return 128;
31657 if (!opt)
31658 return align;
31660 if (TREE_CODE (type) == ARRAY_TYPE)
31662 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31663 return 64;
31664 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31665 return 128;
31667 else if (TREE_CODE (type) == COMPLEX_TYPE)
31670 if (TYPE_MODE (type) == DCmode && align < 64)
31671 return 64;
31672 if ((TYPE_MODE (type) == XCmode
31673 || TYPE_MODE (type) == TCmode) && align < 128)
31674 return 128;
31676 else if ((TREE_CODE (type) == RECORD_TYPE
31677 || TREE_CODE (type) == UNION_TYPE
31678 || TREE_CODE (type) == QUAL_UNION_TYPE)
31679 && TYPE_FIELDS (type))
31681 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31682 return 64;
31683 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31684 return 128;
31686 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31687 || TREE_CODE (type) == INTEGER_TYPE)
31689 if (TYPE_MODE (type) == DFmode && align < 64)
31690 return 64;
31691 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31692 return 128;
31695 return align;
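/* For illustration, a hedged sketch of the clauses above (ignoring the
   -malign-data= setting): an aggregate of 32 bytes or more keeps at least
   256-bit alignment for compatibility with GCC 4.8 and earlier, a
   sufficiently large aggregate may get cache-line alignment, and on x86-64
   any array of 16 bytes or more is aligned to at least 16 bytes.  */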
31698 /* Compute the alignment for a local variable or a stack slot. EXP is
31699 the data type or decl itself, MODE is the widest mode available and
31700 ALIGN is the alignment that the object would ordinarily have. The
31701 value of this macro is used instead of that alignment to align the
31702 object. */
31704 unsigned int
31705 ix86_local_alignment (tree exp, machine_mode mode,
31706 unsigned int align)
31708 tree type, decl;
31710 if (exp && DECL_P (exp))
31712 type = TREE_TYPE (exp);
31713 decl = exp;
31715 else
31717 type = exp;
31718 decl = NULL;
31721 /* Don't do dynamic stack realignment for long long objects with
31722 -mpreferred-stack-boundary=2. */
31723 if (!TARGET_64BIT
31724 && align == 64
31725 && ix86_preferred_stack_boundary < 64
31726 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31727 && (!type || !TYPE_USER_ALIGN (type))
31728 && (!decl || !DECL_USER_ALIGN (decl)))
31729 align = 32;
31731 /* If TYPE is NULL, we are allocating a stack slot for caller-save
31732 register in MODE. We will return the largest alignment of XF
31733 and DF. */
31734 if (!type)
31736 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31737 align = GET_MODE_ALIGNMENT (DFmode);
31738 return align;
31741 /* Don't increase alignment for Intel MCU psABI. */
31742 if (TARGET_IAMCU)
31743 return align;
31745 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
31746 to a 16-byte boundary. Exact wording is:
31748 An array uses the same alignment as its elements, except that a local or
31749 global array variable of length at least 16 bytes or
31750 a C99 variable-length array variable always has alignment of at least 16 bytes.
31752 This was added to allow use of aligned SSE instructions on arrays. The
31753 rule is meant for static storage (where the compiler cannot do the analysis
31754 by itself). We follow it for automatic variables only when convenient.
31755 We fully control everything in the function being compiled, and functions
31756 from other units cannot rely on the alignment.
31758 Exclude the va_list type. It is the common case of a local array where
31759 we cannot benefit from the alignment.
31761 TODO: Probably one should optimize for size only when var is not escaping. */
31762 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31763 && TARGET_SSE)
31765 if (AGGREGATE_TYPE_P (type)
31766 && (va_list_type_node == NULL_TREE
31767 || (TYPE_MAIN_VARIANT (type)
31768 != TYPE_MAIN_VARIANT (va_list_type_node)))
31769 && TYPE_SIZE (type)
31770 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31771 && wi::geu_p (TYPE_SIZE (type), 128)
31772 && align < 128)
31773 return 128;
31775 if (TREE_CODE (type) == ARRAY_TYPE)
31777 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31778 return 64;
31779 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31780 return 128;
31782 else if (TREE_CODE (type) == COMPLEX_TYPE)
31784 if (TYPE_MODE (type) == DCmode && align < 64)
31785 return 64;
31786 if ((TYPE_MODE (type) == XCmode
31787 || TYPE_MODE (type) == TCmode) && align < 128)
31788 return 128;
31790 else if ((TREE_CODE (type) == RECORD_TYPE
31791 || TREE_CODE (type) == UNION_TYPE
31792 || TREE_CODE (type) == QUAL_UNION_TYPE)
31793 && TYPE_FIELDS (type))
31795 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31796 return 64;
31797 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31798 return 128;
31800 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31801 || TREE_CODE (type) == INTEGER_TYPE)
31804 if (TYPE_MODE (type) == DFmode && align < 64)
31805 return 64;
31806 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31807 return 128;
31809 return align;
31812 /* Compute the minimum required alignment for dynamic stack realignment
31813 purposes for a local variable, parameter or a stack slot. EXP is
31814 the data type or decl itself, MODE is its mode and ALIGN is the
31815 alignment that the object would ordinarily have. */
31817 unsigned int
31818 ix86_minimum_alignment (tree exp, machine_mode mode,
31819 unsigned int align)
31821 tree type, decl;
31823 if (exp && DECL_P (exp))
31825 type = TREE_TYPE (exp);
31826 decl = exp;
31828 else
31830 type = exp;
31831 decl = NULL;
31834 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31835 return align;
31837 /* Don't do dynamic stack realignment for long long objects with
31838 -mpreferred-stack-boundary=2. */
31839 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31840 && (!type || !TYPE_USER_ALIGN (type))
31841 && (!decl || !DECL_USER_ALIGN (decl)))
31843 gcc_checking_assert (!TARGET_STV);
31844 return 32;
31847 return align;
31850 /* Find a location for the static chain incoming to a nested function.
31851 This is a register, unless all free registers are used by arguments. */
31853 static rtx
31854 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31856 unsigned regno;
31858 /* While this function won't be called by the middle-end when a static
31859 chain isn't needed, it's also used throughout the backend so it's
31860 easiest to keep this check centralized. */
31861 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31862 return NULL;
31864 if (TARGET_64BIT)
31866 /* We always use R10 in 64-bit mode. */
31867 regno = R10_REG;
31869 else
31871 const_tree fntype, fndecl;
31872 unsigned int ccvt;
31874 /* By default in 32-bit mode we use ECX to pass the static chain. */
31875 regno = CX_REG;
31877 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31879 fntype = TREE_TYPE (fndecl_or_type);
31880 fndecl = fndecl_or_type;
31882 else
31884 fntype = fndecl_or_type;
31885 fndecl = NULL;
31888 ccvt = ix86_get_callcvt (fntype);
31889 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31891 /* Fastcall functions use ecx/edx for arguments, which leaves
31892 us with EAX for the static chain.
31893 Thiscall functions use ecx for arguments, which also
31894 leaves us with EAX for the static chain. */
31895 regno = AX_REG;
31897 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31899 /* Thiscall functions use ecx for arguments, which leaves
31900 us with EAX and EDX for the static chain.
31901 We use EAX for ABI compatibility. */
31902 regno = AX_REG;
31904 else if (ix86_function_regparm (fntype, fndecl) == 3)
31906 /* For regparm 3, we have no free call-clobbered registers in
31907 which to store the static chain. In order to implement this,
31908 we have the trampoline push the static chain to the stack.
31909 However, we can't push a value below the return address when
31910 we call the nested function directly, so we have to use an
31911 alternate entry point. For this we use ESI, and have the
31912 alternate entry point push ESI, so that things appear the
31913 same once we're executing the nested function. */
31914 if (incoming_p)
31916 if (fndecl == current_function_decl
31917 && !ix86_static_chain_on_stack)
31919 gcc_assert (!reload_completed);
31920 ix86_static_chain_on_stack = true;
31922 return gen_frame_mem (SImode,
31923 plus_constant (Pmode,
31924 arg_pointer_rtx, -8));
31926 regno = SI_REG;
31930 return gen_rtx_REG (Pmode, regno);
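/* For illustration, a hedged summary of the choices above: a 64-bit nested
   function receives its static chain in %r10; a 32-bit one normally in
   %ecx, in %eax for fastcall and thiscall, and on the stack (reached via
   the %esi alternate entry point) when regparm(3) leaves no free register.  */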
31933 /* Emit RTL insns to initialize the variable parts of a trampoline.
31934 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31935 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31936 to be passed to the target function. */
31938 static void
31939 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31941 rtx mem, fnaddr;
31942 int opcode;
31943 int offset = 0;
31945 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31947 if (TARGET_64BIT)
31949 int size;
31951 /* Load the function address to r11. Try to load address using
31952 the shorter movl instead of movabs. We may want to support
31953 movq for kernel mode, but the kernel does not use trampolines at
31954 the moment. FNADDR is a 32-bit address and may not be in
31955 DImode when ptr_mode == SImode. Always use movl in this
31956 case. */
31957 if (ptr_mode == SImode
31958 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31960 fnaddr = copy_addr_to_reg (fnaddr);
31962 mem = adjust_address (m_tramp, HImode, offset);
31963 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31965 mem = adjust_address (m_tramp, SImode, offset + 2);
31966 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31967 offset += 6;
31969 else
31971 mem = adjust_address (m_tramp, HImode, offset);
31972 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31974 mem = adjust_address (m_tramp, DImode, offset + 2);
31975 emit_move_insn (mem, fnaddr);
31976 offset += 10;
31979 /* Load static chain using movabs to r10. Use the shorter movl
31980 instead of movabs when ptr_mode == SImode. */
31981 if (ptr_mode == SImode)
31983 opcode = 0xba41;
31984 size = 6;
31986 else
31988 opcode = 0xba49;
31989 size = 10;
31992 mem = adjust_address (m_tramp, HImode, offset);
31993 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31995 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31996 emit_move_insn (mem, chain_value);
31997 offset += size;
31999 /* Jump to r11; the last (unused) byte is a nop, only there to
32000 pad the write out to a single 32-bit store. */
32001 mem = adjust_address (m_tramp, SImode, offset);
32002 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
32003 offset += 4;
32005 else
32007 rtx disp, chain;
32009 /* Depending on the static chain location, either load a register
32010 with a constant, or push the constant to the stack. All of the
32011 instructions are the same size. */
32012 chain = ix86_static_chain (fndecl, true);
32013 if (REG_P (chain))
32015 switch (REGNO (chain))
32017 case AX_REG:
32018 opcode = 0xb8; break;
32019 case CX_REG:
32020 opcode = 0xb9; break;
32021 default:
32022 gcc_unreachable ();
32025 else
32026 opcode = 0x68;
32028 mem = adjust_address (m_tramp, QImode, offset);
32029 emit_move_insn (mem, gen_int_mode (opcode, QImode));
32031 mem = adjust_address (m_tramp, SImode, offset + 1);
32032 emit_move_insn (mem, chain_value);
32033 offset += 5;
32035 mem = adjust_address (m_tramp, QImode, offset);
32036 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
32038 mem = adjust_address (m_tramp, SImode, offset + 1);
32040 /* Compute offset from the end of the jmp to the target function.
32041 In the case in which the trampoline stores the static chain on
32042 the stack, we need to skip the first insn which pushes the
32043 (call-saved) register static chain; this push is 1 byte. */
32044 offset += 5;
32045 disp = expand_binop (SImode, sub_optab, fnaddr,
32046 plus_constant (Pmode, XEXP (m_tramp, 0),
32047 offset - (MEM_P (chain) ? 1 : 0)),
32048 NULL_RTX, 1, OPTAB_DIRECT);
32049 emit_move_insn (mem, disp);
32052 gcc_assert (offset <= TRAMPOLINE_SIZE);
32054 #ifdef HAVE_ENABLE_EXECUTE_STACK
32055 #ifdef CHECK_EXECUTE_STACK_ENABLED
32056 if (CHECK_EXECUTE_STACK_ENABLED)
32057 #endif
32058 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
32059 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
32060 #endif
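/* For illustration, the 64-bit trampoline written above looks roughly like
   this (a hedged sketch: the movabs forms shown assume full 64-bit
   operands, while the shorter movl forms are emitted when the function
   address or chain value fits in 32 bits):

	49 bb <imm64>		movabsq $fnaddr, %r11
	49 ba <imm64>		movabsq $chain,  %r10
	49 ff e3 90		rex.WB jmpq *%r11 ; nop pad

   The 32-bit variant is a mov-immediate (or push) of the static chain
   followed by a jmp rel32 to the target function.  */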
32063 static bool
32064 ix86_allocate_stack_slots_for_args (void)
32066 /* Naked functions should not allocate stack slots for arguments. */
32067 return !ix86_function_naked (current_function_decl);
32070 static bool
32071 ix86_warn_func_return (tree decl)
32073 /* Naked functions are implemented entirely in assembly, including the
32074 return sequence, so suppress warnings about this. */
32075 return !ix86_function_naked (decl);
32078 /* The following file contains several enumerations and data structures
32079 built from the definitions in i386-builtin-types.def. */
32081 #include "i386-builtin-types.inc"
32083 /* Table for the ix86 builtin non-function types. */
32084 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
32086 /* Retrieve an element from the above table, building some of
32087 the types lazily. */
32089 static tree
32090 ix86_get_builtin_type (enum ix86_builtin_type tcode)
32092 unsigned int index;
32093 tree type, itype;
32095 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
32097 type = ix86_builtin_type_tab[(int) tcode];
32098 if (type != NULL)
32099 return type;
32101 gcc_assert (tcode > IX86_BT_LAST_PRIM);
32102 if (tcode <= IX86_BT_LAST_VECT)
32104 machine_mode mode;
32106 index = tcode - IX86_BT_LAST_PRIM - 1;
32107 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
32108 mode = ix86_builtin_type_vect_mode[index];
32110 type = build_vector_type_for_mode (itype, mode);
32112 else
32114 int quals;
32116 index = tcode - IX86_BT_LAST_VECT - 1;
32117 if (tcode <= IX86_BT_LAST_PTR)
32118 quals = TYPE_UNQUALIFIED;
32119 else
32120 quals = TYPE_QUAL_CONST;
32122 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
32123 if (quals != TYPE_UNQUALIFIED)
32124 itype = build_qualified_type (itype, quals);
32126 type = build_pointer_type (itype);
32129 ix86_builtin_type_tab[(int) tcode] = type;
32130 return type;
32133 /* Table for the ix86 builtin function types. */
32134 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
32136 /* Retrieve an element from the above table, building some of
32137 the types lazily. */
32139 static tree
32140 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
32142 tree type;
32144 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
32146 type = ix86_builtin_func_type_tab[(int) tcode];
32147 if (type != NULL)
32148 return type;
32150 if (tcode <= IX86_BT_LAST_FUNC)
32152 unsigned start = ix86_builtin_func_start[(int) tcode];
32153 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
32154 tree rtype, atype, args = void_list_node;
32155 unsigned i;
32157 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
32158 for (i = after - 1; i > start; --i)
32160 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
32161 args = tree_cons (NULL, atype, args);
32164 type = build_function_type (rtype, args);
32166 else
32168 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
32169 enum ix86_builtin_func_type icode;
32171 icode = ix86_builtin_func_alias_base[index];
32172 type = ix86_get_builtin_func_type (icode);
32175 ix86_builtin_func_type_tab[(int) tcode] = type;
32176 return type;
32180 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
32181 bdesc_* arrays below should come first, then builtins for each bdesc_*
32182 array in ascending order, so that we can use direct array accesses. */
32183 enum ix86_builtins
32185 IX86_BUILTIN_MASKMOVQ,
32186 IX86_BUILTIN_LDMXCSR,
32187 IX86_BUILTIN_STMXCSR,
32188 IX86_BUILTIN_MASKMOVDQU,
32189 IX86_BUILTIN_PSLLDQ128,
32190 IX86_BUILTIN_CLFLUSH,
32191 IX86_BUILTIN_MONITOR,
32192 IX86_BUILTIN_MWAIT,
32193 IX86_BUILTIN_CLZERO,
32194 IX86_BUILTIN_VEC_INIT_V2SI,
32195 IX86_BUILTIN_VEC_INIT_V4HI,
32196 IX86_BUILTIN_VEC_INIT_V8QI,
32197 IX86_BUILTIN_VEC_EXT_V2DF,
32198 IX86_BUILTIN_VEC_EXT_V2DI,
32199 IX86_BUILTIN_VEC_EXT_V4SF,
32200 IX86_BUILTIN_VEC_EXT_V4SI,
32201 IX86_BUILTIN_VEC_EXT_V8HI,
32202 IX86_BUILTIN_VEC_EXT_V2SI,
32203 IX86_BUILTIN_VEC_EXT_V4HI,
32204 IX86_BUILTIN_VEC_EXT_V16QI,
32205 IX86_BUILTIN_VEC_SET_V2DI,
32206 IX86_BUILTIN_VEC_SET_V4SF,
32207 IX86_BUILTIN_VEC_SET_V4SI,
32208 IX86_BUILTIN_VEC_SET_V8HI,
32209 IX86_BUILTIN_VEC_SET_V4HI,
32210 IX86_BUILTIN_VEC_SET_V16QI,
32211 IX86_BUILTIN_GATHERSIV2DF,
32212 IX86_BUILTIN_GATHERSIV4DF,
32213 IX86_BUILTIN_GATHERDIV2DF,
32214 IX86_BUILTIN_GATHERDIV4DF,
32215 IX86_BUILTIN_GATHERSIV4SF,
32216 IX86_BUILTIN_GATHERSIV8SF,
32217 IX86_BUILTIN_GATHERDIV4SF,
32218 IX86_BUILTIN_GATHERDIV8SF,
32219 IX86_BUILTIN_GATHERSIV2DI,
32220 IX86_BUILTIN_GATHERSIV4DI,
32221 IX86_BUILTIN_GATHERDIV2DI,
32222 IX86_BUILTIN_GATHERDIV4DI,
32223 IX86_BUILTIN_GATHERSIV4SI,
32224 IX86_BUILTIN_GATHERSIV8SI,
32225 IX86_BUILTIN_GATHERDIV4SI,
32226 IX86_BUILTIN_GATHERDIV8SI,
32227 IX86_BUILTIN_VFMSUBSD3_MASK3,
32228 IX86_BUILTIN_VFMSUBSS3_MASK3,
32229 IX86_BUILTIN_GATHER3SIV8SF,
32230 IX86_BUILTIN_GATHER3SIV4SF,
32231 IX86_BUILTIN_GATHER3SIV4DF,
32232 IX86_BUILTIN_GATHER3SIV2DF,
32233 IX86_BUILTIN_GATHER3DIV8SF,
32234 IX86_BUILTIN_GATHER3DIV4SF,
32235 IX86_BUILTIN_GATHER3DIV4DF,
32236 IX86_BUILTIN_GATHER3DIV2DF,
32237 IX86_BUILTIN_GATHER3SIV8SI,
32238 IX86_BUILTIN_GATHER3SIV4SI,
32239 IX86_BUILTIN_GATHER3SIV4DI,
32240 IX86_BUILTIN_GATHER3SIV2DI,
32241 IX86_BUILTIN_GATHER3DIV8SI,
32242 IX86_BUILTIN_GATHER3DIV4SI,
32243 IX86_BUILTIN_GATHER3DIV4DI,
32244 IX86_BUILTIN_GATHER3DIV2DI,
32245 IX86_BUILTIN_SCATTERSIV8SF,
32246 IX86_BUILTIN_SCATTERSIV4SF,
32247 IX86_BUILTIN_SCATTERSIV4DF,
32248 IX86_BUILTIN_SCATTERSIV2DF,
32249 IX86_BUILTIN_SCATTERDIV8SF,
32250 IX86_BUILTIN_SCATTERDIV4SF,
32251 IX86_BUILTIN_SCATTERDIV4DF,
32252 IX86_BUILTIN_SCATTERDIV2DF,
32253 IX86_BUILTIN_SCATTERSIV8SI,
32254 IX86_BUILTIN_SCATTERSIV4SI,
32255 IX86_BUILTIN_SCATTERSIV4DI,
32256 IX86_BUILTIN_SCATTERSIV2DI,
32257 IX86_BUILTIN_SCATTERDIV8SI,
32258 IX86_BUILTIN_SCATTERDIV4SI,
32259 IX86_BUILTIN_SCATTERDIV4DI,
32260 IX86_BUILTIN_SCATTERDIV2DI,
32261 /* Alternate 4 and 8 element gather/scatter for the vectorizer
32262 where all operands are 32-byte or 64-byte wide respectively. */
32263 IX86_BUILTIN_GATHERALTSIV4DF,
32264 IX86_BUILTIN_GATHERALTDIV8SF,
32265 IX86_BUILTIN_GATHERALTSIV4DI,
32266 IX86_BUILTIN_GATHERALTDIV8SI,
32267 IX86_BUILTIN_GATHER3ALTDIV16SF,
32268 IX86_BUILTIN_GATHER3ALTDIV16SI,
32269 IX86_BUILTIN_GATHER3ALTSIV4DF,
32270 IX86_BUILTIN_GATHER3ALTDIV8SF,
32271 IX86_BUILTIN_GATHER3ALTSIV4DI,
32272 IX86_BUILTIN_GATHER3ALTDIV8SI,
32273 IX86_BUILTIN_GATHER3ALTSIV8DF,
32274 IX86_BUILTIN_GATHER3ALTSIV8DI,
32275 IX86_BUILTIN_GATHER3DIV16SF,
32276 IX86_BUILTIN_GATHER3DIV16SI,
32277 IX86_BUILTIN_GATHER3DIV8DF,
32278 IX86_BUILTIN_GATHER3DIV8DI,
32279 IX86_BUILTIN_GATHER3SIV16SF,
32280 IX86_BUILTIN_GATHER3SIV16SI,
32281 IX86_BUILTIN_GATHER3SIV8DF,
32282 IX86_BUILTIN_GATHER3SIV8DI,
32283 IX86_BUILTIN_SCATTERALTSIV8DF,
32284 IX86_BUILTIN_SCATTERALTDIV16SF,
32285 IX86_BUILTIN_SCATTERALTSIV8DI,
32286 IX86_BUILTIN_SCATTERALTDIV16SI,
32287 IX86_BUILTIN_SCATTERDIV16SF,
32288 IX86_BUILTIN_SCATTERDIV16SI,
32289 IX86_BUILTIN_SCATTERDIV8DF,
32290 IX86_BUILTIN_SCATTERDIV8DI,
32291 IX86_BUILTIN_SCATTERSIV16SF,
32292 IX86_BUILTIN_SCATTERSIV16SI,
32293 IX86_BUILTIN_SCATTERSIV8DF,
32294 IX86_BUILTIN_SCATTERSIV8DI,
32295 IX86_BUILTIN_GATHERPFQPD,
32296 IX86_BUILTIN_GATHERPFDPS,
32297 IX86_BUILTIN_GATHERPFDPD,
32298 IX86_BUILTIN_GATHERPFQPS,
32299 IX86_BUILTIN_SCATTERPFDPD,
32300 IX86_BUILTIN_SCATTERPFDPS,
32301 IX86_BUILTIN_SCATTERPFQPD,
32302 IX86_BUILTIN_SCATTERPFQPS,
32303 IX86_BUILTIN_CLWB,
32304 IX86_BUILTIN_CLFLUSHOPT,
32305 IX86_BUILTIN_INFQ,
32306 IX86_BUILTIN_HUGE_VALQ,
32307 IX86_BUILTIN_NANQ,
32308 IX86_BUILTIN_NANSQ,
32309 IX86_BUILTIN_XABORT,
32310 IX86_BUILTIN_ADDCARRYX32,
32311 IX86_BUILTIN_ADDCARRYX64,
32312 IX86_BUILTIN_SBB32,
32313 IX86_BUILTIN_SBB64,
32314 IX86_BUILTIN_RDRAND16_STEP,
32315 IX86_BUILTIN_RDRAND32_STEP,
32316 IX86_BUILTIN_RDRAND64_STEP,
32317 IX86_BUILTIN_RDSEED16_STEP,
32318 IX86_BUILTIN_RDSEED32_STEP,
32319 IX86_BUILTIN_RDSEED64_STEP,
32320 IX86_BUILTIN_MONITORX,
32321 IX86_BUILTIN_MWAITX,
32322 IX86_BUILTIN_CFSTRING,
32323 IX86_BUILTIN_CPU_INIT,
32324 IX86_BUILTIN_CPU_IS,
32325 IX86_BUILTIN_CPU_SUPPORTS,
32326 IX86_BUILTIN_READ_FLAGS,
32327 IX86_BUILTIN_WRITE_FLAGS,
32329 /* All the remaining builtins are tracked in bdesc_* arrays in
32330 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
32331 this point. */
32332 #define BDESC(mask, icode, name, code, comparison, flag) \
32333 code,
32334 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32335 code, \
32336 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
32337 #define BDESC_END(kind, next_kind)
32339 #include "i386-builtin.def"
32341 #undef BDESC
32342 #undef BDESC_FIRST
32343 #undef BDESC_END
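/* The #include above, with BDESC expanding to just "code,", appends one
   enumerator per descriptor in i386-builtin.def; BDESC_FIRST also emits
   an IX86_BUILTIN__BDESC_<KIND>_FIRST alias marking where each bdesc_*
   group starts.  */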
32345 IX86_BUILTIN_MAX,
32347 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
32349 /* Now just the aliases for bdesc_* start/end. */
32350 #define BDESC(mask, icode, name, code, comparison, flag)
32351 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
32352 #define BDESC_END(kind, next_kind) \
32353 IX86_BUILTIN__BDESC_##kind##_LAST \
32354 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
32356 #include "i386-builtin.def"
32358 #undef BDESC
32359 #undef BDESC_FIRST
32360 #undef BDESC_END
32362 /* Just to make sure there is no comma after the last enumerator. */
32363 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
32366 /* Table for the ix86 builtin decls. */
32367 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
32369 /* Table of all of the builtin functions that are possible with different ISAs
32370 but are waiting to be built until a function is declared to use that
32371 ISA. */
32372 struct builtin_isa {
32373 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
32374 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
32375 const char *name; /* function name */
32376 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
32377 unsigned char const_p:1; /* true if the declaration is constant */
32378 unsigned char pure_p:1; /* true if the declaration has pure attribute */
32379 bool leaf_p; /* true if the declaration has leaf attribute */
32380 bool nothrow_p; /* true if the declaration has nothrow attribute */
32381 bool set_and_not_built_p;
32384 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
32386 /* Bits that can still enable inclusion of a deferred builtin. */
32387 static HOST_WIDE_INT deferred_isa_values = 0;
32388 static HOST_WIDE_INT deferred_isa_values2 = 0;
32390 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
32391 of which isa_flags to use in the ix86_builtins_isa array. Stores the
32392 function decl in the ix86_builtins array. Returns the function decl or
32393 NULL_TREE, if the builtin was not added.
32395 If the front end has a special hook for builtin functions, delay adding
32396 builtin functions that aren't in the current ISA until the ISA is changed
32397 with function specific optimization. Doing so can save about 300K for the
32398 default compiler. When the builtin is expanded, check at that time whether
32399 it is valid.
32401 If the front end doesn't have a special hook, record all builtins, even if
32402 they aren't in the current ISA, in case the user uses
32403 function specific options for a different ISA, so that we don't get scope
32404 errors if a builtin is added in the middle of a function scope. */
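/* For illustration, a typical call (made below for SSE2) is:

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
		  VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);

   If the required ISA is already enabled (or the front end uses the
   ext_scope hook), the decl is created immediately and stored in
   ix86_builtins; otherwise the name, type code and mask are only
   recorded in ix86_builtins_isa, and ix86_add_new_builtins materializes
   the decl later.  */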
32406 static inline tree
32407 def_builtin (HOST_WIDE_INT mask, const char *name,
32408 enum ix86_builtin_func_type tcode,
32409 enum ix86_builtins code)
32411 tree decl = NULL_TREE;
32413 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
32415 ix86_builtins_isa[(int) code].isa = mask;
32417 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
32418 where any set bit means the built-in is enabled, this bit must be *and-ed*
32419 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
32420 means that *both* cpuid bits must be set for the built-in to be available.
32421 Handle this here. */
32422 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32423 mask &= ~OPTION_MASK_ISA_AVX512VL;
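/* Concretely: for a builtin whose MASK is
   OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, once AVX512VL is
   known to be enabled the VL bit is dropped here, so the availability
   test below is decided by the AVX512DQ bit alone.  */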
32425 mask &= ~OPTION_MASK_ISA_64BIT;
32426 if (mask == 0
32427 || (mask & ix86_isa_flags) != 0
32428 || (lang_hooks.builtin_function
32429 == lang_hooks.builtin_function_ext_scope))
32432 tree type = ix86_get_builtin_func_type (tcode);
32433 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32434 NULL, NULL_TREE);
32435 ix86_builtins[(int) code] = decl;
32436 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32438 else
32440 /* Only a MASK recorded while set_and_not_built_p == true can potentially
32441 enable the builtin later. */
32442 deferred_isa_values |= mask;
32443 ix86_builtins[(int) code] = NULL_TREE;
32444 ix86_builtins_isa[(int) code].tcode = tcode;
32445 ix86_builtins_isa[(int) code].name = name;
32446 ix86_builtins_isa[(int) code].leaf_p = false;
32447 ix86_builtins_isa[(int) code].nothrow_p = false;
32448 ix86_builtins_isa[(int) code].const_p = false;
32449 ix86_builtins_isa[(int) code].pure_p = false;
32450 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32454 return decl;
32457 /* Like def_builtin, but also marks the function decl "const". */
32459 static inline tree
32460 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32461 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32463 tree decl = def_builtin (mask, name, tcode, code);
32464 if (decl)
32465 TREE_READONLY (decl) = 1;
32466 else
32467 ix86_builtins_isa[(int) code].const_p = true;
32469 return decl;
32472 /* Like def_builtin, but also marks the function decl "pure". */
32474 static inline tree
32475 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
32476 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32478 tree decl = def_builtin (mask, name, tcode, code);
32479 if (decl)
32480 DECL_PURE_P (decl) = 1;
32481 else
32482 ix86_builtins_isa[(int) code].pure_p = true;
32484 return decl;
32487 /* Like def_builtin, but for additional isa2 flags. */
32489 static inline tree
32490 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32491 enum ix86_builtin_func_type tcode,
32492 enum ix86_builtins code)
32494 tree decl = NULL_TREE;
32496 ix86_builtins_isa[(int) code].isa2 = mask;
32498 if (mask == 0
32499 || (mask & ix86_isa_flags2) != 0
32500 || (lang_hooks.builtin_function
32501 == lang_hooks.builtin_function_ext_scope))
32504 tree type = ix86_get_builtin_func_type (tcode);
32505 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32506 NULL, NULL_TREE);
32507 ix86_builtins[(int) code] = decl;
32508 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32510 else
32512 /* Only a MASK recorded while set_and_not_built_p == true can potentially
32513 enable the builtin later. */
32514 deferred_isa_values2 |= mask;
32515 ix86_builtins[(int) code] = NULL_TREE;
32516 ix86_builtins_isa[(int) code].tcode = tcode;
32517 ix86_builtins_isa[(int) code].name = name;
32518 ix86_builtins_isa[(int) code].leaf_p = false;
32519 ix86_builtins_isa[(int) code].nothrow_p = false;
32520 ix86_builtins_isa[(int) code].const_p = false;
32521 ix86_builtins_isa[(int) code].pure_p = false;
32522 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32525 return decl;
32528 /* Like def_builtin2, but also marks the function decl "const". */
32530 static inline tree
32531 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32532 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32534 tree decl = def_builtin2 (mask, name, tcode, code);
32535 if (decl)
32536 TREE_READONLY (decl) = 1;
32537 else
32538 ix86_builtins_isa[(int) code].const_p = true;
32540 return decl;
32543 /* Like def_builtin2, but also marks the function decl "pure". */
32545 static inline tree
32546 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
32547 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32549 tree decl = def_builtin2 (mask, name, tcode, code);
32550 if (decl)
32551 DECL_PURE_P (decl) = 1;
32552 else
32553 ix86_builtins_isa[(int) code].pure_p = true;
32555 return decl;
32558 /* Add any new builtin functions for a given ISA that may not have been
32559 declared. This saves a bit of space compared to adding all of the
32560 declarations to the tree, even if we didn't use them. */
32562 static void
32563 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32565 if ((isa & deferred_isa_values) == 0
32566 && (isa2 & deferred_isa_values2) == 0)
32567 return;
32569 /* These ISA bits are now being handled; remove them from the deferred isa values. */
32570 deferred_isa_values &= ~isa;
32571 deferred_isa_values2 &= ~isa2;
32573 int i;
32574 tree saved_current_target_pragma = current_target_pragma;
32575 current_target_pragma = NULL_TREE;
32577 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32579 if (((ix86_builtins_isa[i].isa & isa) != 0
32580 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32581 && ix86_builtins_isa[i].set_and_not_built_p)
32583 tree decl, type;
32585 /* Don't define the builtin again. */
32586 ix86_builtins_isa[i].set_and_not_built_p = false;
32588 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32589 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32590 type, i, BUILT_IN_MD, NULL,
32591 NULL_TREE);
32593 ix86_builtins[i] = decl;
32594 if (ix86_builtins_isa[i].const_p)
32595 TREE_READONLY (decl) = 1;
32596 if (ix86_builtins_isa[i].pure_p)
32597 DECL_PURE_P (decl) = 1;
32598 if (ix86_builtins_isa[i].leaf_p)
32599 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32600 NULL_TREE);
32601 if (ix86_builtins_isa[i].nothrow_p)
32602 TREE_NOTHROW (decl) = 1;
32606 current_target_pragma = saved_current_target_pragma;
32609 /* Bits for builtin_description.flag. */
32611 /* Set when we don't support the comparison natively, and should
32612 swap_comparison in order to support it. */
32613 #define BUILTIN_DESC_SWAP_OPERANDS 1
32615 struct builtin_description
32617 const HOST_WIDE_INT mask;
32618 const enum insn_code icode;
32619 const char *const name;
32620 const enum ix86_builtins code;
32621 const enum rtx_code comparison;
32622 const int flag;
32625 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32626 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32627 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32628 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32629 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32630 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32631 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32632 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32633 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32634 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32635 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32636 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32637 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32638 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32639 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32640 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32641 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32642 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32643 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32644 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32645 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32646 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32647 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32648 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32649 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32650 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32651 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32652 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32653 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32654 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32655 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32656 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32657 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32658 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32659 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32660 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32661 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32662 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32663 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32664 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32665 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32666 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32667 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32668 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32669 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32670 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32671 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32672 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32673 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32674 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32675 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32676 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32678 #define BDESC(mask, icode, name, code, comparison, flag) \
32679 { mask, icode, name, code, comparison, flag },
32680 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32681 static const struct builtin_description bdesc_##kind[] = \
32683 BDESC (mask, icode, name, code, comparison, flag)
32684 #define BDESC_END(kind, next_kind) \
32687 #include "i386-builtin.def"
32689 #undef BDESC
32690 #undef BDESC_FIRST
32691 #undef BDESC_END
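/* With these definitions the #include above emits one
   "static const struct builtin_description bdesc_<kind>[]" array per
   group (bdesc_special_args, bdesc_args, bdesc_round_args,
   bdesc_pcmpestr, bdesc_pcmpistr, bdesc_comi, bdesc_mpx,
   bdesc_mpx_const, bdesc_multi_arg, etc.), which the ix86_init_*
   routines below iterate over.  */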
32693 /* TM vector builtins. */
32695 /* Reuse the existing x86-specific `struct builtin_description' because
32696 we're lazy. Add casts to make them fit. */
32697 static const struct builtin_description bdesc_tm[] =
32699 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32700 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32701 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32702 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32703 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32704 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32705 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32707 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32708 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32709 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32710 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32711 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32712 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32713 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32715 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32716 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32717 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32718 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32719 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32720 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32721 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32723 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32724 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32725 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
32728 /* Initialize the transactional memory vector load/store builtins. */
32730 static void
32731 ix86_init_tm_builtins (void)
32733 enum ix86_builtin_func_type ftype;
32734 const struct builtin_description *d;
32735 size_t i;
32736 tree decl;
32737 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32738 tree attrs_log, attrs_type_log;
32740 if (!flag_tm)
32741 return;
32743 /* If there are no builtins defined, we must be compiling in a
32744 language without trans-mem support. */
32745 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32746 return;
32748 /* Use whatever attributes a normal TM load has. */
32749 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32750 attrs_load = DECL_ATTRIBUTES (decl);
32751 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32752 /* Use whatever attributes a normal TM store has. */
32753 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32754 attrs_store = DECL_ATTRIBUTES (decl);
32755 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32756 /* Use whatever attributes a normal TM log has. */
32757 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32758 attrs_log = DECL_ATTRIBUTES (decl);
32759 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32761 for (i = 0, d = bdesc_tm;
32762 i < ARRAY_SIZE (bdesc_tm);
32763 i++, d++)
32765 if ((d->mask & ix86_isa_flags) != 0
32766 || (lang_hooks.builtin_function
32767 == lang_hooks.builtin_function_ext_scope))
32769 tree type, attrs, attrs_type;
32770 enum built_in_function code = (enum built_in_function) d->code;
32772 ftype = (enum ix86_builtin_func_type) d->flag;
32773 type = ix86_get_builtin_func_type (ftype);
32775 if (BUILTIN_TM_LOAD_P (code))
32777 attrs = attrs_load;
32778 attrs_type = attrs_type_load;
32780 else if (BUILTIN_TM_STORE_P (code))
32782 attrs = attrs_store;
32783 attrs_type = attrs_type_store;
32785 else
32787 attrs = attrs_log;
32788 attrs_type = attrs_type_log;
32790 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32791 /* The builtin without the prefix for
32792 calling it directly. */
32793 d->name + strlen ("__builtin_"),
32794 attrs);
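/* E.g. "__builtin__ITM_WM64" is thereby also registered under the
   direct name "_ITM_WM64".  */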
32795 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
32796 set the TYPE_ATTRIBUTES. */
32797 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32799 set_builtin_decl (code, decl, false);
32804 /* Macros for verification of enum ix86_builtins order. */
32805 #define BDESC_VERIFY(x, y, z) \
32806 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32807 #define BDESC_VERIFYS(x, y, z) \
32808 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
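/* Each check below asserts that one bdesc_* group's _FIRST enumerator
   immediately follows the previous group's _LAST enumerator, so that a
   descriptor's index in its table can be recovered as
   d->code - IX86_BUILTIN__BDESC_<KIND>_FIRST.  */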
32810 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32811 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32812 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32813 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32814 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32815 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32816 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32817 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32818 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32819 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32820 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32821 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32822 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32823 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32824 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32825 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32826 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32827 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32828 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32829 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
32831 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
32832 in the current target ISA, to allow the user to compile particular modules
32833 with target specific options that differ from the command line
32834 options. */
32835 static void
32836 ix86_init_mmx_sse_builtins (void)
32838 const struct builtin_description * d;
32839 enum ix86_builtin_func_type ftype;
32840 size_t i;
32842 /* Add all special builtins with variable number of operands. */
32843 for (i = 0, d = bdesc_special_args;
32844 i < ARRAY_SIZE (bdesc_special_args);
32845 i++, d++)
32847 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32848 if (d->name == 0)
32849 continue;
32851 ftype = (enum ix86_builtin_func_type) d->flag;
32852 def_builtin (d->mask, d->name, ftype, d->code);
32854 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32855 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32856 ARRAY_SIZE (bdesc_special_args) - 1);
32858 /* Add all builtins with variable number of operands. */
32859 for (i = 0, d = bdesc_args;
32860 i < ARRAY_SIZE (bdesc_args);
32861 i++, d++)
32863 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32864 if (d->name == 0)
32865 continue;
32867 ftype = (enum ix86_builtin_func_type) d->flag;
32868 def_builtin_const (d->mask, d->name, ftype, d->code);
32870 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32871 IX86_BUILTIN__BDESC_ARGS_FIRST,
32872 ARRAY_SIZE (bdesc_args) - 1);
32874 /* Add all builtins with variable number of operands. */
32875 for (i = 0, d = bdesc_args2;
32876 i < ARRAY_SIZE (bdesc_args2);
32877 i++, d++)
32879 if (d->name == 0)
32880 continue;
32882 ftype = (enum ix86_builtin_func_type) d->flag;
32883 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32886 /* Add all builtins with rounding. */
32887 for (i = 0, d = bdesc_round_args;
32888 i < ARRAY_SIZE (bdesc_round_args);
32889 i++, d++)
32891 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32892 if (d->name == 0)
32893 continue;
32895 ftype = (enum ix86_builtin_func_type) d->flag;
32896 def_builtin_const (d->mask, d->name, ftype, d->code);
32898 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32899 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32900 ARRAY_SIZE (bdesc_round_args) - 1);
32902 /* pcmpestr[im] insns. */
32903 for (i = 0, d = bdesc_pcmpestr;
32904 i < ARRAY_SIZE (bdesc_pcmpestr);
32905 i++, d++)
32907 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32908 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32909 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32910 else
32911 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32912 def_builtin_const (d->mask, d->name, ftype, d->code);
32914 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32915 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32916 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32918 /* pcmpistr[im] insns. */
32919 for (i = 0, d = bdesc_pcmpistr;
32920 i < ARRAY_SIZE (bdesc_pcmpistr);
32921 i++, d++)
32923 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32924 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32925 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32926 else
32927 ftype = INT_FTYPE_V16QI_V16QI_INT;
32928 def_builtin_const (d->mask, d->name, ftype, d->code);
32930 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32931 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32932 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32934 /* comi/ucomi insns. */
32935 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32937 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32938 if (d->mask == OPTION_MASK_ISA_SSE2)
32939 ftype = INT_FTYPE_V2DF_V2DF;
32940 else
32941 ftype = INT_FTYPE_V4SF_V4SF;
32942 def_builtin_const (d->mask, d->name, ftype, d->code);
32944 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32945 IX86_BUILTIN__BDESC_COMI_FIRST,
32946 ARRAY_SIZE (bdesc_comi) - 1);
32948 /* SSE */
32949 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32950 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32951 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32952 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32954 /* SSE or 3DNow!A */
32955 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32956 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32957 IX86_BUILTIN_MASKMOVQ);
32959 /* SSE2 */
32960 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32961 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32963 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32964 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32965 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32966 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32968 /* SSE3. */
32969 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32970 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32971 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32972 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32974 /* AES */
32975 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32976 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32977 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32978 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32979 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32980 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32981 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32982 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32983 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32984 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32985 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32986 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
32988 /* PCLMUL */
32989 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32990 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32992 /* RDRND */
32993 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32994 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32995 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32996 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32997 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32998 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32999 IX86_BUILTIN_RDRAND64_STEP);
33001 /* AVX2 */
33002 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
33003 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
33004 IX86_BUILTIN_GATHERSIV2DF);
33006 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
33007 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
33008 IX86_BUILTIN_GATHERSIV4DF);
33010 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
33011 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
33012 IX86_BUILTIN_GATHERDIV2DF);
33014 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
33015 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
33016 IX86_BUILTIN_GATHERDIV4DF);
33018 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
33019 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
33020 IX86_BUILTIN_GATHERSIV4SF);
33022 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
33023 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
33024 IX86_BUILTIN_GATHERSIV8SF);
33026 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
33027 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
33028 IX86_BUILTIN_GATHERDIV4SF);
33030 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
33031 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
33032 IX86_BUILTIN_GATHERDIV8SF);
33034 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
33035 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
33036 IX86_BUILTIN_GATHERSIV2DI);
33038 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
33039 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
33040 IX86_BUILTIN_GATHERSIV4DI);
33042 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
33043 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
33044 IX86_BUILTIN_GATHERDIV2DI);
33046 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
33047 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
33048 IX86_BUILTIN_GATHERDIV4DI);
33050 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
33051 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
33052 IX86_BUILTIN_GATHERSIV4SI);
33054 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
33055 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
33056 IX86_BUILTIN_GATHERSIV8SI);
33058 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
33059 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
33060 IX86_BUILTIN_GATHERDIV4SI);
33062 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
33063 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
33064 IX86_BUILTIN_GATHERDIV8SI);
33066 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
33067 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
33068 IX86_BUILTIN_GATHERALTSIV4DF);
33070 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
33071 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
33072 IX86_BUILTIN_GATHERALTDIV8SF);
33074 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
33075 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
33076 IX86_BUILTIN_GATHERALTSIV4DI);
33078 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
33079 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
33080 IX86_BUILTIN_GATHERALTDIV8SI);
33082 /* AVX512F */
33083 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
33084 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
33085 IX86_BUILTIN_GATHER3SIV16SF);
33087 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
33088 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
33089 IX86_BUILTIN_GATHER3SIV8DF);
33091 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
33092 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
33093 IX86_BUILTIN_GATHER3DIV16SF);
33095 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
33096 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
33097 IX86_BUILTIN_GATHER3DIV8DF);
33099 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
33100 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
33101 IX86_BUILTIN_GATHER3SIV16SI);
33103 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
33104 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
33105 IX86_BUILTIN_GATHER3SIV8DI);
33107 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
33108 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
33109 IX86_BUILTIN_GATHER3DIV16SI);
33111 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
33112 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
33113 IX86_BUILTIN_GATHER3DIV8DI);
33115 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
33116 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
33117 IX86_BUILTIN_GATHER3ALTSIV8DF);
33119 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
33120 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
33121 IX86_BUILTIN_GATHER3ALTDIV16SF);
33123 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
33124 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
33125 IX86_BUILTIN_GATHER3ALTSIV8DI);
33127 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
33128 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
33129 IX86_BUILTIN_GATHER3ALTDIV16SI);
33131 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
33132 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
33133 IX86_BUILTIN_SCATTERSIV16SF);
33135 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
33136 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
33137 IX86_BUILTIN_SCATTERSIV8DF);
33139 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
33140 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
33141 IX86_BUILTIN_SCATTERDIV16SF);
33143 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
33144 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
33145 IX86_BUILTIN_SCATTERDIV8DF);
33147 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
33148 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
33149 IX86_BUILTIN_SCATTERSIV16SI);
33151 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
33152 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
33153 IX86_BUILTIN_SCATTERSIV8DI);
33155 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
33156 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
33157 IX86_BUILTIN_SCATTERDIV16SI);
33159 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
33160 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
33161 IX86_BUILTIN_SCATTERDIV8DI);
33163 /* AVX512VL */
33164 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
33165 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
33166 IX86_BUILTIN_GATHER3SIV2DF);
33168 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
33169 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
33170 IX86_BUILTIN_GATHER3SIV4DF);
33172 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
33173 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
33174 IX86_BUILTIN_GATHER3DIV2DF);
33176 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
33177 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
33178 IX86_BUILTIN_GATHER3DIV4DF);
33180 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
33181 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
33182 IX86_BUILTIN_GATHER3SIV4SF);
33184 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
33185 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
33186 IX86_BUILTIN_GATHER3SIV8SF);
33188 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
33189 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
33190 IX86_BUILTIN_GATHER3DIV4SF);
33192 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
33193 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
33194 IX86_BUILTIN_GATHER3DIV8SF);
33196 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
33197 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
33198 IX86_BUILTIN_GATHER3SIV2DI);
33200 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
33201 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
33202 IX86_BUILTIN_GATHER3SIV4DI);
33204 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
33205 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
33206 IX86_BUILTIN_GATHER3DIV2DI);
33208 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
33209 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
33210 IX86_BUILTIN_GATHER3DIV4DI);
33212 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
33213 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
33214 IX86_BUILTIN_GATHER3SIV4SI);
33216 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
33217 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
33218 IX86_BUILTIN_GATHER3SIV8SI);
33220 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
33221 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
33222 IX86_BUILTIN_GATHER3DIV4SI);
33224 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
33225 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
33226 IX86_BUILTIN_GATHER3DIV8SI);
33228 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
33229 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
33230 IX86_BUILTIN_GATHER3ALTSIV4DF);
33232 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
33233 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
33234 IX86_BUILTIN_GATHER3ALTDIV8SF);
33236 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
33237 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
33238 IX86_BUILTIN_GATHER3ALTSIV4DI);
33240 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
33241 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
33242 IX86_BUILTIN_GATHER3ALTDIV8SI);
33244 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
33245 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
33246 IX86_BUILTIN_SCATTERSIV8SF);
33248 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
33249 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
33250 IX86_BUILTIN_SCATTERSIV4SF);
33252 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
33253 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
33254 IX86_BUILTIN_SCATTERSIV4DF);
33256 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
33257 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
33258 IX86_BUILTIN_SCATTERSIV2DF);
33260 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
33261 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
33262 IX86_BUILTIN_SCATTERDIV8SF);
33264 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
33265 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
33266 IX86_BUILTIN_SCATTERDIV4SF);
33268 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
33269 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
33270 IX86_BUILTIN_SCATTERDIV4DF);
33272 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
33273 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
33274 IX86_BUILTIN_SCATTERDIV2DF);
33276 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
33277 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
33278 IX86_BUILTIN_SCATTERSIV8SI);
33280 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
33281 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
33282 IX86_BUILTIN_SCATTERSIV4SI);
33284 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
33285 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
33286 IX86_BUILTIN_SCATTERSIV4DI);
33288 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
33289 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
33290 IX86_BUILTIN_SCATTERSIV2DI);
33292 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
33293 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
33294 IX86_BUILTIN_SCATTERDIV8SI);
33296 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
33297 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
33298 IX86_BUILTIN_SCATTERDIV4SI);
33300 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
33301 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
33302 IX86_BUILTIN_SCATTERDIV4DI);
33304 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
33305 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
33306 IX86_BUILTIN_SCATTERDIV2DI);
33307 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
33308 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
33309 IX86_BUILTIN_SCATTERALTSIV8DF);
33311 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
33312 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
33313 IX86_BUILTIN_SCATTERALTDIV16SF);
33315 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
33316 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
33317 IX86_BUILTIN_SCATTERALTSIV8DI);
33319 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
33320 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
33321 IX86_BUILTIN_SCATTERALTDIV16SI);
33323 /* AVX512PF */
33324 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
33325 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33326 IX86_BUILTIN_GATHERPFDPD);
33327 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
33328 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33329 IX86_BUILTIN_GATHERPFDPS);
33330 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
33331 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33332 IX86_BUILTIN_GATHERPFQPD);
33333 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
33334 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33335 IX86_BUILTIN_GATHERPFQPS);
33336 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
33337 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33338 IX86_BUILTIN_SCATTERPFDPD);
33339 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
33340 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33341 IX86_BUILTIN_SCATTERPFDPS);
33342 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
33343 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33344 IX86_BUILTIN_SCATTERPFQPD);
33345 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
33346 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33347 IX86_BUILTIN_SCATTERPFQPS);
33349 /* SHA */
33350 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
33351 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
33352 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
33353 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
33354 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
33355 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
33356 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
33357 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
33358 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
33359 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
33360 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
33361 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
33362 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
33363 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
33365 /* RTM. */
33366 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
33367 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
33369 /* MMX access to the vec_init patterns. */
33370 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
33371 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
33373 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
33374 V4HI_FTYPE_HI_HI_HI_HI,
33375 IX86_BUILTIN_VEC_INIT_V4HI);
33377 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
33378 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
33379 IX86_BUILTIN_VEC_INIT_V8QI);
33381 /* Access to the vec_extract patterns. */
33382 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
33383 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
33384 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
33385 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
33386 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
33387 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
33388 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
33389 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
33390 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
33391 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
33393 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33394 "__builtin_ia32_vec_ext_v4hi",
33395 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
33397 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
33398 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
33400 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
33401 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
33403 /* Access to the vec_set patterns. */
33404 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
33405 "__builtin_ia32_vec_set_v2di",
33406 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
33408 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
33409 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
33411 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
33412 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
33414 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
33415 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
33417 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33418 "__builtin_ia32_vec_set_v4hi",
33419 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
33421 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
33422 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
33424 /* RDSEED */
33425 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
33426 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
33427 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
33428 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
33429 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
33430 "__builtin_ia32_rdseed_di_step",
33431 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
33433 /* ADCX */
33434 def_builtin (0, "__builtin_ia32_addcarryx_u32",
33435 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
33436 def_builtin (OPTION_MASK_ISA_64BIT,
33437 "__builtin_ia32_addcarryx_u64",
33438 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33439 IX86_BUILTIN_ADDCARRYX64);
33441 /* SBB */
33442 def_builtin (0, "__builtin_ia32_sbb_u32",
33443 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
33444 def_builtin (OPTION_MASK_ISA_64BIT,
33445 "__builtin_ia32_sbb_u64",
33446 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33447 IX86_BUILTIN_SBB64);
33449 /* Read/write FLAGS. */
33450 def_builtin (0, "__builtin_ia32_readeflags_u32",
33451 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33452 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
33453 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33454 def_builtin (0, "__builtin_ia32_writeeflags_u32",
33455 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
33456 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
33457 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
33459 /* CLFLUSHOPT. */
33460 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
33461 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
33463 /* CLWB. */
33464 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
33465 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
33467 /* MONITORX and MWAITX. */
33468 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
33469 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33470 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33471 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33473 /* CLZERO. */
33474 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33475 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33477 /* Add FMA4 multi-arg argument instructions */
33478 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33480 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33481 if (d->name == 0)
33482 continue;
33484 ftype = (enum ix86_builtin_func_type) d->flag;
33485 def_builtin_const (d->mask, d->name, ftype, d->code);
33487 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33488 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33489 ARRAY_SIZE (bdesc_multi_arg) - 1);
33492 static void
33493 ix86_init_mpx_builtins ()
33495 const struct builtin_description * d;
33496 enum ix86_builtin_func_type ftype;
33497 tree decl;
33498 size_t i;
33500 for (i = 0, d = bdesc_mpx;
33501 i < ARRAY_SIZE (bdesc_mpx);
33502 i++, d++)
33504 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33505 if (d->name == 0)
33506 continue;
33508 ftype = (enum ix86_builtin_func_type) d->flag;
33509 decl = def_builtin (d->mask, d->name, ftype, d->code);
33511 /* Without leaf and nothrow flags, abnormal edges may follow
33512 calls to MPX builtins when setjmp is present
33513 in the function. Since we may have a lot
33514 of MPX builtin calls, this causes lots of useless
33515 edges and enormous PHI nodes. To avoid this we mark
33516 MPX builtins as leaf and nothrow. */
33517 if (decl)
33519 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33520 NULL_TREE);
33521 TREE_NOTHROW (decl) = 1;
33523 else
33525 ix86_builtins_isa[(int)d->code].leaf_p = true;
33526 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33529 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33530 IX86_BUILTIN__BDESC_MPX_FIRST,
33531 ARRAY_SIZE (bdesc_mpx) - 1);
33533 for (i = 0, d = bdesc_mpx_const;
33534 i < ARRAY_SIZE (bdesc_mpx_const);
33535 i++, d++)
33537 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33538 if (d->name == 0)
33539 continue;
33541 ftype = (enum ix86_builtin_func_type) d->flag;
33542 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33544 if (decl)
33546 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33547 NULL_TREE);
33548 TREE_NOTHROW (decl) = 1;
33550 else
33552 ix86_builtins_isa[(int)d->code].leaf_p = true;
33553 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33556 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33557 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33558 ARRAY_SIZE (bdesc_mpx_const) - 1);
33560 #undef BDESC_VERIFY
33561 #undef BDESC_VERIFYS
33563 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33564 to return a pointer to VERSION_DECL if the outcome of the expression
33565 formed by PREDICATE_CHAIN is true. This function will be called during
33566 version dispatch to decide which function version to execute. It returns
33567 the basic block at the end, to which more conditions can be added. */
33569 static basic_block
33570 add_condition_to_bb (tree function_decl, tree version_decl,
33571 tree predicate_chain, basic_block new_bb)
33573 gimple *return_stmt;
33574 tree convert_expr, result_var;
33575 gimple *convert_stmt;
33576 gimple *call_cond_stmt;
33577 gimple *if_else_stmt;
33579 basic_block bb1, bb2, bb3;
33580 edge e12, e23;
33582 tree cond_var, and_expr_var = NULL_TREE;
33583 gimple_seq gseq;
33585 tree predicate_decl, predicate_arg;
33587 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33589 gcc_assert (new_bb != NULL);
33590 gseq = bb_seq (new_bb);
33593 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33594 build_fold_addr_expr (version_decl));
33595 result_var = create_tmp_var (ptr_type_node);
33596 convert_stmt = gimple_build_assign (result_var, convert_expr);
33597 return_stmt = gimple_build_return (result_var);
33599 if (predicate_chain == NULL_TREE)
33601 gimple_seq_add_stmt (&gseq, convert_stmt);
33602 gimple_seq_add_stmt (&gseq, return_stmt);
33603 set_bb_seq (new_bb, gseq);
33604 gimple_set_bb (convert_stmt, new_bb);
33605 gimple_set_bb (return_stmt, new_bb);
33606 pop_cfun ();
33607 return new_bb;
33610 while (predicate_chain != NULL)
33612 cond_var = create_tmp_var (integer_type_node);
33613 predicate_decl = TREE_PURPOSE (predicate_chain);
33614 predicate_arg = TREE_VALUE (predicate_chain);
33615 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33616 gimple_call_set_lhs (call_cond_stmt, cond_var);
33618 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33619 gimple_set_bb (call_cond_stmt, new_bb);
33620 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33622 predicate_chain = TREE_CHAIN (predicate_chain);
33624 if (and_expr_var == NULL)
33625 and_expr_var = cond_var;
33626 else
33628 gimple *assign_stmt;
33629 /* Use MIN_EXPR to check whether any integer in the chain is zero:
33630 and_expr_var = min_expr <cond_var, and_expr_var>. */
33631 assign_stmt = gimple_build_assign (and_expr_var,
33632 build2 (MIN_EXPR, integer_type_node,
33633 cond_var, and_expr_var));
33635 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33636 gimple_set_bb (assign_stmt, new_bb);
33637 gimple_seq_add_stmt (&gseq, assign_stmt);
33641 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33642 integer_zero_node,
33643 NULL_TREE, NULL_TREE);
33644 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33645 gimple_set_bb (if_else_stmt, new_bb);
33646 gimple_seq_add_stmt (&gseq, if_else_stmt);
33648 gimple_seq_add_stmt (&gseq, convert_stmt);
33649 gimple_seq_add_stmt (&gseq, return_stmt);
33650 set_bb_seq (new_bb, gseq);
33652 bb1 = new_bb;
33653 e12 = split_block (bb1, if_else_stmt);
33654 bb2 = e12->dest;
33655 e12->flags &= ~EDGE_FALLTHRU;
33656 e12->flags |= EDGE_TRUE_VALUE;
33658 e23 = split_block (bb2, return_stmt);
33660 gimple_set_bb (convert_stmt, bb2);
33661 gimple_set_bb (return_stmt, bb2);
33663 bb3 = e23->dest;
33664 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33666 remove_edge (e23);
33667 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33669 pop_cfun ();
33671 return bb3;
33674 /* This parses the attribute arguments to target in DECL and determines
33675 the right builtin to use to match the platform specification.
33676 It returns the priority value for this version decl. If PREDICATE_LIST
33677 is not NULL, it stores the list of cpu features that need to be checked
33678 before dispatching this function. */
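/* Illustrative example (assumed usage, not taken from this file): for a
   version declared with __attribute__ ((target ("arch=haswell"))) this
   returns the priority P_PROC_AVX2 and, when PREDICATE_LIST is non-NULL,
   records a predicate equivalent to __builtin_cpu_is ("haswell"); a version
   declared with __attribute__ ((target ("avx2"))) instead records
   __builtin_cpu_supports ("avx2") and returns P_AVX2.  */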
33680 static unsigned int
33681 get_builtin_code_for_version (tree decl, tree *predicate_list)
33683 tree attrs;
33684 struct cl_target_option cur_target;
33685 tree target_node;
33686 struct cl_target_option *new_target;
33687 const char *arg_str = NULL;
33688 const char *attrs_str = NULL;
33689 char *tok_str = NULL;
33690 char *token;
33692 /* Priority of i386 features, greater value is higher priority. This is
33693 used to decide the order in which function dispatch must happen. For
33694 instance, a version specialized for SSE4.2 should be checked for dispatch
33695 before a version for SSE3, as SSE4.2 implies SSE3. */
33696 enum feature_priority
33698 P_ZERO = 0,
33699 P_MMX,
33700 P_SSE,
33701 P_SSE2,
33702 P_SSE3,
33703 P_SSSE3,
33704 P_PROC_SSSE3,
33705 P_SSE4_A,
33706 P_PROC_SSE4_A,
33707 P_SSE4_1,
33708 P_SSE4_2,
33709 P_PROC_SSE4_2,
33710 P_POPCNT,
33711 P_AES,
33712 P_PCLMUL,
33713 P_AVX,
33714 P_PROC_AVX,
33715 P_BMI,
33716 P_PROC_BMI,
33717 P_FMA4,
33718 P_XOP,
33719 P_PROC_XOP,
33720 P_FMA,
33721 P_PROC_FMA,
33722 P_BMI2,
33723 P_AVX2,
33724 P_PROC_AVX2,
33725 P_AVX512F,
33726 P_PROC_AVX512F
33729 enum feature_priority priority = P_ZERO;
33731 /* These are the target attribute strings for which a dispatcher is
33732 available, from fold_builtin_cpu. */
33734 static struct _feature_list
33736 const char *const name;
33737 const enum feature_priority priority;
33739 const feature_list[] =
33741 {"mmx", P_MMX},
33742 {"sse", P_SSE},
33743 {"sse2", P_SSE2},
33744 {"sse3", P_SSE3},
33745 {"sse4a", P_SSE4_A},
33746 {"ssse3", P_SSSE3},
33747 {"sse4.1", P_SSE4_1},
33748 {"sse4.2", P_SSE4_2},
33749 {"popcnt", P_POPCNT},
33750 {"aes", P_AES},
33751 {"pclmul", P_PCLMUL},
33752 {"avx", P_AVX},
33753 {"bmi", P_BMI},
33754 {"fma4", P_FMA4},
33755 {"xop", P_XOP},
33756 {"fma", P_FMA},
33757 {"bmi2", P_BMI2},
33758 {"avx2", P_AVX2},
33759 {"avx512f", P_AVX512F}
33763 static unsigned int NUM_FEATURES
33764 = sizeof (feature_list) / sizeof (struct _feature_list);
33766 unsigned int i;
33768 tree predicate_chain = NULL_TREE;
33769 tree predicate_decl, predicate_arg;
33771 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33772 gcc_assert (attrs != NULL);
33774 attrs = TREE_VALUE (TREE_VALUE (attrs));
33776 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33777 attrs_str = TREE_STRING_POINTER (attrs);
33779 /* Return priority zero for default function. */
33780 if (strcmp (attrs_str, "default") == 0)
33781 return 0;
33783 /* Handle arch= if specified. For priority, set it to be 1 more than
33784 the best instruction set the processor can handle. For instance, if
33785 there is a version for atom and a version for ssse3 (the highest ISA
33786 priority for atom), the atom version must be checked for dispatch
33787 before the ssse3 version. */
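/* For example (values taken from the enum above): a version with
   target ("arch=silvermont") gets priority P_PROC_SSE4_2, which sits one
   step above P_SSE4_2, so it is checked for dispatch before a plain
   target ("sse4.2") version.  */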
33788 if (strstr (attrs_str, "arch=") != NULL)
33790 cl_target_option_save (&cur_target, &global_options);
33791 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33792 &global_options_set);
33794 gcc_assert (target_node);
33795 new_target = TREE_TARGET_OPTION (target_node);
33796 gcc_assert (new_target);
33798 if (new_target->arch_specified && new_target->arch > 0)
33800 switch (new_target->arch)
33802 case PROCESSOR_CORE2:
33803 arg_str = "core2";
33804 priority = P_PROC_SSSE3;
33805 break;
33806 case PROCESSOR_NEHALEM:
33807 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33809 arg_str = "westmere";
33810 priority = P_AES;
33812 else
33814 /* We translate "arch=corei7" and "arch=nehalem" to
33815 "corei7" so that they will be mapped to M_INTEL_COREI7
33816 as the cpu type, covering all M_INTEL_COREI7_XXXs. */
33817 arg_str = "corei7";
33818 priority = P_PROC_SSE4_2;
33820 break;
33821 case PROCESSOR_SANDYBRIDGE:
33822 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33823 arg_str = "ivybridge";
33824 else
33825 arg_str = "sandybridge";
33826 priority = P_PROC_AVX;
33827 break;
33828 case PROCESSOR_HASWELL:
33829 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33830 arg_str = "skylake-avx512";
33831 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33832 arg_str = "skylake";
33833 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33834 arg_str = "broadwell";
33835 else
33836 arg_str = "haswell";
33837 priority = P_PROC_AVX2;
33838 break;
33839 case PROCESSOR_BONNELL:
33840 arg_str = "bonnell";
33841 priority = P_PROC_SSSE3;
33842 break;
33843 case PROCESSOR_KNL:
33844 arg_str = "knl";
33845 priority = P_PROC_AVX512F;
33846 break;
33847 case PROCESSOR_SILVERMONT:
33848 arg_str = "silvermont";
33849 priority = P_PROC_SSE4_2;
33850 break;
33851 case PROCESSOR_AMDFAM10:
33852 arg_str = "amdfam10h";
33853 priority = P_PROC_SSE4_A;
33854 break;
33855 case PROCESSOR_BTVER1:
33856 arg_str = "btver1";
33857 priority = P_PROC_SSE4_A;
33858 break;
33859 case PROCESSOR_BTVER2:
33860 arg_str = "btver2";
33861 priority = P_PROC_BMI;
33862 break;
33863 case PROCESSOR_BDVER1:
33864 arg_str = "bdver1";
33865 priority = P_PROC_XOP;
33866 break;
33867 case PROCESSOR_BDVER2:
33868 arg_str = "bdver2";
33869 priority = P_PROC_FMA;
33870 break;
33871 case PROCESSOR_BDVER3:
33872 arg_str = "bdver3";
33873 priority = P_PROC_FMA;
33874 break;
33875 case PROCESSOR_BDVER4:
33876 arg_str = "bdver4";
33877 priority = P_PROC_AVX2;
33878 break;
33879 case PROCESSOR_ZNVER1:
33880 arg_str = "znver1";
33881 priority = P_PROC_AVX2;
33882 break;
33886 cl_target_option_restore (&global_options, &cur_target);
33888 if (predicate_list && arg_str == NULL)
33890 error_at (DECL_SOURCE_LOCATION (decl),
33891 "No dispatcher found for the versioning attributes");
33892 return 0;
33895 if (predicate_list)
33897 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33898 /* For a C string literal the length includes the trailing NULL. */
33899 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33900 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33901 predicate_chain);
33905 /* Process feature name. */
33906 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33907 strcpy (tok_str, attrs_str);
33908 token = strtok (tok_str, ",");
33909 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33911 while (token != NULL)
33913 /* Do not process "arch=" */
33914 if (strncmp (token, "arch=", 5) == 0)
33916 token = strtok (NULL, ",");
33917 continue;
33919 for (i = 0; i < NUM_FEATURES; ++i)
33921 if (strcmp (token, feature_list[i].name) == 0)
33923 if (predicate_list)
33925 predicate_arg = build_string_literal (
33926 strlen (feature_list[i].name) + 1,
33927 feature_list[i].name);
33928 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33929 predicate_chain);
33931 /* Find the maximum priority feature. */
33932 if (feature_list[i].priority > priority)
33933 priority = feature_list[i].priority;
33935 break;
33938 if (predicate_list && i == NUM_FEATURES)
33940 error_at (DECL_SOURCE_LOCATION (decl),
33941 "No dispatcher found for %s", token);
33942 return 0;
33944 token = strtok (NULL, ",");
33946 free (tok_str);
33948 if (predicate_list && predicate_chain == NULL_TREE)
33950 error_at (DECL_SOURCE_LOCATION (decl),
33951 "No dispatcher found for the versioning attributes : %s",
33952 attrs_str);
33953 return 0;
33955 else if (predicate_list)
33957 predicate_chain = nreverse (predicate_chain);
33958 *predicate_list = predicate_chain;
33961 return priority;
33964 /* This compares the priority of target features in function DECL1
33965 and DECL2. It returns positive value if DECL1 is higher priority,
33966 negative value if DECL2 is higher priority and 0 if they are the
33967 same. */
33969 static int
33970 ix86_compare_version_priority (tree decl1, tree decl2)
33972 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33973 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33975 return (int)priority1 - (int)priority2;
33978 /* V1 and V2 point to function versions with different priorities
33979 based on the target ISA. This function compares their priorities. */
33981 static int
33982 feature_compare (const void *v1, const void *v2)
33984 typedef struct _function_version_info
33986 tree version_decl;
33987 tree predicate_chain;
33988 unsigned int dispatch_priority;
33989 } function_version_info;
33991 const function_version_info c1 = *(const function_version_info *)v1;
33992 const function_version_info c2 = *(const function_version_info *)v2;
33993 return (c2.dispatch_priority - c1.dispatch_priority);
33996 /* This function generates the dispatch function for
33997 multi-versioned functions. DISPATCH_DECL is the function which will
33998 contain the dispatch logic. FNDECLS holds the function choices for
33999 dispatch and is actually a vector of decls. EMPTY_BB is the basic block pointer
34000 in DISPATCH_DECL in which the dispatch code is generated. */
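/* A minimal source-level picture of the feature this implements (user code,
   shown only for illustration; foo is a hypothetical name):

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("avx2")))    int foo (void) { return 2; }

   The dispatch body built here first calls __builtin_cpu_init, then tests
   the non-default versions in decreasing priority order and returns the
   first one whose predicates hold, falling back to the default version.  */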
34002 static int
34003 dispatch_function_versions (tree dispatch_decl,
34004 void *fndecls_p,
34005 basic_block *empty_bb)
34007 tree default_decl;
34008 gimple *ifunc_cpu_init_stmt;
34009 gimple_seq gseq;
34010 int ix;
34011 tree ele;
34012 vec<tree> *fndecls;
34013 unsigned int num_versions = 0;
34014 unsigned int actual_versions = 0;
34015 unsigned int i;
34017 struct _function_version_info
34019 tree version_decl;
34020 tree predicate_chain;
34021 unsigned int dispatch_priority;
34022 }*function_version_info;
34024 gcc_assert (dispatch_decl != NULL
34025 && fndecls_p != NULL
34026 && empty_bb != NULL);
34028 /* fndecls_p is actually a vector. */
34029 fndecls = static_cast<vec<tree> *> (fndecls_p);
34031 /* There must be at least one version in addition to the default. */
34032 num_versions = fndecls->length ();
34033 gcc_assert (num_versions >= 2);
34035 function_version_info = (struct _function_version_info *)
34036 XNEWVEC (struct _function_version_info, (num_versions - 1));
34038 /* The first version in the vector is the default decl. */
34039 default_decl = (*fndecls)[0];
34041 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
34043 gseq = bb_seq (*empty_bb);
34044 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
34045 constructors, so explicitly call __builtin_cpu_init here. */
34046 ifunc_cpu_init_stmt = gimple_build_call_vec (
34047 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
34048 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
34049 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
34050 set_bb_seq (*empty_bb, gseq);
34052 pop_cfun ();
34055 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
34057 tree version_decl = ele;
34058 tree predicate_chain = NULL_TREE;
34059 unsigned int priority;
34060 /* Get attribute string, parse it and find the right predicate decl.
34061 The predicate function could be a lengthy combination of many
34062 features, like arch-type and various isa-variants. */
34063 priority = get_builtin_code_for_version (version_decl,
34064 &predicate_chain);
34066 if (predicate_chain == NULL_TREE)
34067 continue;
34069 function_version_info [actual_versions].version_decl = version_decl;
34070 function_version_info [actual_versions].predicate_chain
34071 = predicate_chain;
34072 function_version_info [actual_versions].dispatch_priority = priority;
34073 actual_versions++;
34076 /* Sort the versions according to descending order of dispatch priority. The
34077 priority is based on the ISA. This is not a perfect solution. There
34078 could still be ambiguity. If more than one function version is suitable
34079 to execute, which one should be dispatched? In future, allow the user
34080 to specify a dispatch priority next to the version. */
34081 qsort (function_version_info, actual_versions,
34082 sizeof (struct _function_version_info), feature_compare);
34084 for (i = 0; i < actual_versions; ++i)
34085 *empty_bb = add_condition_to_bb (dispatch_decl,
34086 function_version_info[i].version_decl,
34087 function_version_info[i].predicate_chain,
34088 *empty_bb);
34090 /* Dispatch the default version at the end. */
34091 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
34092 NULL, *empty_bb);
34094 free (function_version_info);
34095 return 0;
34098 /* This function changes the assembler name for functions that are
34099 versions. If DECL is a function version and has a "target"
34100 attribute, it appends the attribute string to its assembler name. */
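/* For example (an assumed declaration, not from this file): a version of
   foo declared with __attribute__ ((target ("avx2"))) gets the assembler
   name "foo.avx2", while the "default" version keeps its original name.  */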
34102 static tree
34103 ix86_mangle_function_version_assembler_name (tree decl, tree id)
34105 tree version_attr;
34106 const char *orig_name, *version_string;
34107 char *attr_str, *assembler_name;
34109 if (DECL_DECLARED_INLINE_P (decl)
34110 && lookup_attribute ("gnu_inline",
34111 DECL_ATTRIBUTES (decl)))
34112 error_at (DECL_SOURCE_LOCATION (decl),
34113 "Function versions cannot be marked as gnu_inline,"
34114 " bodies have to be generated");
34116 if (DECL_VIRTUAL_P (decl)
34117 || DECL_VINDEX (decl))
34118 sorry ("Virtual function multiversioning not supported");
34120 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
34122 /* target attribute string cannot be NULL. */
34123 gcc_assert (version_attr != NULL_TREE);
34125 orig_name = IDENTIFIER_POINTER (id);
34126 version_string
34127 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
34129 if (strcmp (version_string, "default") == 0)
34130 return id;
34132 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
34133 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
34135 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
34137 /* Allow assembler name to be modified if already set. */
34138 if (DECL_ASSEMBLER_NAME_SET_P (decl))
34139 SET_DECL_RTL (decl, NULL);
34141 tree ret = get_identifier (assembler_name);
34142 XDELETEVEC (attr_str);
34143 XDELETEVEC (assembler_name);
34144 return ret;
34148 static tree
34149 ix86_mangle_decl_assembler_name (tree decl, tree id)
34151 /* For function version, add the target suffix to the assembler name. */
34152 if (TREE_CODE (decl) == FUNCTION_DECL
34153 && DECL_FUNCTION_VERSIONED (decl))
34154 id = ix86_mangle_function_version_assembler_name (decl, id);
34155 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
34156 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
34157 #endif
34159 return id;
34162 /* Make a dispatcher declaration for the multi-versioned function DECL.
34163 Calls to the DECL function will be replaced with calls to the dispatcher
34164 by the front-end. Returns the decl of the dispatcher function. */
34166 static tree
34167 ix86_get_function_versions_dispatcher (void *decl)
34169 tree fn = (tree) decl;
34170 struct cgraph_node *node = NULL;
34171 struct cgraph_node *default_node = NULL;
34172 struct cgraph_function_version_info *node_v = NULL;
34173 struct cgraph_function_version_info *first_v = NULL;
34175 tree dispatch_decl = NULL;
34177 struct cgraph_function_version_info *default_version_info = NULL;
34179 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
34181 node = cgraph_node::get (fn);
34182 gcc_assert (node != NULL);
34184 node_v = node->function_version ();
34185 gcc_assert (node_v != NULL);
34187 if (node_v->dispatcher_resolver != NULL)
34188 return node_v->dispatcher_resolver;
34190 /* Find the default version and make it the first node. */
34191 first_v = node_v;
34192 /* Go to the beginning of the chain. */
34193 while (first_v->prev != NULL)
34194 first_v = first_v->prev;
34195 default_version_info = first_v;
34196 while (default_version_info != NULL)
34198 if (is_function_default_version
34199 (default_version_info->this_node->decl))
34200 break;
34201 default_version_info = default_version_info->next;
34204 /* If there is no default node, just return NULL. */
34205 if (default_version_info == NULL)
34206 return NULL;
34208 /* Make default info the first node. */
34209 if (first_v != default_version_info)
34211 default_version_info->prev->next = default_version_info->next;
34212 if (default_version_info->next)
34213 default_version_info->next->prev = default_version_info->prev;
34214 first_v->prev = default_version_info;
34215 default_version_info->next = first_v;
34216 default_version_info->prev = NULL;
34219 default_node = default_version_info->this_node;
34221 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
34222 if (targetm.has_ifunc_p ())
34224 struct cgraph_function_version_info *it_v = NULL;
34225 struct cgraph_node *dispatcher_node = NULL;
34226 struct cgraph_function_version_info *dispatcher_version_info = NULL;
34228 /* Right now, the dispatching is done via ifunc. */
34229 dispatch_decl = make_dispatcher_decl (default_node->decl);
34231 dispatcher_node = cgraph_node::get_create (dispatch_decl);
34232 gcc_assert (dispatcher_node != NULL);
34233 dispatcher_node->dispatcher_function = 1;
34234 dispatcher_version_info
34235 = dispatcher_node->insert_new_function_version ();
34236 dispatcher_version_info->next = default_version_info;
34237 dispatcher_node->definition = 1;
34239 /* Set the dispatcher for all the versions. */
34240 it_v = default_version_info;
34241 while (it_v != NULL)
34243 it_v->dispatcher_resolver = dispatch_decl;
34244 it_v = it_v->next;
34247 else
34248 #endif
34250 error_at (DECL_SOURCE_LOCATION (default_node->decl),
34251 "multiversioning needs ifunc which is not supported "
34252 "on this target");
34255 return dispatch_decl;
34258 /* Make the resolver function decl to dispatch the versions of
34259 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
34260 ifunc alias that will point to the created resolver. Create an
34261 empty basic block in the resolver and store the pointer in
34262 EMPTY_BB. Return the decl of the resolver function. */
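/* Sketch of the result for a versioned function foo (the exact names come
   from make_unique_name and are shown here only for illustration): a
   resolver "foo.resolver" returning void * is created with an empty lowered
   body, and IFUNC_ALIAS_DECL is given the attribute ifunc ("foo.resolver"),
   so the dynamic loader runs the resolver once to pick the version to
   bind.  */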
34264 static tree
34265 make_resolver_func (const tree default_decl,
34266 const tree ifunc_alias_decl,
34267 basic_block *empty_bb)
34269 char *resolver_name;
34270 tree decl, type, decl_name, t;
34272 /* IFUNCs have to be globally visible. So, if the default_decl is
34273 not, then the name of the IFUNC should be made unique. */
34274 if (TREE_PUBLIC (default_decl) == 0)
34276 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
34277 symtab->change_decl_assembler_name (ifunc_alias_decl,
34278 get_identifier (ifunc_name));
34279 XDELETEVEC (ifunc_name);
34282 resolver_name = make_unique_name (default_decl, "resolver", false);
34284 /* The resolver function should return a (void *). */
34285 type = build_function_type_list (ptr_type_node, NULL_TREE);
34287 decl = build_fn_decl (resolver_name, type);
34288 decl_name = get_identifier (resolver_name);
34289 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
34291 DECL_NAME (decl) = decl_name;
34292 TREE_USED (decl) = 1;
34293 DECL_ARTIFICIAL (decl) = 1;
34294 DECL_IGNORED_P (decl) = 1;
34295 TREE_PUBLIC (decl) = 0;
34296 DECL_UNINLINABLE (decl) = 1;
34298 /* Resolver is not external, body is generated. */
34299 DECL_EXTERNAL (decl) = 0;
34300 DECL_EXTERNAL (ifunc_alias_decl) = 0;
34302 DECL_CONTEXT (decl) = NULL_TREE;
34303 DECL_INITIAL (decl) = make_node (BLOCK);
34304 DECL_STATIC_CONSTRUCTOR (decl) = 0;
34306 if (DECL_COMDAT_GROUP (default_decl)
34307 || TREE_PUBLIC (default_decl))
34309 /* In this case, each translation unit with a call to this
34310 versioned function will put out a resolver. Ensure it
34311 is comdat to keep just one copy. */
34312 DECL_COMDAT (decl) = 1;
34313 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
34315 /* Build result decl and add to function_decl. */
34316 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
34317 DECL_ARTIFICIAL (t) = 1;
34318 DECL_IGNORED_P (t) = 1;
34319 DECL_RESULT (decl) = t;
34321 gimplify_function_tree (decl);
34322 push_cfun (DECL_STRUCT_FUNCTION (decl));
34323 *empty_bb = init_lowered_empty_function (decl, false,
34324 profile_count::uninitialized ());
34326 cgraph_node::add_new_function (decl, true);
34327 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
34329 pop_cfun ();
34331 gcc_assert (ifunc_alias_decl != NULL);
34332 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
34333 DECL_ATTRIBUTES (ifunc_alias_decl)
34334 = make_attribute ("ifunc", resolver_name,
34335 DECL_ATTRIBUTES (ifunc_alias_decl));
34337 /* Create the alias for dispatch to resolver here. */
34338 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
34339 XDELETEVEC (resolver_name);
34340 return decl;
34343 /* Generate the dispatching code body to dispatch multi-versioned function
34344 DECL. The target hook is called to process the "target" attributes and
34345 provide the code to dispatch the right function at run-time. NODE points
34346 to the dispatcher decl whose body will be created. */
34348 static tree
34349 ix86_generate_version_dispatcher_body (void *node_p)
34351 tree resolver_decl;
34352 basic_block empty_bb;
34353 tree default_ver_decl;
34354 struct cgraph_node *versn;
34355 struct cgraph_node *node;
34357 struct cgraph_function_version_info *node_version_info = NULL;
34358 struct cgraph_function_version_info *versn_info = NULL;
34360 node = (cgraph_node *)node_p;
34362 node_version_info = node->function_version ();
34363 gcc_assert (node->dispatcher_function
34364 && node_version_info != NULL);
34366 if (node_version_info->dispatcher_resolver)
34367 return node_version_info->dispatcher_resolver;
34369 /* The first version in the chain corresponds to the default version. */
34370 default_ver_decl = node_version_info->next->this_node->decl;
34372 /* node is going to be an alias, so remove the finalized bit. */
34373 node->definition = false;
34375 resolver_decl = make_resolver_func (default_ver_decl,
34376 node->decl, &empty_bb);
34378 node_version_info->dispatcher_resolver = resolver_decl;
34380 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
34382 auto_vec<tree, 2> fn_ver_vec;
34384 for (versn_info = node_version_info->next; versn_info;
34385 versn_info = versn_info->next)
34387 versn = versn_info->this_node;
34388 /* Check for virtual functions here again, as by this time it should
34389 have been determined if this function needs a vtable index or
34390 not. This happens for methods in derived classes that override
34391 virtual methods in base classes but are not explicitly marked as
34392 virtual. */
34393 if (DECL_VINDEX (versn->decl))
34394 sorry ("Virtual function multiversioning not supported");
34396 fn_ver_vec.safe_push (versn->decl);
34399 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
34400 cgraph_edge::rebuild_edges ();
34401 pop_cfun ();
34402 return resolver_decl;
34404 /* This builds the processor_model struct type defined in
34405 libgcc/config/i386/cpuinfo.c */
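/* The layout mirrored here (as defined in libgcc's cpuinfo.c, reproduced
   for reference):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/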
34407 static tree
34408 build_processor_model_struct (void)
34410 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
34411 "__cpu_features"};
34412 tree field = NULL_TREE, field_chain = NULL_TREE;
34413 int i;
34414 tree type = make_node (RECORD_TYPE);
34416 /* The first 3 fields are unsigned int. */
34417 for (i = 0; i < 3; ++i)
34419 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34420 get_identifier (field_name[i]), unsigned_type_node);
34421 if (field_chain != NULL_TREE)
34422 DECL_CHAIN (field) = field_chain;
34423 field_chain = field;
34426 /* The last field is an array of unsigned integers of size one. */
34427 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34428 get_identifier (field_name[3]),
34429 build_array_type (unsigned_type_node,
34430 build_index_type (size_one_node)));
34431 if (field_chain != NULL_TREE)
34432 DECL_CHAIN (field) = field_chain;
34433 field_chain = field;
34435 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
34436 return type;
34439 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
34441 static tree
34442 make_var_decl (tree type, const char *name)
34444 tree new_decl;
34446 new_decl = build_decl (UNKNOWN_LOCATION,
34447 VAR_DECL,
34448 get_identifier(name),
34449 type);
34451 DECL_EXTERNAL (new_decl) = 1;
34452 TREE_STATIC (new_decl) = 1;
34453 TREE_PUBLIC (new_decl) = 1;
34454 DECL_INITIAL (new_decl) = 0;
34455 DECL_ARTIFICIAL (new_decl) = 0;
34456 DECL_PRESERVE_P (new_decl) = 1;
34458 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
34459 assemble_variable (new_decl, 0, 0, 0);
34461 return new_decl;
34464 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
34465 into an integer defined in libgcc/config/i386/cpuinfo.c */
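/* Examples of the folding done below (illustrative; the M_* and F_* values
   are the enumerators defined in this function):

     __builtin_cpu_is ("amd")
       -> (int) (__cpu_model.__cpu_vendor == M_AMD)
     __builtin_cpu_is ("haswell")
       -> (int) (__cpu_model.__cpu_subtype
                 == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START)
     __builtin_cpu_supports ("avx2")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))
*/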
34467 static tree
34468 fold_builtin_cpu (tree fndecl, tree *args)
34470 unsigned int i;
34471 enum ix86_builtins fn_code = (enum ix86_builtins)
34472 DECL_FUNCTION_CODE (fndecl);
34473 tree param_string_cst = NULL;
34475 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
34476 enum processor_features
34478 F_CMOV = 0,
34479 F_MMX,
34480 F_POPCNT,
34481 F_SSE,
34482 F_SSE2,
34483 F_SSE3,
34484 F_SSSE3,
34485 F_SSE4_1,
34486 F_SSE4_2,
34487 F_AVX,
34488 F_AVX2,
34489 F_SSE4_A,
34490 F_FMA4,
34491 F_XOP,
34492 F_FMA,
34493 F_AVX512F,
34494 F_BMI,
34495 F_BMI2,
34496 F_AES,
34497 F_PCLMUL,
34498 F_AVX512VL,
34499 F_AVX512BW,
34500 F_AVX512DQ,
34501 F_AVX512CD,
34502 F_AVX512ER,
34503 F_AVX512PF,
34504 F_AVX512VBMI,
34505 F_AVX512IFMA,
34506 F_AVX5124VNNIW,
34507 F_AVX5124FMAPS,
34508 F_AVX512VPOPCNTDQ,
34509 F_MAX
34512 /* These are the values for vendor types and cpu types and subtypes
34513 in cpuinfo.c. Cpu types and subtypes must have the corresponding
34514 start value subtracted from them. */
34515 enum processor_model
34517 M_INTEL = 1,
34518 M_AMD,
34519 M_CPU_TYPE_START,
34520 M_INTEL_BONNELL,
34521 M_INTEL_CORE2,
34522 M_INTEL_COREI7,
34523 M_AMDFAM10H,
34524 M_AMDFAM15H,
34525 M_INTEL_SILVERMONT,
34526 M_INTEL_KNL,
34527 M_AMD_BTVER1,
34528 M_AMD_BTVER2,
34529 M_AMDFAM17H,
34530 M_CPU_SUBTYPE_START,
34531 M_INTEL_COREI7_NEHALEM,
34532 M_INTEL_COREI7_WESTMERE,
34533 M_INTEL_COREI7_SANDYBRIDGE,
34534 M_AMDFAM10H_BARCELONA,
34535 M_AMDFAM10H_SHANGHAI,
34536 M_AMDFAM10H_ISTANBUL,
34537 M_AMDFAM15H_BDVER1,
34538 M_AMDFAM15H_BDVER2,
34539 M_AMDFAM15H_BDVER3,
34540 M_AMDFAM15H_BDVER4,
34541 M_AMDFAM17H_ZNVER1,
34542 M_INTEL_COREI7_IVYBRIDGE,
34543 M_INTEL_COREI7_HASWELL,
34544 M_INTEL_COREI7_BROADWELL,
34545 M_INTEL_COREI7_SKYLAKE,
34546 M_INTEL_COREI7_SKYLAKE_AVX512
34549 static struct _arch_names_table
34551 const char *const name;
34552 const enum processor_model model;
34554 const arch_names_table[] =
34556 {"amd", M_AMD},
34557 {"intel", M_INTEL},
34558 {"atom", M_INTEL_BONNELL},
34559 {"slm", M_INTEL_SILVERMONT},
34560 {"core2", M_INTEL_CORE2},
34561 {"corei7", M_INTEL_COREI7},
34562 {"nehalem", M_INTEL_COREI7_NEHALEM},
34563 {"westmere", M_INTEL_COREI7_WESTMERE},
34564 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34565 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34566 {"haswell", M_INTEL_COREI7_HASWELL},
34567 {"broadwell", M_INTEL_COREI7_BROADWELL},
34568 {"skylake", M_INTEL_COREI7_SKYLAKE},
34569 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34570 {"bonnell", M_INTEL_BONNELL},
34571 {"silvermont", M_INTEL_SILVERMONT},
34572 {"knl", M_INTEL_KNL},
34573 {"amdfam10h", M_AMDFAM10H},
34574 {"barcelona", M_AMDFAM10H_BARCELONA},
34575 {"shanghai", M_AMDFAM10H_SHANGHAI},
34576 {"istanbul", M_AMDFAM10H_ISTANBUL},
34577 {"btver1", M_AMD_BTVER1},
34578 {"amdfam15h", M_AMDFAM15H},
34579 {"bdver1", M_AMDFAM15H_BDVER1},
34580 {"bdver2", M_AMDFAM15H_BDVER2},
34581 {"bdver3", M_AMDFAM15H_BDVER3},
34582 {"bdver4", M_AMDFAM15H_BDVER4},
34583 {"btver2", M_AMD_BTVER2},
34584 {"amdfam17h", M_AMDFAM17H},
34585 {"znver1", M_AMDFAM17H_ZNVER1},
34588 static struct _isa_names_table
34590 const char *const name;
34591 const enum processor_features feature;
34593 const isa_names_table[] =
34595 {"cmov", F_CMOV},
34596 {"mmx", F_MMX},
34597 {"popcnt", F_POPCNT},
34598 {"sse", F_SSE},
34599 {"sse2", F_SSE2},
34600 {"sse3", F_SSE3},
34601 {"ssse3", F_SSSE3},
34602 {"sse4a", F_SSE4_A},
34603 {"sse4.1", F_SSE4_1},
34604 {"sse4.2", F_SSE4_2},
34605 {"avx", F_AVX},
34606 {"fma4", F_FMA4},
34607 {"xop", F_XOP},
34608 {"fma", F_FMA},
34609 {"avx2", F_AVX2},
34610 {"avx512f", F_AVX512F},
34611 {"bmi", F_BMI},
34612 {"bmi2", F_BMI2},
34613 {"aes", F_AES},
34614 {"pclmul", F_PCLMUL},
34615 {"avx512vl",F_AVX512VL},
34616 {"avx512bw",F_AVX512BW},
34617 {"avx512dq",F_AVX512DQ},
34618 {"avx512cd",F_AVX512CD},
34619 {"avx512er",F_AVX512ER},
34620 {"avx512pf",F_AVX512PF},
34621 {"avx512vbmi",F_AVX512VBMI},
34622 {"avx512ifma",F_AVX512IFMA},
34623 {"avx5124vnniw",F_AVX5124VNNIW},
34624 {"avx5124fmaps",F_AVX5124FMAPS},
34625 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
34628 tree __processor_model_type = build_processor_model_struct ();
34629 tree __cpu_model_var = make_var_decl (__processor_model_type,
34630 "__cpu_model");
34633 varpool_node::add (__cpu_model_var);
34635 gcc_assert ((args != NULL) && (*args != NULL));
34637 param_string_cst = *args;
34638 while (param_string_cst
34639 && TREE_CODE (param_string_cst) != STRING_CST)
34641 /* *args must be an expr that can contain other EXPRs leading to a
34642 STRING_CST. */
34643 if (!EXPR_P (param_string_cst))
34645 error ("Parameter to builtin must be a string constant or literal");
34646 return integer_zero_node;
34648 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34651 gcc_assert (param_string_cst);
34653 if (fn_code == IX86_BUILTIN_CPU_IS)
34655 tree ref;
34656 tree field;
34657 tree final;
34659 unsigned int field_val = 0;
34660 unsigned int NUM_ARCH_NAMES
34661 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34663 for (i = 0; i < NUM_ARCH_NAMES; i++)
34664 if (strcmp (arch_names_table[i].name,
34665 TREE_STRING_POINTER (param_string_cst)) == 0)
34666 break;
34668 if (i == NUM_ARCH_NAMES)
34670 error ("Parameter to builtin not valid: %s",
34671 TREE_STRING_POINTER (param_string_cst));
34672 return integer_zero_node;
34675 field = TYPE_FIELDS (__processor_model_type);
34676 field_val = arch_names_table[i].model;
34678 /* CPU types are stored in the next field. */
34679 if (field_val > M_CPU_TYPE_START
34680 && field_val < M_CPU_SUBTYPE_START)
34682 field = DECL_CHAIN (field);
34683 field_val -= M_CPU_TYPE_START;
34686 /* CPU subtypes are stored in the next field. */
34687 if (field_val > M_CPU_SUBTYPE_START)
34689 field = DECL_CHAIN ( DECL_CHAIN (field));
34690 field_val -= M_CPU_SUBTYPE_START;
34693 /* Get the appropriate field in __cpu_model. */
34694 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34695 field, NULL_TREE);
34697 /* Check the value. */
34698 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34699 build_int_cstu (unsigned_type_node, field_val));
34700 return build1 (CONVERT_EXPR, integer_type_node, final);
34702 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34704 tree ref;
34705 tree array_elt;
34706 tree field;
34707 tree final;
34709 unsigned int field_val = 0;
34710 unsigned int NUM_ISA_NAMES
34711 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34713 for (i = 0; i < NUM_ISA_NAMES; i++)
34714 if (strcmp (isa_names_table[i].name,
34715 TREE_STRING_POINTER (param_string_cst)) == 0)
34716 break;
34718 if (i == NUM_ISA_NAMES)
34720 error ("Parameter to builtin not valid: %s",
34721 TREE_STRING_POINTER (param_string_cst));
34722 return integer_zero_node;
34725 field = TYPE_FIELDS (__processor_model_type);
34726 /* Get the last field, which is __cpu_features. */
34727 while (DECL_CHAIN (field))
34728 field = DECL_CHAIN (field);
34730 /* Get the appropriate field: __cpu_model.__cpu_features */
34731 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34732 field, NULL_TREE);
34734 /* Access the 0th element of __cpu_features array. */
34735 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34736 integer_zero_node, NULL_TREE, NULL_TREE);
34738 field_val = (1 << isa_names_table[i].feature);
34739 /* Return __cpu_model.__cpu_features[0] & field_val */
34740 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34741 build_int_cstu (unsigned_type_node, field_val));
34742 return build1 (CONVERT_EXPR, integer_type_node, final);
34744 gcc_unreachable ();
34747 static tree
34748 ix86_fold_builtin (tree fndecl, int n_args,
34749 tree *args, bool ignore ATTRIBUTE_UNUSED)
34751 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34753 enum ix86_builtins fn_code = (enum ix86_builtins)
34754 DECL_FUNCTION_CODE (fndecl);
34755 switch (fn_code)
34757 case IX86_BUILTIN_CPU_IS:
34758 case IX86_BUILTIN_CPU_SUPPORTS:
34759 gcc_assert (n_args == 1);
34760 return fold_builtin_cpu (fndecl, args);
34762 case IX86_BUILTIN_NANQ:
34763 case IX86_BUILTIN_NANSQ:
34765 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34766 const char *str = c_getstr (*args);
34767 int quiet = fn_code == IX86_BUILTIN_NANQ;
34768 REAL_VALUE_TYPE real;
34770 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34771 return build_real (type, real);
34772 return NULL_TREE;
34775 case IX86_BUILTIN_INFQ:
34776 case IX86_BUILTIN_HUGE_VALQ:
34778 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34779 REAL_VALUE_TYPE inf;
34780 real_inf (&inf);
34781 return build_real (type, inf);
34784 case IX86_BUILTIN_TZCNT16:
34785 case IX86_BUILTIN_CTZS:
34786 case IX86_BUILTIN_TZCNT32:
34787 case IX86_BUILTIN_TZCNT64:
34788 gcc_assert (n_args == 1);
34789 if (TREE_CODE (args[0]) == INTEGER_CST)
34791 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34792 tree arg = args[0];
34793 if (fn_code == IX86_BUILTIN_TZCNT16
34794 || fn_code == IX86_BUILTIN_CTZS)
34795 arg = fold_convert (short_unsigned_type_node, arg);
34796 if (integer_zerop (arg))
34797 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34798 else
34799 return fold_const_call (CFN_CTZ, type, arg);
34801 break;
34803 case IX86_BUILTIN_LZCNT16:
34804 case IX86_BUILTIN_CLZS:
34805 case IX86_BUILTIN_LZCNT32:
34806 case IX86_BUILTIN_LZCNT64:
34807 gcc_assert (n_args == 1);
34808 if (TREE_CODE (args[0]) == INTEGER_CST)
34810 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34811 tree arg = args[0];
34812 if (fn_code == IX86_BUILTIN_LZCNT16
34813 || fn_code == IX86_BUILTIN_CLZS)
34814 arg = fold_convert (short_unsigned_type_node, arg);
34815 if (integer_zerop (arg))
34816 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34817 else
34818 return fold_const_call (CFN_CLZ, type, arg);
34820 break;
34822 case IX86_BUILTIN_BEXTR32:
34823 case IX86_BUILTIN_BEXTR64:
34824 case IX86_BUILTIN_BEXTRI32:
34825 case IX86_BUILTIN_BEXTRI64:
34826 gcc_assert (n_args == 2);
34827 if (tree_fits_uhwi_p (args[1]))
34829 unsigned HOST_WIDE_INT res = 0;
34830 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34831 unsigned int start = tree_to_uhwi (args[1]);
34832 unsigned int len = (start & 0xff00) >> 8;
34833 start &= 0xff;
34834 if (start >= prec || len == 0)
34835 res = 0;
34836 else if (!tree_fits_uhwi_p (args[0]))
34837 break;
34838 else
34839 res = tree_to_uhwi (args[0]) >> start;
34840 if (len > prec)
34841 len = prec;
34842 if (len < HOST_BITS_PER_WIDE_INT)
34843 res &= (HOST_WIDE_INT_1U << len) - 1;
34844 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34846 break;
34848 case IX86_BUILTIN_BZHI32:
34849 case IX86_BUILTIN_BZHI64:
34850 gcc_assert (n_args == 2);
34851 if (tree_fits_uhwi_p (args[1]))
34853 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34854 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34855 return args[0];
34856 if (!tree_fits_uhwi_p (args[0]))
34857 break;
34858 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34859 res &= ~(HOST_WIDE_INT_M1U << idx);
34860 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34862 break;
34864 case IX86_BUILTIN_PDEP32:
34865 case IX86_BUILTIN_PDEP64:
34866 gcc_assert (n_args == 2);
34867 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34869 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34870 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34871 unsigned HOST_WIDE_INT res = 0;
34872 unsigned HOST_WIDE_INT m, k = 1;
34873 for (m = 1; m; m <<= 1)
34874 if ((mask & m) != 0)
34876 if ((src & k) != 0)
34877 res |= m;
34878 k <<= 1;
34880 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34882 break;
34884 case IX86_BUILTIN_PEXT32:
34885 case IX86_BUILTIN_PEXT64:
34886 gcc_assert (n_args == 2);
34887 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34889 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34890 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34891 unsigned HOST_WIDE_INT res = 0;
34892 unsigned HOST_WIDE_INT m, k = 1;
34893 for (m = 1; m; m <<= 1)
34894 if ((mask & m) != 0)
34896 if ((src & m) != 0)
34897 res |= k;
34898 k <<= 1;
34900 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34902 break;
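/* Worked example of the PDEP/PEXT folding above (assumed operand values,
   for illustration only): with src = 0b0101 and mask = 0b11110000, PDEP
   deposits the low bits of src into the set bit positions of mask, giving
   0b01010000; PEXT applied to 0b01010000 with the same mask extracts those
   bits back, giving 0b0101.  */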
34904 default:
34905 break;
34909 #ifdef SUBTARGET_FOLD_BUILTIN
34910 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34911 #endif
34913 return NULL_TREE;
34916 /* Fold an MD builtin (use ix86_fold_builtin for folding into a
34917 constant) in GIMPLE. */
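/* A sketch of the transforms performed below (no behaviour beyond what the
   code implements): IX86_BUILTIN_TZCNT32 on an operand known to be non-zero
   is rewritten to the generic __builtin_ctz; IX86_BUILTIN_BZHI32 with a
   constant index of at least the operand's precision becomes a plain copy
   of the first argument; and IX86_BUILTIN_PDEP32 / IX86_BUILTIN_PEXT32 with
   an all-ones mask likewise reduce to a copy of the first argument.  */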
34919 bool
34920 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34922 gimple *stmt = gsi_stmt (*gsi);
34923 tree fndecl = gimple_call_fndecl (stmt);
34924 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34925 int n_args = gimple_call_num_args (stmt);
34926 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34927 tree decl = NULL_TREE;
34928 tree arg0, arg1;
34930 switch (fn_code)
34932 case IX86_BUILTIN_TZCNT32:
34933 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34934 goto fold_tzcnt_lzcnt;
34936 case IX86_BUILTIN_TZCNT64:
34937 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34938 goto fold_tzcnt_lzcnt;
34940 case IX86_BUILTIN_LZCNT32:
34941 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34942 goto fold_tzcnt_lzcnt;
34944 case IX86_BUILTIN_LZCNT64:
34945 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34946 goto fold_tzcnt_lzcnt;
34948 fold_tzcnt_lzcnt:
34949 gcc_assert (n_args == 1);
34950 arg0 = gimple_call_arg (stmt, 0);
34951 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34953 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34954 /* If arg0 is provably non-zero, optimize into the generic
34955 __builtin_c[tl]z{,ll} function, which the middle-end handles
34956 better. */
34957 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34958 return false;
34960 location_t loc = gimple_location (stmt);
34961 gimple *g = gimple_build_call (decl, 1, arg0);
34962 gimple_set_location (g, loc);
34963 tree lhs = make_ssa_name (integer_type_node);
34964 gimple_call_set_lhs (g, lhs);
34965 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34966 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34967 gimple_set_location (g, loc);
34968 gsi_replace (gsi, g, false);
34969 return true;
34971 break;
34973 case IX86_BUILTIN_BZHI32:
34974 case IX86_BUILTIN_BZHI64:
34975 gcc_assert (n_args == 2);
34976 arg1 = gimple_call_arg (stmt, 1);
34977 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34979 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34980 arg0 = gimple_call_arg (stmt, 0);
34981 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34982 break;
34983 location_t loc = gimple_location (stmt);
34984 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34985 gimple_set_location (g, loc);
34986 gsi_replace (gsi, g, false);
34987 return true;
34989 break;
34991 case IX86_BUILTIN_PDEP32:
34992 case IX86_BUILTIN_PDEP64:
34993 case IX86_BUILTIN_PEXT32:
34994 case IX86_BUILTIN_PEXT64:
34995 gcc_assert (n_args == 2);
34996 arg1 = gimple_call_arg (stmt, 1);
34997 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34999 location_t loc = gimple_location (stmt);
35000 arg0 = gimple_call_arg (stmt, 0);
35001 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
35002 gimple_set_location (g, loc);
35003 gsi_replace (gsi, g, false);
35004 return true;
35006 break;
35008 default:
35009 break;
35012 return false;
35015 /* Make builtins to detect cpu type and features supported. NAME is
35016 the builtin name, CODE is the builtin code, and FTYPE is the function
35017 type of the builtin. */
35019 static void
35020 make_cpu_type_builtin (const char* name, int code,
35021 enum ix86_builtin_func_type ftype, bool is_const)
35023 tree decl;
35024 tree type;
35026 type = ix86_get_builtin_func_type (ftype);
35027 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
35028 NULL, NULL_TREE);
35029 gcc_assert (decl != NULL_TREE);
35030 ix86_builtins[(int) code] = decl;
35031 TREE_READONLY (decl) = is_const;
35034 /* Make builtins to get CPU type and features supported. The created
35035 builtins are:
35037 __builtin_cpu_init (), to detect cpu type and features,
35038 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
35039 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
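/* Typical use of these builtins (an illustrative sketch; do_avx2 and
   do_generic are hypothetical functions):

     extern void do_avx2 (void), do_generic (void);

     void (*resolve_impl (void)) (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
         return do_avx2;
       return do_generic;
     }

   Outside of ifunc resolvers and early constructors the explicit
   __builtin_cpu_init call is normally unnecessary, since libgcc initializes
   the CPU data from a constructor.  */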
35042 static void
35043 ix86_init_platform_type_builtins (void)
35045 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
35046 INT_FTYPE_VOID, false);
35047 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
35048 INT_FTYPE_PCCHAR, true);
35049 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
35050 INT_FTYPE_PCCHAR, true);
35053 /* Internal method for ix86_init_builtins. */
35055 static void
35056 ix86_init_builtins_va_builtins_abi (void)
35058 tree ms_va_ref, sysv_va_ref;
35059 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
35060 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
35061 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
35062 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
35064 if (!TARGET_64BIT)
35065 return;
35066 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
35067 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
35068 ms_va_ref = build_reference_type (ms_va_list_type_node);
35069 sysv_va_ref =
35070 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
35072 fnvoid_va_end_ms =
35073 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
35074 fnvoid_va_start_ms =
35075 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
35076 fnvoid_va_end_sysv =
35077 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
35078 fnvoid_va_start_sysv =
35079 build_varargs_function_type_list (void_type_node, sysv_va_ref,
35080 NULL_TREE);
35081 fnvoid_va_copy_ms =
35082 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
35083 NULL_TREE);
35084 fnvoid_va_copy_sysv =
35085 build_function_type_list (void_type_node, sysv_va_ref,
35086 sysv_va_ref, NULL_TREE);
35088 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
35089 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
35090 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
35091 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
35092 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
35093 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
35094 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
35095 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
35096 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
35097 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
35098 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
35099 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
35102 static void
35103 ix86_init_builtin_types (void)
35105 tree float80_type_node, const_string_type_node;
35107 /* The __float80 type. */
35108 float80_type_node = long_double_type_node;
35109 if (TYPE_MODE (float80_type_node) != XFmode)
35111 if (float64x_type_node != NULL_TREE
35112 && TYPE_MODE (float64x_type_node) == XFmode)
35113 float80_type_node = float64x_type_node;
35114 else
35116 /* The __float80 type. */
35117 float80_type_node = make_node (REAL_TYPE);
35119 TYPE_PRECISION (float80_type_node) = 80;
35120 layout_type (float80_type_node);
35123 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
35125 /* The __float128 type. The node has already been created as
35126 _Float128, so we only need to register the __float128 name for
35127 it. */
35128 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
35130 const_string_type_node
35131 = build_pointer_type (build_qualified_type
35132 (char_type_node, TYPE_QUAL_CONST));
35134 /* This macro is built by i386-builtin-types.awk. */
35135 DEFINE_BUILTIN_PRIMITIVE_TYPES;
35138 static void
35139 ix86_init_builtins (void)
35141 tree ftype, decl;
35143 ix86_init_builtin_types ();
35145 /* Builtins to get CPU type and features. */
35146 ix86_init_platform_type_builtins ();
35148 /* TFmode support builtins. */
35149 def_builtin_const (0, "__builtin_infq",
35150 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
35151 def_builtin_const (0, "__builtin_huge_valq",
35152 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
35154 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
35155 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
35156 BUILT_IN_MD, "nanq", NULL_TREE);
35157 TREE_READONLY (decl) = 1;
35158 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
35160 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
35161 BUILT_IN_MD, "nansq", NULL_TREE);
35162 TREE_READONLY (decl) = 1;
35163 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
35165 /* We will expand them to normal calls if SSE isn't available, since
35166 they are used by libgcc. */
35167 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
35168 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
35169 BUILT_IN_MD, "__fabstf2", NULL_TREE);
35170 TREE_READONLY (decl) = 1;
35171 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
35173 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
35174 decl = add_builtin_function ("__builtin_copysignq", ftype,
35175 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
35176 "__copysigntf3", NULL_TREE);
35177 TREE_READONLY (decl) = 1;
35178 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
35180 ix86_init_tm_builtins ();
35181 ix86_init_mmx_sse_builtins ();
35182 ix86_init_mpx_builtins ();
35184 if (TARGET_LP64)
35185 ix86_init_builtins_va_builtins_abi ();
35187 #ifdef SUBTARGET_INIT_BUILTINS
35188 SUBTARGET_INIT_BUILTINS;
35189 #endif
35192 /* Return the ix86 builtin for CODE. */
35194 static tree
35195 ix86_builtin_decl (unsigned code, bool)
35197 if (code >= IX86_BUILTIN_MAX)
35198 return error_mark_node;
35200 return ix86_builtins[code];
35203 /* Errors in the source file can cause expand_expr to return const0_rtx
35204 where we expect a vector. To avoid crashing, use one of the vector
35205 clear instructions. */
35206 static rtx
35207 safe_vector_operand (rtx x, machine_mode mode)
35209 if (x == const0_rtx)
35210 x = CONST0_RTX (mode);
35211 return x;
35214 /* Fix up modeless constants to fit the required mode. */
35215 static rtx
35216 fixup_modeless_constant (rtx x, machine_mode mode)
35218 if (GET_MODE (x) == VOIDmode)
35219 x = convert_to_mode (mode, x, 1);
35220 return x;
35223 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
35225 static rtx
35226 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
35228 rtx pat;
35229 tree arg0 = CALL_EXPR_ARG (exp, 0);
35230 tree arg1 = CALL_EXPR_ARG (exp, 1);
35231 rtx op0 = expand_normal (arg0);
35232 rtx op1 = expand_normal (arg1);
35233 machine_mode tmode = insn_data[icode].operand[0].mode;
35234 machine_mode mode0 = insn_data[icode].operand[1].mode;
35235 machine_mode mode1 = insn_data[icode].operand[2].mode;
35237 if (VECTOR_MODE_P (mode0))
35238 op0 = safe_vector_operand (op0, mode0);
35239 if (VECTOR_MODE_P (mode1))
35240 op1 = safe_vector_operand (op1, mode1);
35242 if (optimize || !target
35243 || GET_MODE (target) != tmode
35244 || !insn_data[icode].operand[0].predicate (target, tmode))
35245 target = gen_reg_rtx (tmode);
35247 if (GET_MODE (op1) == SImode && mode1 == TImode)
35249 rtx x = gen_reg_rtx (V4SImode);
35250 emit_insn (gen_sse2_loadd (x, op1));
35251 op1 = gen_lowpart (TImode, x);
35254 if (!insn_data[icode].operand[1].predicate (op0, mode0))
35255 op0 = copy_to_mode_reg (mode0, op0);
35256 if (!insn_data[icode].operand[2].predicate (op1, mode1))
35257 op1 = copy_to_mode_reg (mode1, op1);
35259 pat = GEN_FCN (icode) (target, op0, op1);
35260 if (! pat)
35261 return 0;
35263 emit_insn (pat);
35265 return target;
35268 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
35270 static rtx
35271 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
35272 enum ix86_builtin_func_type m_type,
35273 enum rtx_code sub_code)
35275 rtx pat;
35276 int i;
35277 int nargs;
35278 bool comparison_p = false;
35279 bool tf_p = false;
35280 bool last_arg_constant = false;
35281 int num_memory = 0;
35282 struct {
35283 rtx op;
35284 machine_mode mode;
35285 } args[4];
35287 machine_mode tmode = insn_data[icode].operand[0].mode;
35289 switch (m_type)
35291 case MULTI_ARG_4_DF2_DI_I:
35292 case MULTI_ARG_4_DF2_DI_I1:
35293 case MULTI_ARG_4_SF2_SI_I:
35294 case MULTI_ARG_4_SF2_SI_I1:
35295 nargs = 4;
35296 last_arg_constant = true;
35297 break;
35299 case MULTI_ARG_3_SF:
35300 case MULTI_ARG_3_DF:
35301 case MULTI_ARG_3_SF2:
35302 case MULTI_ARG_3_DF2:
35303 case MULTI_ARG_3_DI:
35304 case MULTI_ARG_3_SI:
35305 case MULTI_ARG_3_SI_DI:
35306 case MULTI_ARG_3_HI:
35307 case MULTI_ARG_3_HI_SI:
35308 case MULTI_ARG_3_QI:
35309 case MULTI_ARG_3_DI2:
35310 case MULTI_ARG_3_SI2:
35311 case MULTI_ARG_3_HI2:
35312 case MULTI_ARG_3_QI2:
35313 nargs = 3;
35314 break;
35316 case MULTI_ARG_2_SF:
35317 case MULTI_ARG_2_DF:
35318 case MULTI_ARG_2_DI:
35319 case MULTI_ARG_2_SI:
35320 case MULTI_ARG_2_HI:
35321 case MULTI_ARG_2_QI:
35322 nargs = 2;
35323 break;
35325 case MULTI_ARG_2_DI_IMM:
35326 case MULTI_ARG_2_SI_IMM:
35327 case MULTI_ARG_2_HI_IMM:
35328 case MULTI_ARG_2_QI_IMM:
35329 nargs = 2;
35330 last_arg_constant = true;
35331 break;
35333 case MULTI_ARG_1_SF:
35334 case MULTI_ARG_1_DF:
35335 case MULTI_ARG_1_SF2:
35336 case MULTI_ARG_1_DF2:
35337 case MULTI_ARG_1_DI:
35338 case MULTI_ARG_1_SI:
35339 case MULTI_ARG_1_HI:
35340 case MULTI_ARG_1_QI:
35341 case MULTI_ARG_1_SI_DI:
35342 case MULTI_ARG_1_HI_DI:
35343 case MULTI_ARG_1_HI_SI:
35344 case MULTI_ARG_1_QI_DI:
35345 case MULTI_ARG_1_QI_SI:
35346 case MULTI_ARG_1_QI_HI:
35347 nargs = 1;
35348 break;
35350 case MULTI_ARG_2_DI_CMP:
35351 case MULTI_ARG_2_SI_CMP:
35352 case MULTI_ARG_2_HI_CMP:
35353 case MULTI_ARG_2_QI_CMP:
35354 nargs = 2;
35355 comparison_p = true;
35356 break;
35358 case MULTI_ARG_2_SF_TF:
35359 case MULTI_ARG_2_DF_TF:
35360 case MULTI_ARG_2_DI_TF:
35361 case MULTI_ARG_2_SI_TF:
35362 case MULTI_ARG_2_HI_TF:
35363 case MULTI_ARG_2_QI_TF:
35364 nargs = 2;
35365 tf_p = true;
35366 break;
35368 default:
35369 gcc_unreachable ();
35372 if (optimize || !target
35373 || GET_MODE (target) != tmode
35374 || !insn_data[icode].operand[0].predicate (target, tmode))
35375 target = gen_reg_rtx (tmode);
35376 else if (memory_operand (target, tmode))
35377 num_memory++;
35379 gcc_assert (nargs <= 4);
35381 for (i = 0; i < nargs; i++)
35383 tree arg = CALL_EXPR_ARG (exp, i);
35384 rtx op = expand_normal (arg);
35385 int adjust = (comparison_p) ? 1 : 0;
35386 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
35388 if (last_arg_constant && i == nargs - 1)
35390 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
35392 enum insn_code new_icode = icode;
35393 switch (icode)
35395 case CODE_FOR_xop_vpermil2v2df3:
35396 case CODE_FOR_xop_vpermil2v4sf3:
35397 case CODE_FOR_xop_vpermil2v4df3:
35398 case CODE_FOR_xop_vpermil2v8sf3:
35399 error ("the last argument must be a 2-bit immediate");
35400 return gen_reg_rtx (tmode);
35401 case CODE_FOR_xop_rotlv2di3:
35402 new_icode = CODE_FOR_rotlv2di3;
35403 goto xop_rotl;
35404 case CODE_FOR_xop_rotlv4si3:
35405 new_icode = CODE_FOR_rotlv4si3;
35406 goto xop_rotl;
35407 case CODE_FOR_xop_rotlv8hi3:
35408 new_icode = CODE_FOR_rotlv8hi3;
35409 goto xop_rotl;
35410 case CODE_FOR_xop_rotlv16qi3:
35411 new_icode = CODE_FOR_rotlv16qi3;
35412 xop_rotl:
35413 if (CONST_INT_P (op))
35415 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
35416 op = GEN_INT (INTVAL (op) & mask);
35417 gcc_checking_assert
35418 (insn_data[icode].operand[i + 1].predicate (op, mode));
35420 else
35422 gcc_checking_assert
35423 (nargs == 2
35424 && insn_data[new_icode].operand[0].mode == tmode
35425 && insn_data[new_icode].operand[1].mode == tmode
35426 && insn_data[new_icode].operand[2].mode == mode
35427 && insn_data[new_icode].operand[0].predicate
35428 == insn_data[icode].operand[0].predicate
35429 && insn_data[new_icode].operand[1].predicate
35430 == insn_data[icode].operand[1].predicate);
35431 icode = new_icode;
35432 goto non_constant;
35434 break;
35435 default:
35436 gcc_unreachable ();
35440 else
35442 non_constant:
35443 if (VECTOR_MODE_P (mode))
35444 op = safe_vector_operand (op, mode);
35446 /* If we aren't optimizing, only allow one memory operand to be
35447 generated. */
35448 if (memory_operand (op, mode))
35449 num_memory++;
35451 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
35453 if (optimize
35454 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
35455 || num_memory > 1)
35456 op = force_reg (mode, op);
35459 args[i].op = op;
35460 args[i].mode = mode;
35463 switch (nargs)
35465 case 1:
35466 pat = GEN_FCN (icode) (target, args[0].op);
35467 break;
35469 case 2:
35470 if (tf_p)
35471 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35472 GEN_INT ((int)sub_code));
35473 else if (! comparison_p)
35474 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35475 else
35477 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35478 args[0].op,
35479 args[1].op);
35481 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35483 break;
35485 case 3:
35486 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35487 break;
35489 case 4:
35490 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35491 break;
35493 default:
35494 gcc_unreachable ();
35497 if (! pat)
35498 return 0;
35500 emit_insn (pat);
35501 return target;
35504 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35505 insns with vec_merge. */
35507 static rtx
35508 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35509 rtx target)
35511 rtx pat;
35512 tree arg0 = CALL_EXPR_ARG (exp, 0);
35513 rtx op1, op0 = expand_normal (arg0);
35514 machine_mode tmode = insn_data[icode].operand[0].mode;
35515 machine_mode mode0 = insn_data[icode].operand[1].mode;
35517 if (optimize || !target
35518 || GET_MODE (target) != tmode
35519 || !insn_data[icode].operand[0].predicate (target, tmode))
35520 target = gen_reg_rtx (tmode);
35522 if (VECTOR_MODE_P (mode0))
35523 op0 = safe_vector_operand (op0, mode0);
35525 if ((optimize && !register_operand (op0, mode0))
35526 || !insn_data[icode].operand[1].predicate (op0, mode0))
35527 op0 = copy_to_mode_reg (mode0, op0);
35529 op1 = op0;
35530 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35531 op1 = copy_to_mode_reg (mode0, op1);
35533 pat = GEN_FCN (icode) (target, op0, op1);
35534 if (! pat)
35535 return 0;
35536 emit_insn (pat);
35537 return target;
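/* Added sketch (assumed intrinsic mapping, exposition only): a scalar unop
   with vec_merge passes its single source twice, so for example

     __m128 r = _mm_sqrt_ss (x);

   which is expected to reach this routine through the
   V4SF_FTYPE_V4SF_VEC_MERGE case, expands with op1 == op0 == x: the low
   element is square-rooted and the upper elements are merged unchanged
   from x.  */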
35540 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35542 static rtx
35543 ix86_expand_sse_compare (const struct builtin_description *d,
35544 tree exp, rtx target, bool swap)
35546 rtx pat;
35547 tree arg0 = CALL_EXPR_ARG (exp, 0);
35548 tree arg1 = CALL_EXPR_ARG (exp, 1);
35549 rtx op0 = expand_normal (arg0);
35550 rtx op1 = expand_normal (arg1);
35551 rtx op2;
35552 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35553 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35554 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35555 enum rtx_code comparison = d->comparison;
35557 if (VECTOR_MODE_P (mode0))
35558 op0 = safe_vector_operand (op0, mode0);
35559 if (VECTOR_MODE_P (mode1))
35560 op1 = safe_vector_operand (op1, mode1);
35562 /* Swap operands if we have a comparison that isn't available in
35563 hardware. */
35564 if (swap)
35565 std::swap (op0, op1);
35567 if (optimize || !target
35568 || GET_MODE (target) != tmode
35569 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35570 target = gen_reg_rtx (tmode);
35572 if ((optimize && !register_operand (op0, mode0))
35573 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35574 op0 = copy_to_mode_reg (mode0, op0);
35575 if ((optimize && !register_operand (op1, mode1))
35576 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35577 op1 = copy_to_mode_reg (mode1, op1);
35579 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35580 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35581 if (! pat)
35582 return 0;
35583 emit_insn (pat);
35584 return target;
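/* Added note (exposition only): the swap path covers comparisons the
   hardware predicate set cannot encode directly.  A greater-than builtin,
   for instance, can be described with d->comparison == LT plus a ..._SWAP
   function type, so std::swap above exchanges the operands and the compare
   is emitted as less-than with op0 and op1 reversed.  */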
35587 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35589 static rtx
35590 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35591 rtx target)
35593 rtx pat;
35594 tree arg0 = CALL_EXPR_ARG (exp, 0);
35595 tree arg1 = CALL_EXPR_ARG (exp, 1);
35596 rtx op0 = expand_normal (arg0);
35597 rtx op1 = expand_normal (arg1);
35598 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35599 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35600 enum rtx_code comparison = d->comparison;
35602 if (VECTOR_MODE_P (mode0))
35603 op0 = safe_vector_operand (op0, mode0);
35604 if (VECTOR_MODE_P (mode1))
35605 op1 = safe_vector_operand (op1, mode1);
35607 /* Swap operands if we have a comparison that isn't available in
35608 hardware. */
35609 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35610 std::swap (op0, op1);
35612 target = gen_reg_rtx (SImode);
35613 emit_move_insn (target, const0_rtx);
35614 target = gen_rtx_SUBREG (QImode, target, 0);
35616 if ((optimize && !register_operand (op0, mode0))
35617 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35618 op0 = copy_to_mode_reg (mode0, op0);
35619 if ((optimize && !register_operand (op1, mode1))
35620 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35621 op1 = copy_to_mode_reg (mode1, op1);
35623 pat = GEN_FCN (d->icode) (op0, op1);
35624 if (! pat)
35625 return 0;
35626 emit_insn (pat);
35627 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35628 gen_rtx_fmt_ee (comparison, QImode,
35629 SET_DEST (pat),
35630 const0_rtx)));
35632 return SUBREG_REG (target);
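/* Added sketch of the sequence emitted above (exposition only):

     (set (reg:SI tmp) (const_int 0))
     <comi/ucomi insn setting FLAGS_REG>
     (set (strict_low_part (subreg:QI (reg:SI tmp) 0))
          (<d->comparison>:QI (reg FLAGS_REG) (const_int 0)))

   The SImode pseudo underlying the QImode subreg is returned, so the
   caller sees a 0/1 value with the upper bytes already cleared.  */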
35635 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35637 static rtx
35638 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35639 rtx target)
35641 rtx pat;
35642 tree arg0 = CALL_EXPR_ARG (exp, 0);
35643 rtx op1, op0 = expand_normal (arg0);
35644 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35645 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35647 if (optimize || target == 0
35648 || GET_MODE (target) != tmode
35649 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35650 target = gen_reg_rtx (tmode);
35652 if (VECTOR_MODE_P (mode0))
35653 op0 = safe_vector_operand (op0, mode0);
35655 if ((optimize && !register_operand (op0, mode0))
35656 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35657 op0 = copy_to_mode_reg (mode0, op0);
35659 op1 = GEN_INT (d->comparison);
35661 pat = GEN_FCN (d->icode) (target, op0, op1);
35662 if (! pat)
35663 return 0;
35664 emit_insn (pat);
35665 return target;
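/* Added note (exposition only): for the ..._ROUND function types the
   d->comparison field does not hold a comparison code at all; it carries
   the rounding-mode immediate, which is why op1 above is simply
   GEN_INT (d->comparison) (a floor builtin, for instance, can store
   ROUND_FLOOR there).  */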
35668 static rtx
35669 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35670 tree exp, rtx target)
35672 rtx pat;
35673 tree arg0 = CALL_EXPR_ARG (exp, 0);
35674 tree arg1 = CALL_EXPR_ARG (exp, 1);
35675 rtx op0 = expand_normal (arg0);
35676 rtx op1 = expand_normal (arg1);
35677 rtx op2;
35678 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35679 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35680 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35682 if (optimize || target == 0
35683 || GET_MODE (target) != tmode
35684 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35685 target = gen_reg_rtx (tmode);
35687 op0 = safe_vector_operand (op0, mode0);
35688 op1 = safe_vector_operand (op1, mode1);
35690 if ((optimize && !register_operand (op0, mode0))
35691 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35692 op0 = copy_to_mode_reg (mode0, op0);
35693 if ((optimize && !register_operand (op1, mode1))
35694 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35695 op1 = copy_to_mode_reg (mode1, op1);
35697 op2 = GEN_INT (d->comparison);
35699 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35700 if (! pat)
35701 return 0;
35702 emit_insn (pat);
35703 return target;
35706 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35708 static rtx
35709 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35710 rtx target)
35712 rtx pat;
35713 tree arg0 = CALL_EXPR_ARG (exp, 0);
35714 tree arg1 = CALL_EXPR_ARG (exp, 1);
35715 rtx op0 = expand_normal (arg0);
35716 rtx op1 = expand_normal (arg1);
35717 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35718 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35719 enum rtx_code comparison = d->comparison;
35721 if (VECTOR_MODE_P (mode0))
35722 op0 = safe_vector_operand (op0, mode0);
35723 if (VECTOR_MODE_P (mode1))
35724 op1 = safe_vector_operand (op1, mode1);
35726 target = gen_reg_rtx (SImode);
35727 emit_move_insn (target, const0_rtx);
35728 target = gen_rtx_SUBREG (QImode, target, 0);
35730 if ((optimize && !register_operand (op0, mode0))
35731 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35732 op0 = copy_to_mode_reg (mode0, op0);
35733 if ((optimize && !register_operand (op1, mode1))
35734 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35735 op1 = copy_to_mode_reg (mode1, op1);
35737 pat = GEN_FCN (d->icode) (op0, op1);
35738 if (! pat)
35739 return 0;
35740 emit_insn (pat);
35741 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35742 gen_rtx_fmt_ee (comparison, QImode,
35743 SET_DEST (pat),
35744 const0_rtx)));
35746 return SUBREG_REG (target);
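/* Added usage sketch (assumed intrinsic mapping, exposition only):

     int all_zero (__m128i x)
     {
       return _mm_testz_si128 (x, x);
     }

   _mm_testz_si128 is expected to map to a ptest builtin
   (INT_FTYPE_V2DI_V2DI_PTEST), so it is expanded by this routine: the
   ptest insn sets the flags and the 0/1 result is materialized from
   FLAGS_REG just as in the comi case.  */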
35749 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35751 static rtx
35752 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35753 tree exp, rtx target)
35755 rtx pat;
35756 tree arg0 = CALL_EXPR_ARG (exp, 0);
35757 tree arg1 = CALL_EXPR_ARG (exp, 1);
35758 tree arg2 = CALL_EXPR_ARG (exp, 2);
35759 tree arg3 = CALL_EXPR_ARG (exp, 3);
35760 tree arg4 = CALL_EXPR_ARG (exp, 4);
35761 rtx scratch0, scratch1;
35762 rtx op0 = expand_normal (arg0);
35763 rtx op1 = expand_normal (arg1);
35764 rtx op2 = expand_normal (arg2);
35765 rtx op3 = expand_normal (arg3);
35766 rtx op4 = expand_normal (arg4);
35767 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35769 tmode0 = insn_data[d->icode].operand[0].mode;
35770 tmode1 = insn_data[d->icode].operand[1].mode;
35771 modev2 = insn_data[d->icode].operand[2].mode;
35772 modei3 = insn_data[d->icode].operand[3].mode;
35773 modev4 = insn_data[d->icode].operand[4].mode;
35774 modei5 = insn_data[d->icode].operand[5].mode;
35775 modeimm = insn_data[d->icode].operand[6].mode;
35777 if (VECTOR_MODE_P (modev2))
35778 op0 = safe_vector_operand (op0, modev2);
35779 if (VECTOR_MODE_P (modev4))
35780 op2 = safe_vector_operand (op2, modev4);
35782 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35783 op0 = copy_to_mode_reg (modev2, op0);
35784 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35785 op1 = copy_to_mode_reg (modei3, op1);
35786 if ((optimize && !register_operand (op2, modev4))
35787 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35788 op2 = copy_to_mode_reg (modev4, op2);
35789 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35790 op3 = copy_to_mode_reg (modei5, op3);
35792 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35794 error ("the fifth argument must be an 8-bit immediate");
35795 return const0_rtx;
35798 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35800 if (optimize || !target
35801 || GET_MODE (target) != tmode0
35802 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35803 target = gen_reg_rtx (tmode0);
35805 scratch1 = gen_reg_rtx (tmode1);
35807 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35809 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35811 if (optimize || !target
35812 || GET_MODE (target) != tmode1
35813 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35814 target = gen_reg_rtx (tmode1);
35816 scratch0 = gen_reg_rtx (tmode0);
35818 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35820 else
35822 gcc_assert (d->flag);
35824 scratch0 = gen_reg_rtx (tmode0);
35825 scratch1 = gen_reg_rtx (tmode1);
35827 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35830 if (! pat)
35831 return 0;
35833 emit_insn (pat);
35835 if (d->flag)
35837 target = gen_reg_rtx (SImode);
35838 emit_move_insn (target, const0_rtx);
35839 target = gen_rtx_SUBREG (QImode, target, 0);
35841 emit_insn
35842 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35843 gen_rtx_fmt_ee (EQ, QImode,
35844 gen_rtx_REG ((machine_mode) d->flag,
35845 FLAGS_REG),
35846 const0_rtx)));
35847 return SUBREG_REG (target);
35849 else
35850 return target;
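/* Added note (exposition only): the three branches above correspond to the
   three pcmpestr result flavors: IX86_BUILTIN_PCMPESTRI128 returns the
   index result (operand 0), IX86_BUILTIN_PCMPESTRM128 returns the mask
   result (operand 1), and the remaining builtins (d->flag nonzero) return
   a single condition bit read back from FLAGS_REG.  */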
35854 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
35856 static rtx
35857 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35858 tree exp, rtx target)
35860 rtx pat;
35861 tree arg0 = CALL_EXPR_ARG (exp, 0);
35862 tree arg1 = CALL_EXPR_ARG (exp, 1);
35863 tree arg2 = CALL_EXPR_ARG (exp, 2);
35864 rtx scratch0, scratch1;
35865 rtx op0 = expand_normal (arg0);
35866 rtx op1 = expand_normal (arg1);
35867 rtx op2 = expand_normal (arg2);
35868 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35870 tmode0 = insn_data[d->icode].operand[0].mode;
35871 tmode1 = insn_data[d->icode].operand[1].mode;
35872 modev2 = insn_data[d->icode].operand[2].mode;
35873 modev3 = insn_data[d->icode].operand[3].mode;
35874 modeimm = insn_data[d->icode].operand[4].mode;
35876 if (VECTOR_MODE_P (modev2))
35877 op0 = safe_vector_operand (op0, modev2);
35878 if (VECTOR_MODE_P (modev3))
35879 op1 = safe_vector_operand (op1, modev3);
35881 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35882 op0 = copy_to_mode_reg (modev2, op0);
35883 if ((optimize && !register_operand (op1, modev3))
35884 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35885 op1 = copy_to_mode_reg (modev3, op1);
35887 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35889 error ("the third argument must be an 8-bit immediate");
35890 return const0_rtx;
35893 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35895 if (optimize || !target
35896 || GET_MODE (target) != tmode0
35897 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35898 target = gen_reg_rtx (tmode0);
35900 scratch1 = gen_reg_rtx (tmode1);
35902 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35904 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35906 if (optimize || !target
35907 || GET_MODE (target) != tmode1
35908 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35909 target = gen_reg_rtx (tmode1);
35911 scratch0 = gen_reg_rtx (tmode0);
35913 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35915 else
35917 gcc_assert (d->flag);
35919 scratch0 = gen_reg_rtx (tmode0);
35920 scratch1 = gen_reg_rtx (tmode1);
35922 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35925 if (! pat)
35926 return 0;
35928 emit_insn (pat);
35930 if (d->flag)
35932 target = gen_reg_rtx (SImode);
35933 emit_move_insn (target, const0_rtx);
35934 target = gen_rtx_SUBREG (QImode, target, 0);
35936 emit_insn
35937 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35938 gen_rtx_fmt_ee (EQ, QImode,
35939 gen_rtx_REG ((machine_mode) d->flag,
35940 FLAGS_REG),
35941 const0_rtx)));
35942 return SUBREG_REG (target);
35944 else
35945 return target;
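/* Added note (exposition only): pcmpistr is the implicit-length variant,
   so unlike ix86_expand_sse_pcmpestr above there are no separate length
   operands; the index/mask/flag result handling is otherwise the same.  */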
35948 /* Subroutine of ix86_expand_builtin to take care of insns with a
35949 variable number of operands. */
35951 static rtx
35952 ix86_expand_args_builtin (const struct builtin_description *d,
35953 tree exp, rtx target)
35955 rtx pat, real_target;
35956 unsigned int i, nargs;
35957 unsigned int nargs_constant = 0;
35958 unsigned int mask_pos = 0;
35959 int num_memory = 0;
35960 struct
35962 rtx op;
35963 machine_mode mode;
35964 } args[6];
35965 bool second_arg_count = false;
35966 enum insn_code icode = d->icode;
35967 const struct insn_data_d *insn_p = &insn_data[icode];
35968 machine_mode tmode = insn_p->operand[0].mode;
35969 machine_mode rmode = VOIDmode;
35970 bool swap = false;
35971 enum rtx_code comparison = d->comparison;
35973 switch ((enum ix86_builtin_func_type) d->flag)
35975 case V2DF_FTYPE_V2DF_ROUND:
35976 case V4DF_FTYPE_V4DF_ROUND:
35977 case V8DF_FTYPE_V8DF_ROUND:
35978 case V4SF_FTYPE_V4SF_ROUND:
35979 case V8SF_FTYPE_V8SF_ROUND:
35980 case V16SF_FTYPE_V16SF_ROUND:
35981 case V4SI_FTYPE_V4SF_ROUND:
35982 case V8SI_FTYPE_V8SF_ROUND:
35983 case V16SI_FTYPE_V16SF_ROUND:
35984 return ix86_expand_sse_round (d, exp, target);
35985 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35986 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35987 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35988 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35989 case INT_FTYPE_V8SF_V8SF_PTEST:
35990 case INT_FTYPE_V4DI_V4DI_PTEST:
35991 case INT_FTYPE_V4DF_V4DF_PTEST:
35992 case INT_FTYPE_V4SF_V4SF_PTEST:
35993 case INT_FTYPE_V2DI_V2DI_PTEST:
35994 case INT_FTYPE_V2DF_V2DF_PTEST:
35995 return ix86_expand_sse_ptest (d, exp, target);
35996 case FLOAT128_FTYPE_FLOAT128:
35997 case FLOAT_FTYPE_FLOAT:
35998 case INT_FTYPE_INT:
35999 case UINT_FTYPE_UINT:
36000 case UINT16_FTYPE_UINT16:
36001 case UINT64_FTYPE_INT:
36002 case UINT64_FTYPE_UINT64:
36003 case INT64_FTYPE_INT64:
36004 case INT64_FTYPE_V4SF:
36005 case INT64_FTYPE_V2DF:
36006 case INT_FTYPE_V16QI:
36007 case INT_FTYPE_V8QI:
36008 case INT_FTYPE_V8SF:
36009 case INT_FTYPE_V4DF:
36010 case INT_FTYPE_V4SF:
36011 case INT_FTYPE_V2DF:
36012 case INT_FTYPE_V32QI:
36013 case V16QI_FTYPE_V16QI:
36014 case V8SI_FTYPE_V8SF:
36015 case V8SI_FTYPE_V4SI:
36016 case V8HI_FTYPE_V8HI:
36017 case V8HI_FTYPE_V16QI:
36018 case V8QI_FTYPE_V8QI:
36019 case V8SF_FTYPE_V8SF:
36020 case V8SF_FTYPE_V8SI:
36021 case V8SF_FTYPE_V4SF:
36022 case V8SF_FTYPE_V8HI:
36023 case V4SI_FTYPE_V4SI:
36024 case V4SI_FTYPE_V16QI:
36025 case V4SI_FTYPE_V4SF:
36026 case V4SI_FTYPE_V8SI:
36027 case V4SI_FTYPE_V8HI:
36028 case V4SI_FTYPE_V4DF:
36029 case V4SI_FTYPE_V2DF:
36030 case V4HI_FTYPE_V4HI:
36031 case V4DF_FTYPE_V4DF:
36032 case V4DF_FTYPE_V4SI:
36033 case V4DF_FTYPE_V4SF:
36034 case V4DF_FTYPE_V2DF:
36035 case V4SF_FTYPE_V4SF:
36036 case V4SF_FTYPE_V4SI:
36037 case V4SF_FTYPE_V8SF:
36038 case V4SF_FTYPE_V4DF:
36039 case V4SF_FTYPE_V8HI:
36040 case V4SF_FTYPE_V2DF:
36041 case V2DI_FTYPE_V2DI:
36042 case V2DI_FTYPE_V16QI:
36043 case V2DI_FTYPE_V8HI:
36044 case V2DI_FTYPE_V4SI:
36045 case V2DF_FTYPE_V2DF:
36046 case V2DF_FTYPE_V4SI:
36047 case V2DF_FTYPE_V4DF:
36048 case V2DF_FTYPE_V4SF:
36049 case V2DF_FTYPE_V2SI:
36050 case V2SI_FTYPE_V2SI:
36051 case V2SI_FTYPE_V4SF:
36052 case V2SI_FTYPE_V2SF:
36053 case V2SI_FTYPE_V2DF:
36054 case V2SF_FTYPE_V2SF:
36055 case V2SF_FTYPE_V2SI:
36056 case V32QI_FTYPE_V32QI:
36057 case V32QI_FTYPE_V16QI:
36058 case V16HI_FTYPE_V16HI:
36059 case V16HI_FTYPE_V8HI:
36060 case V8SI_FTYPE_V8SI:
36061 case V16HI_FTYPE_V16QI:
36062 case V8SI_FTYPE_V16QI:
36063 case V4DI_FTYPE_V16QI:
36064 case V8SI_FTYPE_V8HI:
36065 case V4DI_FTYPE_V8HI:
36066 case V4DI_FTYPE_V4SI:
36067 case V4DI_FTYPE_V2DI:
36068 case UQI_FTYPE_UQI:
36069 case UHI_FTYPE_UHI:
36070 case USI_FTYPE_USI:
36071 case USI_FTYPE_UQI:
36072 case USI_FTYPE_UHI:
36073 case UDI_FTYPE_UDI:
36074 case UHI_FTYPE_V16QI:
36075 case USI_FTYPE_V32QI:
36076 case UDI_FTYPE_V64QI:
36077 case V16QI_FTYPE_UHI:
36078 case V32QI_FTYPE_USI:
36079 case V64QI_FTYPE_UDI:
36080 case V8HI_FTYPE_UQI:
36081 case V16HI_FTYPE_UHI:
36082 case V32HI_FTYPE_USI:
36083 case V4SI_FTYPE_UQI:
36084 case V8SI_FTYPE_UQI:
36085 case V4SI_FTYPE_UHI:
36086 case V8SI_FTYPE_UHI:
36087 case UQI_FTYPE_V8HI:
36088 case UHI_FTYPE_V16HI:
36089 case USI_FTYPE_V32HI:
36090 case UQI_FTYPE_V4SI:
36091 case UQI_FTYPE_V8SI:
36092 case UHI_FTYPE_V16SI:
36093 case UQI_FTYPE_V2DI:
36094 case UQI_FTYPE_V4DI:
36095 case UQI_FTYPE_V8DI:
36096 case V16SI_FTYPE_UHI:
36097 case V2DI_FTYPE_UQI:
36098 case V4DI_FTYPE_UQI:
36099 case V16SI_FTYPE_INT:
36100 case V16SF_FTYPE_V8SF:
36101 case V16SI_FTYPE_V8SI:
36102 case V16SF_FTYPE_V4SF:
36103 case V16SI_FTYPE_V4SI:
36104 case V16SI_FTYPE_V16SF:
36105 case V16SI_FTYPE_V16SI:
36106 case V16SF_FTYPE_V16SF:
36107 case V8DI_FTYPE_UQI:
36108 case V8DI_FTYPE_V8DI:
36109 case V8DF_FTYPE_V4DF:
36110 case V8DF_FTYPE_V2DF:
36111 case V8DF_FTYPE_V8DF:
36112 nargs = 1;
36113 break;
36114 case V4SF_FTYPE_V4SF_VEC_MERGE:
36115 case V2DF_FTYPE_V2DF_VEC_MERGE:
36116 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
36117 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
36118 case V16QI_FTYPE_V16QI_V16QI:
36119 case V16QI_FTYPE_V8HI_V8HI:
36120 case V16SF_FTYPE_V16SF_V16SF:
36121 case V8QI_FTYPE_V8QI_V8QI:
36122 case V8QI_FTYPE_V4HI_V4HI:
36123 case V8HI_FTYPE_V8HI_V8HI:
36124 case V8HI_FTYPE_V16QI_V16QI:
36125 case V8HI_FTYPE_V4SI_V4SI:
36126 case V8SF_FTYPE_V8SF_V8SF:
36127 case V8SF_FTYPE_V8SF_V8SI:
36128 case V8DF_FTYPE_V8DF_V8DF:
36129 case V4SI_FTYPE_V4SI_V4SI:
36130 case V4SI_FTYPE_V8HI_V8HI:
36131 case V4SI_FTYPE_V2DF_V2DF:
36132 case V4HI_FTYPE_V4HI_V4HI:
36133 case V4HI_FTYPE_V8QI_V8QI:
36134 case V4HI_FTYPE_V2SI_V2SI:
36135 case V4DF_FTYPE_V4DF_V4DF:
36136 case V4DF_FTYPE_V4DF_V4DI:
36137 case V4SF_FTYPE_V4SF_V4SF:
36138 case V4SF_FTYPE_V4SF_V4SI:
36139 case V4SF_FTYPE_V4SF_V2SI:
36140 case V4SF_FTYPE_V4SF_V2DF:
36141 case V4SF_FTYPE_V4SF_UINT:
36142 case V4SF_FTYPE_V4SF_DI:
36143 case V4SF_FTYPE_V4SF_SI:
36144 case V2DI_FTYPE_V2DI_V2DI:
36145 case V2DI_FTYPE_V16QI_V16QI:
36146 case V2DI_FTYPE_V4SI_V4SI:
36147 case V2DI_FTYPE_V2DI_V16QI:
36148 case V2SI_FTYPE_V2SI_V2SI:
36149 case V2SI_FTYPE_V4HI_V4HI:
36150 case V2SI_FTYPE_V2SF_V2SF:
36151 case V2DF_FTYPE_V2DF_V2DF:
36152 case V2DF_FTYPE_V2DF_V4SF:
36153 case V2DF_FTYPE_V2DF_V2DI:
36154 case V2DF_FTYPE_V2DF_DI:
36155 case V2DF_FTYPE_V2DF_SI:
36156 case V2DF_FTYPE_V2DF_UINT:
36157 case V2SF_FTYPE_V2SF_V2SF:
36158 case V1DI_FTYPE_V1DI_V1DI:
36159 case V1DI_FTYPE_V8QI_V8QI:
36160 case V1DI_FTYPE_V2SI_V2SI:
36161 case V32QI_FTYPE_V16HI_V16HI:
36162 case V16HI_FTYPE_V8SI_V8SI:
36163 case V32QI_FTYPE_V32QI_V32QI:
36164 case V16HI_FTYPE_V32QI_V32QI:
36165 case V16HI_FTYPE_V16HI_V16HI:
36166 case V8SI_FTYPE_V4DF_V4DF:
36167 case V8SI_FTYPE_V8SI_V8SI:
36168 case V8SI_FTYPE_V16HI_V16HI:
36169 case V4DI_FTYPE_V4DI_V4DI:
36170 case V4DI_FTYPE_V8SI_V8SI:
36171 case V8DI_FTYPE_V64QI_V64QI:
36172 if (comparison == UNKNOWN)
36173 return ix86_expand_binop_builtin (icode, exp, target);
36174 nargs = 2;
36175 break;
36176 case V4SF_FTYPE_V4SF_V4SF_SWAP:
36177 case V2DF_FTYPE_V2DF_V2DF_SWAP:
36178 gcc_assert (comparison != UNKNOWN);
36179 nargs = 2;
36180 swap = true;
36181 break;
36182 case V16HI_FTYPE_V16HI_V8HI_COUNT:
36183 case V16HI_FTYPE_V16HI_SI_COUNT:
36184 case V8SI_FTYPE_V8SI_V4SI_COUNT:
36185 case V8SI_FTYPE_V8SI_SI_COUNT:
36186 case V4DI_FTYPE_V4DI_V2DI_COUNT:
36187 case V4DI_FTYPE_V4DI_INT_COUNT:
36188 case V8HI_FTYPE_V8HI_V8HI_COUNT:
36189 case V8HI_FTYPE_V8HI_SI_COUNT:
36190 case V4SI_FTYPE_V4SI_V4SI_COUNT:
36191 case V4SI_FTYPE_V4SI_SI_COUNT:
36192 case V4HI_FTYPE_V4HI_V4HI_COUNT:
36193 case V4HI_FTYPE_V4HI_SI_COUNT:
36194 case V2DI_FTYPE_V2DI_V2DI_COUNT:
36195 case V2DI_FTYPE_V2DI_SI_COUNT:
36196 case V2SI_FTYPE_V2SI_V2SI_COUNT:
36197 case V2SI_FTYPE_V2SI_SI_COUNT:
36198 case V1DI_FTYPE_V1DI_V1DI_COUNT:
36199 case V1DI_FTYPE_V1DI_SI_COUNT:
36200 nargs = 2;
36201 second_arg_count = true;
36202 break;
36203 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
36204 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
36205 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
36206 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
36207 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
36208 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
36209 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
36210 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
36211 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
36212 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
36213 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
36214 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
36215 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
36216 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
36217 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
36218 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
36219 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
36220 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
36221 nargs = 4;
36222 second_arg_count = true;
36223 break;
36224 case UINT64_FTYPE_UINT64_UINT64:
36225 case UINT_FTYPE_UINT_UINT:
36226 case UINT_FTYPE_UINT_USHORT:
36227 case UINT_FTYPE_UINT_UCHAR:
36228 case UINT16_FTYPE_UINT16_INT:
36229 case UINT8_FTYPE_UINT8_INT:
36230 case UQI_FTYPE_UQI_UQI:
36231 case UHI_FTYPE_UHI_UHI:
36232 case USI_FTYPE_USI_USI:
36233 case UDI_FTYPE_UDI_UDI:
36234 case V16SI_FTYPE_V8DF_V8DF:
36235 nargs = 2;
36236 break;
36237 case V2DI_FTYPE_V2DI_INT_CONVERT:
36238 nargs = 2;
36239 rmode = V1TImode;
36240 nargs_constant = 1;
36241 break;
36242 case V4DI_FTYPE_V4DI_INT_CONVERT:
36243 nargs = 2;
36244 rmode = V2TImode;
36245 nargs_constant = 1;
36246 break;
36247 case V8DI_FTYPE_V8DI_INT_CONVERT:
36248 nargs = 2;
36249 rmode = V4TImode;
36250 nargs_constant = 1;
36251 break;
36252 case V8HI_FTYPE_V8HI_INT:
36253 case V8HI_FTYPE_V8SF_INT:
36254 case V16HI_FTYPE_V16SF_INT:
36255 case V8HI_FTYPE_V4SF_INT:
36256 case V8SF_FTYPE_V8SF_INT:
36257 case V4SF_FTYPE_V16SF_INT:
36258 case V16SF_FTYPE_V16SF_INT:
36259 case V4SI_FTYPE_V4SI_INT:
36260 case V4SI_FTYPE_V8SI_INT:
36261 case V4HI_FTYPE_V4HI_INT:
36262 case V4DF_FTYPE_V4DF_INT:
36263 case V4DF_FTYPE_V8DF_INT:
36264 case V4SF_FTYPE_V4SF_INT:
36265 case V4SF_FTYPE_V8SF_INT:
36266 case V2DI_FTYPE_V2DI_INT:
36267 case V2DF_FTYPE_V2DF_INT:
36268 case V2DF_FTYPE_V4DF_INT:
36269 case V16HI_FTYPE_V16HI_INT:
36270 case V8SI_FTYPE_V8SI_INT:
36271 case V16SI_FTYPE_V16SI_INT:
36272 case V4SI_FTYPE_V16SI_INT:
36273 case V4DI_FTYPE_V4DI_INT:
36274 case V2DI_FTYPE_V4DI_INT:
36275 case V4DI_FTYPE_V8DI_INT:
36276 case QI_FTYPE_V4SF_INT:
36277 case QI_FTYPE_V2DF_INT:
36278 case UQI_FTYPE_UQI_UQI_CONST:
36279 case UHI_FTYPE_UHI_UQI:
36280 case USI_FTYPE_USI_UQI:
36281 case UDI_FTYPE_UDI_UQI:
36282 nargs = 2;
36283 nargs_constant = 1;
36284 break;
36285 case V16QI_FTYPE_V16QI_V16QI_V16QI:
36286 case V8SF_FTYPE_V8SF_V8SF_V8SF:
36287 case V4DF_FTYPE_V4DF_V4DF_V4DF:
36288 case V4SF_FTYPE_V4SF_V4SF_V4SF:
36289 case V2DF_FTYPE_V2DF_V2DF_V2DF:
36290 case V32QI_FTYPE_V32QI_V32QI_V32QI:
36291 case UHI_FTYPE_V16SI_V16SI_UHI:
36292 case UQI_FTYPE_V8DI_V8DI_UQI:
36293 case V16HI_FTYPE_V16SI_V16HI_UHI:
36294 case V16QI_FTYPE_V16SI_V16QI_UHI:
36295 case V16QI_FTYPE_V8DI_V16QI_UQI:
36296 case V16SF_FTYPE_V16SF_V16SF_UHI:
36297 case V16SF_FTYPE_V4SF_V16SF_UHI:
36298 case V16SI_FTYPE_SI_V16SI_UHI:
36299 case V16SI_FTYPE_V16HI_V16SI_UHI:
36300 case V16SI_FTYPE_V16QI_V16SI_UHI:
36301 case V8SF_FTYPE_V4SF_V8SF_UQI:
36302 case V4DF_FTYPE_V2DF_V4DF_UQI:
36303 case V8SI_FTYPE_V4SI_V8SI_UQI:
36304 case V8SI_FTYPE_SI_V8SI_UQI:
36305 case V4SI_FTYPE_V4SI_V4SI_UQI:
36306 case V4SI_FTYPE_SI_V4SI_UQI:
36307 case V4DI_FTYPE_V2DI_V4DI_UQI:
36308 case V4DI_FTYPE_DI_V4DI_UQI:
36309 case V2DI_FTYPE_V2DI_V2DI_UQI:
36310 case V2DI_FTYPE_DI_V2DI_UQI:
36311 case V64QI_FTYPE_V64QI_V64QI_UDI:
36312 case V64QI_FTYPE_V16QI_V64QI_UDI:
36313 case V64QI_FTYPE_QI_V64QI_UDI:
36314 case V32QI_FTYPE_V32QI_V32QI_USI:
36315 case V32QI_FTYPE_V16QI_V32QI_USI:
36316 case V32QI_FTYPE_QI_V32QI_USI:
36317 case V16QI_FTYPE_V16QI_V16QI_UHI:
36318 case V16QI_FTYPE_QI_V16QI_UHI:
36319 case V32HI_FTYPE_V8HI_V32HI_USI:
36320 case V32HI_FTYPE_HI_V32HI_USI:
36321 case V16HI_FTYPE_V8HI_V16HI_UHI:
36322 case V16HI_FTYPE_HI_V16HI_UHI:
36323 case V8HI_FTYPE_V8HI_V8HI_UQI:
36324 case V8HI_FTYPE_HI_V8HI_UQI:
36325 case V8SF_FTYPE_V8HI_V8SF_UQI:
36326 case V4SF_FTYPE_V8HI_V4SF_UQI:
36327 case V8SI_FTYPE_V8SF_V8SI_UQI:
36328 case V4SI_FTYPE_V4SF_V4SI_UQI:
36329 case V4DI_FTYPE_V4SF_V4DI_UQI:
36330 case V2DI_FTYPE_V4SF_V2DI_UQI:
36331 case V4SF_FTYPE_V4DI_V4SF_UQI:
36332 case V4SF_FTYPE_V2DI_V4SF_UQI:
36333 case V4DF_FTYPE_V4DI_V4DF_UQI:
36334 case V2DF_FTYPE_V2DI_V2DF_UQI:
36335 case V16QI_FTYPE_V8HI_V16QI_UQI:
36336 case V16QI_FTYPE_V16HI_V16QI_UHI:
36337 case V16QI_FTYPE_V4SI_V16QI_UQI:
36338 case V16QI_FTYPE_V8SI_V16QI_UQI:
36339 case V8HI_FTYPE_V4SI_V8HI_UQI:
36340 case V8HI_FTYPE_V8SI_V8HI_UQI:
36341 case V16QI_FTYPE_V2DI_V16QI_UQI:
36342 case V16QI_FTYPE_V4DI_V16QI_UQI:
36343 case V8HI_FTYPE_V2DI_V8HI_UQI:
36344 case V8HI_FTYPE_V4DI_V8HI_UQI:
36345 case V4SI_FTYPE_V2DI_V4SI_UQI:
36346 case V4SI_FTYPE_V4DI_V4SI_UQI:
36347 case V32QI_FTYPE_V32HI_V32QI_USI:
36348 case UHI_FTYPE_V16QI_V16QI_UHI:
36349 case USI_FTYPE_V32QI_V32QI_USI:
36350 case UDI_FTYPE_V64QI_V64QI_UDI:
36351 case UQI_FTYPE_V8HI_V8HI_UQI:
36352 case UHI_FTYPE_V16HI_V16HI_UHI:
36353 case USI_FTYPE_V32HI_V32HI_USI:
36354 case UQI_FTYPE_V4SI_V4SI_UQI:
36355 case UQI_FTYPE_V8SI_V8SI_UQI:
36356 case UQI_FTYPE_V2DI_V2DI_UQI:
36357 case UQI_FTYPE_V4DI_V4DI_UQI:
36358 case V4SF_FTYPE_V2DF_V4SF_UQI:
36359 case V4SF_FTYPE_V4DF_V4SF_UQI:
36360 case V16SI_FTYPE_V16SI_V16SI_UHI:
36361 case V16SI_FTYPE_V4SI_V16SI_UHI:
36362 case V2DI_FTYPE_V4SI_V2DI_UQI:
36363 case V2DI_FTYPE_V8HI_V2DI_UQI:
36364 case V2DI_FTYPE_V16QI_V2DI_UQI:
36365 case V4DI_FTYPE_V4DI_V4DI_UQI:
36366 case V4DI_FTYPE_V4SI_V4DI_UQI:
36367 case V4DI_FTYPE_V8HI_V4DI_UQI:
36368 case V4DI_FTYPE_V16QI_V4DI_UQI:
36369 case V4DI_FTYPE_V4DF_V4DI_UQI:
36370 case V2DI_FTYPE_V2DF_V2DI_UQI:
36371 case V4SI_FTYPE_V4DF_V4SI_UQI:
36372 case V4SI_FTYPE_V2DF_V4SI_UQI:
36373 case V4SI_FTYPE_V8HI_V4SI_UQI:
36374 case V4SI_FTYPE_V16QI_V4SI_UQI:
36375 case V4DI_FTYPE_V4DI_V4DI_V4DI:
36376 case V8DF_FTYPE_V2DF_V8DF_UQI:
36377 case V8DF_FTYPE_V4DF_V8DF_UQI:
36378 case V8DF_FTYPE_V8DF_V8DF_UQI:
36379 case V8SF_FTYPE_V8SF_V8SF_UQI:
36380 case V8SF_FTYPE_V8SI_V8SF_UQI:
36381 case V4DF_FTYPE_V4DF_V4DF_UQI:
36382 case V4SF_FTYPE_V4SF_V4SF_UQI:
36383 case V2DF_FTYPE_V2DF_V2DF_UQI:
36384 case V2DF_FTYPE_V4SF_V2DF_UQI:
36385 case V2DF_FTYPE_V4SI_V2DF_UQI:
36386 case V4SF_FTYPE_V4SI_V4SF_UQI:
36387 case V4DF_FTYPE_V4SF_V4DF_UQI:
36388 case V4DF_FTYPE_V4SI_V4DF_UQI:
36389 case V8SI_FTYPE_V8SI_V8SI_UQI:
36390 case V8SI_FTYPE_V8HI_V8SI_UQI:
36391 case V8SI_FTYPE_V16QI_V8SI_UQI:
36392 case V8DF_FTYPE_V8SI_V8DF_UQI:
36393 case V8DI_FTYPE_DI_V8DI_UQI:
36394 case V16SF_FTYPE_V8SF_V16SF_UHI:
36395 case V16SI_FTYPE_V8SI_V16SI_UHI:
36396 case V16HI_FTYPE_V16HI_V16HI_UHI:
36397 case V8HI_FTYPE_V16QI_V8HI_UQI:
36398 case V16HI_FTYPE_V16QI_V16HI_UHI:
36399 case V32HI_FTYPE_V32HI_V32HI_USI:
36400 case V32HI_FTYPE_V32QI_V32HI_USI:
36401 case V8DI_FTYPE_V16QI_V8DI_UQI:
36402 case V8DI_FTYPE_V2DI_V8DI_UQI:
36403 case V8DI_FTYPE_V4DI_V8DI_UQI:
36404 case V8DI_FTYPE_V8DI_V8DI_UQI:
36405 case V8DI_FTYPE_V8HI_V8DI_UQI:
36406 case V8DI_FTYPE_V8SI_V8DI_UQI:
36407 case V8HI_FTYPE_V8DI_V8HI_UQI:
36408 case V8SI_FTYPE_V8DI_V8SI_UQI:
36409 case V4SI_FTYPE_V4SI_V4SI_V4SI:
36410 nargs = 3;
36411 break;
36412 case V32QI_FTYPE_V32QI_V32QI_INT:
36413 case V16HI_FTYPE_V16HI_V16HI_INT:
36414 case V16QI_FTYPE_V16QI_V16QI_INT:
36415 case V4DI_FTYPE_V4DI_V4DI_INT:
36416 case V8HI_FTYPE_V8HI_V8HI_INT:
36417 case V8SI_FTYPE_V8SI_V8SI_INT:
36418 case V8SI_FTYPE_V8SI_V4SI_INT:
36419 case V8SF_FTYPE_V8SF_V8SF_INT:
36420 case V8SF_FTYPE_V8SF_V4SF_INT:
36421 case V4SI_FTYPE_V4SI_V4SI_INT:
36422 case V4DF_FTYPE_V4DF_V4DF_INT:
36423 case V16SF_FTYPE_V16SF_V16SF_INT:
36424 case V16SF_FTYPE_V16SF_V4SF_INT:
36425 case V16SI_FTYPE_V16SI_V4SI_INT:
36426 case V4DF_FTYPE_V4DF_V2DF_INT:
36427 case V4SF_FTYPE_V4SF_V4SF_INT:
36428 case V2DI_FTYPE_V2DI_V2DI_INT:
36429 case V4DI_FTYPE_V4DI_V2DI_INT:
36430 case V2DF_FTYPE_V2DF_V2DF_INT:
36431 case UQI_FTYPE_V8DI_V8UDI_INT:
36432 case UQI_FTYPE_V8DF_V8DF_INT:
36433 case UQI_FTYPE_V2DF_V2DF_INT:
36434 case UQI_FTYPE_V4SF_V4SF_INT:
36435 case UHI_FTYPE_V16SI_V16SI_INT:
36436 case UHI_FTYPE_V16SF_V16SF_INT:
36437 nargs = 3;
36438 nargs_constant = 1;
36439 break;
36440 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
36441 nargs = 3;
36442 rmode = V4DImode;
36443 nargs_constant = 1;
36444 break;
36445 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
36446 nargs = 3;
36447 rmode = V2DImode;
36448 nargs_constant = 1;
36449 break;
36450 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
36451 nargs = 3;
36452 rmode = DImode;
36453 nargs_constant = 1;
36454 break;
36455 case V2DI_FTYPE_V2DI_UINT_UINT:
36456 nargs = 3;
36457 nargs_constant = 2;
36458 break;
36459 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
36460 nargs = 3;
36461 rmode = V8DImode;
36462 nargs_constant = 1;
36463 break;
36464 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
36465 nargs = 5;
36466 rmode = V8DImode;
36467 mask_pos = 2;
36468 nargs_constant = 1;
36469 break;
36470 case QI_FTYPE_V8DF_INT_UQI:
36471 case QI_FTYPE_V4DF_INT_UQI:
36472 case QI_FTYPE_V2DF_INT_UQI:
36473 case HI_FTYPE_V16SF_INT_UHI:
36474 case QI_FTYPE_V8SF_INT_UQI:
36475 case QI_FTYPE_V4SF_INT_UQI:
36476 nargs = 3;
36477 mask_pos = 1;
36478 nargs_constant = 1;
36479 break;
36480 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36481 nargs = 5;
36482 rmode = V4DImode;
36483 mask_pos = 2;
36484 nargs_constant = 1;
36485 break;
36486 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36487 nargs = 5;
36488 rmode = V2DImode;
36489 mask_pos = 2;
36490 nargs_constant = 1;
36491 break;
36492 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36493 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36494 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36495 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36496 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36497 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36498 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36499 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36500 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36501 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36502 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36503 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36504 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36505 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36506 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36507 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36508 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36509 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36510 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36511 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36512 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36513 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36514 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36515 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36516 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36517 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36518 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36519 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36520 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36521 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36522 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36523 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36524 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36525 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36526 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36527 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36528 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36529 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36530 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36531 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36532 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36533 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36534 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36535 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36536 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36537 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36538 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36539 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36540 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36541 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36542 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36543 nargs = 4;
36544 break;
36545 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36546 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36547 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36548 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36549 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36550 nargs = 4;
36551 nargs_constant = 1;
36552 break;
36553 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36554 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36555 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36556 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36557 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36558 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36559 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36560 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36561 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36562 case USI_FTYPE_V32QI_V32QI_INT_USI:
36563 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36564 case USI_FTYPE_V32HI_V32HI_INT_USI:
36565 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36566 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36567 nargs = 4;
36568 mask_pos = 1;
36569 nargs_constant = 1;
36570 break;
36571 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36572 nargs = 4;
36573 nargs_constant = 2;
36574 break;
36575 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36576 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36577 nargs = 4;
36578 break;
36579 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36580 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36581 mask_pos = 1;
36582 nargs = 4;
36583 nargs_constant = 1;
36584 break;
36585 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36586 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36587 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36588 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36589 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36590 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36591 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36592 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36593 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36594 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36595 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36596 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36597 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36598 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36599 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36600 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36601 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36602 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36603 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36604 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36605 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36606 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36607 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36608 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36609 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36610 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36611 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36612 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36613 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36614 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36615 nargs = 4;
36616 mask_pos = 2;
36617 nargs_constant = 1;
36618 break;
36619 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36620 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36621 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36622 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36623 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36624 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36625 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36626 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36627 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36628 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36629 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36630 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36631 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36632 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36633 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36634 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36635 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36636 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36637 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36638 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36639 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36640 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36641 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36642 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36643 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36644 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36645 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36646 nargs = 5;
36647 mask_pos = 2;
36648 nargs_constant = 1;
36649 break;
36650 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36651 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36652 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36653 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36654 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36655 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36656 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36657 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36658 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36659 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36660 nargs = 5;
36661 mask_pos = 1;
36662 nargs_constant = 1;
36663 break;
36665 default:
36666 gcc_unreachable ();
36669 gcc_assert (nargs <= ARRAY_SIZE (args));
36671 if (comparison != UNKNOWN)
36673 gcc_assert (nargs == 2);
36674 return ix86_expand_sse_compare (d, exp, target, swap);
36677 if (rmode == VOIDmode || rmode == tmode)
36679 if (optimize
36680 || target == 0
36681 || GET_MODE (target) != tmode
36682 || !insn_p->operand[0].predicate (target, tmode))
36683 target = gen_reg_rtx (tmode);
36684 else if (memory_operand (target, tmode))
36685 num_memory++;
36686 real_target = target;
36688 else
36690 real_target = gen_reg_rtx (tmode);
36691 target = lowpart_subreg (rmode, real_target, tmode);
36694 for (i = 0; i < nargs; i++)
36696 tree arg = CALL_EXPR_ARG (exp, i);
36697 rtx op = expand_normal (arg);
36698 machine_mode mode = insn_p->operand[i + 1].mode;
36699 bool match = insn_p->operand[i + 1].predicate (op, mode);
36701 if (second_arg_count && i == 1)
36703 /* SIMD shift insns take either an 8-bit immediate or a
36704 register as the shift count, but the builtin functions
36705 take an int.  If the count does not satisfy the operand
36706 predicate, put it in a register.  The instructions use a
36707 64-bit count; if op is only 32-bit, zero-extend it, since
36708 negative shift counts are undefined behavior and
36709 zero-extension is more efficient.  */
36710 if (!match)
36712 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36713 op = convert_modes (mode, GET_MODE (op), op, 1);
36714 else
36715 op = lowpart_subreg (mode, op, GET_MODE (op));
36716 if (!insn_p->operand[i + 1].predicate (op, mode))
36717 op = copy_to_reg (op);
36720 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36721 || (!mask_pos && (nargs - i) <= nargs_constant))
36723 if (!match)
36724 switch (icode)
36726 case CODE_FOR_avx_vinsertf128v4di:
36727 case CODE_FOR_avx_vextractf128v4di:
36728 error ("the last argument must be a 1-bit immediate");
36729 return const0_rtx;
36731 case CODE_FOR_avx512f_cmpv8di3_mask:
36732 case CODE_FOR_avx512f_cmpv16si3_mask:
36733 case CODE_FOR_avx512f_ucmpv8di3_mask:
36734 case CODE_FOR_avx512f_ucmpv16si3_mask:
36735 case CODE_FOR_avx512vl_cmpv4di3_mask:
36736 case CODE_FOR_avx512vl_cmpv8si3_mask:
36737 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36738 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36739 case CODE_FOR_avx512vl_cmpv2di3_mask:
36740 case CODE_FOR_avx512vl_cmpv4si3_mask:
36741 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36742 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36743 error ("the last argument must be a 3-bit immediate");
36744 return const0_rtx;
36746 case CODE_FOR_sse4_1_roundsd:
36747 case CODE_FOR_sse4_1_roundss:
36749 case CODE_FOR_sse4_1_roundpd:
36750 case CODE_FOR_sse4_1_roundps:
36751 case CODE_FOR_avx_roundpd256:
36752 case CODE_FOR_avx_roundps256:
36754 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36755 case CODE_FOR_sse4_1_roundps_sfix:
36756 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36757 case CODE_FOR_avx_roundps_sfix256:
36759 case CODE_FOR_sse4_1_blendps:
36760 case CODE_FOR_avx_blendpd256:
36761 case CODE_FOR_avx_vpermilv4df:
36762 case CODE_FOR_avx_vpermilv4df_mask:
36763 case CODE_FOR_avx512f_getmantv8df_mask:
36764 case CODE_FOR_avx512f_getmantv16sf_mask:
36765 case CODE_FOR_avx512vl_getmantv8sf_mask:
36766 case CODE_FOR_avx512vl_getmantv4df_mask:
36767 case CODE_FOR_avx512vl_getmantv4sf_mask:
36768 case CODE_FOR_avx512vl_getmantv2df_mask:
36769 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36770 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36771 case CODE_FOR_avx512dq_rangepv4df_mask:
36772 case CODE_FOR_avx512dq_rangepv8sf_mask:
36773 case CODE_FOR_avx512dq_rangepv2df_mask:
36774 case CODE_FOR_avx512dq_rangepv4sf_mask:
36775 case CODE_FOR_avx_shufpd256_mask:
36776 error ("the last argument must be a 4-bit immediate");
36777 return const0_rtx;
36779 case CODE_FOR_sha1rnds4:
36780 case CODE_FOR_sse4_1_blendpd:
36781 case CODE_FOR_avx_vpermilv2df:
36782 case CODE_FOR_avx_vpermilv2df_mask:
36783 case CODE_FOR_xop_vpermil2v2df3:
36784 case CODE_FOR_xop_vpermil2v4sf3:
36785 case CODE_FOR_xop_vpermil2v4df3:
36786 case CODE_FOR_xop_vpermil2v8sf3:
36787 case CODE_FOR_avx512f_vinsertf32x4_mask:
36788 case CODE_FOR_avx512f_vinserti32x4_mask:
36789 case CODE_FOR_avx512f_vextractf32x4_mask:
36790 case CODE_FOR_avx512f_vextracti32x4_mask:
36791 case CODE_FOR_sse2_shufpd:
36792 case CODE_FOR_sse2_shufpd_mask:
36793 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36794 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36795 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36796 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36797 error ("the last argument must be a 2-bit immediate");
36798 return const0_rtx;
36800 case CODE_FOR_avx_vextractf128v4df:
36801 case CODE_FOR_avx_vextractf128v8sf:
36802 case CODE_FOR_avx_vextractf128v8si:
36803 case CODE_FOR_avx_vinsertf128v4df:
36804 case CODE_FOR_avx_vinsertf128v8sf:
36805 case CODE_FOR_avx_vinsertf128v8si:
36806 case CODE_FOR_avx512f_vinsertf64x4_mask:
36807 case CODE_FOR_avx512f_vinserti64x4_mask:
36808 case CODE_FOR_avx512f_vextractf64x4_mask:
36809 case CODE_FOR_avx512f_vextracti64x4_mask:
36810 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36811 case CODE_FOR_avx512dq_vinserti32x8_mask:
36812 case CODE_FOR_avx512vl_vinsertv4df:
36813 case CODE_FOR_avx512vl_vinsertv4di:
36814 case CODE_FOR_avx512vl_vinsertv8sf:
36815 case CODE_FOR_avx512vl_vinsertv8si:
36816 error ("the last argument must be a 1-bit immediate");
36817 return const0_rtx;
36819 case CODE_FOR_avx_vmcmpv2df3:
36820 case CODE_FOR_avx_vmcmpv4sf3:
36821 case CODE_FOR_avx_cmpv2df3:
36822 case CODE_FOR_avx_cmpv4sf3:
36823 case CODE_FOR_avx_cmpv4df3:
36824 case CODE_FOR_avx_cmpv8sf3:
36825 case CODE_FOR_avx512f_cmpv8df3_mask:
36826 case CODE_FOR_avx512f_cmpv16sf3_mask:
36827 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36828 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36829 error ("the last argument must be a 5-bit immediate");
36830 return const0_rtx;
36832 default:
36833 switch (nargs_constant)
36835 case 2:
36836 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36837 || (!mask_pos && (nargs - i) == nargs_constant))
36839 error ("the next to last argument must be an 8-bit immediate");
36840 break;
36842 /* FALLTHRU */
36843 case 1:
36844 error ("the last argument must be an 8-bit immediate");
36845 break;
36846 default:
36847 gcc_unreachable ();
36849 return const0_rtx;
36852 else
36854 if (VECTOR_MODE_P (mode))
36855 op = safe_vector_operand (op, mode);
36857 /* If we aren't optimizing, only allow one memory operand to
36858 be generated. */
36859 if (memory_operand (op, mode))
36860 num_memory++;
36862 op = fixup_modeless_constant (op, mode);
36864 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36866 if (optimize || !match || num_memory > 1)
36867 op = copy_to_mode_reg (mode, op);
36869 else
36871 op = copy_to_reg (op);
36872 op = lowpart_subreg (mode, op, GET_MODE (op));
36876 args[i].op = op;
36877 args[i].mode = mode;
36880 switch (nargs)
36882 case 1:
36883 pat = GEN_FCN (icode) (real_target, args[0].op);
36884 break;
36885 case 2:
36886 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36887 break;
36888 case 3:
36889 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36890 args[2].op);
36891 break;
36892 case 4:
36893 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36894 args[2].op, args[3].op);
36895 break;
36896 case 5:
36897 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36898 args[2].op, args[3].op, args[4].op);
36899 break;
36900 case 6:
36901 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36902 args[2].op, args[3].op, args[4].op,
36903 args[5].op);
36904 break;
36905 default:
36906 gcc_unreachable ();
36909 if (! pat)
36910 return 0;
36912 emit_insn (pat);
36913 return target;
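/* Added worked example (exposition only): for a shape such as
   V8SF_FTYPE_V8SF_INT_V8SF_UQI the switch above sets nargs = 4,
   mask_pos = 2 and nargs_constant = 1, so in the operand loop the test
   (nargs - i - mask_pos) == nargs_constant holds only for i == 1: the
   second argument is the one required to be an immediate, while the
   trailing merge source and mask are ordinary register/memory operands.  */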
36916 /* Transform a pattern of the following layout:
36917 (set A
36918 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
36920 into:
36921 (set A B) */
36923 static rtx
36924 ix86_erase_embedded_rounding (rtx pat)
36926 if (GET_CODE (pat) == INSN)
36927 pat = PATTERN (pat);
36929 gcc_assert (GET_CODE (pat) == SET);
36930 rtx src = SET_SRC (pat);
36931 gcc_assert (XVECLEN (src, 0) == 2);
36932 rtx p0 = XVECEXP (src, 0, 0);
36933 gcc_assert (GET_CODE (src) == UNSPEC
36934 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
36935 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
36936 return res;
36939 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36940 with rounding. */
36941 static rtx
36942 ix86_expand_sse_comi_round (const struct builtin_description *d,
36943 tree exp, rtx target)
36945 rtx pat, set_dst;
36946 tree arg0 = CALL_EXPR_ARG (exp, 0);
36947 tree arg1 = CALL_EXPR_ARG (exp, 1);
36948 tree arg2 = CALL_EXPR_ARG (exp, 2);
36949 tree arg3 = CALL_EXPR_ARG (exp, 3);
36950 rtx op0 = expand_normal (arg0);
36951 rtx op1 = expand_normal (arg1);
36952 rtx op2 = expand_normal (arg2);
36953 rtx op3 = expand_normal (arg3);
36954 enum insn_code icode = d->icode;
36955 const struct insn_data_d *insn_p = &insn_data[icode];
36956 machine_mode mode0 = insn_p->operand[0].mode;
36957 machine_mode mode1 = insn_p->operand[1].mode;
36958 enum rtx_code comparison = UNEQ;
36959 bool need_ucomi = false;
36961 /* See avxintrin.h for values. */
36962 enum rtx_code comi_comparisons[32] =
36964 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36965 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36966 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36968 bool need_ucomi_values[32] =
36970 true, false, false, true, true, false, false, true,
36971 true, false, false, true, true, false, false, true,
36972 false, true, true, false, false, true, true, false,
36973 false, true, true, false, false, true, true, false
36976 if (!CONST_INT_P (op2))
36978 error ("the third argument must be a comparison constant");
36979 return const0_rtx;
36981 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36983 error ("incorrect comparison mode");
36984 return const0_rtx;
36987 if (!insn_p->operand[2].predicate (op3, SImode))
36989 error ("incorrect rounding operand");
36990 return const0_rtx;
36993 comparison = comi_comparisons[INTVAL (op2)];
36994 need_ucomi = need_ucomi_values[INTVAL (op2)];
36996 if (VECTOR_MODE_P (mode0))
36997 op0 = safe_vector_operand (op0, mode0);
36998 if (VECTOR_MODE_P (mode1))
36999 op1 = safe_vector_operand (op1, mode1);
37001 target = gen_reg_rtx (SImode);
37002 emit_move_insn (target, const0_rtx);
37003 target = gen_rtx_SUBREG (QImode, target, 0);
37005 if ((optimize && !register_operand (op0, mode0))
37006 || !insn_p->operand[0].predicate (op0, mode0))
37007 op0 = copy_to_mode_reg (mode0, op0);
37008 if ((optimize && !register_operand (op1, mode1))
37009 || !insn_p->operand[1].predicate (op1, mode1))
37010 op1 = copy_to_mode_reg (mode1, op1);
37012 if (need_ucomi)
37013 icode = icode == CODE_FOR_sse_comi_round
37014 ? CODE_FOR_sse_ucomi_round
37015 : CODE_FOR_sse2_ucomi_round;
37017 pat = GEN_FCN (icode) (op0, op1, op3);
37018 if (! pat)
37019 return 0;
37021 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
37022 if (INTVAL (op3) == NO_ROUND)
37024 pat = ix86_erase_embedded_rounding (pat);
37025 if (! pat)
37026 return 0;
37028 set_dst = SET_DEST (pat);
37030 else
37032 gcc_assert (GET_CODE (pat) == SET);
37033 set_dst = SET_DEST (pat);
37036 emit_insn (pat);
37037 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
37038 gen_rtx_fmt_ee (comparison, QImode,
37039 set_dst,
37040 const0_rtx)));
37042 return SUBREG_REG (target);
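/* Added note (exposition only): the two tables above are indexed by the
   _CMP_* predicate value from avxintrin.h.  Predicate 0 (_CMP_EQ_OQ), for
   example, selects comi_comparisons[0] == UNEQ with
   need_ucomi_values[0] == true, so the quiet (non-signalling) ucomi
   pattern is substituted before the flag bit is extracted.  */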
37045 static rtx
37046 ix86_expand_round_builtin (const struct builtin_description *d,
37047 tree exp, rtx target)
37049 rtx pat;
37050 unsigned int i, nargs;
37051 struct
37053 rtx op;
37054 machine_mode mode;
37055 } args[6];
37056 enum insn_code icode = d->icode;
37057 const struct insn_data_d *insn_p = &insn_data[icode];
37058 machine_mode tmode = insn_p->operand[0].mode;
37059 unsigned int nargs_constant = 0;
37060 unsigned int redundant_embed_rnd = 0;
37062 switch ((enum ix86_builtin_func_type) d->flag)
37064 case UINT64_FTYPE_V2DF_INT:
37065 case UINT64_FTYPE_V4SF_INT:
37066 case UINT_FTYPE_V2DF_INT:
37067 case UINT_FTYPE_V4SF_INT:
37068 case INT64_FTYPE_V2DF_INT:
37069 case INT64_FTYPE_V4SF_INT:
37070 case INT_FTYPE_V2DF_INT:
37071 case INT_FTYPE_V4SF_INT:
37072 nargs = 2;
37073 break;
37074 case V4SF_FTYPE_V4SF_UINT_INT:
37075 case V4SF_FTYPE_V4SF_UINT64_INT:
37076 case V2DF_FTYPE_V2DF_UINT64_INT:
37077 case V4SF_FTYPE_V4SF_INT_INT:
37078 case V4SF_FTYPE_V4SF_INT64_INT:
37079 case V2DF_FTYPE_V2DF_INT64_INT:
37080 case V4SF_FTYPE_V4SF_V4SF_INT:
37081 case V2DF_FTYPE_V2DF_V2DF_INT:
37082 case V4SF_FTYPE_V4SF_V2DF_INT:
37083 case V2DF_FTYPE_V2DF_V4SF_INT:
37084 nargs = 3;
37085 break;
37086 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
37087 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
37088 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
37089 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
37090 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
37091 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
37092 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
37093 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
37094 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
37095 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
37096 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
37097 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
37098 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
37099 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
37100 nargs = 4;
37101 break;
37102 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
37103 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
37104 nargs_constant = 2;
37105 nargs = 4;
37106 break;
37107 case INT_FTYPE_V4SF_V4SF_INT_INT:
37108 case INT_FTYPE_V2DF_V2DF_INT_INT:
37109 return ix86_expand_sse_comi_round (d, exp, target);
37110 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
37111 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
37112 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
37113 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
37114 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
37115 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
37116 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
37117 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
37118 nargs = 5;
37119 break;
37120 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
37121 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
37122 nargs_constant = 4;
37123 nargs = 5;
37124 break;
37125 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
37126 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
37127 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
37128 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
37129 nargs_constant = 3;
37130 nargs = 5;
37131 break;
37132 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
37133 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
37134 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
37135 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
37136 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
37137 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
37138 nargs = 6;
37139 nargs_constant = 4;
37140 break;
37141 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
37142 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
37143 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
37144 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
37145 nargs = 6;
37146 nargs_constant = 3;
37147 break;
37148 default:
37149 gcc_unreachable ();
37151 gcc_assert (nargs <= ARRAY_SIZE (args));
37153 if (optimize
37154 || target == 0
37155 || GET_MODE (target) != tmode
37156 || !insn_p->operand[0].predicate (target, tmode))
37157 target = gen_reg_rtx (tmode);
37159 for (i = 0; i < nargs; i++)
37161 tree arg = CALL_EXPR_ARG (exp, i);
37162 rtx op = expand_normal (arg);
37163 machine_mode mode = insn_p->operand[i + 1].mode;
37164 bool match = insn_p->operand[i + 1].predicate (op, mode);
37166 if (i == nargs - nargs_constant)
37168 if (!match)
37170 switch (icode)
37172 case CODE_FOR_avx512f_getmantv8df_mask_round:
37173 case CODE_FOR_avx512f_getmantv16sf_mask_round:
37174 case CODE_FOR_avx512f_vgetmantv2df_round:
37175 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
37176 case CODE_FOR_avx512f_vgetmantv4sf_round:
37177 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
37178 error ("the immediate argument must be a 4-bit immediate");
37179 return const0_rtx;
37180 case CODE_FOR_avx512f_cmpv8df3_mask_round:
37181 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
37182 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
37183 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
37184 error ("the immediate argument must be a 5-bit immediate");
37185 return const0_rtx;
37186 default:
37187 error ("the immediate argument must be an 8-bit immediate");
37188 return const0_rtx;
37192 else if (i == nargs-1)
37194 if (!insn_p->operand[nargs].predicate (op, SImode))
37196 error ("incorrect rounding operand");
37197 return const0_rtx;
37200 /* If there is no rounding, use the normal version of the pattern. */
37201 if (INTVAL (op) == NO_ROUND)
37202 redundant_embed_rnd = 1;
37204 else
37206 if (VECTOR_MODE_P (mode))
37207 op = safe_vector_operand (op, mode);
37209 op = fixup_modeless_constant (op, mode);
37211 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37213 if (optimize || !match)
37214 op = copy_to_mode_reg (mode, op);
37216 else
37218 op = copy_to_reg (op);
37219 op = lowpart_subreg (mode, op, GET_MODE (op));
37223 args[i].op = op;
37224 args[i].mode = mode;
37227 switch (nargs)
37229 case 1:
37230 pat = GEN_FCN (icode) (target, args[0].op);
37231 break;
37232 case 2:
37233 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37234 break;
37235 case 3:
37236 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37237 args[2].op);
37238 break;
37239 case 4:
37240 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37241 args[2].op, args[3].op);
37242 break;
37243 case 5:
37244 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37245 args[2].op, args[3].op, args[4].op);
37246 break;
37247 case 6:
37248 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
37249 args[2].op, args[3].op, args[4].op,
37250 args[5].op);
37251 break;
37252 default:
37253 gcc_unreachable ();
37256 if (!pat)
37257 return 0;
37259 if (redundant_embed_rnd)
37260 pat = ix86_erase_embedded_rounding (pat);
37262 emit_insn (pat);
37263 return target;
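/* Added note (exposition only): when the rounding operand is the port's
   NO_ROUND value (which _MM_FROUND_CUR_DIRECTION is expected to map to),
   redundant_embed_rnd is set above and ix86_erase_embedded_rounding strips
   the UNSPEC_EMBEDDED_ROUNDING wrapper, so the ordinary non-rounding form
   of the pattern is emitted instead.  */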
37266 /* Subroutine of ix86_expand_builtin to take care of special insns
37267 with a variable number of operands. */
37269 static rtx
37270 ix86_expand_special_args_builtin (const struct builtin_description *d,
37271 tree exp, rtx target)
37273 tree arg;
37274 rtx pat, op;
37275 unsigned int i, nargs, arg_adjust, memory;
37276 bool aligned_mem = false;
37277 struct
37279 rtx op;
37280 machine_mode mode;
37281 } args[3];
37282 enum insn_code icode = d->icode;
37283 bool last_arg_constant = false;
37284 const struct insn_data_d *insn_p = &insn_data[icode];
37285 machine_mode tmode = insn_p->operand[0].mode;
37286 enum { load, store } klass;
37288 switch ((enum ix86_builtin_func_type) d->flag)
37290 case VOID_FTYPE_VOID:
37291 emit_insn (GEN_FCN (icode) (target));
37292 return 0;
37293 case VOID_FTYPE_UINT64:
37294 case VOID_FTYPE_UNSIGNED:
37295 nargs = 0;
37296 klass = store;
37297 memory = 0;
37298 break;
37300 case INT_FTYPE_VOID:
37301 case USHORT_FTYPE_VOID:
37302 case UINT64_FTYPE_VOID:
37303 case UNSIGNED_FTYPE_VOID:
37304 nargs = 0;
37305 klass = load;
37306 memory = 0;
37307 break;
37308 case UINT64_FTYPE_PUNSIGNED:
37309 case V2DI_FTYPE_PV2DI:
37310 case V4DI_FTYPE_PV4DI:
37311 case V32QI_FTYPE_PCCHAR:
37312 case V16QI_FTYPE_PCCHAR:
37313 case V8SF_FTYPE_PCV4SF:
37314 case V8SF_FTYPE_PCFLOAT:
37315 case V4SF_FTYPE_PCFLOAT:
37316 case V4DF_FTYPE_PCV2DF:
37317 case V4DF_FTYPE_PCDOUBLE:
37318 case V2DF_FTYPE_PCDOUBLE:
37319 case VOID_FTYPE_PVOID:
37320 case V8DI_FTYPE_PV8DI:
37321 nargs = 1;
37322 klass = load;
37323 memory = 0;
37324 switch (icode)
37326 case CODE_FOR_sse4_1_movntdqa:
37327 case CODE_FOR_avx2_movntdqa:
37328 case CODE_FOR_avx512f_movntdqa:
37329 aligned_mem = true;
37330 break;
37331 default:
37332 break;
37334 break;
37335 case VOID_FTYPE_PV2SF_V4SF:
37336 case VOID_FTYPE_PV8DI_V8DI:
37337 case VOID_FTYPE_PV4DI_V4DI:
37338 case VOID_FTYPE_PV2DI_V2DI:
37339 case VOID_FTYPE_PCHAR_V32QI:
37340 case VOID_FTYPE_PCHAR_V16QI:
37341 case VOID_FTYPE_PFLOAT_V16SF:
37342 case VOID_FTYPE_PFLOAT_V8SF:
37343 case VOID_FTYPE_PFLOAT_V4SF:
37344 case VOID_FTYPE_PDOUBLE_V8DF:
37345 case VOID_FTYPE_PDOUBLE_V4DF:
37346 case VOID_FTYPE_PDOUBLE_V2DF:
37347 case VOID_FTYPE_PLONGLONG_LONGLONG:
37348 case VOID_FTYPE_PULONGLONG_ULONGLONG:
37349 case VOID_FTYPE_PINT_INT:
37350 nargs = 1;
37351 klass = store;
37352 /* Reserve memory operand for target. */
37353 memory = ARRAY_SIZE (args);
37354 switch (icode)
37356 /* These builtins and instructions require the memory
37357 to be properly aligned. */
37358 case CODE_FOR_avx_movntv4di:
37359 case CODE_FOR_sse2_movntv2di:
37360 case CODE_FOR_avx_movntv8sf:
37361 case CODE_FOR_sse_movntv4sf:
37362 case CODE_FOR_sse4a_vmmovntv4sf:
37363 case CODE_FOR_avx_movntv4df:
37364 case CODE_FOR_sse2_movntv2df:
37365 case CODE_FOR_sse4a_vmmovntv2df:
37366 case CODE_FOR_sse2_movntidi:
37367 case CODE_FOR_sse_movntq:
37368 case CODE_FOR_sse2_movntisi:
37369 case CODE_FOR_avx512f_movntv16sf:
37370 case CODE_FOR_avx512f_movntv8df:
37371 case CODE_FOR_avx512f_movntv8di:
37372 aligned_mem = true;
37373 break;
37374 default:
37375 break;
37377 break;
37378 case V4SF_FTYPE_V4SF_PCV2SF:
37379 case V2DF_FTYPE_V2DF_PCDOUBLE:
37380 nargs = 2;
37381 klass = load;
37382 memory = 1;
37383 break;
37384 case V8SF_FTYPE_PCV8SF_V8SI:
37385 case V4DF_FTYPE_PCV4DF_V4DI:
37386 case V4SF_FTYPE_PCV4SF_V4SI:
37387 case V2DF_FTYPE_PCV2DF_V2DI:
37388 case V8SI_FTYPE_PCV8SI_V8SI:
37389 case V4DI_FTYPE_PCV4DI_V4DI:
37390 case V4SI_FTYPE_PCV4SI_V4SI:
37391 case V2DI_FTYPE_PCV2DI_V2DI:
37392 case VOID_FTYPE_INT_INT64:
37393 nargs = 2;
37394 klass = load;
37395 memory = 0;
37396 break;
37397 case VOID_FTYPE_PV8DF_V8DF_UQI:
37398 case VOID_FTYPE_PV4DF_V4DF_UQI:
37399 case VOID_FTYPE_PV2DF_V2DF_UQI:
37400 case VOID_FTYPE_PV16SF_V16SF_UHI:
37401 case VOID_FTYPE_PV8SF_V8SF_UQI:
37402 case VOID_FTYPE_PV4SF_V4SF_UQI:
37403 case VOID_FTYPE_PV8DI_V8DI_UQI:
37404 case VOID_FTYPE_PV4DI_V4DI_UQI:
37405 case VOID_FTYPE_PV2DI_V2DI_UQI:
37406 case VOID_FTYPE_PV16SI_V16SI_UHI:
37407 case VOID_FTYPE_PV8SI_V8SI_UQI:
37408 case VOID_FTYPE_PV4SI_V4SI_UQI:
37409 switch (icode)
37411 /* These builtins and instructions require the memory
37412 to be properly aligned. */
37413 case CODE_FOR_avx512f_storev16sf_mask:
37414 case CODE_FOR_avx512f_storev16si_mask:
37415 case CODE_FOR_avx512f_storev8df_mask:
37416 case CODE_FOR_avx512f_storev8di_mask:
37417 case CODE_FOR_avx512vl_storev8sf_mask:
37418 case CODE_FOR_avx512vl_storev8si_mask:
37419 case CODE_FOR_avx512vl_storev4df_mask:
37420 case CODE_FOR_avx512vl_storev4di_mask:
37421 case CODE_FOR_avx512vl_storev4sf_mask:
37422 case CODE_FOR_avx512vl_storev4si_mask:
37423 case CODE_FOR_avx512vl_storev2df_mask:
37424 case CODE_FOR_avx512vl_storev2di_mask:
37425 aligned_mem = true;
37426 break;
37427 default:
37428 break;
37430 /* FALLTHRU */
37431 case VOID_FTYPE_PV8SF_V8SI_V8SF:
37432 case VOID_FTYPE_PV4DF_V4DI_V4DF:
37433 case VOID_FTYPE_PV4SF_V4SI_V4SF:
37434 case VOID_FTYPE_PV2DF_V2DI_V2DF:
37435 case VOID_FTYPE_PV8SI_V8SI_V8SI:
37436 case VOID_FTYPE_PV4DI_V4DI_V4DI:
37437 case VOID_FTYPE_PV4SI_V4SI_V4SI:
37438 case VOID_FTYPE_PV2DI_V2DI_V2DI:
37439 case VOID_FTYPE_PV8SI_V8DI_UQI:
37440 case VOID_FTYPE_PV8HI_V8DI_UQI:
37441 case VOID_FTYPE_PV16HI_V16SI_UHI:
37442 case VOID_FTYPE_PV16QI_V8DI_UQI:
37443 case VOID_FTYPE_PV16QI_V16SI_UHI:
37444 case VOID_FTYPE_PV4SI_V4DI_UQI:
37445 case VOID_FTYPE_PV4SI_V2DI_UQI:
37446 case VOID_FTYPE_PV8HI_V4DI_UQI:
37447 case VOID_FTYPE_PV8HI_V2DI_UQI:
37448 case VOID_FTYPE_PV8HI_V8SI_UQI:
37449 case VOID_FTYPE_PV8HI_V4SI_UQI:
37450 case VOID_FTYPE_PV16QI_V4DI_UQI:
37451 case VOID_FTYPE_PV16QI_V2DI_UQI:
37452 case VOID_FTYPE_PV16QI_V8SI_UQI:
37453 case VOID_FTYPE_PV16QI_V4SI_UQI:
37454 case VOID_FTYPE_PCHAR_V64QI_UDI:
37455 case VOID_FTYPE_PCHAR_V32QI_USI:
37456 case VOID_FTYPE_PCHAR_V16QI_UHI:
37457 case VOID_FTYPE_PSHORT_V32HI_USI:
37458 case VOID_FTYPE_PSHORT_V16HI_UHI:
37459 case VOID_FTYPE_PSHORT_V8HI_UQI:
37460 case VOID_FTYPE_PINT_V16SI_UHI:
37461 case VOID_FTYPE_PINT_V8SI_UQI:
37462 case VOID_FTYPE_PINT_V4SI_UQI:
37463 case VOID_FTYPE_PINT64_V8DI_UQI:
37464 case VOID_FTYPE_PINT64_V4DI_UQI:
37465 case VOID_FTYPE_PINT64_V2DI_UQI:
37466 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37467 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37468 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37469 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37470 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37471 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37472 case VOID_FTYPE_PV32QI_V32HI_USI:
37473 case VOID_FTYPE_PV16QI_V16HI_UHI:
37474 case VOID_FTYPE_PV8QI_V8HI_UQI:
37475 nargs = 2;
37476 klass = store;
37477 /* Reserve memory operand for target. */
37478 memory = ARRAY_SIZE (args);
37479 break;
37480 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37481 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37482 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37483 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37484 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37485 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37486 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37487 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37488 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37489 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37490 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37491 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37492 switch (icode)
37494 /* These builtins and instructions require the memory
37495 to be properly aligned. */
37496 case CODE_FOR_avx512f_loadv16sf_mask:
37497 case CODE_FOR_avx512f_loadv16si_mask:
37498 case CODE_FOR_avx512f_loadv8df_mask:
37499 case CODE_FOR_avx512f_loadv8di_mask:
37500 case CODE_FOR_avx512vl_loadv8sf_mask:
37501 case CODE_FOR_avx512vl_loadv8si_mask:
37502 case CODE_FOR_avx512vl_loadv4df_mask:
37503 case CODE_FOR_avx512vl_loadv4di_mask:
37504 case CODE_FOR_avx512vl_loadv4sf_mask:
37505 case CODE_FOR_avx512vl_loadv4si_mask:
37506 case CODE_FOR_avx512vl_loadv2df_mask:
37507 case CODE_FOR_avx512vl_loadv2di_mask:
37508 case CODE_FOR_avx512bw_loadv64qi_mask:
37509 case CODE_FOR_avx512vl_loadv32qi_mask:
37510 case CODE_FOR_avx512vl_loadv16qi_mask:
37511 case CODE_FOR_avx512bw_loadv32hi_mask:
37512 case CODE_FOR_avx512vl_loadv16hi_mask:
37513 case CODE_FOR_avx512vl_loadv8hi_mask:
37514 aligned_mem = true;
37515 break;
37516 default:
37517 break;
37519 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37520 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37521 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37522 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37523 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37524 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37525 case V16SI_FTYPE_PCINT_V16SI_UHI:
37526 case V8SI_FTYPE_PCINT_V8SI_UQI:
37527 case V4SI_FTYPE_PCINT_V4SI_UQI:
37528 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37529 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37530 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37531 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37532 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37533 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37534 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37535 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37536 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37537 nargs = 3;
37538 klass = load;
37539 memory = 0;
37540 break;
37541 case VOID_FTYPE_UINT_UINT_UINT:
37542 case VOID_FTYPE_UINT64_UINT_UINT:
37543 case UCHAR_FTYPE_UINT_UINT_UINT:
37544 case UCHAR_FTYPE_UINT64_UINT_UINT:
37545 nargs = 3;
37546 klass = load;
37547 memory = ARRAY_SIZE (args);
37548 last_arg_constant = true;
37549 break;
37550 default:
37551 gcc_unreachable ();
37554 gcc_assert (nargs <= ARRAY_SIZE (args));
37556 if (klass == store)
37558 arg = CALL_EXPR_ARG (exp, 0);
37559 op = expand_normal (arg);
37560 gcc_assert (target == 0);
37561 if (memory)
37563 op = ix86_zero_extend_to_Pmode (op);
37564 target = gen_rtx_MEM (tmode, op);
37565 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37566 on it. Try to improve it using get_pointer_alignment,
37567 and if the special builtin is one that requires strict
37568 mode alignment, also from its GET_MODE_ALIGNMENT.
37569 Failure to do so could lead to ix86_legitimate_combined_insn
37570 rejecting all changes to such insns. */
37571 unsigned int align = get_pointer_alignment (arg);
37572 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37573 align = GET_MODE_ALIGNMENT (tmode);
37574 if (MEM_ALIGN (target) < align)
37575 set_mem_align (target, align);
37577 else
37578 target = force_reg (tmode, op);
37579 arg_adjust = 1;
37581 else
37583 arg_adjust = 0;
37584 if (optimize
37585 || target == 0
37586 || !register_operand (target, tmode)
37587 || GET_MODE (target) != tmode)
37588 target = gen_reg_rtx (tmode);
37591 for (i = 0; i < nargs; i++)
37593 machine_mode mode = insn_p->operand[i + 1].mode;
37594 bool match;
37596 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37597 op = expand_normal (arg);
37598 match = insn_p->operand[i + 1].predicate (op, mode);
37600 if (last_arg_constant && (i + 1) == nargs)
37602 if (!match)
37604 if (icode == CODE_FOR_lwp_lwpvalsi3
37605 || icode == CODE_FOR_lwp_lwpinssi3
37606 || icode == CODE_FOR_lwp_lwpvaldi3
37607 || icode == CODE_FOR_lwp_lwpinsdi3)
37608 error ("the last argument must be a 32-bit immediate");
37609 else
37610 error ("the last argument must be an 8-bit immediate");
37611 return const0_rtx;
37614 else
37616 if (i == memory)
37618 /* This must be the memory operand. */
37619 op = ix86_zero_extend_to_Pmode (op);
37620 op = gen_rtx_MEM (mode, op);
37621 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37622 on it. Try to improve it using get_pointer_alignment,
37623 and if the special builtin is one that requires strict
37624 mode alignment, also from its GET_MODE_ALIGNMENT.
37625 Failure to do so could lead to ix86_legitimate_combined_insn
37626 rejecting all changes to such insns. */
37627 unsigned int align = get_pointer_alignment (arg);
37628 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37629 align = GET_MODE_ALIGNMENT (mode);
37630 if (MEM_ALIGN (op) < align)
37631 set_mem_align (op, align);
37633 else
37635 /* This must be the register operand. */
37636 if (VECTOR_MODE_P (mode))
37637 op = safe_vector_operand (op, mode);
37639 op = fixup_modeless_constant (op, mode);
37641 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37642 op = copy_to_mode_reg (mode, op);
37643 else
37645 op = copy_to_reg (op);
37646 op = lowpart_subreg (mode, op, GET_MODE (op));
37651 args[i].op = op;
37652 args[i].mode = mode;
37655 switch (nargs)
37657 case 0:
37658 pat = GEN_FCN (icode) (target);
37659 break;
37660 case 1:
37661 pat = GEN_FCN (icode) (target, args[0].op);
37662 break;
37663 case 2:
37664 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37665 break;
37666 case 3:
37667 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37668 break;
37669 default:
37670 gcc_unreachable ();
37673 if (! pat)
37674 return 0;
37675 emit_insn (pat);
37676 return klass == store ? 0 : target;
37679 /* Return the integer constant in ARG. Constrain it to be in the range
37680 of the subparts of VEC_TYPE; issue an error if not. */
37682 static int
37683 get_element_number (tree vec_type, tree arg)
37685 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37687 if (!tree_fits_uhwi_p (arg)
37688 || (elt = tree_to_uhwi (arg), elt > max))
37690 error ("selector must be an integer constant in the range 0..%wi", max);
37691 return 0;
37694 return elt;
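/* Editorial note: a hedged sketch (not part of the original sources) of the
   check above.  TYPE_VECTOR_SUBPARTS gives the element count, so for a V8HI
   vector max is 7 and, assuming the usual builtin spelling,

     __v8hi v;
     ... __builtin_ia32_vec_ext_v8hi (v, 3) ...   accepted, elt == 3
     ... __builtin_ia32_vec_ext_v8hi (v, 8) ...   "selector must be an integer
                                                   constant in the range 0..7"

   get_element_number returns 0 after the error so its callers can still
   build well-formed RTL.  */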
37697 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37698 ix86_expand_vector_init. We DO have language-level syntax for this, in
37699 the form of (type){ init-list }. Except that since we can't place emms
37700 instructions from inside the compiler, we can't allow the use of MMX
37701 registers unless the user explicitly asks for it. So we do *not* define
37702 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
37703 we have builtins invoked by mmintrin.h that give us license to emit
37704 these sorts of instructions. */
37706 static rtx
37707 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37709 machine_mode tmode = TYPE_MODE (type);
37710 machine_mode inner_mode = GET_MODE_INNER (tmode);
37711 int i, n_elt = GET_MODE_NUNITS (tmode);
37712 rtvec v = rtvec_alloc (n_elt);
37714 gcc_assert (VECTOR_MODE_P (tmode));
37715 gcc_assert (call_expr_nargs (exp) == n_elt);
37717 for (i = 0; i < n_elt; ++i)
37719 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37720 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37723 if (!target || !register_operand (target, tmode))
37724 target = gen_reg_rtx (tmode);
37726 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37727 return target;
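/* Editorial note: a hedged usage sketch, not part of the original file.
   Assuming mmintrin.h routes _mm_set_pi32 through the V2SI init builtin,

     #include <mmintrin.h>
     __m64 v = _mm_set_pi32 (7, 42);

   arrives here with two call arguments; each element is expanded, collected
   into a PARALLEL, and handed to ix86_expand_vector_init, exactly as the
   comment above describes for the missing MMX vec_init patterns.  */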
37730 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37731 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37732 had a language-level syntax for referencing vector elements. */
37734 static rtx
37735 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37737 machine_mode tmode, mode0;
37738 tree arg0, arg1;
37739 int elt;
37740 rtx op0;
37742 arg0 = CALL_EXPR_ARG (exp, 0);
37743 arg1 = CALL_EXPR_ARG (exp, 1);
37745 op0 = expand_normal (arg0);
37746 elt = get_element_number (TREE_TYPE (arg0), arg1);
37748 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37749 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37750 gcc_assert (VECTOR_MODE_P (mode0));
37752 op0 = force_reg (mode0, op0);
37754 if (optimize || !target || !register_operand (target, tmode))
37755 target = gen_reg_rtx (tmode);
37757 ix86_expand_vector_extract (true, target, op0, elt);
37759 return target;
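/* Editorial note (hedged, not from the original sources): a typical caller,
   assuming the conventional intrinsic mapping, is

     #include <xmmintrin.h>
     int x = _mm_extract_pi16 (m, 2);   built on the V4HI vec_ext builtin

   where the element index goes through get_element_number and the extraction
   itself is delegated to ix86_expand_vector_extract with elt == 2.  */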
37762 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37763 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37764 a language-level syntax for referencing vector elements. */
37766 static rtx
37767 ix86_expand_vec_set_builtin (tree exp)
37769 machine_mode tmode, mode1;
37770 tree arg0, arg1, arg2;
37771 int elt;
37772 rtx op0, op1, target;
37774 arg0 = CALL_EXPR_ARG (exp, 0);
37775 arg1 = CALL_EXPR_ARG (exp, 1);
37776 arg2 = CALL_EXPR_ARG (exp, 2);
37778 tmode = TYPE_MODE (TREE_TYPE (arg0));
37779 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37780 gcc_assert (VECTOR_MODE_P (tmode));
37782 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37783 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37784 elt = get_element_number (TREE_TYPE (arg0), arg2);
37786 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37787 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37789 op0 = force_reg (tmode, op0);
37790 op1 = force_reg (mode1, op1);
37792 /* OP0 is the source of these builtin functions and shouldn't be
37793 modified. Create a copy, use it and return it as the target. */
37794 target = gen_reg_rtx (tmode);
37795 emit_move_insn (target, op0);
37796 ix86_expand_vector_set (true, target, op1, elt);
37798 return target;
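/* Editorial note (hedged, not from the original sources): for example,
   assuming the conventional intrinsic mapping,

     #include <xmmintrin.h>
     __m64 r = _mm_insert_pi16 (m, 99, 1);   built on the V4HI vec_set builtin

   is expanded by copying M into a fresh register, overwriting element 1 with
   99 via ix86_expand_vector_set, and returning the copy, so the original
   operand is left untouched as the comment above requires.  */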
37801 /* Emit conditional move of SRC to DST with condition
37802 OP1 CODE OP2. */
37803 static void
37804 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37806 rtx t;
37808 if (TARGET_CMOVE)
37810 t = ix86_expand_compare (code, op1, op2);
37811 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37812 src, dst)));
37814 else
37816 rtx_code_label *nomove = gen_label_rtx ();
37817 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37818 const0_rtx, GET_MODE (op1), 1, nomove);
37819 emit_move_insn (dst, src);
37820 emit_label (nomove);
37824 /* Choose max of DST and SRC and put it to DST. */
37825 static void
37826 ix86_emit_move_max (rtx dst, rtx src)
37828 ix86_emit_cmove (dst, src, LTU, dst, src);
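/* Editorial note (hedged, not from the original sources): in C terms
   ix86_emit_cmove performs

     if (op1 CODE op2)
       dst = src;

   using CMOVcc when available and a compare plus a branch around the move
   otherwise, and ix86_emit_move_max instantiates it with LTU to get the
   unsigned maximum

     dst = (dst < src) ? src : dst;      unsigned comparison

   which the MPX helpers below apply to both bound halves; because the upper
   bound is kept in one's complement, the maximum of the stored halves is the
   minimum of the real upper bounds.  */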
37831 /* Expand an expression EXP that calls a built-in function,
37832 with result going to TARGET if that's convenient
37833 (and in mode MODE if that's convenient).
37834 SUBTARGET may be used as the target for computing one of EXP's operands.
37835 IGNORE is nonzero if the value is to be ignored. */
37837 static rtx
37838 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37839 machine_mode mode, int ignore)
37841 size_t i;
37842 enum insn_code icode;
37843 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37844 tree arg0, arg1, arg2, arg3, arg4;
37845 rtx op0, op1, op2, op3, op4, pat, insn;
37846 machine_mode mode0, mode1, mode2, mode3, mode4;
37847 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37849 /* For CPU builtins that can be folded, fold first and expand the fold. */
37850 switch (fcode)
37852 case IX86_BUILTIN_CPU_INIT:
37854 /* Make it call __cpu_indicator_init in libgcc. */
37855 tree call_expr, fndecl, type;
37856 type = build_function_type_list (integer_type_node, NULL_TREE);
37857 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37858 call_expr = build_call_expr (fndecl, 0);
37859 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37861 case IX86_BUILTIN_CPU_IS:
37862 case IX86_BUILTIN_CPU_SUPPORTS:
37864 tree arg0 = CALL_EXPR_ARG (exp, 0);
37865 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37866 gcc_assert (fold_expr != NULL_TREE);
37867 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37871 /* Determine whether the builtin function is available under the current ISA.
37872 Originally the builtin was not created if it wasn't applicable to the
37873 current ISA based on the command line switches. With function specific
37874 options, we need to check in the context of the function making the call
37875 whether it is supported. Treat AVX512VL specially. For other flags,
37876 if isa includes more than one ISA bit, treat those as requiring any
37877 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
37878 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
37879 at all, -m64 is a whole TU option. */
37880 if (((ix86_builtins_isa[fcode].isa
37881 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37882 && !(ix86_builtins_isa[fcode].isa
37883 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37884 & ix86_isa_flags))
37885 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37886 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37887 || (ix86_builtins_isa[fcode].isa2
37888 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37890 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37891 ix86_builtins_isa[fcode].isa2, 0, 0,
37892 NULL, NULL, (enum fpmath_unit) 0,
37893 false);
37894 if (!opts)
37895 error ("%qE needs unknown isa option", fndecl);
37896 else
37898 gcc_assert (opts != NULL);
37899 error ("%qE needs isa option %s", fndecl, opts);
37900 free (opts);
37902 return expand_call (exp, target, ignore);
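/* Editorial note: a hedged illustration, not part of the original file, of
   the diagnostic path above.  Calling an AVX-512 builtin from code compiled
   without the ISA and without a target attribute ends up here with a message
   along the lines of

     error: '__builtin_ia32_...' needs isa option -mavx512f

   while adding -mavx512f (or __attribute__((target ("avx512f"))) on the
   caller) satisfies the ix86_isa_flags check and expansion proceeds.  */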
37905 switch (fcode)
37907 case IX86_BUILTIN_BNDMK:
37908 if (!target
37909 || GET_MODE (target) != BNDmode
37910 || !register_operand (target, BNDmode))
37911 target = gen_reg_rtx (BNDmode);
37913 arg0 = CALL_EXPR_ARG (exp, 0);
37914 arg1 = CALL_EXPR_ARG (exp, 1);
37916 op0 = expand_normal (arg0);
37917 op1 = expand_normal (arg1);
37919 if (!register_operand (op0, Pmode))
37920 op0 = ix86_zero_extend_to_Pmode (op0);
37921 if (!register_operand (op1, Pmode))
37922 op1 = ix86_zero_extend_to_Pmode (op1);
37924 /* Builtin arg1 is the size of the block, but the instruction's op1
37925 should be (size - 1). */
37926 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37927 NULL_RTX, 1, OPTAB_DIRECT);
37929 emit_insn (BNDmode == BND64mode
37930 ? gen_bnd64_mk (target, op0, op1)
37931 : gen_bnd32_mk (target, op0, op1));
37932 return target;
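/* Editorial note (hedged arithmetic sketch, not from the original sources):
   with a block of 16 bytes starting at P, the builtin receives size == 16,
   the PLUS with constm1_rtx turns op1 into 15, and BNDMK records the
   inclusive range [P, P + 15].  */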
37934 case IX86_BUILTIN_BNDSTX:
37935 arg0 = CALL_EXPR_ARG (exp, 0);
37936 arg1 = CALL_EXPR_ARG (exp, 1);
37937 arg2 = CALL_EXPR_ARG (exp, 2);
37939 op0 = expand_normal (arg0);
37940 op1 = expand_normal (arg1);
37941 op2 = expand_normal (arg2);
37943 if (!register_operand (op0, Pmode))
37944 op0 = ix86_zero_extend_to_Pmode (op0);
37945 if (!register_operand (op1, BNDmode))
37946 op1 = copy_to_mode_reg (BNDmode, op1);
37947 if (!register_operand (op2, Pmode))
37948 op2 = ix86_zero_extend_to_Pmode (op2);
37950 emit_insn (BNDmode == BND64mode
37951 ? gen_bnd64_stx (op2, op0, op1)
37952 : gen_bnd32_stx (op2, op0, op1));
37953 return 0;
37955 case IX86_BUILTIN_BNDLDX:
37956 if (!target
37957 || GET_MODE (target) != BNDmode
37958 || !register_operand (target, BNDmode))
37959 target = gen_reg_rtx (BNDmode);
37961 arg0 = CALL_EXPR_ARG (exp, 0);
37962 arg1 = CALL_EXPR_ARG (exp, 1);
37964 op0 = expand_normal (arg0);
37965 op1 = expand_normal (arg1);
37967 if (!register_operand (op0, Pmode))
37968 op0 = ix86_zero_extend_to_Pmode (op0);
37969 if (!register_operand (op1, Pmode))
37970 op1 = ix86_zero_extend_to_Pmode (op1);
37972 emit_insn (BNDmode == BND64mode
37973 ? gen_bnd64_ldx (target, op0, op1)
37974 : gen_bnd32_ldx (target, op0, op1));
37975 return target;
37977 case IX86_BUILTIN_BNDCL:
37978 arg0 = CALL_EXPR_ARG (exp, 0);
37979 arg1 = CALL_EXPR_ARG (exp, 1);
37981 op0 = expand_normal (arg0);
37982 op1 = expand_normal (arg1);
37984 if (!register_operand (op0, Pmode))
37985 op0 = ix86_zero_extend_to_Pmode (op0);
37986 if (!register_operand (op1, BNDmode))
37987 op1 = copy_to_mode_reg (BNDmode, op1);
37989 emit_insn (BNDmode == BND64mode
37990 ? gen_bnd64_cl (op1, op0)
37991 : gen_bnd32_cl (op1, op0));
37992 return 0;
37994 case IX86_BUILTIN_BNDCU:
37995 arg0 = CALL_EXPR_ARG (exp, 0);
37996 arg1 = CALL_EXPR_ARG (exp, 1);
37998 op0 = expand_normal (arg0);
37999 op1 = expand_normal (arg1);
38001 if (!register_operand (op0, Pmode))
38002 op0 = ix86_zero_extend_to_Pmode (op0);
38003 if (!register_operand (op1, BNDmode))
38004 op1 = copy_to_mode_reg (BNDmode, op1);
38006 emit_insn (BNDmode == BND64mode
38007 ? gen_bnd64_cu (op1, op0)
38008 : gen_bnd32_cu (op1, op0));
38009 return 0;
38011 case IX86_BUILTIN_BNDRET:
38012 arg0 = CALL_EXPR_ARG (exp, 0);
38013 target = chkp_get_rtl_bounds (arg0);
38015 /* If no bounds were specified for the returned value,
38016 then use INIT bounds. This usually happens when
38017 some built-in function is expanded. */
38018 if (!target)
38020 rtx t1 = gen_reg_rtx (Pmode);
38021 rtx t2 = gen_reg_rtx (Pmode);
38022 target = gen_reg_rtx (BNDmode);
38023 emit_move_insn (t1, const0_rtx);
38024 emit_move_insn (t2, constm1_rtx);
38025 emit_insn (BNDmode == BND64mode
38026 ? gen_bnd64_mk (target, t1, t2)
38027 : gen_bnd32_mk (target, t1, t2));
38030 gcc_assert (target && REG_P (target));
38031 return target;
38033 case IX86_BUILTIN_BNDNARROW:
38035 rtx m1, m1h1, m1h2, lb, ub, t1;
38037 /* Return value and lb. */
38038 arg0 = CALL_EXPR_ARG (exp, 0);
38039 /* Bounds. */
38040 arg1 = CALL_EXPR_ARG (exp, 1);
38041 /* Size. */
38042 arg2 = CALL_EXPR_ARG (exp, 2);
38044 lb = expand_normal (arg0);
38045 op1 = expand_normal (arg1);
38046 op2 = expand_normal (arg2);
38048 /* The size was passed, but we need to use (size - 1), as for bndmk. */
38049 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
38050 NULL_RTX, 1, OPTAB_DIRECT);
38052 /* Add LB to size and invert to get UB. */
38053 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
38054 op2, 1, OPTAB_DIRECT);
38055 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
38057 if (!register_operand (lb, Pmode))
38058 lb = ix86_zero_extend_to_Pmode (lb);
38059 if (!register_operand (ub, Pmode))
38060 ub = ix86_zero_extend_to_Pmode (ub);
38062 /* We need to move bounds to memory before any computations. */
38063 if (MEM_P (op1))
38064 m1 = op1;
38065 else
38067 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
38068 emit_move_insn (m1, op1);
38071 /* Generate mem expression to be used for access to LB and UB. */
38072 m1h1 = adjust_address (m1, Pmode, 0);
38073 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
38075 t1 = gen_reg_rtx (Pmode);
38077 /* Compute LB. */
38078 emit_move_insn (t1, m1h1);
38079 ix86_emit_move_max (t1, lb);
38080 emit_move_insn (m1h1, t1);
38082 /* Compute UB. UB is stored in 1's complement form. Therefore
38083 we also use max here. */
38084 emit_move_insn (t1, m1h2);
38085 ix86_emit_move_max (t1, ub);
38086 emit_move_insn (m1h2, t1);
38088 op2 = gen_reg_rtx (BNDmode);
38089 emit_move_insn (op2, m1);
38091 return chkp_join_splitted_slot (lb, op2);
38094 case IX86_BUILTIN_BNDINT:
38096 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
38098 if (!target
38099 || GET_MODE (target) != BNDmode
38100 || !register_operand (target, BNDmode))
38101 target = gen_reg_rtx (BNDmode);
38103 arg0 = CALL_EXPR_ARG (exp, 0);
38104 arg1 = CALL_EXPR_ARG (exp, 1);
38106 op0 = expand_normal (arg0);
38107 op1 = expand_normal (arg1);
38109 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
38110 rh1 = adjust_address (res, Pmode, 0);
38111 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
38113 /* Put the first bounds into temporaries. */
38114 lb1 = gen_reg_rtx (Pmode);
38115 ub1 = gen_reg_rtx (Pmode);
38116 if (MEM_P (op0))
38118 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
38119 emit_move_insn (ub1, adjust_address (op0, Pmode,
38120 GET_MODE_SIZE (Pmode)));
38122 else
38124 emit_move_insn (res, op0);
38125 emit_move_insn (lb1, rh1);
38126 emit_move_insn (ub1, rh2);
38129 /* Put the second bounds into temporaries. */
38130 lb2 = gen_reg_rtx (Pmode);
38131 ub2 = gen_reg_rtx (Pmode);
38132 if (MEM_P (op1))
38134 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
38135 emit_move_insn (ub2, adjust_address (op1, Pmode,
38136 GET_MODE_SIZE (Pmode)));
38138 else
38140 emit_move_insn (res, op1);
38141 emit_move_insn (lb2, rh1);
38142 emit_move_insn (ub2, rh2);
38145 /* Compute LB. */
38146 ix86_emit_move_max (lb1, lb2);
38147 emit_move_insn (rh1, lb1);
38149 /* Compute UB. UB is stored in 1's complement form. Therefore
38150 we also use max here. */
38151 ix86_emit_move_max (ub1, ub2);
38152 emit_move_insn (rh2, ub1);
38154 emit_move_insn (target, res);
38156 return target;
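/* Editorial note: a hedged worked example, not part of the original file,
   of the intersection above.  With bounds stored as { LB, ~UB }:

     b1 = [0x1000, 0x1fff]   stored { 0x1000, ~0x1fff }
     b2 = [0x1800, 0x2fff]   stored { 0x1800, ~0x2fff }

     max (0x1000, 0x1800)   == 0x1800      new LB
     max (~0x1fff, ~0x2fff) == ~0x1fff     new stored UB

   so the result describes [0x1800, 0x1fff], the intersection of b1 and b2,
   using nothing but the two unsigned maxima computed above.  */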
38159 case IX86_BUILTIN_SIZEOF:
38161 tree name;
38162 rtx symbol;
38164 if (!target
38165 || GET_MODE (target) != Pmode
38166 || !register_operand (target, Pmode))
38167 target = gen_reg_rtx (Pmode);
38169 arg0 = CALL_EXPR_ARG (exp, 0);
38170 gcc_assert (VAR_P (arg0));
38172 name = DECL_ASSEMBLER_NAME (arg0);
38173 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
38175 emit_insn (Pmode == SImode
38176 ? gen_move_size_reloc_si (target, symbol)
38177 : gen_move_size_reloc_di (target, symbol));
38179 return target;
38182 case IX86_BUILTIN_BNDLOWER:
38184 rtx mem, hmem;
38186 if (!target
38187 || GET_MODE (target) != Pmode
38188 || !register_operand (target, Pmode))
38189 target = gen_reg_rtx (Pmode);
38191 arg0 = CALL_EXPR_ARG (exp, 0);
38192 op0 = expand_normal (arg0);
38194 /* We need to move bounds to memory first. */
38195 if (MEM_P (op0))
38196 mem = op0;
38197 else
38199 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
38200 emit_move_insn (mem, op0);
38203 /* Generate mem expression to access LB and load it. */
38204 hmem = adjust_address (mem, Pmode, 0);
38205 emit_move_insn (target, hmem);
38207 return target;
38210 case IX86_BUILTIN_BNDUPPER:
38212 rtx mem, hmem, res;
38214 if (!target
38215 || GET_MODE (target) != Pmode
38216 || !register_operand (target, Pmode))
38217 target = gen_reg_rtx (Pmode);
38219 arg0 = CALL_EXPR_ARG (exp, 0);
38220 op0 = expand_normal (arg0);
38222 /* We need to move bounds to memory first. */
38223 if (MEM_P (op0))
38224 mem = op0;
38225 else
38227 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
38228 emit_move_insn (mem, op0);
38231 /* Generate mem expression to access UB. */
38232 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
38234 /* We need to invert all bits of UB. */
38235 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
38237 if (res != target)
38238 emit_move_insn (target, res);
38240 return target;
38243 case IX86_BUILTIN_MASKMOVQ:
38244 case IX86_BUILTIN_MASKMOVDQU:
38245 icode = (fcode == IX86_BUILTIN_MASKMOVQ
38246 ? CODE_FOR_mmx_maskmovq
38247 : CODE_FOR_sse2_maskmovdqu);
38248 /* Note the arg order is different from the operand order. */
38249 arg1 = CALL_EXPR_ARG (exp, 0);
38250 arg2 = CALL_EXPR_ARG (exp, 1);
38251 arg0 = CALL_EXPR_ARG (exp, 2);
38252 op0 = expand_normal (arg0);
38253 op1 = expand_normal (arg1);
38254 op2 = expand_normal (arg2);
38255 mode0 = insn_data[icode].operand[0].mode;
38256 mode1 = insn_data[icode].operand[1].mode;
38257 mode2 = insn_data[icode].operand[2].mode;
38259 op0 = ix86_zero_extend_to_Pmode (op0);
38260 op0 = gen_rtx_MEM (mode1, op0);
38262 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38263 op0 = copy_to_mode_reg (mode0, op0);
38264 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38265 op1 = copy_to_mode_reg (mode1, op1);
38266 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38267 op2 = copy_to_mode_reg (mode2, op2);
38268 pat = GEN_FCN (icode) (op0, op1, op2);
38269 if (! pat)
38270 return 0;
38271 emit_insn (pat);
38272 return 0;
38274 case IX86_BUILTIN_LDMXCSR:
38275 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
38276 target = assign_386_stack_local (SImode, SLOT_TEMP);
38277 emit_move_insn (target, op0);
38278 emit_insn (gen_sse_ldmxcsr (target));
38279 return 0;
38281 case IX86_BUILTIN_STMXCSR:
38282 target = assign_386_stack_local (SImode, SLOT_TEMP);
38283 emit_insn (gen_sse_stmxcsr (target));
38284 return copy_to_mode_reg (SImode, target);
38286 case IX86_BUILTIN_CLFLUSH:
38287 arg0 = CALL_EXPR_ARG (exp, 0);
38288 op0 = expand_normal (arg0);
38289 icode = CODE_FOR_sse2_clflush;
38290 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38291 op0 = ix86_zero_extend_to_Pmode (op0);
38293 emit_insn (gen_sse2_clflush (op0));
38294 return 0;
38296 case IX86_BUILTIN_CLWB:
38297 arg0 = CALL_EXPR_ARG (exp, 0);
38298 op0 = expand_normal (arg0);
38299 icode = CODE_FOR_clwb;
38300 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38301 op0 = ix86_zero_extend_to_Pmode (op0);
38303 emit_insn (gen_clwb (op0));
38304 return 0;
38306 case IX86_BUILTIN_CLFLUSHOPT:
38307 arg0 = CALL_EXPR_ARG (exp, 0);
38308 op0 = expand_normal (arg0);
38309 icode = CODE_FOR_clflushopt;
38310 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38311 op0 = ix86_zero_extend_to_Pmode (op0);
38313 emit_insn (gen_clflushopt (op0));
38314 return 0;
38316 case IX86_BUILTIN_MONITOR:
38317 case IX86_BUILTIN_MONITORX:
38318 arg0 = CALL_EXPR_ARG (exp, 0);
38319 arg1 = CALL_EXPR_ARG (exp, 1);
38320 arg2 = CALL_EXPR_ARG (exp, 2);
38321 op0 = expand_normal (arg0);
38322 op1 = expand_normal (arg1);
38323 op2 = expand_normal (arg2);
38324 if (!REG_P (op0))
38325 op0 = ix86_zero_extend_to_Pmode (op0);
38326 if (!REG_P (op1))
38327 op1 = copy_to_mode_reg (SImode, op1);
38328 if (!REG_P (op2))
38329 op2 = copy_to_mode_reg (SImode, op2);
38331 emit_insn (fcode == IX86_BUILTIN_MONITOR
38332 ? ix86_gen_monitor (op0, op1, op2)
38333 : ix86_gen_monitorx (op0, op1, op2));
38334 return 0;
38336 case IX86_BUILTIN_MWAIT:
38337 arg0 = CALL_EXPR_ARG (exp, 0);
38338 arg1 = CALL_EXPR_ARG (exp, 1);
38339 op0 = expand_normal (arg0);
38340 op1 = expand_normal (arg1);
38341 if (!REG_P (op0))
38342 op0 = copy_to_mode_reg (SImode, op0);
38343 if (!REG_P (op1))
38344 op1 = copy_to_mode_reg (SImode, op1);
38345 emit_insn (gen_sse3_mwait (op0, op1));
38346 return 0;
38348 case IX86_BUILTIN_MWAITX:
38349 arg0 = CALL_EXPR_ARG (exp, 0);
38350 arg1 = CALL_EXPR_ARG (exp, 1);
38351 arg2 = CALL_EXPR_ARG (exp, 2);
38352 op0 = expand_normal (arg0);
38353 op1 = expand_normal (arg1);
38354 op2 = expand_normal (arg2);
38355 if (!REG_P (op0))
38356 op0 = copy_to_mode_reg (SImode, op0);
38357 if (!REG_P (op1))
38358 op1 = copy_to_mode_reg (SImode, op1);
38359 if (!REG_P (op2))
38360 op2 = copy_to_mode_reg (SImode, op2);
38361 emit_insn (gen_mwaitx (op0, op1, op2));
38362 return 0;
38364 case IX86_BUILTIN_CLZERO:
38365 arg0 = CALL_EXPR_ARG (exp, 0);
38366 op0 = expand_normal (arg0);
38367 if (!REG_P (op0))
38368 op0 = ix86_zero_extend_to_Pmode (op0);
38369 emit_insn (ix86_gen_clzero (op0));
38370 return 0;
38372 case IX86_BUILTIN_VEC_INIT_V2SI:
38373 case IX86_BUILTIN_VEC_INIT_V4HI:
38374 case IX86_BUILTIN_VEC_INIT_V8QI:
38375 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
38377 case IX86_BUILTIN_VEC_EXT_V2DF:
38378 case IX86_BUILTIN_VEC_EXT_V2DI:
38379 case IX86_BUILTIN_VEC_EXT_V4SF:
38380 case IX86_BUILTIN_VEC_EXT_V4SI:
38381 case IX86_BUILTIN_VEC_EXT_V8HI:
38382 case IX86_BUILTIN_VEC_EXT_V2SI:
38383 case IX86_BUILTIN_VEC_EXT_V4HI:
38384 case IX86_BUILTIN_VEC_EXT_V16QI:
38385 return ix86_expand_vec_ext_builtin (exp, target);
38387 case IX86_BUILTIN_VEC_SET_V2DI:
38388 case IX86_BUILTIN_VEC_SET_V4SF:
38389 case IX86_BUILTIN_VEC_SET_V4SI:
38390 case IX86_BUILTIN_VEC_SET_V8HI:
38391 case IX86_BUILTIN_VEC_SET_V4HI:
38392 case IX86_BUILTIN_VEC_SET_V16QI:
38393 return ix86_expand_vec_set_builtin (exp);
38395 case IX86_BUILTIN_NANQ:
38396 case IX86_BUILTIN_NANSQ:
38397 return expand_call (exp, target, ignore);
38399 case IX86_BUILTIN_RDPMC:
38400 case IX86_BUILTIN_RDTSC:
38401 case IX86_BUILTIN_RDTSCP:
38402 case IX86_BUILTIN_XGETBV:
38404 op0 = gen_reg_rtx (DImode);
38405 op1 = gen_reg_rtx (DImode);
38407 if (fcode == IX86_BUILTIN_RDPMC)
38409 arg0 = CALL_EXPR_ARG (exp, 0);
38410 op2 = expand_normal (arg0);
38411 if (!register_operand (op2, SImode))
38412 op2 = copy_to_mode_reg (SImode, op2);
38414 insn = (TARGET_64BIT
38415 ? gen_rdpmc_rex64 (op0, op1, op2)
38416 : gen_rdpmc (op0, op2));
38417 emit_insn (insn);
38419 else if (fcode == IX86_BUILTIN_XGETBV)
38421 arg0 = CALL_EXPR_ARG (exp, 0);
38422 op2 = expand_normal (arg0);
38423 if (!register_operand (op2, SImode))
38424 op2 = copy_to_mode_reg (SImode, op2);
38426 insn = (TARGET_64BIT
38427 ? gen_xgetbv_rex64 (op0, op1, op2)
38428 : gen_xgetbv (op0, op2));
38429 emit_insn (insn);
38431 else if (fcode == IX86_BUILTIN_RDTSC)
38433 insn = (TARGET_64BIT
38434 ? gen_rdtsc_rex64 (op0, op1)
38435 : gen_rdtsc (op0));
38436 emit_insn (insn);
38438 else
38440 op2 = gen_reg_rtx (SImode);
38442 insn = (TARGET_64BIT
38443 ? gen_rdtscp_rex64 (op0, op1, op2)
38444 : gen_rdtscp (op0, op2));
38445 emit_insn (insn);
38447 arg0 = CALL_EXPR_ARG (exp, 0);
38448 op4 = expand_normal (arg0);
38449 if (!address_operand (op4, VOIDmode))
38451 op4 = convert_memory_address (Pmode, op4);
38452 op4 = copy_addr_to_reg (op4);
38454 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38457 if (target == 0)
38459 /* mode is VOIDmode if __builtin_rd* has been called
38460 without an lhs. */
38461 if (mode == VOIDmode)
38462 return target;
38463 target = gen_reg_rtx (mode);
38466 if (TARGET_64BIT)
38468 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38469 op1, 1, OPTAB_DIRECT);
38470 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38471 op0, 1, OPTAB_DIRECT);
38474 emit_move_insn (target, op0);
38475 return target;
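/* Editorial note (hedged, not from the original sources): on TARGET_64BIT
   the two DImode halves produced by the rex64 patterns are combined as

     result = (hi << 32) | lo;

   matching the ASHIFT/IOR pair above, so e.g. the rdtsc builtin hands back
   EDX:EAX as one 64-bit value; the 32-bit patterns already deliver the full
   result in op0.  */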
38477 case IX86_BUILTIN_FXSAVE:
38478 case IX86_BUILTIN_FXRSTOR:
38479 case IX86_BUILTIN_FXSAVE64:
38480 case IX86_BUILTIN_FXRSTOR64:
38481 case IX86_BUILTIN_FNSTENV:
38482 case IX86_BUILTIN_FLDENV:
38483 mode0 = BLKmode;
38484 switch (fcode)
38486 case IX86_BUILTIN_FXSAVE:
38487 icode = CODE_FOR_fxsave;
38488 break;
38489 case IX86_BUILTIN_FXRSTOR:
38490 icode = CODE_FOR_fxrstor;
38491 break;
38492 case IX86_BUILTIN_FXSAVE64:
38493 icode = CODE_FOR_fxsave64;
38494 break;
38495 case IX86_BUILTIN_FXRSTOR64:
38496 icode = CODE_FOR_fxrstor64;
38497 break;
38498 case IX86_BUILTIN_FNSTENV:
38499 icode = CODE_FOR_fnstenv;
38500 break;
38501 case IX86_BUILTIN_FLDENV:
38502 icode = CODE_FOR_fldenv;
38503 break;
38504 default:
38505 gcc_unreachable ();
38508 arg0 = CALL_EXPR_ARG (exp, 0);
38509 op0 = expand_normal (arg0);
38511 if (!address_operand (op0, VOIDmode))
38513 op0 = convert_memory_address (Pmode, op0);
38514 op0 = copy_addr_to_reg (op0);
38516 op0 = gen_rtx_MEM (mode0, op0);
38518 pat = GEN_FCN (icode) (op0);
38519 if (pat)
38520 emit_insn (pat);
38521 return 0;
38523 case IX86_BUILTIN_XSETBV:
38524 arg0 = CALL_EXPR_ARG (exp, 0);
38525 arg1 = CALL_EXPR_ARG (exp, 1);
38526 op0 = expand_normal (arg0);
38527 op1 = expand_normal (arg1);
38529 if (!REG_P (op0))
38530 op0 = copy_to_mode_reg (SImode, op0);
38532 if (TARGET_64BIT)
38534 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38535 NULL, 1, OPTAB_DIRECT);
38537 op2 = gen_lowpart (SImode, op2);
38538 op1 = gen_lowpart (SImode, op1);
38539 if (!REG_P (op1))
38540 op1 = copy_to_mode_reg (SImode, op1);
38541 if (!REG_P (op2))
38542 op2 = copy_to_mode_reg (SImode, op2);
38543 icode = CODE_FOR_xsetbv_rex64;
38544 pat = GEN_FCN (icode) (op0, op1, op2);
38546 else
38548 if (!REG_P (op1))
38549 op1 = copy_to_mode_reg (DImode, op1);
38550 icode = CODE_FOR_xsetbv;
38551 pat = GEN_FCN (icode) (op0, op1);
38553 if (pat)
38554 emit_insn (pat);
38555 return 0;
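/* Editorial note (hedged sketch, not part of the original file): for the
   64-bit path the XCR value is split for the instruction's EDX:EAX form,

     op2 = value >> 32;              high half
     op1 = (unsigned int) value;     low half

   mirroring the LSHIFTRT/gen_lowpart pair above, while the 32-bit pattern
   consumes the DImode value as a single operand.  */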
38557 case IX86_BUILTIN_XSAVE:
38558 case IX86_BUILTIN_XRSTOR:
38559 case IX86_BUILTIN_XSAVE64:
38560 case IX86_BUILTIN_XRSTOR64:
38561 case IX86_BUILTIN_XSAVEOPT:
38562 case IX86_BUILTIN_XSAVEOPT64:
38563 case IX86_BUILTIN_XSAVES:
38564 case IX86_BUILTIN_XRSTORS:
38565 case IX86_BUILTIN_XSAVES64:
38566 case IX86_BUILTIN_XRSTORS64:
38567 case IX86_BUILTIN_XSAVEC:
38568 case IX86_BUILTIN_XSAVEC64:
38569 arg0 = CALL_EXPR_ARG (exp, 0);
38570 arg1 = CALL_EXPR_ARG (exp, 1);
38571 op0 = expand_normal (arg0);
38572 op1 = expand_normal (arg1);
38574 if (!address_operand (op0, VOIDmode))
38576 op0 = convert_memory_address (Pmode, op0);
38577 op0 = copy_addr_to_reg (op0);
38579 op0 = gen_rtx_MEM (BLKmode, op0);
38581 op1 = force_reg (DImode, op1);
38583 if (TARGET_64BIT)
38585 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38586 NULL, 1, OPTAB_DIRECT);
38587 switch (fcode)
38589 case IX86_BUILTIN_XSAVE:
38590 icode = CODE_FOR_xsave_rex64;
38591 break;
38592 case IX86_BUILTIN_XRSTOR:
38593 icode = CODE_FOR_xrstor_rex64;
38594 break;
38595 case IX86_BUILTIN_XSAVE64:
38596 icode = CODE_FOR_xsave64;
38597 break;
38598 case IX86_BUILTIN_XRSTOR64:
38599 icode = CODE_FOR_xrstor64;
38600 break;
38601 case IX86_BUILTIN_XSAVEOPT:
38602 icode = CODE_FOR_xsaveopt_rex64;
38603 break;
38604 case IX86_BUILTIN_XSAVEOPT64:
38605 icode = CODE_FOR_xsaveopt64;
38606 break;
38607 case IX86_BUILTIN_XSAVES:
38608 icode = CODE_FOR_xsaves_rex64;
38609 break;
38610 case IX86_BUILTIN_XRSTORS:
38611 icode = CODE_FOR_xrstors_rex64;
38612 break;
38613 case IX86_BUILTIN_XSAVES64:
38614 icode = CODE_FOR_xsaves64;
38615 break;
38616 case IX86_BUILTIN_XRSTORS64:
38617 icode = CODE_FOR_xrstors64;
38618 break;
38619 case IX86_BUILTIN_XSAVEC:
38620 icode = CODE_FOR_xsavec_rex64;
38621 break;
38622 case IX86_BUILTIN_XSAVEC64:
38623 icode = CODE_FOR_xsavec64;
38624 break;
38625 default:
38626 gcc_unreachable ();
38629 op2 = gen_lowpart (SImode, op2);
38630 op1 = gen_lowpart (SImode, op1);
38631 pat = GEN_FCN (icode) (op0, op1, op2);
38633 else
38635 switch (fcode)
38637 case IX86_BUILTIN_XSAVE:
38638 icode = CODE_FOR_xsave;
38639 break;
38640 case IX86_BUILTIN_XRSTOR:
38641 icode = CODE_FOR_xrstor;
38642 break;
38643 case IX86_BUILTIN_XSAVEOPT:
38644 icode = CODE_FOR_xsaveopt;
38645 break;
38646 case IX86_BUILTIN_XSAVES:
38647 icode = CODE_FOR_xsaves;
38648 break;
38649 case IX86_BUILTIN_XRSTORS:
38650 icode = CODE_FOR_xrstors;
38651 break;
38652 case IX86_BUILTIN_XSAVEC:
38653 icode = CODE_FOR_xsavec;
38654 break;
38655 default:
38656 gcc_unreachable ();
38658 pat = GEN_FCN (icode) (op0, op1);
38661 if (pat)
38662 emit_insn (pat);
38663 return 0;
38665 case IX86_BUILTIN_LLWPCB:
38666 arg0 = CALL_EXPR_ARG (exp, 0);
38667 op0 = expand_normal (arg0);
38668 icode = CODE_FOR_lwp_llwpcb;
38669 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38670 op0 = ix86_zero_extend_to_Pmode (op0);
38671 emit_insn (gen_lwp_llwpcb (op0));
38672 return 0;
38674 case IX86_BUILTIN_SLWPCB:
38675 icode = CODE_FOR_lwp_slwpcb;
38676 if (!target
38677 || !insn_data[icode].operand[0].predicate (target, Pmode))
38678 target = gen_reg_rtx (Pmode);
38679 emit_insn (gen_lwp_slwpcb (target));
38680 return target;
38682 case IX86_BUILTIN_BEXTRI32:
38683 case IX86_BUILTIN_BEXTRI64:
38684 arg0 = CALL_EXPR_ARG (exp, 0);
38685 arg1 = CALL_EXPR_ARG (exp, 1);
38686 op0 = expand_normal (arg0);
38687 op1 = expand_normal (arg1);
38688 icode = (fcode == IX86_BUILTIN_BEXTRI32
38689 ? CODE_FOR_tbm_bextri_si
38690 : CODE_FOR_tbm_bextri_di);
38691 if (!CONST_INT_P (op1))
38693 error ("last argument must be an immediate");
38694 return const0_rtx;
38696 else
38698 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38699 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38700 op1 = GEN_INT (length);
38701 op2 = GEN_INT (lsb_index);
38702 pat = GEN_FCN (icode) (target, op0, op1, op2);
38703 if (pat)
38704 emit_insn (pat);
38705 return target;
38708 case IX86_BUILTIN_RDRAND16_STEP:
38709 icode = CODE_FOR_rdrandhi_1;
38710 mode0 = HImode;
38711 goto rdrand_step;
38713 case IX86_BUILTIN_RDRAND32_STEP:
38714 icode = CODE_FOR_rdrandsi_1;
38715 mode0 = SImode;
38716 goto rdrand_step;
38718 case IX86_BUILTIN_RDRAND64_STEP:
38719 icode = CODE_FOR_rdranddi_1;
38720 mode0 = DImode;
38722 rdrand_step:
38723 arg0 = CALL_EXPR_ARG (exp, 0);
38724 op1 = expand_normal (arg0);
38725 if (!address_operand (op1, VOIDmode))
38727 op1 = convert_memory_address (Pmode, op1);
38728 op1 = copy_addr_to_reg (op1);
38731 op0 = gen_reg_rtx (mode0);
38732 emit_insn (GEN_FCN (icode) (op0));
38734 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38736 op1 = gen_reg_rtx (SImode);
38737 emit_move_insn (op1, CONST1_RTX (SImode));
38739 /* Emit SImode conditional move. */
38740 if (mode0 == HImode)
38742 if (TARGET_ZERO_EXTEND_WITH_AND
38743 && optimize_function_for_speed_p (cfun))
38745 op2 = force_reg (SImode, const0_rtx);
38747 emit_insn (gen_movstricthi
38748 (gen_lowpart (HImode, op2), op0));
38750 else
38752 op2 = gen_reg_rtx (SImode);
38754 emit_insn (gen_zero_extendhisi2 (op2, op0));
38757 else if (mode0 == SImode)
38758 op2 = op0;
38759 else
38760 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38762 if (target == 0
38763 || !register_operand (target, SImode))
38764 target = gen_reg_rtx (SImode);
38766 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38767 const0_rtx);
38768 emit_insn (gen_rtx_SET (target,
38769 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38770 return target;
38772 case IX86_BUILTIN_RDSEED16_STEP:
38773 icode = CODE_FOR_rdseedhi_1;
38774 mode0 = HImode;
38775 goto rdseed_step;
38777 case IX86_BUILTIN_RDSEED32_STEP:
38778 icode = CODE_FOR_rdseedsi_1;
38779 mode0 = SImode;
38780 goto rdseed_step;
38782 case IX86_BUILTIN_RDSEED64_STEP:
38783 icode = CODE_FOR_rdseeddi_1;
38784 mode0 = DImode;
38786 rdseed_step:
38787 arg0 = CALL_EXPR_ARG (exp, 0);
38788 op1 = expand_normal (arg0);
38789 if (!address_operand (op1, VOIDmode))
38791 op1 = convert_memory_address (Pmode, op1);
38792 op1 = copy_addr_to_reg (op1);
38795 op0 = gen_reg_rtx (mode0);
38796 emit_insn (GEN_FCN (icode) (op0));
38798 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38800 op2 = gen_reg_rtx (QImode);
38802 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38803 const0_rtx);
38804 emit_insn (gen_rtx_SET (op2, pat));
38806 if (target == 0
38807 || !register_operand (target, SImode))
38808 target = gen_reg_rtx (SImode);
38810 emit_insn (gen_zero_extendqisi2 (target, op2));
38811 return target;
38813 case IX86_BUILTIN_SBB32:
38814 icode = CODE_FOR_subborrowsi;
38815 mode0 = SImode;
38816 goto handlecarry;
38818 case IX86_BUILTIN_SBB64:
38819 icode = CODE_FOR_subborrowdi;
38820 mode0 = DImode;
38821 goto handlecarry;
38823 case IX86_BUILTIN_ADDCARRYX32:
38824 icode = CODE_FOR_addcarrysi;
38825 mode0 = SImode;
38826 goto handlecarry;
38828 case IX86_BUILTIN_ADDCARRYX64:
38829 icode = CODE_FOR_addcarrydi;
38830 mode0 = DImode;
38832 handlecarry:
38833 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38834 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38835 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38836 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38838 op1 = expand_normal (arg0);
38839 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38841 op2 = expand_normal (arg1);
38842 if (!register_operand (op2, mode0))
38843 op2 = copy_to_mode_reg (mode0, op2);
38845 op3 = expand_normal (arg2);
38846 if (!register_operand (op3, mode0))
38847 op3 = copy_to_mode_reg (mode0, op3);
38849 op4 = expand_normal (arg3);
38850 if (!address_operand (op4, VOIDmode))
38852 op4 = convert_memory_address (Pmode, op4);
38853 op4 = copy_addr_to_reg (op4);
38856 /* Generate CF from input operand. */
38857 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38859 /* Generate instruction that consumes CF. */
38860 op0 = gen_reg_rtx (mode0);
38862 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38863 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38864 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38866 /* Return current CF value. */
38867 if (target == 0)
38868 target = gen_reg_rtx (QImode);
38870 PUT_MODE (pat, QImode);
38871 emit_insn (gen_rtx_SET (target, pat));
38873 /* Store the result. */
38874 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38876 return target;
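/* Editorial note: a hedged restatement, not from the original sources, of
   the handlecarry expansion.  Semantically the addcarry forms compute

     wide     = (widened) src1 + src2 + c_in;
     *sum_out = (narrow) wide;
     return the carry-out of the addition;

   (the sbb forms subtract with borrow instead).  The addqi3_cconly_overflow
   above sets CF exactly when the incoming carry byte is nonzero, since
   c_in + 0xff carries out for any c_in >= 1; the main pattern then consumes
   that CF, and the final SET copies the resulting CF into the QImode return
   value before the sum is stored through op4.  */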
38878 case IX86_BUILTIN_READ_FLAGS:
38879 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38881 if (optimize
38882 || target == NULL_RTX
38883 || !nonimmediate_operand (target, word_mode)
38884 || GET_MODE (target) != word_mode)
38885 target = gen_reg_rtx (word_mode);
38887 emit_insn (gen_pop (target));
38888 return target;
38890 case IX86_BUILTIN_WRITE_FLAGS:
38892 arg0 = CALL_EXPR_ARG (exp, 0);
38893 op0 = expand_normal (arg0);
38894 if (!general_no_elim_operand (op0, word_mode))
38895 op0 = copy_to_mode_reg (word_mode, op0);
38897 emit_insn (gen_push (op0));
38898 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38899 return 0;
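/* Editorial note (hedged, not part of the original file): the two flag
   builtins above are just a trip through the stack,

     read:   pushf ; pop target
     write:  push op0 ; popf

   expressed with word_mode pushes and pops so the same expansion works for
   both -m32 and -m64.  */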
38901 case IX86_BUILTIN_KTESTC8:
38902 icode = CODE_FOR_ktestqi;
38903 mode3 = CCCmode;
38904 goto kortest;
38906 case IX86_BUILTIN_KTESTZ8:
38907 icode = CODE_FOR_ktestqi;
38908 mode3 = CCZmode;
38909 goto kortest;
38911 case IX86_BUILTIN_KTESTC16:
38912 icode = CODE_FOR_ktesthi;
38913 mode3 = CCCmode;
38914 goto kortest;
38916 case IX86_BUILTIN_KTESTZ16:
38917 icode = CODE_FOR_ktesthi;
38918 mode3 = CCZmode;
38919 goto kortest;
38921 case IX86_BUILTIN_KTESTC32:
38922 icode = CODE_FOR_ktestsi;
38923 mode3 = CCCmode;
38924 goto kortest;
38926 case IX86_BUILTIN_KTESTZ32:
38927 icode = CODE_FOR_ktestsi;
38928 mode3 = CCZmode;
38929 goto kortest;
38931 case IX86_BUILTIN_KTESTC64:
38932 icode = CODE_FOR_ktestdi;
38933 mode3 = CCCmode;
38934 goto kortest;
38936 case IX86_BUILTIN_KTESTZ64:
38937 icode = CODE_FOR_ktestdi;
38938 mode3 = CCZmode;
38939 goto kortest;
38941 case IX86_BUILTIN_KORTESTC8:
38942 icode = CODE_FOR_kortestqi;
38943 mode3 = CCCmode;
38944 goto kortest;
38946 case IX86_BUILTIN_KORTESTZ8:
38947 icode = CODE_FOR_kortestqi;
38948 mode3 = CCZmode;
38949 goto kortest;
38951 case IX86_BUILTIN_KORTESTC16:
38952 icode = CODE_FOR_kortesthi;
38953 mode3 = CCCmode;
38954 goto kortest;
38956 case IX86_BUILTIN_KORTESTZ16:
38957 icode = CODE_FOR_kortesthi;
38958 mode3 = CCZmode;
38959 goto kortest;
38961 case IX86_BUILTIN_KORTESTC32:
38962 icode = CODE_FOR_kortestsi;
38963 mode3 = CCCmode;
38964 goto kortest;
38966 case IX86_BUILTIN_KORTESTZ32:
38967 icode = CODE_FOR_kortestsi;
38968 mode3 = CCZmode;
38969 goto kortest;
38971 case IX86_BUILTIN_KORTESTC64:
38972 icode = CODE_FOR_kortestdi;
38973 mode3 = CCCmode;
38974 goto kortest;
38976 case IX86_BUILTIN_KORTESTZ64:
38977 icode = CODE_FOR_kortestdi;
38978 mode3 = CCZmode;
38980 kortest:
38981 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38982 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38983 op0 = expand_normal (arg0);
38984 op1 = expand_normal (arg1);
38986 mode0 = insn_data[icode].operand[0].mode;
38987 mode1 = insn_data[icode].operand[1].mode;
38989 if (GET_MODE (op0) != VOIDmode)
38990 op0 = force_reg (GET_MODE (op0), op0);
38992 op0 = gen_lowpart (mode0, op0);
38994 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38995 op0 = copy_to_mode_reg (mode0, op0);
38997 if (GET_MODE (op1) != VOIDmode)
38998 op1 = force_reg (GET_MODE (op1), op1);
39000 op1 = gen_lowpart (mode1, op1);
39002 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39003 op1 = copy_to_mode_reg (mode1, op1);
39005 target = gen_reg_rtx (QImode);
39007 /* Emit kortest. */
39008 emit_insn (GEN_FCN (icode) (op0, op1));
39009 /* And use setcc to return result from flags. */
39010 ix86_expand_setcc (target, EQ,
39011 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
39012 return target;
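/* Editorial note (hedged, not from the original sources): KORTEST ors the
   two mask operands purely for its flag effects, so at the intrinsic level

     kortestz variants return 1 iff (src1 | src2) is all zeros   (ZF)
     kortestc variants return 1 iff (src1 | src2) is all ones    (CF)

   and the KTEST variants test (src1 & src2) == 0 for the Z forms and
   (src1 & ~src2) == 0 for the C forms; mode3 above selects CCZmode or
   CCCmode so the setcc reads the right flag.  */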
39014 case IX86_BUILTIN_GATHERSIV2DF:
39015 icode = CODE_FOR_avx2_gathersiv2df;
39016 goto gather_gen;
39017 case IX86_BUILTIN_GATHERSIV4DF:
39018 icode = CODE_FOR_avx2_gathersiv4df;
39019 goto gather_gen;
39020 case IX86_BUILTIN_GATHERDIV2DF:
39021 icode = CODE_FOR_avx2_gatherdiv2df;
39022 goto gather_gen;
39023 case IX86_BUILTIN_GATHERDIV4DF:
39024 icode = CODE_FOR_avx2_gatherdiv4df;
39025 goto gather_gen;
39026 case IX86_BUILTIN_GATHERSIV4SF:
39027 icode = CODE_FOR_avx2_gathersiv4sf;
39028 goto gather_gen;
39029 case IX86_BUILTIN_GATHERSIV8SF:
39030 icode = CODE_FOR_avx2_gathersiv8sf;
39031 goto gather_gen;
39032 case IX86_BUILTIN_GATHERDIV4SF:
39033 icode = CODE_FOR_avx2_gatherdiv4sf;
39034 goto gather_gen;
39035 case IX86_BUILTIN_GATHERDIV8SF:
39036 icode = CODE_FOR_avx2_gatherdiv8sf;
39037 goto gather_gen;
39038 case IX86_BUILTIN_GATHERSIV2DI:
39039 icode = CODE_FOR_avx2_gathersiv2di;
39040 goto gather_gen;
39041 case IX86_BUILTIN_GATHERSIV4DI:
39042 icode = CODE_FOR_avx2_gathersiv4di;
39043 goto gather_gen;
39044 case IX86_BUILTIN_GATHERDIV2DI:
39045 icode = CODE_FOR_avx2_gatherdiv2di;
39046 goto gather_gen;
39047 case IX86_BUILTIN_GATHERDIV4DI:
39048 icode = CODE_FOR_avx2_gatherdiv4di;
39049 goto gather_gen;
39050 case IX86_BUILTIN_GATHERSIV4SI:
39051 icode = CODE_FOR_avx2_gathersiv4si;
39052 goto gather_gen;
39053 case IX86_BUILTIN_GATHERSIV8SI:
39054 icode = CODE_FOR_avx2_gathersiv8si;
39055 goto gather_gen;
39056 case IX86_BUILTIN_GATHERDIV4SI:
39057 icode = CODE_FOR_avx2_gatherdiv4si;
39058 goto gather_gen;
39059 case IX86_BUILTIN_GATHERDIV8SI:
39060 icode = CODE_FOR_avx2_gatherdiv8si;
39061 goto gather_gen;
39062 case IX86_BUILTIN_GATHERALTSIV4DF:
39063 icode = CODE_FOR_avx2_gathersiv4df;
39064 goto gather_gen;
39065 case IX86_BUILTIN_GATHERALTDIV8SF:
39066 icode = CODE_FOR_avx2_gatherdiv8sf;
39067 goto gather_gen;
39068 case IX86_BUILTIN_GATHERALTSIV4DI:
39069 icode = CODE_FOR_avx2_gathersiv4di;
39070 goto gather_gen;
39071 case IX86_BUILTIN_GATHERALTDIV8SI:
39072 icode = CODE_FOR_avx2_gatherdiv8si;
39073 goto gather_gen;
39074 case IX86_BUILTIN_GATHER3SIV16SF:
39075 icode = CODE_FOR_avx512f_gathersiv16sf;
39076 goto gather_gen;
39077 case IX86_BUILTIN_GATHER3SIV8DF:
39078 icode = CODE_FOR_avx512f_gathersiv8df;
39079 goto gather_gen;
39080 case IX86_BUILTIN_GATHER3DIV16SF:
39081 icode = CODE_FOR_avx512f_gatherdiv16sf;
39082 goto gather_gen;
39083 case IX86_BUILTIN_GATHER3DIV8DF:
39084 icode = CODE_FOR_avx512f_gatherdiv8df;
39085 goto gather_gen;
39086 case IX86_BUILTIN_GATHER3SIV16SI:
39087 icode = CODE_FOR_avx512f_gathersiv16si;
39088 goto gather_gen;
39089 case IX86_BUILTIN_GATHER3SIV8DI:
39090 icode = CODE_FOR_avx512f_gathersiv8di;
39091 goto gather_gen;
39092 case IX86_BUILTIN_GATHER3DIV16SI:
39093 icode = CODE_FOR_avx512f_gatherdiv16si;
39094 goto gather_gen;
39095 case IX86_BUILTIN_GATHER3DIV8DI:
39096 icode = CODE_FOR_avx512f_gatherdiv8di;
39097 goto gather_gen;
39098 case IX86_BUILTIN_GATHER3ALTSIV8DF:
39099 icode = CODE_FOR_avx512f_gathersiv8df;
39100 goto gather_gen;
39101 case IX86_BUILTIN_GATHER3ALTDIV16SF:
39102 icode = CODE_FOR_avx512f_gatherdiv16sf;
39103 goto gather_gen;
39104 case IX86_BUILTIN_GATHER3ALTSIV8DI:
39105 icode = CODE_FOR_avx512f_gathersiv8di;
39106 goto gather_gen;
39107 case IX86_BUILTIN_GATHER3ALTDIV16SI:
39108 icode = CODE_FOR_avx512f_gatherdiv16si;
39109 goto gather_gen;
39110 case IX86_BUILTIN_GATHER3SIV2DF:
39111 icode = CODE_FOR_avx512vl_gathersiv2df;
39112 goto gather_gen;
39113 case IX86_BUILTIN_GATHER3SIV4DF:
39114 icode = CODE_FOR_avx512vl_gathersiv4df;
39115 goto gather_gen;
39116 case IX86_BUILTIN_GATHER3DIV2DF:
39117 icode = CODE_FOR_avx512vl_gatherdiv2df;
39118 goto gather_gen;
39119 case IX86_BUILTIN_GATHER3DIV4DF:
39120 icode = CODE_FOR_avx512vl_gatherdiv4df;
39121 goto gather_gen;
39122 case IX86_BUILTIN_GATHER3SIV4SF:
39123 icode = CODE_FOR_avx512vl_gathersiv4sf;
39124 goto gather_gen;
39125 case IX86_BUILTIN_GATHER3SIV8SF:
39126 icode = CODE_FOR_avx512vl_gathersiv8sf;
39127 goto gather_gen;
39128 case IX86_BUILTIN_GATHER3DIV4SF:
39129 icode = CODE_FOR_avx512vl_gatherdiv4sf;
39130 goto gather_gen;
39131 case IX86_BUILTIN_GATHER3DIV8SF:
39132 icode = CODE_FOR_avx512vl_gatherdiv8sf;
39133 goto gather_gen;
39134 case IX86_BUILTIN_GATHER3SIV2DI:
39135 icode = CODE_FOR_avx512vl_gathersiv2di;
39136 goto gather_gen;
39137 case IX86_BUILTIN_GATHER3SIV4DI:
39138 icode = CODE_FOR_avx512vl_gathersiv4di;
39139 goto gather_gen;
39140 case IX86_BUILTIN_GATHER3DIV2DI:
39141 icode = CODE_FOR_avx512vl_gatherdiv2di;
39142 goto gather_gen;
39143 case IX86_BUILTIN_GATHER3DIV4DI:
39144 icode = CODE_FOR_avx512vl_gatherdiv4di;
39145 goto gather_gen;
39146 case IX86_BUILTIN_GATHER3SIV4SI:
39147 icode = CODE_FOR_avx512vl_gathersiv4si;
39148 goto gather_gen;
39149 case IX86_BUILTIN_GATHER3SIV8SI:
39150 icode = CODE_FOR_avx512vl_gathersiv8si;
39151 goto gather_gen;
39152 case IX86_BUILTIN_GATHER3DIV4SI:
39153 icode = CODE_FOR_avx512vl_gatherdiv4si;
39154 goto gather_gen;
39155 case IX86_BUILTIN_GATHER3DIV8SI:
39156 icode = CODE_FOR_avx512vl_gatherdiv8si;
39157 goto gather_gen;
39158 case IX86_BUILTIN_GATHER3ALTSIV4DF:
39159 icode = CODE_FOR_avx512vl_gathersiv4df;
39160 goto gather_gen;
39161 case IX86_BUILTIN_GATHER3ALTDIV8SF:
39162 icode = CODE_FOR_avx512vl_gatherdiv8sf;
39163 goto gather_gen;
39164 case IX86_BUILTIN_GATHER3ALTSIV4DI:
39165 icode = CODE_FOR_avx512vl_gathersiv4di;
39166 goto gather_gen;
39167 case IX86_BUILTIN_GATHER3ALTDIV8SI:
39168 icode = CODE_FOR_avx512vl_gatherdiv8si;
39169 goto gather_gen;
39170 case IX86_BUILTIN_SCATTERSIV16SF:
39171 icode = CODE_FOR_avx512f_scattersiv16sf;
39172 goto scatter_gen;
39173 case IX86_BUILTIN_SCATTERSIV8DF:
39174 icode = CODE_FOR_avx512f_scattersiv8df;
39175 goto scatter_gen;
39176 case IX86_BUILTIN_SCATTERDIV16SF:
39177 icode = CODE_FOR_avx512f_scatterdiv16sf;
39178 goto scatter_gen;
39179 case IX86_BUILTIN_SCATTERDIV8DF:
39180 icode = CODE_FOR_avx512f_scatterdiv8df;
39181 goto scatter_gen;
39182 case IX86_BUILTIN_SCATTERSIV16SI:
39183 icode = CODE_FOR_avx512f_scattersiv16si;
39184 goto scatter_gen;
39185 case IX86_BUILTIN_SCATTERSIV8DI:
39186 icode = CODE_FOR_avx512f_scattersiv8di;
39187 goto scatter_gen;
39188 case IX86_BUILTIN_SCATTERDIV16SI:
39189 icode = CODE_FOR_avx512f_scatterdiv16si;
39190 goto scatter_gen;
39191 case IX86_BUILTIN_SCATTERDIV8DI:
39192 icode = CODE_FOR_avx512f_scatterdiv8di;
39193 goto scatter_gen;
39194 case IX86_BUILTIN_SCATTERSIV8SF:
39195 icode = CODE_FOR_avx512vl_scattersiv8sf;
39196 goto scatter_gen;
39197 case IX86_BUILTIN_SCATTERSIV4SF:
39198 icode = CODE_FOR_avx512vl_scattersiv4sf;
39199 goto scatter_gen;
39200 case IX86_BUILTIN_SCATTERSIV4DF:
39201 icode = CODE_FOR_avx512vl_scattersiv4df;
39202 goto scatter_gen;
39203 case IX86_BUILTIN_SCATTERSIV2DF:
39204 icode = CODE_FOR_avx512vl_scattersiv2df;
39205 goto scatter_gen;
39206 case IX86_BUILTIN_SCATTERDIV8SF:
39207 icode = CODE_FOR_avx512vl_scatterdiv8sf;
39208 goto scatter_gen;
39209 case IX86_BUILTIN_SCATTERDIV4SF:
39210 icode = CODE_FOR_avx512vl_scatterdiv4sf;
39211 goto scatter_gen;
39212 case IX86_BUILTIN_SCATTERDIV4DF:
39213 icode = CODE_FOR_avx512vl_scatterdiv4df;
39214 goto scatter_gen;
39215 case IX86_BUILTIN_SCATTERDIV2DF:
39216 icode = CODE_FOR_avx512vl_scatterdiv2df;
39217 goto scatter_gen;
39218 case IX86_BUILTIN_SCATTERSIV8SI:
39219 icode = CODE_FOR_avx512vl_scattersiv8si;
39220 goto scatter_gen;
39221 case IX86_BUILTIN_SCATTERSIV4SI:
39222 icode = CODE_FOR_avx512vl_scattersiv4si;
39223 goto scatter_gen;
39224 case IX86_BUILTIN_SCATTERSIV4DI:
39225 icode = CODE_FOR_avx512vl_scattersiv4di;
39226 goto scatter_gen;
39227 case IX86_BUILTIN_SCATTERSIV2DI:
39228 icode = CODE_FOR_avx512vl_scattersiv2di;
39229 goto scatter_gen;
39230 case IX86_BUILTIN_SCATTERDIV8SI:
39231 icode = CODE_FOR_avx512vl_scatterdiv8si;
39232 goto scatter_gen;
39233 case IX86_BUILTIN_SCATTERDIV4SI:
39234 icode = CODE_FOR_avx512vl_scatterdiv4si;
39235 goto scatter_gen;
39236 case IX86_BUILTIN_SCATTERDIV4DI:
39237 icode = CODE_FOR_avx512vl_scatterdiv4di;
39238 goto scatter_gen;
39239 case IX86_BUILTIN_SCATTERDIV2DI:
39240 icode = CODE_FOR_avx512vl_scatterdiv2di;
39241 goto scatter_gen;
39242 case IX86_BUILTIN_GATHERPFDPD:
39243 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
39244 goto vec_prefetch_gen;
39245 case IX86_BUILTIN_SCATTERALTSIV8DF:
39246 icode = CODE_FOR_avx512f_scattersiv8df;
39247 goto scatter_gen;
39248 case IX86_BUILTIN_SCATTERALTDIV16SF:
39249 icode = CODE_FOR_avx512f_scatterdiv16sf;
39250 goto scatter_gen;
39251 case IX86_BUILTIN_SCATTERALTSIV8DI:
39252 icode = CODE_FOR_avx512f_scattersiv8di;
39253 goto scatter_gen;
39254 case IX86_BUILTIN_SCATTERALTDIV16SI:
39255 icode = CODE_FOR_avx512f_scatterdiv16si;
39256 goto scatter_gen;
39257 case IX86_BUILTIN_GATHERPFDPS:
39258 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
39259 goto vec_prefetch_gen;
39260 case IX86_BUILTIN_GATHERPFQPD:
39261 icode = CODE_FOR_avx512pf_gatherpfv8didf;
39262 goto vec_prefetch_gen;
39263 case IX86_BUILTIN_GATHERPFQPS:
39264 icode = CODE_FOR_avx512pf_gatherpfv8disf;
39265 goto vec_prefetch_gen;
39266 case IX86_BUILTIN_SCATTERPFDPD:
39267 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
39268 goto vec_prefetch_gen;
39269 case IX86_BUILTIN_SCATTERPFDPS:
39270 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
39271 goto vec_prefetch_gen;
39272 case IX86_BUILTIN_SCATTERPFQPD:
39273 icode = CODE_FOR_avx512pf_scatterpfv8didf;
39274 goto vec_prefetch_gen;
39275 case IX86_BUILTIN_SCATTERPFQPS:
39276 icode = CODE_FOR_avx512pf_scatterpfv8disf;
39277 goto vec_prefetch_gen;
39279 gather_gen:
39280 rtx half;
39281 rtx (*gen) (rtx, rtx);
39283 arg0 = CALL_EXPR_ARG (exp, 0);
39284 arg1 = CALL_EXPR_ARG (exp, 1);
39285 arg2 = CALL_EXPR_ARG (exp, 2);
39286 arg3 = CALL_EXPR_ARG (exp, 3);
39287 arg4 = CALL_EXPR_ARG (exp, 4);
39288 op0 = expand_normal (arg0);
39289 op1 = expand_normal (arg1);
39290 op2 = expand_normal (arg2);
39291 op3 = expand_normal (arg3);
39292 op4 = expand_normal (arg4);
39293 /* Note the arg order is different from the operand order. */
39294 mode0 = insn_data[icode].operand[1].mode;
39295 mode2 = insn_data[icode].operand[3].mode;
39296 mode3 = insn_data[icode].operand[4].mode;
39297 mode4 = insn_data[icode].operand[5].mode;
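	  /* Concretely, opN (expanded from argN) is matched below against
	     operand N + 1 of the gather pattern; operand 0 is the
	     destination written into SUBTARGET.  */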
39299 if (target == NULL_RTX
39300 || GET_MODE (target) != insn_data[icode].operand[0].mode
39301 || !insn_data[icode].operand[0].predicate (target,
39302 GET_MODE (target)))
39303 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
39304 else
39305 subtarget = target;
39307 switch (fcode)
39309 case IX86_BUILTIN_GATHER3ALTSIV8DF:
39310 case IX86_BUILTIN_GATHER3ALTSIV8DI:
39311 half = gen_reg_rtx (V8SImode);
39312 if (!nonimmediate_operand (op2, V16SImode))
39313 op2 = copy_to_mode_reg (V16SImode, op2);
39314 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39315 op2 = half;
39316 break;
39317 case IX86_BUILTIN_GATHER3ALTSIV4DF:
39318 case IX86_BUILTIN_GATHER3ALTSIV4DI:
39319 case IX86_BUILTIN_GATHERALTSIV4DF:
39320 case IX86_BUILTIN_GATHERALTSIV4DI:
39321 half = gen_reg_rtx (V4SImode);
39322 if (!nonimmediate_operand (op2, V8SImode))
39323 op2 = copy_to_mode_reg (V8SImode, op2);
39324 emit_insn (gen_vec_extract_lo_v8si (half, op2));
39325 op2 = half;
39326 break;
39327 case IX86_BUILTIN_GATHER3ALTDIV16SF:
39328 case IX86_BUILTIN_GATHER3ALTDIV16SI:
39329 half = gen_reg_rtx (mode0);
39330 if (mode0 == V8SFmode)
39331 gen = gen_vec_extract_lo_v16sf;
39332 else
39333 gen = gen_vec_extract_lo_v16si;
39334 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39335 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39336 emit_insn (gen (half, op0));
39337 op0 = half;
39338 if (GET_MODE (op3) != VOIDmode)
39340 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39341 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39342 emit_insn (gen (half, op3));
39343 op3 = half;
39345 break;
39346 case IX86_BUILTIN_GATHER3ALTDIV8SF:
39347 case IX86_BUILTIN_GATHER3ALTDIV8SI:
39348 case IX86_BUILTIN_GATHERALTDIV8SF:
39349 case IX86_BUILTIN_GATHERALTDIV8SI:
39350 half = gen_reg_rtx (mode0);
39351 if (mode0 == V4SFmode)
39352 gen = gen_vec_extract_lo_v8sf;
39353 else
39354 gen = gen_vec_extract_lo_v8si;
39355 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39356 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39357 emit_insn (gen (half, op0));
39358 op0 = half;
39359 if (GET_MODE (op3) != VOIDmode)
39361 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39362 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39363 emit_insn (gen (half, op3));
39364 op3 = half;
39366 break;
39367 default:
39368 break;
39371	  /* Force the memory operand to use only a base register here.  We
39372	     don't want to do this to the memory operands of other builtin
39373	     functions.  */
39374 op1 = ix86_zero_extend_to_Pmode (op1);
39376 if (!insn_data[icode].operand[1].predicate (op0, mode0))
39377 op0 = copy_to_mode_reg (mode0, op0);
39378 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
39379 op1 = copy_to_mode_reg (Pmode, op1);
39380 if (!insn_data[icode].operand[3].predicate (op2, mode2))
39381 op2 = copy_to_mode_reg (mode2, op2);
39383 op3 = fixup_modeless_constant (op3, mode3);
39385 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
39387 if (!insn_data[icode].operand[4].predicate (op3, mode3))
39388 op3 = copy_to_mode_reg (mode3, op3);
39390 else
39392 op3 = copy_to_reg (op3);
39393 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
39395 if (!insn_data[icode].operand[5].predicate (op4, mode4))
39397 error ("the last argument must be scale 1, 2, 4, 8");
39398 return const0_rtx;
39401 /* Optimize. If mask is known to have all high bits set,
39402 replace op0 with pc_rtx to signal that the instruction
39403 overwrites the whole destination and doesn't use its
39404 previous contents. */
39405 if (optimize)
39407 if (TREE_CODE (arg3) == INTEGER_CST)
39409 if (integer_all_onesp (arg3))
39410 op0 = pc_rtx;
39412 else if (TREE_CODE (arg3) == VECTOR_CST)
39414 unsigned int negative = 0;
39415 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
39417 tree cst = VECTOR_CST_ELT (arg3, i);
39418 if (TREE_CODE (cst) == INTEGER_CST
39419 && tree_int_cst_sign_bit (cst))
39420 negative++;
39421 else if (TREE_CODE (cst) == REAL_CST
39422 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
39423 negative++;
39425 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
39426 op0 = pc_rtx;
39428 else if (TREE_CODE (arg3) == SSA_NAME
39429 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
39431 /* Recognize also when mask is like:
39432 __v2df src = _mm_setzero_pd ();
39433 __v2df mask = _mm_cmpeq_pd (src, src);
39435 __v8sf src = _mm256_setzero_ps ();
39436 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
39437 as that is a cheaper way to load all ones into
39438 a register than having to load a constant from
39439 memory. */
39440 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
39441 if (is_gimple_call (def_stmt))
39443 tree fndecl = gimple_call_fndecl (def_stmt);
39444 if (fndecl
39445 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
39446 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
39448 case IX86_BUILTIN_CMPPD:
39449 case IX86_BUILTIN_CMPPS:
39450 case IX86_BUILTIN_CMPPD256:
39451 case IX86_BUILTIN_CMPPS256:
39452 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39453 break;
39454 /* FALLTHRU */
39455 case IX86_BUILTIN_CMPEQPD:
39456 case IX86_BUILTIN_CMPEQPS:
39457 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39458 && initializer_zerop (gimple_call_arg (def_stmt,
39459 1)))
39460 op0 = pc_rtx;
39461 break;
39462 default:
39463 break;
39469 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39470 if (! pat)
39471 return const0_rtx;
39472 emit_insn (pat);
39474 switch (fcode)
39476 case IX86_BUILTIN_GATHER3DIV16SF:
39477 if (target == NULL_RTX)
39478 target = gen_reg_rtx (V8SFmode);
39479 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39480 break;
39481 case IX86_BUILTIN_GATHER3DIV16SI:
39482 if (target == NULL_RTX)
39483 target = gen_reg_rtx (V8SImode);
39484 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39485 break;
39486 case IX86_BUILTIN_GATHER3DIV8SF:
39487 case IX86_BUILTIN_GATHERDIV8SF:
39488 if (target == NULL_RTX)
39489 target = gen_reg_rtx (V4SFmode);
39490 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39491 break;
39492 case IX86_BUILTIN_GATHER3DIV8SI:
39493 case IX86_BUILTIN_GATHERDIV8SI:
39494 if (target == NULL_RTX)
39495 target = gen_reg_rtx (V4SImode);
39496 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39497 break;
39498 default:
39499 target = subtarget;
39500 break;
39502 return target;
39504 scatter_gen:
39505 arg0 = CALL_EXPR_ARG (exp, 0);
39506 arg1 = CALL_EXPR_ARG (exp, 1);
39507 arg2 = CALL_EXPR_ARG (exp, 2);
39508 arg3 = CALL_EXPR_ARG (exp, 3);
39509 arg4 = CALL_EXPR_ARG (exp, 4);
39510 op0 = expand_normal (arg0);
39511 op1 = expand_normal (arg1);
39512 op2 = expand_normal (arg2);
39513 op3 = expand_normal (arg3);
39514 op4 = expand_normal (arg4);
39515 mode1 = insn_data[icode].operand[1].mode;
39516 mode2 = insn_data[icode].operand[2].mode;
39517 mode3 = insn_data[icode].operand[3].mode;
39518 mode4 = insn_data[icode].operand[4].mode;
39520 /* Scatter instruction stores operand op3 to memory with
39521 indices from op2 and scale from op4 under writemask op1.
39522	     If index operand op2 has more elements than source operand
39523	     op3, one needs to use only its low half.  And vice versa.  */
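	  /* For example, for IX86_BUILTIN_SCATTERALTSIV8DF the index comes in
	     as V16SI while only eight DF elements are stored, so just the low
	     V8SI half of the index is used; IX86_BUILTIN_SCATTERALTDIV16SF is
	     the converse case and uses only the low half of the V16SF source.  */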
39524 switch (fcode)
39526 case IX86_BUILTIN_SCATTERALTSIV8DF:
39527 case IX86_BUILTIN_SCATTERALTSIV8DI:
39528 half = gen_reg_rtx (V8SImode);
39529 if (!nonimmediate_operand (op2, V16SImode))
39530 op2 = copy_to_mode_reg (V16SImode, op2);
39531 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39532 op2 = half;
39533 break;
39534 case IX86_BUILTIN_SCATTERALTDIV16SF:
39535 case IX86_BUILTIN_SCATTERALTDIV16SI:
39536 half = gen_reg_rtx (mode3);
39537 if (mode3 == V8SFmode)
39538 gen = gen_vec_extract_lo_v16sf;
39539 else
39540 gen = gen_vec_extract_lo_v16si;
39541 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39542 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39543 emit_insn (gen (half, op3));
39544 op3 = half;
39545 break;
39546 default:
39547 break;
39550	  /* Force the memory operand to use only a base register here.  We
39551	     don't want to do this to the memory operands of other builtin
39552	     functions.  */
39553 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39555 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39556 op0 = copy_to_mode_reg (Pmode, op0);
39558 op1 = fixup_modeless_constant (op1, mode1);
39560 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39562 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39563 op1 = copy_to_mode_reg (mode1, op1);
39565 else
39567 op1 = copy_to_reg (op1);
39568 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39571 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39572 op2 = copy_to_mode_reg (mode2, op2);
39574 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39575 op3 = copy_to_mode_reg (mode3, op3);
39577 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39579 error ("the last argument must be scale 1, 2, 4, 8");
39580 return const0_rtx;
39583 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39584 if (! pat)
39585 return const0_rtx;
39587 emit_insn (pat);
39588 return 0;
39590 vec_prefetch_gen:
39591 arg0 = CALL_EXPR_ARG (exp, 0);
39592 arg1 = CALL_EXPR_ARG (exp, 1);
39593 arg2 = CALL_EXPR_ARG (exp, 2);
39594 arg3 = CALL_EXPR_ARG (exp, 3);
39595 arg4 = CALL_EXPR_ARG (exp, 4);
39596 op0 = expand_normal (arg0);
39597 op1 = expand_normal (arg1);
39598 op2 = expand_normal (arg2);
39599 op3 = expand_normal (arg3);
39600 op4 = expand_normal (arg4);
39601 mode0 = insn_data[icode].operand[0].mode;
39602 mode1 = insn_data[icode].operand[1].mode;
39603 mode3 = insn_data[icode].operand[3].mode;
39604 mode4 = insn_data[icode].operand[4].mode;
39606 op0 = fixup_modeless_constant (op0, mode0);
39608 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39610 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39611 op0 = copy_to_mode_reg (mode0, op0);
39613 else
39615 op0 = copy_to_reg (op0);
39616 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39619 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39620 op1 = copy_to_mode_reg (mode1, op1);
39622	  /* Force the memory operand to use only a base register here.  We
39623	     don't want to do this to the memory operands of other builtin
39624	     functions.  */
39625 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39627 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39628 op2 = copy_to_mode_reg (Pmode, op2);
39630 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39632	      error ("the fourth argument must be scale 1, 2, 4, 8");
39633 return const0_rtx;
39636 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39638 error ("incorrect hint operand");
39639 return const0_rtx;
39642 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39643 if (! pat)
39644 return const0_rtx;
39646 emit_insn (pat);
39648 return 0;
39650 case IX86_BUILTIN_XABORT:
39651 icode = CODE_FOR_xabort;
39652 arg0 = CALL_EXPR_ARG (exp, 0);
39653 op0 = expand_normal (arg0);
39654 mode0 = insn_data[icode].operand[0].mode;
39655 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39657 error ("the xabort's argument must be an 8-bit immediate");
39658 return const0_rtx;
39660 emit_insn (gen_xabort (op0));
39661 return 0;
39663 default:
39664 break;
39667 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39668 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39670 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39671 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39672 target);
39675 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39676 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39678 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39679 switch (fcode)
39681 case IX86_BUILTIN_FABSQ:
39682 case IX86_BUILTIN_COPYSIGNQ:
39683 if (!TARGET_SSE)
39684 /* Emit a normal call if SSE isn't available. */
39685 return expand_call (exp, target, ignore);
39686 /* FALLTHRU */
39687 default:
39688 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39692 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39693 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39695 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39696 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39697 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39698 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39699 int masked = 1;
39700 machine_mode mode, wide_mode, nar_mode;
39702 nar_mode = V4SFmode;
39703 mode = V16SFmode;
39704 wide_mode = V64SFmode;
39705 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39706 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39708 switch (fcode)
39710 case IX86_BUILTIN_4FMAPS:
39711 fcn = gen_avx5124fmaddps_4fmaddps;
39712 masked = 0;
39713 goto v4fma_expand;
39715 case IX86_BUILTIN_4DPWSSD:
39716 nar_mode = V4SImode;
39717 mode = V16SImode;
39718 wide_mode = V64SImode;
39719 fcn = gen_avx5124vnniw_vp4dpwssd;
39720 masked = 0;
39721 goto v4fma_expand;
39723 case IX86_BUILTIN_4DPWSSDS:
39724 nar_mode = V4SImode;
39725 mode = V16SImode;
39726 wide_mode = V64SImode;
39727 fcn = gen_avx5124vnniw_vp4dpwssds;
39728 masked = 0;
39729 goto v4fma_expand;
39731 case IX86_BUILTIN_4FNMAPS:
39732 fcn = gen_avx5124fmaddps_4fnmaddps;
39733 masked = 0;
39734 goto v4fma_expand;
39736 case IX86_BUILTIN_4FNMAPS_MASK:
39737 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39738 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39739 goto v4fma_expand;
39741 case IX86_BUILTIN_4DPWSSD_MASK:
39742 nar_mode = V4SImode;
39743 mode = V16SImode;
39744 wide_mode = V64SImode;
39745 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39746 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39747 goto v4fma_expand;
39749 case IX86_BUILTIN_4DPWSSDS_MASK:
39750 nar_mode = V4SImode;
39751 mode = V16SImode;
39752 wide_mode = V64SImode;
39753 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39754 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39755 goto v4fma_expand;
39757 case IX86_BUILTIN_4FMAPS_MASK:
39759 tree args[4];
39760 rtx ops[4];
39761 rtx wide_reg;
39762 rtx accum;
39763 rtx addr;
39764 rtx mem;
39766 v4fma_expand:
39767 wide_reg = gen_reg_rtx (wide_mode);
39768 for (i = 0; i < 4; i++)
39770 args[i] = CALL_EXPR_ARG (exp, i);
39771 ops[i] = expand_normal (args[i]);
39773 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39774 ops[i]);
39777 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39778 accum = force_reg (mode, accum);
39780 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39781 addr = force_reg (Pmode, addr);
39783 mem = gen_rtx_MEM (nar_mode, addr);
39785 target = gen_reg_rtx (mode);
39787 emit_move_insn (target, accum);
39789 if (! masked)
39790 emit_insn (fcn (target, accum, wide_reg, mem));
39791 else
39793 rtx merge, mask;
39794 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39796 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39798 if (CONST_INT_P (mask))
39799 mask = fixup_modeless_constant (mask, HImode);
39801 mask = force_reg (HImode, mask);
39803 if (GET_MODE (mask) != HImode)
39804 mask = gen_rtx_SUBREG (HImode, mask, 0);
39806 /* If merge is 0 then we're about to emit z-masked variant. */
39807 if (const0_operand (merge, mode))
39808 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39809 /* If merge is the same as accum then emit merge-masked variant. */
39810 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39812 merge = force_reg (mode, merge);
39813 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39815	      /* Merging with something unknown can happen if we z-mask with -O0.  */
39816 else
39818 target = gen_reg_rtx (mode);
39819 emit_move_insn (target, merge);
39820 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39823 return target;
39826 case IX86_BUILTIN_4FNMASS:
39827 fcn = gen_avx5124fmaddps_4fnmaddss;
39828 masked = 0;
39829 goto s4fma_expand;
39831 case IX86_BUILTIN_4FMASS:
39832 fcn = gen_avx5124fmaddps_4fmaddss;
39833 masked = 0;
39834 goto s4fma_expand;
39836 case IX86_BUILTIN_4FNMASS_MASK:
39837 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39838 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39839 goto s4fma_expand;
39841 case IX86_BUILTIN_4FMASS_MASK:
39843 tree args[4];
39844 rtx ops[4];
39845 rtx wide_reg;
39846 rtx accum;
39847 rtx addr;
39848 rtx mem;
39850 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39851 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
39853 s4fma_expand:
39854 mode = V4SFmode;
39855 wide_reg = gen_reg_rtx (V64SFmode);
39856 for (i = 0; i < 4; i++)
39858 rtx tmp;
39859 args[i] = CALL_EXPR_ARG (exp, i);
39860 ops[i] = expand_normal (args[i]);
39862 tmp = gen_reg_rtx (SFmode);
39863 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39865 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39866 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39869 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39870 accum = force_reg (V4SFmode, accum);
39872 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39873 addr = force_reg (Pmode, addr);
39875 mem = gen_rtx_MEM (V4SFmode, addr);
39877 target = gen_reg_rtx (V4SFmode);
39879 emit_move_insn (target, accum);
39881 if (! masked)
39882 emit_insn (fcn (target, accum, wide_reg, mem));
39883 else
39885 rtx merge, mask;
39886 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39888 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39890 if (CONST_INT_P (mask))
39891 mask = fixup_modeless_constant (mask, QImode);
39893 mask = force_reg (QImode, mask);
39895 if (GET_MODE (mask) != QImode)
39896 mask = gen_rtx_SUBREG (QImode, mask, 0);
39898 /* If merge is 0 then we're about to emit z-masked variant. */
39899 if (const0_operand (merge, mode))
39900 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39901 /* If merge is the same as accum then emit merge-masked
39902 variant. */
39903 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39905 merge = force_reg (mode, merge);
39906 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39908	      /* Merging with something unknown can happen if we z-mask
39909		 with -O0.  */
39910 else
39912 target = gen_reg_rtx (mode);
39913 emit_move_insn (target, merge);
39914 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39917 return target;
39919 case IX86_BUILTIN_RDPID:
39920 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39921 target);
39922 default:
39923 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39927 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39928 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39930 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39931 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39934 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39935 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39937 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39938 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39941 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39942 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39944 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39945 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39948 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39949 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39951 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39952 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39955 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39956 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39958 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39959 const struct builtin_description *d = bdesc_multi_arg + i;
39960 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39961 (enum ix86_builtin_func_type)
39962 d->flag, d->comparison);
39965 gcc_unreachable ();
39968 /* This returns the target-specific builtin with code CODE if
39969 current_function_decl has visibility on this builtin, which is checked
39970 using isa flags. Returns NULL_TREE otherwise. */
39972 static tree ix86_get_builtin (enum ix86_builtins code)
39974 struct cl_target_option *opts;
39975 tree target_tree = NULL_TREE;
39977 /* Determine the isa flags of current_function_decl. */
39979 if (current_function_decl)
39980 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39982 if (target_tree == NULL)
39983 target_tree = target_option_default_node;
39985 opts = TREE_TARGET_OPTION (target_tree);
39987 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39988 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39989 return ix86_builtin_decl (code, true);
39990 else
39991 return NULL_TREE;
39994	/* Return the function decl of the target-specific builtin that
39995	   corresponds to the MPX builtin passed in FCODE.  */
39996 static tree
39997 ix86_builtin_mpx_function (unsigned fcode)
39999 switch (fcode)
40001 case BUILT_IN_CHKP_BNDMK:
40002 return ix86_builtins[IX86_BUILTIN_BNDMK];
40004 case BUILT_IN_CHKP_BNDSTX:
40005 return ix86_builtins[IX86_BUILTIN_BNDSTX];
40007 case BUILT_IN_CHKP_BNDLDX:
40008 return ix86_builtins[IX86_BUILTIN_BNDLDX];
40010 case BUILT_IN_CHKP_BNDCL:
40011 return ix86_builtins[IX86_BUILTIN_BNDCL];
40013 case BUILT_IN_CHKP_BNDCU:
40014 return ix86_builtins[IX86_BUILTIN_BNDCU];
40016 case BUILT_IN_CHKP_BNDRET:
40017 return ix86_builtins[IX86_BUILTIN_BNDRET];
40019 case BUILT_IN_CHKP_INTERSECT:
40020 return ix86_builtins[IX86_BUILTIN_BNDINT];
40022 case BUILT_IN_CHKP_NARROW:
40023 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
40025 case BUILT_IN_CHKP_SIZEOF:
40026 return ix86_builtins[IX86_BUILTIN_SIZEOF];
40028 case BUILT_IN_CHKP_EXTRACT_LOWER:
40029 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
40031 case BUILT_IN_CHKP_EXTRACT_UPPER:
40032 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
40034 default:
40035 return NULL_TREE;
40038 gcc_unreachable ();
40041 /* Helper function for ix86_load_bounds and ix86_store_bounds.
40043 Return an address to be used to load/store bounds for pointer
40044 passed in SLOT.
40046	   SLOT_NO is an integer constant holding the number of a target
40047	   dependent special slot to be used in case SLOT is not a memory.
40049 SPECIAL_BASE is a pointer to be used as a base of fake address
40050 to access special slots in Bounds Table. SPECIAL_BASE[-1],
40051 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
40053 static rtx
40054 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
40056 rtx addr = NULL;
40058	  /* A NULL slot means we pass bounds for a pointer not passed to the
40059	     function at all.  A register slot means we pass the pointer in a
40060	     register.  In both these cases bounds are passed via the Bounds
40061	     Table.  Since we do not have an actual pointer stored in memory,
40062	     we have to use fake addresses to access the Bounds Table.  We
40063	     start with (special_base - sizeof (void*)) and decrease this
40064	     address by the pointer size to get addresses for other slots.  */
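	  /* For example, assuming 64-bit Pmode, slot 0 maps to
	     SPECIAL_BASE - 8 and slot 1 to SPECIAL_BASE - 16.  */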
40065 if (!slot || REG_P (slot))
40067 gcc_assert (CONST_INT_P (slot_no));
40068 addr = plus_constant (Pmode, special_base,
40069 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
40071	  /* If the pointer is passed in memory then its address is used to
40072	     access the Bounds Table.  */
40073 else if (MEM_P (slot))
40075 addr = XEXP (slot, 0);
40076 if (!register_operand (addr, Pmode))
40077 addr = copy_addr_to_reg (addr);
40079 else
40080 gcc_unreachable ();
40082 return addr;
40085 /* Expand pass uses this hook to load bounds for function parameter
40086 PTR passed in SLOT in case its bounds are not passed in a register.
40088 If SLOT is a memory, then bounds are loaded as for regular pointer
40089 loaded from memory. PTR may be NULL in case SLOT is a memory.
40090 In such case value of PTR (if required) may be loaded from SLOT.
40092	   If SLOT is NULL or a register then SLOT_NO is an integer constant
40093	   holding the number of the target dependent special slot which should be
40094	   used to obtain bounds.
40096 Return loaded bounds. */
40098 static rtx
40099 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
40101 rtx reg = gen_reg_rtx (BNDmode);
40102 rtx addr;
40104 /* Get address to be used to access Bounds Table. Special slots start
40105 at the location of return address of the current function. */
40106 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
40108 /* Load pointer value from a memory if we don't have it. */
40109 if (!ptr)
40111 gcc_assert (MEM_P (slot));
40112 ptr = copy_addr_to_reg (slot);
40115 if (!register_operand (ptr, Pmode))
40116 ptr = ix86_zero_extend_to_Pmode (ptr);
40118 emit_insn (BNDmode == BND64mode
40119 ? gen_bnd64_ldx (reg, addr, ptr)
40120 : gen_bnd32_ldx (reg, addr, ptr));
40122 return reg;
40125 /* Expand pass uses this hook to store BOUNDS for call argument PTR
40126 passed in SLOT in case BOUNDS are not passed in a register.
40128 If SLOT is a memory, then BOUNDS are stored as for regular pointer
40129 stored in memory. PTR may be NULL in case SLOT is a memory.
40130 In such case value of PTR (if required) may be loaded from SLOT.
40132	   If SLOT is NULL or a register then SLOT_NO is an integer constant
40133	   holding the number of the target dependent special slot which should be
40134	   used to store BOUNDS.  */
40136 static void
40137 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
40139 rtx addr;
40141 /* Get address to be used to access Bounds Table. Special slots start
40142 at the location of return address of a called function. */
40143 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
40145 /* Load pointer value from a memory if we don't have it. */
40146 if (!ptr)
40148 gcc_assert (MEM_P (slot));
40149 ptr = copy_addr_to_reg (slot);
40152 if (!register_operand (ptr, Pmode))
40153 ptr = ix86_zero_extend_to_Pmode (ptr);
40155 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
40156 if (!register_operand (bounds, BNDmode))
40157 bounds = copy_to_mode_reg (BNDmode, bounds);
40159 emit_insn (BNDmode == BND64mode
40160 ? gen_bnd64_stx (addr, ptr, bounds)
40161 : gen_bnd32_stx (addr, ptr, bounds));
40164 /* Load and return bounds returned by function in SLOT. */
40166 static rtx
40167 ix86_load_returned_bounds (rtx slot)
40169 rtx res;
40171 gcc_assert (REG_P (slot));
40172 res = gen_reg_rtx (BNDmode);
40173 emit_move_insn (res, slot);
40175 return res;
40178 /* Store BOUNDS returned by function into SLOT. */
40180 static void
40181 ix86_store_returned_bounds (rtx slot, rtx bounds)
40183 gcc_assert (REG_P (slot));
40184 emit_move_insn (slot, bounds);
40187	/* Returns a function decl for a vectorized version of the combined function
40188	   with combined_fn code FN, result vector type TYPE_OUT and input vector
40189	   type TYPE_IN, or NULL_TREE if it is not available.  */
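/* For example, with SSE4.1 enabled and -fno-trapping-math, a V4DF floor
   maps to IX86_BUILTIN_FLOORPD256 and a V16SF floor to
   IX86_BUILTIN_FLOORPS512 (see CASE_CFN_FLOOR below).  */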
40191 static tree
40192 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
40193 tree type_in)
40195 machine_mode in_mode, out_mode;
40196 int in_n, out_n;
40198 if (TREE_CODE (type_out) != VECTOR_TYPE
40199 || TREE_CODE (type_in) != VECTOR_TYPE)
40200 return NULL_TREE;
40202 out_mode = TYPE_MODE (TREE_TYPE (type_out));
40203 out_n = TYPE_VECTOR_SUBPARTS (type_out);
40204 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40205 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40207 switch (fn)
40209 CASE_CFN_EXP2:
40210 if (out_mode == SFmode && in_mode == SFmode)
40212 if (out_n == 16 && in_n == 16)
40213 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
40215 break;
40217 CASE_CFN_IFLOOR:
40218 CASE_CFN_LFLOOR:
40219 CASE_CFN_LLFLOOR:
40220 /* The round insn does not trap on denormals. */
40221 if (flag_trapping_math || !TARGET_SSE4_1)
40222 break;
40224 if (out_mode == SImode && in_mode == DFmode)
40226 if (out_n == 4 && in_n == 2)
40227 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
40228 else if (out_n == 8 && in_n == 4)
40229 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
40230 else if (out_n == 16 && in_n == 8)
40231 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
40233 if (out_mode == SImode && in_mode == SFmode)
40235 if (out_n == 4 && in_n == 4)
40236 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
40237 else if (out_n == 8 && in_n == 8)
40238 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
40239 else if (out_n == 16 && in_n == 16)
40240 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
40242 break;
40244 CASE_CFN_ICEIL:
40245 CASE_CFN_LCEIL:
40246 CASE_CFN_LLCEIL:
40247 /* The round insn does not trap on denormals. */
40248 if (flag_trapping_math || !TARGET_SSE4_1)
40249 break;
40251 if (out_mode == SImode && in_mode == DFmode)
40253 if (out_n == 4 && in_n == 2)
40254 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
40255 else if (out_n == 8 && in_n == 4)
40256 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
40257 else if (out_n == 16 && in_n == 8)
40258 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
40260 if (out_mode == SImode && in_mode == SFmode)
40262 if (out_n == 4 && in_n == 4)
40263 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
40264 else if (out_n == 8 && in_n == 8)
40265 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
40266 else if (out_n == 16 && in_n == 16)
40267 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
40269 break;
40271 CASE_CFN_IRINT:
40272 CASE_CFN_LRINT:
40273 CASE_CFN_LLRINT:
40274 if (out_mode == SImode && in_mode == DFmode)
40276 if (out_n == 4 && in_n == 2)
40277 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
40278 else if (out_n == 8 && in_n == 4)
40279 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
40280 else if (out_n == 16 && in_n == 8)
40281 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
40283 if (out_mode == SImode && in_mode == SFmode)
40285 if (out_n == 4 && in_n == 4)
40286 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
40287 else if (out_n == 8 && in_n == 8)
40288 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
40289 else if (out_n == 16 && in_n == 16)
40290 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
40292 break;
40294 CASE_CFN_IROUND:
40295 CASE_CFN_LROUND:
40296 CASE_CFN_LLROUND:
40297 /* The round insn does not trap on denormals. */
40298 if (flag_trapping_math || !TARGET_SSE4_1)
40299 break;
40301 if (out_mode == SImode && in_mode == DFmode)
40303 if (out_n == 4 && in_n == 2)
40304 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
40305 else if (out_n == 8 && in_n == 4)
40306 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
40307 else if (out_n == 16 && in_n == 8)
40308 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
40310 if (out_mode == SImode && in_mode == SFmode)
40312 if (out_n == 4 && in_n == 4)
40313 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
40314 else if (out_n == 8 && in_n == 8)
40315 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
40316 else if (out_n == 16 && in_n == 16)
40317 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
40319 break;
40321 CASE_CFN_FLOOR:
40322 /* The round insn does not trap on denormals. */
40323 if (flag_trapping_math || !TARGET_SSE4_1)
40324 break;
40326 if (out_mode == DFmode && in_mode == DFmode)
40328 if (out_n == 2 && in_n == 2)
40329 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
40330 else if (out_n == 4 && in_n == 4)
40331 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
40332 else if (out_n == 8 && in_n == 8)
40333 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
40335 if (out_mode == SFmode && in_mode == SFmode)
40337 if (out_n == 4 && in_n == 4)
40338 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
40339 else if (out_n == 8 && in_n == 8)
40340 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
40341 else if (out_n == 16 && in_n == 16)
40342 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
40344 break;
40346 CASE_CFN_CEIL:
40347 /* The round insn does not trap on denormals. */
40348 if (flag_trapping_math || !TARGET_SSE4_1)
40349 break;
40351 if (out_mode == DFmode && in_mode == DFmode)
40353 if (out_n == 2 && in_n == 2)
40354 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
40355 else if (out_n == 4 && in_n == 4)
40356 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
40357 else if (out_n == 8 && in_n == 8)
40358 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
40360 if (out_mode == SFmode && in_mode == SFmode)
40362 if (out_n == 4 && in_n == 4)
40363 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
40364 else if (out_n == 8 && in_n == 8)
40365 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
40366 else if (out_n == 16 && in_n == 16)
40367 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
40369 break;
40371 CASE_CFN_TRUNC:
40372 /* The round insn does not trap on denormals. */
40373 if (flag_trapping_math || !TARGET_SSE4_1)
40374 break;
40376 if (out_mode == DFmode && in_mode == DFmode)
40378 if (out_n == 2 && in_n == 2)
40379 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
40380 else if (out_n == 4 && in_n == 4)
40381 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
40382 else if (out_n == 8 && in_n == 8)
40383 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
40385 if (out_mode == SFmode && in_mode == SFmode)
40387 if (out_n == 4 && in_n == 4)
40388 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
40389 else if (out_n == 8 && in_n == 8)
40390 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
40391 else if (out_n == 16 && in_n == 16)
40392 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
40394 break;
40396 CASE_CFN_RINT:
40397 /* The round insn does not trap on denormals. */
40398 if (flag_trapping_math || !TARGET_SSE4_1)
40399 break;
40401 if (out_mode == DFmode && in_mode == DFmode)
40403 if (out_n == 2 && in_n == 2)
40404 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
40405 else if (out_n == 4 && in_n == 4)
40406 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
40408 if (out_mode == SFmode && in_mode == SFmode)
40410 if (out_n == 4 && in_n == 4)
40411 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
40412 else if (out_n == 8 && in_n == 8)
40413 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
40415 break;
40417 CASE_CFN_FMA:
40418 if (out_mode == DFmode && in_mode == DFmode)
40420 if (out_n == 2 && in_n == 2)
40421 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
40422 if (out_n == 4 && in_n == 4)
40423 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
40425 if (out_mode == SFmode && in_mode == SFmode)
40427 if (out_n == 4 && in_n == 4)
40428 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
40429 if (out_n == 8 && in_n == 8)
40430 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
40432 break;
40434 default:
40435 break;
40438 /* Dispatch to a handler for a vectorization library. */
40439 if (ix86_veclib_handler)
40440 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
40442 return NULL_TREE;
40445 /* Handler for an SVML-style interface to
40446 a library with vectorized intrinsics. */
40448 static tree
40449 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40451 char name[20];
40452 tree fntype, new_fndecl, args;
40453 unsigned arity;
40454 const char *bname;
40455 machine_mode el_mode, in_mode;
40456 int n, in_n;
40458 /* The SVML is suitable for unsafe math only. */
40459 if (!flag_unsafe_math_optimizations)
40460 return NULL_TREE;
40462 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40463 n = TYPE_VECTOR_SUBPARTS (type_out);
40464 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40465 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40466 if (el_mode != in_mode
40467 || n != in_n)
40468 return NULL_TREE;
40470 switch (fn)
40472 CASE_CFN_EXP:
40473 CASE_CFN_LOG:
40474 CASE_CFN_LOG10:
40475 CASE_CFN_POW:
40476 CASE_CFN_TANH:
40477 CASE_CFN_TAN:
40478 CASE_CFN_ATAN:
40479 CASE_CFN_ATAN2:
40480 CASE_CFN_ATANH:
40481 CASE_CFN_CBRT:
40482 CASE_CFN_SINH:
40483 CASE_CFN_SIN:
40484 CASE_CFN_ASINH:
40485 CASE_CFN_ASIN:
40486 CASE_CFN_COSH:
40487 CASE_CFN_COS:
40488 CASE_CFN_ACOSH:
40489 CASE_CFN_ACOS:
40490 if ((el_mode != DFmode || n != 2)
40491 && (el_mode != SFmode || n != 4))
40492 return NULL_TREE;
40493 break;
40495 default:
40496 return NULL_TREE;
40499 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40500 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40502 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40503 strcpy (name, "vmlsLn4");
40504 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40505 strcpy (name, "vmldLn2");
40506 else if (n == 4)
40508 sprintf (name, "vmls%s", bname+10);
40509 name[strlen (name)-1] = '4';
40511 else
40512 sprintf (name, "vmld%s2", bname+10);
40514 /* Convert to uppercase. */
40515 name[4] &= ~0x20;
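  /* For example, the float variant of sin on 4-element vectors becomes
     "vmlsSin4" and the double variant on 2-element vectors becomes
     "vmldSin2"; logf and log are special-cased above as "vmlsLn4" and
     "vmldLn2".  */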
40517 arity = 0;
40518 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40519 arity++;
40521 if (arity == 1)
40522 fntype = build_function_type_list (type_out, type_in, NULL);
40523 else
40524 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40526 /* Build a function declaration for the vectorized function. */
40527 new_fndecl = build_decl (BUILTINS_LOCATION,
40528 FUNCTION_DECL, get_identifier (name), fntype);
40529 TREE_PUBLIC (new_fndecl) = 1;
40530 DECL_EXTERNAL (new_fndecl) = 1;
40531 DECL_IS_NOVOPS (new_fndecl) = 1;
40532 TREE_READONLY (new_fndecl) = 1;
40534 return new_fndecl;
40537 /* Handler for an ACML-style interface to
40538 a library with vectorized intrinsics. */
40540 static tree
40541 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40543 char name[20] = "__vr.._";
40544 tree fntype, new_fndecl, args;
40545 unsigned arity;
40546 const char *bname;
40547 machine_mode el_mode, in_mode;
40548 int n, in_n;
40550	  /* The ACML is 64-bit only and suitable for unsafe math only, as
40551	     it does not correctly support parts of IEEE with the required
40552	     precision such as denormals.  */
40553 if (!TARGET_64BIT
40554 || !flag_unsafe_math_optimizations)
40555 return NULL_TREE;
40557 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40558 n = TYPE_VECTOR_SUBPARTS (type_out);
40559 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40560 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40561 if (el_mode != in_mode
40562 || n != in_n)
40563 return NULL_TREE;
40565 switch (fn)
40567 CASE_CFN_SIN:
40568 CASE_CFN_COS:
40569 CASE_CFN_EXP:
40570 CASE_CFN_LOG:
40571 CASE_CFN_LOG2:
40572 CASE_CFN_LOG10:
40573 if (el_mode == DFmode && n == 2)
40575 name[4] = 'd';
40576 name[5] = '2';
40578 else if (el_mode == SFmode && n == 4)
40580 name[4] = 's';
40581 name[5] = '4';
40583 else
40584 return NULL_TREE;
40585 break;
40587 default:
40588 return NULL_TREE;
40591 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40592 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40593 sprintf (name + 7, "%s", bname+10);
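  /* For example, sin on V2DF yields "__vrd2_sin" while sinf on V4SF
     yields "__vrs4_sinf".  */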
40595 arity = 0;
40596 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40597 arity++;
40599 if (arity == 1)
40600 fntype = build_function_type_list (type_out, type_in, NULL);
40601 else
40602 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40604 /* Build a function declaration for the vectorized function. */
40605 new_fndecl = build_decl (BUILTINS_LOCATION,
40606 FUNCTION_DECL, get_identifier (name), fntype);
40607 TREE_PUBLIC (new_fndecl) = 1;
40608 DECL_EXTERNAL (new_fndecl) = 1;
40609 DECL_IS_NOVOPS (new_fndecl) = 1;
40610 TREE_READONLY (new_fndecl) = 1;
40612 return new_fndecl;
40615 /* Returns a decl of a function that implements gather load with
40616	   memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
40617 Return NULL_TREE if it is not available. */
40619 static tree
40620 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40621 const_tree index_type, int scale)
40623 bool si;
40624 enum ix86_builtins code;
40626 if (! TARGET_AVX2)
40627 return NULL_TREE;
40629 if ((TREE_CODE (index_type) != INTEGER_TYPE
40630 && !POINTER_TYPE_P (index_type))
40631 || (TYPE_MODE (index_type) != SImode
40632 && TYPE_MODE (index_type) != DImode))
40633 return NULL_TREE;
40635 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40636 return NULL_TREE;
40638 /* v*gather* insn sign extends index to pointer mode. */
40639 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40640 && TYPE_UNSIGNED (index_type))
40641 return NULL_TREE;
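  /* Scale must be 1, 2, 4 or 8, i.e. a power of two not larger than 8.  */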
40643 if (scale <= 0
40644 || scale > 8
40645 || (scale & (scale - 1)) != 0)
40646 return NULL_TREE;
40648 si = TYPE_MODE (index_type) == SImode;
40649 switch (TYPE_MODE (mem_vectype))
40651 case E_V2DFmode:
40652 if (TARGET_AVX512VL)
40653 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40654 else
40655 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40656 break;
40657 case E_V4DFmode:
40658 if (TARGET_AVX512VL)
40659 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40660 else
40661 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40662 break;
40663 case E_V2DImode:
40664 if (TARGET_AVX512VL)
40665 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40666 else
40667 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40668 break;
40669 case E_V4DImode:
40670 if (TARGET_AVX512VL)
40671 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40672 else
40673 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40674 break;
40675 case E_V4SFmode:
40676 if (TARGET_AVX512VL)
40677 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40678 else
40679 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40680 break;
40681 case E_V8SFmode:
40682 if (TARGET_AVX512VL)
40683 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40684 else
40685 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40686 break;
40687 case E_V4SImode:
40688 if (TARGET_AVX512VL)
40689 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40690 else
40691 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40692 break;
40693 case E_V8SImode:
40694 if (TARGET_AVX512VL)
40695 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40696 else
40697 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40698 break;
40699 case E_V8DFmode:
40700 if (TARGET_AVX512F)
40701 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40702 else
40703 return NULL_TREE;
40704 break;
40705 case E_V8DImode:
40706 if (TARGET_AVX512F)
40707 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40708 else
40709 return NULL_TREE;
40710 break;
40711 case E_V16SFmode:
40712 if (TARGET_AVX512F)
40713 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40714 else
40715 return NULL_TREE;
40716 break;
40717 case E_V16SImode:
40718 if (TARGET_AVX512F)
40719 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40720 else
40721 return NULL_TREE;
40722 break;
40723 default:
40724 return NULL_TREE;
40727 return ix86_get_builtin (code);
40730 /* Returns a decl of a function that implements scatter store with
40731 register type VECTYPE and index type INDEX_TYPE and SCALE.
40732 Return NULL_TREE if it is not available. */
40734 static tree
40735 ix86_vectorize_builtin_scatter (const_tree vectype,
40736 const_tree index_type, int scale)
40738 bool si;
40739 enum ix86_builtins code;
40741 if (!TARGET_AVX512F)
40742 return NULL_TREE;
40744 if ((TREE_CODE (index_type) != INTEGER_TYPE
40745 && !POINTER_TYPE_P (index_type))
40746 || (TYPE_MODE (index_type) != SImode
40747 && TYPE_MODE (index_type) != DImode))
40748 return NULL_TREE;
40750 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40751 return NULL_TREE;
40753 /* v*scatter* insn sign extends index to pointer mode. */
40754 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40755 && TYPE_UNSIGNED (index_type))
40756 return NULL_TREE;
40758 /* Scale can be 1, 2, 4 or 8. */
40759 if (scale <= 0
40760 || scale > 8
40761 || (scale & (scale - 1)) != 0)
40762 return NULL_TREE;
40764 si = TYPE_MODE (index_type) == SImode;
40765 switch (TYPE_MODE (vectype))
40767 case E_V8DFmode:
40768 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40769 break;
40770 case E_V8DImode:
40771 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40772 break;
40773 case E_V16SFmode:
40774 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40775 break;
40776 case E_V16SImode:
40777 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40778 break;
40779 default:
40780 return NULL_TREE;
40783 return ix86_builtins[code];
40786 /* Return true if it is safe to use the rsqrt optabs to optimize
40787 1.0/sqrt. */
40789 static bool
40790 use_rsqrt_p ()
40792 return (TARGET_SSE_MATH
40793 && flag_finite_math_only
40794 && !flag_trapping_math
40795 && flag_unsafe_math_optimizations);
40798 /* Returns a code for a target-specific builtin that implements
40799 reciprocal of the function, or NULL_TREE if not available. */
40801 static tree
40802 ix86_builtin_reciprocal (tree fndecl)
40804 switch (DECL_FUNCTION_CODE (fndecl))
40806 /* Vectorized version of sqrt to rsqrt conversion. */
40807 case IX86_BUILTIN_SQRTPS_NR:
40808 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40810 case IX86_BUILTIN_SQRTPS_NR256:
40811 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40813 default:
40814 return NULL_TREE;
40818 /* Helper for avx_vpermilps256_operand et al. This is also used by
40819 the expansion functions to turn the parallel back into a mask.
40820 The return value is 0 for no match and the imm8+1 for a match. */
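/* For instance, in the V4SF case a parallel selecting elements (2, 3, 0, 1)
   corresponds to imm8 0x4e, so the function returns 0x4f.  */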
40823 avx_vpermilp_parallel (rtx par, machine_mode mode)
40825 unsigned i, nelt = GET_MODE_NUNITS (mode);
40826 unsigned mask = 0;
40827 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40829 if (XVECLEN (par, 0) != (int) nelt)
40830 return 0;
40832 /* Validate that all of the elements are constants, and not totally
40833 out of range. Copy the data into an integral array to make the
40834 subsequent checks easier. */
40835 for (i = 0; i < nelt; ++i)
40837 rtx er = XVECEXP (par, 0, i);
40838 unsigned HOST_WIDE_INT ei;
40840 if (!CONST_INT_P (er))
40841 return 0;
40842 ei = INTVAL (er);
40843 if (ei >= nelt)
40844 return 0;
40845 ipar[i] = ei;
40848 switch (mode)
40850 case E_V8DFmode:
40851 /* In the 512-bit DFmode case, we can only move elements within
40852 a 128-bit lane. First fill the second part of the mask,
40853 then fallthru. */
40854 for (i = 4; i < 6; ++i)
40856 if (ipar[i] < 4 || ipar[i] >= 6)
40857 return 0;
40858 mask |= (ipar[i] - 4) << i;
40860 for (i = 6; i < 8; ++i)
40862 if (ipar[i] < 6)
40863 return 0;
40864 mask |= (ipar[i] - 6) << i;
40866 /* FALLTHRU */
40868 case E_V4DFmode:
40869 /* In the 256-bit DFmode case, we can only move elements within
40870 a 128-bit lane. */
40871 for (i = 0; i < 2; ++i)
40873 if (ipar[i] >= 2)
40874 return 0;
40875 mask |= ipar[i] << i;
40877 for (i = 2; i < 4; ++i)
40879 if (ipar[i] < 2)
40880 return 0;
40881 mask |= (ipar[i] - 2) << i;
40883 break;
40885 case E_V16SFmode:
40886	      /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40887		 must mirror the permutation in the lower 256 bits.  */
40888 for (i = 0; i < 8; ++i)
40889 if (ipar[i] + 8 != ipar[i + 8])
40890 return 0;
40891 /* FALLTHRU */
40893 case E_V8SFmode:
40894	      /* In the 256-bit SFmode case, we have full freedom of
40895		 movement within the low 128-bit lane, but the high 128-bit
40896		 lane must mirror the exact same pattern.  */
40897 for (i = 0; i < 4; ++i)
40898 if (ipar[i] + 4 != ipar[i + 4])
40899 return 0;
40900 nelt = 4;
40901 /* FALLTHRU */
40903 case E_V2DFmode:
40904 case E_V4SFmode:
40905 /* In the 128-bit case, we've full freedom in the placement of
40906 the elements from the source operand. */
40907 for (i = 0; i < nelt; ++i)
40908 mask |= ipar[i] << (i * (nelt / 2));
40909 break;
40911 default:
40912 gcc_unreachable ();
40915 /* Make sure success has a non-zero value by adding one. */
40916 return mask + 1;
40919 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40920 the expansion functions to turn the parallel back into a mask.
40921 The return value is 0 for no match and the imm8+1 for a match. */
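/* For instance, in the V4DF case a parallel selecting elements (4, 5, 2, 3),
   i.e. the low lane of the second operand followed by the high lane of the
   first, corresponds to imm8 0x12, so the function returns 0x13.  */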
40924 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40926 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40927 unsigned mask = 0;
40928 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40930 if (XVECLEN (par, 0) != (int) nelt)
40931 return 0;
40933 /* Validate that all of the elements are constants, and not totally
40934 out of range. Copy the data into an integral array to make the
40935 subsequent checks easier. */
40936 for (i = 0; i < nelt; ++i)
40938 rtx er = XVECEXP (par, 0, i);
40939 unsigned HOST_WIDE_INT ei;
40941 if (!CONST_INT_P (er))
40942 return 0;
40943 ei = INTVAL (er);
40944 if (ei >= 2 * nelt)
40945 return 0;
40946 ipar[i] = ei;
40949	  /* Validate that each half of the permute selects consecutive
	     elements, i.e. a whole lane.  */
40950 for (i = 0; i < nelt2 - 1; ++i)
40951 if (ipar[i] + 1 != ipar[i + 1])
40952 return 0;
40953 for (i = nelt2; i < nelt - 1; ++i)
40954 if (ipar[i] + 1 != ipar[i + 1])
40955 return 0;
40957 /* Reconstruct the mask. */
40958 for (i = 0; i < 2; ++i)
40960 unsigned e = ipar[i * nelt2];
40961 if (e % nelt2)
40962 return 0;
40963 e /= nelt2;
40964 mask |= e << (i * 4);
40967 /* Make sure success has a non-zero value by adding one. */
40968 return mask + 1;
40971	/* Return a register priority for hard reg HARD_REGNO.  */
40972 static int
40973 ix86_register_priority (int hard_regno)
40975	  /* ebp and r13 as the base always want a displacement, and r12 as the
40976	     base always wants an index.  So discourage their usage in an
40977	     address.  */
40978 if (hard_regno == R12_REG || hard_regno == R13_REG)
40979 return 0;
40980 if (hard_regno == BP_REG)
40981 return 1;
40982 /* New x86-64 int registers result in bigger code size. Discourage
40983 them. */
40984 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40985 return 2;
40986 /* New x86-64 SSE registers result in bigger code size. Discourage
40987 them. */
40988 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40989 return 2;
40990 /* Usage of AX register results in smaller code. Prefer it. */
40991 if (hard_regno == AX_REG)
40992 return 4;
40993 return 3;
40996 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40998 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40999 QImode must go into class Q_REGS.
41000 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
41001 movdf to do mem-to-mem moves through integer regs. */
41003 static reg_class_t
41004 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
41006 machine_mode mode = GET_MODE (x);
41008 /* We're only allowed to return a subclass of CLASS. Many of the
41009 following checks fail for NO_REGS, so eliminate that early. */
41010 if (regclass == NO_REGS)
41011 return NO_REGS;
41013 /* All classes can load zeros. */
41014 if (x == CONST0_RTX (mode))
41015 return regclass;
41017 /* Force constants into memory if we are loading a (nonzero) constant into
41018 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
41019 instructions to load from a constant. */
41020 if (CONSTANT_P (x)
41021 && (MAYBE_MMX_CLASS_P (regclass)
41022 || MAYBE_SSE_CLASS_P (regclass)
41023 || MAYBE_MASK_CLASS_P (regclass)))
41024 return NO_REGS;
41026 /* Floating-point constants need more complex checks. */
41027 if (CONST_DOUBLE_P (x))
41029 /* General regs can load everything. */
41030 if (INTEGER_CLASS_P (regclass))
41031 return regclass;
41033 /* Floats can load 0 and 1 plus some others. Note that we eliminated
41034 zero above. We only want to wind up preferring 80387 registers if
41035 we plan on doing computation with them. */
41036 if (IS_STACK_MODE (mode)
41037 && standard_80387_constant_p (x) > 0)
41039 /* Limit class to FP regs. */
41040 if (FLOAT_CLASS_P (regclass))
41041 return FLOAT_REGS;
41042 else if (regclass == FP_TOP_SSE_REGS)
41043 return FP_TOP_REG;
41044 else if (regclass == FP_SECOND_SSE_REGS)
41045 return FP_SECOND_REG;
41048 return NO_REGS;
41051 /* Prefer SSE regs only, if we can use them for math. */
41052 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41053 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
41055	  /* Generally when we see PLUS here, it's the function invariant
41056	     (plus soft-fp const_int), which can only be computed into general
41057	     regs.  */
41058 if (GET_CODE (x) == PLUS)
41059 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
41061 /* QImode constants are easy to load, but non-constant QImode data
41062 must go into Q_REGS. */
41063 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
41065 if (Q_CLASS_P (regclass))
41066 return regclass;
41067 else if (reg_class_subset_p (Q_REGS, regclass))
41068 return Q_REGS;
41069 else
41070 return NO_REGS;
41073 return regclass;
41076 /* Discourage putting floating-point values in SSE registers unless
41077 SSE math is being used, and likewise for the 387 registers. */
41078 static reg_class_t
41079 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
41081 machine_mode mode = GET_MODE (x);
41083 /* Restrict the output reload class to the register bank that we are doing
41084 math on. If we would like not to return a subset of CLASS, reject this
41085 alternative: if reload cannot do this, it will still use its choice. */
41086 mode = GET_MODE (x);
41087 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41088 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
41090 if (IS_STACK_MODE (mode))
41092 if (regclass == FP_TOP_SSE_REGS)
41093 return FP_TOP_REG;
41094 else if (regclass == FP_SECOND_SSE_REGS)
41095 return FP_SECOND_REG;
41096 else
41097 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
41100 return regclass;
41103 static reg_class_t
41104 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
41105 machine_mode mode, secondary_reload_info *sri)
41107 /* Double-word spills from general registers to non-offsettable memory
41108 references (zero-extended addresses) require special handling. */
41109 if (TARGET_64BIT
41110 && MEM_P (x)
41111 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
41112 && INTEGER_CLASS_P (rclass)
41113 && !offsettable_memref_p (x))
41115 sri->icode = (in_p
41116 ? CODE_FOR_reload_noff_load
41117 : CODE_FOR_reload_noff_store);
41118 /* Add the cost of moving address to a temporary. */
41119 sri->extra_cost = 1;
41121 return NO_REGS;
41124	  /* QImode spills from non-QI registers require an
41125	     intermediate register on 32-bit targets.  */
41126 if (mode == QImode
41127 && ((!TARGET_64BIT && !in_p
41128 && INTEGER_CLASS_P (rclass)
41129 && MAYBE_NON_Q_CLASS_P (rclass))
41130 || (!TARGET_AVX512DQ
41131 && MAYBE_MASK_CLASS_P (rclass))))
41133 int regno = true_regnum (x);
41135 /* Return Q_REGS if the operand is in memory. */
41136 if (regno == -1)
41137 return Q_REGS;
41139 return NO_REGS;
41142 /* This condition handles corner case where an expression involving
41143 pointers gets vectorized. We're trying to use the address of a
41144 stack slot as a vector initializer.
41146 (set (reg:V2DI 74 [ vect_cst_.2 ])
41147 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
41149 Eventually frame gets turned into sp+offset like this:
41151 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41152 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
41153 (const_int 392 [0x188]))))
41155 That later gets turned into:
41157 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41158 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
41159 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
41161 We'll have the following reload recorded:
41163 Reload 0: reload_in (DI) =
41164 (plus:DI (reg/f:DI 7 sp)
41165 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
41166 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41167 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
41168 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
41169 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
41170 reload_reg_rtx: (reg:V2DI 22 xmm1)
41172 Which isn't going to work since SSE instructions can't handle scalar
41173 additions. Returning GENERAL_REGS forces the addition into integer
41174 register and reload can handle subsequent reloads without problems. */
41176 if (in_p && GET_CODE (x) == PLUS
41177 && SSE_CLASS_P (rclass)
41178 && SCALAR_INT_MODE_P (mode))
41179 return GENERAL_REGS;
41181 return NO_REGS;
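/* Editorial summary, not part of the upstream source: the cases handled
   above are (1) 64-bit double-word spills to non-offsettable
   (zero-extended) addresses, which get a reload_noff pattern plus one
   extra address move, (2) QImode stores on 32-bit targets from classes
   that may contain non-Q registers, which get a Q_REGS intermediate,
   and (3) a scalar PLUS feeding an SSE class, which is pushed to
   GENERAL_REGS. */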
41184 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
41186 static bool
41187 ix86_class_likely_spilled_p (reg_class_t rclass)
41189 switch (rclass)
41191 case AREG:
41192 case DREG:
41193 case CREG:
41194 case BREG:
41195 case AD_REGS:
41196 case SIREG:
41197 case DIREG:
41198 case SSE_FIRST_REG:
41199 case FP_TOP_REG:
41200 case FP_SECOND_REG:
41201 case BND_REGS:
41202 return true;
41204 default:
41205 break;
41208 return false;
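/* Editorial note, not part of the upstream source: the classes listed
   above are very small (AREG is %eax only, SSE_FIRST_REG is %xmm0 only,
   and so on), so pseudos confined to them are very likely to be spilled
   and the register allocator is told as much. */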
41211 /* If we are copying between registers from different register sets
41212 (e.g. FP and integer), we may need a memory location.
41214 The function can't work reliably when one of the CLASSES is a class
41215 containing registers from multiple sets. We avoid this by never combining
41216 different sets in a single alternative in the machine description.
41217 Ensure that this constraint holds to avoid surprises.
41219 When STRICT is false, we are being called from REGISTER_MOVE_COST,
41220 so do not enforce these sanity checks.
41222 To optimize register_move_cost performance, define inline variant. */
41224 static inline bool
41225 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
41226 reg_class_t class2, int strict)
41228 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
41229 return false;
41231 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
41232 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
41233 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
41234 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
41235 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
41236 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
41237 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
41238 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
41240 gcc_assert (!strict || lra_in_progress);
41241 return true;
41244 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
41245 return true;
41247 /* Between mask and general, we have moves no larger than word size. */
41248 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
41249 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
41250 return true;
41252 /* ??? This is a lie. We do have moves between mmx/general, and between
41253 mmx/sse2. But by saying we need secondary memory we discourage the
41254 register allocator from using the mmx registers unless needed. */
41255 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
41256 return true;
41258 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41260 /* SSE1 doesn't have any direct moves from other classes. */
41261 if (!TARGET_SSE2)
41262 return true;
41264 /* If the target says that inter-unit moves are more expensive
41265 than moving through memory, then don't generate them. */
41266 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
41267 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
41268 return true;
41270 /* Between SSE and general, we have moves no larger than word size. */
41271 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41272 return true;
41275 return false;
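/* Editorial example, not part of the upstream source: by the tests above
   every x87 <-> non-x87 copy goes through memory, GPR <-> SSE copies go
   through memory when SSE2 is unavailable, when inter-unit moves are
   discouraged for the target, or when the mode is wider than a word,
   and GPR <-> mask copies wider than a word need a memory slot too. */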
41278 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
41280 static bool
41281 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
41282 reg_class_t class2)
41284 return inline_secondary_memory_needed (mode, class1, class2, true);
41287 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
41289 get_secondary_mem widens integral modes to BITS_PER_WORD.
41290 There is no need to emit a full 64 bit move on 64 bit targets
41291 for integral modes that can be moved using 32 bit move. */
41293 static machine_mode
41294 ix86_secondary_memory_needed_mode (machine_mode mode)
41296 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
41297 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
41298 return mode;
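/* Editorial example, not part of the upstream source: a QImode or HImode
   secondary-memory copy is widened to SImode here, so the stack slot is
   accessed with one 32-bit load/store rather than a partial-word one. */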
41301 /* Implement the TARGET_CLASS_MAX_NREGS hook.
41303 On the 80386, this is the size of MODE in words,
41304 except in the FP regs, where a single reg is always enough. */
41306 static unsigned char
41307 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
41309 if (MAYBE_INTEGER_CLASS_P (rclass))
41311 if (mode == XFmode)
41312 return (TARGET_64BIT ? 2 : 3);
41313 else if (mode == XCmode)
41314 return (TARGET_64BIT ? 4 : 6);
41315 else
41316 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
41318 else
41320 if (COMPLEX_MODE_P (mode))
41321 return 2;
41322 else
41323 return 1;
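/* Editorial example, not part of the upstream source: under the rules
   above an XFmode value takes 3 general registers on ia32 (2 on x86-64)
   but only one x87 stack register, and any complex mode takes 2
   non-integer registers. */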
41327 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
41329 static bool
41330 ix86_can_change_mode_class (machine_mode from, machine_mode to,
41331 reg_class_t regclass)
41333 if (from == to)
41334 return true;
41336 /* x87 registers can't do subreg at all, as all values are reformatted
41337 to extended precision. */
41338 if (MAYBE_FLOAT_CLASS_P (regclass))
41339 return false;
41341 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
41343 /* Vector registers do not support QI or HImode loads. If we don't
41344 disallow a change to these modes, reload will assume it's ok to
41345 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
41346 the vec_dupv4hi pattern. */
41347 if (GET_MODE_SIZE (from) < 4)
41348 return false;
41351 return true;
41354 /* Return the cost of moving data of mode M between a
41355 register and memory. A value of 2 is the default; this cost is
41356 relative to those in `REGISTER_MOVE_COST'.
41358 This function is used extensively by register_move_cost, which is used to
41359 build tables at startup. Make it inline in this case.
41360 When IN is 2, return the maximum of the in and out move costs.
41362 If moving between registers and memory is more expensive than
41363 between two registers, you should define this macro to express the
41364 relative cost.
41366 Also model the increased cost of moving QImode registers in
41367 non-Q_REGS classes. */
41369 static inline int
41370 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
41371 int in)
41373 int cost;
41374 if (FLOAT_CLASS_P (regclass))
41376 int index;
41377 switch (mode)
41379 case E_SFmode:
41380 index = 0;
41381 break;
41382 case E_DFmode:
41383 index = 1;
41384 break;
41385 case E_XFmode:
41386 index = 2;
41387 break;
41388 default:
41389 return 100;
41391 if (in == 2)
41392 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
41393 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
41395 if (SSE_CLASS_P (regclass))
41397 int index;
41398 switch (GET_MODE_SIZE (mode))
41400 case 4:
41401 index = 0;
41402 break;
41403 case 8:
41404 index = 1;
41405 break;
41406 case 16:
41407 index = 2;
41408 break;
41409 default:
41410 return 100;
41412 if (in == 2)
41413 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
41414 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
41416 if (MMX_CLASS_P (regclass))
41418 int index;
41419 switch (GET_MODE_SIZE (mode))
41421 case 4:
41422 index = 0;
41423 break;
41424 case 8:
41425 index = 1;
41426 break;
41427 default:
41428 return 100;
41430 if (in == 2)
41431 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
41432 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
41434 switch (GET_MODE_SIZE (mode))
41436 case 1:
41437 if (Q_CLASS_P (regclass) || TARGET_64BIT)
41439 if (!in)
41440 return ix86_cost->int_store[0];
41441 if (TARGET_PARTIAL_REG_DEPENDENCY
41442 && optimize_function_for_speed_p (cfun))
41443 cost = ix86_cost->movzbl_load;
41444 else
41445 cost = ix86_cost->int_load[0];
41446 if (in == 2)
41447 return MAX (cost, ix86_cost->int_store[0]);
41448 return cost;
41450 else
41452 if (in == 2)
41453 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
41454 if (in)
41455 return ix86_cost->movzbl_load;
41456 else
41457 return ix86_cost->int_store[0] + 4;
41459 break;
41460 case 2:
41461 if (in == 2)
41462 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
41463 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
41464 default:
41465 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
41466 if (mode == TFmode)
41467 mode = XFmode;
41468 if (in == 2)
41469 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41470 else if (in)
41471 cost = ix86_cost->int_load[2];
41472 else
41473 cost = ix86_cost->int_store[2];
41474 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
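/* Editorial worked example, not part of the upstream source: loading a
   DImode value into GENERAL_REGS on ia32 reaches the default case above
   and costs int_load[2] * CEIL (8, 4), i.e. two 32-bit loads; with
   IN == 2 the larger of the load and store costs is used instead. */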
41478 static int
41479 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41480 bool in)
41482 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
41486 /* Return the cost of moving data from a register in class CLASS1 to
41487 one in class CLASS2.
41489 It is not required that the cost always equal 2 when FROM is the same as TO;
41490 on some machines it is expensive to move between registers if they are not
41491 general registers. */
41493 static int
41494 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41495 reg_class_t class2_i)
41497 enum reg_class class1 = (enum reg_class) class1_i;
41498 enum reg_class class2 = (enum reg_class) class2_i;
41500 /* In case we require secondary memory, compute cost of the store followed
41501 by load. In order to avoid bad register allocation choices, we need
41502 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41504 if (inline_secondary_memory_needed (mode, class1, class2, false))
41506 int cost = 1;
41508 cost += inline_memory_move_cost (mode, class1, 2);
41509 cost += inline_memory_move_cost (mode, class2, 2);
41511 /* In case of copying from a general purpose register we may emit multiple
41512 stores followed by a single load, causing a memory size mismatch stall.
41513 Count this as an arbitrarily high cost of 20. */
41514 if (targetm.class_max_nregs (class1, mode)
41515 > targetm.class_max_nregs (class2, mode))
41516 cost += 20;
41518 /* In the case of FP/MMX moves, the registers actually overlap, and we
41519 have to switch modes in order to treat them differently. */
41520 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41521 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41522 cost += 20;
41524 return cost;
41527 /* Moves between SSE/MMX and integer unit are expensive. */
41528 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41529 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41531 /* ??? By keeping the returned value relatively high, we limit the number
41532 of moves between integer and MMX/SSE registers for all targets.
41533 Additionally, a high value prevents problems with x86_modes_tieable_p (),
41534 where integer modes in MMX/SSE registers are not tieable
41535 because of missing QImode and HImode moves to, from or between
41536 MMX/SSE registers. */
41537 return MAX (8, ix86_cost->mmxsse_to_integer);
41539 if (MAYBE_FLOAT_CLASS_P (class1))
41540 return ix86_cost->fp_move;
41541 if (MAYBE_SSE_CLASS_P (class1))
41542 return ix86_cost->sse_move;
41543 if (MAYBE_MMX_CLASS_P (class1))
41544 return ix86_cost->mmx_move;
41545 return 2;
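/* Editorial example, not part of the upstream source: an x87 <-> GPR
   copy needs secondary memory, so its cost is built from the two
   memory-move costs above plus the +20 penalties where they apply,
   while a direct GPR <-> SSE move is priced at
   MAX (8, mmxsse_to_integer) to keep such crossings rare. */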
41548 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
41549 words of a value of mode MODE but can be less for certain modes in
41550 special long registers.
41552 Actually there are no two word move instructions for consecutive
41553 registers. And only registers 0-3 may have mov byte instructions
41554 applied to them. */
41556 static unsigned int
41557 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
41559 if (GENERAL_REGNO_P (regno))
41561 if (mode == XFmode)
41562 return TARGET_64BIT ? 2 : 3;
41563 if (mode == XCmode)
41564 return TARGET_64BIT ? 4 : 6;
41565 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
41567 if (COMPLEX_MODE_P (mode))
41568 return 2;
41569 if (mode == V64SFmode || mode == V64SImode)
41570 return 4;
41571 return 1;
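/* Editorial example, not part of the upstream source: XFmode occupies 3
   general registers on ia32 and 2 on x86-64, while in x87 or SSE
   registers every mode counts as one register except complex modes (2)
   and the V64SFmode/V64SImode groups (4). */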
41574 /* Implement TARGET_HARD_REGNO_MODE_OK. */
41576 static bool
41577 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
41579 /* Only the flags registers can hold CCmode values, and they can hold nothing else. */
41580 if (CC_REGNO_P (regno))
41581 return GET_MODE_CLASS (mode) == MODE_CC;
41582 if (GET_MODE_CLASS (mode) == MODE_CC
41583 || GET_MODE_CLASS (mode) == MODE_RANDOM
41584 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41585 return false;
41586 if (STACK_REGNO_P (regno))
41587 return VALID_FP_MODE_P (mode);
41588 if (MASK_REGNO_P (regno))
41589 return (VALID_MASK_REG_MODE (mode)
41590 || (TARGET_AVX512BW
41591 && VALID_MASK_AVX512BW_MODE (mode)));
41592 if (BND_REGNO_P (regno))
41593 return VALID_BND_REG_MODE (mode);
41594 if (SSE_REGNO_P (regno))
41596 /* We implement the move patterns for all vector modes into and
41597 out of SSE registers, even when no operation instructions
41598 are available. */
41600 /* For AVX-512 we allow, regardless of regno:
41601 - XI mode
41602 - any of 512-bit wide vector mode
41603 - any scalar mode. */
41604 if (TARGET_AVX512F
41605 && (mode == XImode
41606 || VALID_AVX512F_REG_MODE (mode)
41607 || VALID_AVX512F_SCALAR_MODE (mode)))
41608 return true;
41610 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41611 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41612 && MOD4_SSE_REGNO_P (regno)
41613 && mode == V64SFmode)
41614 return true;
41616 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41617 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41618 && MOD4_SSE_REGNO_P (regno)
41619 && mode == V64SImode)
41620 return true;
41622 /* TODO check for QI/HI scalars. */
41623 /* AVX512VL allows SSE regs 16+ (xmm16-xmm31) for 128/256 bit modes. */
41624 if (TARGET_AVX512VL
41625 && (mode == OImode
41626 || mode == TImode
41627 || VALID_AVX256_REG_MODE (mode)
41628 || VALID_AVX512VL_128_REG_MODE (mode)))
41629 return true;
41631 /* xmm16-xmm31 are only available for AVX-512. */
41632 if (EXT_REX_SSE_REGNO_P (regno))
41633 return false;
41635 /* OImode and AVX modes are available only when AVX is enabled. */
41636 return ((TARGET_AVX
41637 && VALID_AVX256_REG_OR_OI_MODE (mode))
41638 || VALID_SSE_REG_MODE (mode)
41639 || VALID_SSE2_REG_MODE (mode)
41640 || VALID_MMX_REG_MODE (mode)
41641 || VALID_MMX_REG_MODE_3DNOW (mode));
41643 if (MMX_REGNO_P (regno))
41645 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41646 so if the register is available at all, then we can move data of
41647 the given mode into or out of it. */
41648 return (VALID_MMX_REG_MODE (mode)
41649 || VALID_MMX_REG_MODE_3DNOW (mode));
41652 if (mode == QImode)
41654 /* Take care for QImode values - they can be in non-QI regs,
41655 but then they do cause partial register stalls. */
41656 if (ANY_QI_REGNO_P (regno))
41657 return true;
41658 if (!TARGET_PARTIAL_REG_STALL)
41659 return true;
41660 /* LRA checks if the hard register is OK for the given mode.
41661 QImode values can live in non-QI regs, so we allow all
41662 registers here. */
41663 if (lra_in_progress)
41664 return true;
41665 return !can_create_pseudo_p ();
41667 /* We handle both integers and floats in the general purpose registers. */
41668 else if (VALID_INT_MODE_P (mode))
41669 return true;
41670 else if (VALID_FP_MODE_P (mode))
41671 return true;
41672 else if (VALID_DFP_MODE_P (mode))
41673 return true;
41674 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41675 on to use that value in smaller contexts, this can easily force a
41676 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41677 supporting DImode, allow it. */
41678 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41679 return true;
41681 return false;
41684 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
41685 saves SSE registers across calls is Win64 (thus no need to check the
41686 current ABI here), and with AVX enabled Win64 only guarantees that
41687 the low 16 bytes are saved. */
41689 static bool
41690 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
41692 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
41695 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41696 tieable integer mode. */
41698 static bool
41699 ix86_tieable_integer_mode_p (machine_mode mode)
41701 switch (mode)
41703 case E_HImode:
41704 case E_SImode:
41705 return true;
41707 case E_QImode:
41708 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41710 case E_DImode:
41711 return TARGET_64BIT;
41713 default:
41714 return false;
41718 /* Implement TARGET_MODES_TIEABLE_P.
41720 Return true if MODE1 is accessible in a register that can hold MODE2
41721 without copying. That is, all register classes that can hold MODE2
41722 can also hold MODE1. */
41724 static bool
41725 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41727 if (mode1 == mode2)
41728 return true;
41730 if (ix86_tieable_integer_mode_p (mode1)
41731 && ix86_tieable_integer_mode_p (mode2))
41732 return true;
41734 /* MODE2 being XFmode implies fp stack or general regs, which means we
41735 can tie any smaller floating point modes to it. Note that we do not
41736 tie this with TFmode. */
41737 if (mode2 == XFmode)
41738 return mode1 == SFmode || mode1 == DFmode;
41740 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41741 that we can tie it with SFmode. */
41742 if (mode2 == DFmode)
41743 return mode1 == SFmode;
41745 /* If MODE2 is only appropriate for an SSE register, then tie with
41746 any other mode acceptable to SSE registers. */
41747 if (GET_MODE_SIZE (mode2) == 32
41748 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41749 return (GET_MODE_SIZE (mode1) == 32
41750 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41751 if (GET_MODE_SIZE (mode2) == 16
41752 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41753 return (GET_MODE_SIZE (mode1) == 16
41754 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41756 /* If MODE2 is appropriate for an MMX register, then tie
41757 with any other mode acceptable to MMX registers. */
41758 if (GET_MODE_SIZE (mode2) == 8
41759 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41760 return (GET_MODE_SIZE (mode1) == 8
41761 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41763 return false;
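/* Editorial example, not part of the upstream source: SImode and HImode
   always tie, SFmode ties with DFmode and XFmode, and two 16-byte
   vector modes tie with each other whenever both are valid for %xmm0,
   so subregs between them need no copy. */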
41766 /* Return the cost of moving between two registers of mode MODE. */
41768 static int
41769 ix86_set_reg_reg_cost (machine_mode mode)
41771 unsigned int units = UNITS_PER_WORD;
41773 switch (GET_MODE_CLASS (mode))
41775 default:
41776 break;
41778 case MODE_CC:
41779 units = GET_MODE_SIZE (CCmode);
41780 break;
41782 case MODE_FLOAT:
41783 if ((TARGET_SSE && mode == TFmode)
41784 || (TARGET_80387 && mode == XFmode)
41785 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41786 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41787 units = GET_MODE_SIZE (mode);
41788 break;
41790 case MODE_COMPLEX_FLOAT:
41791 if ((TARGET_SSE && mode == TCmode)
41792 || (TARGET_80387 && mode == XCmode)
41793 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41794 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41795 units = GET_MODE_SIZE (mode);
41796 break;
41798 case MODE_VECTOR_INT:
41799 case MODE_VECTOR_FLOAT:
41800 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41801 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41802 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41803 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41804 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41805 units = GET_MODE_SIZE (mode);
41808 /* Return the cost of moving between two registers of mode MODE,
41809 assuming that the move will be in pieces of at most UNITS bytes. */
41810 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
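/* Editorial worked example, not part of the upstream source: a TImode
   register-to-register copy on x86-64 is split into CEIL (16, 8) = 2
   word-sized moves and priced COSTS_N_INSNS (2), while a DFmode copy
   with SSE2 (or the 387) moves in one piece and costs
   COSTS_N_INSNS (1). */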
41813 /* Compute a (partial) cost for rtx X. Return true if the complete
41814 cost has been computed, and false if subexpressions should be
41815 scanned. In either case, *TOTAL contains the cost result. */
41817 static bool
41818 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41819 int *total, bool speed)
41821 rtx mask;
41822 enum rtx_code code = GET_CODE (x);
41823 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41824 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
41825 int src_cost;
41827 switch (code)
41829 case SET:
41830 if (register_operand (SET_DEST (x), VOIDmode)
41831 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41833 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41834 return true;
41837 if (register_operand (SET_SRC (x), VOIDmode))
41838 /* Avoid potentially incorrect high cost from rtx_costs
41839 for non-tieable SUBREGs. */
41840 src_cost = 0;
41841 else
41843 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41845 if (CONSTANT_P (SET_SRC (x)))
41846 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41847 a small value, possibly zero for cheap constants. */
41848 src_cost += COSTS_N_INSNS (1);
41851 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41852 return true;
41854 case CONST_INT:
41855 case CONST:
41856 case LABEL_REF:
41857 case SYMBOL_REF:
41858 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41859 *total = 3;
41860 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41861 *total = 2;
41862 else if (flag_pic && SYMBOLIC_CONST (x)
41863 && !(TARGET_64BIT
41864 && (GET_CODE (x) == LABEL_REF
41865 || (GET_CODE (x) == SYMBOL_REF
41866 && SYMBOL_REF_LOCAL_P (x))))
41867 /* Use 0 cost for CONST to improve its propagation. */
41868 && (TARGET_64BIT || GET_CODE (x) != CONST))
41869 *total = 1;
41870 else
41871 *total = 0;
41872 return true;
41874 case CONST_DOUBLE:
41875 if (IS_STACK_MODE (mode))
41876 switch (standard_80387_constant_p (x))
41878 case -1:
41879 case 0:
41880 break;
41881 case 1: /* 0.0 */
41882 *total = 1;
41883 return true;
41884 default: /* Other constants */
41885 *total = 2;
41886 return true;
41888 /* FALLTHRU */
41890 case CONST_VECTOR:
41891 switch (standard_sse_constant_p (x, mode))
41893 case 0:
41894 break;
41895 case 1: /* 0: xor eliminates false dependency */
41896 *total = 0;
41897 return true;
41898 default: /* -1: cmp contains false dependency */
41899 *total = 1;
41900 return true;
41902 /* FALLTHRU */
41904 case CONST_WIDE_INT:
41905 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41906 it'll probably end up. Add a penalty for size. */
41907 *total = (COSTS_N_INSNS (1)
41908 + (!TARGET_64BIT && flag_pic)
41909 + (GET_MODE_SIZE (mode) <= 4
41910 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41911 return true;
41913 case ZERO_EXTEND:
41914 /* The zero extension is often completely free on x86_64, so make
41915 it as cheap as possible. */
41916 if (TARGET_64BIT && mode == DImode
41917 && GET_MODE (XEXP (x, 0)) == SImode)
41918 *total = 1;
41919 else if (TARGET_ZERO_EXTEND_WITH_AND)
41920 *total = cost->add;
41921 else
41922 *total = cost->movzx;
41923 return false;
41925 case SIGN_EXTEND:
41926 *total = cost->movsx;
41927 return false;
41929 case ASHIFT:
41930 if (SCALAR_INT_MODE_P (mode)
41931 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41932 && CONST_INT_P (XEXP (x, 1)))
41934 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41935 if (value == 1)
41937 *total = cost->add;
41938 return false;
41940 if ((value == 2 || value == 3)
41941 && cost->lea <= cost->shift_const)
41943 *total = cost->lea;
41944 return false;
41947 /* FALLTHRU */
41949 case ROTATE:
41950 case ASHIFTRT:
41951 case LSHIFTRT:
41952 case ROTATERT:
41953 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41955 /* ??? Should be SSE vector operation cost. */
41956 /* At least for published AMD latencies, this really is the same
41957 as the latency for a simple fpu operation like fabs. */
41958 /* V*QImode is emulated with 1-11 insns. */
41959 if (mode == V16QImode || mode == V32QImode)
41961 int count = 11;
41962 if (TARGET_XOP && mode == V16QImode)
41964 /* For XOP we use vpshab, which requires a broadcast of the
41965 value to the variable shift insn. For constants this
41966 means a V16QImode constant in memory; even when we can perform the
41967 shift with one insn, set the cost to prefer paddb. */
41968 if (CONSTANT_P (XEXP (x, 1)))
41970 *total = (cost->fabs
41971 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41972 + (speed ? 2 : COSTS_N_BYTES (16)));
41973 return true;
41975 count = 3;
41977 else if (TARGET_SSSE3)
41978 count = 7;
41979 *total = cost->fabs * count;
41981 else
41982 *total = cost->fabs;
41984 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41986 if (CONST_INT_P (XEXP (x, 1)))
41988 if (INTVAL (XEXP (x, 1)) > 32)
41989 *total = cost->shift_const + COSTS_N_INSNS (2);
41990 else
41991 *total = cost->shift_const * 2;
41993 else
41995 if (GET_CODE (XEXP (x, 1)) == AND)
41996 *total = cost->shift_var * 2;
41997 else
41998 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
42001 else
42003 if (CONST_INT_P (XEXP (x, 1)))
42004 *total = cost->shift_const;
42005 else if (SUBREG_P (XEXP (x, 1))
42006 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
42008 /* Return the cost after shift-and truncation. */
42009 *total = cost->shift_var;
42010 return true;
42012 else
42013 *total = cost->shift_var;
42015 return false;
42017 case FMA:
42019 rtx sub;
42021 gcc_assert (FLOAT_MODE_P (mode));
42022 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
42024 /* ??? SSE scalar/vector cost should be used here. */
42025 /* ??? Bald assumption that fma has the same cost as fmul. */
42026 *total = cost->fmul;
42027 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
42029 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
42030 sub = XEXP (x, 0);
42031 if (GET_CODE (sub) == NEG)
42032 sub = XEXP (sub, 0);
42033 *total += rtx_cost (sub, mode, FMA, 0, speed);
42035 sub = XEXP (x, 2);
42036 if (GET_CODE (sub) == NEG)
42037 sub = XEXP (sub, 0);
42038 *total += rtx_cost (sub, mode, FMA, 2, speed);
42039 return true;
42042 case MULT:
42043 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42045 /* ??? SSE scalar cost should be used here. */
42046 *total = cost->fmul;
42047 return false;
42049 else if (X87_FLOAT_MODE_P (mode))
42051 *total = cost->fmul;
42052 return false;
42054 else if (FLOAT_MODE_P (mode))
42056 /* ??? SSE vector cost should be used here. */
42057 *total = cost->fmul;
42058 return false;
42060 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
42062 /* V*QImode is emulated with 7-13 insns. */
42063 if (mode == V16QImode || mode == V32QImode)
42065 int extra = 11;
42066 if (TARGET_XOP && mode == V16QImode)
42067 extra = 5;
42068 else if (TARGET_SSSE3)
42069 extra = 6;
42070 *total = cost->fmul * 2 + cost->fabs * extra;
42072 /* V*DImode is emulated with 5-8 insns. */
42073 else if (mode == V2DImode || mode == V4DImode)
42075 if (TARGET_XOP && mode == V2DImode)
42076 *total = cost->fmul * 2 + cost->fabs * 3;
42077 else
42078 *total = cost->fmul * 3 + cost->fabs * 5;
42080 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
42081 insns, including two PMULUDQ. */
42082 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
42083 *total = cost->fmul * 2 + cost->fabs * 5;
42084 else
42085 *total = cost->fmul;
42086 return false;
42088 else
42090 rtx op0 = XEXP (x, 0);
42091 rtx op1 = XEXP (x, 1);
42092 int nbits;
42093 if (CONST_INT_P (XEXP (x, 1)))
42095 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
42096 for (nbits = 0; value != 0; value &= value - 1)
42097 nbits++;
42099 else
42100 /* This is arbitrary. */
42101 nbits = 7;
42103 /* Compute costs correctly for widening multiplication. */
42104 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
42105 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
42106 == GET_MODE_SIZE (mode))
42108 int is_mulwiden = 0;
42109 machine_mode inner_mode = GET_MODE (op0);
42111 if (GET_CODE (op0) == GET_CODE (op1))
42112 is_mulwiden = 1, op1 = XEXP (op1, 0);
42113 else if (CONST_INT_P (op1))
42115 if (GET_CODE (op0) == SIGN_EXTEND)
42116 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
42117 == INTVAL (op1);
42118 else
42119 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
42122 if (is_mulwiden)
42123 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
42126 *total = (cost->mult_init[MODE_INDEX (mode)]
42127 + nbits * cost->mult_bit
42128 + rtx_cost (op0, mode, outer_code, opno, speed)
42129 + rtx_cost (op1, mode, outer_code, opno, speed));
42131 return true;
42134 case DIV:
42135 case UDIV:
42136 case MOD:
42137 case UMOD:
42138 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42139 /* ??? SSE cost should be used here. */
42140 *total = cost->fdiv;
42141 else if (X87_FLOAT_MODE_P (mode))
42142 *total = cost->fdiv;
42143 else if (FLOAT_MODE_P (mode))
42144 /* ??? SSE vector cost should be used here. */
42145 *total = cost->fdiv;
42146 else
42147 *total = cost->divide[MODE_INDEX (mode)];
42148 return false;
42150 case PLUS:
42151 if (GET_MODE_CLASS (mode) == MODE_INT
42152 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
42154 if (GET_CODE (XEXP (x, 0)) == PLUS
42155 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
42156 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
42157 && CONSTANT_P (XEXP (x, 1)))
42159 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
42160 if (val == 2 || val == 4 || val == 8)
42162 *total = cost->lea;
42163 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
42164 outer_code, opno, speed);
42165 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
42166 outer_code, opno, speed);
42167 *total += rtx_cost (XEXP (x, 1), mode,
42168 outer_code, opno, speed);
42169 return true;
42172 else if (GET_CODE (XEXP (x, 0)) == MULT
42173 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
42175 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
42176 if (val == 2 || val == 4 || val == 8)
42178 *total = cost->lea;
42179 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
42180 outer_code, opno, speed);
42181 *total += rtx_cost (XEXP (x, 1), mode,
42182 outer_code, opno, speed);
42183 return true;
42186 else if (GET_CODE (XEXP (x, 0)) == PLUS)
42188 /* Add with carry, ignore the cost of adding a carry flag. */
42189 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
42190 *total = cost->add;
42191 else
42193 *total = cost->lea;
42194 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
42195 outer_code, opno, speed);
42198 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
42199 outer_code, opno, speed);
42200 *total += rtx_cost (XEXP (x, 1), mode,
42201 outer_code, opno, speed);
42202 return true;
42205 /* FALLTHRU */
42207 case MINUS:
42208 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
42209 if (GET_MODE_CLASS (mode) == MODE_INT
42210 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
42211 && GET_CODE (XEXP (x, 0)) == MINUS
42212 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
42214 *total = cost->add;
42215 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
42216 outer_code, opno, speed);
42217 *total += rtx_cost (XEXP (x, 1), mode,
42218 outer_code, opno, speed);
42219 return true;
42222 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42224 /* ??? SSE cost should be used here. */
42225 *total = cost->fadd;
42226 return false;
42228 else if (X87_FLOAT_MODE_P (mode))
42230 *total = cost->fadd;
42231 return false;
42233 else if (FLOAT_MODE_P (mode))
42235 /* ??? SSE vector cost should be used here. */
42236 *total = cost->fadd;
42237 return false;
42239 /* FALLTHRU */
42241 case AND:
42242 case IOR:
42243 case XOR:
42244 if (GET_MODE_CLASS (mode) == MODE_INT
42245 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
42247 *total = (cost->add * 2
42248 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
42249 << (GET_MODE (XEXP (x, 0)) != DImode))
42250 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
42251 << (GET_MODE (XEXP (x, 1)) != DImode)));
42252 return true;
42254 /* FALLTHRU */
42256 case NEG:
42257 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42259 /* ??? SSE cost should be used here. */
42260 *total = cost->fchs;
42261 return false;
42263 else if (X87_FLOAT_MODE_P (mode))
42265 *total = cost->fchs;
42266 return false;
42268 else if (FLOAT_MODE_P (mode))
42270 /* ??? SSE vector cost should be used here. */
42271 *total = cost->fchs;
42272 return false;
42274 /* FALLTHRU */
42276 case NOT:
42277 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
42279 /* ??? Should be SSE vector operation cost. */
42280 /* At least for published AMD latencies, this really is the same
42281 as the latency for a simple fpu operation like fabs. */
42282 *total = cost->fabs;
42284 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
42285 *total = cost->add * 2;
42286 else
42287 *total = cost->add;
42288 return false;
42290 case COMPARE:
42291 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
42292 && XEXP (XEXP (x, 0), 1) == const1_rtx
42293 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
42294 && XEXP (x, 1) == const0_rtx)
42296 /* This kind of construct is implemented using test[bwl].
42297 Treat it as if we had an AND. */
42298 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
42299 *total = (cost->add
42300 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
42301 opno, speed)
42302 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
42303 return true;
42306 /* The embedded comparison operand is completely free. */
42307 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
42308 && XEXP (x, 1) == const0_rtx)
42309 *total = 0;
42311 return false;
42313 case FLOAT_EXTEND:
42314 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
42315 *total = 0;
42316 return false;
42318 case ABS:
42319 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42320 /* ??? SSE cost should be used here. */
42321 *total = cost->fabs;
42322 else if (X87_FLOAT_MODE_P (mode))
42323 *total = cost->fabs;
42324 else if (FLOAT_MODE_P (mode))
42325 /* ??? SSE vector cost should be used here. */
42326 *total = cost->fabs;
42327 return false;
42329 case SQRT:
42330 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42331 /* ??? SSE cost should be used here. */
42332 *total = cost->fsqrt;
42333 else if (X87_FLOAT_MODE_P (mode))
42334 *total = cost->fsqrt;
42335 else if (FLOAT_MODE_P (mode))
42336 /* ??? SSE vector cost should be used here. */
42337 *total = cost->fsqrt;
42338 return false;
42340 case UNSPEC:
42341 if (XINT (x, 1) == UNSPEC_TP)
42342 *total = 0;
42343 return false;
42345 case VEC_SELECT:
42346 case VEC_CONCAT:
42347 case VEC_DUPLICATE:
42348 /* ??? Assume all of these vector manipulation patterns are
42349 recognizable, in which case they all pretty much have the
42350 same cost. */
42351 *total = cost->fabs;
42352 return true;
42353 case VEC_MERGE:
42354 mask = XEXP (x, 2);
42355 /* This is a masked instruction; assume the same cost
42356 as the non-masked variant. */
42357 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
42358 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
42359 else
42360 *total = cost->fabs;
42361 return true;
42363 default:
42364 return false;
42368 #if TARGET_MACHO
42370 static int current_machopic_label_num;
42372 /* Given a symbol name and its associated stub, write out the
42373 definition of the stub. */
42375 void
42376 machopic_output_stub (FILE *file, const char *symb, const char *stub)
42378 unsigned int length;
42379 char *binder_name, *symbol_name, lazy_ptr_name[32];
42380 int label = ++current_machopic_label_num;
42382 /* For 64-bit we shouldn't get here. */
42383 gcc_assert (!TARGET_64BIT);
42385 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
42386 symb = targetm.strip_name_encoding (symb);
42388 length = strlen (stub);
42389 binder_name = XALLOCAVEC (char, length + 32);
42390 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
42392 length = strlen (symb);
42393 symbol_name = XALLOCAVEC (char, length + 32);
42394 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
42396 sprintf (lazy_ptr_name, "L%d$lz", label);
42398 if (MACHOPIC_ATT_STUB)
42399 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
42400 else if (MACHOPIC_PURE)
42401 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
42402 else
42403 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
42405 fprintf (file, "%s:\n", stub);
42406 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42408 if (MACHOPIC_ATT_STUB)
42410 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
42412 else if (MACHOPIC_PURE)
42414 /* PIC stub. */
42415 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42416 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
42417 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
42418 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
42419 label, lazy_ptr_name, label);
42420 fprintf (file, "\tjmp\t*%%ecx\n");
42422 else
42423 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
42425 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
42426 it needs no stub-binding-helper. */
42427 if (MACHOPIC_ATT_STUB)
42428 return;
42430 fprintf (file, "%s:\n", binder_name);
42432 if (MACHOPIC_PURE)
42434 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
42435 fprintf (file, "\tpushl\t%%ecx\n");
42437 else
42438 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
42440 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
42442 /* N.B. Keep the correspondence of these
42443 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
42444 old-pic/new-pic/non-pic stubs; altering this will break
42445 compatibility with existing dylibs. */
42446 if (MACHOPIC_PURE)
42448 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42449 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
42451 else
42452 /* 16-byte -mdynamic-no-pic stub. */
42453 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
42455 fprintf (file, "%s:\n", lazy_ptr_name);
42456 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42457 fprintf (file, ASM_LONG "%s\n", binder_name);
42459 #endif /* TARGET_MACHO */
42461 /* Order the registers for register allocator. */
42463 void
42464 x86_order_regs_for_local_alloc (void)
42466 int pos = 0;
42467 int i;
42469 /* First allocate the local general purpose registers. */
42470 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42471 if (GENERAL_REGNO_P (i) && call_used_regs[i])
42472 reg_alloc_order [pos++] = i;
42474 /* Global general purpose registers. */
42475 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42476 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
42477 reg_alloc_order [pos++] = i;
42479 /* x87 registers come first in case we are doing FP math
42480 using them. */
42481 if (!TARGET_SSE_MATH)
42482 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42483 reg_alloc_order [pos++] = i;
42485 /* SSE registers. */
42486 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
42487 reg_alloc_order [pos++] = i;
42488 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
42489 reg_alloc_order [pos++] = i;
42491 /* Extended REX SSE registers. */
42492 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
42493 reg_alloc_order [pos++] = i;
42495 /* Mask register. */
42496 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
42497 reg_alloc_order [pos++] = i;
42499 /* MPX bound registers. */
42500 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
42501 reg_alloc_order [pos++] = i;
42503 /* x87 registers. */
42504 if (TARGET_SSE_MATH)
42505 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42506 reg_alloc_order [pos++] = i;
42508 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42509 reg_alloc_order [pos++] = i;
42511 /* Initialize the rest of the array, as we do not allocate some registers
42512 at all. */
42513 while (pos < FIRST_PSEUDO_REGISTER)
42514 reg_alloc_order [pos++] = 0;
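/* Editorial note, not part of the upstream source: the resulting order
   is call-clobbered GPRs, then call-saved GPRs, then the x87 stack
   ahead of the SSE registers when doing 387 math (otherwise after the
   mask and bound registers), with the MMX registers last. */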
42517 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42518 in struct attribute_spec handler. */
42519 static tree
42520 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42521 tree args,
42522 int,
42523 bool *no_add_attrs)
42525 if (TREE_CODE (*node) != FUNCTION_TYPE
42526 && TREE_CODE (*node) != METHOD_TYPE
42527 && TREE_CODE (*node) != FIELD_DECL
42528 && TREE_CODE (*node) != TYPE_DECL)
42530 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42531 name);
42532 *no_add_attrs = true;
42533 return NULL_TREE;
42535 if (TARGET_64BIT)
42537 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42538 name);
42539 *no_add_attrs = true;
42540 return NULL_TREE;
42542 if (is_attribute_p ("callee_pop_aggregate_return", name))
42544 tree cst;
42546 cst = TREE_VALUE (args);
42547 if (TREE_CODE (cst) != INTEGER_CST)
42549 warning (OPT_Wattributes,
42550 "%qE attribute requires an integer constant argument",
42551 name);
42552 *no_add_attrs = true;
42554 else if (compare_tree_int (cst, 0) != 0
42555 && compare_tree_int (cst, 1) != 0)
42557 warning (OPT_Wattributes,
42558 "argument to %qE attribute is neither zero, nor one",
42559 name);
42560 *no_add_attrs = true;
42563 return NULL_TREE;
42566 return NULL_TREE;
42569 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
42570 struct attribute_spec.handler. */
42571 static tree
42572 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42573 bool *no_add_attrs)
42575 if (TREE_CODE (*node) != FUNCTION_TYPE
42576 && TREE_CODE (*node) != METHOD_TYPE
42577 && TREE_CODE (*node) != FIELD_DECL
42578 && TREE_CODE (*node) != TYPE_DECL)
42580 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42581 name);
42582 *no_add_attrs = true;
42583 return NULL_TREE;
42586 /* Can combine regparm with all attributes but fastcall. */
42587 if (is_attribute_p ("ms_abi", name))
42589 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42591 error ("ms_abi and sysv_abi attributes are not compatible");
42594 return NULL_TREE;
42596 else if (is_attribute_p ("sysv_abi", name))
42598 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42600 error ("ms_abi and sysv_abi attributes are not compatible");
42603 return NULL_TREE;
42606 return NULL_TREE;
42609 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42610 struct attribute_spec.handler. */
42611 static tree
42612 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42613 bool *no_add_attrs)
42615 tree *type = NULL;
42616 if (DECL_P (*node))
42618 if (TREE_CODE (*node) == TYPE_DECL)
42619 type = &TREE_TYPE (*node);
42621 else
42622 type = node;
42624 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42626 warning (OPT_Wattributes, "%qE attribute ignored",
42627 name);
42628 *no_add_attrs = true;
42631 else if ((is_attribute_p ("ms_struct", name)
42632 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42633 || ((is_attribute_p ("gcc_struct", name)
42634 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42636 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42637 name);
42638 *no_add_attrs = true;
42641 return NULL_TREE;
42644 static tree
42645 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42646 bool *no_add_attrs)
42648 if (TREE_CODE (*node) != FUNCTION_DECL)
42650 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42651 name);
42652 *no_add_attrs = true;
42654 return NULL_TREE;
42657 static tree
42658 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42659 int, bool *)
42661 return NULL_TREE;
42664 static tree
42665 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42667 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
42668 but the function type contains args and return type data. */
42669 tree func_type = *node;
42670 tree return_type = TREE_TYPE (func_type);
42672 int nargs = 0;
42673 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42674 while (current_arg_type
42675 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42677 if (nargs == 0)
42679 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42680 error ("interrupt service routine should have a pointer "
42681 "as the first argument");
42683 else if (nargs == 1)
42685 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42686 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42687 error ("interrupt service routine should have unsigned %s"
42688 "int as the second argument",
42689 TARGET_64BIT
42690 ? (TARGET_X32 ? "long long " : "long ")
42691 : "");
42693 nargs++;
42694 current_arg_type = TREE_CHAIN (current_arg_type);
42696 if (!nargs || nargs > 2)
42697 error ("interrupt service routine can only have a pointer argument "
42698 "and an optional integer argument");
42699 if (! VOID_TYPE_P (return_type))
42700 error ("interrupt service routine can't have non-void return value");
42702 return NULL_TREE;
42705 static bool
42706 ix86_ms_bitfield_layout_p (const_tree record_type)
42708 return ((TARGET_MS_BITFIELD_LAYOUT
42709 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42710 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42713 /* Returns an expression indicating where the this parameter is
42714 located on entry to the FUNCTION. */
42716 static rtx
42717 x86_this_parameter (tree function)
42719 tree type = TREE_TYPE (function);
42720 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42721 int nregs;
42723 if (TARGET_64BIT)
42725 const int *parm_regs;
42727 if (ix86_function_type_abi (type) == MS_ABI)
42728 parm_regs = x86_64_ms_abi_int_parameter_registers;
42729 else
42730 parm_regs = x86_64_int_parameter_registers;
42731 return gen_rtx_REG (Pmode, parm_regs[aggr]);
42734 nregs = ix86_function_regparm (type, function);
42736 if (nregs > 0 && !stdarg_p (type))
42738 int regno;
42739 unsigned int ccvt = ix86_get_callcvt (type);
42741 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42742 regno = aggr ? DX_REG : CX_REG;
42743 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42745 regno = CX_REG;
42746 if (aggr)
42747 return gen_rtx_MEM (SImode,
42748 plus_constant (Pmode, stack_pointer_rtx, 4));
42750 else
42752 regno = AX_REG;
42753 if (aggr)
42755 regno = DX_REG;
42756 if (nregs == 1)
42757 return gen_rtx_MEM (SImode,
42758 plus_constant (Pmode,
42759 stack_pointer_rtx, 4));
42762 return gen_rtx_REG (SImode, regno);
42765 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42766 aggr ? 8 : 4));
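/* Editorial example, not part of the upstream source: for the 64-bit
   SysV ABI this yields %rdi, or %rsi when a hidden aggregate-return
   pointer takes the first slot; for 32-bit fastcall it is %ecx (%edx
   for aggregate returns), and in the plain stack-args case it is the
   word at 4(%esp), or 8(%esp) past an aggregate-return pointer. */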
42769 /* Determine whether x86_output_mi_thunk can succeed. */
42771 static bool
42772 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42773 const_tree function)
42775 /* 64-bit can handle anything. */
42776 if (TARGET_64BIT)
42777 return true;
42779 /* For 32-bit, everything's fine if we have one free register. */
42780 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42781 return true;
42783 /* Need a free register for vcall_offset. */
42784 if (vcall_offset)
42785 return false;
42787 /* Need a free register for GOT references. */
42788 if (flag_pic && !targetm.binds_local_p (function))
42789 return false;
42791 /* Otherwise ok. */
42792 return true;
42795 /* Output the assembler code for a thunk function. THUNK_DECL is the
42796 declaration for the thunk function itself, FUNCTION is the decl for
42797 the target function. DELTA is an immediate constant offset to be
42798 added to THIS. If VCALL_OFFSET is nonzero, the word at
42799 *(*this + vcall_offset) should be added to THIS. */
42801 static void
42802 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42803 HOST_WIDE_INT vcall_offset, tree function)
42805 rtx this_param = x86_this_parameter (function);
42806 rtx this_reg, tmp, fnaddr;
42807 unsigned int tmp_regno;
42808 rtx_insn *insn;
42810 if (TARGET_64BIT)
42811 tmp_regno = R10_REG;
42812 else
42814 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42815 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42816 tmp_regno = AX_REG;
42817 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42818 tmp_regno = DX_REG;
42819 else
42820 tmp_regno = CX_REG;
42823 emit_note (NOTE_INSN_PROLOGUE_END);
42825 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42826 pull it in now and let DELTA benefit. */
42827 if (REG_P (this_param))
42828 this_reg = this_param;
42829 else if (vcall_offset)
42831 /* Put the this parameter into %eax. */
42832 this_reg = gen_rtx_REG (Pmode, AX_REG);
42833 emit_move_insn (this_reg, this_param);
42835 else
42836 this_reg = NULL_RTX;
42838 /* Adjust the this parameter by a fixed constant. */
42839 if (delta)
42841 rtx delta_rtx = GEN_INT (delta);
42842 rtx delta_dst = this_reg ? this_reg : this_param;
42844 if (TARGET_64BIT)
42846 if (!x86_64_general_operand (delta_rtx, Pmode))
42848 tmp = gen_rtx_REG (Pmode, tmp_regno);
42849 emit_move_insn (tmp, delta_rtx);
42850 delta_rtx = tmp;
42854 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42857 /* Adjust the this parameter by a value stored in the vtable. */
42858 if (vcall_offset)
42860 rtx vcall_addr, vcall_mem, this_mem;
42862 tmp = gen_rtx_REG (Pmode, tmp_regno);
42864 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42865 if (Pmode != ptr_mode)
42866 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42867 emit_move_insn (tmp, this_mem);
42869 /* Adjust the this parameter. */
42870 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42871 if (TARGET_64BIT
42872 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42874 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42875 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42876 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42879 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42880 if (Pmode != ptr_mode)
42881 emit_insn (gen_addsi_1_zext (this_reg,
42882 gen_rtx_REG (ptr_mode,
42883 REGNO (this_reg)),
42884 vcall_mem));
42885 else
42886 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42889 /* If necessary, drop THIS back to its stack slot. */
42890 if (this_reg && this_reg != this_param)
42891 emit_move_insn (this_param, this_reg);
42893 fnaddr = XEXP (DECL_RTL (function), 0);
42894 if (TARGET_64BIT)
42896 if (!flag_pic || targetm.binds_local_p (function)
42897 || TARGET_PECOFF)
42899 else
42901 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42902 tmp = gen_rtx_CONST (Pmode, tmp);
42903 fnaddr = gen_const_mem (Pmode, tmp);
42906 else
42908 if (!flag_pic || targetm.binds_local_p (function))
42910 #if TARGET_MACHO
42911 else if (TARGET_MACHO)
42913 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42914 fnaddr = XEXP (fnaddr, 0);
42916 #endif /* TARGET_MACHO */
42917 else
42919 tmp = gen_rtx_REG (Pmode, CX_REG);
42920 output_set_got (tmp, NULL_RTX);
42922 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42923 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42924 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42925 fnaddr = gen_const_mem (Pmode, fnaddr);
42929 /* Our sibling call patterns do not allow memories, because we have no
42930 predicate that can distinguish between frame and non-frame memory.
42931 For our purposes here, we can get away with (ab)using a jump pattern,
42932 because we're going to do no optimization. */
42933 if (MEM_P (fnaddr))
42935 if (sibcall_insn_operand (fnaddr, word_mode))
42937 fnaddr = XEXP (DECL_RTL (function), 0);
42938 tmp = gen_rtx_MEM (QImode, fnaddr);
42939 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42940 tmp = emit_call_insn (tmp);
42941 SIBLING_CALL_P (tmp) = 1;
42943 else
42944 emit_jump_insn (gen_indirect_jump (fnaddr));
42946 else
42948 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42950 // CM_LARGE_PIC always uses pseudo PIC register which is
42951 // uninitialized. Since FUNCTION is local and calling it
42952 // doesn't go through PLT, we use scratch register %r11 as
42953 // PIC register and initialize it here.
42954 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42955 ix86_init_large_pic_reg (tmp_regno);
42956 fnaddr = legitimize_pic_address (fnaddr,
42957 gen_rtx_REG (Pmode, tmp_regno));
42960 if (!sibcall_insn_operand (fnaddr, word_mode))
42962 tmp = gen_rtx_REG (word_mode, tmp_regno);
42963 if (GET_MODE (fnaddr) != word_mode)
42964 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42965 emit_move_insn (tmp, fnaddr);
42966 fnaddr = tmp;
42969 tmp = gen_rtx_MEM (QImode, fnaddr);
42970 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42971 tmp = emit_call_insn (tmp);
42972 SIBLING_CALL_P (tmp) = 1;
42974 emit_barrier ();
42976 /* Emit just enough of rest_of_compilation to get the insns emitted.
42977 Note that use_thunk calls assemble_start_function et al. */
42978 insn = get_insns ();
42979 shorten_branches (insn);
42980 final_start_function (insn, file, 1);
42981 final (insn, file, 1);
42982 final_end_function ();
42985 static void
42986 x86_file_start (void)
42988 default_file_start ();
42989 if (TARGET_16BIT)
42990 fputs ("\t.code16gcc\n", asm_out_file);
42991 #if TARGET_MACHO
42992 darwin_file_start ();
42993 #endif
42994 if (X86_FILE_START_VERSION_DIRECTIVE)
42995 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42996 if (X86_FILE_START_FLTUSED)
42997 fputs ("\t.global\t__fltused\n", asm_out_file);
42998 if (ix86_asm_dialect == ASM_INTEL)
42999 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
43002 int
43003 x86_field_alignment (tree type, int computed)
43005 machine_mode mode;
43007 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
43008 return computed;
43009 if (TARGET_IAMCU)
43010 return iamcu_alignment (type, computed);
43011 mode = TYPE_MODE (strip_array_types (type));
43012 if (mode == DFmode || mode == DCmode
43013 || GET_MODE_CLASS (mode) == MODE_INT
43014 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
43015 return MIN (32, computed);
43016 return computed;
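/* Editorial example, not part of the upstream source: on ia32 without
   -malign-double a double or long long structure field is capped at
   32-bit alignment by the MIN above, matching the traditional i386
   System V layout; x86-64 and -malign-double keep the natural
   alignment. */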
43019 /* Print call to TARGET to FILE. */
43021 static void
43022 x86_print_call_or_nop (FILE *file, const char *target)
43024 if (flag_nop_mcount)
43025 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
43026 else
43027 fprintf (file, "1:\tcall\t%s\n", target);
43030 /* Output assembler code to FILE to increment profiler label # LABELNO
43031 for profiling a function entry. */
43032 void
43033 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
43035 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
43036 : MCOUNT_NAME);
43037 if (TARGET_64BIT)
43039 #ifndef NO_PROFILE_COUNTERS
43040 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
43041 #endif
43043 if (!TARGET_PECOFF && flag_pic)
43044 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
43045 else
43046 x86_print_call_or_nop (file, mcount_name);
43048 else if (flag_pic)
43050 #ifndef NO_PROFILE_COUNTERS
43051 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
43052 LPREFIX, labelno);
43053 #endif
43054 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
43056 else
43058 #ifndef NO_PROFILE_COUNTERS
43059 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
43060 LPREFIX, labelno);
43061 #endif
43062 x86_print_call_or_nop (file, mcount_name);
43065 if (flag_record_mcount)
43067 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
43068 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
43069 fprintf (file, "\t.previous\n");
43073 /* We don't have exact information about the insn sizes, but we may assume
43074 quite safely that we are informed about all 1 byte insns and memory
43075 address sizes. This is enough to eliminate unnecessary padding in
43076 99% of cases. */
43078 static int
43079 min_insn_size (rtx_insn *insn)
43081 int l = 0, len;
43083 if (!INSN_P (insn) || !active_insn_p (insn))
43084 return 0;
43087 /* Discard alignments we've emitted and jump instructions. */
43087 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
43088 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
43089 return 0;
43091 /* Important case - calls are always 5 bytes.
43092 It is common to have many calls in a row. */
43093 if (CALL_P (insn)
43094 && symbolic_reference_mentioned_p (PATTERN (insn))
43095 && !SIBLING_CALL_P (insn))
43096 return 5;
43097 len = get_attr_length (insn);
43098 if (len <= 1)
43099 return 1;
43101 /* For normal instructions we rely on get_attr_length being exact,
43102 with a few exceptions. */
43103 if (!JUMP_P (insn))
43105 enum attr_type type = get_attr_type (insn);
43107 switch (type)
43109 case TYPE_MULTI:
43110 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
43111 || asm_noperands (PATTERN (insn)) >= 0)
43112 return 0;
43113 break;
43114 case TYPE_OTHER:
43115 case TYPE_FCMP:
43116 break;
43117 default:
43118 /* Otherwise trust get_attr_length. */
43119 return len;
43122 l = get_attr_length_address (insn);
43123 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
43124 l = 4;
43126 if (l)
43127 return 1+l;
43128 else
43129 return 2;
43132 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43134 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
43135 window. */
43137 static void
43138 ix86_avoid_jump_mispredicts (void)
43140 rtx_insn *insn, *start = get_insns ();
43141 int nbytes = 0, njumps = 0;
43142 bool isjump = false;
43144 /* Look for all minimal intervals of instructions containing 4 jumps.
43145 The intervals are bounded by START and INSN. NBYTES is the total
43146 size of instructions in the interval including INSN and not including
43147 START. When NBYTES is smaller than 16, it is possible
43148 that the end of START and INSN end up in the same 16 byte page.
43150 The smallest offset in the page at which INSN can start is the case where
43151 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
43152 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN).
43154 Don't consider an asm goto as a jump; while it can contain a jump, it
43155 doesn't have to, since control can reach its label(s) through other
43156 means, and we also estimate the minimum length of all asm stmts as 0. */
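   /* A rough worked example with hypothetical numbers: if the interval up to
      and including INSN adds up to NBYTES = 12 and INSN itself is estimated
      at 2 bytes, the pad emitted before INSN below is 15 - 12 + 2 = 5 bytes,
      enough to push INSN out of the 16 byte window that already holds the
      three earlier jumps.  */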
43157 for (insn = start; insn; insn = NEXT_INSN (insn))
43159 int min_size;
43161 if (LABEL_P (insn))
43163 int align = label_to_alignment (insn);
43164 int max_skip = label_to_max_skip (insn);
43166 if (max_skip > 15)
43167 max_skip = 15;
43168 /* If align > 3, only up to 16 - max_skip - 1 bytes can already be
43169 in the current 16 byte page, because otherwise
43170 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
43171 bytes to reach a 16 byte boundary. */
43172 if (align <= 0
43173 || (align <= 3 && max_skip != (1 << align) - 1))
43174 max_skip = 0;
43175 if (dump_file)
43176 fprintf (dump_file, "Label %i with max_skip %i\n",
43177 INSN_UID (insn), max_skip);
43178 if (max_skip)
43180 while (nbytes + max_skip >= 16)
43182 start = NEXT_INSN (start);
43183 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
43184 || CALL_P (start))
43185 njumps--, isjump = true;
43186 else
43187 isjump = false;
43188 nbytes -= min_insn_size (start);
43191 continue;
43194 min_size = min_insn_size (insn);
43195 nbytes += min_size;
43196 if (dump_file)
43197 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
43198 INSN_UID (insn), min_size);
43199 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
43200 || CALL_P (insn))
43201 njumps++;
43202 else
43203 continue;
43205 while (njumps > 3)
43207 start = NEXT_INSN (start);
43208 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
43209 || CALL_P (start))
43210 njumps--, isjump = true;
43211 else
43212 isjump = false;
43213 nbytes -= min_insn_size (start);
43215 gcc_assert (njumps >= 0);
43216 if (dump_file)
43217 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
43218 INSN_UID (start), INSN_UID (insn), nbytes);
43220 if (njumps == 3 && isjump && nbytes < 16)
43222 int padsize = 15 - nbytes + min_insn_size (insn);
43224 if (dump_file)
43225 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
43226 INSN_UID (insn), padsize);
43227 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
43231 #endif
43233 /* AMD Athlon works faster
43234 when RET is not the destination of a conditional jump or directly preceded
43235 by another jump instruction. We avoid the penalty by inserting a NOP just
43236 before the RET instruction in such cases. */
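   /* The replacement below uses the simple_return_internal_long pattern,
      which on these CPUs is presumably the longer "rep ret" style return
      (an assumption about the .md pattern, not stated here); either way a
      conditional jump no longer lands directly on a bare one byte "ret".  */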
43237 static void
43238 ix86_pad_returns (void)
43240 edge e;
43241 edge_iterator ei;
43243 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43245 basic_block bb = e->src;
43246 rtx_insn *ret = BB_END (bb);
43247 rtx_insn *prev;
43248 bool replace = false;
43250 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
43251 || optimize_bb_for_size_p (bb))
43252 continue;
43253 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
43254 if (active_insn_p (prev) || LABEL_P (prev))
43255 break;
43256 if (prev && LABEL_P (prev))
43258 edge e;
43259 edge_iterator ei;
43261 FOR_EACH_EDGE (e, ei, bb->preds)
43262 if (EDGE_FREQUENCY (e) && e->src->index >= 0
43263 && !(e->flags & EDGE_FALLTHRU))
43265 replace = true;
43266 break;
43269 if (!replace)
43271 prev = prev_active_insn (ret);
43272 if (prev
43273 && ((JUMP_P (prev) && any_condjump_p (prev))
43274 || CALL_P (prev)))
43275 replace = true;
43276 /* Empty functions get a branch mispredict even when
43277 the jump destination is not visible to us. */
43278 if (!prev && !optimize_function_for_size_p (cfun))
43279 replace = true;
43281 if (replace)
43283 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
43284 delete_insn (ret);
43289 /* Count the minimum number of instructions in BB. Return 4 if the
43290 number of instructions >= 4. */
43292 static int
43293 ix86_count_insn_bb (basic_block bb)
43295 rtx_insn *insn;
43296 int insn_count = 0;
43298 /* Count number of instructions in this block. Return 4 if the number
43299 of instructions >= 4. */
43300 FOR_BB_INSNS (bb, insn)
43302 /* This only happens in exit blocks. */
43303 if (JUMP_P (insn)
43304 && ANY_RETURN_P (PATTERN (insn)))
43305 break;
43307 if (NONDEBUG_INSN_P (insn)
43308 && GET_CODE (PATTERN (insn)) != USE
43309 && GET_CODE (PATTERN (insn)) != CLOBBER)
43311 insn_count++;
43312 if (insn_count >= 4)
43313 return insn_count;
43317 return insn_count;
43321 /* Count the minimum number of instructions in the code path in BB.
43322 Return 4 if the number of instructions >= 4. */
43324 static int
43325 ix86_count_insn (basic_block bb)
43327 edge e;
43328 edge_iterator ei;
43329 int min_prev_count;
43331 /* Only bother counting instructions along paths with no
43332 more than 2 basic blocks between entry and exit. Given
43333 that BB has an edge to exit, determine if a predecessor
43334 of BB has an edge from entry. If so, compute the number
43335 of instructions in the predecessor block. If there
43336 happen to be multiple such blocks, compute the minimum. */
43337 min_prev_count = 4;
43338 FOR_EACH_EDGE (e, ei, bb->preds)
43340 edge prev_e;
43341 edge_iterator prev_ei;
43343 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43345 min_prev_count = 0;
43346 break;
43348 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
43350 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43352 int count = ix86_count_insn_bb (e->src);
43353 if (count < min_prev_count)
43354 min_prev_count = count;
43355 break;
43360 if (min_prev_count < 4)
43361 min_prev_count += ix86_count_insn_bb (bb);
43363 return min_prev_count;
43366 /* Pad short function to 4 instructions. */
43368 static void
43369 ix86_pad_short_function (void)
43371 edge e;
43372 edge_iterator ei;
43374 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43376 rtx_insn *ret = BB_END (e->src);
43377 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
43379 int insn_count = ix86_count_insn (e->src);
43381 /* Pad short function. */
43382 if (insn_count < 4)
43384 rtx_insn *insn = ret;
43386 /* Find epilogue. */
43387 while (insn
43388 && (!NOTE_P (insn)
43389 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
43390 insn = PREV_INSN (insn);
43392 if (!insn)
43393 insn = ret;
43395 /* Two NOPs count as one instruction. */
43396 insn_count = 2 * (4 - insn_count);
43397 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
43403 /* Fix up a Windows system unwinder issue. If an EH region falls through into
43404 the epilogue, the Windows system unwinder will apply epilogue logic and
43405 produce incorrect offsets. This can be avoided by adding a nop between
43406 the last insn that can throw and the first insn of the epilogue. */
43408 static void
43409 ix86_seh_fixup_eh_fallthru (void)
43411 edge e;
43412 edge_iterator ei;
43414 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43416 rtx_insn *insn, *next;
43418 /* Find the beginning of the epilogue. */
43419 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
43420 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
43421 break;
43422 if (insn == NULL)
43423 continue;
43425 /* We only care about preceding insns that can throw. */
43426 insn = prev_active_insn (insn);
43427 if (insn == NULL || !can_throw_internal (insn))
43428 continue;
43430 /* Do not separate calls from their debug information. */
43431 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
43432 if (NOTE_P (next)
43433 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
43434 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
43435 insn = next;
43436 else
43437 break;
43439 emit_insn_after (gen_nops (const1_rtx), insn);
43443 /* Given a register number BASE, the lowest of a group of registers, update
43444 regsets IN and OUT with the registers that should be avoided in input
43445 and output operands respectively when trying to avoid generating a modr/m
43446 byte for -fmitigate-rop. */
43448 static void
43449 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
43451 SET_HARD_REG_BIT (out, base);
43452 SET_HARD_REG_BIT (out, base + 1);
43453 SET_HARD_REG_BIT (in, base + 2);
43454 SET_HARD_REG_BIT (in, base + 3);
43457 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
43458 that certain encodings of modr/m bytes do not occur. */
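   /* The pass below works in two steps: first it uses regrename chains to
      move entire def/use chains off registers whose encodings are risky for
      the relevant operand position; then, for insns that still end up with a
      risky modr/m encoding, it copies the offending input operand into a
      safe scratch register just before the insn.  */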
43459 static void
43460 ix86_mitigate_rop (void)
43462 HARD_REG_SET input_risky;
43463 HARD_REG_SET output_risky;
43464 HARD_REG_SET inout_risky;
43466 CLEAR_HARD_REG_SET (output_risky);
43467 CLEAR_HARD_REG_SET (input_risky);
43468 SET_HARD_REG_BIT (output_risky, AX_REG);
43469 SET_HARD_REG_BIT (output_risky, CX_REG);
43470 SET_HARD_REG_BIT (input_risky, BX_REG);
43471 SET_HARD_REG_BIT (input_risky, DX_REG);
43472 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
43473 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
43474 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
43475 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
43476 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
43477 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
43478 COPY_HARD_REG_SET (inout_risky, input_risky);
43479 IOR_HARD_REG_SET (inout_risky, output_risky);
43481 df_note_add_problem ();
43482 /* Fix up what stack-regs did. */
43483 df_insn_rescan_all ();
43484 df_analyze ();
43486 regrename_init (true);
43487 regrename_analyze (NULL);
43489 auto_vec<du_head_p> cands;
43491 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
43493 if (!NONDEBUG_INSN_P (insn))
43494 continue;
43496 if (GET_CODE (PATTERN (insn)) == USE
43497 || GET_CODE (PATTERN (insn)) == CLOBBER)
43498 continue;
43500 extract_insn (insn);
43502 int opno0, opno1;
43503 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43504 recog_data.n_operands, &opno0,
43505 &opno1);
43507 if (!ix86_rop_should_change_byte_p (modrm))
43508 continue;
43510 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43512 /* This happens when regrename has to fail a block. */
43513 if (!info->op_info)
43514 continue;
43516 if (info->op_info[opno0].n_chains != 0)
43518 gcc_assert (info->op_info[opno0].n_chains == 1);
43519 du_head_p op0c;
43520 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43521 if (op0c->target_data_1 + op0c->target_data_2 == 0
43522 && !op0c->cannot_rename)
43523 cands.safe_push (op0c);
43525 op0c->target_data_1++;
43527 if (info->op_info[opno1].n_chains != 0)
43529 gcc_assert (info->op_info[opno1].n_chains == 1);
43530 du_head_p op1c;
43531 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43532 if (op1c->target_data_1 + op1c->target_data_2 == 0
43533 && !op1c->cannot_rename)
43534 cands.safe_push (op1c);
43536 op1c->target_data_2++;
43540 int i;
43541 du_head_p head;
43542 FOR_EACH_VEC_ELT (cands, i, head)
43544 int old_reg, best_reg;
43545 HARD_REG_SET unavailable;
43547 CLEAR_HARD_REG_SET (unavailable);
43548 if (head->target_data_1)
43549 IOR_HARD_REG_SET (unavailable, output_risky);
43550 if (head->target_data_2)
43551 IOR_HARD_REG_SET (unavailable, input_risky);
43553 int n_uses;
43554 reg_class superclass = regrename_find_superclass (head, &n_uses,
43555 &unavailable);
43556 old_reg = head->regno;
43557 best_reg = find_rename_reg (head, superclass, &unavailable,
43558 old_reg, false);
43559 bool ok = regrename_do_replace (head, best_reg);
43560 gcc_assert (ok);
43561 if (dump_file)
43562 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43563 reg_names[best_reg], reg_class_names[superclass]);
43567 regrename_finish ();
43569 df_analyze ();
43571 basic_block bb;
43572 regset_head live;
43574 INIT_REG_SET (&live);
43576 FOR_EACH_BB_FN (bb, cfun)
43578 rtx_insn *insn;
43580 COPY_REG_SET (&live, DF_LR_OUT (bb));
43581 df_simulate_initialize_backwards (bb, &live);
43583 FOR_BB_INSNS_REVERSE (bb, insn)
43585 if (!NONDEBUG_INSN_P (insn))
43586 continue;
43588 df_simulate_one_insn_backwards (bb, insn, &live);
43590 if (GET_CODE (PATTERN (insn)) == USE
43591 || GET_CODE (PATTERN (insn)) == CLOBBER)
43592 continue;
43594 extract_insn (insn);
43595 constrain_operands_cached (insn, reload_completed);
43596 int opno0, opno1;
43597 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43598 recog_data.n_operands, &opno0,
43599 &opno1);
43600 if (modrm < 0
43601 || !ix86_rop_should_change_byte_p (modrm)
43602 || opno0 == opno1)
43603 continue;
43605 rtx oldreg = recog_data.operand[opno1];
43606 preprocess_constraints (insn);
43607 const operand_alternative *alt = which_op_alt ();
43609 int i;
43610 for (i = 0; i < recog_data.n_operands; i++)
43611 if (i != opno1
43612 && alt[i].earlyclobber
43613 && reg_overlap_mentioned_p (recog_data.operand[i],
43614 oldreg))
43615 break;
43617 if (i < recog_data.n_operands)
43618 continue;
43620 if (dump_file)
43621 fprintf (dump_file,
43622 "attempting to fix modrm byte in insn %d:"
43623 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43624 reg_class_names[alt[opno1].cl]);
43626 HARD_REG_SET unavailable;
43627 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43628 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43629 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43630 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43631 IOR_HARD_REG_SET (unavailable, output_risky);
43632 IOR_COMPL_HARD_REG_SET (unavailable,
43633 reg_class_contents[alt[opno1].cl]);
43635 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43636 if (!TEST_HARD_REG_BIT (unavailable, i))
43637 break;
43638 if (i == FIRST_PSEUDO_REGISTER)
43640 if (dump_file)
43641 fprintf (dump_file, ", none available\n");
43642 continue;
43644 if (dump_file)
43645 fprintf (dump_file, " -> %d\n", i);
43646 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43647 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43648 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
43653 /* Implement machine specific optimizations. We implement padding of returns
43654 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
43655 static void
43656 ix86_reorg (void)
43658 /* We are freeing block_for_insn in the toplev to keep compatibility
43659 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43660 compute_bb_for_insn ();
43662 if (flag_mitigate_rop)
43663 ix86_mitigate_rop ();
43665 if (TARGET_SEH && current_function_has_exception_handlers ())
43666 ix86_seh_fixup_eh_fallthru ();
43668 if (optimize && optimize_function_for_speed_p (cfun))
43670 if (TARGET_PAD_SHORT_FUNCTION)
43671 ix86_pad_short_function ();
43672 else if (TARGET_PAD_RETURNS)
43673 ix86_pad_returns ();
43674 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43675 if (TARGET_FOUR_JUMP_LIMIT)
43676 ix86_avoid_jump_mispredicts ();
43677 #endif
43681 /* Return nonzero when a QImode register that must be represented via a REX
43682 prefix is used. */
43683 bool
43684 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43686 int i;
43687 extract_insn_cached (insn);
43688 for (i = 0; i < recog_data.n_operands; i++)
43689 if (GENERAL_REG_P (recog_data.operand[i])
43690 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43691 return true;
43692 return false;
43695 /* Return true when INSN mentions a register that must be encoded using a REX
43696 prefix. */
43697 bool
43698 x86_extended_reg_mentioned_p (rtx insn)
43700 subrtx_iterator::array_type array;
43701 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43703 const_rtx x = *iter;
43704 if (REG_P (x)
43705 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43706 return true;
43708 return false;
43711 /* If profitable, negate (without causing overflow) integer constant
43712 of mode MODE at location LOC. Return true in this case. */
43713 bool
43714 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43716 HOST_WIDE_INT val;
43718 if (!CONST_INT_P (*loc))
43719 return false;
43721 switch (mode)
43723 case E_DImode:
43724 /* DImode x86_64 constants must fit in 32 bits. */
43725 gcc_assert (x86_64_immediate_operand (*loc, mode));
43727 mode = SImode;
43728 break;
43730 case E_SImode:
43731 case E_HImode:
43732 case E_QImode:
43733 break;
43735 default:
43736 gcc_unreachable ();
43739 /* Avoid overflows. */
43740 if (mode_signbit_p (mode, *loc))
43741 return false;
43743 val = INTVAL (*loc);
43745 /* Make things pretty and emit `subl $4,%eax' rather than `addl $-4,%eax'.
43746 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
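      /* For example, "addl $128, %eax" needs a 32-bit immediate because 128
	 does not fit in a sign-extended 8-bit field, while the equivalent
	 "subl $-128, %eax" does; conversely a constant of -128 is left alone,
	 since negating it to 128 would lose the short encoding.  */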
43747 if ((val < 0 && val != -128)
43748 || val == 128)
43750 *loc = GEN_INT (-val);
43751 return true;
43754 return false;
43757 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43758 optabs would emit if we didn't have TFmode patterns. */
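/* The expansion below branches on the sign bit: a value with the sign bit
   clear is converted directly with the signed conversion; otherwise the
   input is halved as (in >> 1) | (in & 1), keeping the low bit so rounding
   still sees it, converted, and finally doubled with an FP addition.  */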
43760 void
43761 x86_emit_floatuns (rtx operands[2])
43763 rtx_code_label *neglab, *donelab;
43764 rtx i0, i1, f0, in, out;
43765 machine_mode mode, inmode;
43767 inmode = GET_MODE (operands[1]);
43768 gcc_assert (inmode == SImode || inmode == DImode);
43770 out = operands[0];
43771 in = force_reg (inmode, operands[1]);
43772 mode = GET_MODE (out);
43773 neglab = gen_label_rtx ();
43774 donelab = gen_label_rtx ();
43775 f0 = gen_reg_rtx (mode);
43777 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43779 expand_float (out, in, 0);
43781 emit_jump_insn (gen_jump (donelab));
43782 emit_barrier ();
43784 emit_label (neglab);
43786 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43787 1, OPTAB_DIRECT);
43788 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43789 1, OPTAB_DIRECT);
43790 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43792 expand_float (f0, i0, 0);
43794 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43796 emit_label (donelab);
43799 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43800 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43801 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43802 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43804 /* Get a vector mode of the same size as the original but with elements
43805 twice as wide. This is only guaranteed to apply to integral vectors. */
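/* For example, V16QImode yields V8HImode and V8HImode yields V4SImode:
   the overall vector size stays the same while each element doubles in
   width and the element count halves.  */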
43807 static inline machine_mode
43808 get_mode_wider_vector (machine_mode o)
43810 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43811 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
43812 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43813 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43814 return n;
43817 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43818 fill target with val via vec_duplicate. */
43820 static bool
43821 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43823 bool ok;
43824 rtx_insn *insn;
43825 rtx dup;
43827 /* First attempt to recognize VAL as-is. */
43828 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43829 insn = emit_insn (gen_rtx_SET (target, dup));
43830 if (recog_memoized (insn) < 0)
43832 rtx_insn *seq;
43833 machine_mode innermode = GET_MODE_INNER (mode);
43834 rtx reg;
43836 /* If that fails, force VAL into a register. */
43838 start_sequence ();
43839 reg = force_reg (innermode, val);
43840 if (GET_MODE (reg) != innermode)
43841 reg = gen_lowpart (innermode, reg);
43842 XEXP (dup, 0) = reg;
43843 seq = get_insns ();
43844 end_sequence ();
43845 if (seq)
43846 emit_insn_before (seq, insn);
43848 ok = recog_memoized (insn) >= 0;
43849 gcc_assert (ok);
43851 return true;
43854 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43855 with all elements equal to VAR. Return true if successful. */
43857 static bool
43858 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43859 rtx target, rtx val)
43861 bool ok;
43863 switch (mode)
43865 case E_V2SImode:
43866 case E_V2SFmode:
43867 if (!mmx_ok)
43868 return false;
43869 /* FALLTHRU */
43871 case E_V4DFmode:
43872 case E_V4DImode:
43873 case E_V8SFmode:
43874 case E_V8SImode:
43875 case E_V2DFmode:
43876 case E_V2DImode:
43877 case E_V4SFmode:
43878 case E_V4SImode:
43879 case E_V16SImode:
43880 case E_V8DImode:
43881 case E_V16SFmode:
43882 case E_V8DFmode:
43883 return ix86_vector_duplicate_value (mode, target, val);
43885 case E_V4HImode:
43886 if (!mmx_ok)
43887 return false;
43888 if (TARGET_SSE || TARGET_3DNOW_A)
43890 rtx x;
43892 val = gen_lowpart (SImode, val);
43893 x = gen_rtx_TRUNCATE (HImode, val);
43894 x = gen_rtx_VEC_DUPLICATE (mode, x);
43895 emit_insn (gen_rtx_SET (target, x));
43896 return true;
43898 goto widen;
43900 case E_V8QImode:
43901 if (!mmx_ok)
43902 return false;
43903 goto widen;
43905 case E_V8HImode:
43906 if (TARGET_AVX2)
43907 return ix86_vector_duplicate_value (mode, target, val);
43909 if (TARGET_SSE2)
43911 struct expand_vec_perm_d dperm;
43912 rtx tmp1, tmp2;
43914 permute:
43915 memset (&dperm, 0, sizeof (dperm));
43916 dperm.target = target;
43917 dperm.vmode = mode;
43918 dperm.nelt = GET_MODE_NUNITS (mode);
43919 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43920 dperm.one_operand_p = true;
43922 /* Extend to SImode using a paradoxical SUBREG. */
43923 tmp1 = gen_reg_rtx (SImode);
43924 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43926 /* Insert the SImode value as low element of a V4SImode vector. */
43927 tmp2 = gen_reg_rtx (V4SImode);
43928 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43929 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43931 ok = (expand_vec_perm_1 (&dperm)
43932 || expand_vec_perm_broadcast_1 (&dperm));
43933 gcc_assert (ok);
43934 return ok;
43936 goto widen;
43938 case E_V16QImode:
43939 if (TARGET_AVX2)
43940 return ix86_vector_duplicate_value (mode, target, val);
43942 if (TARGET_SSE2)
43943 goto permute;
43944 goto widen;
43946 widen:
43947 /* Replicate the value once into the next wider mode and recurse. */
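      /* For instance, broadcasting a QImode VAL into V8QImode first forms
	 the HImode value (VAL << 8) | VAL, broadcasts that into V4HImode,
	 and then reinterprets the wider vector as V8QImode.  */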
43949 machine_mode smode, wsmode, wvmode;
43950 rtx x;
43952 smode = GET_MODE_INNER (mode);
43953 wvmode = get_mode_wider_vector (mode);
43954 wsmode = GET_MODE_INNER (wvmode);
43956 val = convert_modes (wsmode, smode, val, true);
43957 x = expand_simple_binop (wsmode, ASHIFT, val,
43958 GEN_INT (GET_MODE_BITSIZE (smode)),
43959 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43960 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43962 x = gen_reg_rtx (wvmode);
43963 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43964 gcc_assert (ok);
43965 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43966 return ok;
43969 case E_V16HImode:
43970 case E_V32QImode:
43971 if (TARGET_AVX2)
43972 return ix86_vector_duplicate_value (mode, target, val);
43973 else
43975 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43976 rtx x = gen_reg_rtx (hvmode);
43978 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43979 gcc_assert (ok);
43981 x = gen_rtx_VEC_CONCAT (mode, x, x);
43982 emit_insn (gen_rtx_SET (target, x));
43984 return true;
43986 case E_V64QImode:
43987 case E_V32HImode:
43988 if (TARGET_AVX512BW)
43989 return ix86_vector_duplicate_value (mode, target, val);
43990 else
43992 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43993 rtx x = gen_reg_rtx (hvmode);
43995 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43996 gcc_assert (ok);
43998 x = gen_rtx_VEC_CONCAT (mode, x, x);
43999 emit_insn (gen_rtx_SET (target, x));
44001 return true;
44003 default:
44004 return false;
44008 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
44009 whose ONE_VAR element is VAR, and other elements are zero. Return true
44010 if successful. */
44012 static bool
44013 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
44014 rtx target, rtx var, int one_var)
44016 machine_mode vsimode;
44017 rtx new_target;
44018 rtx x, tmp;
44019 bool use_vector_set = false;
44021 switch (mode)
44023 case E_V2DImode:
44024 /* For SSE4.1, we normally use vector set. But if the second
44025 element is zero and inter-unit moves are OK, we use movq
44026 instead. */
44027 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
44028 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
44029 && one_var == 0));
44030 break;
44031 case E_V16QImode:
44032 case E_V4SImode:
44033 case E_V4SFmode:
44034 use_vector_set = TARGET_SSE4_1;
44035 break;
44036 case E_V8HImode:
44037 use_vector_set = TARGET_SSE2;
44038 break;
44039 case E_V4HImode:
44040 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
44041 break;
44042 case E_V32QImode:
44043 case E_V16HImode:
44044 case E_V8SImode:
44045 case E_V8SFmode:
44046 case E_V4DFmode:
44047 use_vector_set = TARGET_AVX;
44048 break;
44049 case E_V4DImode:
44050 /* Use ix86_expand_vector_set in 64bit mode only. */
44051 use_vector_set = TARGET_AVX && TARGET_64BIT;
44052 break;
44053 default:
44054 break;
44057 if (use_vector_set)
44059 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
44060 var = force_reg (GET_MODE_INNER (mode), var);
44061 ix86_expand_vector_set (mmx_ok, target, var, one_var);
44062 return true;
44065 switch (mode)
44067 case E_V2SFmode:
44068 case E_V2SImode:
44069 if (!mmx_ok)
44070 return false;
44071 /* FALLTHRU */
44073 case E_V2DFmode:
44074 case E_V2DImode:
44075 if (one_var != 0)
44076 return false;
44077 var = force_reg (GET_MODE_INNER (mode), var);
44078 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
44079 emit_insn (gen_rtx_SET (target, x));
44080 return true;
44082 case E_V4SFmode:
44083 case E_V4SImode:
44084 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
44085 new_target = gen_reg_rtx (mode);
44086 else
44087 new_target = target;
44088 var = force_reg (GET_MODE_INNER (mode), var);
44089 x = gen_rtx_VEC_DUPLICATE (mode, var);
44090 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
44091 emit_insn (gen_rtx_SET (new_target, x));
44092 if (one_var != 0)
44094 /* We need to shuffle the value to the correct position, so
44095 create a new pseudo to store the intermediate result. */
44097 /* With SSE2, we can use the integer shuffle insns. */
44098 if (mode != V4SFmode && TARGET_SSE2)
44100 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
44101 const1_rtx,
44102 GEN_INT (one_var == 1 ? 0 : 1),
44103 GEN_INT (one_var == 2 ? 0 : 1),
44104 GEN_INT (one_var == 3 ? 0 : 1)));
44105 if (target != new_target)
44106 emit_move_insn (target, new_target);
44107 return true;
44110 /* Otherwise convert the intermediate result to V4SFmode and
44111 use the SSE1 shuffle instructions. */
44112 if (mode != V4SFmode)
44114 tmp = gen_reg_rtx (V4SFmode);
44115 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
44117 else
44118 tmp = new_target;
44120 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
44121 const1_rtx,
44122 GEN_INT (one_var == 1 ? 0 : 1),
44123 GEN_INT (one_var == 2 ? 0+4 : 1+4),
44124 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
44126 if (mode != V4SFmode)
44127 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
44128 else if (tmp != target)
44129 emit_move_insn (target, tmp);
44131 else if (target != new_target)
44132 emit_move_insn (target, new_target);
44133 return true;
44135 case E_V8HImode:
44136 case E_V16QImode:
44137 vsimode = V4SImode;
44138 goto widen;
44139 case E_V4HImode:
44140 case E_V8QImode:
44141 if (!mmx_ok)
44142 return false;
44143 vsimode = V2SImode;
44144 goto widen;
44145 widen:
44146 if (one_var != 0)
44147 return false;
44149 /* Zero extend the variable element to SImode and recurse. */
44150 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
44152 x = gen_reg_rtx (vsimode);
44153 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
44154 var, one_var))
44155 gcc_unreachable ();
44157 emit_move_insn (target, gen_lowpart (mode, x));
44158 return true;
44160 default:
44161 return false;
44165 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
44166 consisting of the values in VALS. It is known that all elements
44167 except ONE_VAR are constants. Return true if successful. */
44169 static bool
44170 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
44171 rtx target, rtx vals, int one_var)
44173 rtx var = XVECEXP (vals, 0, one_var);
44174 machine_mode wmode;
44175 rtx const_vec, x;
44177 const_vec = copy_rtx (vals);
44178 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
44179 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
44181 switch (mode)
44183 case E_V2DFmode:
44184 case E_V2DImode:
44185 case E_V2SFmode:
44186 case E_V2SImode:
44187 /* For the two element vectors, it's just as easy to use
44188 the general case. */
44189 return false;
44191 case E_V4DImode:
44192 /* Use ix86_expand_vector_set in 64bit mode only. */
44193 if (!TARGET_64BIT)
44194 return false;
44195 /* FALLTHRU */
44196 case E_V4DFmode:
44197 case E_V8SFmode:
44198 case E_V8SImode:
44199 case E_V16HImode:
44200 case E_V32QImode:
44201 case E_V4SFmode:
44202 case E_V4SImode:
44203 case E_V8HImode:
44204 case E_V4HImode:
44205 break;
44207 case E_V16QImode:
44208 if (TARGET_SSE4_1)
44209 break;
44210 wmode = V8HImode;
44211 goto widen;
44212 case E_V8QImode:
44213 wmode = V4HImode;
44214 goto widen;
44215 widen:
44216 /* There's no way to set one QImode entry easily. Combine
44217 the variable value with its adjacent constant value, and
44218 promote to an HImode set. */
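      /* Concretely, for an odd ONE_VAR the variable byte is shifted into the
	 high half of an HImode word and combined with the adjacent constant
	 byte; for an even ONE_VAR the constant byte is shifted up instead.
	 The combined word is then inserted at element ONE_VAR >> 1 of the
	 wider vector.  */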
44219 x = XVECEXP (vals, 0, one_var ^ 1);
44220 if (one_var & 1)
44222 var = convert_modes (HImode, QImode, var, true);
44223 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
44224 NULL_RTX, 1, OPTAB_LIB_WIDEN);
44225 x = GEN_INT (INTVAL (x) & 0xff);
44227 else
44229 var = convert_modes (HImode, QImode, var, true);
44230 x = gen_int_mode (INTVAL (x) << 8, HImode);
44232 if (x != const0_rtx)
44233 var = expand_simple_binop (HImode, IOR, var, x, var,
44234 1, OPTAB_LIB_WIDEN);
44236 x = gen_reg_rtx (wmode);
44237 emit_move_insn (x, gen_lowpart (wmode, const_vec));
44238 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
44240 emit_move_insn (target, gen_lowpart (mode, x));
44241 return true;
44243 default:
44244 return false;
44247 emit_move_insn (target, const_vec);
44248 ix86_expand_vector_set (mmx_ok, target, var, one_var);
44249 return true;
44252 /* A subroutine of ix86_expand_vector_init_general. Use vector
44253 concatenate to handle the most general case: all values variable,
44254 and none identical. */
44256 static void
44257 ix86_expand_vector_init_concat (machine_mode mode,
44258 rtx target, rtx *ops, int n)
44260 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
44261 rtx first[16], second[8], third[4];
44262 rtvec v;
44263 int i, j;
44265 switch (n)
44267 case 2:
44268 switch (mode)
44270 case E_V16SImode:
44271 cmode = V8SImode;
44272 break;
44273 case E_V16SFmode:
44274 cmode = V8SFmode;
44275 break;
44276 case E_V8DImode:
44277 cmode = V4DImode;
44278 break;
44279 case E_V8DFmode:
44280 cmode = V4DFmode;
44281 break;
44282 case E_V8SImode:
44283 cmode = V4SImode;
44284 break;
44285 case E_V8SFmode:
44286 cmode = V4SFmode;
44287 break;
44288 case E_V4DImode:
44289 cmode = V2DImode;
44290 break;
44291 case E_V4DFmode:
44292 cmode = V2DFmode;
44293 break;
44294 case E_V4SImode:
44295 cmode = V2SImode;
44296 break;
44297 case E_V4SFmode:
44298 cmode = V2SFmode;
44299 break;
44300 case E_V2DImode:
44301 cmode = DImode;
44302 break;
44303 case E_V2SImode:
44304 cmode = SImode;
44305 break;
44306 case E_V2DFmode:
44307 cmode = DFmode;
44308 break;
44309 case E_V2SFmode:
44310 cmode = SFmode;
44311 break;
44312 default:
44313 gcc_unreachable ();
44316 if (!register_operand (ops[1], cmode))
44317 ops[1] = force_reg (cmode, ops[1]);
44318 if (!register_operand (ops[0], cmode))
44319 ops[0] = force_reg (cmode, ops[0]);
44320 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
44321 ops[1])));
44322 break;
44324 case 4:
44325 switch (mode)
44327 case E_V4DImode:
44328 cmode = V2DImode;
44329 break;
44330 case E_V4DFmode:
44331 cmode = V2DFmode;
44332 break;
44333 case E_V4SImode:
44334 cmode = V2SImode;
44335 break;
44336 case E_V4SFmode:
44337 cmode = V2SFmode;
44338 break;
44339 default:
44340 gcc_unreachable ();
44342 goto half;
44344 case 8:
44345 switch (mode)
44347 case E_V8DImode:
44348 cmode = V2DImode;
44349 hmode = V4DImode;
44350 break;
44351 case E_V8DFmode:
44352 cmode = V2DFmode;
44353 hmode = V4DFmode;
44354 break;
44355 case E_V8SImode:
44356 cmode = V2SImode;
44357 hmode = V4SImode;
44358 break;
44359 case E_V8SFmode:
44360 cmode = V2SFmode;
44361 hmode = V4SFmode;
44362 break;
44363 default:
44364 gcc_unreachable ();
44366 goto half;
44368 case 16:
44369 switch (mode)
44371 case E_V16SImode:
44372 cmode = V2SImode;
44373 hmode = V4SImode;
44374 gmode = V8SImode;
44375 break;
44376 case E_V16SFmode:
44377 cmode = V2SFmode;
44378 hmode = V4SFmode;
44379 gmode = V8SFmode;
44380 break;
44381 default:
44382 gcc_unreachable ();
44384 goto half;
44386 half:
44387 /* FIXME: We process inputs backward to help RA. PR 36222. */
44388 i = n - 1;
44389 j = (n >> 1) - 1;
44390 for (; i > 0; i -= 2, j--)
44392 first[j] = gen_reg_rtx (cmode);
44393 v = gen_rtvec (2, ops[i - 1], ops[i]);
44394 ix86_expand_vector_init (false, first[j],
44395 gen_rtx_PARALLEL (cmode, v));
44398 n >>= 1;
44399 if (n > 4)
44401 gcc_assert (hmode != VOIDmode);
44402 gcc_assert (gmode != VOIDmode);
44403 for (i = j = 0; i < n; i += 2, j++)
44405 second[j] = gen_reg_rtx (hmode);
44406 ix86_expand_vector_init_concat (hmode, second [j],
44407 &first [i], 2);
44409 n >>= 1;
44410 for (i = j = 0; i < n; i += 2, j++)
44412 third[j] = gen_reg_rtx (gmode);
44413 ix86_expand_vector_init_concat (gmode, third[j],
44414 &second[i], 2);
44416 n >>= 1;
44417 ix86_expand_vector_init_concat (mode, target, third, n);
44419 else if (n > 2)
44421 gcc_assert (hmode != VOIDmode);
44422 for (i = j = 0; i < n; i += 2, j++)
44424 second[j] = gen_reg_rtx (hmode);
44425 ix86_expand_vector_init_concat (hmode, second [j],
44426 &first [i], 2);
44428 n >>= 1;
44429 ix86_expand_vector_init_concat (mode, target, second, n);
44431 else
44432 ix86_expand_vector_init_concat (mode, target, first, n);
44433 break;
44435 default:
44436 gcc_unreachable ();
44440 /* A subroutine of ix86_expand_vector_init_general. Use vector
44441 interleave to handle the most general case: all values variable,
44442 and none identical. */
44444 static void
44445 ix86_expand_vector_init_interleave (machine_mode mode,
44446 rtx target, rtx *ops, int n)
44448 machine_mode first_imode, second_imode, third_imode, inner_mode;
44449 int i, j;
44450 rtx op0, op1;
44451 rtx (*gen_load_even) (rtx, rtx, rtx);
44452 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
44453 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
44455 switch (mode)
44457 case E_V8HImode:
44458 gen_load_even = gen_vec_setv8hi;
44459 gen_interleave_first_low = gen_vec_interleave_lowv4si;
44460 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44461 inner_mode = HImode;
44462 first_imode = V4SImode;
44463 second_imode = V2DImode;
44464 third_imode = VOIDmode;
44465 break;
44466 case E_V16QImode:
44467 gen_load_even = gen_vec_setv16qi;
44468 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
44469 gen_interleave_second_low = gen_vec_interleave_lowv4si;
44470 inner_mode = QImode;
44471 first_imode = V8HImode;
44472 second_imode = V4SImode;
44473 third_imode = V2DImode;
44474 break;
44475 default:
44476 gcc_unreachable ();
44479 for (i = 0; i < n; i++)
44481 /* Extend the odd element to SImode using a paradoxical SUBREG. */
44482 op0 = gen_reg_rtx (SImode);
44483 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
44485 /* Insert the SImode value as low element of V4SImode vector. */
44486 op1 = gen_reg_rtx (V4SImode);
44487 op0 = gen_rtx_VEC_MERGE (V4SImode,
44488 gen_rtx_VEC_DUPLICATE (V4SImode,
44489 op0),
44490 CONST0_RTX (V4SImode),
44491 const1_rtx);
44492 emit_insn (gen_rtx_SET (op1, op0));
44494 /* Cast the V4SImode vector back to a vector in the original mode. */
44495 op0 = gen_reg_rtx (mode);
44496 emit_move_insn (op0, gen_lowpart (mode, op1));
44498 /* Load even elements into the second position. */
44499 emit_insn (gen_load_even (op0,
44500 force_reg (inner_mode,
44501 ops [i + i + 1]),
44502 const1_rtx));
44504 /* Cast vector to FIRST_IMODE vector. */
44505 ops[i] = gen_reg_rtx (first_imode);
44506 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44509 /* Interleave low FIRST_IMODE vectors. */
44510 for (i = j = 0; i < n; i += 2, j++)
44512 op0 = gen_reg_rtx (first_imode);
44513 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44515 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44516 ops[j] = gen_reg_rtx (second_imode);
44517 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44520 /* Interleave low SECOND_IMODE vectors. */
44521 switch (second_imode)
44523 case E_V4SImode:
44524 for (i = j = 0; i < n / 2; i += 2, j++)
44526 op0 = gen_reg_rtx (second_imode);
44527 emit_insn (gen_interleave_second_low (op0, ops[i],
44528 ops[i + 1]));
44530 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44531 vector. */
44532 ops[j] = gen_reg_rtx (third_imode);
44533 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44535 second_imode = V2DImode;
44536 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44537 /* FALLTHRU */
44539 case E_V2DImode:
44540 op0 = gen_reg_rtx (second_imode);
44541 emit_insn (gen_interleave_second_low (op0, ops[0],
44542 ops[1]));
44544 /* Cast the SECOND_IMODE vector back to a vector in the original
44545 mode. */
44546 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44547 break;
44549 default:
44550 gcc_unreachable ();
44554 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44555 all values variable, and none identical. */
44557 static void
44558 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44559 rtx target, rtx vals)
44561 rtx ops[64], op0, op1, op2, op3, op4, op5;
44562 machine_mode half_mode = VOIDmode;
44563 machine_mode quarter_mode = VOIDmode;
44564 int n, i;
44566 switch (mode)
44568 case E_V2SFmode:
44569 case E_V2SImode:
44570 if (!mmx_ok && !TARGET_SSE)
44571 break;
44572 /* FALLTHRU */
44574 case E_V16SImode:
44575 case E_V16SFmode:
44576 case E_V8DFmode:
44577 case E_V8DImode:
44578 case E_V8SFmode:
44579 case E_V8SImode:
44580 case E_V4DFmode:
44581 case E_V4DImode:
44582 case E_V4SFmode:
44583 case E_V4SImode:
44584 case E_V2DFmode:
44585 case E_V2DImode:
44586 n = GET_MODE_NUNITS (mode);
44587 for (i = 0; i < n; i++)
44588 ops[i] = XVECEXP (vals, 0, i);
44589 ix86_expand_vector_init_concat (mode, target, ops, n);
44590 return;
44592 case E_V2TImode:
44593 for (i = 0; i < 2; i++)
44594 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44595 op0 = gen_reg_rtx (V4DImode);
44596 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
44597 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44598 return;
44600 case E_V4TImode:
44601 for (i = 0; i < 4; i++)
44602 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44603 ops[4] = gen_reg_rtx (V4DImode);
44604 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
44605 ops[5] = gen_reg_rtx (V4DImode);
44606 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
44607 op0 = gen_reg_rtx (V8DImode);
44608 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
44609 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44610 return;
44612 case E_V32QImode:
44613 half_mode = V16QImode;
44614 goto half;
44616 case E_V16HImode:
44617 half_mode = V8HImode;
44618 goto half;
44620 half:
44621 n = GET_MODE_NUNITS (mode);
44622 for (i = 0; i < n; i++)
44623 ops[i] = XVECEXP (vals, 0, i);
44624 op0 = gen_reg_rtx (half_mode);
44625 op1 = gen_reg_rtx (half_mode);
44626 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44627 n >> 2);
44628 ix86_expand_vector_init_interleave (half_mode, op1,
44629 &ops [n >> 1], n >> 2);
44630 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44631 return;
44633 case E_V64QImode:
44634 quarter_mode = V16QImode;
44635 half_mode = V32QImode;
44636 goto quarter;
44638 case E_V32HImode:
44639 quarter_mode = V8HImode;
44640 half_mode = V16HImode;
44641 goto quarter;
44643 quarter:
44644 n = GET_MODE_NUNITS (mode);
44645 for (i = 0; i < n; i++)
44646 ops[i] = XVECEXP (vals, 0, i);
44647 op0 = gen_reg_rtx (quarter_mode);
44648 op1 = gen_reg_rtx (quarter_mode);
44649 op2 = gen_reg_rtx (quarter_mode);
44650 op3 = gen_reg_rtx (quarter_mode);
44651 op4 = gen_reg_rtx (half_mode);
44652 op5 = gen_reg_rtx (half_mode);
44653 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44654 n >> 3);
44655 ix86_expand_vector_init_interleave (quarter_mode, op1,
44656 &ops [n >> 2], n >> 3);
44657 ix86_expand_vector_init_interleave (quarter_mode, op2,
44658 &ops [n >> 1], n >> 3);
44659 ix86_expand_vector_init_interleave (quarter_mode, op3,
44660 &ops [(n >> 1) | (n >> 2)], n >> 3);
44661 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44662 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44663 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44664 return;
44666 case E_V16QImode:
44667 if (!TARGET_SSE4_1)
44668 break;
44669 /* FALLTHRU */
44671 case E_V8HImode:
44672 if (!TARGET_SSE2)
44673 break;
44675 /* Don't use ix86_expand_vector_init_interleave if we can't
44676 move from GPR to SSE register directly. */
44677 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44678 break;
44680 n = GET_MODE_NUNITS (mode);
44681 for (i = 0; i < n; i++)
44682 ops[i] = XVECEXP (vals, 0, i);
44683 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44684 return;
44686 case E_V4HImode:
44687 case E_V8QImode:
44688 break;
44690 default:
44691 gcc_unreachable ();
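  /* Fallback for the remaining modes: pack the vector elements into
     word_mode integers using shifts and ORs, then assemble the vector
     from one, two or four such words.  */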
44695 int i, j, n_elts, n_words, n_elt_per_word;
44696 machine_mode inner_mode;
44697 rtx words[4], shift;
44699 inner_mode = GET_MODE_INNER (mode);
44700 n_elts = GET_MODE_NUNITS (mode);
44701 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44702 n_elt_per_word = n_elts / n_words;
44703 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
44705 for (i = 0; i < n_words; ++i)
44707 rtx word = NULL_RTX;
44709 for (j = 0; j < n_elt_per_word; ++j)
44711 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44712 elt = convert_modes (word_mode, inner_mode, elt, true);
44714 if (j == 0)
44715 word = elt;
44716 else
44718 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44719 word, 1, OPTAB_LIB_WIDEN);
44720 word = expand_simple_binop (word_mode, IOR, word, elt,
44721 word, 1, OPTAB_LIB_WIDEN);
44725 words[i] = word;
44728 if (n_words == 1)
44729 emit_move_insn (target, gen_lowpart (mode, words[0]));
44730 else if (n_words == 2)
44732 rtx tmp = gen_reg_rtx (mode);
44733 emit_clobber (tmp);
44734 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44735 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44736 emit_move_insn (target, tmp);
44738 else if (n_words == 4)
44740 rtx tmp = gen_reg_rtx (V4SImode);
44741 gcc_assert (word_mode == SImode);
44742 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44743 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44744 emit_move_insn (target, gen_lowpart (mode, tmp));
44746 else
44747 gcc_unreachable ();
44751 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44752 instructions unless MMX_OK is true. */
44754 void
44755 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44757 machine_mode mode = GET_MODE (target);
44758 machine_mode inner_mode = GET_MODE_INNER (mode);
44759 int n_elts = GET_MODE_NUNITS (mode);
44760 int n_var = 0, one_var = -1;
44761 bool all_same = true, all_const_zero = true;
44762 int i;
44763 rtx x;
44765 /* Handle first initialization from vector elts. */
44766 if (n_elts != XVECLEN (vals, 0))
44768 rtx subtarget = target;
44769 x = XVECEXP (vals, 0, 0);
44770 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
44771 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
44773 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
44774 if (inner_mode == QImode || inner_mode == HImode)
44776 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
44777 mode = mode_for_vector (SImode, n_bits / 4).require ();
44778 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
44779 ops[0] = gen_lowpart (inner_mode, ops[0]);
44780 ops[1] = gen_lowpart (inner_mode, ops[1]);
44781 subtarget = gen_reg_rtx (mode);
44783 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
44784 if (subtarget != target)
44785 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
44786 return;
44788 gcc_unreachable ();
44791 for (i = 0; i < n_elts; ++i)
44793 x = XVECEXP (vals, 0, i);
44794 if (!(CONST_SCALAR_INT_P (x)
44795 || CONST_DOUBLE_P (x)
44796 || CONST_FIXED_P (x)))
44797 n_var++, one_var = i;
44798 else if (x != CONST0_RTX (inner_mode))
44799 all_const_zero = false;
44800 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44801 all_same = false;
44804 /* Constants are best loaded from the constant pool. */
44805 if (n_var == 0)
44807 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44808 return;
44811 /* If all values are identical, broadcast the value. */
44812 if (all_same
44813 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44814 XVECEXP (vals, 0, 0)))
44815 return;
44817 /* Values where only one field is non-constant are best loaded from
44818 the pool and overwritten via move later. */
44819 if (n_var == 1)
44821 if (all_const_zero
44822 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44823 XVECEXP (vals, 0, one_var),
44824 one_var))
44825 return;
44827 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44828 return;
44831 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44834 void
44835 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44837 machine_mode mode = GET_MODE (target);
44838 machine_mode inner_mode = GET_MODE_INNER (mode);
44839 machine_mode half_mode;
44840 bool use_vec_merge = false;
44841 rtx tmp;
44842 static rtx (*gen_extract[6][2]) (rtx, rtx)
44844 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44845 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44846 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44847 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44848 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44849 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44851 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44853 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44854 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44855 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44856 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44857 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44858 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44860 int i, j, n;
44861 machine_mode mmode = VOIDmode;
44862 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44864 switch (mode)
44866 case E_V2SFmode:
44867 case E_V2SImode:
44868 if (mmx_ok)
44870 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44871 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44872 if (elt == 0)
44873 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44874 else
44875 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44876 emit_insn (gen_rtx_SET (target, tmp));
44877 return;
44879 break;
44881 case E_V2DImode:
44882 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44883 if (use_vec_merge)
44884 break;
44886 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44887 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44888 if (elt == 0)
44889 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44890 else
44891 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44892 emit_insn (gen_rtx_SET (target, tmp));
44893 return;
44895 case E_V2DFmode:
44897 rtx op0, op1;
44899 /* For the two element vectors, we implement a VEC_CONCAT with
44900 the extraction of the other element. */
44902 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44903 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44905 if (elt == 0)
44906 op0 = val, op1 = tmp;
44907 else
44908 op0 = tmp, op1 = val;
44910 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44911 emit_insn (gen_rtx_SET (target, tmp));
44913 return;
44915 case E_V4SFmode:
44916 use_vec_merge = TARGET_SSE4_1;
44917 if (use_vec_merge)
44918 break;
44920 switch (elt)
44922 case 0:
44923 use_vec_merge = true;
44924 break;
44926 case 1:
44927 /* tmp = target = A B C D */
44928 tmp = copy_to_reg (target);
44929 /* target = A A B B */
44930 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44931 /* target = X A B B */
44932 ix86_expand_vector_set (false, target, val, 0);
44933 /* target = A X C D */
44934 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44935 const1_rtx, const0_rtx,
44936 GEN_INT (2+4), GEN_INT (3+4)));
44937 return;
44939 case 2:
44940 /* tmp = target = A B C D */
44941 tmp = copy_to_reg (target);
44942 /* tmp = X B C D */
44943 ix86_expand_vector_set (false, tmp, val, 0);
44944 /* target = A B X D */
44945 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44946 const0_rtx, const1_rtx,
44947 GEN_INT (0+4), GEN_INT (3+4)));
44948 return;
44950 case 3:
44951 /* tmp = target = A B C D */
44952 tmp = copy_to_reg (target);
44953 /* tmp = X B C D */
44954 ix86_expand_vector_set (false, tmp, val, 0);
44955 /* target = A B X D */
44956 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44957 const0_rtx, const1_rtx,
44958 GEN_INT (2+4), GEN_INT (0+4)));
44959 return;
44961 default:
44962 gcc_unreachable ();
44964 break;
44966 case E_V4SImode:
44967 use_vec_merge = TARGET_SSE4_1;
44968 if (use_vec_merge)
44969 break;
44971 /* Element 0 handled by vec_merge below. */
44972 if (elt == 0)
44974 use_vec_merge = true;
44975 break;
44978 if (TARGET_SSE2)
44980 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44981 store into element 0, then shuffle them back. */
44983 rtx order[4];
44985 order[0] = GEN_INT (elt);
44986 order[1] = const1_rtx;
44987 order[2] = const2_rtx;
44988 order[3] = GEN_INT (3);
44989 order[elt] = const0_rtx;
44991 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44992 order[1], order[2], order[3]));
44994 ix86_expand_vector_set (false, target, val, 0);
44996 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44997 order[1], order[2], order[3]));
44999 else
45001 /* For SSE1, we have to reuse the V4SF code. */
45002 rtx t = gen_reg_rtx (V4SFmode);
45003 emit_move_insn (t, gen_lowpart (V4SFmode, target));
45004 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
45005 emit_move_insn (target, gen_lowpart (mode, t));
45007 return;
45009 case E_V8HImode:
45010 use_vec_merge = TARGET_SSE2;
45011 break;
45012 case E_V4HImode:
45013 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
45014 break;
45016 case E_V16QImode:
45017 use_vec_merge = TARGET_SSE4_1;
45018 break;
45020 case E_V8QImode:
45021 break;
45023 case E_V32QImode:
45024 half_mode = V16QImode;
45025 j = 0;
45026 n = 16;
45027 goto half;
45029 case E_V16HImode:
45030 half_mode = V8HImode;
45031 j = 1;
45032 n = 8;
45033 goto half;
45035 case E_V8SImode:
45036 half_mode = V4SImode;
45037 j = 2;
45038 n = 4;
45039 goto half;
45041 case E_V4DImode:
45042 half_mode = V2DImode;
45043 j = 3;
45044 n = 2;
45045 goto half;
45047 case E_V8SFmode:
45048 half_mode = V4SFmode;
45049 j = 4;
45050 n = 4;
45051 goto half;
45053 case E_V4DFmode:
45054 half_mode = V2DFmode;
45055 j = 5;
45056 n = 2;
45057 goto half;
45059 half:
45060 /* Compute offset. */
45061 i = elt / n;
45062 elt %= n;
45064 gcc_assert (i <= 1);
45066 /* Extract the half. */
45067 tmp = gen_reg_rtx (half_mode);
45068 emit_insn (gen_extract[j][i] (tmp, target));
45070 /* Put val in tmp at elt. */
45071 ix86_expand_vector_set (false, tmp, val, elt);
45073 /* Put it back. */
45074 emit_insn (gen_insert[j][i] (target, target, tmp));
45075 return;
45077 case E_V8DFmode:
45078 if (TARGET_AVX512F)
45080 mmode = QImode;
45081 gen_blendm = gen_avx512f_blendmv8df;
45083 break;
45085 case E_V8DImode:
45086 if (TARGET_AVX512F)
45088 mmode = QImode;
45089 gen_blendm = gen_avx512f_blendmv8di;
45091 break;
45093 case E_V16SFmode:
45094 if (TARGET_AVX512F)
45096 mmode = HImode;
45097 gen_blendm = gen_avx512f_blendmv16sf;
45099 break;
45101 case E_V16SImode:
45102 if (TARGET_AVX512F)
45104 mmode = HImode;
45105 gen_blendm = gen_avx512f_blendmv16si;
45107 break;
45109 case E_V32HImode:
45110 if (TARGET_AVX512F && TARGET_AVX512BW)
45112 mmode = SImode;
45113 gen_blendm = gen_avx512bw_blendmv32hi;
45115 break;
45117 case E_V64QImode:
45118 if (TARGET_AVX512F && TARGET_AVX512BW)
45120 mmode = DImode;
45121 gen_blendm = gen_avx512bw_blendmv64qi;
45123 break;
45125 default:
45126 break;
45129 if (mmode != VOIDmode)
45131 tmp = gen_reg_rtx (mode);
45132 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
45133 /* The avx512*_blendm<mode> expanders have different operand order
45134 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
45135 elements where the mask is set and second input operand otherwise,
45136 in {sse,avx}*_*blend* the first input operand is used for elements
45137 where the mask is clear and second input operand otherwise. */
45138 emit_insn (gen_blendm (target, target, tmp,
45139 force_reg (mmode,
45140 gen_int_mode (1 << elt, mmode))));
45142 else if (use_vec_merge)
45144 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
45145 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
45146 emit_insn (gen_rtx_SET (target, tmp));
45148 else
45150 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
45152 emit_move_insn (mem, target);
45154 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
45155 emit_move_insn (tmp, val);
45157 emit_move_insn (target, mem);
45161 void
45162 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
45164 machine_mode mode = GET_MODE (vec);
45165 machine_mode inner_mode = GET_MODE_INNER (mode);
45166 bool use_vec_extr = false;
45167 rtx tmp;
45169 switch (mode)
45171 case E_V2SImode:
45172 case E_V2SFmode:
45173 if (!mmx_ok)
45174 break;
45175 /* FALLTHRU */
45177 case E_V2DFmode:
45178 case E_V2DImode:
45179 case E_V2TImode:
45180 case E_V4TImode:
45181 use_vec_extr = true;
45182 break;
45184 case E_V4SFmode:
45185 use_vec_extr = TARGET_SSE4_1;
45186 if (use_vec_extr)
45187 break;
45189 switch (elt)
45191 case 0:
45192 tmp = vec;
45193 break;
45195 case 1:
45196 case 3:
45197 tmp = gen_reg_rtx (mode);
45198 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
45199 GEN_INT (elt), GEN_INT (elt),
45200 GEN_INT (elt+4), GEN_INT (elt+4)));
45201 break;
45203 case 2:
45204 tmp = gen_reg_rtx (mode);
45205 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
45206 break;
45208 default:
45209 gcc_unreachable ();
45211 vec = tmp;
45212 use_vec_extr = true;
45213 elt = 0;
45214 break;
45216 case E_V4SImode:
45217 use_vec_extr = TARGET_SSE4_1;
45218 if (use_vec_extr)
45219 break;
45221 if (TARGET_SSE2)
45223 switch (elt)
45225 case 0:
45226 tmp = vec;
45227 break;
45229 case 1:
45230 case 3:
45231 tmp = gen_reg_rtx (mode);
45232 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
45233 GEN_INT (elt), GEN_INT (elt),
45234 GEN_INT (elt), GEN_INT (elt)));
45235 break;
45237 case 2:
45238 tmp = gen_reg_rtx (mode);
45239 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
45240 break;
45242 default:
45243 gcc_unreachable ();
45245 vec = tmp;
45246 use_vec_extr = true;
45247 elt = 0;
45249 else
45251 /* For SSE1, we have to reuse the V4SF code. */
45252 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
45253 gen_lowpart (V4SFmode, vec), elt);
45254 return;
45256 break;
45258 case E_V8HImode:
45259 use_vec_extr = TARGET_SSE2;
45260 break;
45261 case E_V4HImode:
45262 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
45263 break;
45265 case E_V16QImode:
45266 use_vec_extr = TARGET_SSE4_1;
45267 break;
45269 case E_V8SFmode:
45270 if (TARGET_AVX)
45272 tmp = gen_reg_rtx (V4SFmode);
45273 if (elt < 4)
45274 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
45275 else
45276 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
45277 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45278 return;
45280 break;
45282 case E_V4DFmode:
45283 if (TARGET_AVX)
45285 tmp = gen_reg_rtx (V2DFmode);
45286 if (elt < 2)
45287 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
45288 else
45289 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
45290 ix86_expand_vector_extract (false, target, tmp, elt & 1);
45291 return;
45293 break;
45295 case E_V32QImode:
45296 if (TARGET_AVX)
45298 tmp = gen_reg_rtx (V16QImode);
45299 if (elt < 16)
45300 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
45301 else
45302 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
45303 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45304 return;
45306 break;
45308 case E_V16HImode:
45309 if (TARGET_AVX)
45311 tmp = gen_reg_rtx (V8HImode);
45312 if (elt < 8)
45313 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
45314 else
45315 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
45316 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45317 return;
45319 break;
45321 case E_V8SImode:
45322 if (TARGET_AVX)
45324 tmp = gen_reg_rtx (V4SImode);
45325 if (elt < 4)
45326 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
45327 else
45328 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
45329 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45330 return;
45332 break;
45334 case E_V4DImode:
45335 if (TARGET_AVX)
45337 tmp = gen_reg_rtx (V2DImode);
45338 if (elt < 2)
45339 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
45340 else
45341 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
45342 ix86_expand_vector_extract (false, target, tmp, elt & 1);
45343 return;
45345 break;
45347 case E_V32HImode:
45348 if (TARGET_AVX512BW)
45350 tmp = gen_reg_rtx (V16HImode);
45351 if (elt < 16)
45352 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
45353 else
45354 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
45355 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45356 return;
45358 break;
45360 case E_V64QImode:
45361 if (TARGET_AVX512BW)
45363 tmp = gen_reg_rtx (V32QImode);
45364 if (elt < 32)
45365 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
45366 else
45367 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
45368 ix86_expand_vector_extract (false, target, tmp, elt & 31);
45369 return;
45371 break;
45373 case E_V16SFmode:
45374 tmp = gen_reg_rtx (V8SFmode);
45375 if (elt < 8)
45376 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
45377 else
45378 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
45379 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45380 return;
45382 case E_V8DFmode:
45383 tmp = gen_reg_rtx (V4DFmode);
45384 if (elt < 4)
45385 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
45386 else
45387 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
45388 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45389 return;
45391 case E_V16SImode:
45392 tmp = gen_reg_rtx (V8SImode);
45393 if (elt < 8)
45394 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
45395 else
45396 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
45397 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45398 return;
45400 case E_V8DImode:
45401 tmp = gen_reg_rtx (V4DImode);
45402 if (elt < 4)
45403 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
45404 else
45405 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
45406 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45407 return;
45409 case E_V8QImode:
45410 /* ??? Could extract the appropriate HImode element and shift. */
45411 default:
45412 break;
45415 if (use_vec_extr)
45417 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
45418 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
45420 /* Let the rtl optimizers know about the zero extension performed. */
45421 if (inner_mode == QImode || inner_mode == HImode)
45423 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
45424 target = gen_lowpart (SImode, target);
45427 emit_insn (gen_rtx_SET (target, tmp));
45429 else
45431 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
45433 emit_move_insn (mem, vec);
45435 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
45436 emit_move_insn (target, tmp);
45440 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
45441 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
45442 The upper bits of DEST are undefined, though they shouldn't cause
45443 exceptions (some bits from src or all zeros are ok). */
45445 static void
45446 emit_reduc_half (rtx dest, rtx src, int i)
45448 rtx tem, d = dest;
45449 switch (GET_MODE (src))
45451 case E_V4SFmode:
45452 if (i == 128)
45453 tem = gen_sse_movhlps (dest, src, src);
45454 else
45455 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
45456 GEN_INT (1 + 4), GEN_INT (1 + 4));
45457 break;
45458 case E_V2DFmode:
45459 tem = gen_vec_interleave_highv2df (dest, src, src);
45460 break;
45461 case E_V16QImode:
45462 case E_V8HImode:
45463 case E_V4SImode:
45464 case E_V2DImode:
45465 d = gen_reg_rtx (V1TImode);
45466 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
45467 GEN_INT (i / 2));
45468 break;
45469 case E_V8SFmode:
45470 if (i == 256)
45471 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
45472 else
45473 tem = gen_avx_shufps256 (dest, src, src,
45474 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
45475 break;
45476 case E_V4DFmode:
45477 if (i == 256)
45478 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
45479 else
45480 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
45481 break;
45482 case E_V32QImode:
45483 case E_V16HImode:
45484 case E_V8SImode:
45485 case E_V4DImode:
45486 if (i == 256)
45488 if (GET_MODE (dest) != V4DImode)
45489 d = gen_reg_rtx (V4DImode);
45490 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
45491 gen_lowpart (V4DImode, src),
45492 const1_rtx);
45494 else
45496 d = gen_reg_rtx (V2TImode);
45497 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
45498 GEN_INT (i / 2));
45500 break;
45501 case E_V64QImode:
45502 case E_V32HImode:
45503 case E_V16SImode:
45504 case E_V16SFmode:
45505 case E_V8DImode:
45506 case E_V8DFmode:
45507 if (i > 128)
45508 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
45509 gen_lowpart (V16SImode, src),
45510 gen_lowpart (V16SImode, src),
45511 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
45512 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
45513 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
45514 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
45515 GEN_INT (0xC), GEN_INT (0xD),
45516 GEN_INT (0xE), GEN_INT (0xF),
45517 GEN_INT (0x10), GEN_INT (0x11),
45518 GEN_INT (0x12), GEN_INT (0x13),
45519 GEN_INT (0x14), GEN_INT (0x15),
45520 GEN_INT (0x16), GEN_INT (0x17));
45521 else
45522 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
45523 gen_lowpart (V16SImode, src),
45524 GEN_INT (i == 128 ? 0x2 : 0x1),
45525 GEN_INT (0x3),
45526 GEN_INT (0x3),
45527 GEN_INT (0x3),
45528 GEN_INT (i == 128 ? 0x6 : 0x5),
45529 GEN_INT (0x7),
45530 GEN_INT (0x7),
45531 GEN_INT (0x7),
45532 GEN_INT (i == 128 ? 0xA : 0x9),
45533 GEN_INT (0xB),
45534 GEN_INT (0xB),
45535 GEN_INT (0xB),
45536 GEN_INT (i == 128 ? 0xE : 0xD),
45537 GEN_INT (0xF),
45538 GEN_INT (0xF),
45539 GEN_INT (0xF));
45540 break;
45541 default:
45542 gcc_unreachable ();
45544 emit_insn (tem);
45545 if (d != dest)
45546 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
45549 /* Expand a vector reduction. FN is the binary pattern to reduce;
45550 DEST is the destination; IN is the input vector. */
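/* Overview of the expansion below: emit_reduc_half repeatedly moves the
   upper half of the still-live elements of VEC down, FN combines it with
   the lower half, and the width is halved each iteration until only the
   lowest element carries the reduced value.  */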
45552 void
45553 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45555 rtx half, dst, vec = in;
45556 machine_mode mode = GET_MODE (in);
45557 int i;
45559 /* SSE4.1 has a special instruction, phminposuw, for V8HImode UMIN reduction. */
45560 if (TARGET_SSE4_1
45561 && mode == V8HImode
45562 && fn == gen_uminv8hi3)
45564 emit_insn (gen_sse4_1_phminposuw (dest, in));
45565 return;
45568 for (i = GET_MODE_BITSIZE (mode);
45569 i > GET_MODE_UNIT_BITSIZE (mode);
45570 i >>= 1)
45572 half = gen_reg_rtx (mode);
45573 emit_reduc_half (half, vec, i);
45574 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45575 dst = dest;
45576 else
45577 dst = gen_reg_rtx (mode);
45578 emit_insn (fn (dst, half, vec));
45579 vec = dst;
45583 /* Target hook for scalar_mode_supported_p. */
45584 static bool
45585 ix86_scalar_mode_supported_p (scalar_mode mode)
45587 if (DECIMAL_FLOAT_MODE_P (mode))
45588 return default_decimal_float_supported_p ();
45589 else if (mode == TFmode)
45590 return true;
45591 else
45592 return default_scalar_mode_supported_p (mode);
45595 /* Implements target hook vector_mode_supported_p. */
45596 static bool
45597 ix86_vector_mode_supported_p (machine_mode mode)
45599 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45600 return true;
45601 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45602 return true;
45603 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45604 return true;
45605 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45606 return true;
45607 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45608 return true;
45609 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45610 return true;
45611 return false;
45614 /* Target hook for c_mode_for_suffix. */
45615 static machine_mode
45616 ix86_c_mode_for_suffix (char suffix)
45618 if (suffix == 'q')
45619 return TFmode;
45620 if (suffix == 'w')
45621 return XFmode;
45623 return VOIDmode;
45626 /* Worker function for TARGET_MD_ASM_ADJUST.
45628 We implement asm flag outputs, and maintain source compatibility
45629 with the old cc0-based compiler. */
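/* An output constraint of the form "=@ccCOND" (or "=@ccnCOND" to negate
   the condition) requests the named x86 condition code as an asm output;
   the loop below rewrites such outputs to read the flags register and to
   materialize a 0/1 value in the output's integer mode.  */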
45631 static rtx_insn *
45632 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45633 vec<const char *> &constraints,
45634 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45636 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45637 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45639 bool saw_asm_flag = false;
45641 start_sequence ();
45642 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45644 const char *con = constraints[i];
45645 if (strncmp (con, "=@cc", 4) != 0)
45646 continue;
45647 con += 4;
45648 if (strchr (con, ',') != NULL)
45650 error ("alternatives not allowed in asm flag output");
45651 continue;
45654 bool invert = false;
45655 if (con[0] == 'n')
45656 invert = true, con++;
45658 machine_mode mode = CCmode;
45659 rtx_code code = UNKNOWN;
45661 switch (con[0])
45663 case 'a':
45664 if (con[1] == 0)
45665 mode = CCAmode, code = EQ;
45666 else if (con[1] == 'e' && con[2] == 0)
45667 mode = CCCmode, code = NE;
45668 break;
45669 case 'b':
45670 if (con[1] == 0)
45671 mode = CCCmode, code = EQ;
45672 else if (con[1] == 'e' && con[2] == 0)
45673 mode = CCAmode, code = NE;
45674 break;
45675 case 'c':
45676 if (con[1] == 0)
45677 mode = CCCmode, code = EQ;
45678 break;
45679 case 'e':
45680 if (con[1] == 0)
45681 mode = CCZmode, code = EQ;
45682 break;
45683 case 'g':
45684 if (con[1] == 0)
45685 mode = CCGCmode, code = GT;
45686 else if (con[1] == 'e' && con[2] == 0)
45687 mode = CCGCmode, code = GE;
45688 break;
45689 case 'l':
45690 if (con[1] == 0)
45691 mode = CCGCmode, code = LT;
45692 else if (con[1] == 'e' && con[2] == 0)
45693 mode = CCGCmode, code = LE;
45694 break;
45695 case 'o':
45696 if (con[1] == 0)
45697 mode = CCOmode, code = EQ;
45698 break;
45699 case 'p':
45700 if (con[1] == 0)
45701 mode = CCPmode, code = EQ;
45702 break;
45703 case 's':
45704 if (con[1] == 0)
45705 mode = CCSmode, code = EQ;
45706 break;
45707 case 'z':
45708 if (con[1] == 0)
45709 mode = CCZmode, code = EQ;
45710 break;
45712 if (code == UNKNOWN)
45714 error ("unknown asm flag output %qs", constraints[i]);
45715 continue;
45717 if (invert)
45718 code = reverse_condition (code);
45720 rtx dest = outputs[i];
45721 if (!saw_asm_flag)
45723 /* This is the first asm flag output. Here we put the flags
45724 register in as the real output and adjust the condition to
45725 allow it. */
45726 constraints[i] = "=Bf";
45727 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45728 saw_asm_flag = true;
45730 else
45732 /* We don't need the flags register as output twice. */
45733 constraints[i] = "=X";
45734 outputs[i] = gen_rtx_SCRATCH (SImode);
45737 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45738 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45740 machine_mode dest_mode = GET_MODE (dest);
45741 if (!SCALAR_INT_MODE_P (dest_mode))
45743 error ("invalid type for asm flag output");
45744 continue;
45747 if (dest_mode == DImode && !TARGET_64BIT)
45748 dest_mode = SImode;
45750 if (dest_mode != QImode)
45752 rtx destqi = gen_reg_rtx (QImode);
45753 emit_insn (gen_rtx_SET (destqi, x));
45755 if (TARGET_ZERO_EXTEND_WITH_AND
45756 && optimize_function_for_speed_p (cfun))
45758 x = force_reg (dest_mode, const0_rtx);
45760 emit_insn (gen_movstrictqi
45761 (gen_lowpart (QImode, x), destqi));
45763 else
45764 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45767 if (dest_mode != GET_MODE (dest))
45769 rtx tmp = gen_reg_rtx (SImode);
45771 emit_insn (gen_rtx_SET (tmp, x));
45772 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45774 else
45775 emit_insn (gen_rtx_SET (dest, x));
45777 rtx_insn *seq = get_insns ();
45778 end_sequence ();
45780 if (saw_asm_flag)
45781 return seq;
45782 else
45784 /* If we had no asm flag outputs, clobber the flags. */
45785 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45786 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45787 return NULL;
45791 /* Implement the target hook targetm.encode_section_info. */
45793 static void ATTRIBUTE_UNUSED
45794 ix86_encode_section_info (tree decl, rtx rtl, int first)
45796 default_encode_section_info (decl, rtl, first);
45798 if (ix86_in_large_data_p (decl))
45799 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45802 /* Worker function for REVERSE_CONDITION. */
45804 enum rtx_code
45805 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45807 return (mode != CCFPmode && mode != CCFPUmode
45808 ? reverse_condition (code)
45809 : reverse_condition_maybe_unordered (code));
45812 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45813 to OPERANDS[0]. */
45815 const char *
45816 output_387_reg_move (rtx_insn *insn, rtx *operands)
45818 if (REG_P (operands[0]))
45820 if (REG_P (operands[1])
45821 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45823 if (REGNO (operands[0]) == FIRST_STACK_REG)
45824 return output_387_ffreep (operands, 0);
45825 return "fstp\t%y0";
45827 if (STACK_TOP_P (operands[0]))
45828 return "fld%Z1\t%y1";
45829 return "fst\t%y0";
45831 else if (MEM_P (operands[0]))
45833 gcc_assert (REG_P (operands[1]));
45834 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45835 return "fstp%Z0\t%y0";
45836 else
45838 /* There is no non-popping store to memory for XFmode.
45839 So if we need one, follow the store with a load. */
45840 if (GET_MODE (operands[0]) == XFmode)
45841 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45842 else
45843 return "fst%Z0\t%y0";
45846 else
45847 gcc_unreachable();
45850 /* Output code to perform a conditional jump to LABEL, if C2 flag in
45851 FP status register is set. */
45853 void
45854 ix86_emit_fp_unordered_jump (rtx label)
45856 rtx reg = gen_reg_rtx (HImode);
45857 rtx temp;
45859 emit_insn (gen_x86_fnstsw_1 (reg));
45861 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45863 emit_insn (gen_x86_sahf_1 (reg));
45865 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45866 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45868 else
45870 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
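/* The 0x04 mask selects the C2 condition bit (bit 10 of the FPU status
   word) within the high byte stored by fnstsw.  */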
45872 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45873 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45876 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45877 gen_rtx_LABEL_REF (VOIDmode, label),
45878 pc_rtx);
45879 temp = gen_rtx_SET (pc_rtx, temp);
45881 emit_jump_insn (temp);
45882 predict_jump (REG_BR_PROB_BASE * 10 / 100);
45885 /* Output code to perform a log1p XFmode calculation. */
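/* log1p (x) is computed as fldln2 * fyl2xp1 (x) when |x| is below
   1 - sqrt(2)/2 (the threshold constant used below, which bounds the
   input range fyl2xp1 is meant for), and as fldln2 * fyl2x (1 + x)
   otherwise.  */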
45887 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45889 rtx_code_label *label1 = gen_label_rtx ();
45890 rtx_code_label *label2 = gen_label_rtx ();
45892 rtx tmp = gen_reg_rtx (XFmode);
45893 rtx tmp2 = gen_reg_rtx (XFmode);
45894 rtx test;
45896 emit_insn (gen_absxf2 (tmp, op1));
45897 test = gen_rtx_GE (VOIDmode, tmp,
45898 const_double_from_real_value (
45899 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45900 XFmode));
45901 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45903 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45904 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45905 emit_jump (label2);
45907 emit_label (label1);
45908 emit_move_insn (tmp, CONST1_RTX (XFmode));
45909 emit_insn (gen_addxf3 (tmp, op1, tmp));
45910 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45911 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45913 emit_label (label2);
45916 /* Emit code for round calculation. */
45917 void ix86_emit_i387_round (rtx op0, rtx op1)
45919 machine_mode inmode = GET_MODE (op1);
45920 machine_mode outmode = GET_MODE (op0);
45921 rtx e1, e2, res, tmp, tmp1, half;
45922 rtx scratch = gen_reg_rtx (HImode);
45923 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45924 rtx_code_label *jump_label = gen_label_rtx ();
45925 rtx insn;
45926 rtx (*gen_abs) (rtx, rtx);
45927 rtx (*gen_neg) (rtx, rtx);
45929 switch (inmode)
45931 case E_SFmode:
45932 gen_abs = gen_abssf2;
45933 break;
45934 case E_DFmode:
45935 gen_abs = gen_absdf2;
45936 break;
45937 case E_XFmode:
45938 gen_abs = gen_absxf2;
45939 break;
45940 default:
45941 gcc_unreachable ();
45944 switch (outmode)
45946 case E_SFmode:
45947 gen_neg = gen_negsf2;
45948 break;
45949 case E_DFmode:
45950 gen_neg = gen_negdf2;
45951 break;
45952 case E_XFmode:
45953 gen_neg = gen_negxf2;
45954 break;
45955 case E_HImode:
45956 gen_neg = gen_neghi2;
45957 break;
45958 case E_SImode:
45959 gen_neg = gen_negsi2;
45960 break;
45961 case E_DImode:
45962 gen_neg = gen_negdi2;
45963 break;
45964 default:
45965 gcc_unreachable ();
45968 e1 = gen_reg_rtx (inmode);
45969 e2 = gen_reg_rtx (inmode);
45970 res = gen_reg_rtx (outmode);
45972 half = const_double_from_real_value (dconsthalf, inmode);
45974 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
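/* floor (fabs (a) + 0.5) rounds halfway cases away from zero, matching
   the C round () semantics; the sign is restored afterwards from the
   fxam result.  */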
45976 /* scratch = fxam(op1) */
45977 emit_insn (gen_rtx_SET (scratch,
45978 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45979 UNSPEC_FXAM)));
45980 /* e1 = fabs(op1) */
45981 emit_insn (gen_abs (e1, op1));
45983 /* e2 = e1 + 0.5 */
45984 half = force_reg (inmode, half);
45985 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45987 /* res = floor(e2) */
45988 if (inmode != XFmode)
45990 tmp1 = gen_reg_rtx (XFmode);
45992 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45994 else
45995 tmp1 = e2;
45997 switch (outmode)
45999 case E_SFmode:
46000 case E_DFmode:
46002 rtx tmp0 = gen_reg_rtx (XFmode);
46004 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
46006 emit_insn (gen_rtx_SET (res,
46007 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
46008 UNSPEC_TRUNC_NOOP)));
46010 break;
46011 case E_XFmode:
46012 emit_insn (gen_frndintxf2_floor (res, tmp1));
46013 break;
46014 case E_HImode:
46015 emit_insn (gen_lfloorxfhi2 (res, tmp1));
46016 break;
46017 case E_SImode:
46018 emit_insn (gen_lfloorxfsi2 (res, tmp1));
46019 break;
46020 case E_DImode:
46021 emit_insn (gen_lfloorxfdi2 (res, tmp1));
46022 break;
46023 default:
46024 gcc_unreachable ();
46027 /* flags = signbit(a) */
46028 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
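/* The 0x02 mask selects the C1 bit (bit 9 of the FPU status word) in the
   high byte of the fxam result; for fxam, C1 holds the sign of the
   examined value, including the sign of -0.0.  */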
46030 /* if (flags) then res = -res */
46031 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
46032 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
46033 gen_rtx_LABEL_REF (VOIDmode, jump_label),
46034 pc_rtx);
46035 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
46036 predict_jump (REG_BR_PROB_BASE * 50 / 100);
46037 JUMP_LABEL (insn) = jump_label;
46039 emit_insn (gen_neg (res, res));
46041 emit_label (jump_label);
46042 LABEL_NUSES (jump_label) = 1;
46044 emit_move_insn (op0, res);
46047 /* Output code to perform a Newton-Raphson approximation of a single precision
46048 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
46050 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
46052 rtx x0, x1, e0, e1;
46054 x0 = gen_reg_rtx (mode);
46055 e0 = gen_reg_rtx (mode);
46056 e1 = gen_reg_rtx (mode);
46057 x1 = gen_reg_rtx (mode);
46059 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
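/* This is one Newton-Raphson refinement step x1 = x0 * (2 - b * x0)
   = (x0 + x0) - b * x0 * x0 applied to the hardware reciprocal estimate
   x0 = rcp(b); each step roughly doubles the number of correct bits in
   the estimate.  */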
46061 b = force_reg (mode, b);
46063 /* x0 = rcp(b) estimate */
46064 if (mode == V16SFmode || mode == V8DFmode)
46066 if (TARGET_AVX512ER)
46068 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
46069 UNSPEC_RCP28)));
46070 /* res = a * x0 */
46071 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
46072 return;
46074 else
46075 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
46076 UNSPEC_RCP14)));
46078 else
46079 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
46080 UNSPEC_RCP)));
46082 /* e0 = x0 * b */
46083 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
46085 /* e0 = x0 * e0 */
46086 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
46088 /* e1 = x0 + x0 */
46089 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
46091 /* x1 = e1 - e0 */
46092 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
46094 /* res = a * x1 */
46095 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
46098 /* Output code to perform a Newton-Raphson approximation of a
46099 single precision floating point [reciprocal] square root. */
46101 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
46103 rtx x0, e0, e1, e2, e3, mthree, mhalf;
46104 REAL_VALUE_TYPE r;
46105 int unspec;
46107 x0 = gen_reg_rtx (mode);
46108 e0 = gen_reg_rtx (mode);
46109 e1 = gen_reg_rtx (mode);
46110 e2 = gen_reg_rtx (mode);
46111 e3 = gen_reg_rtx (mode);
46113 if (TARGET_AVX512ER && mode == V16SFmode)
46115 if (recip)
46116 /* res = rsqrt28(a) estimate */
46117 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
46118 UNSPEC_RSQRT28)));
46119 else
46121 /* x0 = rsqrt28(a) estimate */
46122 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
46123 UNSPEC_RSQRT28)));
46124 /* res = rcp28(x0) estimate */
46125 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
46126 UNSPEC_RCP28)));
46128 return;
46131 real_from_integer (&r, VOIDmode, -3, SIGNED);
46132 mthree = const_double_from_real_value (r, SFmode);
46134 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
46135 mhalf = const_double_from_real_value (r, SFmode);
46136 unspec = UNSPEC_RSQRT;
46138 if (VECTOR_MODE_P (mode))
46140 mthree = ix86_build_const_vector (mode, true, mthree);
46141 mhalf = ix86_build_const_vector (mode, true, mhalf);
46142 /* There is no 512-bit rsqrt. There is however rsqrt14. */
46143 if (GET_MODE_SIZE (mode) == 64)
46144 unspec = UNSPEC_RSQRT14;
46147 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
46148 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
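/* These are the Newton-Raphson step y1 = y0 * (3 - a * y0 * y0) / 2 for
   y = 1/sqrt(a), written with negated constants so that it maps onto the
   multiply/add sequence emitted below; sqrt(a) is then obtained as
   a * rsqrt(a).  */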
46150 a = force_reg (mode, a);
46152 /* x0 = rsqrt(a) estimate */
46153 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
46154 unspec)));
46156 /* If a == 0.0, filter out the infinity produced by rsqrt (0.0) so that sqrt (0.0) does not become 0 * inf = NaN. */
46157 if (!recip)
46159 rtx zero = force_reg (mode, CONST0_RTX(mode));
46160 rtx mask;
46162 /* Handle masked compare. */
46163 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
46165 mask = gen_reg_rtx (HImode);
46166 /* Imm value 0x4 corresponds to not-equal comparison. */
46167 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
46168 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
46170 else
46172 mask = gen_reg_rtx (mode);
46173 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
46174 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
46178 /* e0 = x0 * a */
46179 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
46180 /* e1 = e0 * x0 */
46181 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
46183 /* e2 = e1 - 3. */
46184 mthree = force_reg (mode, mthree);
46185 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
46187 mhalf = force_reg (mode, mhalf);
46188 if (recip)
46189 /* e3 = -.5 * x0 */
46190 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
46191 else
46192 /* e3 = -.5 * e0 */
46193 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
46194 /* ret = e2 * e3 */
46195 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
46198 #ifdef TARGET_SOLARIS
46199 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
46201 static void
46202 i386_solaris_elf_named_section (const char *name, unsigned int flags,
46203 tree decl)
46205 /* With Binutils 2.15, the "@unwind" marker must be specified on
46206 every occurrence of the ".eh_frame" section, not just the first
46207 one. */
46208 if (TARGET_64BIT
46209 && strcmp (name, ".eh_frame") == 0)
46211 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
46212 flags & SECTION_WRITE ? "aw" : "a");
46213 return;
46216 #ifndef USE_GAS
46217 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
46219 solaris_elf_asm_comdat_section (name, flags, decl);
46220 return;
46222 #endif
46224 default_elf_asm_named_section (name, flags, decl);
46226 #endif /* TARGET_SOLARIS */
46228 /* Return the mangling of TYPE if it is an extended fundamental type. */
46230 static const char *
46231 ix86_mangle_type (const_tree type)
46233 type = TYPE_MAIN_VARIANT (type);
46235 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
46236 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
46237 return NULL;
46239 switch (TYPE_MODE (type))
46241 case E_TFmode:
46242 /* __float128 is "g". */
46243 return "g";
46244 case E_XFmode:
46245 /* "long double" or __float80 is "e". */
46246 return "e";
46247 default:
46248 return NULL;
46252 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
46254 static tree
46255 ix86_stack_protect_guard (void)
46257 if (TARGET_SSP_TLS_GUARD)
46259 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
46260 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
46261 tree type = build_qualified_type (type_node, qual);
46262 tree t;
46264 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
46266 t = ix86_tls_stack_chk_guard_decl;
46268 if (t == NULL)
46270 rtx x;
46272 t = build_decl
46273 (UNKNOWN_LOCATION, VAR_DECL,
46274 get_identifier (ix86_stack_protector_guard_symbol_str),
46275 type);
46276 TREE_STATIC (t) = 1;
46277 TREE_PUBLIC (t) = 1;
46278 DECL_EXTERNAL (t) = 1;
46279 TREE_USED (t) = 1;
46280 TREE_THIS_VOLATILE (t) = 1;
46281 DECL_ARTIFICIAL (t) = 1;
46282 DECL_IGNORED_P (t) = 1;
46284 /* Do not share RTL as the declaration is visible outside of
46285 current function. */
46286 x = DECL_RTL (t);
46287 RTX_FLAG (x, used) = 1;
46289 ix86_tls_stack_chk_guard_decl = t;
46292 else
46294 tree asptrtype = build_pointer_type (type);
46296 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
46297 t = build2 (MEM_REF, asptrtype, t,
46298 build_int_cst (asptrtype, 0));
46301 return t;
46304 return default_stack_protect_guard ();
46307 /* For 32-bit code we can save PIC register setup by using
46308 __stack_chk_fail_local hidden function instead of calling
46309 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
46310 register, so it is better to call __stack_chk_fail directly. */
46312 static tree ATTRIBUTE_UNUSED
46313 ix86_stack_protect_fail (void)
46315 return TARGET_64BIT
46316 ? default_external_stack_protect_fail ()
46317 : default_hidden_stack_protect_fail ();
46320 /* Select a format to encode pointers in exception handling data. CODE
46321 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
46322 true if the symbol may be affected by dynamic relocations.
46324 ??? All x86 object file formats are capable of representing this.
46325 After all, the relocation needed is the same as for the call insn.
46326 Whether or not a particular assembler allows us to enter such, I
46327 guess we'll have to see. */
46328 int
46329 asm_preferred_eh_data_format (int code, int global)
46331 if (flag_pic)
46333 int type = DW_EH_PE_sdata8;
46334 if (!TARGET_64BIT
46335 || ix86_cmodel == CM_SMALL_PIC
46336 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
46337 type = DW_EH_PE_sdata4;
46338 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
46340 if (ix86_cmodel == CM_SMALL
46341 || (ix86_cmodel == CM_MEDIUM && code))
46342 return DW_EH_PE_udata4;
46343 return DW_EH_PE_absptr;
46346 /* Expand copysign from SIGN to the positive value ABS_VALUE
46347 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
46348 the sign-bit. */
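/* In other words, RESULT = ABS_VALUE | (SIGN & sign-bit mask); when MASK
   is supplied it is the complement of the sign-bit mask, so it is
   inverted before the AND below.  */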
46349 static void
46350 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
46352 machine_mode mode = GET_MODE (sign);
46353 rtx sgn = gen_reg_rtx (mode);
46354 if (mask == NULL_RTX)
46356 machine_mode vmode;
46358 if (mode == SFmode)
46359 vmode = V4SFmode;
46360 else if (mode == DFmode)
46361 vmode = V2DFmode;
46362 else
46363 vmode = mode;
46365 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
46366 if (!VECTOR_MODE_P (mode))
46368 /* We need to generate a scalar mode mask in this case. */
46369 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46370 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46371 mask = gen_reg_rtx (mode);
46372 emit_insn (gen_rtx_SET (mask, tmp));
46375 else
46376 mask = gen_rtx_NOT (mode, mask);
46377 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
46378 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
46381 /* Expand fabs (OP0) and return a new rtx that holds the result. The
46382 mask for masking out the sign-bit is stored in *SMASK, if that is
46383 non-null. */
46384 static rtx
46385 ix86_expand_sse_fabs (rtx op0, rtx *smask)
46387 machine_mode vmode, mode = GET_MODE (op0);
46388 rtx xa, mask;
46390 xa = gen_reg_rtx (mode);
46391 if (mode == SFmode)
46392 vmode = V4SFmode;
46393 else if (mode == DFmode)
46394 vmode = V2DFmode;
46395 else
46396 vmode = mode;
46397 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
46398 if (!VECTOR_MODE_P (mode))
46400 /* We need to generate a scalar mode mask in this case. */
46401 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46402 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46403 mask = gen_reg_rtx (mode);
46404 emit_insn (gen_rtx_SET (mask, tmp));
46406 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
46408 if (smask)
46409 *smask = mask;
46411 return xa;
46414 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
46415 swapping the operands if SWAP_OPERANDS is true. The expanded
46416 code is a forward jump to a newly created label in case the
46417 comparison is true. The generated label rtx is returned. */
46418 static rtx_code_label *
46419 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
46420 bool swap_operands)
46422 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
46423 rtx_code_label *label;
46424 rtx tmp;
46426 if (swap_operands)
46427 std::swap (op0, op1);
46429 label = gen_label_rtx ();
46430 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
46431 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
46432 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
46433 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
46434 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
46435 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
46436 JUMP_LABEL (tmp) = label;
46438 return label;
46441 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
46442 using comparison code CODE. Operands are swapped for the comparison if
46443 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
46444 static rtx
46445 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
46446 bool swap_operands)
46448 rtx (*insn)(rtx, rtx, rtx, rtx);
46449 machine_mode mode = GET_MODE (op0);
46450 rtx mask = gen_reg_rtx (mode);
46452 if (swap_operands)
46453 std::swap (op0, op1);
46455 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
46457 emit_insn (insn (mask, op0, op1,
46458 gen_rtx_fmt_ee (code, mode, op0, op1)));
46459 return mask;
46462 /* Generate and return a rtx of mode MODE for 2**n where n is the number
46463 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
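/* Adding and then subtracting this constant from a value whose magnitude
   is below 2**n rounds it to an integer in the current rounding mode,
   because the addition pushes all fraction bits out of the
   significand.  */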
46464 static rtx
46465 ix86_gen_TWO52 (machine_mode mode)
46467 REAL_VALUE_TYPE TWO52r;
46468 rtx TWO52;
46470 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
46471 TWO52 = const_double_from_real_value (TWO52r, mode);
46472 TWO52 = force_reg (mode, TWO52);
46474 return TWO52;
46477 /* Expand SSE sequence for computing lround from OP1 storing
46478 into OP0. */
46479 void
46480 ix86_expand_lround (rtx op0, rtx op1)
46482 /* C code for the stuff we're doing below:
46483 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
46484 return (long)tmp;
46486 machine_mode mode = GET_MODE (op1);
46487 const struct real_format *fmt;
46488 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46489 rtx adj;
46491 /* load nextafter (0.5, 0.0) */
46492 fmt = REAL_MODE_FORMAT (mode);
46493 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46494 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
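/* pred_half is nextafter (0.5, 0.0), the largest value strictly below
   0.5; it is used instead of 0.5 so that inputs just below one half are
   not rounded up by the addition.  */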
46496 /* adj = copysign (0.5, op1) */
46497 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
46498 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
46500 /* adj = op1 + adj */
46501 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
46503 /* op0 = (imode)adj */
46504 expand_fix (op0, adj, 0);
46507 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
46508 into OPERAND0. */
46509 void
46510 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
46512 /* C code for the stuff we're doing below (for do_floor):
46513 xi = (long)op1;
46514 xi -= (double)xi > op1 ? 1 : 0;
46515 return xi;
46517 machine_mode fmode = GET_MODE (op1);
46518 machine_mode imode = GET_MODE (op0);
46519 rtx ireg, freg, tmp;
46520 rtx_code_label *label;
46522 /* reg = (long)op1 */
46523 ireg = gen_reg_rtx (imode);
46524 expand_fix (ireg, op1, 0);
46526 /* freg = (double)reg */
46527 freg = gen_reg_rtx (fmode);
46528 expand_float (freg, ireg, 0);
46530 /* ireg = (freg > op1) ? ireg - 1 : ireg */
46531 label = ix86_expand_sse_compare_and_jump (UNLE,
46532 freg, op1, !do_floor);
46533 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
46534 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
46535 emit_move_insn (ireg, tmp);
46537 emit_label (label);
46538 LABEL_NUSES (label) = 1;
46540 emit_move_insn (op0, ireg);
46543 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
46544 result in OPERAND0. */
46545 void
46546 ix86_expand_rint (rtx operand0, rtx operand1)
46548 /* C code for the stuff we're doing below:
46549 xa = fabs (operand1);
46550 if (!isless (xa, 2**52))
46551 return operand1;
46552 xa = xa + 2**52 - 2**52;
46553 return copysign (xa, operand1);
46555 machine_mode mode = GET_MODE (operand0);
46556 rtx res, xa, TWO52, mask;
46557 rtx_code_label *label;
46559 res = gen_reg_rtx (mode);
46560 emit_move_insn (res, operand1);
46562 /* xa = abs (operand1) */
46563 xa = ix86_expand_sse_fabs (res, &mask);
46565 /* if (!isless (xa, TWO52)) goto label; */
46566 TWO52 = ix86_gen_TWO52 (mode);
46567 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46569 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46570 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46572 ix86_sse_copysign_to_positive (res, xa, res, mask);
46574 emit_label (label);
46575 LABEL_NUSES (label) = 1;
46577 emit_move_insn (operand0, res);
46580 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46581 into OPERAND0. */
46582 void
46583 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
46585 /* C code for the stuff we expand below.
46586 double xa = fabs (x), x2;
46587 if (!isless (xa, TWO52))
46588 return x;
46589 xa = xa + TWO52 - TWO52;
46590 x2 = copysign (xa, x);
46591 Compensate. Floor:
46592 if (x2 > x)
46593 x2 -= 1;
46594 Compensate. Ceil:
46595 if (x2 < x)
46596 x2 -= -1;
46597 return x2;
46599 machine_mode mode = GET_MODE (operand0);
46600 rtx xa, TWO52, tmp, one, res, mask;
46601 rtx_code_label *label;
46603 TWO52 = ix86_gen_TWO52 (mode);
46605 /* Temporary for holding the result, initialized to the input
46606 operand to ease control flow. */
46607 res = gen_reg_rtx (mode);
46608 emit_move_insn (res, operand1);
46610 /* xa = abs (operand1) */
46611 xa = ix86_expand_sse_fabs (res, &mask);
46613 /* if (!isless (xa, TWO52)) goto label; */
46614 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46616 /* xa = xa + TWO52 - TWO52; */
46617 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46618 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46620 /* xa = copysign (xa, operand1) */
46621 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46623 /* generate 1.0 or -1.0 */
46624 one = force_reg (mode,
46625 const_double_from_real_value (do_floor
46626 ? dconst1 : dconstm1, mode));
46628 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
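/* The SSE compare below yields an all-ones or all-zeros mask, so ANDing
   it with 1.0 (or -1.0 for ceil) gives exactly the adjustment to
   subtract, without a branch.  */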
46629 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46630 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46631 /* We always need to subtract here to preserve signed zero. */
46632 tmp = expand_simple_binop (mode, MINUS,
46633 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46634 emit_move_insn (res, tmp);
46636 emit_label (label);
46637 LABEL_NUSES (label) = 1;
46639 emit_move_insn (operand0, res);
46642 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46643 into OPERAND0. */
46644 void
46645 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46647 /* C code for the stuff we expand below.
46648 double xa = fabs (x), x2;
46649 if (!isless (xa, TWO52))
46650 return x;
46651 x2 = (double)(long)x;
46652 Compensate. Floor:
46653 if (x2 > x)
46654 x2 -= 1;
46655 Compensate. Ceil:
46656 if (x2 < x)
46657 x2 += 1;
46658 if (HONOR_SIGNED_ZEROS (mode))
46659 return copysign (x2, x);
46660 return x2;
46662 machine_mode mode = GET_MODE (operand0);
46663 rtx xa, xi, TWO52, tmp, one, res, mask;
46664 rtx_code_label *label;
46666 TWO52 = ix86_gen_TWO52 (mode);
46668 /* Temporary for holding the result, initialized to the input
46669 operand to ease control flow. */
46670 res = gen_reg_rtx (mode);
46671 emit_move_insn (res, operand1);
46673 /* xa = abs (operand1) */
46674 xa = ix86_expand_sse_fabs (res, &mask);
46676 /* if (!isless (xa, TWO52)) goto label; */
46677 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46679 /* xa = (double)(long)x */
46680 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46681 expand_fix (xi, res, 0);
46682 expand_float (xa, xi, 0);
46684 /* generate 1.0 */
46685 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46687 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46688 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46689 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46690 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46691 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46692 emit_move_insn (res, tmp);
46694 if (HONOR_SIGNED_ZEROS (mode))
46695 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46697 emit_label (label);
46698 LABEL_NUSES (label) = 1;
46700 emit_move_insn (operand0, res);
46703 /* Expand SSE sequence for computing round from OPERAND1 storing
46704 into OPERAND0. Sequence that works without relying on DImode truncation
46705 via cvttsd2siq, which is only available on 64-bit targets. */
46706 void
46707 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46709 /* C code for the stuff we expand below.
46710 double xa = fabs (x), xa2, x2;
46711 if (!isless (xa, TWO52))
46712 return x;
46713 Using the absolute value and copying back sign makes
46714 -0.0 -> -0.0 correct.
46715 xa2 = xa + TWO52 - TWO52;
46716 Compensate.
46717 dxa = xa2 - xa;
46718 if (dxa <= -0.5)
46719 xa2 += 1;
46720 else if (dxa > 0.5)
46721 xa2 -= 1;
46722 x2 = copysign (xa2, x);
46723 return x2;
46725 machine_mode mode = GET_MODE (operand0);
46726 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46727 rtx_code_label *label;
46729 TWO52 = ix86_gen_TWO52 (mode);
46731 /* Temporary for holding the result, initialized to the input
46732 operand to ease control flow. */
46733 res = gen_reg_rtx (mode);
46734 emit_move_insn (res, operand1);
46736 /* xa = abs (operand1) */
46737 xa = ix86_expand_sse_fabs (res, &mask);
46739 /* if (!isless (xa, TWO52)) goto label; */
46740 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46742 /* xa2 = xa + TWO52 - TWO52; */
46743 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46744 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46746 /* dxa = xa2 - xa; */
46747 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46749 /* generate 0.5, 1.0 and -0.5 */
46750 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46751 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46752 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46753 0, OPTAB_DIRECT);
46755 /* Compensate. */
46756 tmp = gen_reg_rtx (mode);
46757 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46758 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46759 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46760 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46761 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46762 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46763 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46764 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46766 /* res = copysign (xa2, operand1) */
46767 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46769 emit_label (label);
46770 LABEL_NUSES (label) = 1;
46772 emit_move_insn (operand0, res);
46775 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46776 into OPERAND0. */
46777 void
46778 ix86_expand_trunc (rtx operand0, rtx operand1)
46780 /* C code for SSE variant we expand below.
46781 double xa = fabs (x), x2;
46782 if (!isless (xa, TWO52))
46783 return x;
46784 x2 = (double)(long)x;
46785 if (HONOR_SIGNED_ZEROS (mode))
46786 return copysign (x2, x);
46787 return x2;
46789 machine_mode mode = GET_MODE (operand0);
46790 rtx xa, xi, TWO52, res, mask;
46791 rtx_code_label *label;
46793 TWO52 = ix86_gen_TWO52 (mode);
46795 /* Temporary for holding the result, initialized to the input
46796 operand to ease control flow. */
46797 res = gen_reg_rtx (mode);
46798 emit_move_insn (res, operand1);
46800 /* xa = abs (operand1) */
46801 xa = ix86_expand_sse_fabs (res, &mask);
46803 /* if (!isless (xa, TWO52)) goto label; */
46804 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46806 /* x = (double)(long)x */
46807 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46808 expand_fix (xi, res, 0);
46809 expand_float (res, xi, 0);
46811 if (HONOR_SIGNED_ZEROS (mode))
46812 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46814 emit_label (label);
46815 LABEL_NUSES (label) = 1;
46817 emit_move_insn (operand0, res);
46820 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46821 into OPERAND0. */
46822 void
46823 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46825 machine_mode mode = GET_MODE (operand0);
46826 rtx xa, mask, TWO52, one, res, smask, tmp;
46827 rtx_code_label *label;
46829 /* C code for SSE variant we expand below.
46830 double xa = fabs (x), x2;
46831 if (!isless (xa, TWO52))
46832 return x;
46833 xa2 = xa + TWO52 - TWO52;
46834 Compensate:
46835 if (xa2 > xa)
46836 xa2 -= 1.0;
46837 x2 = copysign (xa2, x);
46838 return x2;
46841 TWO52 = ix86_gen_TWO52 (mode);
46843 /* Temporary for holding the result, initialized to the input
46844 operand to ease control flow. */
46845 res = gen_reg_rtx (mode);
46846 emit_move_insn (res, operand1);
46848 /* xa = abs (operand1) */
46849 xa = ix86_expand_sse_fabs (res, &smask);
46851 /* if (!isless (xa, TWO52)) goto label; */
46852 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46854 /* res = xa + TWO52 - TWO52; */
46855 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46856 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46857 emit_move_insn (res, tmp);
46859 /* generate 1.0 */
46860 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46862 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46863 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46864 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46865 tmp = expand_simple_binop (mode, MINUS,
46866 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46867 emit_move_insn (res, tmp);
46869 /* res = copysign (res, operand1) */
46870 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46872 emit_label (label);
46873 LABEL_NUSES (label) = 1;
46875 emit_move_insn (operand0, res);
46878 /* Expand SSE sequence for computing round from OPERAND1 storing
46879 into OPERAND0. */
46880 void
46881 ix86_expand_round (rtx operand0, rtx operand1)
46883 /* C code for the stuff we're doing below:
46884 double xa = fabs (x);
46885 if (!isless (xa, TWO52))
46886 return x;
46887 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46888 return copysign (xa, x);
46890 machine_mode mode = GET_MODE (operand0);
46891 rtx res, TWO52, xa, xi, half, mask;
46892 rtx_code_label *label;
46893 const struct real_format *fmt;
46894 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46896 /* Temporary for holding the result, initialized to the input
46897 operand to ease control flow. */
46898 res = gen_reg_rtx (mode);
46899 emit_move_insn (res, operand1);
46901 TWO52 = ix86_gen_TWO52 (mode);
46902 xa = ix86_expand_sse_fabs (res, &mask);
46903 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46905 /* load nextafter (0.5, 0.0) */
46906 fmt = REAL_MODE_FORMAT (mode);
46907 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46908 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46910 /* xa = xa + 0.5 */
46911 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46912 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46914 /* xa = (double)(int64_t)xa */
46915 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46916 expand_fix (xi, xa, 0);
46917 expand_float (xa, xi, 0);
46919 /* res = copysign (xa, operand1) */
46920 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46922 emit_label (label);
46923 LABEL_NUSES (label) = 1;
46925 emit_move_insn (operand0, res);
46928 /* Expand SSE sequence for computing round
46929 from OP1 storing into OP0 using sse4 round insn. */
46930 void
46931 ix86_expand_round_sse4 (rtx op0, rtx op1)
46933 machine_mode mode = GET_MODE (op0);
46934 rtx e1, e2, res, half;
46935 const struct real_format *fmt;
46936 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46937 rtx (*gen_copysign) (rtx, rtx, rtx);
46938 rtx (*gen_round) (rtx, rtx, rtx);
46940 switch (mode)
46942 case E_SFmode:
46943 gen_copysign = gen_copysignsf3;
46944 gen_round = gen_sse4_1_roundsf2;
46945 break;
46946 case E_DFmode:
46947 gen_copysign = gen_copysigndf3;
46948 gen_round = gen_sse4_1_rounddf2;
46949 break;
46950 default:
46951 gcc_unreachable ();
46954 /* round (a) = trunc (a + copysign (0.5, a)) */
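/* As in ix86_expand_lround, the addend used below is actually
   nextafter (0.5, 0.0) rather than 0.5, so that inputs just below one
   half do not round up.  */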
46956 /* load nextafter (0.5, 0.0) */
46957 fmt = REAL_MODE_FORMAT (mode);
46958 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46959 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46960 half = const_double_from_real_value (pred_half, mode);
46962 /* e1 = copysign (0.5, op1) */
46963 e1 = gen_reg_rtx (mode);
46964 emit_insn (gen_copysign (e1, half, op1));
46966 /* e2 = op1 + e1 */
46967 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46969 /* res = trunc (e2) */
46970 res = gen_reg_rtx (mode);
46971 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46973 emit_move_insn (op0, res);
46977 /* Table of valid machine attributes. */
46978 static const struct attribute_spec ix86_attribute_table[] =
46980 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46981 affects_type_identity } */
46982 /* Stdcall attribute says callee is responsible for popping arguments
46983 if they are not variable. */
46984 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46985 true },
46986 /* Fastcall attribute says callee is responsible for popping arguments
46987 if they are not variable. */
46988 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46989 true },
46990 /* Thiscall attribute says callee is responsible for popping arguments
46991 if they are not variable. */
46992 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46993 true },
46994 /* Cdecl attribute says the callee is a normal C declaration */
46995 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46996 true },
46997 /* Regparm attribute specifies how many integer arguments are to be
46998 passed in registers. */
46999 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
47000 true },
47001 /* Sseregparm attribute says we are using x86_64 calling conventions
47002 for FP arguments. */
47003 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
47004 true },
47005 /* The transactional memory builtins are implicitly regparm or fastcall
47006 depending on the ABI. Override the generic do-nothing attribute that
47007 these builtins were declared with. */
47008 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
47009 true },
47010 /* force_align_arg_pointer says this function realigns the stack at entry. */
47011 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
47012 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
47013 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
47014 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
47015 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
47016 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
47017 false },
47018 #endif
47019 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
47020 false },
47021 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
47022 false },
47023 #ifdef SUBTARGET_ATTRIBUTE_TABLE
47024 SUBTARGET_ATTRIBUTE_TABLE,
47025 #endif
47026 /* ms_abi and sysv_abi calling convention function attributes. */
47027 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
47028 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
47029 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
47030 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
47031 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
47032 false },
47033 { "callee_pop_aggregate_return", 1, 1, false, true, true,
47034 ix86_handle_callee_pop_aggregate_return, true },
47035 { "interrupt", 0, 0, false, true, true,
47036 ix86_handle_interrupt_attribute, false },
47037 { "no_caller_saved_registers", 0, 0, false, true, true,
47038 ix86_handle_no_caller_saved_registers_attribute, false },
47039 { "naked", 0, 0, true, false, false,
47040 ix86_handle_fndecl_attribute, false },
47042 /* End element. */
47043 { NULL, 0, 0, false, false, false, NULL, false }
47046 /* Implement targetm.vectorize.builtin_vectorization_cost. */
47047 static int
47048 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
47049 tree vectype, int)
47051 switch (type_of_cost)
47053 case scalar_stmt:
47054 return ix86_cost->scalar_stmt_cost;
47056 case scalar_load:
47057 return ix86_cost->scalar_load_cost;
47059 case scalar_store:
47060 return ix86_cost->scalar_store_cost;
47062 case vector_stmt:
47063 return ix86_cost->vec_stmt_cost;
47065 case vector_load:
47066 return ix86_cost->vec_align_load_cost;
47068 case vector_store:
47069 return ix86_cost->vec_store_cost;
47071 case vec_to_scalar:
47072 return ix86_cost->vec_to_scalar_cost;
47074 case scalar_to_vec:
47075 return ix86_cost->scalar_to_vec_cost;
47077 case unaligned_load:
47078 case unaligned_store:
47079 return ix86_cost->vec_unalign_load_cost;
47081 case cond_branch_taken:
47082 return ix86_cost->cond_taken_branch_cost;
47084 case cond_branch_not_taken:
47085 return ix86_cost->cond_not_taken_branch_cost;
47087 case vec_perm:
47088 case vec_promote_demote:
47089 return ix86_cost->vec_stmt_cost;
47091 case vec_construct:
47092 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
47094 default:
47095 gcc_unreachable ();
47099 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
47100 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
47101 insn every time. */
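/* expand_vselect patches the cached insn's operands, mode and
   destination in place, uses recog_memoized to check whether the
   resulting pattern is valid for the active ISA, and then restores the
   placeholders so the insn can be reused.  */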
47103 static GTY(()) rtx_insn *vselect_insn;
47105 /* Initialize vselect_insn. */
47107 static void
47108 init_vselect_insn (void)
47110 unsigned i;
47111 rtx x;
47113 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
47114 for (i = 0; i < MAX_VECT_LEN; ++i)
47115 XVECEXP (x, 0, i) = const0_rtx;
47116 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
47117 const0_rtx), x);
47118 x = gen_rtx_SET (const0_rtx, x);
47119 start_sequence ();
47120 vselect_insn = emit_insn (x);
47121 end_sequence ();
47124 /* Construct (set target (vec_select op0 (parallel perm))) and
47125 return true if that's a valid instruction in the active ISA. */
47127 static bool
47128 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
47129 unsigned nelt, bool testing_p)
47131 unsigned int i;
47132 rtx x, save_vconcat;
47133 int icode;
47135 if (vselect_insn == NULL_RTX)
47136 init_vselect_insn ();
47138 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
47139 PUT_NUM_ELEM (XVEC (x, 0), nelt);
47140 for (i = 0; i < nelt; ++i)
47141 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
47142 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
47143 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
47144 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
47145 SET_DEST (PATTERN (vselect_insn)) = target;
47146 icode = recog_memoized (vselect_insn);
47148 if (icode >= 0 && !testing_p)
47149 emit_insn (copy_rtx (PATTERN (vselect_insn)));
47151 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
47152 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
47153 INSN_CODE (vselect_insn) = -1;
47155 return icode >= 0;
47158 /* Similar, but generate a vec_concat from op0 and op1 as well. */
47160 static bool
47161 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
47162 const unsigned char *perm, unsigned nelt,
47163 bool testing_p)
47165 machine_mode v2mode;
47166 rtx x;
47167 bool ok;
47169 if (vselect_insn == NULL_RTX)
47170 init_vselect_insn ();
47172 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
47173 return false;
47174 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
47175 PUT_MODE (x, v2mode);
47176 XEXP (x, 0) = op0;
47177 XEXP (x, 1) = op1;
47178 ok = expand_vselect (target, x, perm, nelt, testing_p);
47179 XEXP (x, 0) = const0_rtx;
47180 XEXP (x, 1) = const0_rtx;
47181 return ok;
47184 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47185 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
47187 static bool
47188 expand_vec_perm_blend (struct expand_vec_perm_d *d)
47190 machine_mode mmode, vmode = d->vmode;
47191 unsigned i, mask, nelt = d->nelt;
47192 rtx target, op0, op1, maskop, x;
47193 rtx rperm[32], vperm;
47195 if (d->one_operand_p)
47196 return false;
47197 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
47198 && (TARGET_AVX512BW
47199 || GET_MODE_UNIT_SIZE (vmode) >= 4))
47201 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47203 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47205 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47207 else
47208 return false;
47210 /* This is a blend, not a permute. Elements must stay in their
47211 respective lanes. */
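/* Worked example (illustrative only): for V8SFmode with
d->perm = { 0, 9, 2, 11, 4, 13, 6, 15 } every element comes either from
its own lane of op0 or from the matching lane of op1, so the loop below
accepts it and the mask computed later is 0xaa.  */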
47212 for (i = 0; i < nelt; ++i)
47214 unsigned e = d->perm[i];
47215 if (!(e == i || e == i + nelt))
47216 return false;
47219 if (d->testing_p)
47220 return true;
47222 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
47223 decision should be extracted elsewhere, so that we only try that
47224 sequence once all budget==3 options have been tried. */
47225 target = d->target;
47226 op0 = d->op0;
47227 op1 = d->op1;
47228 mask = 0;
47230 switch (vmode)
47232 case E_V8DFmode:
47233 case E_V16SFmode:
47234 case E_V4DFmode:
47235 case E_V8SFmode:
47236 case E_V2DFmode:
47237 case E_V4SFmode:
47238 case E_V8HImode:
47239 case E_V8SImode:
47240 case E_V32HImode:
47241 case E_V64QImode:
47242 case E_V16SImode:
47243 case E_V8DImode:
47244 for (i = 0; i < nelt; ++i)
47245 mask |= (d->perm[i] >= nelt) << i;
47246 break;
47248 case E_V2DImode:
47249 for (i = 0; i < 2; ++i)
47250 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
47251 vmode = V8HImode;
47252 goto do_subreg;
47254 case E_V4SImode:
47255 for (i = 0; i < 4; ++i)
47256 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
47257 vmode = V8HImode;
47258 goto do_subreg;
47260 case E_V16QImode:
47261 /* See if bytes move in pairs so we can use pblendw with
47262 an immediate argument, rather than pblendvb with a vector
47263 argument. */
47264 for (i = 0; i < 16; i += 2)
47265 if (d->perm[i] + 1 != d->perm[i + 1])
47267 use_pblendvb:
47268 for (i = 0; i < nelt; ++i)
47269 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
47271 finish_pblendvb:
47272 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
47273 vperm = force_reg (vmode, vperm);
47275 if (GET_MODE_SIZE (vmode) == 16)
47276 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
47277 else
47278 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
47279 if (target != d->target)
47280 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47281 return true;
47284 for (i = 0; i < 8; ++i)
47285 mask |= (d->perm[i * 2] >= 16) << i;
47286 vmode = V8HImode;
47287 /* FALLTHRU */
47289 do_subreg:
47290 target = gen_reg_rtx (vmode);
47291 op0 = gen_lowpart (vmode, op0);
47292 op1 = gen_lowpart (vmode, op1);
47293 break;
47295 case E_V32QImode:
47296 /* See if bytes move in pairs. If not, vpblendvb must be used. */
47297 for (i = 0; i < 32; i += 2)
47298 if (d->perm[i] + 1 != d->perm[i + 1])
47299 goto use_pblendvb;
47300 /* See if bytes move in quadruplets. If yes, vpblendd
47301 with immediate can be used. */
47302 for (i = 0; i < 32; i += 4)
47303 if (d->perm[i] + 2 != d->perm[i + 2])
47304 break;
47305 if (i < 32)
47307 /* See if bytes move the same in both lanes. If yes,
47308 vpblendw with immediate can be used. */
47309 for (i = 0; i < 16; i += 2)
47310 if (d->perm[i] + 16 != d->perm[i + 16])
47311 goto use_pblendvb;
47313 /* Use vpblendw. */
47314 for (i = 0; i < 16; ++i)
47315 mask |= (d->perm[i * 2] >= 32) << i;
47316 vmode = V16HImode;
47317 goto do_subreg;
47320 /* Use vpblendd. */
47321 for (i = 0; i < 8; ++i)
47322 mask |= (d->perm[i * 4] >= 32) << i;
47323 vmode = V8SImode;
47324 goto do_subreg;
47326 case E_V16HImode:
47327 /* See if words move in pairs. If yes, vpblendd can be used. */
47328 for (i = 0; i < 16; i += 2)
47329 if (d->perm[i] + 1 != d->perm[i + 1])
47330 break;
47331 if (i < 16)
47333 /* See if words move the same in both lanes. If not,
47334 vpblendvb must be used. */
47335 for (i = 0; i < 8; i++)
47336 if (d->perm[i] + 8 != d->perm[i + 8])
47338 /* Use vpblendvb. */
47339 for (i = 0; i < 32; ++i)
47340 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
47342 vmode = V32QImode;
47343 nelt = 32;
47344 target = gen_reg_rtx (vmode);
47345 op0 = gen_lowpart (vmode, op0);
47346 op1 = gen_lowpart (vmode, op1);
47347 goto finish_pblendvb;
47350 /* Use vpblendw. */
47351 for (i = 0; i < 16; ++i)
47352 mask |= (d->perm[i] >= 16) << i;
47353 break;
47356 /* Use vpblendd. */
47357 for (i = 0; i < 8; ++i)
47358 mask |= (d->perm[i * 2] >= 16) << i;
47359 vmode = V8SImode;
47360 goto do_subreg;
47362 case E_V4DImode:
47363 /* Use vpblendd. */
47364 for (i = 0; i < 4; ++i)
47365 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
47366 vmode = V8SImode;
47367 goto do_subreg;
47369 default:
47370 gcc_unreachable ();
47373 switch (vmode)
47375 case E_V8DFmode:
47376 case E_V8DImode:
47377 mmode = QImode;
47378 break;
47379 case E_V16SFmode:
47380 case E_V16SImode:
47381 mmode = HImode;
47382 break;
47383 case E_V32HImode:
47384 mmode = SImode;
47385 break;
47386 case E_V64QImode:
47387 mmode = DImode;
47388 break;
47389 default:
47390 mmode = VOIDmode;
47393 if (mmode != VOIDmode)
47394 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
47395 else
47396 maskop = GEN_INT (mask);
47398 /* This matches five different patterns with the different modes. */
47399 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
47400 x = gen_rtx_SET (target, x);
47401 emit_insn (x);
47402 if (target != d->target)
47403 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47405 return true;
47408 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47409 in terms of the variable form of vpermilps.
47411 Note that we will have already failed the immediate input vpermilps,
47412 which requires that the high and low part shuffle be identical; the
47413 variable form doesn't require that. */
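/* Worked example (illustrative only): the one-operand V8SFmode permutation
{ 3, 2, 1, 0, 7, 6, 5, 4 } reverses the elements within each 128-bit lane;
after the reduction below the control vector becomes
{ 3, 2, 1, 0, 3, 2, 1, 0 }, suitable for the variable vpermilps.  */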
47415 static bool
47416 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
47418 rtx rperm[8], vperm;
47419 unsigned i;
47421 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
47422 return false;
47424 /* We can only permute within the 128-bit lane. */
47425 for (i = 0; i < 8; ++i)
47427 unsigned e = d->perm[i];
47428 if (i < 4 ? e >= 4 : e < 4)
47429 return false;
47432 if (d->testing_p)
47433 return true;
47435 for (i = 0; i < 8; ++i)
47437 unsigned e = d->perm[i];
47439 /* Within each 128-bit lane, the elements of op0 are numbered
47440 from 0 and the elements of op1 are numbered from 4. */
47441 if (e >= 8 + 4)
47442 e -= 8;
47443 else if (e >= 4)
47444 e -= 4;
47446 rperm[i] = GEN_INT (e);
47449 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
47450 vperm = force_reg (V8SImode, vperm);
47451 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
47453 return true;
47456 /* Return true if permutation D can be performed as a VMODE permutation
47457 instead. */
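/* Worked example (illustrative only): the V16QImode permutation
{ 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } moves bytes in
aligned pairs, so it is also valid as the V8HImode permutation
{ 1, 0, 3, 2, 5, 4, 7, 6 }.  */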
47459 static bool
47460 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
47462 unsigned int i, j, chunk;
47464 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
47465 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
47466 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
47467 return false;
47469 if (GET_MODE_NUNITS (vmode) >= d->nelt)
47470 return true;
47472 chunk = d->nelt / GET_MODE_NUNITS (vmode);
47473 for (i = 0; i < d->nelt; i += chunk)
47474 if (d->perm[i] & (chunk - 1))
47475 return false;
47476 else
47477 for (j = 1; j < chunk; ++j)
47478 if (d->perm[i] + j != d->perm[i + j])
47479 return false;
47481 return true;
47484 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47485 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
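/* Worked example (illustrative only): a one-operand V16QImode byte
reversal, perm = { 15, 14, ..., 0 }, ends up in the pshufb case; with
eltsz == 1 and mask == nelt - 1 the control vector built below is simply
the permutation itself.  */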
47487 static bool
47488 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
47490 unsigned i, nelt, eltsz, mask;
47491 unsigned char perm[64];
47492 machine_mode vmode = V16QImode;
47493 rtx rperm[64], vperm, target, op0, op1;
47495 nelt = d->nelt;
47497 if (!d->one_operand_p)
47499 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
47501 if (TARGET_AVX2
47502 && valid_perm_using_mode_p (V2TImode, d))
47504 if (d->testing_p)
47505 return true;
47507 /* Use vperm2i128 insn. The pattern uses
47508 V4DImode instead of V2TImode. */
47509 target = d->target;
47510 if (d->vmode != V4DImode)
47511 target = gen_reg_rtx (V4DImode);
47512 op0 = gen_lowpart (V4DImode, d->op0);
47513 op1 = gen_lowpart (V4DImode, d->op1);
47514 rperm[0]
47515 = GEN_INT ((d->perm[0] / (nelt / 2))
47516 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
47517 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
47518 if (target != d->target)
47519 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47520 return true;
47522 return false;
47525 else
47527 if (GET_MODE_SIZE (d->vmode) == 16)
47529 if (!TARGET_SSSE3)
47530 return false;
47532 else if (GET_MODE_SIZE (d->vmode) == 32)
47534 if (!TARGET_AVX2)
47535 return false;
47537 /* V4DImode should already have been handled through
47538 expand_vselect by the vpermq instruction. */
47539 gcc_assert (d->vmode != V4DImode);
47541 vmode = V32QImode;
47542 if (d->vmode == V8SImode
47543 || d->vmode == V16HImode
47544 || d->vmode == V32QImode)
47546 /* First see if vpermq can be used for
47547 V8SImode/V16HImode/V32QImode. */
47548 if (valid_perm_using_mode_p (V4DImode, d))
47550 for (i = 0; i < 4; i++)
47551 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
47552 if (d->testing_p)
47553 return true;
47554 target = gen_reg_rtx (V4DImode);
47555 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
47556 perm, 4, false))
47558 emit_move_insn (d->target,
47559 gen_lowpart (d->vmode, target));
47560 return true;
47562 return false;
47565 /* Next see if vpermd can be used. */
47566 if (valid_perm_using_mode_p (V8SImode, d))
47567 vmode = V8SImode;
47569 /* Or if vpermps can be used. */
47570 else if (d->vmode == V8SFmode)
47571 vmode = V8SImode;
47573 if (vmode == V32QImode)
47575 /* vpshufb only works within 128-bit lanes; it is not
47576 possible to shuffle bytes across the lane boundary. */
47577 for (i = 0; i < nelt; ++i)
47578 if ((d->perm[i] ^ i) & (nelt / 2))
47579 return false;
47582 else if (GET_MODE_SIZE (d->vmode) == 64)
47584 if (!TARGET_AVX512BW)
47585 return false;
47587 /* If vpermq didn't work, vpshufb won't work either. */
47588 if (d->vmode == V8DFmode || d->vmode == V8DImode)
47589 return false;
47591 vmode = V64QImode;
47592 if (d->vmode == V16SImode
47593 || d->vmode == V32HImode
47594 || d->vmode == V64QImode)
47596 /* First see if vpermq can be used for
47597 V16SImode/V32HImode/V64QImode. */
47598 if (valid_perm_using_mode_p (V8DImode, d))
47600 for (i = 0; i < 8; i++)
47601 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47602 if (d->testing_p)
47603 return true;
47604 target = gen_reg_rtx (V8DImode);
47605 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47606 perm, 8, false))
47608 emit_move_insn (d->target,
47609 gen_lowpart (d->vmode, target));
47610 return true;
47612 return false;
47615 /* Next see if vpermd can be used. */
47616 if (valid_perm_using_mode_p (V16SImode, d))
47617 vmode = V16SImode;
47619 /* Or if vpermps can be used. */
47620 else if (d->vmode == V16SFmode)
47621 vmode = V16SImode;
47622 if (vmode == V64QImode)
47624 /* vpshufb only works within 128-bit lanes; it is not
47625 possible to shuffle bytes across the lane boundary. */
47626 for (i = 0; i < nelt; ++i)
47627 if ((d->perm[i] ^ i) & (nelt / 4))
47628 return false;
47631 else
47632 return false;
47635 if (d->testing_p)
47636 return true;
47638 if (vmode == V8SImode)
47639 for (i = 0; i < 8; ++i)
47640 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47641 else if (vmode == V16SImode)
47642 for (i = 0; i < 16; ++i)
47643 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47644 else
47646 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47647 if (!d->one_operand_p)
47648 mask = 2 * nelt - 1;
47649 else if (vmode == V16QImode)
47650 mask = nelt - 1;
47651 else if (vmode == V64QImode)
47652 mask = nelt / 4 - 1;
47653 else
47654 mask = nelt / 2 - 1;
47656 for (i = 0; i < nelt; ++i)
47658 unsigned j, e = d->perm[i] & mask;
47659 for (j = 0; j < eltsz; ++j)
47660 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
47664 vperm = gen_rtx_CONST_VECTOR (vmode,
47665 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47666 vperm = force_reg (vmode, vperm);
47668 target = d->target;
47669 if (d->vmode != vmode)
47670 target = gen_reg_rtx (vmode);
47671 op0 = gen_lowpart (vmode, d->op0);
47672 if (d->one_operand_p)
47674 if (vmode == V16QImode)
47675 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47676 else if (vmode == V32QImode)
47677 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47678 else if (vmode == V64QImode)
47679 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47680 else if (vmode == V8SFmode)
47681 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47682 else if (vmode == V8SImode)
47683 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47684 else if (vmode == V16SFmode)
47685 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47686 else if (vmode == V16SImode)
47687 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47688 else
47689 gcc_unreachable ();
47691 else
47693 op1 = gen_lowpart (vmode, d->op1);
47694 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47696 if (target != d->target)
47697 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47699 return true;
47702 /* For V*[QHS]Imode permutations, check whether the same permutation
47703 can be performed in a 2x, 4x or 8x wider inner mode. */
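/* Worked example (illustrative only): the V16QImode permutation
{ 2, 3, 0, 1, 6, 7, 4, 5, ... } becomes the V8HImode permutation
{ 1, 0, 3, 2, ... }; the recursion below then checks whether that can in
turn be widened to V4SImode (here it cannot, because the halfword indices
no longer move in even-aligned pairs).  */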
47705 static bool
47706 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47707 struct expand_vec_perm_d *nd)
47709 int i;
47710 machine_mode mode = VOIDmode;
47712 switch (d->vmode)
47714 case E_V16QImode: mode = V8HImode; break;
47715 case E_V32QImode: mode = V16HImode; break;
47716 case E_V64QImode: mode = V32HImode; break;
47717 case E_V8HImode: mode = V4SImode; break;
47718 case E_V16HImode: mode = V8SImode; break;
47719 case E_V32HImode: mode = V16SImode; break;
47720 case E_V4SImode: mode = V2DImode; break;
47721 case E_V8SImode: mode = V4DImode; break;
47722 case E_V16SImode: mode = V8DImode; break;
47723 default: return false;
47725 for (i = 0; i < d->nelt; i += 2)
47726 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47727 return false;
47728 nd->vmode = mode;
47729 nd->nelt = d->nelt / 2;
47730 for (i = 0; i < nd->nelt; i++)
47731 nd->perm[i] = d->perm[2 * i] / 2;
47732 if (GET_MODE_INNER (mode) != DImode)
47733 canonicalize_vector_int_perm (nd, nd);
47734 if (nd != d)
47736 nd->one_operand_p = d->one_operand_p;
47737 nd->testing_p = d->testing_p;
47738 if (d->op0 == d->op1)
47739 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47740 else
47742 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47743 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47745 if (d->testing_p)
47746 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47747 else
47748 nd->target = gen_reg_rtx (nd->vmode);
47750 return true;
47753 /* Try to expand one-operand permutation with constant mask. */
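/* Worked example (illustrative only): for a V16SImode reversal,
perm = { 15, 14, ..., 0 }, the constant index vector { 15, ..., 0 } is
forced into a register and a single vpermd (gen_avx512f_permvarv16si)
performs the whole permutation.  */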
47755 static bool
47756 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47758 machine_mode mode = GET_MODE (d->op0);
47759 machine_mode maskmode = mode;
47760 rtx (*gen) (rtx, rtx, rtx) = NULL;
47761 rtx target, op0, mask;
47762 rtx vec[64];
47764 if (!rtx_equal_p (d->op0, d->op1))
47765 return false;
47767 if (!TARGET_AVX512F)
47768 return false;
47770 switch (mode)
47772 case E_V16SImode:
47773 gen = gen_avx512f_permvarv16si;
47774 break;
47775 case E_V16SFmode:
47776 gen = gen_avx512f_permvarv16sf;
47777 maskmode = V16SImode;
47778 break;
47779 case E_V8DImode:
47780 gen = gen_avx512f_permvarv8di;
47781 break;
47782 case E_V8DFmode:
47783 gen = gen_avx512f_permvarv8df;
47784 maskmode = V8DImode;
47785 break;
47786 default:
47787 return false;
47790 target = d->target;
47791 op0 = d->op0;
47792 for (int i = 0; i < d->nelt; ++i)
47793 vec[i] = GEN_INT (d->perm[i]);
47794 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47795 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47796 return true;
47799 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47800 in a single instruction. */
47802 static bool
47803 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47805 unsigned i, nelt = d->nelt;
47806 struct expand_vec_perm_d nd;
47808 /* Check plain VEC_SELECT first, because AVX has instructions that could
47809 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47810 input where SEL+CONCAT may not. */
47811 if (d->one_operand_p)
47813 int mask = nelt - 1;
47814 bool identity_perm = true;
47815 bool broadcast_perm = true;
47817 for (i = 0; i < nelt; i++)
47819 nd.perm[i] = d->perm[i] & mask;
47820 if (nd.perm[i] != i)
47821 identity_perm = false;
47822 if (nd.perm[i])
47823 broadcast_perm = false;
47826 if (identity_perm)
47828 if (!d->testing_p)
47829 emit_move_insn (d->target, d->op0);
47830 return true;
47832 else if (broadcast_perm && TARGET_AVX2)
47834 /* Use vpbroadcast{b,w,d}. */
47835 rtx (*gen) (rtx, rtx) = NULL;
47836 switch (d->vmode)
47838 case E_V64QImode:
47839 if (TARGET_AVX512BW)
47840 gen = gen_avx512bw_vec_dupv64qi_1;
47841 break;
47842 case E_V32QImode:
47843 gen = gen_avx2_pbroadcastv32qi_1;
47844 break;
47845 case E_V32HImode:
47846 if (TARGET_AVX512BW)
47847 gen = gen_avx512bw_vec_dupv32hi_1;
47848 break;
47849 case E_V16HImode:
47850 gen = gen_avx2_pbroadcastv16hi_1;
47851 break;
47852 case E_V16SImode:
47853 if (TARGET_AVX512F)
47854 gen = gen_avx512f_vec_dupv16si_1;
47855 break;
47856 case E_V8SImode:
47857 gen = gen_avx2_pbroadcastv8si_1;
47858 break;
47859 case E_V16QImode:
47860 gen = gen_avx2_pbroadcastv16qi;
47861 break;
47862 case E_V8HImode:
47863 gen = gen_avx2_pbroadcastv8hi;
47864 break;
47865 case E_V16SFmode:
47866 if (TARGET_AVX512F)
47867 gen = gen_avx512f_vec_dupv16sf_1;
47868 break;
47869 case E_V8SFmode:
47870 gen = gen_avx2_vec_dupv8sf_1;
47871 break;
47872 case E_V8DFmode:
47873 if (TARGET_AVX512F)
47874 gen = gen_avx512f_vec_dupv8df_1;
47875 break;
47876 case E_V8DImode:
47877 if (TARGET_AVX512F)
47878 gen = gen_avx512f_vec_dupv8di_1;
47879 break;
47880 /* For other modes, prefer the other shuffles this function creates. */
47881 default: break;
47883 if (gen != NULL)
47885 if (!d->testing_p)
47886 emit_insn (gen (d->target, d->op0));
47887 return true;
47891 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47892 return true;
47894 /* There are plenty of patterns in sse.md that are written for
47895 SEL+CONCAT and are not replicated for a single op. Perhaps
47896 that should be changed, to avoid the nastiness here. */
47898 /* Recognize interleave style patterns, which means incrementing
47899 every other permutation operand. */
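/* Worked example (illustrative only): for a one-operand V4SImode
permutation { 0, 0, 1, 1 }, nd.perm becomes { 0, 4, 1, 5 }, which
expand_vselect_vconcat can typically match as punpckldq.  */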
47900 for (i = 0; i < nelt; i += 2)
47902 nd.perm[i] = d->perm[i] & mask;
47903 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47905 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47906 d->testing_p))
47907 return true;
47909 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
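/* Worked example (illustrative only): a one-operand V4SFmode permutation
{ 2, 3, 2, 3 } becomes nd.perm = { 2, 3, 6, 7 }, which the shufps pattern
can typically match with immediate 0xee.  */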
47910 if (nelt >= 4)
47912 for (i = 0; i < nelt; i += 4)
47914 nd.perm[i + 0] = d->perm[i + 0] & mask;
47915 nd.perm[i + 1] = d->perm[i + 1] & mask;
47916 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47917 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47920 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47921 d->testing_p))
47922 return true;
47926 /* Finally, try the fully general two operand permute. */
47927 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47928 d->testing_p))
47929 return true;
47931 /* Recognize interleave style patterns with reversed operands. */
47932 if (!d->one_operand_p)
47934 for (i = 0; i < nelt; ++i)
47936 unsigned e = d->perm[i];
47937 if (e >= nelt)
47938 e -= nelt;
47939 else
47940 e += nelt;
47941 nd.perm[i] = e;
47944 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47945 d->testing_p))
47946 return true;
47949 /* Try the SSE4.1 blend variable merge instructions. */
47950 if (expand_vec_perm_blend (d))
47951 return true;
47953 /* Try one of the AVX vpermil variable permutations. */
47954 if (expand_vec_perm_vpermil (d))
47955 return true;
47957 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47958 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47959 if (expand_vec_perm_pshufb (d))
47960 return true;
47962 /* Try the AVX2 vpalignr instruction. */
47963 if (expand_vec_perm_palignr (d, true))
47964 return true;
47966 /* Try the AVX512F vperm{s,d} instructions. */
47967 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47968 return true;
47970 /* Try the AVX512F vpermi2 instructions. */
47971 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47972 return true;
47974 /* See if we can get the same permutation in different vector integer
47975 mode. */
47976 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47978 if (!d->testing_p)
47979 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47980 return true;
47982 return false;
47985 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47986 in terms of a pair of pshuflw + pshufhw instructions. */
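/* Worked example (illustrative only): for the V8HImode permutation
{ 3, 2, 1, 0, 7, 6, 5, 4 }, the pshuflw below uses
{ 3, 2, 1, 0, 4, 5, 6, 7 } and the following pshufhw uses
{ 0, 1, 2, 3, 7, 6, 5, 4 }.  */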
47988 static bool
47989 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47991 unsigned char perm2[MAX_VECT_LEN];
47992 unsigned i;
47993 bool ok;
47995 if (d->vmode != V8HImode || !d->one_operand_p)
47996 return false;
47998 /* The two permutations only operate in 64-bit lanes. */
47999 for (i = 0; i < 4; ++i)
48000 if (d->perm[i] >= 4)
48001 return false;
48002 for (i = 4; i < 8; ++i)
48003 if (d->perm[i] < 4)
48004 return false;
48006 if (d->testing_p)
48007 return true;
48009 /* Emit the pshuflw. */
48010 memcpy (perm2, d->perm, 4);
48011 for (i = 4; i < 8; ++i)
48012 perm2[i] = i;
48013 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
48014 gcc_assert (ok);
48016 /* Emit the pshufhw. */
48017 memcpy (perm2 + 4, d->perm + 4, 4);
48018 for (i = 0; i < 4; ++i)
48019 perm2[i] = i;
48020 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
48021 gcc_assert (ok);
48023 return true;
48026 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48027 the permutation using the SSSE3 palignr instruction. This succeeds
48028 when all of the elements in PERM fit within one vector and we merely
48029 need to shift them down so that a single vector permutation has a
48030 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
48031 the vpalignr instruction by itself can perform the requested permutation. */
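/* Worked example (illustrative only): the two-operand V16QImode rotation
perm = { 3, 4, ..., 18 } has min == 3, so a single palignr by 3 bytes
leaves the remaining permutation as the identity and no further shuffle
is needed.  */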
48033 static bool
48034 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
48036 unsigned i, nelt = d->nelt;
48037 unsigned min, max, minswap, maxswap;
48038 bool in_order, ok, swap = false;
48039 rtx shift, target;
48040 struct expand_vec_perm_d dcopy;
48042 /* Even with AVX, palignr only operates on 128-bit vectors;
48043 with AVX2, palignr operates within each of the two 128-bit lanes. */
48044 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48045 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
48046 return false;
48048 min = 2 * nelt;
48049 max = 0;
48050 minswap = 2 * nelt;
48051 maxswap = 0;
48052 for (i = 0; i < nelt; ++i)
48054 unsigned e = d->perm[i];
48055 unsigned eswap = d->perm[i] ^ nelt;
48056 if (GET_MODE_SIZE (d->vmode) == 32)
48058 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
48059 eswap = e ^ (nelt / 2);
48061 if (e < min)
48062 min = e;
48063 if (e > max)
48064 max = e;
48065 if (eswap < minswap)
48066 minswap = eswap;
48067 if (eswap > maxswap)
48068 maxswap = eswap;
48070 if (min == 0
48071 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
48073 if (d->one_operand_p
48074 || minswap == 0
48075 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
48076 ? nelt / 2 : nelt))
48077 return false;
48078 swap = true;
48079 min = minswap;
48080 max = maxswap;
48083 /* Given that we have SSSE3, we know we'll be able to implement the
48084 single operand permutation after the palignr with pshufb for
48085 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
48086 first. */
48087 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
48088 return true;
48090 dcopy = *d;
48091 if (swap)
48093 dcopy.op0 = d->op1;
48094 dcopy.op1 = d->op0;
48095 for (i = 0; i < nelt; ++i)
48096 dcopy.perm[i] ^= nelt;
48099 in_order = true;
48100 for (i = 0; i < nelt; ++i)
48102 unsigned e = dcopy.perm[i];
48103 if (GET_MODE_SIZE (d->vmode) == 32
48104 && e >= nelt
48105 && (e & (nelt / 2 - 1)) < min)
48106 e = e - min - (nelt / 2);
48107 else
48108 e = e - min;
48109 if (e != i)
48110 in_order = false;
48111 dcopy.perm[i] = e;
48113 dcopy.one_operand_p = true;
48115 if (single_insn_only_p && !in_order)
48116 return false;
48118 /* For AVX2, test whether we can permute the result in one instruction. */
48119 if (d->testing_p)
48121 if (in_order)
48122 return true;
48123 dcopy.op1 = dcopy.op0;
48124 return expand_vec_perm_1 (&dcopy);
48127 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
48128 if (GET_MODE_SIZE (d->vmode) == 16)
48130 target = gen_reg_rtx (TImode);
48131 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
48132 gen_lowpart (TImode, dcopy.op0), shift));
48134 else
48136 target = gen_reg_rtx (V2TImode);
48137 emit_insn (gen_avx2_palignrv2ti (target,
48138 gen_lowpart (V2TImode, dcopy.op1),
48139 gen_lowpart (V2TImode, dcopy.op0),
48140 shift));
48143 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
48145 /* Test for the degenerate case where the alignment by itself
48146 produces the desired permutation. */
48147 if (in_order)
48149 emit_move_insn (d->target, dcopy.op0);
48150 return true;
48153 ok = expand_vec_perm_1 (&dcopy);
48154 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
48156 return ok;
48159 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
48160 the permutation using the SSE4_1 pblendv instruction. Potentially
48161 reduces the permutation from 2 pshufb and an ior to 1 pshufb and a pblendv. */
48163 static bool
48164 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
48166 unsigned i, which, nelt = d->nelt;
48167 struct expand_vec_perm_d dcopy, dcopy1;
48168 machine_mode vmode = d->vmode;
48169 bool ok;
48171 /* Use the same checks as in expand_vec_perm_blend. */
48172 if (d->one_operand_p)
48173 return false;
48174 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
48176 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
48178 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
48180 else
48181 return false;
48183 /* Figure out which permutation elements do not stay in their
48184 respective lanes. */
48185 for (i = 0, which = 0; i < nelt; ++i)
48187 unsigned e = d->perm[i];
48188 if (e != i)
48189 which |= (e < nelt ? 1 : 2);
48191 /* We can pblend the part where elements do not stay in their
48192 respective lanes only when these elements are all in one
48193 half of the permutation, i.e. come from the same operand.
48194 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
48195 lanes, but both are >= 8.
48196 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
48197 respective lanes, and 8 >= 8 but 2 is not. */
48198 if (which != 1 && which != 2)
48199 return false;
48200 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
48201 return true;
48203 /* First apply a one-operand permutation to the elements that do
48204 not stay in their respective lanes. */
48205 dcopy = *d;
48206 if (which == 2)
48207 dcopy.op0 = dcopy.op1 = d->op1;
48208 else
48209 dcopy.op0 = dcopy.op1 = d->op0;
48210 if (!d->testing_p)
48211 dcopy.target = gen_reg_rtx (vmode);
48212 dcopy.one_operand_p = true;
48214 for (i = 0; i < nelt; ++i)
48215 dcopy.perm[i] = d->perm[i] & (nelt - 1);
48217 ok = expand_vec_perm_1 (&dcopy);
48218 if (GET_MODE_SIZE (vmode) != 16 && !ok)
48219 return false;
48220 else
48221 gcc_assert (ok);
48222 if (d->testing_p)
48223 return true;
48225 /* Next, put the permuted elements into their final positions. */
48226 dcopy1 = *d;
48227 if (which == 2)
48228 dcopy1.op1 = dcopy.target;
48229 else
48230 dcopy1.op0 = dcopy.target;
48232 for (i = 0; i < nelt; ++i)
48233 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
48235 ok = expand_vec_perm_blend (&dcopy1);
48236 gcc_assert (ok);
48238 return true;
48241 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
48243 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48244 a two vector permutation into a single vector permutation by using
48245 an interleave operation to merge the vectors. */
48247 static bool
48248 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
48250 struct expand_vec_perm_d dremap, dfinal;
48251 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
48252 unsigned HOST_WIDE_INT contents;
48253 unsigned char remap[2 * MAX_VECT_LEN];
48254 rtx_insn *seq;
48255 bool ok, same_halves = false;
48257 if (GET_MODE_SIZE (d->vmode) == 16)
48259 if (d->one_operand_p)
48260 return false;
48262 else if (GET_MODE_SIZE (d->vmode) == 32)
48264 if (!TARGET_AVX)
48265 return false;
48266 /* For 32-byte modes allow even d->one_operand_p.
48267 The lack of cross-lane shuffling in some instructions
48268 might prevent a single insn shuffle. */
48269 dfinal = *d;
48270 dfinal.testing_p = true;
48271 /* If expand_vec_perm_interleave3 can expand this into
48272 a 3 insn sequence, give up and let it be expanded as
48273 a 3 insn sequence. While that is one insn longer,
48274 it doesn't need a memory operand, and in the common
48275 case where both the interleave low and the interleave high
48276 permutations with the same operands are adjacent, only 4
48277 insns are needed for both after CSE. */
48278 if (expand_vec_perm_interleave3 (&dfinal))
48279 return false;
48281 else
48282 return false;
48284 /* Examine where the elements come from. */
48285 contents = 0;
48286 for (i = 0; i < nelt; ++i)
48287 contents |= HOST_WIDE_INT_1U << d->perm[i];
48289 memset (remap, 0xff, sizeof (remap));
48290 dremap = *d;
48292 if (GET_MODE_SIZE (d->vmode) == 16)
48294 unsigned HOST_WIDE_INT h1, h2, h3, h4;
48296 /* Split the two input vectors into 4 halves. */
48297 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
48298 h2 = h1 << nelt2;
48299 h3 = h2 << nelt2;
48300 h4 = h3 << nelt2;
48302 /* If the elements all come from the low halves, use interleave low,
48303 and similarly for interleave high. If the elements are from mis-matched
48304 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
48305 if ((contents & (h1 | h3)) == contents)
48307 /* punpckl* */
48308 for (i = 0; i < nelt2; ++i)
48310 remap[i] = i * 2;
48311 remap[i + nelt] = i * 2 + 1;
48312 dremap.perm[i * 2] = i;
48313 dremap.perm[i * 2 + 1] = i + nelt;
48315 if (!TARGET_SSE2 && d->vmode == V4SImode)
48316 dremap.vmode = V4SFmode;
48318 else if ((contents & (h2 | h4)) == contents)
48320 /* punpckh* */
48321 for (i = 0; i < nelt2; ++i)
48323 remap[i + nelt2] = i * 2;
48324 remap[i + nelt + nelt2] = i * 2 + 1;
48325 dremap.perm[i * 2] = i + nelt2;
48326 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
48328 if (!TARGET_SSE2 && d->vmode == V4SImode)
48329 dremap.vmode = V4SFmode;
48331 else if ((contents & (h1 | h4)) == contents)
48333 /* shufps */
48334 for (i = 0; i < nelt2; ++i)
48336 remap[i] = i;
48337 remap[i + nelt + nelt2] = i + nelt2;
48338 dremap.perm[i] = i;
48339 dremap.perm[i + nelt2] = i + nelt + nelt2;
48341 if (nelt != 4)
48343 /* shufpd */
48344 dremap.vmode = V2DImode;
48345 dremap.nelt = 2;
48346 dremap.perm[0] = 0;
48347 dremap.perm[1] = 3;
48350 else if ((contents & (h2 | h3)) == contents)
48352 /* shufps */
48353 for (i = 0; i < nelt2; ++i)
48355 remap[i + nelt2] = i;
48356 remap[i + nelt] = i + nelt2;
48357 dremap.perm[i] = i + nelt2;
48358 dremap.perm[i + nelt2] = i + nelt;
48360 if (nelt != 4)
48362 /* shufpd */
48363 dremap.vmode = V2DImode;
48364 dremap.nelt = 2;
48365 dremap.perm[0] = 1;
48366 dremap.perm[1] = 2;
48369 else
48370 return false;
48372 else
48374 unsigned int nelt4 = nelt / 4, nzcnt = 0;
48375 unsigned HOST_WIDE_INT q[8];
48376 unsigned int nonzero_halves[4];
48378 /* Split the two input vectors into 8 quarters. */
48379 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
48380 for (i = 1; i < 8; ++i)
48381 q[i] = q[0] << (nelt4 * i);
48382 for (i = 0; i < 4; ++i)
48383 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
48385 nonzero_halves[nzcnt] = i;
48386 ++nzcnt;
48389 if (nzcnt == 1)
48391 gcc_assert (d->one_operand_p);
48392 nonzero_halves[1] = nonzero_halves[0];
48393 same_halves = true;
48395 else if (d->one_operand_p)
48397 gcc_assert (nonzero_halves[0] == 0);
48398 gcc_assert (nonzero_halves[1] == 1);
48401 if (nzcnt <= 2)
48403 if (d->perm[0] / nelt2 == nonzero_halves[1])
48405 /* Attempt to increase the likelihood that dfinal
48406 shuffle will be intra-lane. */
48407 std::swap (nonzero_halves[0], nonzero_halves[1]);
48410 /* vperm2f128 or vperm2i128. */
48411 for (i = 0; i < nelt2; ++i)
48413 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
48414 remap[i + nonzero_halves[0] * nelt2] = i;
48415 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
48416 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
48419 if (d->vmode != V8SFmode
48420 && d->vmode != V4DFmode
48421 && d->vmode != V8SImode)
48423 dremap.vmode = V8SImode;
48424 dremap.nelt = 8;
48425 for (i = 0; i < 4; ++i)
48427 dremap.perm[i] = i + nonzero_halves[0] * 4;
48428 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
48432 else if (d->one_operand_p)
48433 return false;
48434 else if (TARGET_AVX2
48435 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
48437 /* vpunpckl* */
48438 for (i = 0; i < nelt4; ++i)
48440 remap[i] = i * 2;
48441 remap[i + nelt] = i * 2 + 1;
48442 remap[i + nelt2] = i * 2 + nelt2;
48443 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
48444 dremap.perm[i * 2] = i;
48445 dremap.perm[i * 2 + 1] = i + nelt;
48446 dremap.perm[i * 2 + nelt2] = i + nelt2;
48447 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
48450 else if (TARGET_AVX2
48451 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
48453 /* vpunpckh* */
48454 for (i = 0; i < nelt4; ++i)
48456 remap[i + nelt4] = i * 2;
48457 remap[i + nelt + nelt4] = i * 2 + 1;
48458 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
48459 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
48460 dremap.perm[i * 2] = i + nelt4;
48461 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
48462 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
48463 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
48466 else
48467 return false;
48470 /* Use the remapping array set up above to move the elements from their
48471 swizzled locations into their final destinations. */
48472 dfinal = *d;
48473 for (i = 0; i < nelt; ++i)
48475 unsigned e = remap[d->perm[i]];
48476 gcc_assert (e < nelt);
48477 /* If same_halves is true, both halves of the remapped vector are the
48478 same. Avoid cross-lane accesses if possible. */
48479 if (same_halves && i >= nelt2)
48481 gcc_assert (e < nelt2);
48482 dfinal.perm[i] = e + nelt2;
48484 else
48485 dfinal.perm[i] = e;
48487 if (!d->testing_p)
48489 dremap.target = gen_reg_rtx (dremap.vmode);
48490 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48492 dfinal.op1 = dfinal.op0;
48493 dfinal.one_operand_p = true;
48495 /* Test if the final remap can be done with a single insn. For V4SFmode or
48496 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
48497 start_sequence ();
48498 ok = expand_vec_perm_1 (&dfinal);
48499 seq = get_insns ();
48500 end_sequence ();
48502 if (!ok)
48503 return false;
48505 if (d->testing_p)
48506 return true;
48508 if (dremap.vmode != dfinal.vmode)
48510 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
48511 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
48514 ok = expand_vec_perm_1 (&dremap);
48515 gcc_assert (ok);
48517 emit_insn (seq);
48518 return true;
48521 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48522 a single vector cross-lane permutation into vpermq followed
48523 by any of the single insn permutations. */
48525 static bool
48526 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
48528 struct expand_vec_perm_d dremap, dfinal;
48529 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
48530 unsigned contents[2];
48531 bool ok;
48533 if (!(TARGET_AVX2
48534 && (d->vmode == V32QImode || d->vmode == V16HImode)
48535 && d->one_operand_p))
48536 return false;
48538 contents[0] = 0;
48539 contents[1] = 0;
48540 for (i = 0; i < nelt2; ++i)
48542 contents[0] |= 1u << (d->perm[i] / nelt4);
48543 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
48546 for (i = 0; i < 2; ++i)
48548 unsigned int cnt = 0;
48549 for (j = 0; j < 4; ++j)
48550 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
48551 return false;
48554 if (d->testing_p)
48555 return true;
48557 dremap = *d;
48558 dremap.vmode = V4DImode;
48559 dremap.nelt = 4;
48560 dremap.target = gen_reg_rtx (V4DImode);
48561 dremap.op0 = gen_lowpart (V4DImode, d->op0);
48562 dremap.op1 = dremap.op0;
48563 dremap.one_operand_p = true;
48564 for (i = 0; i < 2; ++i)
48566 unsigned int cnt = 0;
48567 for (j = 0; j < 4; ++j)
48568 if ((contents[i] & (1u << j)) != 0)
48569 dremap.perm[2 * i + cnt++] = j;
48570 for (; cnt < 2; ++cnt)
48571 dremap.perm[2 * i + cnt] = 0;
48574 dfinal = *d;
48575 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48576 dfinal.op1 = dfinal.op0;
48577 dfinal.one_operand_p = true;
48578 for (i = 0, j = 0; i < nelt; ++i)
48580 if (i == nelt2)
48581 j = 2;
48582 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
48583 if ((d->perm[i] / nelt4) == dremap.perm[j])
48585 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
48586 dfinal.perm[i] |= nelt4;
48587 else
48588 gcc_unreachable ();
48591 ok = expand_vec_perm_1 (&dremap);
48592 gcc_assert (ok);
48594 ok = expand_vec_perm_1 (&dfinal);
48595 gcc_assert (ok);
48597 return true;
48600 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
48601 a vector permutation using two instructions, vperm2f128 or
48602 vperm2i128, followed by any single in-lane permutation. */
48604 static bool
48605 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48607 struct expand_vec_perm_d dfirst, dsecond;
48608 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48609 bool ok;
48611 if (!TARGET_AVX
48612 || GET_MODE_SIZE (d->vmode) != 32
48613 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48614 return false;
48616 dsecond = *d;
48617 dsecond.one_operand_p = false;
48618 dsecond.testing_p = true;
48620 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48621 immediate. For perm < 16 the second permutation uses
48622 d->op0 as first operand, for perm >= 16 it uses d->op1
48623 as first operand. The second operand is the result of
48624 vperm2[fi]128. */
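/* Worked example (illustrative only): perm == 6 yields the immediate
((6 << 2) | 6) & 0x33 == 0x12, i.e. the low lane of the intermediate
result is the low lane of d->op1 and its high lane is the high lane
of d->op0.  */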
48625 for (perm = 0; perm < 32; perm++)
48627 /* Ignore permutations which do not move anything cross-lane. */
48628 if (perm < 16)
48630 /* The second shuffle for e.g. V4DFmode has
48631 0123 and ABCD operands.
48632 Ignore AB23, as 23 is already in the second lane
48633 of the first operand. */
48634 if ((perm & 0xc) == (1 << 2)) continue;
48635 /* And 01CD, as 01 is in the first lane of the first
48636 operand. */
48637 if ((perm & 3) == 0) continue;
48638 /* And 4567, as then the vperm2[fi]128 doesn't change
48639 anything on the original 4567 second operand. */
48640 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48642 else
48644 /* The second shuffle for e.g. V4DFmode has
48645 4567 and ABCD operands.
48646 Ignore AB67, as 67 is already in the second lane
48647 of the first operand. */
48648 if ((perm & 0xc) == (3 << 2)) continue;
48649 /* And 45CD, as 45 is in the first lane of the first
48650 operand. */
48651 if ((perm & 3) == 2) continue;
48652 /* And 0123, as then the vperm2[fi]128 doesn't change
48653 anything on the original 0123 first operand. */
48654 if ((perm & 0xf) == (1 << 2)) continue;
48657 for (i = 0; i < nelt; i++)
48659 j = d->perm[i] / nelt2;
48660 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48661 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48662 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48663 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48664 else
48665 break;
48668 if (i == nelt)
48670 start_sequence ();
48671 ok = expand_vec_perm_1 (&dsecond);
48672 end_sequence ();
48674 else
48675 ok = false;
48677 if (ok)
48679 if (d->testing_p)
48680 return true;
48682 /* Found a usable second shuffle. dfirst will be
48683 vperm2f128 on d->op0 and d->op1. */
48684 dsecond.testing_p = false;
48685 dfirst = *d;
48686 dfirst.target = gen_reg_rtx (d->vmode);
48687 for (i = 0; i < nelt; i++)
48688 dfirst.perm[i] = (i & (nelt2 - 1))
48689 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48691 canonicalize_perm (&dfirst);
48692 ok = expand_vec_perm_1 (&dfirst);
48693 gcc_assert (ok);
48695 /* And dsecond is some single insn shuffle, taking
48696 d->op0 and result of vperm2f128 (if perm < 16) or
48697 d->op1 and result of vperm2f128 (otherwise). */
48698 if (perm >= 16)
48699 dsecond.op0 = dsecond.op1;
48700 dsecond.op1 = dfirst.target;
48702 ok = expand_vec_perm_1 (&dsecond);
48703 gcc_assert (ok);
48705 return true;
48708 /* For one operand, the only useful vperm2f128 permutation is 0x01
48709 i.e. swapping the two 128-bit lanes. */
48710 if (d->one_operand_p)
48711 return false;
48714 return false;
48717 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48718 a two vector permutation using 2 intra-lane interleave insns
48719 and cross-lane shuffle for 32-byte vectors. */
48721 static bool
48722 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48724 unsigned i, nelt;
48725 rtx (*gen) (rtx, rtx, rtx);
48727 if (d->one_operand_p)
48728 return false;
48729 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48731 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48733 else
48734 return false;
48736 nelt = d->nelt;
48737 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48738 return false;
48739 for (i = 0; i < nelt; i += 2)
48740 if (d->perm[i] != d->perm[0] + i / 2
48741 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48742 return false;
48744 if (d->testing_p)
48745 return true;
48747 switch (d->vmode)
48749 case E_V32QImode:
48750 if (d->perm[0])
48751 gen = gen_vec_interleave_highv32qi;
48752 else
48753 gen = gen_vec_interleave_lowv32qi;
48754 break;
48755 case E_V16HImode:
48756 if (d->perm[0])
48757 gen = gen_vec_interleave_highv16hi;
48758 else
48759 gen = gen_vec_interleave_lowv16hi;
48760 break;
48761 case E_V8SImode:
48762 if (d->perm[0])
48763 gen = gen_vec_interleave_highv8si;
48764 else
48765 gen = gen_vec_interleave_lowv8si;
48766 break;
48767 case E_V4DImode:
48768 if (d->perm[0])
48769 gen = gen_vec_interleave_highv4di;
48770 else
48771 gen = gen_vec_interleave_lowv4di;
48772 break;
48773 case E_V8SFmode:
48774 if (d->perm[0])
48775 gen = gen_vec_interleave_highv8sf;
48776 else
48777 gen = gen_vec_interleave_lowv8sf;
48778 break;
48779 case E_V4DFmode:
48780 if (d->perm[0])
48781 gen = gen_vec_interleave_highv4df;
48782 else
48783 gen = gen_vec_interleave_lowv4df;
48784 break;
48785 default:
48786 gcc_unreachable ();
48789 emit_insn (gen (d->target, d->op0, d->op1));
48790 return true;
48793 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
48794 a single vector permutation using a single intra-lane vector
48795 permutation, vperm2f128 swapping the lanes and vblend* insn blending
48796 the non-swapped and swapped vectors together. */
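/* Worked example (illustrative only): the one-operand V4DFmode
permutation { 2, 1, 0, 3 } gives an identity dfirst, a lane-swapping
dsecond { 2, 3, 0, 1 } and msk == 0x5, so the result is a single
vblendpd of the original and the lane-swapped vector.  */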
48798 static bool
48799 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48801 struct expand_vec_perm_d dfirst, dsecond;
48802 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48803 rtx_insn *seq;
48804 bool ok;
48805 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48807 if (!TARGET_AVX
48808 || TARGET_AVX2
48809 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48810 || !d->one_operand_p)
48811 return false;
48813 dfirst = *d;
48814 for (i = 0; i < nelt; i++)
48815 dfirst.perm[i] = 0xff;
48816 for (i = 0, msk = 0; i < nelt; i++)
48818 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48819 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48820 return false;
48821 dfirst.perm[j] = d->perm[i];
48822 if (j != i)
48823 msk |= (1 << i);
48825 for (i = 0; i < nelt; i++)
48826 if (dfirst.perm[i] == 0xff)
48827 dfirst.perm[i] = i;
48829 if (!d->testing_p)
48830 dfirst.target = gen_reg_rtx (dfirst.vmode);
48832 start_sequence ();
48833 ok = expand_vec_perm_1 (&dfirst);
48834 seq = get_insns ();
48835 end_sequence ();
48837 if (!ok)
48838 return false;
48840 if (d->testing_p)
48841 return true;
48843 emit_insn (seq);
48845 dsecond = *d;
48846 dsecond.op0 = dfirst.target;
48847 dsecond.op1 = dfirst.target;
48848 dsecond.one_operand_p = true;
48849 dsecond.target = gen_reg_rtx (dsecond.vmode);
48850 for (i = 0; i < nelt; i++)
48851 dsecond.perm[i] = i ^ nelt2;
48853 ok = expand_vec_perm_1 (&dsecond);
48854 gcc_assert (ok);
48856 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48857 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48858 return true;
48861 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
48862 permutation using two vperm2f128, followed by a vshufpd insn blending
48863 the two vectors together. */
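/* Worked example (illustrative only): for the two-operand V4DFmode
permutation { 2, 6, 1, 5 }, dfirst and dsecond become the lane swaps
{ 2, 3, 0, 1 } and { 6, 7, 4, 5 }, and dthird is { 0, 4, 3, 7 },
a vshufpd of the two intermediate results.  */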
48865 static bool
48866 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48868 struct expand_vec_perm_d dfirst, dsecond, dthird;
48869 bool ok;
48871 if (!TARGET_AVX || (d->vmode != V4DFmode))
48872 return false;
48874 if (d->testing_p)
48875 return true;
48877 dfirst = *d;
48878 dsecond = *d;
48879 dthird = *d;
48881 dfirst.perm[0] = (d->perm[0] & ~1);
48882 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48883 dfirst.perm[2] = (d->perm[2] & ~1);
48884 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48885 dsecond.perm[0] = (d->perm[1] & ~1);
48886 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48887 dsecond.perm[2] = (d->perm[3] & ~1);
48888 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48889 dthird.perm[0] = (d->perm[0] % 2);
48890 dthird.perm[1] = (d->perm[1] % 2) + 4;
48891 dthird.perm[2] = (d->perm[2] % 2) + 2;
48892 dthird.perm[3] = (d->perm[3] % 2) + 6;
48894 dfirst.target = gen_reg_rtx (dfirst.vmode);
48895 dsecond.target = gen_reg_rtx (dsecond.vmode);
48896 dthird.op0 = dfirst.target;
48897 dthird.op1 = dsecond.target;
48898 dthird.one_operand_p = false;
48900 canonicalize_perm (&dfirst);
48901 canonicalize_perm (&dsecond);
48903 ok = expand_vec_perm_1 (&dfirst)
48904 && expand_vec_perm_1 (&dsecond)
48905 && expand_vec_perm_1 (&dthird);
48907 gcc_assert (ok);
48909 return true;
48912 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48913 permutation with two pshufb insns and an ior. We should have already
48914 failed all two instruction sequences. */
48916 static bool
48917 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48919 rtx rperm[2][16], vperm, l, h, op, m128;
48920 unsigned int i, nelt, eltsz;
48922 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48923 return false;
48924 gcc_assert (!d->one_operand_p);
48926 if (d->testing_p)
48927 return true;
48929 nelt = d->nelt;
48930 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48932 /* Generate two permutation masks. If the required element is within
48933 the given vector it is shuffled into the proper lane. If the required
48934 element is in the other vector, force a zero into the lane by setting
48935 bit 7 in the permutation mask. */
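/* Worked example (illustrative only): for V8HImode, if d->perm[1] == 9
(element 1 of op1), then bytes 2-3 of op1's mask become { 2, 3 } and
bytes 2-3 of op0's mask become -128, so the pshufb on op0 produces zeros
there and the final ior keeps op1's contribution.  */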
48936 m128 = GEN_INT (-128);
48937 for (i = 0; i < nelt; ++i)
48939 unsigned j, e = d->perm[i];
48940 unsigned which = (e >= nelt);
48941 if (e >= nelt)
48942 e -= nelt;
48944 for (j = 0; j < eltsz; ++j)
48946 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48947 rperm[1-which][i*eltsz + j] = m128;
48951 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48952 vperm = force_reg (V16QImode, vperm);
48954 l = gen_reg_rtx (V16QImode);
48955 op = gen_lowpart (V16QImode, d->op0);
48956 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48958 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48959 vperm = force_reg (V16QImode, vperm);
48961 h = gen_reg_rtx (V16QImode);
48962 op = gen_lowpart (V16QImode, d->op1);
48963 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48965 op = d->target;
48966 if (d->vmode != V16QImode)
48967 op = gen_reg_rtx (V16QImode);
48968 emit_insn (gen_iorv16qi3 (op, l, h));
48969 if (op != d->target)
48970 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48972 return true;
48975 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48976 with two vpshufb insns, vpermq and vpor. We should have already failed
48977 all two or three instruction sequences. */
48979 static bool
48980 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48982 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48983 unsigned int i, nelt, eltsz;
48985 if (!TARGET_AVX2
48986 || !d->one_operand_p
48987 || (d->vmode != V32QImode && d->vmode != V16HImode))
48988 return false;
48990 if (d->testing_p)
48991 return true;
48993 nelt = d->nelt;
48994 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48996 /* Generate two permutation masks. If the required element is within
48997 the same lane, it is shuffled in. If the required element is from the
48998 other lane, force a zero by setting bit 7 in the permutation mask.
48999 In the other mask an element is non-negative if that element
49000 is requested from the other lane, but it is also moved to the other lane,
49001 so that the result of vpshufb can have the two V2TImode halves
49002 swapped. */
49003 m128 = GEN_INT (-128);
49004 for (i = 0; i < nelt; ++i)
49006 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49007 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49009 for (j = 0; j < eltsz; ++j)
49011 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
49012 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
49016 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
49017 vperm = force_reg (V32QImode, vperm);
49019 h = gen_reg_rtx (V32QImode);
49020 op = gen_lowpart (V32QImode, d->op0);
49021 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
49023 /* Swap the 128-bit lanes of h into hp. */
49024 hp = gen_reg_rtx (V4DImode);
49025 op = gen_lowpart (V4DImode, h);
49026 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
49027 const1_rtx));
49029 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
49030 vperm = force_reg (V32QImode, vperm);
49032 l = gen_reg_rtx (V32QImode);
49033 op = gen_lowpart (V32QImode, d->op0);
49034 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
49036 op = d->target;
49037 if (d->vmode != V32QImode)
49038 op = gen_reg_rtx (V32QImode);
49039 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
49040 if (op != d->target)
49041 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49043 return true;
49046 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
49047 and extract-odd permutations of two V32QImode or V16HImode operands
49048 with two vpshufb insns, vpor and vpermq. We should have already
49049 failed all two or three instruction sequences. */
49051 static bool
49052 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
49054 rtx rperm[2][32], vperm, l, h, ior, op, m128;
49055 unsigned int i, nelt, eltsz;
49057 if (!TARGET_AVX2
49058 || d->one_operand_p
49059 || (d->vmode != V32QImode && d->vmode != V16HImode))
49060 return false;
49062 for (i = 0; i < d->nelt; ++i)
49063 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
49064 return false;
49066 if (d->testing_p)
49067 return true;
49069 nelt = d->nelt;
49070 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49072 /* Generate two permutation masks. In the first permutation mask
49073 the first quarter will contain indexes for the first half
49074 of op0, the second quarter will have bit 7 set, the third quarter
49075 will contain indexes for the second half of op0 and the
49076 last quarter will have bit 7 set. In the second permutation mask
49077 the first quarter will contain bit 7 set, the second quarter
49078 indexes for the first half of the op1, the third quarter bit 7 set
49079 and last quarter indexes for the second half of the op1.
49080 I.e. the first mask e.g. for V32QImode extract even will be:
49081 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
49082 (all values masked with 0xf except for -128) and second mask
49083 for extract even will be
49084 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
49085 m128 = GEN_INT (-128);
49086 for (i = 0; i < nelt; ++i)
49088 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49089 unsigned which = d->perm[i] >= nelt;
49090 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
49092 for (j = 0; j < eltsz; ++j)
49094 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
49095 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
49099 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
49100 vperm = force_reg (V32QImode, vperm);
49102 l = gen_reg_rtx (V32QImode);
49103 op = gen_lowpart (V32QImode, d->op0);
49104 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
49106 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
49107 vperm = force_reg (V32QImode, vperm);
49109 h = gen_reg_rtx (V32QImode);
49110 op = gen_lowpart (V32QImode, d->op1);
49111 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
49113 ior = gen_reg_rtx (V32QImode);
49114 emit_insn (gen_iorv32qi3 (ior, l, h));
49116 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
49117 op = gen_reg_rtx (V4DImode);
49118 ior = gen_lowpart (V4DImode, ior);
49119 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
49120 const1_rtx, GEN_INT (3)));
49121 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49123 return true;
49126 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
49127 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
49128 with two "and" and "pack" or two "shift" and "pack" insns. We should
49129 have already failed all two instruction sequences. */
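/* Worked example (illustrative only): a V16QImode extract-even
(perm = { 0, 2, ..., 30 }) ands both operands, viewed as V8HImode,
with 0x00ff and then packuswb packs the surviving even bytes;
extract-odd instead shifts each halfword right by 8 before packing.  */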
49131 static bool
49132 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
49134 rtx op, dop0, dop1, t, rperm[16];
49135 unsigned i, odd, c, s, nelt = d->nelt;
49136 bool end_perm = false;
49137 machine_mode half_mode;
49138 rtx (*gen_and) (rtx, rtx, rtx);
49139 rtx (*gen_pack) (rtx, rtx, rtx);
49140 rtx (*gen_shift) (rtx, rtx, rtx);
49142 if (d->one_operand_p)
49143 return false;
49145 switch (d->vmode)
49147 case E_V8HImode:
49148 /* Required for "pack". */
49149 if (!TARGET_SSE4_1)
49150 return false;
49151 c = 0xffff;
49152 s = 16;
49153 half_mode = V4SImode;
49154 gen_and = gen_andv4si3;
49155 gen_pack = gen_sse4_1_packusdw;
49156 gen_shift = gen_lshrv4si3;
49157 break;
49158 case E_V16QImode:
49159 /* No check as all instructions are SSE2. */
49160 c = 0xff;
49161 s = 8;
49162 half_mode = V8HImode;
49163 gen_and = gen_andv8hi3;
49164 gen_pack = gen_sse2_packuswb;
49165 gen_shift = gen_lshrv8hi3;
49166 break;
49167 case E_V16HImode:
49168 if (!TARGET_AVX2)
49169 return false;
49170 c = 0xffff;
49171 s = 16;
49172 half_mode = V8SImode;
49173 gen_and = gen_andv8si3;
49174 gen_pack = gen_avx2_packusdw;
49175 gen_shift = gen_lshrv8si3;
49176 end_perm = true;
49177 break;
49178 case E_V32QImode:
49179 if (!TARGET_AVX2)
49180 return false;
49181 c = 0xff;
49182 s = 8;
49183 half_mode = V16HImode;
49184 gen_and = gen_andv16hi3;
49185 gen_pack = gen_avx2_packuswb;
49186 gen_shift = gen_lshrv16hi3;
49187 end_perm = true;
49188 break;
49189 default:
49190 /* Only for V8HI, V16QI, V16HI and V32QI modes is this more profitable
49191 than general shuffles. */
49192 return false;
49195 /* Check that permutation is even or odd. */
49196 odd = d->perm[0];
49197 if (odd > 1)
49198 return false;
49200 for (i = 1; i < nelt; ++i)
49201 if (d->perm[i] != 2 * i + odd)
49202 return false;
49204 if (d->testing_p)
49205 return true;
49207 dop0 = gen_reg_rtx (half_mode);
49208 dop1 = gen_reg_rtx (half_mode);
49209 if (odd == 0)
49211 for (i = 0; i < nelt / 2; i++)
49212 rperm[i] = GEN_INT (c);
49213 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
49214 t = force_reg (half_mode, t);
49215 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
49216 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
49218 else
49220 emit_insn (gen_shift (dop0,
49221 gen_lowpart (half_mode, d->op0),
49222 GEN_INT (s)));
49223 emit_insn (gen_shift (dop1,
49224 gen_lowpart (half_mode, d->op1),
49225 GEN_INT (s)));
49227 /* In the AVX2 256-bit case we need to permute the pack result. */
49228 if (TARGET_AVX2 && end_perm)
49230 op = gen_reg_rtx (d->vmode);
49231 t = gen_reg_rtx (V4DImode);
49232 emit_insn (gen_pack (op, dop0, dop1));
49233 emit_insn (gen_avx2_permv4di_1 (t,
49234 gen_lowpart (V4DImode, op),
49235 const0_rtx,
49236 const2_rtx,
49237 const1_rtx,
49238 GEN_INT (3)));
49239 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
49241 else
49242 emit_insn (gen_pack (d->target, dop0, dop1));
49244 return true;
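/* Illustrative sketch, not part of GCC: the and/pack vs. shift/pack trick
   above, written with scalar operations for the V8HImode case (c = 0xffff,
   s = 16).  Each 32-bit word of an operand holds an (even, odd) pair of
   16-bit elements in little-endian order; masking keeps the even element,
   shifting keeps the odd one, and packing the narrowed words of OP0
   followed by OP1 yields the extract-even (or extract-odd) result.  The
   per-lane behaviour of the real pack instructions is glossed over.  */
static void
ref_even_odd_pack (const unsigned int *op0, const unsigned int *op1,
                   unsigned short *dst, unsigned nwords, int odd)
{
  unsigned i;
  for (i = 0; i < nwords; i++)
    {
      unsigned int w0 = odd ? op0[i] >> 16 : op0[i] & 0xffff;
      unsigned int w1 = odd ? op1[i] >> 16 : op1[i] & 0xffff;
      dst[i] = (unsigned short) w0;             /* "pack" the op0 halves...  */
      dst[i + nwords] = (unsigned short) w1;    /* ...then the op1 halves.  */
    }
}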
49247 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
49248 and extract-odd permutations of two V64QI operands
49249 with two "shifts", two "truncs" and one "concat" insns for "odd"
49250 and two "truncs" and one "concat" insn for "even".
49251 We should have already failed all two-instruction sequences. */
49253 static bool
49254 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
49256 rtx t1, t2, t3, t4;
49257 unsigned i, odd, nelt = d->nelt;
49259 if (!TARGET_AVX512BW
49260 || d->one_operand_p
49261 || d->vmode != V64QImode)
49262 return false;
49264 /* Check that permutation is even or odd. */
49265 odd = d->perm[0];
49266 if (odd > 1)
49267 return false;
49269 for (i = 1; i < nelt; ++i)
49270 if (d->perm[i] != 2 * i + odd)
49271 return false;
49273 if (d->testing_p)
49274 return true;
49277 if (odd)
49279 t1 = gen_reg_rtx (V32HImode);
49280 t2 = gen_reg_rtx (V32HImode);
49281 emit_insn (gen_lshrv32hi3 (t1,
49282 gen_lowpart (V32HImode, d->op0),
49283 GEN_INT (8)));
49284 emit_insn (gen_lshrv32hi3 (t2,
49285 gen_lowpart (V32HImode, d->op1),
49286 GEN_INT (8)));
49288 else
49290 t1 = gen_lowpart (V32HImode, d->op0);
49291 t2 = gen_lowpart (V32HImode, d->op1);
49294 t3 = gen_reg_rtx (V32QImode);
49295 t4 = gen_reg_rtx (V32QImode);
49296 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
49297 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
49298 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
49300 return true;
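/* Illustrative sketch, not part of GCC: the shift/truncate/concat scheme of
   expand_vec_perm_even_odd_trunc, with scalars.  Each 16-bit word holds an
   (even, odd) byte pair; truncating a word keeps the even byte, shifting it
   right by 8 first keeps the odd byte, and concatenating the truncated OP0
   and OP1 gives the final result.  */
static void
ref_even_odd_trunc (const unsigned short *op0, const unsigned short *op1,
                    unsigned char *dst, unsigned nwords, int odd)
{
  unsigned i;
  for (i = 0; i < nwords; i++)
    {
      dst[i] = (unsigned char) (odd ? op0[i] >> 8 : op0[i]);
      dst[i + nwords] = (unsigned char) (odd ? op1[i] >> 8 : op1[i]);
    }
}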
49303 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
49304 and extract-odd permutations. */
49306 static bool
49307 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
49309 rtx t1, t2, t3, t4, t5;
49311 switch (d->vmode)
49313 case E_V4DFmode:
49314 if (d->testing_p)
49315 break;
49316 t1 = gen_reg_rtx (V4DFmode);
49317 t2 = gen_reg_rtx (V4DFmode);
49319 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49320 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
49321 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
49323 /* Now an unpck[lh]pd will produce the result required. */
49324 if (odd)
49325 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
49326 else
49327 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
49328 emit_insn (t3);
49329 break;
49331 case E_V8SFmode:
49333 int mask = odd ? 0xdd : 0x88;
49335 if (d->testing_p)
49336 break;
49337 t1 = gen_reg_rtx (V8SFmode);
49338 t2 = gen_reg_rtx (V8SFmode);
49339 t3 = gen_reg_rtx (V8SFmode);
49341 /* Shuffle within the 128-bit lanes to produce:
49342 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
49343 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
49344 GEN_INT (mask)));
49346 /* Shuffle the lanes around to produce:
49347 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
49348 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
49349 GEN_INT (0x3)));
49351 /* Shuffle within the 128-bit lanes to produce:
49352 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
49353 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
49355 /* Shuffle within the 128-bit lanes to produce:
49356 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
49357 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
49359 /* Shuffle the lanes around to produce:
49360 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
49361 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
49362 GEN_INT (0x20)));
49364 break;
49366 case E_V2DFmode:
49367 case E_V4SFmode:
49368 case E_V2DImode:
49369 case E_V4SImode:
49370 /* These are always directly implementable by expand_vec_perm_1. */
49371 gcc_unreachable ();
49373 case E_V8HImode:
49374 if (TARGET_SSE4_1)
49375 return expand_vec_perm_even_odd_pack (d);
49376 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
49377 return expand_vec_perm_pshufb2 (d);
49378 else
49380 if (d->testing_p)
49381 break;
49382 /* We need 2*log2(N)-1 operations to achieve odd/even
49383 with interleave. */
49384 t1 = gen_reg_rtx (V8HImode);
49385 t2 = gen_reg_rtx (V8HImode);
49386 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
49387 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
49388 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
49389 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
49390 if (odd)
49391 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
49392 else
49393 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
49394 emit_insn (t3);
49396 break;
49398 case E_V16QImode:
49399 return expand_vec_perm_even_odd_pack (d);
49401 case E_V16HImode:
49402 case E_V32QImode:
49403 return expand_vec_perm_even_odd_pack (d);
49405 case E_V64QImode:
49406 return expand_vec_perm_even_odd_trunc (d);
49408 case E_V4DImode:
49409 if (!TARGET_AVX2)
49411 struct expand_vec_perm_d d_copy = *d;
49412 d_copy.vmode = V4DFmode;
49413 if (d->testing_p)
49414 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
49415 else
49416 d_copy.target = gen_reg_rtx (V4DFmode);
49417 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
49418 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
49419 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49421 if (!d->testing_p)
49422 emit_move_insn (d->target,
49423 gen_lowpart (V4DImode, d_copy.target));
49424 return true;
49426 return false;
49429 if (d->testing_p)
49430 break;
49432 t1 = gen_reg_rtx (V4DImode);
49433 t2 = gen_reg_rtx (V4DImode);
49435 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49436 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
49437 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
49439 /* Now a vpunpck[lh]qdq will produce the result required. */
49440 if (odd)
49441 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
49442 else
49443 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
49444 emit_insn (t3);
49445 break;
49447 case E_V8SImode:
49448 if (!TARGET_AVX2)
49450 struct expand_vec_perm_d d_copy = *d;
49451 d_copy.vmode = V8SFmode;
49452 if (d->testing_p)
49453 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
49454 else
49455 d_copy.target = gen_reg_rtx (V8SFmode);
49456 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
49457 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
49458 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49460 if (!d->testing_p)
49461 emit_move_insn (d->target,
49462 gen_lowpart (V8SImode, d_copy.target));
49463 return true;
49465 return false;
49468 if (d->testing_p)
49469 break;
49471 t1 = gen_reg_rtx (V8SImode);
49472 t2 = gen_reg_rtx (V8SImode);
49473 t3 = gen_reg_rtx (V4DImode);
49474 t4 = gen_reg_rtx (V4DImode);
49475 t5 = gen_reg_rtx (V4DImode);
49477 /* Shuffle the lanes around into
49478 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
49479 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
49480 gen_lowpart (V4DImode, d->op1),
49481 GEN_INT (0x20)));
49482 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
49483 gen_lowpart (V4DImode, d->op1),
49484 GEN_INT (0x31)));
49486 /* Swap the 2nd and 3rd position in each lane into
49487 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
49488 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
49489 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49490 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
49491 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49493 /* Now a vpunpck[lh]qdq will produce
49494 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
49495 if (odd)
49496 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
49497 gen_lowpart (V4DImode, t2));
49498 else
49499 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
49500 gen_lowpart (V4DImode, t2));
49501 emit_insn (t3);
49502 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
49503 break;
49505 default:
49506 gcc_unreachable ();
49509 return true;
49512 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49513 extract-even and extract-odd permutations. */
49515 static bool
49516 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
49518 unsigned i, odd, nelt = d->nelt;
49520 odd = d->perm[0];
49521 if (odd != 0 && odd != 1)
49522 return false;
49524 for (i = 1; i < nelt; ++i)
49525 if (d->perm[i] != 2 * i + odd)
49526 return false;
49528 return expand_vec_perm_even_odd_1 (d, odd);
49531 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
49532 permutations. We assume that expand_vec_perm_1 has already failed. */
49534 static bool
49535 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
49537 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
49538 machine_mode vmode = d->vmode;
49539 unsigned char perm2[4];
49540 rtx op0 = d->op0, dest;
49541 bool ok;
49543 switch (vmode)
49545 case E_V4DFmode:
49546 case E_V8SFmode:
49547 /* These are special-cased in sse.md so that we can optionally
49548 use the vbroadcast instruction. They expand to two insns
49549 if the input happens to be in a register. */
49550 gcc_unreachable ();
49552 case E_V2DFmode:
49553 case E_V2DImode:
49554 case E_V4SFmode:
49555 case E_V4SImode:
49556 /* These are always implementable using standard shuffle patterns. */
49557 gcc_unreachable ();
49559 case E_V8HImode:
49560 case E_V16QImode:
49561 /* These can be implemented via interleave. We save one insn by
49562 stopping once we have promoted to V4SImode and then use pshufd. */
49563 if (d->testing_p)
49564 return true;
49567 rtx dest;
49568 rtx (*gen) (rtx, rtx, rtx)
49569 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
49570 : gen_vec_interleave_lowv8hi;
49572 if (elt >= nelt2)
49574 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
49575 : gen_vec_interleave_highv8hi;
49576 elt -= nelt2;
49578 nelt2 /= 2;
49580 dest = gen_reg_rtx (vmode);
49581 emit_insn (gen (dest, op0, op0));
49582 vmode = get_mode_wider_vector (vmode);
49583 op0 = gen_lowpart (vmode, dest);
49585 while (vmode != V4SImode);
49587 memset (perm2, elt, 4);
49588 dest = gen_reg_rtx (V4SImode);
49589 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
49590 gcc_assert (ok);
49591 if (!d->testing_p)
49592 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
49593 return true;
49595 case E_V64QImode:
49596 case E_V32QImode:
49597 case E_V16HImode:
49598 case E_V8SImode:
49599 case E_V4DImode:
49600 /* For AVX2 broadcasts of the first element vpbroadcast* or
49601 vpermq should be used by expand_vec_perm_1. */
49602 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49603 return false;
49605 default:
49606 gcc_unreachable ();
49610 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49611 broadcast permutations. */
49613 static bool
49614 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49616 unsigned i, elt, nelt = d->nelt;
49618 if (!d->one_operand_p)
49619 return false;
49621 elt = d->perm[0];
49622 for (i = 1; i < nelt; ++i)
49623 if (d->perm[i] != elt)
49624 return false;
49626 return expand_vec_perm_broadcast_1 (d);
49629 /* Implement arbitrary permutations of two V64QImode operands
49630 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49631 static bool
49632 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49634 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49635 return false;
49637 if (d->testing_p)
49638 return true;
49640 struct expand_vec_perm_d ds[2];
49641 rtx rperm[128], vperm, target0, target1;
49642 unsigned int i, nelt;
49643 machine_mode vmode;
49645 nelt = d->nelt;
49646 vmode = V64QImode;
49648 for (i = 0; i < 2; i++)
49650 ds[i] = *d;
49651 ds[i].vmode = V32HImode;
49652 ds[i].nelt = 32;
49653 ds[i].target = gen_reg_rtx (V32HImode);
49654 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49655 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49658 /* Prepare permutations such that the first one takes care of
49659 putting the even bytes into the right positions or one position
49660 higher (ds[0]) and the second one takes care of putting the odd
49661 bytes into the right positions or one position lower
49662 (ds[1]). */
49664 for (i = 0; i < nelt; i++)
49666 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49667 if (i & 1)
49669 rperm[i] = constm1_rtx;
49670 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49672 else
49674 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49675 rperm[i + 64] = constm1_rtx;
49679 bool ok = expand_vec_perm_1 (&ds[0]);
49680 gcc_assert (ok);
49681 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49683 ok = expand_vec_perm_1 (&ds[1]);
49684 gcc_assert (ok);
49685 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49687 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49688 vperm = force_reg (vmode, vperm);
49689 target0 = gen_reg_rtx (V64QImode);
49690 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49692 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49693 vperm = force_reg (vmode, vperm);
49694 target1 = gen_reg_rtx (V64QImode);
49695 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49697 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49698 return true;
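/* Illustrative sketch, not part of GCC: how the byte permutation above is
   split into a word permutation (the two vpermi2w expansions) followed by a
   byte fixup (the two vpshufb masks).  SEL[i] is an index into the 128-byte
   concatenation of OP0 and OP1; the lane restrictions of vpshufb and the
   zeroing of the opposite parity are glossed over by using two temporary
   word arrays instead.  */
static void
ref_bytes_via_words (const unsigned char *op0, const unsigned char *op1,
                     const unsigned char *sel, unsigned char *dst)
{
  unsigned char words0[64], words1[64];
  unsigned i;

  for (i = 0; i < 64; i++)
    {
      unsigned widx = sel[i] / 2;               /* ds[i & 1].perm[i / 2].  */
      const unsigned char *src = widx < 32 ? op0 : op1;
      unsigned off = (widx % 32) * 2;
      unsigned char *words = (i & 1) ? words1 : words0;
      /* Word permutation: bring the whole 16-bit word to position i / 2.  */
      words[(i / 2) * 2] = src[off];
      words[(i / 2) * 2 + 1] = src[off + 1];
    }

  for (i = 0; i < 64; i++)
    {
      /* Byte fixup: pick the low or high byte of that word, as the rperm[]
         masks built above do; the other parity would contribute zero.  */
      const unsigned char *words = (i & 1) ? words1 : words0;
      dst[i] = words[(i & ~1u) + (sel[i] & 1)];
    }
}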
49701 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
49702 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49703 all the shorter instruction sequences. */
49705 static bool
49706 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49708 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49709 unsigned int i, nelt, eltsz;
49710 bool used[4];
49712 if (!TARGET_AVX2
49713 || d->one_operand_p
49714 || (d->vmode != V32QImode && d->vmode != V16HImode))
49715 return false;
49717 if (d->testing_p)
49718 return true;
49720 nelt = d->nelt;
49721 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49723 /* Generate 4 permutation masks.  If the required element is within
49724 the same lane, it is shuffled in.  If the required element is from
49725 the other lane, force a zero by setting bit 7 in the permutation
49726 mask.  The other mask has non-negative elements when an element is
49727 requested from the other lane; that element is also moved to the
49728 other lane, so that the result of vpshufb can have its two
49729 V2TImode halves swapped. */
49730 m128 = GEN_INT (-128);
49731 for (i = 0; i < 32; ++i)
49733 rperm[0][i] = m128;
49734 rperm[1][i] = m128;
49735 rperm[2][i] = m128;
49736 rperm[3][i] = m128;
49738 used[0] = false;
49739 used[1] = false;
49740 used[2] = false;
49741 used[3] = false;
49742 for (i = 0; i < nelt; ++i)
49744 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49745 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49746 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49748 for (j = 0; j < eltsz; ++j)
49749 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49750 used[which] = true;
49753 for (i = 0; i < 2; ++i)
49755 if (!used[2 * i + 1])
49757 h[i] = NULL_RTX;
49758 continue;
49760 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49761 gen_rtvec_v (32, rperm[2 * i + 1]));
49762 vperm = force_reg (V32QImode, vperm);
49763 h[i] = gen_reg_rtx (V32QImode);
49764 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49765 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49768 /* Swap the 128-bit lanes of h[X]. */
49769 for (i = 0; i < 2; ++i)
49771 if (h[i] == NULL_RTX)
49772 continue;
49773 op = gen_reg_rtx (V4DImode);
49774 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49775 const2_rtx, GEN_INT (3), const0_rtx,
49776 const1_rtx));
49777 h[i] = gen_lowpart (V32QImode, op);
49780 for (i = 0; i < 2; ++i)
49782 if (!used[2 * i])
49784 l[i] = NULL_RTX;
49785 continue;
49787 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49788 vperm = force_reg (V32QImode, vperm);
49789 l[i] = gen_reg_rtx (V32QImode);
49790 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49791 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49794 for (i = 0; i < 2; ++i)
49796 if (h[i] && l[i])
49798 op = gen_reg_rtx (V32QImode);
49799 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49800 l[i] = op;
49802 else if (h[i])
49803 l[i] = h[i];
49806 gcc_assert (l[0] && l[1]);
49807 op = d->target;
49808 if (d->vmode != V32QImode)
49809 op = gen_reg_rtx (V32QImode);
49810 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49811 if (op != d->target)
49812 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49813 return true;
49816 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49817 With all of the interface bits taken care of, perform the expansion
49818 in D and return true on success. */
49820 static bool
49821 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49823 /* Try a single instruction expansion. */
49824 if (expand_vec_perm_1 (d))
49825 return true;
49827 /* Try sequences of two instructions. */
49829 if (expand_vec_perm_pshuflw_pshufhw (d))
49830 return true;
49832 if (expand_vec_perm_palignr (d, false))
49833 return true;
49835 if (expand_vec_perm_interleave2 (d))
49836 return true;
49838 if (expand_vec_perm_broadcast (d))
49839 return true;
49841 if (expand_vec_perm_vpermq_perm_1 (d))
49842 return true;
49844 if (expand_vec_perm_vperm2f128 (d))
49845 return true;
49847 if (expand_vec_perm_pblendv (d))
49848 return true;
49850 /* Try sequences of three instructions. */
49852 if (expand_vec_perm_even_odd_pack (d))
49853 return true;
49855 if (expand_vec_perm_2vperm2f128_vshuf (d))
49856 return true;
49858 if (expand_vec_perm_pshufb2 (d))
49859 return true;
49861 if (expand_vec_perm_interleave3 (d))
49862 return true;
49864 if (expand_vec_perm_vperm2f128_vblend (d))
49865 return true;
49867 /* Try sequences of four instructions. */
49869 if (expand_vec_perm_even_odd_trunc (d))
49870 return true;
49871 if (expand_vec_perm_vpshufb2_vpermq (d))
49872 return true;
49874 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49875 return true;
49877 if (expand_vec_perm_vpermi2_vpshub2 (d))
49878 return true;
49880 /* ??? Look for narrow permutations whose element orderings would
49881 allow the promotion to a wider mode. */
49883 /* ??? Look for sequences of interleave or a wider permute that place
49884 the data into the correct lanes for a half-vector shuffle like
49885 pshuf[lh]w or vpermilps. */
49887 /* ??? Look for sequences of interleave that produce the desired results.
49888 The combinatorics of punpck[lh] get pretty ugly... */
49890 if (expand_vec_perm_even_odd (d))
49891 return true;
49893 /* Even longer sequences. */
49894 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49895 return true;
49897 /* See if we can get the same permutation in different vector integer
49898 mode. */
49899 struct expand_vec_perm_d nd;
49900 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49902 if (!d->testing_p)
49903 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49904 return true;
49907 return false;
49910 /* If a permutation only uses one operand, make it clear. Returns true
49911 if the permutation references both operands. */
49913 static bool
49914 canonicalize_perm (struct expand_vec_perm_d *d)
49916 int i, which, nelt = d->nelt;
49918 for (i = which = 0; i < nelt; ++i)
49919 which |= (d->perm[i] < nelt ? 1 : 2);
49921 d->one_operand_p = true;
49922 switch (which)
49924 default:
49925 gcc_unreachable();
49927 case 3:
49928 if (!rtx_equal_p (d->op0, d->op1))
49930 d->one_operand_p = false;
49931 break;
49933 /* The elements of PERM do not suggest that only the first operand
49934 is used, but both operands are identical. Allow easier matching
49935 of the permutation by folding the permutation into the single
49936 input vector. */
49937 /* FALLTHRU */
49939 case 2:
49940 for (i = 0; i < nelt; ++i)
49941 d->perm[i] &= nelt - 1;
49942 d->op0 = d->op1;
49943 break;
49945 case 1:
49946 d->op1 = d->op0;
49947 break;
49950 return (which == 3);
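/* Illustrative sketch, not part of GCC: what canonicalize_perm does for a
   toy 4-element selector.  Indices 0..3 name OP0 and 4..7 name OP1;
   OPS_IDENTICAL stands in for rtx_equal_p (d->op0, d->op1).  */
static int
ref_canonicalize (unsigned char perm[4], int ops_identical, int *one_operand_p)
{
  int i, which = 0;

  for (i = 0; i < 4; i++)
    which |= perm[i] < 4 ? 1 : 2;

  *one_operand_p = which != 3 || ops_identical;

  /* If only OP1 is referenced, or both operands are the same register,
     fold every index onto the single input (perm[i] &= nelt - 1).  */
  if (which == 2 || (which == 3 && ops_identical))
    for (i = 0; i < 4; i++)
      perm[i] &= 3;

  /* As above, report whether the selector named both inputs.  */
  return which == 3;
}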
49953 bool
49954 ix86_expand_vec_perm_const (rtx operands[4])
49956 struct expand_vec_perm_d d;
49957 unsigned char perm[MAX_VECT_LEN];
49958 int i, nelt;
49959 bool two_args;
49960 rtx sel;
49962 d.target = operands[0];
49963 d.op0 = operands[1];
49964 d.op1 = operands[2];
49965 sel = operands[3];
49967 d.vmode = GET_MODE (d.target);
49968 gcc_assert (VECTOR_MODE_P (d.vmode));
49969 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49970 d.testing_p = false;
49972 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49973 gcc_assert (XVECLEN (sel, 0) == nelt);
49974 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49976 for (i = 0; i < nelt; ++i)
49978 rtx e = XVECEXP (sel, 0, i);
49979 int ei = INTVAL (e) & (2 * nelt - 1);
49980 d.perm[i] = ei;
49981 perm[i] = ei;
49984 two_args = canonicalize_perm (&d);
49986 if (ix86_expand_vec_perm_const_1 (&d))
49987 return true;
49989 /* If the selector says both arguments are needed, but the operands are the
49990 same, the above tried to expand with one_operand_p and a flattened selector.
49991 If that didn't work, retry without one_operand_p; we succeeded with that
49992 during testing. */
49993 if (two_args && d.one_operand_p)
49995 d.one_operand_p = false;
49996 memcpy (d.perm, perm, sizeof (perm));
49997 return ix86_expand_vec_perm_const_1 (&d);
50000 return false;
50003 /* Implement targetm.vectorize.vec_perm_const_ok. */
50005 static bool
50006 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
50007 const unsigned char *sel)
50009 struct expand_vec_perm_d d;
50010 unsigned int i, nelt, which;
50011 bool ret;
50013 d.vmode = vmode;
50014 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
50015 d.testing_p = true;
50017 /* Given sufficient ISA support we can just return true here
50018 for selected vector modes. */
50019 switch (d.vmode)
50021 case E_V16SFmode:
50022 case E_V16SImode:
50023 case E_V8DImode:
50024 case E_V8DFmode:
50025 if (TARGET_AVX512F)
50026 /* All implementable with a single vpermi2 insn. */
50027 return true;
50028 break;
50029 case E_V32HImode:
50030 if (TARGET_AVX512BW)
50031 /* All implementable with a single vpermi2 insn. */
50032 return true;
50033 break;
50034 case E_V64QImode:
50035 if (TARGET_AVX512BW)
50036 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
50037 return true;
50038 break;
50039 case E_V8SImode:
50040 case E_V8SFmode:
50041 case E_V4DFmode:
50042 case E_V4DImode:
50043 if (TARGET_AVX512VL)
50044 /* All implementable with a single vpermi2 insn. */
50045 return true;
50046 break;
50047 case E_V16HImode:
50048 if (TARGET_AVX2)
50049 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
50050 return true;
50051 break;
50052 case E_V32QImode:
50053 if (TARGET_AVX2)
50054 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
50055 return true;
50056 break;
50057 case E_V4SImode:
50058 case E_V4SFmode:
50059 case E_V8HImode:
50060 case E_V16QImode:
50061 /* All implementable with a single vpperm insn. */
50062 if (TARGET_XOP)
50063 return true;
50064 /* All implementable with 2 pshufb + 1 ior. */
50065 if (TARGET_SSSE3)
50066 return true;
50067 break;
50068 case E_V2DImode:
50069 case E_V2DFmode:
50070 /* All implementable with shufpd or unpck[lh]pd. */
50071 return true;
50072 default:
50073 return false;
50076 /* Extract the values from the vector CST into the permutation
50077 array in D. */
50078 memcpy (d.perm, sel, nelt);
50079 for (i = which = 0; i < nelt; ++i)
50081 unsigned char e = d.perm[i];
50082 gcc_assert (e < 2 * nelt);
50083 which |= (e < nelt ? 1 : 2);
50086 /* For all elements from second vector, fold the elements to first. */
50087 if (which == 2)
50088 for (i = 0; i < nelt; ++i)
50089 d.perm[i] -= nelt;
50091 /* Check whether the mask can be applied to the vector type. */
50092 d.one_operand_p = (which != 3);
50094 /* Implementable with shufps or pshufd. */
50095 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
50096 return true;
50098 /* Otherwise we have to go through the motions and see if we can
50099 figure out how to generate the requested permutation. */
50100 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
50101 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
50102 if (!d.one_operand_p)
50103 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
50105 start_sequence ();
50106 ret = ix86_expand_vec_perm_const_1 (&d);
50107 end_sequence ();
50109 return ret;
50112 void
50113 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
50115 struct expand_vec_perm_d d;
50116 unsigned i, nelt;
50118 d.target = targ;
50119 d.op0 = op0;
50120 d.op1 = op1;
50121 d.vmode = GET_MODE (targ);
50122 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
50123 d.one_operand_p = false;
50124 d.testing_p = false;
50126 for (i = 0; i < nelt; ++i)
50127 d.perm[i] = i * 2 + odd;
50129 /* We'll either be able to implement the permutation directly... */
50130 if (expand_vec_perm_1 (&d))
50131 return;
50133 /* ... or we use the special-case patterns. */
50134 expand_vec_perm_even_odd_1 (&d, odd);
50137 static void
50138 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
50140 struct expand_vec_perm_d d;
50141 unsigned i, nelt, base;
50142 bool ok;
50144 d.target = targ;
50145 d.op0 = op0;
50146 d.op1 = op1;
50147 d.vmode = GET_MODE (targ);
50148 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
50149 d.one_operand_p = false;
50150 d.testing_p = false;
50152 base = high_p ? nelt / 2 : 0;
50153 for (i = 0; i < nelt / 2; ++i)
50155 d.perm[i * 2] = i + base;
50156 d.perm[i * 2 + 1] = i + base + nelt;
50159 /* Note that for AVX this isn't one instruction. */
50160 ok = ix86_expand_vec_perm_const_1 (&d);
50161 gcc_assert (ok);
50165 /* Expand a vector operation CODE for a V*QImode in terms of the
50166 same operation on V*HImode. */
50168 void
50169 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
50171 machine_mode qimode = GET_MODE (dest);
50172 machine_mode himode;
50173 rtx (*gen_il) (rtx, rtx, rtx);
50174 rtx (*gen_ih) (rtx, rtx, rtx);
50175 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
50176 struct expand_vec_perm_d d;
50177 bool ok, full_interleave;
50178 bool uns_p = false;
50179 int i;
50181 switch (qimode)
50183 case E_V16QImode:
50184 himode = V8HImode;
50185 gen_il = gen_vec_interleave_lowv16qi;
50186 gen_ih = gen_vec_interleave_highv16qi;
50187 break;
50188 case E_V32QImode:
50189 himode = V16HImode;
50190 gen_il = gen_avx2_interleave_lowv32qi;
50191 gen_ih = gen_avx2_interleave_highv32qi;
50192 break;
50193 case E_V64QImode:
50194 himode = V32HImode;
50195 gen_il = gen_avx512bw_interleave_lowv64qi;
50196 gen_ih = gen_avx512bw_interleave_highv64qi;
50197 break;
50198 default:
50199 gcc_unreachable ();
50202 op2_l = op2_h = op2;
50203 switch (code)
50205 case MULT:
50206 /* Unpack data such that we've got a source byte in each low byte of
50207 each word. We don't care what goes into the high byte of each word.
50208 Rather than trying to get zero in there, most convenient is to let
50209 it be a copy of the low byte. */
50210 op2_l = gen_reg_rtx (qimode);
50211 op2_h = gen_reg_rtx (qimode);
50212 emit_insn (gen_il (op2_l, op2, op2));
50213 emit_insn (gen_ih (op2_h, op2, op2));
50214 /* FALLTHRU */
50216 op1_l = gen_reg_rtx (qimode);
50217 op1_h = gen_reg_rtx (qimode);
50218 emit_insn (gen_il (op1_l, op1, op1));
50219 emit_insn (gen_ih (op1_h, op1, op1));
50220 full_interleave = qimode == V16QImode;
50221 break;
50223 case ASHIFT:
50224 case LSHIFTRT:
50225 uns_p = true;
50226 /* FALLTHRU */
50227 case ASHIFTRT:
50228 op1_l = gen_reg_rtx (himode);
50229 op1_h = gen_reg_rtx (himode);
50230 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
50231 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
50232 full_interleave = true;
50233 break;
50234 default:
50235 gcc_unreachable ();
50238 /* Perform the operation. */
50239 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
50240 1, OPTAB_DIRECT);
50241 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
50242 1, OPTAB_DIRECT);
50243 gcc_assert (res_l && res_h);
50245 /* Merge the data back into the right place. */
50246 d.target = dest;
50247 d.op0 = gen_lowpart (qimode, res_l);
50248 d.op1 = gen_lowpart (qimode, res_h);
50249 d.vmode = qimode;
50250 d.nelt = GET_MODE_NUNITS (qimode);
50251 d.one_operand_p = false;
50252 d.testing_p = false;
50254 if (full_interleave)
50256 /* For SSE2, we used a full interleave, so the desired
50257 results are in the even elements. */
50258 for (i = 0; i < d.nelt; ++i)
50259 d.perm[i] = i * 2;
50261 else
50263 /* For AVX, the interleave used above was not cross-lane. So the
50264 extraction is evens but with the second and third quarter swapped.
50265 Happily, that is even one insn shorter than even extraction.
50266 For AVX512BW we have 4 lanes. We extract evens from within a lane,
50267 always first from the first and then from the second source operand,
50268 and the index bits above the low 4 bits remain the same.
50269 Thus, for d.nelt == 32 we want permutation
50270 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
50271 and for d.nelt == 64 we want permutation
50272 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
50273 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
50274 for (i = 0; i < d.nelt; ++i)
50275 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
50278 ok = ix86_expand_vec_perm_const_1 (&d);
50279 gcc_assert (ok);
50281 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50282 gen_rtx_fmt_ee (code, qimode, op1, op2));
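/* Illustrative sketch, not part of GCC: evaluating the merge permutation
   built just above for the AVX/AVX512BW case, to show that the formula
   really produces the documented index sequences into RES_L/RES_H.  */
static void
ref_qihi_merge_perm (unsigned nelt, unsigned char *perm)
{
  unsigned i;
  for (i = 0; i < nelt; i++)
    perm[i] = ((i * 2) & 14) + ((i & 8) ? nelt : 0) + (i & ~15u);
  /* For nelt == 32 this gives 0,2,..,14, 32,34,..,46, 16,18,..,30,
     48,50,..,62, i.e. the even elements of each 128-bit lane, taken first
     from the first and then from the second source operand.  */
}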
50285 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
50286 if op is CONST_VECTOR with all odd elements equal to their
50287 preceding element. */
50289 static bool
50290 const_vector_equal_evenodd_p (rtx op)
50292 machine_mode mode = GET_MODE (op);
50293 int i, nunits = GET_MODE_NUNITS (mode);
50294 if (GET_CODE (op) != CONST_VECTOR
50295 || nunits != CONST_VECTOR_NUNITS (op))
50296 return false;
50297 for (i = 0; i < nunits; i += 2)
50298 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
50299 return false;
50300 return true;
50303 void
50304 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
50305 bool uns_p, bool odd_p)
50307 machine_mode mode = GET_MODE (op1);
50308 machine_mode wmode = GET_MODE (dest);
50309 rtx x;
50310 rtx orig_op1 = op1, orig_op2 = op2;
50312 if (!nonimmediate_operand (op1, mode))
50313 op1 = force_reg (mode, op1);
50314 if (!nonimmediate_operand (op2, mode))
50315 op2 = force_reg (mode, op2);
50317 /* We only play even/odd games with vectors of SImode. */
50318 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
50320 /* If we're looking for the odd results, shift those members down to
50321 the even slots. For some cpus this is faster than a PSHUFD. */
50322 if (odd_p)
50324 /* For XOP use vpmacsdqh, but only for smult, as it is only
50325 signed. */
50326 if (TARGET_XOP && mode == V4SImode && !uns_p)
50328 x = force_reg (wmode, CONST0_RTX (wmode));
50329 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
50330 return;
50333 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
50334 if (!const_vector_equal_evenodd_p (orig_op1))
50335 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
50336 x, NULL, 1, OPTAB_DIRECT);
50337 if (!const_vector_equal_evenodd_p (orig_op2))
50338 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
50339 x, NULL, 1, OPTAB_DIRECT);
50340 op1 = gen_lowpart (mode, op1);
50341 op2 = gen_lowpart (mode, op2);
50344 if (mode == V16SImode)
50346 if (uns_p)
50347 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
50348 else
50349 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
50351 else if (mode == V8SImode)
50353 if (uns_p)
50354 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
50355 else
50356 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
50358 else if (uns_p)
50359 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
50360 else if (TARGET_SSE4_1)
50361 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
50362 else
50364 rtx s1, s2, t0, t1, t2;
50366 /* The easiest way to implement this without PMULDQ is to go through
50367 the motions as if we are performing a full 64-bit multiply, with
50368 the exception that we need to do less shuffling of the elements. */
50370 /* Compute the sign-extension, aka highparts, of the two operands. */
50371 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50372 op1, pc_rtx, pc_rtx);
50373 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50374 op2, pc_rtx, pc_rtx);
50376 /* Multiply LO(A) * HI(B), and vice-versa. */
50377 t1 = gen_reg_rtx (wmode);
50378 t2 = gen_reg_rtx (wmode);
50379 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
50380 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
50382 /* Multiply LO(A) * LO(B). */
50383 t0 = gen_reg_rtx (wmode);
50384 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
50386 /* Combine and shift the highparts into place. */
50387 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
50388 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
50389 1, OPTAB_DIRECT);
50391 /* Combine high and low parts. */
50392 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
50393 return;
50395 emit_insn (x);
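/* Illustrative sketch, not part of GCC: the identity behind the fallback
   above, which builds a signed 32x32->64 multiply out of unsigned widening
   multiplies when PMULDQ is not available.  S1 and S2 play the role of the
   ix86_expand_sse_cmp sign masks; everything is mod 2^64, as in the vector
   sequence.  */
static long long
ref_signed_widen_mul (int a, int b)
{
  unsigned long long ua = (unsigned int) a, ub = (unsigned int) b;
  unsigned long long s1 = a < 0 ? 0xffffffffu : 0;  /* highpart of sext (a).  */
  unsigned long long s2 = b < 0 ? 0xffffffffu : 0;  /* highpart of sext (b).  */
  unsigned long long t0 = ua * ub;                  /* LO(A) * LO(B).  */
  unsigned long long t1 = s1 * ub + s2 * ua;        /* The two cross terms.  */
  return (long long) (t0 + (t1 << 32));             /* Combine the highparts.  */
}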
50398 void
50399 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
50400 bool uns_p, bool high_p)
50402 machine_mode wmode = GET_MODE (dest);
50403 machine_mode mode = GET_MODE (op1);
50404 rtx t1, t2, t3, t4, mask;
50406 switch (mode)
50408 case E_V4SImode:
50409 t1 = gen_reg_rtx (mode);
50410 t2 = gen_reg_rtx (mode);
50411 if (TARGET_XOP && !uns_p)
50413 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
50414 shuffle the elements once so that all elements are in the right
50415 place for immediate use: { A C B D }. */
50416 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
50417 const1_rtx, GEN_INT (3)));
50418 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
50419 const1_rtx, GEN_INT (3)));
50421 else
50423 /* Put the elements into place for the multiply. */
50424 ix86_expand_vec_interleave (t1, op1, op1, high_p);
50425 ix86_expand_vec_interleave (t2, op2, op2, high_p);
50426 high_p = false;
50428 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
50429 break;
50431 case E_V8SImode:
50432 /* Shuffle the elements between the lanes. After this we
50433 have { A B E F | C D G H } for each operand. */
50434 t1 = gen_reg_rtx (V4DImode);
50435 t2 = gen_reg_rtx (V4DImode);
50436 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
50437 const0_rtx, const2_rtx,
50438 const1_rtx, GEN_INT (3)));
50439 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
50440 const0_rtx, const2_rtx,
50441 const1_rtx, GEN_INT (3)));
50443 /* Shuffle the elements within the lanes. After this we
50444 have { A A B B | C C D D } or { E E F F | G G H H }. */
50445 t3 = gen_reg_rtx (V8SImode);
50446 t4 = gen_reg_rtx (V8SImode);
50447 mask = GEN_INT (high_p
50448 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
50449 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
50450 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
50451 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
50453 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
50454 break;
50456 case E_V8HImode:
50457 case E_V16HImode:
50458 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
50459 uns_p, OPTAB_DIRECT);
50460 t2 = expand_binop (mode,
50461 uns_p ? umul_highpart_optab : smul_highpart_optab,
50462 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
50463 gcc_assert (t1 && t2);
50465 t3 = gen_reg_rtx (mode);
50466 ix86_expand_vec_interleave (t3, t1, t2, high_p);
50467 emit_move_insn (dest, gen_lowpart (wmode, t3));
50468 break;
50470 case E_V16QImode:
50471 case E_V32QImode:
50472 case E_V32HImode:
50473 case E_V16SImode:
50474 case E_V64QImode:
50475 t1 = gen_reg_rtx (wmode);
50476 t2 = gen_reg_rtx (wmode);
50477 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
50478 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
50480 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
50481 break;
50483 default:
50484 gcc_unreachable ();
50488 void
50489 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
50491 rtx res_1, res_2, res_3, res_4;
50493 res_1 = gen_reg_rtx (V4SImode);
50494 res_2 = gen_reg_rtx (V4SImode);
50495 res_3 = gen_reg_rtx (V2DImode);
50496 res_4 = gen_reg_rtx (V2DImode);
50497 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
50498 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
50500 /* Move the results in element 2 down to element 1; we don't care
50501 what goes in elements 2 and 3. Then we can merge the parts
50502 back together with an interleave.
50504 Note that two other sequences were tried:
50505 (1) Use interleaves at the start instead of psrldq, which allows
50506 us to use a single shufps to merge things back at the end.
50507 (2) Use shufps here to combine the two vectors, then pshufd to
50508 put the elements in the correct order.
50509 In both cases the cost of the reformatting stall was too high
50510 and the overall sequence slower. */
50512 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
50513 const0_rtx, const2_rtx,
50514 const0_rtx, const0_rtx));
50515 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
50516 const0_rtx, const2_rtx,
50517 const0_rtx, const0_rtx));
50518 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
50520 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
50523 void
50524 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
50526 machine_mode mode = GET_MODE (op0);
50527 rtx t1, t2, t3, t4, t5, t6;
50529 if (TARGET_AVX512DQ && mode == V8DImode)
50530 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
50531 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
50532 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
50533 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
50534 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
50535 else if (TARGET_XOP && mode == V2DImode)
50537 /* op1: A,B,C,D, op2: E,F,G,H */
50538 op1 = gen_lowpart (V4SImode, op1);
50539 op2 = gen_lowpart (V4SImode, op2);
50541 t1 = gen_reg_rtx (V4SImode);
50542 t2 = gen_reg_rtx (V4SImode);
50543 t3 = gen_reg_rtx (V2DImode);
50544 t4 = gen_reg_rtx (V2DImode);
50546 /* t1: B,A,D,C */
50547 emit_insn (gen_sse2_pshufd_1 (t1, op1,
50548 GEN_INT (1),
50549 GEN_INT (0),
50550 GEN_INT (3),
50551 GEN_INT (2)));
50553 /* t2: (B*E),(A*F),(D*G),(C*H) */
50554 emit_insn (gen_mulv4si3 (t2, t1, op2));
50556 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
50557 emit_insn (gen_xop_phadddq (t3, t2));
50559 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
50560 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
50562 /* Multiply the lower parts and add everything together. */
50563 t5 = gen_reg_rtx (V2DImode);
50564 emit_insn (gen_vec_widen_umult_even_v4si (t5,
50565 gen_lowpart (V4SImode, op1),
50566 gen_lowpart (V4SImode, op2)));
50567 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
50570 else
50572 machine_mode nmode;
50573 rtx (*umul) (rtx, rtx, rtx);
50575 if (mode == V2DImode)
50577 umul = gen_vec_widen_umult_even_v4si;
50578 nmode = V4SImode;
50580 else if (mode == V4DImode)
50582 umul = gen_vec_widen_umult_even_v8si;
50583 nmode = V8SImode;
50585 else if (mode == V8DImode)
50587 umul = gen_vec_widen_umult_even_v16si;
50588 nmode = V16SImode;
50590 else
50591 gcc_unreachable ();
50594 /* Multiply low parts. */
50595 t1 = gen_reg_rtx (mode);
50596 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50598 /* Shift input vectors right 32 bits so we can multiply high parts. */
50599 t6 = GEN_INT (32);
50600 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50601 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50603 /* Multiply high parts by low parts. */
50604 t4 = gen_reg_rtx (mode);
50605 t5 = gen_reg_rtx (mode);
50606 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50607 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50609 /* Combine and shift the highparts back. */
50610 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50611 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50613 /* Combine high and low parts. */
50614 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50617 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50618 gen_rtx_MULT (mode, op1, op2));
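/* Illustrative sketch, not part of GCC: the schoolbook decomposition used
   by the generic path above, which synthesizes a 64x64->64 multiply from
   32x32->64 unsigned widening multiplies.  */
static unsigned long long
ref_mul64_from_32 (unsigned long long a, unsigned long long b)
{
  unsigned long long a_lo = a & 0xffffffffu, a_hi = a >> 32;
  unsigned long long b_lo = b & 0xffffffffu, b_hi = b >> 32;
  unsigned long long t1 = a_lo * b_lo;                 /* Low x low, full 64 bits.  */
  unsigned long long t4 = a_hi * b_lo + b_hi * a_lo;   /* The cross terms.  */
  /* a_hi * b_hi only affects bits 64 and above, so it is dropped.  */
  return t1 + (t4 << 32);
}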
50621 /* Return 1 if control transfer instruction INSN
50622 should be encoded with the bnd prefix.
50623 If INSN is NULL then return 1 when control
50624 transfer instructions should be prefixed with
50625 bnd by default for the current function. */
50627 bool
50628 ix86_bnd_prefixed_insn_p (rtx insn)
50630 /* For call insns check special flag. */
50631 if (insn && CALL_P (insn))
50633 rtx call = get_call_rtx_from (insn);
50634 if (call)
50635 return CALL_EXPR_WITH_BOUNDS_P (call);
50638 /* All other insns are prefixed only if function is instrumented. */
50639 return chkp_function_instrumented_p (current_function_decl);
50642 /* Calculate integer abs() using only SSE2 instructions. */
50644 void
50645 ix86_expand_sse2_abs (rtx target, rtx input)
50647 machine_mode mode = GET_MODE (target);
50648 rtx tmp0, tmp1, x;
50650 switch (mode)
50652 /* For 32-bit signed integer X, the best way to calculate the absolute
50653 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
50654 case E_V4SImode:
50655 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50656 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50657 NULL, 0, OPTAB_DIRECT);
50658 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50659 NULL, 0, OPTAB_DIRECT);
50660 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50661 target, 0, OPTAB_DIRECT);
50662 break;
50664 /* For 16-bit signed integer X, the best way to calculate the absolute
50665 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50666 case E_V8HImode:
50667 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50669 x = expand_simple_binop (mode, SMAX, tmp0, input,
50670 target, 0, OPTAB_DIRECT);
50671 break;
50673 /* For 8-bit signed integer X, the best way to calculate the absolute
50674 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50675 as SSE2 provides the PMINUB insn. */
50676 case E_V16QImode:
50677 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50679 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50680 target, 0, OPTAB_DIRECT);
50681 break;
50683 default:
50684 gcc_unreachable ();
50687 if (x != target)
50688 emit_move_insn (target, x);
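/* Illustrative sketch, not part of GCC: the scalar identities the three
   cases above rely on.  The shifts on negative values follow GCC's
   arithmetic-shift semantics, and abs of the most negative value wraps,
   just as the vector instructions do.  */
static int
ref_abs_v4si (int x)
{
  int s = x >> 31;                      /* 0 or -1.  */
  return (x ^ s) - s;
}

static short
ref_abs_v8hi (short x)
{
  short n = -x;
  return x > n ? x : n;                 /* PMAXSW: max (X, -X).  */
}

static unsigned char
ref_abs_v16qi (signed char x)
{
  unsigned char a = (unsigned char) x, n = (unsigned char) -x;
  return a < n ? a : n;                 /* PMINUB: min ((unsigned char) X, (unsigned char) -X).  */
}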
50691 /* Expand an extract from a vector register through pextr insn.
50692 Return true if successful. */
50694 bool
50695 ix86_expand_pextr (rtx *operands)
50697 rtx dst = operands[0];
50698 rtx src = operands[1];
50700 unsigned int size = INTVAL (operands[2]);
50701 unsigned int pos = INTVAL (operands[3]);
50703 if (SUBREG_P (dst))
50705 /* Reject non-lowpart subregs. */
50706 if (SUBREG_BYTE (dst) > 0)
50707 return false;
50708 dst = SUBREG_REG (dst);
50711 if (SUBREG_P (src))
50713 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50714 src = SUBREG_REG (src);
50717 switch (GET_MODE (src))
50719 case E_V16QImode:
50720 case E_V8HImode:
50721 case E_V4SImode:
50722 case E_V2DImode:
50723 case E_V1TImode:
50724 case E_TImode:
50726 machine_mode srcmode, dstmode;
50727 rtx d, pat;
50729 if (!int_mode_for_size (size, 0).exists (&dstmode))
50730 return false;
50732 switch (dstmode)
50734 case E_QImode:
50735 if (!TARGET_SSE4_1)
50736 return false;
50737 srcmode = V16QImode;
50738 break;
50740 case E_HImode:
50741 if (!TARGET_SSE2)
50742 return false;
50743 srcmode = V8HImode;
50744 break;
50746 case E_SImode:
50747 if (!TARGET_SSE4_1)
50748 return false;
50749 srcmode = V4SImode;
50750 break;
50752 case E_DImode:
50753 gcc_assert (TARGET_64BIT);
50754 if (!TARGET_SSE4_1)
50755 return false;
50756 srcmode = V2DImode;
50757 break;
50759 default:
50760 return false;
50763 /* Reject extractions from misaligned positions. */
50764 if (pos & (size-1))
50765 return false;
50767 if (GET_MODE (dst) == dstmode)
50768 d = dst;
50769 else
50770 d = gen_reg_rtx (dstmode);
50772 /* Construct insn pattern. */
50773 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50774 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
50776 /* Let the rtl optimizers know about the zero extension performed. */
50777 if (dstmode == QImode || dstmode == HImode)
50779 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50780 d = gen_lowpart (SImode, d);
50783 emit_insn (gen_rtx_SET (d, pat));
50785 if (d != dst)
50786 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50787 return true;
50790 default:
50791 return false;
50795 /* Expand an insert into a vector register through pinsr insn.
50796 Return true if successful. */
50798 bool
50799 ix86_expand_pinsr (rtx *operands)
50801 rtx dst = operands[0];
50802 rtx src = operands[3];
50804 unsigned int size = INTVAL (operands[1]);
50805 unsigned int pos = INTVAL (operands[2]);
50807 if (SUBREG_P (dst))
50809 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50810 dst = SUBREG_REG (dst);
50813 switch (GET_MODE (dst))
50815 case E_V16QImode:
50816 case E_V8HImode:
50817 case E_V4SImode:
50818 case E_V2DImode:
50819 case E_V1TImode:
50820 case E_TImode:
50822 machine_mode srcmode, dstmode;
50823 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50824 rtx d;
50826 if (!int_mode_for_size (size, 0).exists (&srcmode))
50827 return false;
50829 switch (srcmode)
50831 case E_QImode:
50832 if (!TARGET_SSE4_1)
50833 return false;
50834 dstmode = V16QImode;
50835 pinsr = gen_sse4_1_pinsrb;
50836 break;
50838 case E_HImode:
50839 if (!TARGET_SSE2)
50840 return false;
50841 dstmode = V8HImode;
50842 pinsr = gen_sse2_pinsrw;
50843 break;
50845 case E_SImode:
50846 if (!TARGET_SSE4_1)
50847 return false;
50848 dstmode = V4SImode;
50849 pinsr = gen_sse4_1_pinsrd;
50850 break;
50852 case E_DImode:
50853 gcc_assert (TARGET_64BIT);
50854 if (!TARGET_SSE4_1)
50855 return false;
50856 dstmode = V2DImode;
50857 pinsr = gen_sse4_1_pinsrq;
50858 break;
50860 default:
50861 return false;
50864 /* Reject insertions to misaligned positions. */
50865 if (pos & (size-1))
50866 return false;
50868 if (SUBREG_P (src))
50870 unsigned int srcpos = SUBREG_BYTE (src);
50872 if (srcpos > 0)
50874 rtx extr_ops[4];
50876 extr_ops[0] = gen_reg_rtx (srcmode);
50877 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50878 extr_ops[2] = GEN_INT (size);
50879 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50881 if (!ix86_expand_pextr (extr_ops))
50882 return false;
50884 src = extr_ops[0];
50886 else
50887 src = gen_lowpart (srcmode, SUBREG_REG (src));
50890 if (GET_MODE (dst) == dstmode)
50891 d = dst;
50892 else
50893 d = gen_reg_rtx (dstmode);
50895 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50896 gen_lowpart (srcmode, src),
50897 GEN_INT (1 << (pos / size))));
50898 if (d != dst)
50899 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50900 return true;
50903 default:
50904 return false;
50908 /* This function returns the calling-ABI-specific va_list type node.
50909 It returns the FNDECL-specific va_list type. */
50911 static tree
50912 ix86_fn_abi_va_list (tree fndecl)
50914 if (!TARGET_64BIT)
50915 return va_list_type_node;
50916 gcc_assert (fndecl != NULL_TREE);
50918 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50919 return ms_va_list_type_node;
50920 else
50921 return sysv_va_list_type_node;
50924 /* Returns the canonical va_list type specified by TYPE. If there
50925 is no valid TYPE provided, it returns NULL_TREE. */
50927 static tree
50928 ix86_canonical_va_list_type (tree type)
50930 if (TARGET_64BIT)
50932 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50933 return ms_va_list_type_node;
50935 if ((TREE_CODE (type) == ARRAY_TYPE
50936 && integer_zerop (array_type_nelts (type)))
50937 || POINTER_TYPE_P (type))
50939 tree elem_type = TREE_TYPE (type);
50940 if (TREE_CODE (elem_type) == RECORD_TYPE
50941 && lookup_attribute ("sysv_abi va_list",
50942 TYPE_ATTRIBUTES (elem_type)))
50943 return sysv_va_list_type_node;
50946 return NULL_TREE;
50949 return std_canonical_va_list_type (type);
50952 /* Iterate through the target-specific builtin types for va_list.
50953 IDX denotes the iterator, *PTREE is set to the result type of
50954 the va_list builtin, and *PNAME to its internal type.
50955 Returns zero if there is no element for this index, otherwise
50956 IDX should be increased upon the next call.
50957 Note, do not iterate a base builtin's name like __builtin_va_list.
50958 Used from c_common_nodes_and_builtins. */
50960 static int
50961 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50963 if (TARGET_64BIT)
50965 switch (idx)
50967 default:
50968 break;
50970 case 0:
50971 *ptree = ms_va_list_type_node;
50972 *pname = "__builtin_ms_va_list";
50973 return 1;
50975 case 1:
50976 *ptree = sysv_va_list_type_node;
50977 *pname = "__builtin_sysv_va_list";
50978 return 1;
50982 return 0;
50985 #undef TARGET_SCHED_DISPATCH
50986 #define TARGET_SCHED_DISPATCH has_dispatch
50987 #undef TARGET_SCHED_DISPATCH_DO
50988 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50989 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50990 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50991 #undef TARGET_SCHED_REORDER
50992 #define TARGET_SCHED_REORDER ix86_sched_reorder
50993 #undef TARGET_SCHED_ADJUST_PRIORITY
50994 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50995 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50996 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50997 ix86_dependencies_evaluation_hook
50999 /* The size of the dispatch window is the total number of bytes of
51000 object code allowed in a window. */
51001 #define DISPATCH_WINDOW_SIZE 16
51003 /* Number of dispatch windows considered for scheduling. */
51004 #define MAX_DISPATCH_WINDOWS 3
51006 /* Maximum number of instructions in a window. */
51007 #define MAX_INSN 4
51009 /* Maximum number of immediate operands in a window. */
51010 #define MAX_IMM 4
51012 /* Maximum number of immediate bits allowed in a window. */
51013 #define MAX_IMM_SIZE 128
51015 /* Maximum number of 32 bit immediates allowed in a window. */
51016 #define MAX_IMM_32 4
51018 /* Maximum number of 64 bit immediates allowed in a window. */
51019 #define MAX_IMM_64 2
51021 /* Maximum total of loads or prefetches allowed in a window. */
51022 #define MAX_LOAD 2
51024 /* Maximum total of stores allowed in a window. */
51025 #define MAX_STORE 1
51027 #undef BIG
51028 #define BIG 100
51031 /* Dispatch groups.  Instructions that affect the mix in a dispatch window. */
51032 enum dispatch_group {
51033 disp_no_group = 0,
51034 disp_load,
51035 disp_store,
51036 disp_load_store,
51037 disp_prefetch,
51038 disp_imm,
51039 disp_imm_32,
51040 disp_imm_64,
51041 disp_branch,
51042 disp_cmp,
51043 disp_jcc,
51044 disp_last
51047 /* Number of allowable groups in a dispatch window. It is an array
51048 indexed by dispatch_group enum. 100 is used as a big number,
51049 because the number of these kinds of operations does not have any
51050 effect on the dispatch window, but we need them for other reasons in
51051 the table. */
51052 static unsigned int num_allowable_groups[disp_last] = {
51053 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
51056 char group_name[disp_last + 1][16] = {
51057 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
51058 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
51059 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
51062 /* Instruction path. */
51063 enum insn_path {
51064 no_path = 0,
51065 path_single, /* Single micro op. */
51066 path_double, /* Double micro op. */
51067 path_multi, /* Instructions with more than 2 micro ops. */
51068 last_path
51071 /* sched_insn_info defines a window to the instructions scheduled in
51072 the basic block. It contains a pointer to the insn_info table and
51073 the instruction scheduled.
51075 Windows are allocated for each basic block and are linked
51076 together. */
51077 typedef struct sched_insn_info_s {
51078 rtx insn;
51079 enum dispatch_group group;
51080 enum insn_path path;
51081 int byte_len;
51082 int imm_bytes;
51083 } sched_insn_info;
51085 /* Linked list of dispatch windows. This is a two way list of
51086 dispatch windows of a basic block. It contains information about
51087 the number of uops in the window and the total number of
51088 instructions and of bytes in the object code for this dispatch
51089 window. */
51090 typedef struct dispatch_windows_s {
51091 int num_insn; /* Number of insn in the window. */
51092 int num_uops; /* Number of uops in the window. */
51093 int window_size; /* Number of bytes in the window. */
51094 int window_num; /* Window number, either 0 or 1. */
51095 int num_imm; /* Number of immediates in an insn. */
51096 int num_imm_32; /* Number of 32 bit immediates in an insn. */
51097 int num_imm_64; /* Number of 64 bit immediates in an insn. */
51098 int imm_size; /* Total immediates in the window. */
51099 int num_loads; /* Total memory loads in the window. */
51100 int num_stores; /* Total memory stores in the window. */
51101 int violation; /* Violation exists in window. */
51102 sched_insn_info *window; /* Pointer to the window. */
51103 struct dispatch_windows_s *next;
51104 struct dispatch_windows_s *prev;
51105 } dispatch_windows;
51107 /* Immediate values used in an insn. */
51108 typedef struct imm_info_s
51110 int imm;
51111 int imm32;
51112 int imm64;
51113 } imm_info;
51115 static dispatch_windows *dispatch_window_list;
51116 static dispatch_windows *dispatch_window_list1;
51118 /* Get dispatch group of insn. */
51120 static enum dispatch_group
51121 get_mem_group (rtx_insn *insn)
51123 enum attr_memory memory;
51125 if (INSN_CODE (insn) < 0)
51126 return disp_no_group;
51127 memory = get_attr_memory (insn);
51128 if (memory == MEMORY_STORE)
51129 return disp_store;
51131 if (memory == MEMORY_LOAD)
51132 return disp_load;
51134 if (memory == MEMORY_BOTH)
51135 return disp_load_store;
51137 return disp_no_group;
51140 /* Return true if insn is a compare instruction. */
51142 static bool
51143 is_cmp (rtx_insn *insn)
51145 enum attr_type type;
51147 type = get_attr_type (insn);
51148 return (type == TYPE_TEST
51149 || type == TYPE_ICMP
51150 || type == TYPE_FCMP
51151 || GET_CODE (PATTERN (insn)) == COMPARE);
51154 /* Return true if a dispatch violation was encountered. */
51156 static bool
51157 dispatch_violation (void)
51159 if (dispatch_window_list->next)
51160 return dispatch_window_list->next->violation;
51161 return dispatch_window_list->violation;
51164 /* Return true if insn is a branch instruction. */
51166 static bool
51167 is_branch (rtx_insn *insn)
51169 return (CALL_P (insn) || JUMP_P (insn));
51172 /* Return true if insn is a prefetch instruction. */
51174 static bool
51175 is_prefetch (rtx_insn *insn)
51177 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
51180 /* This function initializes a dispatch window and the list container holding a
51181 pointer to the window. */
51183 static void
51184 init_window (int window_num)
51186 int i;
51187 dispatch_windows *new_list;
51189 if (window_num == 0)
51190 new_list = dispatch_window_list;
51191 else
51192 new_list = dispatch_window_list1;
51194 new_list->num_insn = 0;
51195 new_list->num_uops = 0;
51196 new_list->window_size = 0;
51197 new_list->next = NULL;
51198 new_list->prev = NULL;
51199 new_list->window_num = window_num;
51200 new_list->num_imm = 0;
51201 new_list->num_imm_32 = 0;
51202 new_list->num_imm_64 = 0;
51203 new_list->imm_size = 0;
51204 new_list->num_loads = 0;
51205 new_list->num_stores = 0;
51206 new_list->violation = false;
51208 for (i = 0; i < MAX_INSN; i++)
51210 new_list->window[i].insn = NULL;
51211 new_list->window[i].group = disp_no_group;
51212 new_list->window[i].path = no_path;
51213 new_list->window[i].byte_len = 0;
51214 new_list->window[i].imm_bytes = 0;
51216 return;
51219 /* This function allocates and initializes a dispatch window and the
51220 list container holding a pointer to the window. */
51222 static dispatch_windows *
51223 allocate_window (void)
51225 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
51226 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
51228 return new_list;
51231 /* This routine initializes the dispatch scheduling information. It
51232 initiates building dispatch scheduler tables and constructs the
51233 first dispatch window. */
51235 static void
51236 init_dispatch_sched (void)
51238 /* Allocate a dispatch list and a window. */
51239 dispatch_window_list = allocate_window ();
51240 dispatch_window_list1 = allocate_window ();
51241 init_window (0);
51242 init_window (1);
51245 /* This function returns true if a branch is detected. End of a basic block
51246 does not have to be a branch, but here we assume only branches end a
51247 window. */
51249 static bool
51250 is_end_basic_block (enum dispatch_group group)
51252 return group == disp_branch;
51255 /* This function is called when the end of window processing is reached. */
51257 static void
51258 process_end_window (void)
51260 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
51261 if (dispatch_window_list->next)
51263 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
51264 gcc_assert (dispatch_window_list->window_size
51265 + dispatch_window_list1->window_size <= 48);
51266 init_window (1);
51268 init_window (0);
51271 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
51272 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
51273 for 48 bytes of instructions. Note that these windows are not dispatch
51274 windows whose sizes are DISPATCH_WINDOW_SIZE. */
51276 static dispatch_windows *
51277 allocate_next_window (int window_num)
51279 if (window_num == 0)
51281 if (dispatch_window_list->next)
51282 init_window (1);
51283 init_window (0);
51284 return dispatch_window_list;
51287 dispatch_window_list->next = dispatch_window_list1;
51288 dispatch_window_list1->prev = dispatch_window_list;
51290 return dispatch_window_list1;
51293 /* Compute number of immediate operands of an instruction. */
51295 static void
51296 find_constant (rtx in_rtx, imm_info *imm_values)
51298 if (INSN_P (in_rtx))
51299 in_rtx = PATTERN (in_rtx);
51300 subrtx_iterator::array_type array;
51301 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
51302 if (const_rtx x = *iter)
51303 switch (GET_CODE (x))
51305 case CONST:
51306 case SYMBOL_REF:
51307 case CONST_INT:
51308 (imm_values->imm)++;
51309 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
51310 (imm_values->imm32)++;
51311 else
51312 (imm_values->imm64)++;
51313 break;
51315 case CONST_DOUBLE:
51316 case CONST_WIDE_INT:
51317 (imm_values->imm)++;
51318 (imm_values->imm64)++;
51319 break;
51321 case CODE_LABEL:
51322 if (LABEL_KIND (x) == LABEL_NORMAL)
51324 (imm_values->imm)++;
51325 (imm_values->imm32)++;
51327 break;
51329 default:
51330 break;
51334 /* Return total size of immediate operands of an instruction along with number
51335 of corresponding immediate operands. It initializes its parameters to zero
51336 before calling FIND_CONSTANT.
51337 INSN is the input instruction. IMM is the total number of immediates.
51338 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
51339 bit immediates. */
51341 static int
51342 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
51344 imm_info imm_values = {0, 0, 0};
51346 find_constant (insn, &imm_values);
51347 *imm = imm_values.imm;
51348 *imm32 = imm_values.imm32;
51349 *imm64 = imm_values.imm64;
51350 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
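/* Illustrative example (not part of the original sources): for an insn
   like "movl $100, %eax" FIND_CONSTANT sees one CONST_INT that fits in
   SImode, so *IMM = 1, *IMM32 = 1, *IMM64 = 0 and the function returns 4;
   a "movabsq $0x1122334455667788, %rax" style insn instead counts as one
   64-bit immediate and the return value is 8.  */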
51353 /* This function indicates whether an operand of an instruction is an
51354 immediate. */
51356 static bool
51357 has_immediate (rtx_insn *insn)
51359 int num_imm_operand;
51360 int num_imm32_operand;
51361 int num_imm64_operand;
51363 if (insn)
51364 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51365 &num_imm64_operand);
51366 return false;
51369 /* Return single or double path for instructions. */
51371 static enum insn_path
51372 get_insn_path (rtx_insn *insn)
51374 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
51376 if ((int)path == 0)
51377 return path_single;
51379 if ((int)path == 1)
51380 return path_double;
51382 return path_multi;
51385 /* Return insn dispatch group. */
51387 static enum dispatch_group
51388 get_insn_group (rtx_insn *insn)
51390 enum dispatch_group group = get_mem_group (insn);
51391 if (group)
51392 return group;
51394 if (is_branch (insn))
51395 return disp_branch;
51397 if (is_cmp (insn))
51398 return disp_cmp;
51400 if (has_immediate (insn))
51401 return disp_imm;
51403 if (is_prefetch (insn))
51404 return disp_prefetch;
51406 return disp_no_group;
51409 /* Count number of GROUP restricted instructions in a dispatch
51410 window WINDOW_LIST. */
51412 static int
51413 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
51415 enum dispatch_group group = get_insn_group (insn);
51416 int imm_size;
51417 int num_imm_operand;
51418 int num_imm32_operand;
51419 int num_imm64_operand;
51421 if (group == disp_no_group)
51422 return 0;
51424 if (group == disp_imm)
51426 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51427 &num_imm64_operand);
51428 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
51429 || num_imm_operand + window_list->num_imm > MAX_IMM
51430 || (num_imm32_operand > 0
51431 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
51432 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
51433 || (num_imm64_operand > 0
51434 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
51435 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
51436 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
51437 && num_imm64_operand > 0
51438 && ((window_list->num_imm_64 > 0
51439 && window_list->num_insn >= 2)
51440 || window_list->num_insn >= 3)))
51441 return BIG;
51443 return 1;
51446 if ((group == disp_load_store
51447 && (window_list->num_loads >= MAX_LOAD
51448 || window_list->num_stores >= MAX_STORE))
51449 || ((group == disp_load
51450 || group == disp_prefetch)
51451 && window_list->num_loads >= MAX_LOAD)
51452 || (group == disp_store
51453 && window_list->num_stores >= MAX_STORE))
51454 return BIG;
51456 return 1;
51459 /* This function returns true if insn satisfies dispatch rules on the
51460 last window scheduled. */
51462 static bool
51463 fits_dispatch_window (rtx_insn *insn)
51465 dispatch_windows *window_list = dispatch_window_list;
51466 dispatch_windows *window_list_next = dispatch_window_list->next;
51467 unsigned int num_restrict;
51468 enum dispatch_group group = get_insn_group (insn);
51469 enum insn_path path = get_insn_path (insn);
51470 int sum;
51472 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
51473 instructions should be given the lowest priority in the
51474 scheduling process in Haifa scheduler to make sure they will be
51475 scheduled in the same dispatch window as the reference to them. */
51476 if (group == disp_jcc || group == disp_cmp)
51477 return false;
51479 /* Check nonrestricted. */
51480 if (group == disp_no_group || group == disp_branch)
51481 return true;
51483 /* Get last dispatch window. */
51484 if (window_list_next)
51485 window_list = window_list_next;
51487 if (window_list->window_num == 1)
51489 sum = window_list->prev->window_size + window_list->window_size;
51491 if (sum == 32
51492 || (min_insn_size (insn) + sum) >= 48)
51493 /* Window 1 is full. Go for next window. */
51494 return true;
51497 num_restrict = count_num_restricted (insn, window_list);
51499 if (num_restrict > num_allowable_groups[group])
51500 return false;
51502 /* See if it fits in the first window. */
51503 if (window_list->window_num == 0)
51505 /* The first window should have only single and double path
51506 uops. */
51507 if (path == path_double
51508 && (window_list->num_uops + 2) > MAX_INSN)
51509 return false;
51510 else if (path != path_single)
51511 return false;
51513 return true;
51516 /* Add an instruction INSN with NUM_UOPS micro-operations to the
51517 dispatch window WINDOW_LIST. */
51519 static void
51520 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
51522 int byte_len = min_insn_size (insn);
51523 int num_insn = window_list->num_insn;
51524 int imm_size;
51525 sched_insn_info *window = window_list->window;
51526 enum dispatch_group group = get_insn_group (insn);
51527 enum insn_path path = get_insn_path (insn);
51528 int num_imm_operand;
51529 int num_imm32_operand;
51530 int num_imm64_operand;
51532 if (!window_list->violation && group != disp_cmp
51533 && !fits_dispatch_window (insn))
51534 window_list->violation = true;
51536 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51537 &num_imm64_operand);
51539 /* Initialize window with new instruction. */
51540 window[num_insn].insn = insn;
51541 window[num_insn].byte_len = byte_len;
51542 window[num_insn].group = group;
51543 window[num_insn].path = path;
51544 window[num_insn].imm_bytes = imm_size;
51546 window_list->window_size += byte_len;
51547 window_list->num_insn = num_insn + 1;
51548 window_list->num_uops = window_list->num_uops + num_uops;
51549 window_list->imm_size += imm_size;
51550 window_list->num_imm += num_imm_operand;
51551 window_list->num_imm_32 += num_imm32_operand;
51552 window_list->num_imm_64 += num_imm64_operand;
51554 if (group == disp_store)
51555 window_list->num_stores += 1;
51556 else if (group == disp_load
51557 || group == disp_prefetch)
51558 window_list->num_loads += 1;
51559 else if (group == disp_load_store)
51561 window_list->num_stores += 1;
51562 window_list->num_loads += 1;
51566 /* Adds a scheduled instruction, INSN, to the current dispatch window.
51567 If the total bytes of instructions or the number of instructions in
51568 the window exceed allowable, it allocates a new window. */
51570 static void
51571 add_to_dispatch_window (rtx_insn *insn)
51573 int byte_len;
51574 dispatch_windows *window_list;
51575 dispatch_windows *next_list;
51576 dispatch_windows *window0_list;
51577 enum insn_path path;
51578 enum dispatch_group insn_group;
51579 bool insn_fits;
51580 int num_insn;
51581 int num_uops;
51582 int window_num;
51583 int insn_num_uops;
51584 int sum;
51586 if (INSN_CODE (insn) < 0)
51587 return;
51589 byte_len = min_insn_size (insn);
51590 window_list = dispatch_window_list;
51591 next_list = window_list->next;
51592 path = get_insn_path (insn);
51593 insn_group = get_insn_group (insn);
51595 /* Get the last dispatch window. */
51596 if (next_list)
51597 window_list = dispatch_window_list->next;
51599 if (path == path_single)
51600 insn_num_uops = 1;
51601 else if (path == path_double)
51602 insn_num_uops = 2;
51603 else
51604 insn_num_uops = (int) path;
51606 /* If current window is full, get a new window.
51607 Window number zero is full if MAX_INSN uops are scheduled in it.
51608 Window number one is full if window zero's bytes plus window
51609 one's bytes is 32, or if the bytes of the new instruction added
51610 to the total make it 48 or more, or if it already has MAX_INSN
51611 instructions in it. */
51612 num_insn = window_list->num_insn;
51613 num_uops = window_list->num_uops;
51614 window_num = window_list->window_num;
51615 insn_fits = fits_dispatch_window (insn);
51617 if (num_insn >= MAX_INSN
51618 || num_uops + insn_num_uops > MAX_INSN
51619 || !(insn_fits))
51621 window_num = ~window_num & 1;
51622 window_list = allocate_next_window (window_num);
51625 if (window_num == 0)
51627 add_insn_window (insn, window_list, insn_num_uops);
51628 if (window_list->num_insn >= MAX_INSN
51629 && insn_group == disp_branch)
51631 process_end_window ();
51632 return;
51635 else if (window_num == 1)
51637 window0_list = window_list->prev;
51638 sum = window0_list->window_size + window_list->window_size;
51639 if (sum == 32
51640 || (byte_len + sum) >= 48)
51642 process_end_window ();
51643 window_list = dispatch_window_list;
51646 add_insn_window (insn, window_list, insn_num_uops);
51648 else
51649 gcc_unreachable ();
51651 if (is_end_basic_block (insn_group))
51653 /* End of basic block is reached; do end-of-basic-block processing. */
51654 process_end_window ();
51655 return;
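/* Worked example of the bookkeeping above (illustrative numbers only): if
   window 0 already holds 22 bytes and window 1 holds 10, SUM is 32, so
   window 1 is treated as full, process_end_window () reinitializes both
   windows and the incoming insn is added to a fresh window 0.  */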
51659 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51661 DEBUG_FUNCTION static void
51662 debug_dispatch_window_file (FILE *file, int window_num)
51664 dispatch_windows *list;
51665 int i;
51667 if (window_num == 0)
51668 list = dispatch_window_list;
51669 else
51670 list = dispatch_window_list1;
51672 fprintf (file, "Window #%d:\n", list->window_num);
51673 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51674 list->num_insn, list->num_uops, list->window_size);
51675 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51676 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51678 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51679 list->num_stores);
51680 fprintf (file, " insn info:\n");
51682 for (i = 0; i < MAX_INSN; i++)
51684 if (!list->window[i].insn)
51685 break;
51686 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51687 i, group_name[list->window[i].group],
51688 i, (void *)list->window[i].insn,
51689 i, list->window[i].path,
51690 i, list->window[i].byte_len,
51691 i, list->window[i].imm_bytes);
51695 /* Print to stdout a dispatch window. */
51697 DEBUG_FUNCTION void
51698 debug_dispatch_window (int window_num)
51700 debug_dispatch_window_file (stdout, window_num);
51703 /* Print INSN dispatch information to FILE. */
51705 DEBUG_FUNCTION static void
51706 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51708 int byte_len;
51709 enum insn_path path;
51710 enum dispatch_group group;
51711 int imm_size;
51712 int num_imm_operand;
51713 int num_imm32_operand;
51714 int num_imm64_operand;
51716 if (INSN_CODE (insn) < 0)
51717 return;
51719 byte_len = min_insn_size (insn);
51720 path = get_insn_path (insn);
51721 group = get_insn_group (insn);
51722 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51723 &num_imm64_operand);
51725 fprintf (file, " insn info:\n");
51726 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51727 group_name[group], path, byte_len);
51728 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51729 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51732 /* Print to STDERR the status of the ready list with respect to
51733 dispatch windows. */
51735 DEBUG_FUNCTION void
51736 debug_ready_dispatch (void)
51738 int i;
51739 int no_ready = number_in_ready ();
51741 fprintf (stdout, "Number of ready: %d\n", no_ready);
51743 for (i = 0; i < no_ready; i++)
51744 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51747 /* This routine is the driver of the dispatch scheduler. */
51749 static void
51750 do_dispatch (rtx_insn *insn, int mode)
51752 if (mode == DISPATCH_INIT)
51753 init_dispatch_sched ();
51754 else if (mode == ADD_TO_DISPATCH_WINDOW)
51755 add_to_dispatch_window (insn);
51758 /* Return TRUE if Dispatch Scheduling is supported. */
51760 static bool
51761 has_dispatch (rtx_insn *insn, int action)
51763 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51764 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51765 switch (action)
51767 default:
51768 return false;
51770 case IS_DISPATCH_ON:
51771 return true;
51773 case IS_CMP:
51774 return is_cmp (insn);
51776 case DISPATCH_VIOLATION:
51777 return dispatch_violation ();
51779 case FITS_DISPATCH_WINDOW:
51780 return fits_dispatch_window (insn);
51783 return false;
51786 /* Implementation of reassociation_width target hook used by
51787 reassoc phase to identify parallelism level in reassociated
51788 tree. Statements tree_code is passed in OPC. Arguments type
51789 is passed in MODE.
51791 Currently parallel reassociation is enabled for Atom
51792 processors only and we set reassociation width to be 2
51793 because Atom may issue up to 2 instructions per cycle.
51795 Return value should be fixed if parallel reassociation is
51796 enabled for other processors. */
51798 static int
51799 ix86_reassociation_width (unsigned int, machine_mode mode)
51801 /* Vector part. */
51802 if (VECTOR_MODE_P (mode))
51804 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51805 return 2;
51806 else
51807 return 1;
51810 /* Scalar part. */
51811 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51812 return 2;
51813 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51814 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
51815 else
51816 return 1;
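/* For example (a sketch of the effect, assuming the corresponding
   X86_TUNE_REASSOC_* flags are set for the active tuning): a chain of
   DFmode additions is reassociated 4 ways on 64-bit Haswell and 2 ways
   on other tunings that enable FP reassociation, while integer chains
   use width 2 wherever TARGET_REASSOC_INT_TO_PARALLEL holds.  */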
51819 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51820 place emms and femms instructions. */
51822 static machine_mode
51823 ix86_preferred_simd_mode (scalar_mode mode)
51825 if (!TARGET_SSE)
51826 return word_mode;
51828 switch (mode)
51830 case E_QImode:
51831 return TARGET_AVX512BW ? V64QImode :
51832 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51833 case E_HImode:
51834 return TARGET_AVX512BW ? V32HImode :
51835 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51836 case E_SImode:
51837 return TARGET_AVX512F ? V16SImode :
51838 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51839 case E_DImode:
51840 return TARGET_AVX512F ? V8DImode :
51841 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51843 case E_SFmode:
51844 if (TARGET_AVX512F)
51845 return V16SFmode;
51846 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51847 return V8SFmode;
51848 else
51849 return V4SFmode;
51851 case E_DFmode:
51852 if (TARGET_AVX512F)
51853 return V8DFmode;
51854 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51855 return V4DFmode;
51856 else if (TARGET_SSE2)
51857 return V2DFmode;
51858 /* FALLTHRU */
51860 default:
51861 return word_mode;
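/* For instance (illustrative): with -mavx2 (and 128-bit AVX not
   preferred) SImode data is vectorized in V8SImode and SFmode in
   V8SFmode; with -mavx512f the same scalars map to V16SImode and
   V16SFmode, and without any SSE support the vectorizer falls back to
   word_mode.  */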
51865 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
51866 vectors. If AVX512F is enabled then try vectorizing with 512bit,
51867 256bit and 128bit vectors. */
51869 static unsigned int
51870 ix86_autovectorize_vector_sizes (void)
51872 return TARGET_AVX512F ? 64 | 32 | 16 :
51873 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
51876 /* Implementation of targetm.vectorize.get_mask_mode. */
51878 static opt_machine_mode
51879 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51881 unsigned elem_size = vector_size / nunits;
51883 /* Scalar mask case. */
51884 if ((TARGET_AVX512F && vector_size == 64)
51885 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51887 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51888 return smallest_int_mode_for_size (nunits);
51891 scalar_int_mode elem_mode
51892 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
51894 gcc_assert (elem_size * nunits == vector_size);
51896 return mode_for_vector (elem_mode, nunits);
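/* Example of the two cases above (illustrative): a 64-byte V16SF
   comparison under AVX-512F uses a scalar HImode mask (one bit per
   element in a k-register), whereas a 16-byte V4SF comparison without
   AVX-512VL falls through to a V4SImode vector mask.  */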
51901 /* Return class of registers which could be used for pseudo of MODE
51902 and of class RCLASS for spilling instead of memory. Return NO_REGS
51903 if it is not possible or not profitable. */
51905 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51907 static reg_class_t
51908 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51910 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51911 && TARGET_SSE2
51912 && TARGET_INTER_UNIT_MOVES_TO_VEC
51913 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51914 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51915 && INTEGER_CLASS_P (rclass))
51916 return ALL_SSE_REGS;
51917 return NO_REGS;
51920 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51921 but returns a lower bound. */
51923 static unsigned int
51924 ix86_max_noce_ifcvt_seq_cost (edge e)
51926 bool predictable_p = predictable_edge_p (e);
51928 enum compiler_param param
51929 = (predictable_p
51930 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51931 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51933 /* If we have a parameter set, use that, otherwise take a guess using
51934 BRANCH_COST. */
51935 if (global_options_set.x_param_values[param])
51936 return PARAM_VALUE (param);
51937 else
51938 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
51941 /* Return true if SEQ is a good candidate as a replacement for the
51942 if-convertible sequence described in IF_INFO. */
51944 static bool
51945 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51947 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51949 int cmov_cnt = 0;
51950 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51951 Maybe we should allow even more conditional moves as long as they
51952 are used far enough not to stall the CPU, or also consider
51953 IF_INFO->TEST_BB succ edge probabilities. */
51954 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51956 rtx set = single_set (insn);
51957 if (!set)
51958 continue;
51959 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51960 continue;
51961 rtx src = SET_SRC (set);
51962 machine_mode mode = GET_MODE (src);
51963 if (GET_MODE_CLASS (mode) != MODE_INT
51964 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51965 continue;
51966 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51967 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51968 continue;
51969 /* insn is CMOV or FCMOV. */
51970 if (++cmov_cnt > 1)
51971 return false;
51974 return default_noce_conversion_profitable_p (seq, if_info);
51977 /* Implement targetm.vectorize.init_cost. */
51979 static void *
51980 ix86_init_cost (struct loop *)
51982 unsigned *cost = XNEWVEC (unsigned, 3);
51983 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51984 return cost;
51987 /* Implement targetm.vectorize.add_stmt_cost. */
51989 static unsigned
51990 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51991 struct _stmt_vec_info *stmt_info, int misalign,
51992 enum vect_cost_model_location where)
51994 unsigned *cost = (unsigned *) data;
51995 unsigned retval = 0;
51997 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51998 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
52000 /* Penalize DFmode vector operations for Bonnell. */
52001 if (TARGET_BONNELL && kind == vector_stmt
52002 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
52003 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
52005 /* Statements in an inner loop relative to the loop being
52006 vectorized are weighted more heavily. The value here is
52007 arbitrary and could potentially be improved with analysis. */
52008 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
52009 count *= 50; /* FIXME. */
52011 retval = (unsigned) (count * stmt_cost);
52013 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
52014 for Silvermont as it has an out-of-order integer pipeline and can execute
52015 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
52016 if ((TARGET_SILVERMONT || TARGET_INTEL)
52017 && stmt_info && stmt_info->stmt)
52019 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
52020 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
52021 retval = (retval * 17) / 10;
52024 cost[where] += retval;
52026 return retval;
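/* Worked example (illustrative numbers): a vector_stmt with builtin cost
   4 and COUNT 1 that sits in the inner loop of an outer-loop
   vectorization is charged 1 * 50 * 4 = 200; on Silvermont or generic
   Intel tuning an integer-typed statement is additionally scaled by
   17/10.  */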
52029 /* Implement targetm.vectorize.finish_cost. */
52031 static void
52032 ix86_finish_cost (void *data, unsigned *prologue_cost,
52033 unsigned *body_cost, unsigned *epilogue_cost)
52035 unsigned *cost = (unsigned *) data;
52036 *prologue_cost = cost[vect_prologue];
52037 *body_cost = cost[vect_body];
52038 *epilogue_cost = cost[vect_epilogue];
52041 /* Implement targetm.vectorize.destroy_cost_data. */
52043 static void
52044 ix86_destroy_cost_data (void *data)
52046 free (data);
52049 /* Validate target specific memory model bits in VAL. */
52051 static unsigned HOST_WIDE_INT
52052 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
52054 enum memmodel model = memmodel_from_int (val);
52055 bool strong;
52057 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
52058 |MEMMODEL_MASK)
52059 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
52061 warning (OPT_Winvalid_memory_model,
52062 "Unknown architecture specific memory model");
52063 return MEMMODEL_SEQ_CST;
52065 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
52066 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
52068 warning (OPT_Winvalid_memory_model,
52069 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
52070 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
52072 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
52074 warning (OPT_Winvalid_memory_model,
52075 "HLE_RELEASE not used with RELEASE or stronger memory model");
52076 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
52078 return val;
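/* User-level sketch of the pairing this check enforces (the
   __ATOMIC_HLE_* macros are the documented x86 extensions; LOCK is a
   made-up variable):

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Combining HLE_ACQUIRE with a model weaker than acquire, or HLE_RELEASE
   with one weaker than release, is diagnosed and falls back to SEQ_CST
   above.  */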
52081 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
52082 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
52083 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
52084 or number of vecsize_mangle variants that should be emitted. */
52086 static int
52087 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
52088 struct cgraph_simd_clone *clonei,
52089 tree base_type, int num)
52091 int ret = 1;
52093 if (clonei->simdlen
52094 && (clonei->simdlen < 2
52095 || clonei->simdlen > 1024
52096 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
52098 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
52099 "unsupported simdlen %d", clonei->simdlen);
52100 return 0;
52103 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
52104 if (TREE_CODE (ret_type) != VOID_TYPE)
52105 switch (TYPE_MODE (ret_type))
52107 case E_QImode:
52108 case E_HImode:
52109 case E_SImode:
52110 case E_DImode:
52111 case E_SFmode:
52112 case E_DFmode:
52113 /* case E_SCmode: */
52114 /* case E_DCmode: */
52115 break;
52116 default:
52117 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
52118 "unsupported return type %qT for simd\n", ret_type);
52119 return 0;
52122 tree t;
52123 int i;
52125 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
52126 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
52127 switch (TYPE_MODE (TREE_TYPE (t)))
52129 case E_QImode:
52130 case E_HImode:
52131 case E_SImode:
52132 case E_DImode:
52133 case E_SFmode:
52134 case E_DFmode:
52135 /* case E_SCmode: */
52136 /* case E_DCmode: */
52137 break;
52138 default:
52139 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
52140 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
52141 return 0;
52144 if (clonei->cilk_elemental)
52146 /* Parse the processor clause here. If not present, default to 'b'. */
52147 clonei->vecsize_mangle = 'b';
52149 else if (!TREE_PUBLIC (node->decl))
52151 /* If the function isn't exported, we can pick up just one ISA
52152 for the clones. */
52153 if (TARGET_AVX512F)
52154 clonei->vecsize_mangle = 'e';
52155 else if (TARGET_AVX2)
52156 clonei->vecsize_mangle = 'd';
52157 else if (TARGET_AVX)
52158 clonei->vecsize_mangle = 'c';
52159 else
52160 clonei->vecsize_mangle = 'b';
52161 ret = 1;
52163 else
52165 clonei->vecsize_mangle = "bcde"[num];
52166 ret = 4;
52168 clonei->mask_mode = VOIDmode;
52169 switch (clonei->vecsize_mangle)
52171 case 'b':
52172 clonei->vecsize_int = 128;
52173 clonei->vecsize_float = 128;
52174 break;
52175 case 'c':
52176 clonei->vecsize_int = 128;
52177 clonei->vecsize_float = 256;
52178 break;
52179 case 'd':
52180 clonei->vecsize_int = 256;
52181 clonei->vecsize_float = 256;
52182 break;
52183 case 'e':
52184 clonei->vecsize_int = 512;
52185 clonei->vecsize_float = 512;
52186 if (TYPE_MODE (base_type) == QImode)
52187 clonei->mask_mode = DImode;
52188 else
52189 clonei->mask_mode = SImode;
52190 break;
52192 if (clonei->simdlen == 0)
52194 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
52195 clonei->simdlen = clonei->vecsize_int;
52196 else
52197 clonei->simdlen = clonei->vecsize_float;
52198 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
52200 else if (clonei->simdlen > 16)
52202 /* For compatibility with ICC, use the same upper bounds
52203 for simdlen. In particular, for CTYPE below, use the return type,
52204 unless the function returns void, in which case use the characteristic
52205 type. If it is possible for given SIMDLEN to pass CTYPE value
52206 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
52207 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
52208 emit corresponding clone. */
52209 tree ctype = ret_type;
52210 if (TREE_CODE (ret_type) == VOID_TYPE)
52211 ctype = base_type;
52212 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
52213 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
52214 cnt /= clonei->vecsize_int;
52215 else
52216 cnt /= clonei->vecsize_float;
52217 if (cnt > (TARGET_64BIT ? 16 : 8))
52219 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
52220 "unsupported simdlen %d", clonei->simdlen);
52221 return 0;
52224 return ret;
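/* Illustrative user-level input (an assumption for exposition, not part
   of this file): for an exported function such as

     #pragma omp declare simd
     float scale (float x, float y) { return x * y; }

   the hook above asks for all four vecsize_mangle variants 'b', 'c', 'd'
   and 'e' (SSE2, AVX, AVX2 and AVX-512 clones), while a function with
   internal linkage only gets the single variant the current ISA options
   already support.  */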
52227 /* Add target attribute to SIMD clone NODE if needed. */
52229 static void
52230 ix86_simd_clone_adjust (struct cgraph_node *node)
52232 const char *str = NULL;
52233 gcc_assert (node->decl == cfun->decl);
52234 switch (node->simdclone->vecsize_mangle)
52236 case 'b':
52237 if (!TARGET_SSE2)
52238 str = "sse2";
52239 break;
52240 case 'c':
52241 if (!TARGET_AVX)
52242 str = "avx";
52243 break;
52244 case 'd':
52245 if (!TARGET_AVX2)
52246 str = "avx2";
52247 break;
52248 case 'e':
52249 if (!TARGET_AVX512F)
52250 str = "avx512f";
52251 break;
52252 default:
52253 gcc_unreachable ();
52255 if (str == NULL)
52256 return;
52257 push_cfun (NULL);
52258 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
52259 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
52260 gcc_assert (ok);
52261 pop_cfun ();
52262 ix86_reset_previous_fndecl ();
52263 ix86_set_current_function (node->decl);
52266 /* If SIMD clone NODE can't be used in a vectorized loop
52267 in current function, return -1, otherwise return a badness of using it
52268 (0 if it is most desirable from vecsize_mangle point of view, 1
52269 slightly less desirable, etc.). */
52271 static int
52272 ix86_simd_clone_usable (struct cgraph_node *node)
52274 switch (node->simdclone->vecsize_mangle)
52276 case 'b':
52277 if (!TARGET_SSE2)
52278 return -1;
52279 if (!TARGET_AVX)
52280 return 0;
52281 return TARGET_AVX2 ? 2 : 1;
52282 case 'c':
52283 if (!TARGET_AVX)
52284 return -1;
52285 return TARGET_AVX2 ? 1 : 0;
52286 case 'd':
52287 if (!TARGET_AVX2)
52288 return -1;
52289 return 0;
52290 case 'e':
52291 if (!TARGET_AVX512F)
52292 return -1;
52293 return 0;
52294 default:
52295 gcc_unreachable ();
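/* Example of the badness ranking (illustrative): when compiling the
   caller with -mavx2, an AVX2 ('d') clone gets badness 0, an AVX ('c')
   clone 1 and an SSE2 ('b') clone 2, so the vectorizer prefers the
   widest clone the current ISA can actually execute; clones requiring a
   higher ISA than is enabled return -1 and are skipped.  */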
52299 /* This function adjusts the unroll factor based on
52300 the hardware capabilities. For example, bdver3 has
52301 a loop buffer which makes unrolling of smaller
52302 loops less important. This function decides the
52303 unroll factor using number of memory references
52304 (value 32 is used) as a heuristic. */
52306 static unsigned
52307 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
52309 basic_block *bbs;
52310 rtx_insn *insn;
52311 unsigned i;
52312 unsigned mem_count = 0;
52314 if (!TARGET_ADJUST_UNROLL)
52315 return nunroll;
52317 /* Count the number of memory references within the loop body.
52318 This value determines the unrolling factor for bdver3 and bdver4
52319 architectures. */
52320 subrtx_iterator::array_type array;
52321 bbs = get_loop_body (loop);
52322 for (i = 0; i < loop->num_nodes; i++)
52323 FOR_BB_INSNS (bbs[i], insn)
52324 if (NONDEBUG_INSN_P (insn))
52325 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
52326 if (const_rtx x = *iter)
52327 if (MEM_P (x))
52329 machine_mode mode = GET_MODE (x);
52330 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
52331 if (n_words > 4)
52332 mem_count += 2;
52333 else
52334 mem_count += 1;
52336 free (bbs);
52338 if (mem_count && mem_count <= 32)
52339 return 32 / mem_count;
52341 return nunroll;
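/* Small worked example of the heuristic (illustrative): a loop body on
   bdver3/bdver4 containing 4 counted memory references gets an unroll
   factor of 32 / 4 = 8, whereas a body with more than 32 such references
   keeps the factor NUNROLL that the caller proposed.  */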
52345 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
52347 static bool
52348 ix86_float_exceptions_rounding_supported_p (void)
52350 /* For x87 floating point with standard excess precision handling,
52351 there is no adddf3 pattern (since x87 floating point only has
52352 XFmode operations) so the default hook implementation gets this
52353 wrong. */
52354 return TARGET_80387 || TARGET_SSE_MATH;
52357 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
52359 static void
52360 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
52362 if (!TARGET_80387 && !TARGET_SSE_MATH)
52363 return;
52364 tree exceptions_var = create_tmp_var_raw (integer_type_node);
52365 if (TARGET_80387)
52367 tree fenv_index_type = build_index_type (size_int (6));
52368 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
52369 tree fenv_var = create_tmp_var_raw (fenv_type);
52370 TREE_ADDRESSABLE (fenv_var) = 1;
52371 tree fenv_ptr = build_pointer_type (fenv_type);
52372 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
52373 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
52374 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
52375 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
52376 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
52377 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
52378 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
52379 tree hold_fnclex = build_call_expr (fnclex, 0);
52380 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
52381 NULL_TREE, NULL_TREE);
52382 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
52383 hold_fnclex);
52384 *clear = build_call_expr (fnclex, 0);
52385 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
52386 tree fnstsw_call = build_call_expr (fnstsw, 0);
52387 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
52388 sw_var, fnstsw_call);
52389 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
52390 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
52391 exceptions_var, exceptions_x87);
52392 *update = build2 (COMPOUND_EXPR, integer_type_node,
52393 sw_mod, update_mod);
52394 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
52395 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
52397 if (TARGET_SSE_MATH)
52399 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
52400 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
52401 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
52402 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
52403 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
52404 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
52405 mxcsr_orig_var, stmxcsr_hold_call);
52406 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
52407 mxcsr_orig_var,
52408 build_int_cst (unsigned_type_node, 0x1f80));
52409 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
52410 build_int_cst (unsigned_type_node, 0xffffffc0));
52411 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
52412 mxcsr_mod_var, hold_mod_val);
52413 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52414 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
52415 hold_assign_orig, hold_assign_mod);
52416 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
52417 ldmxcsr_hold_call);
52418 if (*hold)
52419 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
52420 else
52421 *hold = hold_all;
52422 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52423 if (*clear)
52424 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
52425 ldmxcsr_clear_call);
52426 else
52427 *clear = ldmxcsr_clear_call;
52428 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
52429 tree exceptions_sse = fold_convert (integer_type_node,
52430 stxmcsr_update_call);
52431 if (*update)
52433 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
52434 exceptions_var, exceptions_sse);
52435 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
52436 exceptions_var, exceptions_mod);
52437 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
52438 exceptions_assign);
52440 else
52441 *update = build2 (MODIFY_EXPR, integer_type_node,
52442 exceptions_var, exceptions_sse);
52443 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
52444 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52445 ldmxcsr_update_call);
52447 tree atomic_feraiseexcept
52448 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
52449 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
52450 1, exceptions_var);
52451 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52452 atomic_feraiseexcept_call);
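/* User-level construct this hook supports (a sketch, not from the
   original sources): a C11 atomic floating-point compound assignment
   such as

     _Atomic double d;
     ...
     d *= 2.0;

   is expanded as a compare-exchange loop; *HOLD saves and masks the FP
   environment before the loop, *CLEAR resets it on each retry, and
   *UPDATE merges the recorded exceptions back (via
   __atomic_feraiseexcept) once the exchange succeeds.  */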
52455 /* Return mode to be used for bounds or VOIDmode
52456 if bounds are not supported. */
52458 static machine_mode
52459 ix86_mpx_bound_mode ()
52461 /* Do not support pointer checker if MPX
52462 is not enabled. */
52463 if (!TARGET_MPX)
52465 if (flag_check_pointer_bounds)
52466 warning (0, "Pointer Checker requires MPX support on this target."
52467 " Use -mmpx options to enable MPX.");
52468 return VOIDmode;
52471 return BNDmode;
52474 /* Return constant used to statically initialize constant bounds.
52476 This function is used to create special bound values. For now
52477 only INIT bounds and NONE bounds are expected. More special
52478 values may be added later. */
52480 static tree
52481 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
52483 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
52484 : build_zero_cst (pointer_sized_int_node);
52485 tree high = ub ? build_zero_cst (pointer_sized_int_node)
52486 : build_minus_one_cst (pointer_sized_int_node);
52488 /* This function is supposed to be used to create INIT and
52489 NONE bounds only. */
52490 gcc_assert ((lb == 0 && ub == -1)
52491 || (lb == -1 && ub == 0));
52493 return build_complex (NULL, low, high);
52496 /* Generate a list of statements STMTS to initialize pointer bounds
52497 variable VAR with bounds LB and UB. Return the number of generated
52498 statements. */
52500 static int
52501 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
52503 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
52504 tree lhs, modify, var_p;
52506 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
52507 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
52509 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
52510 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
52511 append_to_statement_list (modify, stmts);
52513 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
52514 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
52515 TYPE_SIZE_UNIT (pointer_sized_int_node)));
52516 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
52517 append_to_statement_list (modify, stmts);
52519 return 2;
52522 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
52523 /* For i386, a common symbol is local only for non-PIE binaries. For
52524 x86-64, a common symbol is local only for non-PIE binaries or if the
52525 linker supports copy relocations in PIE binaries. */
52527 static bool
52528 ix86_binds_local_p (const_tree exp)
52530 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
52531 (!flag_pic
52532 || (TARGET_64BIT
52533 && HAVE_LD_PIE_COPYRELOC != 0)));
52535 #endif
52537 /* If MEM is in the form of [base+offset], extract the two parts
52538 of address and set to BASE and OFFSET, otherwise return false. */
52540 static bool
52541 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
52543 rtx addr;
52545 gcc_assert (MEM_P (mem));
52547 addr = XEXP (mem, 0);
52549 if (GET_CODE (addr) == CONST)
52550 addr = XEXP (addr, 0);
52552 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
52554 *base = addr;
52555 *offset = const0_rtx;
52556 return true;
52559 if (GET_CODE (addr) == PLUS
52560 && (REG_P (XEXP (addr, 0))
52561 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
52562 && CONST_INT_P (XEXP (addr, 1)))
52564 *base = XEXP (addr, 0);
52565 *offset = XEXP (addr, 1);
52566 return true;
52569 return false;
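/* For illustration (assumed RTL, not taken from a dump): an address of
   the form (plus (reg:DI bx) (const_int 16)) yields *BASE = (reg:DI bx)
   and *OFFSET = (const_int 16), while a bare (reg:DI bx) or
   (symbol_ref "x") is returned as *BASE with *OFFSET = const0_rtx.  */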
52572 /* Given OPERANDS of consecutive load/store, check if we can merge
52573 them into move multiple. LOAD is true if they are load instructions.
52574 MODE is the mode of memory operands. */
52576 bool
52577 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
52578 machine_mode mode)
52580 HOST_WIDE_INT offval_1, offval_2, msize;
52581 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
52583 if (load)
52585 mem_1 = operands[1];
52586 mem_2 = operands[3];
52587 reg_1 = operands[0];
52588 reg_2 = operands[2];
52590 else
52592 mem_1 = operands[0];
52593 mem_2 = operands[2];
52594 reg_1 = operands[1];
52595 reg_2 = operands[3];
52598 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52600 if (REGNO (reg_1) != REGNO (reg_2))
52601 return false;
52603 /* Check if the addresses are in the form of [base+offset]. */
52604 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52605 return false;
52606 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52607 return false;
52609 /* Check if the bases are the same. */
52610 if (!rtx_equal_p (base_1, base_2))
52611 return false;
52613 offval_1 = INTVAL (offset_1);
52614 offval_2 = INTVAL (offset_2);
52615 msize = GET_MODE_SIZE (mode);
52616 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
52617 if (offval_1 + msize != offval_2)
52618 return false;
52620 return true;
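/* Example of the adjacency test (illustrative): with MODE == DImode the
   memory operands 8(%rsp) and 16(%rsp) pass (same base, and
   OFFVAL_1 + 8 == OFFVAL_2), whereas 8(%rsp) and 24(%rsp) do not.  */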
52623 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52625 static bool
52626 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52627 optimization_type opt_type)
52629 switch (op)
52631 case asin_optab:
52632 case acos_optab:
52633 case log1p_optab:
52634 case exp_optab:
52635 case exp10_optab:
52636 case exp2_optab:
52637 case expm1_optab:
52638 case ldexp_optab:
52639 case scalb_optab:
52640 case round_optab:
52641 return opt_type == OPTIMIZE_FOR_SPEED;
52643 case rint_optab:
52644 if (SSE_FLOAT_MODE_P (mode1)
52645 && TARGET_SSE_MATH
52646 && !flag_trapping_math
52647 && !TARGET_SSE4_1)
52648 return opt_type == OPTIMIZE_FOR_SPEED;
52649 return true;
52651 case floor_optab:
52652 case ceil_optab:
52653 case btrunc_optab:
52654 if (SSE_FLOAT_MODE_P (mode1)
52655 && TARGET_SSE_MATH
52656 && !flag_trapping_math
52657 && TARGET_SSE4_1)
52658 return true;
52659 return opt_type == OPTIMIZE_FOR_SPEED;
52661 case rsqrt_optab:
52662 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52664 default:
52665 return true;
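/* For example (illustrative): with -msse4.1 -fno-trapping-math a
   floor () of an SSE double is expanded inline (via the SSE4.1 rounding
   insns) even when optimizing for size, whereas without SSE4.1 the
   inline sequence is only considered when optimizing for speed.  */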
52669 /* Address space support.
52671 This is not "far pointers" in the 16-bit sense, but an easy way
52672 to use %fs and %gs segment prefixes. Therefore:
52674 (a) All address spaces have the same modes,
52675 (b) All address spaces have the same address forms,
52676 (c) While %fs and %gs are technically subsets of the generic
52677 address space, they are probably not subsets of each other.
52678 (d) Since we have no access to the segment base register values
52679 without resorting to a system call, we cannot convert a
52680 non-default address space to a default address space.
52681 Therefore we do not claim %fs or %gs are subsets of generic.
52683 Therefore we can (mostly) use the default hooks. */
52685 /* All use of segmentation is assumed to make address 0 valid. */
52687 static bool
52688 ix86_addr_space_zero_address_valid (addr_space_t as)
52690 return as != ADDR_SPACE_GENERIC;
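/* User-level sketch of these address spaces (the __seg_gs keyword is the
   documented x86 named address space; the struct and names are made up
   for illustration):

     typedef struct { int counter; } percpu_t;

     int
     read_counter (const percpu_t __seg_gs *p)
     {
       return p->counter;   (emitted as a %gs-prefixed load)
     }

   Address 0 is valid for such accesses because the segment base is added
   to every address, which is why the hook above returns true for the
   non-generic spaces.  */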
52693 static void
52694 ix86_init_libfuncs (void)
52696 if (TARGET_64BIT)
52698 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52699 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52701 else
52703 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52704 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52707 #if TARGET_MACHO
52708 darwin_rename_builtins ();
52709 #endif
52712 /* Generate call to __divmoddi4. */
52714 static void
52715 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52716 rtx op0, rtx op1,
52717 rtx *quot_p, rtx *rem_p)
52719 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52721 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52722 mode,
52723 op0, GET_MODE (op0),
52724 op1, GET_MODE (op1),
52725 XEXP (rem, 0), Pmode);
52726 *quot_p = quot;
52727 *rem_p = rem;
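/* Illustrative effect at the source level (an assumption for exposition):
   on 32-bit x86 a function that computes both a / b and a % b for
   64-bit operands can be expanded into a single __divmoddi4 call, with
   the quotient returned in registers and the remainder written through
   the stack slot allocated above.  */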
52730 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52731 FPU, assume that the fpcw is set to extended precision; when using
52732 only SSE, rounding is correct; when using both SSE and the FPU,
52733 the rounding precision is indeterminate, since either may be chosen
52734 apparently at random. */
52736 static enum flt_eval_method
52737 ix86_excess_precision (enum excess_precision_type type)
52739 switch (type)
52741 case EXCESS_PRECISION_TYPE_FAST:
52742 /* The fastest type to promote to will always be the native type,
52743 whether that occurs with implicit excess precision or
52744 otherwise. */
52745 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52746 case EXCESS_PRECISION_TYPE_STANDARD:
52747 case EXCESS_PRECISION_TYPE_IMPLICIT:
52748 /* Otherwise, the excess precision we want when we are
52749 in a standards compliant mode, and the implicit precision we
52750 provide would be identical were it not for the unpredictable
52751 cases. */
52752 if (!TARGET_80387)
52753 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52754 else if (!TARGET_MIX_SSE_I387)
52756 if (!TARGET_SSE_MATH)
52757 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52758 else if (TARGET_SSE2)
52759 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52762 /* If we are in standards compliant mode, but we know we will
52763 calculate in unpredictable precision, return
52764 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
52765 excess precision if the target can't guarantee it will honor
52766 it. */
52767 return (type == EXCESS_PRECISION_TYPE_STANDARD
52768 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52769 : FLT_EVAL_METHOD_UNPREDICTABLE);
52770 default:
52771 gcc_unreachable ();
52774 return FLT_EVAL_METHOD_UNPREDICTABLE;
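/* Concrete examples of the mapping above (illustrative): with
   -mfpmath=387 and no SSE math, both the standard and implicit cases
   yield FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE, so float.h advertises
   FLT_EVAL_METHOD == 2; with -mfpmath=sse -msse2 arithmetic stays in the
   nominal type and FLT_EVAL_METHOD is 0; mixing both units makes the
   standard case fall back to FLT_EVAL_METHOD 0 while the implicit case
   reports unpredictable precision.  */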
52777 /* Target-specific selftests. */
52779 #if CHECKING_P
52781 namespace selftest {
52783 /* Verify that hard regs are dumped as expected (in compact mode). */
52785 static void
52786 ix86_test_dumping_hard_regs ()
52788 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52789 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52792 /* Test dumping an insn with repeated references to the same SCRATCH,
52793 to verify the rtx_reuse code. */
52795 static void
52796 ix86_test_dumping_memory_blockage ()
52798 set_new_first_and_last_insn (NULL, NULL);
52800 rtx pat = gen_memory_blockage ();
52801 rtx_reuse_manager r;
52802 r.preprocess (pat);
52804 /* Verify that the repeated references to the SCRATCH show use of
52805 reuse IDs. The first should be prefixed with a reuse ID,
52806 and the second should be dumped as a "reuse_rtx" of that ID.
52807 The expected string assumes Pmode == DImode. */
52808 if (Pmode == DImode)
52809 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52810 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52811 " (unspec:BLK [\n"
52812 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52813 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52816 /* Verify loading an RTL dump; specifically a dump of copying
52817 a param on x86_64 from a hard reg into the frame.
52818 This test is target-specific since the dump contains target-specific
52819 hard reg names. */
52821 static void
52822 ix86_test_loading_dump_fragment_1 ()
52824 rtl_dump_test t (SELFTEST_LOCATION,
52825 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52827 rtx_insn *insn = get_insn_by_uid (1);
52829 /* The block structure and indentation here is purely for
52830 readability; it mirrors the structure of the rtx. */
52831 tree mem_expr;
52833 rtx pat = PATTERN (insn);
52834 ASSERT_EQ (SET, GET_CODE (pat));
52836 rtx dest = SET_DEST (pat);
52837 ASSERT_EQ (MEM, GET_CODE (dest));
52838 /* Verify the "/c" was parsed. */
52839 ASSERT_TRUE (RTX_FLAG (dest, call));
52840 ASSERT_EQ (SImode, GET_MODE (dest));
52842 rtx addr = XEXP (dest, 0);
52843 ASSERT_EQ (PLUS, GET_CODE (addr));
52844 ASSERT_EQ (DImode, GET_MODE (addr));
52846 rtx lhs = XEXP (addr, 0);
52847 /* Verify that the "frame" REG was consolidated. */
52848 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52851 rtx rhs = XEXP (addr, 1);
52852 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52853 ASSERT_EQ (-4, INTVAL (rhs));
52856 /* Verify the "[1 i+0 S4 A32]" was parsed. */
52857 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52858 /* "i" should have been handled by synthesizing a global int
52859 variable named "i". */
52860 mem_expr = MEM_EXPR (dest);
52861 ASSERT_NE (mem_expr, NULL);
52862 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52863 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52864 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52865 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52866 /* "+0". */
52867 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52868 ASSERT_EQ (0, MEM_OFFSET (dest));
52869 /* "S4". */
52870 ASSERT_EQ (4, MEM_SIZE (dest));
52871 /* "A32. */
52872 ASSERT_EQ (32, MEM_ALIGN (dest));
52875 rtx src = SET_SRC (pat);
52876 ASSERT_EQ (REG, GET_CODE (src));
52877 ASSERT_EQ (SImode, GET_MODE (src));
52878 ASSERT_EQ (5, REGNO (src));
52879 tree reg_expr = REG_EXPR (src);
52880 /* "i" here should point to the same var as for the MEM_EXPR. */
52881 ASSERT_EQ (reg_expr, mem_expr);
52886 /* Verify that the RTL loader copes with a call_insn dump.
52887 This test is target-specific since the dump contains a target-specific
52888 hard reg name. */
52890 static void
52891 ix86_test_loading_call_insn ()
52893 /* The test dump includes register "xmm0", which requires TARGET_SSE
52894 to exist. */
52895 if (!TARGET_SSE)
52896 return;
52898 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52900 rtx_insn *insn = get_insns ();
52901 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52903 /* "/j". */
52904 ASSERT_TRUE (RTX_FLAG (insn, jump));
52906 rtx pat = PATTERN (insn);
52907 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52909 /* Verify REG_NOTES. */
52911 /* "(expr_list:REG_CALL_DECL". */
52912 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52913 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52914 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52916 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52917 rtx_expr_list *note1 = note0->next ();
52918 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52920 ASSERT_EQ (NULL, note1->next ());
52923 /* Verify CALL_INSN_FUNCTION_USAGE. */
52925 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52926 rtx_expr_list *usage
52927 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52928 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52929 ASSERT_EQ (DFmode, GET_MODE (usage));
52930 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52931 ASSERT_EQ (NULL, usage->next ());
52935 /* Verify that the RTL loader copes with a dump from print_rtx_function.
52936 This test is target-specific since the dump contains target-specific
52937 hard reg names. */
52939 static void
52940 ix86_test_loading_full_dump ()
52942 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52944 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52946 rtx_insn *insn_1 = get_insn_by_uid (1);
52947 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52949 rtx_insn *insn_7 = get_insn_by_uid (7);
52950 ASSERT_EQ (INSN, GET_CODE (insn_7));
52951 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52953 rtx_insn *insn_15 = get_insn_by_uid (15);
52954 ASSERT_EQ (INSN, GET_CODE (insn_15));
52955 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52957 /* Verify crtl->return_rtx. */
52958 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52959 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52960 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52963 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52964 In particular, verify that it correctly loads the 2nd operand.
52965 This test is target-specific since these are machine-specific
52966 operands (and enums). */
52968 static void
52969 ix86_test_loading_unspec ()
52971 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52973 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52975 ASSERT_TRUE (cfun);
52977 /* Test of an UNSPEC. */
52978 rtx_insn *insn = get_insns ();
52979 ASSERT_EQ (INSN, GET_CODE (insn));
52980 rtx set = single_set (insn);
52981 ASSERT_NE (NULL, set);
52982 rtx dst = SET_DEST (set);
52983 ASSERT_EQ (MEM, GET_CODE (dst));
52984 rtx src = SET_SRC (set);
52985 ASSERT_EQ (UNSPEC, GET_CODE (src));
52986 ASSERT_EQ (BLKmode, GET_MODE (src));
52987 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52989 rtx v0 = XVECEXP (src, 0, 0);
52991 /* Verify that the two uses of the first SCRATCH have pointer
52992 equality. */
52993 rtx scratch_a = XEXP (dst, 0);
52994 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52996 rtx scratch_b = XEXP (v0, 0);
52997 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52999 ASSERT_EQ (scratch_a, scratch_b);
53001 /* Verify that the two mems are thus treated as equal. */
53002 ASSERT_TRUE (rtx_equal_p (dst, v0));
53004 /* Verify that the insn is recognized. */
53005 ASSERT_NE(-1, recog_memoized (insn));
53007 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
53008 insn = NEXT_INSN (insn);
53009 ASSERT_EQ (INSN, GET_CODE (insn));
53011 set = single_set (insn);
53012 ASSERT_NE (NULL, set);
53014 src = SET_SRC (set);
53015 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
53016 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
53019 /* Run all target-specific selftests. */
53021 static void
53022 ix86_run_selftests (void)
53024 ix86_test_dumping_hard_regs ();
53025 ix86_test_dumping_memory_blockage ();
53027 /* Various tests of loading RTL dumps, here because they contain
53028 ix86-isms (e.g. names of hard regs). */
53029 ix86_test_loading_dump_fragment_1 ();
53030 ix86_test_loading_call_insn ();
53031 ix86_test_loading_full_dump ();
53032 ix86_test_loading_unspec ();
53035 } // namespace selftest
53037 #endif /* CHECKING_P */
53039 /* Initialize the GCC target structure. */
53040 #undef TARGET_RETURN_IN_MEMORY
53041 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
53043 #undef TARGET_LEGITIMIZE_ADDRESS
53044 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
53046 #undef TARGET_ATTRIBUTE_TABLE
53047 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
53048 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
53049 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
53050 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
53051 # undef TARGET_MERGE_DECL_ATTRIBUTES
53052 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
53053 #endif
53055 #undef TARGET_COMP_TYPE_ATTRIBUTES
53056 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
53058 #undef TARGET_INIT_BUILTINS
53059 #define TARGET_INIT_BUILTINS ix86_init_builtins
53060 #undef TARGET_BUILTIN_DECL
53061 #define TARGET_BUILTIN_DECL ix86_builtin_decl
53062 #undef TARGET_EXPAND_BUILTIN
53063 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
53065 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
53066 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
53067 ix86_builtin_vectorized_function
53069 #undef TARGET_VECTORIZE_BUILTIN_GATHER
53070 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
53072 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
53073 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
53075 #undef TARGET_BUILTIN_RECIPROCAL
53076 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
53078 #undef TARGET_ASM_FUNCTION_EPILOGUE
53079 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
53081 #undef TARGET_ENCODE_SECTION_INFO
53082 #ifndef SUBTARGET_ENCODE_SECTION_INFO
53083 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
53084 #else
53085 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
53086 #endif
53088 #undef TARGET_ASM_OPEN_PAREN
53089 #define TARGET_ASM_OPEN_PAREN ""
53090 #undef TARGET_ASM_CLOSE_PAREN
53091 #define TARGET_ASM_CLOSE_PAREN ""
53093 #undef TARGET_ASM_BYTE_OP
53094 #define TARGET_ASM_BYTE_OP ASM_BYTE
53096 #undef TARGET_ASM_ALIGNED_HI_OP
53097 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
53098 #undef TARGET_ASM_ALIGNED_SI_OP
53099 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
53100 #ifdef ASM_QUAD
53101 #undef TARGET_ASM_ALIGNED_DI_OP
53102 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
53103 #endif
53105 #undef TARGET_PROFILE_BEFORE_PROLOGUE
53106 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
53108 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
53109 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
53111 #undef TARGET_ASM_UNALIGNED_HI_OP
53112 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
53113 #undef TARGET_ASM_UNALIGNED_SI_OP
53114 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
53115 #undef TARGET_ASM_UNALIGNED_DI_OP
53116 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
53118 #undef TARGET_PRINT_OPERAND
53119 #define TARGET_PRINT_OPERAND ix86_print_operand
53120 #undef TARGET_PRINT_OPERAND_ADDRESS
53121 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
53122 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
53123 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
53124 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
53125 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
53127 #undef TARGET_SCHED_INIT_GLOBAL
53128 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
53129 #undef TARGET_SCHED_ADJUST_COST
53130 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
53131 #undef TARGET_SCHED_ISSUE_RATE
53132 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
53133 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
53134 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
53135 ia32_multipass_dfa_lookahead
53136 #undef TARGET_SCHED_MACRO_FUSION_P
53137 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
53138 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
53139 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
53141 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
53142 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
53144 #undef TARGET_MEMMODEL_CHECK
53145 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
53147 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
53148 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
53150 #ifdef HAVE_AS_TLS
53151 #undef TARGET_HAVE_TLS
53152 #define TARGET_HAVE_TLS true
53153 #endif
53154 #undef TARGET_CANNOT_FORCE_CONST_MEM
53155 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
53156 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
53157 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
53159 #undef TARGET_DELEGITIMIZE_ADDRESS
53160 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
53162 #undef TARGET_MS_BITFIELD_LAYOUT_P
53163 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
53165 #if TARGET_MACHO
53166 #undef TARGET_BINDS_LOCAL_P
53167 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
53168 #else
53169 #undef TARGET_BINDS_LOCAL_P
53170 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
53171 #endif
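/* On targets with dllimport semantics, the definition below deliberately
   overrides the TARGET_BINDS_LOCAL_P choice made just above. */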
53172 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
53173 #undef TARGET_BINDS_LOCAL_P
53174 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
53175 #endif
53177 #undef TARGET_ASM_OUTPUT_MI_THUNK
53178 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
53179 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
53180 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
53182 #undef TARGET_ASM_FILE_START
53183 #define TARGET_ASM_FILE_START x86_file_start
53185 #undef TARGET_OPTION_OVERRIDE
53186 #define TARGET_OPTION_OVERRIDE ix86_option_override
53188 #undef TARGET_REGISTER_MOVE_COST
53189 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
53190 #undef TARGET_MEMORY_MOVE_COST
53191 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
53192 #undef TARGET_RTX_COSTS
53193 #define TARGET_RTX_COSTS ix86_rtx_costs
53194 #undef TARGET_ADDRESS_COST
53195 #define TARGET_ADDRESS_COST ix86_address_cost
53197 #undef TARGET_FLAGS_REGNUM
53198 #define TARGET_FLAGS_REGNUM FLAGS_REG
53199 #undef TARGET_FIXED_CONDITION_CODE_REGS
53200 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
53201 #undef TARGET_CC_MODES_COMPATIBLE
53202 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
53204 #undef TARGET_MACHINE_DEPENDENT_REORG
53205 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
53207 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
53208 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
53210 #undef TARGET_BUILD_BUILTIN_VA_LIST
53211 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
53213 #undef TARGET_FOLD_BUILTIN
53214 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
53216 #undef TARGET_GIMPLE_FOLD_BUILTIN
53217 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
53219 #undef TARGET_COMPARE_VERSION_PRIORITY
53220 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
53222 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
53223 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
53224 ix86_generate_version_dispatcher_body
53226 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
53227 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
53228 ix86_get_function_versions_dispatcher
53230 #undef TARGET_ENUM_VA_LIST_P
53231 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
53233 #undef TARGET_FN_ABI_VA_LIST
53234 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
53236 #undef TARGET_CANONICAL_VA_LIST_TYPE
53237 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
53239 #undef TARGET_EXPAND_BUILTIN_VA_START
53240 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
53242 #undef TARGET_MD_ASM_ADJUST
53243 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
53245 #undef TARGET_C_EXCESS_PRECISION
53246 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
53247 #undef TARGET_PROMOTE_PROTOTYPES
53248 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
53249 #undef TARGET_SETUP_INCOMING_VARARGS
53250 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
53251 #undef TARGET_MUST_PASS_IN_STACK
53252 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
53253 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
53254 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
53255 #undef TARGET_FUNCTION_ARG_ADVANCE
53256 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
53257 #undef TARGET_FUNCTION_ARG
53258 #define TARGET_FUNCTION_ARG ix86_function_arg
53259 #undef TARGET_INIT_PIC_REG
53260 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
53261 #undef TARGET_USE_PSEUDO_PIC_REG
53262 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
53263 #undef TARGET_FUNCTION_ARG_BOUNDARY
53264 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
53265 #undef TARGET_PASS_BY_REFERENCE
53266 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
53267 #undef TARGET_INTERNAL_ARG_POINTER
53268 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
53269 #undef TARGET_UPDATE_STACK_BOUNDARY
53270 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
53271 #undef TARGET_GET_DRAP_RTX
53272 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
53273 #undef TARGET_STRICT_ARGUMENT_NAMING
53274 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
53275 #undef TARGET_STATIC_CHAIN
53276 #define TARGET_STATIC_CHAIN ix86_static_chain
53277 #undef TARGET_TRAMPOLINE_INIT
53278 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
53279 #undef TARGET_RETURN_POPS_ARGS
53280 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
53282 #undef TARGET_WARN_FUNC_RETURN
53283 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
53285 #undef TARGET_LEGITIMATE_COMBINED_INSN
53286 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
53288 #undef TARGET_ASAN_SHADOW_OFFSET
53289 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
53291 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
53292 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
53294 #undef TARGET_SCALAR_MODE_SUPPORTED_P
53295 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
53297 #undef TARGET_VECTOR_MODE_SUPPORTED_P
53298 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
53300 #undef TARGET_C_MODE_FOR_SUFFIX
53301 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
53303 #ifdef HAVE_AS_TLS
53304 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
53305 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
53306 #endif
53308 #ifdef SUBTARGET_INSERT_ATTRIBUTES
53309 #undef TARGET_INSERT_ATTRIBUTES
53310 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
53311 #endif
53313 #undef TARGET_MANGLE_TYPE
53314 #define TARGET_MANGLE_TYPE ix86_mangle_type
53316 #undef TARGET_STACK_PROTECT_GUARD
53317 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
53319 #if !TARGET_MACHO
53320 #undef TARGET_STACK_PROTECT_FAIL
53321 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
53322 #endif
53324 #undef TARGET_FUNCTION_VALUE
53325 #define TARGET_FUNCTION_VALUE ix86_function_value
53327 #undef TARGET_FUNCTION_VALUE_REGNO_P
53328 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
53330 #undef TARGET_PROMOTE_FUNCTION_MODE
53331 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
53333 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
53334 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
53336 #undef TARGET_MEMBER_TYPE_FORCES_BLK
53337 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
53339 #undef TARGET_INSTANTIATE_DECLS
53340 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
53342 #undef TARGET_SECONDARY_RELOAD
53343 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
53344 #undef TARGET_SECONDARY_MEMORY_NEEDED
53345 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
53346 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
53347 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
53349 #undef TARGET_CLASS_MAX_NREGS
53350 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
53352 #undef TARGET_PREFERRED_RELOAD_CLASS
53353 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
53354 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
53355 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
53356 #undef TARGET_CLASS_LIKELY_SPILLED_P
53357 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
53359 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
53360 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
53361 ix86_builtin_vectorization_cost
53362 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
53363 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
53364 ix86_vectorize_vec_perm_const_ok
53365 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
53366 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
53367 ix86_preferred_simd_mode
53368 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
53369 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
53370 ix86_autovectorize_vector_sizes
53371 #undef TARGET_VECTORIZE_GET_MASK_MODE
53372 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
53373 #undef TARGET_VECTORIZE_INIT_COST
53374 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
53375 #undef TARGET_VECTORIZE_ADD_STMT_COST
53376 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
53377 #undef TARGET_VECTORIZE_FINISH_COST
53378 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
53379 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
53380 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
53382 #undef TARGET_SET_CURRENT_FUNCTION
53383 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
53385 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
53386 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
53388 #undef TARGET_OPTION_SAVE
53389 #define TARGET_OPTION_SAVE ix86_function_specific_save
53391 #undef TARGET_OPTION_RESTORE
53392 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
53394 #undef TARGET_OPTION_POST_STREAM_IN
53395 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
53397 #undef TARGET_OPTION_PRINT
53398 #define TARGET_OPTION_PRINT ix86_function_specific_print
53400 #undef TARGET_OPTION_FUNCTION_VERSIONS
53401 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
53403 #undef TARGET_CAN_INLINE_P
53404 #define TARGET_CAN_INLINE_P ix86_can_inline_p
53406 #undef TARGET_LEGITIMATE_ADDRESS_P
53407 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
53409 #undef TARGET_REGISTER_PRIORITY
53410 #define TARGET_REGISTER_PRIORITY ix86_register_priority
53412 #undef TARGET_REGISTER_USAGE_LEVELING_P
53413 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
53415 #undef TARGET_LEGITIMATE_CONSTANT_P
53416 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
53418 #undef TARGET_COMPUTE_FRAME_LAYOUT
53419 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
53421 #undef TARGET_FRAME_POINTER_REQUIRED
53422 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
53424 #undef TARGET_CAN_ELIMINATE
53425 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
53427 #undef TARGET_EXTRA_LIVE_ON_ENTRY
53428 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
53430 #undef TARGET_ASM_CODE_END
53431 #define TARGET_ASM_CODE_END ix86_code_end
53433 #undef TARGET_CONDITIONAL_REGISTER_USAGE
53434 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
53436 #undef TARGET_LOOP_UNROLL_ADJUST
53437 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
53439 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
53440 #undef TARGET_SPILL_CLASS
53441 #define TARGET_SPILL_CLASS ix86_spill_class
53443 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
53444 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
53445 ix86_simd_clone_compute_vecsize_and_simdlen
53447 #undef TARGET_SIMD_CLONE_ADJUST
53448 #define TARGET_SIMD_CLONE_ADJUST \
53449 ix86_simd_clone_adjust
53451 #undef TARGET_SIMD_CLONE_USABLE
53452 #define TARGET_SIMD_CLONE_USABLE \
53453 ix86_simd_clone_usable
53455 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
53456 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
53457 ix86_float_exceptions_rounding_supported_p
53459 #undef TARGET_MODE_EMIT
53460 #define TARGET_MODE_EMIT ix86_emit_mode_set
53462 #undef TARGET_MODE_NEEDED
53463 #define TARGET_MODE_NEEDED ix86_mode_needed
53465 #undef TARGET_MODE_AFTER
53466 #define TARGET_MODE_AFTER ix86_mode_after
53468 #undef TARGET_MODE_ENTRY
53469 #define TARGET_MODE_ENTRY ix86_mode_entry
53471 #undef TARGET_MODE_EXIT
53472 #define TARGET_MODE_EXIT ix86_mode_exit
53474 #undef TARGET_MODE_PRIORITY
53475 #define TARGET_MODE_PRIORITY ix86_mode_priority
53477 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
53478 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
53480 #undef TARGET_LOAD_BOUNDS_FOR_ARG
53481 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
53483 #undef TARGET_STORE_BOUNDS_FOR_ARG
53484 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
53486 #undef TARGET_LOAD_RETURNED_BOUNDS
53487 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
53489 #undef TARGET_STORE_RETURNED_BOUNDS
53490 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
53492 #undef TARGET_CHKP_BOUND_MODE
53493 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
53495 #undef TARGET_BUILTIN_CHKP_FUNCTION
53496 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
53498 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
53499 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
53501 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
53502 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
53504 #undef TARGET_CHKP_INITIALIZE_BOUNDS
53505 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
53507 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
53508 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
53510 #undef TARGET_OFFLOAD_OPTIONS
53511 #define TARGET_OFFLOAD_OPTIONS \
53512 ix86_offload_options
53514 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
53515 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
53517 #undef TARGET_OPTAB_SUPPORTED_P
53518 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
53520 #undef TARGET_HARD_REGNO_SCRATCH_OK
53521 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
53523 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
53524 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
53526 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
53527 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
53529 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
53530 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
53532 #undef TARGET_INIT_LIBFUNCS
53533 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
53535 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
53536 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
53538 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
53539 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
53541 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
53542 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
53544 #undef TARGET_HARD_REGNO_NREGS
53545 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
53546 #undef TARGET_HARD_REGNO_MODE_OK
53547 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
53549 #undef TARGET_MODES_TIEABLE_P
53550 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
53552 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
53553 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
53554 ix86_hard_regno_call_part_clobbered
53556 #undef TARGET_CAN_CHANGE_MODE_CLASS
53557 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
53559 #if CHECKING_P
53560 #undef TARGET_RUN_TARGET_SELFTESTS
53561 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
53562 #endif /* #if CHECKING_P */
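/* TARGET_INITIALIZER (provided by target-def.h) expands to an aggregate
   initializer whose fields are filled from the macros defined above,
   falling back to the generic defaults for any hook not overridden here. */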
53564 struct gcc_target targetm = TARGET_INITIALIZER;
53566 #include "gt-i386.h"