gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2016 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
81 /* This file should be included last. */
82 #include "target-def.h"
84 static rtx legitimize_dllimport_symbol (rtx, bool);
85 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
86 static rtx legitimize_pe_coff_symbol (rtx, bool);
87 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
89 #ifndef CHECK_STACK_LIMIT
90 #define CHECK_STACK_LIMIT (-1)
91 #endif
 93 /* Return the index of the given mode in the mult and division cost tables.  */
94 #define MODE_INDEX(mode) \
95 ((mode) == QImode ? 0 \
96 : (mode) == HImode ? 1 \
97 : (mode) == SImode ? 2 \
98 : (mode) == DImode ? 3 \
99 : 4)
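/* Editor's illustration, not part of the original source: elsewhere in this
   file the per-mode cost arrays are indexed with this macro, roughly as in

       cost = ix86_cost->mult_init[MODE_INDEX (mode)]
              + nbits * ix86_cost->mult_bit;

   so a DImode multiply reads entry 3 and any wider mode falls into the
   "other" slot 4.  The field names mult_init and mult_bit follow the
   processor_costs definition in i386.h; treat this snippet as a sketch of
   the rtx-cost code, not a verbatim quote of it.  */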
101 /* Processor costs (relative to an add) */
102 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
103 #define COSTS_N_BYTES(N) ((N) * 2)
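/* Editor's worked example, not part of the original source: given the
   assumption above that COSTS_N_INSNS (N) is (N) * 4, the 2-byte add costs
   COSTS_N_INSNS (1) == 4 on the speed scale and COSTS_N_BYTES (2) == 4 on
   the size scale, so the two scales agree on the baseline instruction; a
   3-byte lea then scores COSTS_N_BYTES (3) == 6 in ix86_size_cost below.  */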
105 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
107 static stringop_algs ix86_size_memcpy[2] = {
108 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
109 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
110 static stringop_algs ix86_size_memset[2] = {
111 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
112 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
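/* Editor's note on reading these tables, not part of the original source:
   each stringop_algs pair appears to hold the 32-bit tuning in element 0 and
   the 64-bit tuning in element 1; the algorithm named before the inner braces
   is used when the block size is unknown, and each {max, alg, noalign} triple
   selects ALG for blocks of at most MAX bytes, with max == -1 as the
   catch-all.  So ix86_size_memcpy above simply says "always use rep movsb",
   the smallest expansion.  The authoritative layout is the stringop_algs
   definition in i386.h.  */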
114 const
115 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
116 COSTS_N_BYTES (2), /* cost of an add instruction */
117 COSTS_N_BYTES (3), /* cost of a lea instruction */
118 COSTS_N_BYTES (2), /* variable shift costs */
119 COSTS_N_BYTES (3), /* constant shift costs */
120 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
121 COSTS_N_BYTES (3), /* HI */
122 COSTS_N_BYTES (3), /* SI */
123 COSTS_N_BYTES (3), /* DI */
124 COSTS_N_BYTES (5)}, /* other */
125 0, /* cost of multiply per each bit set */
126 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
127 COSTS_N_BYTES (3), /* HI */
128 COSTS_N_BYTES (3), /* SI */
129 COSTS_N_BYTES (3), /* DI */
130 COSTS_N_BYTES (5)}, /* other */
131 COSTS_N_BYTES (3), /* cost of movsx */
132 COSTS_N_BYTES (3), /* cost of movzx */
133 0, /* "large" insn */
134 2, /* MOVE_RATIO */
135 2, /* cost for loading QImode using movzbl */
136 {2, 2, 2}, /* cost of loading integer registers
137 in QImode, HImode and SImode.
138 Relative to reg-reg move (2). */
139 {2, 2, 2}, /* cost of storing integer registers */
140 2, /* cost of reg,reg fld/fst */
141 {2, 2, 2}, /* cost of loading fp registers
142 in SFmode, DFmode and XFmode */
143 {2, 2, 2}, /* cost of storing fp registers
144 in SFmode, DFmode and XFmode */
145 3, /* cost of moving MMX register */
146 {3, 3}, /* cost of loading MMX registers
147 in SImode and DImode */
148 {3, 3}, /* cost of storing MMX registers
149 in SImode and DImode */
150 3, /* cost of moving SSE register */
151 {3, 3, 3}, /* cost of loading SSE registers
152 in SImode, DImode and TImode */
153 {3, 3, 3}, /* cost of storing SSE registers
154 in SImode, DImode and TImode */
155 3, /* MMX or SSE register to integer */
156 0, /* size of l1 cache */
157 0, /* size of l2 cache */
158 0, /* size of prefetch block */
159 0, /* number of parallel prefetches */
160 2, /* Branch cost */
161 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
162 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
163 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
164 COSTS_N_BYTES (2), /* cost of FABS instruction. */
165 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
166 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
167 ix86_size_memcpy,
168 ix86_size_memset,
169 1, /* scalar_stmt_cost. */
170 1, /* scalar load_cost. */
171 1, /* scalar_store_cost. */
172 1, /* vec_stmt_cost. */
173 1, /* vec_to_scalar_cost. */
174 1, /* scalar_to_vec_cost. */
175 1, /* vec_align_load_cost. */
176 1, /* vec_unalign_load_cost. */
177 1, /* vec_store_cost. */
178 1, /* cond_taken_branch_cost. */
 179   1,                                      /* cond_not_taken_branch_cost.  */
 180 };
182 /* Processor costs (relative to an add) */
183 static stringop_algs i386_memcpy[2] = {
184 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
185 DUMMY_STRINGOP_ALGS};
186 static stringop_algs i386_memset[2] = {
187 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
188 DUMMY_STRINGOP_ALGS};
190 static const
191 struct processor_costs i386_cost = { /* 386 specific costs */
192 COSTS_N_INSNS (1), /* cost of an add instruction */
193 COSTS_N_INSNS (1), /* cost of a lea instruction */
194 COSTS_N_INSNS (3), /* variable shift costs */
195 COSTS_N_INSNS (2), /* constant shift costs */
196 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
197 COSTS_N_INSNS (6), /* HI */
198 COSTS_N_INSNS (6), /* SI */
199 COSTS_N_INSNS (6), /* DI */
200 COSTS_N_INSNS (6)}, /* other */
201 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
202 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
203 COSTS_N_INSNS (23), /* HI */
204 COSTS_N_INSNS (23), /* SI */
205 COSTS_N_INSNS (23), /* DI */
206 COSTS_N_INSNS (23)}, /* other */
207 COSTS_N_INSNS (3), /* cost of movsx */
208 COSTS_N_INSNS (2), /* cost of movzx */
209 15, /* "large" insn */
210 3, /* MOVE_RATIO */
211 4, /* cost for loading QImode using movzbl */
212 {2, 4, 2}, /* cost of loading integer registers
213 in QImode, HImode and SImode.
214 Relative to reg-reg move (2). */
215 {2, 4, 2}, /* cost of storing integer registers */
216 2, /* cost of reg,reg fld/fst */
217 {8, 8, 8}, /* cost of loading fp registers
218 in SFmode, DFmode and XFmode */
219 {8, 8, 8}, /* cost of storing fp registers
220 in SFmode, DFmode and XFmode */
221 2, /* cost of moving MMX register */
222 {4, 8}, /* cost of loading MMX registers
223 in SImode and DImode */
224 {4, 8}, /* cost of storing MMX registers
225 in SImode and DImode */
226 2, /* cost of moving SSE register */
227 {4, 8, 16}, /* cost of loading SSE registers
228 in SImode, DImode and TImode */
229 {4, 8, 16}, /* cost of storing SSE registers
230 in SImode, DImode and TImode */
231 3, /* MMX or SSE register to integer */
232 0, /* size of l1 cache */
233 0, /* size of l2 cache */
234 0, /* size of prefetch block */
235 0, /* number of parallel prefetches */
236 1, /* Branch cost */
237 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
238 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
239 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
240 COSTS_N_INSNS (22), /* cost of FABS instruction. */
241 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
242 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
243 i386_memcpy,
244 i386_memset,
245 1, /* scalar_stmt_cost. */
246 1, /* scalar load_cost. */
247 1, /* scalar_store_cost. */
248 1, /* vec_stmt_cost. */
249 1, /* vec_to_scalar_cost. */
250 1, /* scalar_to_vec_cost. */
251 1, /* vec_align_load_cost. */
252 2, /* vec_unalign_load_cost. */
253 1, /* vec_store_cost. */
254 3, /* cond_taken_branch_cost. */
 255   1,                                      /* cond_not_taken_branch_cost.  */
 256 };
258 static stringop_algs i486_memcpy[2] = {
259 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
260 DUMMY_STRINGOP_ALGS};
261 static stringop_algs i486_memset[2] = {
262 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
263 DUMMY_STRINGOP_ALGS};
265 static const
266 struct processor_costs i486_cost = { /* 486 specific costs */
267 COSTS_N_INSNS (1), /* cost of an add instruction */
268 COSTS_N_INSNS (1), /* cost of a lea instruction */
269 COSTS_N_INSNS (3), /* variable shift costs */
270 COSTS_N_INSNS (2), /* constant shift costs */
271 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
272 COSTS_N_INSNS (12), /* HI */
273 COSTS_N_INSNS (12), /* SI */
274 COSTS_N_INSNS (12), /* DI */
275 COSTS_N_INSNS (12)}, /* other */
276 1, /* cost of multiply per each bit set */
277 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
278 COSTS_N_INSNS (40), /* HI */
279 COSTS_N_INSNS (40), /* SI */
280 COSTS_N_INSNS (40), /* DI */
281 COSTS_N_INSNS (40)}, /* other */
282 COSTS_N_INSNS (3), /* cost of movsx */
283 COSTS_N_INSNS (2), /* cost of movzx */
284 15, /* "large" insn */
285 3, /* MOVE_RATIO */
286 4, /* cost for loading QImode using movzbl */
287 {2, 4, 2}, /* cost of loading integer registers
288 in QImode, HImode and SImode.
289 Relative to reg-reg move (2). */
290 {2, 4, 2}, /* cost of storing integer registers */
291 2, /* cost of reg,reg fld/fst */
292 {8, 8, 8}, /* cost of loading fp registers
293 in SFmode, DFmode and XFmode */
294 {8, 8, 8}, /* cost of storing fp registers
295 in SFmode, DFmode and XFmode */
296 2, /* cost of moving MMX register */
297 {4, 8}, /* cost of loading MMX registers
298 in SImode and DImode */
299 {4, 8}, /* cost of storing MMX registers
300 in SImode and DImode */
301 2, /* cost of moving SSE register */
302 {4, 8, 16}, /* cost of loading SSE registers
303 in SImode, DImode and TImode */
304 {4, 8, 16}, /* cost of storing SSE registers
305 in SImode, DImode and TImode */
306 3, /* MMX or SSE register to integer */
307 4, /* size of l1 cache. 486 has 8kB cache
308 shared for code and data, so 4kB is
309 not really precise. */
310 4, /* size of l2 cache */
311 0, /* size of prefetch block */
312 0, /* number of parallel prefetches */
313 1, /* Branch cost */
314 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
315 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
316 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
317 COSTS_N_INSNS (3), /* cost of FABS instruction. */
318 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
319 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
320 i486_memcpy,
321 i486_memset,
322 1, /* scalar_stmt_cost. */
323 1, /* scalar load_cost. */
324 1, /* scalar_store_cost. */
325 1, /* vec_stmt_cost. */
326 1, /* vec_to_scalar_cost. */
327 1, /* scalar_to_vec_cost. */
328 1, /* vec_align_load_cost. */
329 2, /* vec_unalign_load_cost. */
330 1, /* vec_store_cost. */
331 3, /* cond_taken_branch_cost. */
 332   1,                                      /* cond_not_taken_branch_cost.  */
 333 };
335 static stringop_algs pentium_memcpy[2] = {
336 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
337 DUMMY_STRINGOP_ALGS};
338 static stringop_algs pentium_memset[2] = {
339 {libcall, {{-1, rep_prefix_4_byte, false}}},
340 DUMMY_STRINGOP_ALGS};
342 static const
343 struct processor_costs pentium_cost = {
344 COSTS_N_INSNS (1), /* cost of an add instruction */
345 COSTS_N_INSNS (1), /* cost of a lea instruction */
346 COSTS_N_INSNS (4), /* variable shift costs */
347 COSTS_N_INSNS (1), /* constant shift costs */
348 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
349 COSTS_N_INSNS (11), /* HI */
350 COSTS_N_INSNS (11), /* SI */
351 COSTS_N_INSNS (11), /* DI */
352 COSTS_N_INSNS (11)}, /* other */
353 0, /* cost of multiply per each bit set */
354 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
355 COSTS_N_INSNS (25), /* HI */
356 COSTS_N_INSNS (25), /* SI */
357 COSTS_N_INSNS (25), /* DI */
358 COSTS_N_INSNS (25)}, /* other */
359 COSTS_N_INSNS (3), /* cost of movsx */
360 COSTS_N_INSNS (2), /* cost of movzx */
361 8, /* "large" insn */
362 6, /* MOVE_RATIO */
363 6, /* cost for loading QImode using movzbl */
364 {2, 4, 2}, /* cost of loading integer registers
365 in QImode, HImode and SImode.
366 Relative to reg-reg move (2). */
367 {2, 4, 2}, /* cost of storing integer registers */
368 2, /* cost of reg,reg fld/fst */
369 {2, 2, 6}, /* cost of loading fp registers
370 in SFmode, DFmode and XFmode */
371 {4, 4, 6}, /* cost of storing fp registers
372 in SFmode, DFmode and XFmode */
373 8, /* cost of moving MMX register */
374 {8, 8}, /* cost of loading MMX registers
375 in SImode and DImode */
376 {8, 8}, /* cost of storing MMX registers
377 in SImode and DImode */
378 2, /* cost of moving SSE register */
379 {4, 8, 16}, /* cost of loading SSE registers
380 in SImode, DImode and TImode */
381 {4, 8, 16}, /* cost of storing SSE registers
382 in SImode, DImode and TImode */
383 3, /* MMX or SSE register to integer */
384 8, /* size of l1 cache. */
385 8, /* size of l2 cache */
386 0, /* size of prefetch block */
387 0, /* number of parallel prefetches */
388 2, /* Branch cost */
389 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
390 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
391 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
392 COSTS_N_INSNS (1), /* cost of FABS instruction. */
393 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
394 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
395 pentium_memcpy,
396 pentium_memset,
397 1, /* scalar_stmt_cost. */
398 1, /* scalar load_cost. */
399 1, /* scalar_store_cost. */
400 1, /* vec_stmt_cost. */
401 1, /* vec_to_scalar_cost. */
402 1, /* scalar_to_vec_cost. */
403 1, /* vec_align_load_cost. */
404 2, /* vec_unalign_load_cost. */
405 1, /* vec_store_cost. */
406 3, /* cond_taken_branch_cost. */
 407   1,                                      /* cond_not_taken_branch_cost.  */
 408 };
410 static const
411 struct processor_costs lakemont_cost = {
412 COSTS_N_INSNS (1), /* cost of an add instruction */
413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
414 COSTS_N_INSNS (1), /* variable shift costs */
415 COSTS_N_INSNS (1), /* constant shift costs */
416 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
417 COSTS_N_INSNS (11), /* HI */
418 COSTS_N_INSNS (11), /* SI */
419 COSTS_N_INSNS (11), /* DI */
420 COSTS_N_INSNS (11)}, /* other */
421 0, /* cost of multiply per each bit set */
422 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
423 COSTS_N_INSNS (25), /* HI */
424 COSTS_N_INSNS (25), /* SI */
425 COSTS_N_INSNS (25), /* DI */
426 COSTS_N_INSNS (25)}, /* other */
427 COSTS_N_INSNS (3), /* cost of movsx */
428 COSTS_N_INSNS (2), /* cost of movzx */
429 8, /* "large" insn */
430 17, /* MOVE_RATIO */
431 6, /* cost for loading QImode using movzbl */
432 {2, 4, 2}, /* cost of loading integer registers
433 in QImode, HImode and SImode.
434 Relative to reg-reg move (2). */
435 {2, 4, 2}, /* cost of storing integer registers */
436 2, /* cost of reg,reg fld/fst */
437 {2, 2, 6}, /* cost of loading fp registers
438 in SFmode, DFmode and XFmode */
439 {4, 4, 6}, /* cost of storing fp registers
440 in SFmode, DFmode and XFmode */
441 8, /* cost of moving MMX register */
442 {8, 8}, /* cost of loading MMX registers
443 in SImode and DImode */
444 {8, 8}, /* cost of storing MMX registers
445 in SImode and DImode */
446 2, /* cost of moving SSE register */
447 {4, 8, 16}, /* cost of loading SSE registers
448 in SImode, DImode and TImode */
449 {4, 8, 16}, /* cost of storing SSE registers
450 in SImode, DImode and TImode */
451 3, /* MMX or SSE register to integer */
452 8, /* size of l1 cache. */
453 8, /* size of l2 cache */
454 0, /* size of prefetch block */
455 0, /* number of parallel prefetches */
456 2, /* Branch cost */
457 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
458 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
459 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
460 COSTS_N_INSNS (1), /* cost of FABS instruction. */
461 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
462 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
463 pentium_memcpy,
464 pentium_memset,
465 1, /* scalar_stmt_cost. */
466 1, /* scalar load_cost. */
467 1, /* scalar_store_cost. */
468 1, /* vec_stmt_cost. */
469 1, /* vec_to_scalar_cost. */
470 1, /* scalar_to_vec_cost. */
471 1, /* vec_align_load_cost. */
472 2, /* vec_unalign_load_cost. */
473 1, /* vec_store_cost. */
474 3, /* cond_taken_branch_cost. */
 475   1,                                      /* cond_not_taken_branch_cost.  */
 476 };
 478 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 479    (we ensure the alignment).  For small blocks an inline loop is still a
 480    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 481    way to go.  Rep movsb apparently has a more expensive startup time in the
 482    CPU, but after 4K the difference is down in the noise.  */
483 static stringop_algs pentiumpro_memcpy[2] = {
484 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
485 {8192, rep_prefix_4_byte, false},
486 {-1, rep_prefix_1_byte, false}}},
487 DUMMY_STRINGOP_ALGS};
488 static stringop_algs pentiumpro_memset[2] = {
489 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
490 {8192, rep_prefix_4_byte, false},
491 {-1, libcall, false}}},
492 DUMMY_STRINGOP_ALGS};
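/* Editor's reading of the table above, not part of the original source: for
   32-bit PentiumPro tuning, memcpy uses an inline loop up to 128 bytes, an
   unrolled loop up to 1 kB, rep movsl up to 8 kB and rep movsb beyond that
   (with rep movsl for unknown sizes), matching the startup-cost remark in the
   comment; the 64-bit slot is DUMMY_STRINGOP_ALGS, presumably because this
   tuning predates x86-64.  */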
493 static const
494 struct processor_costs pentiumpro_cost = {
495 COSTS_N_INSNS (1), /* cost of an add instruction */
496 COSTS_N_INSNS (1), /* cost of a lea instruction */
497 COSTS_N_INSNS (1), /* variable shift costs */
498 COSTS_N_INSNS (1), /* constant shift costs */
499 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
500 COSTS_N_INSNS (4), /* HI */
501 COSTS_N_INSNS (4), /* SI */
502 COSTS_N_INSNS (4), /* DI */
503 COSTS_N_INSNS (4)}, /* other */
504 0, /* cost of multiply per each bit set */
505 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
506 COSTS_N_INSNS (17), /* HI */
507 COSTS_N_INSNS (17), /* SI */
508 COSTS_N_INSNS (17), /* DI */
509 COSTS_N_INSNS (17)}, /* other */
510 COSTS_N_INSNS (1), /* cost of movsx */
511 COSTS_N_INSNS (1), /* cost of movzx */
512 8, /* "large" insn */
513 6, /* MOVE_RATIO */
514 2, /* cost for loading QImode using movzbl */
515 {4, 4, 4}, /* cost of loading integer registers
516 in QImode, HImode and SImode.
517 Relative to reg-reg move (2). */
518 {2, 2, 2}, /* cost of storing integer registers */
519 2, /* cost of reg,reg fld/fst */
520 {2, 2, 6}, /* cost of loading fp registers
521 in SFmode, DFmode and XFmode */
522 {4, 4, 6}, /* cost of storing fp registers
523 in SFmode, DFmode and XFmode */
524 2, /* cost of moving MMX register */
525 {2, 2}, /* cost of loading MMX registers
526 in SImode and DImode */
527 {2, 2}, /* cost of storing MMX registers
528 in SImode and DImode */
529 2, /* cost of moving SSE register */
530 {2, 2, 8}, /* cost of loading SSE registers
531 in SImode, DImode and TImode */
532 {2, 2, 8}, /* cost of storing SSE registers
533 in SImode, DImode and TImode */
534 3, /* MMX or SSE register to integer */
535 8, /* size of l1 cache. */
536 256, /* size of l2 cache */
537 32, /* size of prefetch block */
538 6, /* number of parallel prefetches */
539 2, /* Branch cost */
540 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
541 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
542 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
543 COSTS_N_INSNS (2), /* cost of FABS instruction. */
544 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
545 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
546 pentiumpro_memcpy,
547 pentiumpro_memset,
548 1, /* scalar_stmt_cost. */
549 1, /* scalar load_cost. */
550 1, /* scalar_store_cost. */
551 1, /* vec_stmt_cost. */
552 1, /* vec_to_scalar_cost. */
553 1, /* scalar_to_vec_cost. */
554 1, /* vec_align_load_cost. */
555 2, /* vec_unalign_load_cost. */
556 1, /* vec_store_cost. */
557 3, /* cond_taken_branch_cost. */
 558   1,                                      /* cond_not_taken_branch_cost.  */
 559 };
561 static stringop_algs geode_memcpy[2] = {
562 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
563 DUMMY_STRINGOP_ALGS};
564 static stringop_algs geode_memset[2] = {
565 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
566 DUMMY_STRINGOP_ALGS};
567 static const
568 struct processor_costs geode_cost = {
569 COSTS_N_INSNS (1), /* cost of an add instruction */
570 COSTS_N_INSNS (1), /* cost of a lea instruction */
571 COSTS_N_INSNS (2), /* variable shift costs */
572 COSTS_N_INSNS (1), /* constant shift costs */
573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
574 COSTS_N_INSNS (4), /* HI */
575 COSTS_N_INSNS (7), /* SI */
576 COSTS_N_INSNS (7), /* DI */
577 COSTS_N_INSNS (7)}, /* other */
578 0, /* cost of multiply per each bit set */
579 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
580 COSTS_N_INSNS (23), /* HI */
581 COSTS_N_INSNS (39), /* SI */
582 COSTS_N_INSNS (39), /* DI */
583 COSTS_N_INSNS (39)}, /* other */
584 COSTS_N_INSNS (1), /* cost of movsx */
585 COSTS_N_INSNS (1), /* cost of movzx */
586 8, /* "large" insn */
587 4, /* MOVE_RATIO */
588 1, /* cost for loading QImode using movzbl */
589 {1, 1, 1}, /* cost of loading integer registers
590 in QImode, HImode and SImode.
591 Relative to reg-reg move (2). */
592 {1, 1, 1}, /* cost of storing integer registers */
593 1, /* cost of reg,reg fld/fst */
594 {1, 1, 1}, /* cost of loading fp registers
595 in SFmode, DFmode and XFmode */
596 {4, 6, 6}, /* cost of storing fp registers
597 in SFmode, DFmode and XFmode */
599 2, /* cost of moving MMX register */
600 {2, 2}, /* cost of loading MMX registers
601 in SImode and DImode */
602 {2, 2}, /* cost of storing MMX registers
603 in SImode and DImode */
604 2, /* cost of moving SSE register */
605 {2, 2, 8}, /* cost of loading SSE registers
606 in SImode, DImode and TImode */
607 {2, 2, 8}, /* cost of storing SSE registers
608 in SImode, DImode and TImode */
609 3, /* MMX or SSE register to integer */
610 64, /* size of l1 cache. */
611 128, /* size of l2 cache. */
612 32, /* size of prefetch block */
613 1, /* number of parallel prefetches */
614 1, /* Branch cost */
615 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
616 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
617 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
618 COSTS_N_INSNS (1), /* cost of FABS instruction. */
619 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
620 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
621 geode_memcpy,
622 geode_memset,
623 1, /* scalar_stmt_cost. */
624 1, /* scalar load_cost. */
625 1, /* scalar_store_cost. */
626 1, /* vec_stmt_cost. */
627 1, /* vec_to_scalar_cost. */
628 1, /* scalar_to_vec_cost. */
629 1, /* vec_align_load_cost. */
630 2, /* vec_unalign_load_cost. */
631 1, /* vec_store_cost. */
632 3, /* cond_taken_branch_cost. */
 633   1,                                      /* cond_not_taken_branch_cost.  */
 634 };
636 static stringop_algs k6_memcpy[2] = {
637 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
638 DUMMY_STRINGOP_ALGS};
639 static stringop_algs k6_memset[2] = {
640 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
641 DUMMY_STRINGOP_ALGS};
642 static const
643 struct processor_costs k6_cost = {
644 COSTS_N_INSNS (1), /* cost of an add instruction */
645 COSTS_N_INSNS (2), /* cost of a lea instruction */
646 COSTS_N_INSNS (1), /* variable shift costs */
647 COSTS_N_INSNS (1), /* constant shift costs */
648 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
649 COSTS_N_INSNS (3), /* HI */
650 COSTS_N_INSNS (3), /* SI */
651 COSTS_N_INSNS (3), /* DI */
652 COSTS_N_INSNS (3)}, /* other */
653 0, /* cost of multiply per each bit set */
654 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
655 COSTS_N_INSNS (18), /* HI */
656 COSTS_N_INSNS (18), /* SI */
657 COSTS_N_INSNS (18), /* DI */
658 COSTS_N_INSNS (18)}, /* other */
659 COSTS_N_INSNS (2), /* cost of movsx */
660 COSTS_N_INSNS (2), /* cost of movzx */
661 8, /* "large" insn */
662 4, /* MOVE_RATIO */
663 3, /* cost for loading QImode using movzbl */
664 {4, 5, 4}, /* cost of loading integer registers
665 in QImode, HImode and SImode.
666 Relative to reg-reg move (2). */
667 {2, 3, 2}, /* cost of storing integer registers */
668 4, /* cost of reg,reg fld/fst */
669 {6, 6, 6}, /* cost of loading fp registers
670 in SFmode, DFmode and XFmode */
671 {4, 4, 4}, /* cost of storing fp registers
672 in SFmode, DFmode and XFmode */
673 2, /* cost of moving MMX register */
674 {2, 2}, /* cost of loading MMX registers
675 in SImode and DImode */
676 {2, 2}, /* cost of storing MMX registers
677 in SImode and DImode */
678 2, /* cost of moving SSE register */
679 {2, 2, 8}, /* cost of loading SSE registers
680 in SImode, DImode and TImode */
681 {2, 2, 8}, /* cost of storing SSE registers
682 in SImode, DImode and TImode */
683 6, /* MMX or SSE register to integer */
684 32, /* size of l1 cache. */
685 32, /* size of l2 cache. Some models
686 have integrated l2 cache, but
687 optimizing for k6 is not important
688 enough to worry about that. */
689 32, /* size of prefetch block */
690 1, /* number of parallel prefetches */
691 1, /* Branch cost */
692 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
693 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
694 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
695 COSTS_N_INSNS (2), /* cost of FABS instruction. */
696 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
697 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
698 k6_memcpy,
699 k6_memset,
700 1, /* scalar_stmt_cost. */
701 1, /* scalar load_cost. */
702 1, /* scalar_store_cost. */
703 1, /* vec_stmt_cost. */
704 1, /* vec_to_scalar_cost. */
705 1, /* scalar_to_vec_cost. */
706 1, /* vec_align_load_cost. */
707 2, /* vec_unalign_load_cost. */
708 1, /* vec_store_cost. */
709 3, /* cond_taken_branch_cost. */
 710   1,                                      /* cond_not_taken_branch_cost.  */
 711 };
 713 /* For some reason, Athlon deals better with the REP prefix (relative to
 714    loops) than K8 does.  Alignment becomes important after 8 bytes for
 715    memcpy and 128 bytes for memset.  */
716 static stringop_algs athlon_memcpy[2] = {
717 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
718 DUMMY_STRINGOP_ALGS};
719 static stringop_algs athlon_memset[2] = {
720 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
721 DUMMY_STRINGOP_ALGS};
722 static const
723 struct processor_costs athlon_cost = {
724 COSTS_N_INSNS (1), /* cost of an add instruction */
725 COSTS_N_INSNS (2), /* cost of a lea instruction */
726 COSTS_N_INSNS (1), /* variable shift costs */
727 COSTS_N_INSNS (1), /* constant shift costs */
728 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
729 COSTS_N_INSNS (5), /* HI */
730 COSTS_N_INSNS (5), /* SI */
731 COSTS_N_INSNS (5), /* DI */
732 COSTS_N_INSNS (5)}, /* other */
733 0, /* cost of multiply per each bit set */
734 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
735 COSTS_N_INSNS (26), /* HI */
736 COSTS_N_INSNS (42), /* SI */
737 COSTS_N_INSNS (74), /* DI */
738 COSTS_N_INSNS (74)}, /* other */
739 COSTS_N_INSNS (1), /* cost of movsx */
740 COSTS_N_INSNS (1), /* cost of movzx */
741 8, /* "large" insn */
742 9, /* MOVE_RATIO */
743 4, /* cost for loading QImode using movzbl */
744 {3, 4, 3}, /* cost of loading integer registers
745 in QImode, HImode and SImode.
746 Relative to reg-reg move (2). */
747 {3, 4, 3}, /* cost of storing integer registers */
748 4, /* cost of reg,reg fld/fst */
749 {4, 4, 12}, /* cost of loading fp registers
750 in SFmode, DFmode and XFmode */
751 {6, 6, 8}, /* cost of storing fp registers
752 in SFmode, DFmode and XFmode */
753 2, /* cost of moving MMX register */
754 {4, 4}, /* cost of loading MMX registers
755 in SImode and DImode */
756 {4, 4}, /* cost of storing MMX registers
757 in SImode and DImode */
758 2, /* cost of moving SSE register */
759 {4, 4, 6}, /* cost of loading SSE registers
760 in SImode, DImode and TImode */
761 {4, 4, 5}, /* cost of storing SSE registers
762 in SImode, DImode and TImode */
763 5, /* MMX or SSE register to integer */
764 64, /* size of l1 cache. */
765 256, /* size of l2 cache. */
766 64, /* size of prefetch block */
767 6, /* number of parallel prefetches */
768 5, /* Branch cost */
769 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
770 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
771 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
772 COSTS_N_INSNS (2), /* cost of FABS instruction. */
773 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
774 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
775 athlon_memcpy,
776 athlon_memset,
777 1, /* scalar_stmt_cost. */
778 1, /* scalar load_cost. */
779 1, /* scalar_store_cost. */
780 1, /* vec_stmt_cost. */
781 1, /* vec_to_scalar_cost. */
782 1, /* scalar_to_vec_cost. */
783 1, /* vec_align_load_cost. */
784 2, /* vec_unalign_load_cost. */
785 1, /* vec_store_cost. */
786 3, /* cond_taken_branch_cost. */
 787   1,                                      /* cond_not_taken_branch_cost.  */
 788 };
 790 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 791    small blocks it is better to use a loop.  For large blocks, a libcall can
 792    do nontemporal accesses and beat inlining considerably.  */
793 static stringop_algs k8_memcpy[2] = {
794 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
795 {-1, rep_prefix_4_byte, false}}},
796 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
797 {-1, libcall, false}}}};
798 static stringop_algs k8_memset[2] = {
799 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
800 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
801 {libcall, {{48, unrolled_loop, false},
802 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
803 static const
804 struct processor_costs k8_cost = {
805 COSTS_N_INSNS (1), /* cost of an add instruction */
806 COSTS_N_INSNS (2), /* cost of a lea instruction */
807 COSTS_N_INSNS (1), /* variable shift costs */
808 COSTS_N_INSNS (1), /* constant shift costs */
809 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
810 COSTS_N_INSNS (4), /* HI */
811 COSTS_N_INSNS (3), /* SI */
812 COSTS_N_INSNS (4), /* DI */
813 COSTS_N_INSNS (5)}, /* other */
814 0, /* cost of multiply per each bit set */
815 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
816 COSTS_N_INSNS (26), /* HI */
817 COSTS_N_INSNS (42), /* SI */
818 COSTS_N_INSNS (74), /* DI */
819 COSTS_N_INSNS (74)}, /* other */
820 COSTS_N_INSNS (1), /* cost of movsx */
821 COSTS_N_INSNS (1), /* cost of movzx */
822 8, /* "large" insn */
823 9, /* MOVE_RATIO */
824 4, /* cost for loading QImode using movzbl */
825 {3, 4, 3}, /* cost of loading integer registers
826 in QImode, HImode and SImode.
827 Relative to reg-reg move (2). */
828 {3, 4, 3}, /* cost of storing integer registers */
829 4, /* cost of reg,reg fld/fst */
830 {4, 4, 12}, /* cost of loading fp registers
831 in SFmode, DFmode and XFmode */
832 {6, 6, 8}, /* cost of storing fp registers
833 in SFmode, DFmode and XFmode */
834 2, /* cost of moving MMX register */
835 {3, 3}, /* cost of loading MMX registers
836 in SImode and DImode */
837 {4, 4}, /* cost of storing MMX registers
838 in SImode and DImode */
839 2, /* cost of moving SSE register */
840 {4, 3, 6}, /* cost of loading SSE registers
841 in SImode, DImode and TImode */
842 {4, 4, 5}, /* cost of storing SSE registers
843 in SImode, DImode and TImode */
844 5, /* MMX or SSE register to integer */
845 64, /* size of l1 cache. */
846 512, /* size of l2 cache. */
847 64, /* size of prefetch block */
 848 /* New AMD processors never drop prefetches; if they cannot be performed
 849    immediately, they are queued.  We set the number of simultaneous
 850    prefetches to a large constant to reflect this (it is probably not a
 851    good idea to leave the number of prefetches unlimited, as their
 852    execution also takes some time).  */
853 100, /* number of parallel prefetches */
854 3, /* Branch cost */
855 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
856 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
857 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
858 COSTS_N_INSNS (2), /* cost of FABS instruction. */
859 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
860 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
862 k8_memcpy,
863 k8_memset,
864 4, /* scalar_stmt_cost. */
865 2, /* scalar load_cost. */
866 2, /* scalar_store_cost. */
867 5, /* vec_stmt_cost. */
868 0, /* vec_to_scalar_cost. */
869 2, /* scalar_to_vec_cost. */
870 2, /* vec_align_load_cost. */
871 3, /* vec_unalign_load_cost. */
872 3, /* vec_store_cost. */
873 3, /* cond_taken_branch_cost. */
 874   2,                                      /* cond_not_taken_branch_cost.  */
 875 };
 877 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
 878    for very small blocks it is better to use a loop.  For large blocks, a
 879    libcall can do nontemporal accesses and beat inlining considerably.  */
880 static stringop_algs amdfam10_memcpy[2] = {
881 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
882 {-1, rep_prefix_4_byte, false}}},
883 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
884 {-1, libcall, false}}}};
885 static stringop_algs amdfam10_memset[2] = {
886 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
887 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
888 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
889 {-1, libcall, false}}}};
890 struct processor_costs amdfam10_cost = {
891 COSTS_N_INSNS (1), /* cost of an add instruction */
892 COSTS_N_INSNS (2), /* cost of a lea instruction */
893 COSTS_N_INSNS (1), /* variable shift costs */
894 COSTS_N_INSNS (1), /* constant shift costs */
895 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
896 COSTS_N_INSNS (4), /* HI */
897 COSTS_N_INSNS (3), /* SI */
898 COSTS_N_INSNS (4), /* DI */
899 COSTS_N_INSNS (5)}, /* other */
900 0, /* cost of multiply per each bit set */
901 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
902 COSTS_N_INSNS (35), /* HI */
903 COSTS_N_INSNS (51), /* SI */
904 COSTS_N_INSNS (83), /* DI */
905 COSTS_N_INSNS (83)}, /* other */
906 COSTS_N_INSNS (1), /* cost of movsx */
907 COSTS_N_INSNS (1), /* cost of movzx */
908 8, /* "large" insn */
909 9, /* MOVE_RATIO */
910 4, /* cost for loading QImode using movzbl */
911 {3, 4, 3}, /* cost of loading integer registers
912 in QImode, HImode and SImode.
913 Relative to reg-reg move (2). */
914 {3, 4, 3}, /* cost of storing integer registers */
915 4, /* cost of reg,reg fld/fst */
916 {4, 4, 12}, /* cost of loading fp registers
917 in SFmode, DFmode and XFmode */
918 {6, 6, 8}, /* cost of storing fp registers
919 in SFmode, DFmode and XFmode */
920 2, /* cost of moving MMX register */
921 {3, 3}, /* cost of loading MMX registers
922 in SImode and DImode */
923 {4, 4}, /* cost of storing MMX registers
924 in SImode and DImode */
925 2, /* cost of moving SSE register */
926 {4, 4, 3}, /* cost of loading SSE registers
927 in SImode, DImode and TImode */
928 {4, 4, 5}, /* cost of storing SSE registers
929 in SImode, DImode and TImode */
930 3, /* MMX or SSE register to integer */
931 /* On K8:
932 MOVD reg64, xmmreg Double FSTORE 4
933 MOVD reg32, xmmreg Double FSTORE 4
934 On AMDFAM10:
935 MOVD reg64, xmmreg Double FADD 3
936 1/1 1/1
937 MOVD reg32, xmmreg Double FADD 3
938 1/1 1/1 */
939 64, /* size of l1 cache. */
940 512, /* size of l2 cache. */
941 64, /* size of prefetch block */
 942 /* New AMD processors never drop prefetches; if they cannot be performed
 943    immediately, they are queued.  We set the number of simultaneous
 944    prefetches to a large constant to reflect this (it is probably not a
 945    good idea to leave the number of prefetches unlimited, as their
 946    execution also takes some time).  */
947 100, /* number of parallel prefetches */
948 2, /* Branch cost */
949 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
950 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
951 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
952 COSTS_N_INSNS (2), /* cost of FABS instruction. */
953 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
954 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
956 amdfam10_memcpy,
957 amdfam10_memset,
958 4, /* scalar_stmt_cost. */
959 2, /* scalar load_cost. */
960 2, /* scalar_store_cost. */
961 6, /* vec_stmt_cost. */
962 0, /* vec_to_scalar_cost. */
963 2, /* scalar_to_vec_cost. */
964 2, /* vec_align_load_cost. */
965 2, /* vec_unalign_load_cost. */
966 2, /* vec_store_cost. */
967 2, /* cond_taken_branch_cost. */
 968   1,                                      /* cond_not_taken_branch_cost.  */
 969 };
 971 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
 972    very small blocks it is better to use a loop.  For large blocks, a libcall
 973    can do nontemporal accesses and beat inlining considerably.  */
974 static stringop_algs bdver1_memcpy[2] = {
975 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
976 {-1, rep_prefix_4_byte, false}}},
977 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
978 {-1, libcall, false}}}};
979 static stringop_algs bdver1_memset[2] = {
980 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
981 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
982 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
983 {-1, libcall, false}}}};
985 const struct processor_costs bdver1_cost = {
986 COSTS_N_INSNS (1), /* cost of an add instruction */
987 COSTS_N_INSNS (1), /* cost of a lea instruction */
988 COSTS_N_INSNS (1), /* variable shift costs */
989 COSTS_N_INSNS (1), /* constant shift costs */
990 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
991 COSTS_N_INSNS (4), /* HI */
992 COSTS_N_INSNS (4), /* SI */
993 COSTS_N_INSNS (6), /* DI */
994 COSTS_N_INSNS (6)}, /* other */
995 0, /* cost of multiply per each bit set */
996 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
997 COSTS_N_INSNS (35), /* HI */
998 COSTS_N_INSNS (51), /* SI */
999 COSTS_N_INSNS (83), /* DI */
1000 COSTS_N_INSNS (83)}, /* other */
1001 COSTS_N_INSNS (1), /* cost of movsx */
1002 COSTS_N_INSNS (1), /* cost of movzx */
1003 8, /* "large" insn */
1004 9, /* MOVE_RATIO */
1005 4, /* cost for loading QImode using movzbl */
1006 {5, 5, 4}, /* cost of loading integer registers
1007 in QImode, HImode and SImode.
1008 Relative to reg-reg move (2). */
1009 {4, 4, 4}, /* cost of storing integer registers */
1010 2, /* cost of reg,reg fld/fst */
1011 {5, 5, 12}, /* cost of loading fp registers
1012 in SFmode, DFmode and XFmode */
1013 {4, 4, 8}, /* cost of storing fp registers
1014 in SFmode, DFmode and XFmode */
1015 2, /* cost of moving MMX register */
1016 {4, 4}, /* cost of loading MMX registers
1017 in SImode and DImode */
1018 {4, 4}, /* cost of storing MMX registers
1019 in SImode and DImode */
1020 2, /* cost of moving SSE register */
1021 {4, 4, 4}, /* cost of loading SSE registers
1022 in SImode, DImode and TImode */
1023 {4, 4, 4}, /* cost of storing SSE registers
1024 in SImode, DImode and TImode */
1025 2, /* MMX or SSE register to integer */
1026 /* On K8:
1027 MOVD reg64, xmmreg Double FSTORE 4
1028 MOVD reg32, xmmreg Double FSTORE 4
1029 On AMDFAM10:
1030 MOVD reg64, xmmreg Double FADD 3
1031 1/1 1/1
1032 MOVD reg32, xmmreg Double FADD 3
1033 1/1 1/1 */
1034 16, /* size of l1 cache. */
1035 2048, /* size of l2 cache. */
1036 64, /* size of prefetch block */
 1037 /* New AMD processors never drop prefetches; if they cannot be performed
 1038    immediately, they are queued.  We set the number of simultaneous
 1039    prefetches to a large constant to reflect this (it is probably not a
 1040    good idea to leave the number of prefetches unlimited, as their
 1041    execution also takes some time).  */
1042 100, /* number of parallel prefetches */
1043 2, /* Branch cost */
1044 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1045 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1046 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1047 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1048 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1049 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1051 bdver1_memcpy,
1052 bdver1_memset,
1053 6, /* scalar_stmt_cost. */
1054 4, /* scalar load_cost. */
1055 4, /* scalar_store_cost. */
1056 6, /* vec_stmt_cost. */
1057 0, /* vec_to_scalar_cost. */
1058 2, /* scalar_to_vec_cost. */
1059 4, /* vec_align_load_cost. */
1060 4, /* vec_unalign_load_cost. */
1061 4, /* vec_store_cost. */
1062 4, /* cond_taken_branch_cost. */
 1063   2,                                     /* cond_not_taken_branch_cost.  */
 1064 };
 1066 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
 1067    very small blocks it is better to use a loop.  For large blocks, a libcall
 1068    can do nontemporal accesses and beat inlining considerably.  */
1070 static stringop_algs bdver2_memcpy[2] = {
1071 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1072 {-1, rep_prefix_4_byte, false}}},
1073 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1074 {-1, libcall, false}}}};
1075 static stringop_algs bdver2_memset[2] = {
1076 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1077 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1078 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1079 {-1, libcall, false}}}};
1081 const struct processor_costs bdver2_cost = {
1082 COSTS_N_INSNS (1), /* cost of an add instruction */
1083 COSTS_N_INSNS (1), /* cost of a lea instruction */
1084 COSTS_N_INSNS (1), /* variable shift costs */
1085 COSTS_N_INSNS (1), /* constant shift costs */
1086 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1087 COSTS_N_INSNS (4), /* HI */
1088 COSTS_N_INSNS (4), /* SI */
1089 COSTS_N_INSNS (6), /* DI */
1090 COSTS_N_INSNS (6)}, /* other */
1091 0, /* cost of multiply per each bit set */
1092 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1093 COSTS_N_INSNS (35), /* HI */
1094 COSTS_N_INSNS (51), /* SI */
1095 COSTS_N_INSNS (83), /* DI */
1096 COSTS_N_INSNS (83)}, /* other */
1097 COSTS_N_INSNS (1), /* cost of movsx */
1098 COSTS_N_INSNS (1), /* cost of movzx */
1099 8, /* "large" insn */
1100 9, /* MOVE_RATIO */
1101 4, /* cost for loading QImode using movzbl */
1102 {5, 5, 4}, /* cost of loading integer registers
1103 in QImode, HImode and SImode.
1104 Relative to reg-reg move (2). */
1105 {4, 4, 4}, /* cost of storing integer registers */
1106 2, /* cost of reg,reg fld/fst */
1107 {5, 5, 12}, /* cost of loading fp registers
1108 in SFmode, DFmode and XFmode */
1109 {4, 4, 8}, /* cost of storing fp registers
1110 in SFmode, DFmode and XFmode */
1111 2, /* cost of moving MMX register */
1112 {4, 4}, /* cost of loading MMX registers
1113 in SImode and DImode */
1114 {4, 4}, /* cost of storing MMX registers
1115 in SImode and DImode */
1116 2, /* cost of moving SSE register */
1117 {4, 4, 4}, /* cost of loading SSE registers
1118 in SImode, DImode and TImode */
1119 {4, 4, 4}, /* cost of storing SSE registers
1120 in SImode, DImode and TImode */
1121 2, /* MMX or SSE register to integer */
1122 /* On K8:
1123 MOVD reg64, xmmreg Double FSTORE 4
1124 MOVD reg32, xmmreg Double FSTORE 4
1125 On AMDFAM10:
1126 MOVD reg64, xmmreg Double FADD 3
1127 1/1 1/1
1128 MOVD reg32, xmmreg Double FADD 3
1129 1/1 1/1 */
1130 16, /* size of l1 cache. */
1131 2048, /* size of l2 cache. */
1132 64, /* size of prefetch block */
 1133 /* New AMD processors never drop prefetches; if they cannot be performed
 1134    immediately, they are queued.  We set the number of simultaneous
 1135    prefetches to a large constant to reflect this (it is probably not a
 1136    good idea to leave the number of prefetches unlimited, as their
 1137    execution also takes some time).  */
1138 100, /* number of parallel prefetches */
1139 2, /* Branch cost */
1140 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1147 bdver2_memcpy,
1148 bdver2_memset,
1149 6, /* scalar_stmt_cost. */
1150 4, /* scalar load_cost. */
1151 4, /* scalar_store_cost. */
1152 6, /* vec_stmt_cost. */
1153 0, /* vec_to_scalar_cost. */
1154 2, /* scalar_to_vec_cost. */
1155 4, /* vec_align_load_cost. */
1156 4, /* vec_unalign_load_cost. */
1157 4, /* vec_store_cost. */
1158 4, /* cond_taken_branch_cost. */
 1159   2,                                     /* cond_not_taken_branch_cost.  */
 1160 };
 1163 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
 1164    very small blocks it is better to use a loop.  For large blocks, a libcall
 1165    can do nontemporal accesses and beat inlining considerably.  */
1166 static stringop_algs bdver3_memcpy[2] = {
1167 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1168 {-1, rep_prefix_4_byte, false}}},
1169 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}};
1171 static stringop_algs bdver3_memset[2] = {
1172 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1173 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1174 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1175 {-1, libcall, false}}}};
1176 struct processor_costs bdver3_cost = {
1177 COSTS_N_INSNS (1), /* cost of an add instruction */
1178 COSTS_N_INSNS (1), /* cost of a lea instruction */
1179 COSTS_N_INSNS (1), /* variable shift costs */
1180 COSTS_N_INSNS (1), /* constant shift costs */
1181 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1182 COSTS_N_INSNS (4), /* HI */
1183 COSTS_N_INSNS (4), /* SI */
1184 COSTS_N_INSNS (6), /* DI */
1185 COSTS_N_INSNS (6)}, /* other */
1186 0, /* cost of multiply per each bit set */
1187 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1188 COSTS_N_INSNS (35), /* HI */
1189 COSTS_N_INSNS (51), /* SI */
1190 COSTS_N_INSNS (83), /* DI */
1191 COSTS_N_INSNS (83)}, /* other */
1192 COSTS_N_INSNS (1), /* cost of movsx */
1193 COSTS_N_INSNS (1), /* cost of movzx */
1194 8, /* "large" insn */
1195 9, /* MOVE_RATIO */
1196 4, /* cost for loading QImode using movzbl */
1197 {5, 5, 4}, /* cost of loading integer registers
1198 in QImode, HImode and SImode.
1199 Relative to reg-reg move (2). */
1200 {4, 4, 4}, /* cost of storing integer registers */
1201 2, /* cost of reg,reg fld/fst */
1202 {5, 5, 12}, /* cost of loading fp registers
1203 in SFmode, DFmode and XFmode */
1204 {4, 4, 8}, /* cost of storing fp registers
1205 in SFmode, DFmode and XFmode */
1206 2, /* cost of moving MMX register */
1207 {4, 4}, /* cost of loading MMX registers
1208 in SImode and DImode */
1209 {4, 4}, /* cost of storing MMX registers
1210 in SImode and DImode */
1211 2, /* cost of moving SSE register */
1212 {4, 4, 4}, /* cost of loading SSE registers
1213 in SImode, DImode and TImode */
1214 {4, 4, 4}, /* cost of storing SSE registers
1215 in SImode, DImode and TImode */
1216 2, /* MMX or SSE register to integer */
1217 16, /* size of l1 cache. */
1218 2048, /* size of l2 cache. */
1219 64, /* size of prefetch block */
 1220 /* New AMD processors never drop prefetches; if they cannot be performed
 1221    immediately, they are queued.  We set the number of simultaneous
 1222    prefetches to a large constant to reflect this (it is probably not a
 1223    good idea to leave the number of prefetches unlimited, as their
 1224    execution also takes some time).  */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1234 bdver3_memcpy,
1235 bdver3_memset,
1236 6, /* scalar_stmt_cost. */
1237 4, /* scalar load_cost. */
1238 4, /* scalar_store_cost. */
1239 6, /* vec_stmt_cost. */
1240 0, /* vec_to_scalar_cost. */
1241 2, /* scalar_to_vec_cost. */
1242 4, /* vec_align_load_cost. */
1243 4, /* vec_unalign_load_cost. */
1244 4, /* vec_store_cost. */
1245 4, /* cond_taken_branch_cost. */
 1246   2,                                     /* cond_not_taken_branch_cost.  */
 1247 };
 1249 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
 1250    very small blocks it is better to use a loop.  For large blocks, a libcall
 1251    can do nontemporal accesses and beat inlining considerably.  */
1252 static stringop_algs bdver4_memcpy[2] = {
1253 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1254 {-1, rep_prefix_4_byte, false}}},
1255 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1256 {-1, libcall, false}}}};
1257 static stringop_algs bdver4_memset[2] = {
1258 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1259 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1260 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1261 {-1, libcall, false}}}};
1262 struct processor_costs bdver4_cost = {
1263 COSTS_N_INSNS (1), /* cost of an add instruction */
1264 COSTS_N_INSNS (1), /* cost of a lea instruction */
1265 COSTS_N_INSNS (1), /* variable shift costs */
1266 COSTS_N_INSNS (1), /* constant shift costs */
1267 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1268 COSTS_N_INSNS (4), /* HI */
1269 COSTS_N_INSNS (4), /* SI */
1270 COSTS_N_INSNS (6), /* DI */
1271 COSTS_N_INSNS (6)}, /* other */
1272 0, /* cost of multiply per each bit set */
1273 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1274 COSTS_N_INSNS (35), /* HI */
1275 COSTS_N_INSNS (51), /* SI */
1276 COSTS_N_INSNS (83), /* DI */
1277 COSTS_N_INSNS (83)}, /* other */
1278 COSTS_N_INSNS (1), /* cost of movsx */
1279 COSTS_N_INSNS (1), /* cost of movzx */
1280 8, /* "large" insn */
1281 9, /* MOVE_RATIO */
1282 4, /* cost for loading QImode using movzbl */
1283 {5, 5, 4}, /* cost of loading integer registers
1284 in QImode, HImode and SImode.
1285 Relative to reg-reg move (2). */
1286 {4, 4, 4}, /* cost of storing integer registers */
1287 2, /* cost of reg,reg fld/fst */
1288 {5, 5, 12}, /* cost of loading fp registers
1289 in SFmode, DFmode and XFmode */
1290 {4, 4, 8}, /* cost of storing fp registers
1291 in SFmode, DFmode and XFmode */
1292 2, /* cost of moving MMX register */
1293 {4, 4}, /* cost of loading MMX registers
1294 in SImode and DImode */
1295 {4, 4}, /* cost of storing MMX registers
1296 in SImode and DImode */
1297 2, /* cost of moving SSE register */
1298 {4, 4, 4}, /* cost of loading SSE registers
1299 in SImode, DImode and TImode */
1300 {4, 4, 4}, /* cost of storing SSE registers
1301 in SImode, DImode and TImode */
1302 2, /* MMX or SSE register to integer */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
 1306 /* New AMD processors never drop prefetches; if they cannot be performed
 1307    immediately, they are queued.  We set the number of simultaneous
 1308    prefetches to a large constant to reflect this (it is probably not a
 1309    good idea to leave the number of prefetches unlimited, as their
 1310    execution also takes some time).  */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320 bdver4_memcpy,
1321 bdver4_memset,
1322 6, /* scalar_stmt_cost. */
1323 4, /* scalar load_cost. */
1324 4, /* scalar_store_cost. */
1325 6, /* vec_stmt_cost. */
1326 0, /* vec_to_scalar_cost. */
1327 2, /* scalar_to_vec_cost. */
1328 4, /* vec_align_load_cost. */
1329 4, /* vec_unalign_load_cost. */
1330 4, /* vec_store_cost. */
1331 4, /* cond_taken_branch_cost. */
 1332   2,                                     /* cond_not_taken_branch_cost.  */
 1333 };
 1336 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
 1337    very small blocks it is better to use a loop.  For large blocks, a libcall
 1338    can do nontemporal accesses and beat inlining considerably.  */
1339 static stringop_algs znver1_memcpy[2] = {
1340 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1341 {-1, rep_prefix_4_byte, false}}},
1342 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1343 {-1, libcall, false}}}};
1344 static stringop_algs znver1_memset[2] = {
1345 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1346 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1347 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1348 {-1, libcall, false}}}};
1349 struct processor_costs znver1_cost = {
1350 COSTS_N_INSNS (1), /* cost of an add instruction. */
1351 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1352 COSTS_N_INSNS (1), /* variable shift costs. */
1353 COSTS_N_INSNS (1), /* constant shift costs. */
1354 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1355 COSTS_N_INSNS (3), /* HI. */
1356 COSTS_N_INSNS (3), /* SI. */
1357 COSTS_N_INSNS (4), /* DI. */
1358 COSTS_N_INSNS (4)}, /* other. */
1359 0, /* cost of multiply per each bit
1360 set. */
1361 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1362 COSTS_N_INSNS (35), /* HI. */
1363 COSTS_N_INSNS (51), /* SI. */
1364 COSTS_N_INSNS (83), /* DI. */
1365 COSTS_N_INSNS (83)}, /* other. */
1366 COSTS_N_INSNS (1), /* cost of movsx. */
1367 COSTS_N_INSNS (1), /* cost of movzx. */
1368 8, /* "large" insn. */
1369 9, /* MOVE_RATIO. */
1370 4, /* cost for loading QImode using
1371 movzbl. */
1372 {5, 5, 4}, /* cost of loading integer registers
1373 in QImode, HImode and SImode.
1374 Relative to reg-reg move (2). */
1375 {4, 4, 4}, /* cost of storing integer
1376 registers. */
1377 2, /* cost of reg,reg fld/fst. */
1378 {5, 5, 12}, /* cost of loading fp registers
1379 in SFmode, DFmode and XFmode. */
1380 {4, 4, 8}, /* cost of storing fp registers
1381 in SFmode, DFmode and XFmode. */
1382 2, /* cost of moving MMX register. */
1383 {4, 4}, /* cost of loading MMX registers
1384 in SImode and DImode. */
1385 {4, 4}, /* cost of storing MMX registers
1386 in SImode and DImode. */
1387 2, /* cost of moving SSE register. */
1388 {4, 4, 4}, /* cost of loading SSE registers
1389 in SImode, DImode and TImode. */
1390 {4, 4, 4}, /* cost of storing SSE registers
1391 in SImode, DImode and TImode. */
1392 2, /* MMX or SSE register to integer. */
1393 32, /* size of l1 cache. */
1394 512, /* size of l2 cache. */
1395 64, /* size of prefetch block. */
 1396 /* New AMD processors never drop prefetches; if they cannot be performed
 1397    immediately, they are queued.  We set the number of simultaneous
 1398    prefetches to a large constant to reflect this (it is probably not a
 1399    good idea to leave the number of prefetches unlimited, as their
 1400    execution also takes some time).  */
1401 100, /* number of parallel prefetches. */
1402 2, /* Branch cost. */
1403 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1404 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1405 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1406 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1407 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1408 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1410 znver1_memcpy,
1411 znver1_memset,
1412 6, /* scalar_stmt_cost. */
1413 4, /* scalar load_cost. */
1414 4, /* scalar_store_cost. */
1415 6, /* vec_stmt_cost. */
1416 0, /* vec_to_scalar_cost. */
1417 2, /* scalar_to_vec_cost. */
1418 4, /* vec_align_load_cost. */
1419 4, /* vec_unalign_load_cost. */
1420 4, /* vec_store_cost. */
1421 4, /* cond_taken_branch_cost. */
 1422   2,                                     /* cond_not_taken_branch_cost.  */
 1423 };
 1425 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
 1426    very small blocks it is better to use a loop.  For large blocks, a libcall
 1427    can do nontemporal accesses and beat inlining considerably.  */
1428 static stringop_algs btver1_memcpy[2] = {
1429 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1430 {-1, rep_prefix_4_byte, false}}},
1431 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1432 {-1, libcall, false}}}};
1433 static stringop_algs btver1_memset[2] = {
1434 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1435 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1436 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1437 {-1, libcall, false}}}};
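/* A minimal sketch of how one of the stringop_algs descriptors above is
   consulted when expanding a block move or clear, assuming the layout
   declared in i386.h (an unknown_size algorithm plus a size[] array of
   {max, alg, noalign} entries); sketch_pick_alg is an illustrative name,
   the real selection logic lives in decide_alg later in this file.
   Element [0] of each descriptor pair covers 32-bit code, element [1]
   64-bit code, and max == -1 terminates the list ("all larger sizes").  */
#if 0
static enum stringop_alg
sketch_pick_alg (const struct stringop_algs *algs, HOST_WIDE_INT size,
		 bool size_known)
{
  if (!size_known)
    return algs->unknown_size;		/* e.g. libcall in the tables above.  */
  for (int i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || size <= algs->size[i].max)
      return algs->size[i].alg;
  return libcall;
}
#endif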
1438 const struct processor_costs btver1_cost = {
1439 COSTS_N_INSNS (1), /* cost of an add instruction */
1440 COSTS_N_INSNS (2), /* cost of a lea instruction */
1441 COSTS_N_INSNS (1), /* variable shift costs */
1442 COSTS_N_INSNS (1), /* constant shift costs */
1443 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1444 COSTS_N_INSNS (4), /* HI */
1445 COSTS_N_INSNS (3), /* SI */
1446 COSTS_N_INSNS (4), /* DI */
1447 COSTS_N_INSNS (5)}, /* other */
1448 0, /* cost of multiply per each bit set */
1449 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1450 COSTS_N_INSNS (35), /* HI */
1451 COSTS_N_INSNS (51), /* SI */
1452 COSTS_N_INSNS (83), /* DI */
1453 COSTS_N_INSNS (83)}, /* other */
1454 COSTS_N_INSNS (1), /* cost of movsx */
1455 COSTS_N_INSNS (1), /* cost of movzx */
1456 8, /* "large" insn */
1457 9, /* MOVE_RATIO */
1458 4, /* cost for loading QImode using movzbl */
1459 {3, 4, 3}, /* cost of loading integer registers
1460 in QImode, HImode and SImode.
1461 Relative to reg-reg move (2). */
1462 {3, 4, 3}, /* cost of storing integer registers */
1463 4, /* cost of reg,reg fld/fst */
1464 {4, 4, 12}, /* cost of loading fp registers
1465 in SFmode, DFmode and XFmode */
1466 {6, 6, 8}, /* cost of storing fp registers
1467 in SFmode, DFmode and XFmode */
1468 2, /* cost of moving MMX register */
1469 {3, 3}, /* cost of loading MMX registers
1470 in SImode and DImode */
1471 {4, 4}, /* cost of storing MMX registers
1472 in SImode and DImode */
1473 2, /* cost of moving SSE register */
1474 {4, 4, 3}, /* cost of loading SSE registers
1475 in SImode, DImode and TImode */
1476 {4, 4, 5}, /* cost of storing SSE registers
1477 in SImode, DImode and TImode */
1478 3, /* MMX or SSE register to integer */
1479 /* On K8:
1480 MOVD reg64, xmmreg Double FSTORE 4
1481 MOVD reg32, xmmreg Double FSTORE 4
1482 On AMDFAM10:
1483 MOVD reg64, xmmreg Double FADD 3
1484 1/1 1/1
1485 MOVD reg32, xmmreg Double FADD 3
1486 1/1 1/1 */
1487 32, /* size of l1 cache. */
1488 512, /* size of l2 cache. */
1489 64, /* size of prefetch block */
1490 100, /* number of parallel prefetches */
1491 2, /* Branch cost */
1492 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1493 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1494 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1495 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1496 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1497 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1499 btver1_memcpy,
1500 btver1_memset,
1501 4, /* scalar_stmt_cost. */
1502 2, /* scalar load_cost. */
1503 2, /* scalar_store_cost. */
1504 6, /* vec_stmt_cost. */
1505 0, /* vec_to_scalar_cost. */
1506 2, /* scalar_to_vec_cost. */
1507 2, /* vec_align_load_cost. */
1508 2, /* vec_unalign_load_cost. */
1509 2, /* vec_store_cost. */
1510 2, /* cond_taken_branch_cost. */
1511 1, /* cond_not_taken_branch_cost. */
1514 static stringop_algs btver2_memcpy[2] = {
1515 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1516 {-1, rep_prefix_4_byte, false}}},
1517 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1518 {-1, libcall, false}}}};
1519 static stringop_algs btver2_memset[2] = {
1520 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1521 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1522 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1523 {-1, libcall, false}}}};
1524 const struct processor_costs btver2_cost = {
1525 COSTS_N_INSNS (1), /* cost of an add instruction */
1526 COSTS_N_INSNS (2), /* cost of a lea instruction */
1527 COSTS_N_INSNS (1), /* variable shift costs */
1528 COSTS_N_INSNS (1), /* constant shift costs */
1529 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1530 COSTS_N_INSNS (4), /* HI */
1531 COSTS_N_INSNS (3), /* SI */
1532 COSTS_N_INSNS (4), /* DI */
1533 COSTS_N_INSNS (5)}, /* other */
1534 0, /* cost of multiply per each bit set */
1535 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1536 COSTS_N_INSNS (35), /* HI */
1537 COSTS_N_INSNS (51), /* SI */
1538 COSTS_N_INSNS (83), /* DI */
1539 COSTS_N_INSNS (83)}, /* other */
1540 COSTS_N_INSNS (1), /* cost of movsx */
1541 COSTS_N_INSNS (1), /* cost of movzx */
1542 8, /* "large" insn */
1543 9, /* MOVE_RATIO */
1544 4, /* cost for loading QImode using movzbl */
1545 {3, 4, 3}, /* cost of loading integer registers
1546 in QImode, HImode and SImode.
1547 Relative to reg-reg move (2). */
1548 {3, 4, 3}, /* cost of storing integer registers */
1549 4, /* cost of reg,reg fld/fst */
1550 {4, 4, 12}, /* cost of loading fp registers
1551 in SFmode, DFmode and XFmode */
1552 {6, 6, 8}, /* cost of storing fp registers
1553 in SFmode, DFmode and XFmode */
1554 2, /* cost of moving MMX register */
1555 {3, 3}, /* cost of loading MMX registers
1556 in SImode and DImode */
1557 {4, 4}, /* cost of storing MMX registers
1558 in SImode and DImode */
1559 2, /* cost of moving SSE register */
1560 {4, 4, 3}, /* cost of loading SSE registers
1561 in SImode, DImode and TImode */
1562 {4, 4, 5}, /* cost of storing SSE registers
1563 in SImode, DImode and TImode */
1564 3, /* MMX or SSE register to integer */
1565 /* On K8:
1566 MOVD reg64, xmmreg Double FSTORE 4
1567 MOVD reg32, xmmreg Double FSTORE 4
1568 On AMDFAM10:
1569 MOVD reg64, xmmreg Double FADD 3
1570 1/1 1/1
1571 MOVD reg32, xmmreg Double FADD 3
1572 1/1 1/1 */
1573 32, /* size of l1 cache. */
1574 2048, /* size of l2 cache. */
1575 64, /* size of prefetch block */
1576 100, /* number of parallel prefetches */
1577 2, /* Branch cost */
1578 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1579 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1580 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1581 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1582 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1583 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1584 btver2_memcpy,
1585 btver2_memset,
1586 4, /* scalar_stmt_cost. */
1587 2, /* scalar load_cost. */
1588 2, /* scalar_store_cost. */
1589 6, /* vec_stmt_cost. */
1590 0, /* vec_to_scalar_cost. */
1591 2, /* scalar_to_vec_cost. */
1592 2, /* vec_align_load_cost. */
1593 2, /* vec_unalign_load_cost. */
1594 2, /* vec_store_cost. */
1595 2, /* cond_taken_branch_cost. */
1596 1, /* cond_not_taken_branch_cost. */
1599 static stringop_algs pentium4_memcpy[2] = {
1600 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1601 DUMMY_STRINGOP_ALGS};
1602 static stringop_algs pentium4_memset[2] = {
1603 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1604 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1605 DUMMY_STRINGOP_ALGS};
1607 static const
1608 struct processor_costs pentium4_cost = {
1609 COSTS_N_INSNS (1), /* cost of an add instruction */
1610 COSTS_N_INSNS (3), /* cost of a lea instruction */
1611 COSTS_N_INSNS (4), /* variable shift costs */
1612 COSTS_N_INSNS (4), /* constant shift costs */
1613 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1614 COSTS_N_INSNS (15), /* HI */
1615 COSTS_N_INSNS (15), /* SI */
1616 COSTS_N_INSNS (15), /* DI */
1617 COSTS_N_INSNS (15)}, /* other */
1618 0, /* cost of multiply per each bit set */
1619 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1620 COSTS_N_INSNS (56), /* HI */
1621 COSTS_N_INSNS (56), /* SI */
1622 COSTS_N_INSNS (56), /* DI */
1623 COSTS_N_INSNS (56)}, /* other */
1624 COSTS_N_INSNS (1), /* cost of movsx */
1625 COSTS_N_INSNS (1), /* cost of movzx */
1626 16, /* "large" insn */
1627 6, /* MOVE_RATIO */
1628 2, /* cost for loading QImode using movzbl */
1629 {4, 5, 4}, /* cost of loading integer registers
1630 in QImode, HImode and SImode.
1631 Relative to reg-reg move (2). */
1632 {2, 3, 2}, /* cost of storing integer registers */
1633 2, /* cost of reg,reg fld/fst */
1634 {2, 2, 6}, /* cost of loading fp registers
1635 in SFmode, DFmode and XFmode */
1636 {4, 4, 6}, /* cost of storing fp registers
1637 in SFmode, DFmode and XFmode */
1638 2, /* cost of moving MMX register */
1639 {2, 2}, /* cost of loading MMX registers
1640 in SImode and DImode */
1641 {2, 2}, /* cost of storing MMX registers
1642 in SImode and DImode */
1643 12, /* cost of moving SSE register */
1644 {12, 12, 12}, /* cost of loading SSE registers
1645 in SImode, DImode and TImode */
1646 {2, 2, 8}, /* cost of storing SSE registers
1647 in SImode, DImode and TImode */
1648 10, /* MMX or SSE register to integer */
1649 8, /* size of l1 cache. */
1650 256, /* size of l2 cache. */
1651 64, /* size of prefetch block */
1652 6, /* number of parallel prefetches */
1653 2, /* Branch cost */
1654 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1655 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1656 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1657 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1658 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1659 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1660 pentium4_memcpy,
1661 pentium4_memset,
1662 1, /* scalar_stmt_cost. */
1663 1, /* scalar load_cost. */
1664 1, /* scalar_store_cost. */
1665 1, /* vec_stmt_cost. */
1666 1, /* vec_to_scalar_cost. */
1667 1, /* scalar_to_vec_cost. */
1668 1, /* vec_align_load_cost. */
1669 2, /* vec_unalign_load_cost. */
1670 1, /* vec_store_cost. */
1671 3, /* cond_taken_branch_cost. */
1672 1, /* cond_not_taken_branch_cost. */
1675 static stringop_algs nocona_memcpy[2] = {
1676 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1677 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1678 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1680 static stringop_algs nocona_memset[2] = {
1681 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1682 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1683 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1684 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1686 static const
1687 struct processor_costs nocona_cost = {
1688 COSTS_N_INSNS (1), /* cost of an add instruction */
1689 COSTS_N_INSNS (1), /* cost of a lea instruction */
1690 COSTS_N_INSNS (1), /* variable shift costs */
1691 COSTS_N_INSNS (1), /* constant shift costs */
1692 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1693 COSTS_N_INSNS (10), /* HI */
1694 COSTS_N_INSNS (10), /* SI */
1695 COSTS_N_INSNS (10), /* DI */
1696 COSTS_N_INSNS (10)}, /* other */
1697 0, /* cost of multiply per each bit set */
1698 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1699 COSTS_N_INSNS (66), /* HI */
1700 COSTS_N_INSNS (66), /* SI */
1701 COSTS_N_INSNS (66), /* DI */
1702 COSTS_N_INSNS (66)}, /* other */
1703 COSTS_N_INSNS (1), /* cost of movsx */
1704 COSTS_N_INSNS (1), /* cost of movzx */
1705 16, /* "large" insn */
1706 17, /* MOVE_RATIO */
1707 4, /* cost for loading QImode using movzbl */
1708 {4, 4, 4}, /* cost of loading integer registers
1709 in QImode, HImode and SImode.
1710 Relative to reg-reg move (2). */
1711 {4, 4, 4}, /* cost of storing integer registers */
1712 3, /* cost of reg,reg fld/fst */
1713 {12, 12, 12}, /* cost of loading fp registers
1714 in SFmode, DFmode and XFmode */
1715 {4, 4, 4}, /* cost of storing fp registers
1716 in SFmode, DFmode and XFmode */
1717 6, /* cost of moving MMX register */
1718 {12, 12}, /* cost of loading MMX registers
1719 in SImode and DImode */
1720 {12, 12}, /* cost of storing MMX registers
1721 in SImode and DImode */
1722 6, /* cost of moving SSE register */
1723 {12, 12, 12}, /* cost of loading SSE registers
1724 in SImode, DImode and TImode */
1725 {12, 12, 12}, /* cost of storing SSE registers
1726 in SImode, DImode and TImode */
1727 8, /* MMX or SSE register to integer */
1728 8, /* size of l1 cache. */
1729 1024, /* size of l2 cache. */
1730 64, /* size of prefetch block */
1731 8, /* number of parallel prefetches */
1732 1, /* Branch cost */
1733 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1734 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1735 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1736 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1737 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1738 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1739 nocona_memcpy,
1740 nocona_memset,
1741 1, /* scalar_stmt_cost. */
1742 1, /* scalar load_cost. */
1743 1, /* scalar_store_cost. */
1744 1, /* vec_stmt_cost. */
1745 1, /* vec_to_scalar_cost. */
1746 1, /* scalar_to_vec_cost. */
1747 1, /* vec_align_load_cost. */
1748 2, /* vec_unalign_load_cost. */
1749 1, /* vec_store_cost. */
1750 3, /* cond_taken_branch_cost. */
1751 1, /* cond_not_taken_branch_cost. */
1754 static stringop_algs atom_memcpy[2] = {
1755 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1756 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1757 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1758 static stringop_algs atom_memset[2] = {
1759 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1760 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1761 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1762 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1763 static const
1764 struct processor_costs atom_cost = {
1765 COSTS_N_INSNS (1), /* cost of an add instruction */
1766 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1767 COSTS_N_INSNS (1), /* variable shift costs */
1768 COSTS_N_INSNS (1), /* constant shift costs */
1769 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1770 COSTS_N_INSNS (4), /* HI */
1771 COSTS_N_INSNS (3), /* SI */
1772 COSTS_N_INSNS (4), /* DI */
1773 COSTS_N_INSNS (2)}, /* other */
1774 0, /* cost of multiply per each bit set */
1775 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1776 COSTS_N_INSNS (26), /* HI */
1777 COSTS_N_INSNS (42), /* SI */
1778 COSTS_N_INSNS (74), /* DI */
1779 COSTS_N_INSNS (74)}, /* other */
1780 COSTS_N_INSNS (1), /* cost of movsx */
1781 COSTS_N_INSNS (1), /* cost of movzx */
1782 8, /* "large" insn */
1783 17, /* MOVE_RATIO */
1784 4, /* cost for loading QImode using movzbl */
1785 {4, 4, 4}, /* cost of loading integer registers
1786 in QImode, HImode and SImode.
1787 Relative to reg-reg move (2). */
1788 {4, 4, 4}, /* cost of storing integer registers */
1789 4, /* cost of reg,reg fld/fst */
1790 {12, 12, 12}, /* cost of loading fp registers
1791 in SFmode, DFmode and XFmode */
1792 {6, 6, 8}, /* cost of storing fp registers
1793 in SFmode, DFmode and XFmode */
1794 2, /* cost of moving MMX register */
1795 {8, 8}, /* cost of loading MMX registers
1796 in SImode and DImode */
1797 {8, 8}, /* cost of storing MMX registers
1798 in SImode and DImode */
1799 2, /* cost of moving SSE register */
1800 {8, 8, 8}, /* cost of loading SSE registers
1801 in SImode, DImode and TImode */
1802 {8, 8, 8}, /* cost of storing SSE registers
1803 in SImode, DImode and TImode */
1804 5, /* MMX or SSE register to integer */
1805 32, /* size of l1 cache. */
1806 256, /* size of l2 cache. */
1807 64, /* size of prefetch block */
1808 6, /* number of parallel prefetches */
1809 3, /* Branch cost */
1810 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1811 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1812 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1813 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1814 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1815 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1816 atom_memcpy,
1817 atom_memset,
1818 1, /* scalar_stmt_cost. */
1819 1, /* scalar load_cost. */
1820 1, /* scalar_store_cost. */
1821 1, /* vec_stmt_cost. */
1822 1, /* vec_to_scalar_cost. */
1823 1, /* scalar_to_vec_cost. */
1824 1, /* vec_align_load_cost. */
1825 2, /* vec_unalign_load_cost. */
1826 1, /* vec_store_cost. */
1827 3, /* cond_taken_branch_cost. */
1828 1, /* cond_not_taken_branch_cost. */
1831 static stringop_algs slm_memcpy[2] = {
1832 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1833 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1834 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1835 static stringop_algs slm_memset[2] = {
1836 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1837 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1838 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1839 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1840 static const
1841 struct processor_costs slm_cost = {
1842 COSTS_N_INSNS (1), /* cost of an add instruction */
1843 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1844 COSTS_N_INSNS (1), /* variable shift costs */
1845 COSTS_N_INSNS (1), /* constant shift costs */
1846 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1847 COSTS_N_INSNS (3), /* HI */
1848 COSTS_N_INSNS (3), /* SI */
1849 COSTS_N_INSNS (4), /* DI */
1850 COSTS_N_INSNS (2)}, /* other */
1851 0, /* cost of multiply per each bit set */
1852 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1853 COSTS_N_INSNS (26), /* HI */
1854 COSTS_N_INSNS (42), /* SI */
1855 COSTS_N_INSNS (74), /* DI */
1856 COSTS_N_INSNS (74)}, /* other */
1857 COSTS_N_INSNS (1), /* cost of movsx */
1858 COSTS_N_INSNS (1), /* cost of movzx */
1859 8, /* "large" insn */
1860 17, /* MOVE_RATIO */
1861 4, /* cost for loading QImode using movzbl */
1862 {4, 4, 4}, /* cost of loading integer registers
1863 in QImode, HImode and SImode.
1864 Relative to reg-reg move (2). */
1865 {4, 4, 4}, /* cost of storing integer registers */
1866 4, /* cost of reg,reg fld/fst */
1867 {12, 12, 12}, /* cost of loading fp registers
1868 in SFmode, DFmode and XFmode */
1869 {6, 6, 8}, /* cost of storing fp registers
1870 in SFmode, DFmode and XFmode */
1871 2, /* cost of moving MMX register */
1872 {8, 8}, /* cost of loading MMX registers
1873 in SImode and DImode */
1874 {8, 8}, /* cost of storing MMX registers
1875 in SImode and DImode */
1876 2, /* cost of moving SSE register */
1877 {8, 8, 8}, /* cost of loading SSE registers
1878 in SImode, DImode and TImode */
1879 {8, 8, 8}, /* cost of storing SSE registers
1880 in SImode, DImode and TImode */
1881 5, /* MMX or SSE register to integer */
1882 32, /* size of l1 cache. */
1883 256, /* size of l2 cache. */
1884 64, /* size of prefetch block */
1885 6, /* number of parallel prefetches */
1886 3, /* Branch cost */
1887 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1888 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1889 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1890 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1891 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1892 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1893 slm_memcpy,
1894 slm_memset,
1895 1, /* scalar_stmt_cost. */
1896 1, /* scalar load_cost. */
1897 1, /* scalar_store_cost. */
1898 1, /* vec_stmt_cost. */
1899 4, /* vec_to_scalar_cost. */
1900 1, /* scalar_to_vec_cost. */
1901 1, /* vec_align_load_cost. */
1902 2, /* vec_unalign_load_cost. */
1903 1, /* vec_store_cost. */
1904 3, /* cond_taken_branch_cost. */
1905 1, /* cond_not_taken_branch_cost. */
1908 static stringop_algs intel_memcpy[2] = {
1909 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1910 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1911 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1912 static stringop_algs intel_memset[2] = {
1913 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1914 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1915 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1916 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1917 static const
1918 struct processor_costs intel_cost = {
1919 COSTS_N_INSNS (1), /* cost of an add instruction */
1920 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1921 COSTS_N_INSNS (1), /* variable shift costs */
1922 COSTS_N_INSNS (1), /* constant shift costs */
1923 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1924 COSTS_N_INSNS (3), /* HI */
1925 COSTS_N_INSNS (3), /* SI */
1926 COSTS_N_INSNS (4), /* DI */
1927 COSTS_N_INSNS (2)}, /* other */
1928 0, /* cost of multiply per each bit set */
1929 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1930 COSTS_N_INSNS (26), /* HI */
1931 COSTS_N_INSNS (42), /* SI */
1932 COSTS_N_INSNS (74), /* DI */
1933 COSTS_N_INSNS (74)}, /* other */
1934 COSTS_N_INSNS (1), /* cost of movsx */
1935 COSTS_N_INSNS (1), /* cost of movzx */
1936 8, /* "large" insn */
1937 17, /* MOVE_RATIO */
1938 4, /* cost for loading QImode using movzbl */
1939 {4, 4, 4}, /* cost of loading integer registers
1940 in QImode, HImode and SImode.
1941 Relative to reg-reg move (2). */
1942 {4, 4, 4}, /* cost of storing integer registers */
1943 4, /* cost of reg,reg fld/fst */
1944 {12, 12, 12}, /* cost of loading fp registers
1945 in SFmode, DFmode and XFmode */
1946 {6, 6, 8}, /* cost of storing fp registers
1947 in SFmode, DFmode and XFmode */
1948 2, /* cost of moving MMX register */
1949 {8, 8}, /* cost of loading MMX registers
1950 in SImode and DImode */
1951 {8, 8}, /* cost of storing MMX registers
1952 in SImode and DImode */
1953 2, /* cost of moving SSE register */
1954 {8, 8, 8}, /* cost of loading SSE registers
1955 in SImode, DImode and TImode */
1956 {8, 8, 8}, /* cost of storing SSE registers
1957 in SImode, DImode and TImode */
1958 5, /* MMX or SSE register to integer */
1959 32, /* size of l1 cache. */
1960 256, /* size of l2 cache. */
1961 64, /* size of prefetch block */
1962 6, /* number of parallel prefetches */
1963 3, /* Branch cost */
1964 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1965 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1966 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1967 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1968 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1969 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1970 intel_memcpy,
1971 intel_memset,
1972 1, /* scalar_stmt_cost. */
1973 1, /* scalar load_cost. */
1974 1, /* scalar_store_cost. */
1975 1, /* vec_stmt_cost. */
1976 4, /* vec_to_scalar_cost. */
1977 1, /* scalar_to_vec_cost. */
1978 1, /* vec_align_load_cost. */
1979 2, /* vec_unalign_load_cost. */
1980 1, /* vec_store_cost. */
1981 3, /* cond_taken_branch_cost. */
1982 1, /* cond_not_taken_branch_cost. */
1985 /* Generic should produce code tuned for Core-i7 (and newer chips)
1986 and btver1 (and newer chips). */
1988 static stringop_algs generic_memcpy[2] = {
1989 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1990 {-1, libcall, false}}},
1991 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1992 {-1, libcall, false}}}};
1993 static stringop_algs generic_memset[2] = {
1994 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1995 {-1, libcall, false}}},
1996 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1997 {-1, libcall, false}}}};
1998 static const
1999 struct processor_costs generic_cost = {
2000 COSTS_N_INSNS (1), /* cost of an add instruction */
2001 /* On all chips taken into consideration, lea is 2 cycles or more. With
2002 that cost, however, our current implementation of synth_mult results in
2003 the use of unnecessary temporary registers, causing regressions on several
2004 SPECfp benchmarks. */
2005 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2006 COSTS_N_INSNS (1), /* variable shift costs */
2007 COSTS_N_INSNS (1), /* constant shift costs */
2008 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2009 COSTS_N_INSNS (4), /* HI */
2010 COSTS_N_INSNS (3), /* SI */
2011 COSTS_N_INSNS (4), /* DI */
2012 COSTS_N_INSNS (2)}, /* other */
2013 0, /* cost of multiply per each bit set */
2014 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2015 COSTS_N_INSNS (26), /* HI */
2016 COSTS_N_INSNS (42), /* SI */
2017 COSTS_N_INSNS (74), /* DI */
2018 COSTS_N_INSNS (74)}, /* other */
2019 COSTS_N_INSNS (1), /* cost of movsx */
2020 COSTS_N_INSNS (1), /* cost of movzx */
2021 8, /* "large" insn */
2022 17, /* MOVE_RATIO */
2023 4, /* cost for loading QImode using movzbl */
2024 {4, 4, 4}, /* cost of loading integer registers
2025 in QImode, HImode and SImode.
2026 Relative to reg-reg move (2). */
2027 {4, 4, 4}, /* cost of storing integer registers */
2028 4, /* cost of reg,reg fld/fst */
2029 {12, 12, 12}, /* cost of loading fp registers
2030 in SFmode, DFmode and XFmode */
2031 {6, 6, 8}, /* cost of storing fp registers
2032 in SFmode, DFmode and XFmode */
2033 2, /* cost of moving MMX register */
2034 {8, 8}, /* cost of loading MMX registers
2035 in SImode and DImode */
2036 {8, 8}, /* cost of storing MMX registers
2037 in SImode and DImode */
2038 2, /* cost of moving SSE register */
2039 {8, 8, 8}, /* cost of loading SSE registers
2040 in SImode, DImode and TImode */
2041 {8, 8, 8}, /* cost of storing SSE registers
2042 in SImode, DImode and TImode */
2043 5, /* MMX or SSE register to integer */
2044 32, /* size of l1 cache. */
2045 512, /* size of l2 cache. */
2046 64, /* size of prefetch block */
2047 6, /* number of parallel prefetches */
2048 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2049 value is increased to the perhaps more appropriate value of 5. */
2050 3, /* Branch cost */
2051 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2052 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2053 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2054 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2055 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2056 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2057 generic_memcpy,
2058 generic_memset,
2059 1, /* scalar_stmt_cost. */
2060 1, /* scalar load_cost. */
2061 1, /* scalar_store_cost. */
2062 1, /* vec_stmt_cost. */
2063 1, /* vec_to_scalar_cost. */
2064 1, /* scalar_to_vec_cost. */
2065 1, /* vec_align_load_cost. */
2066 2, /* vec_unalign_load_cost. */
2067 1, /* vec_store_cost. */
2068 3, /* cond_taken_branch_cost. */
2069 1, /* cond_not_taken_branch_cost. */
2072 /* core_cost should produce code tuned for the Core family of CPUs. */
2073 static stringop_algs core_memcpy[2] = {
2074 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2075 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2076 {-1, libcall, false}}}};
2077 static stringop_algs core_memset[2] = {
2078 {libcall, {{6, loop_1_byte, true},
2079 {24, loop, true},
2080 {8192, rep_prefix_4_byte, true},
2081 {-1, libcall, false}}},
2082 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2083 {-1, libcall, false}}}};
2085 static const
2086 struct processor_costs core_cost = {
2087 COSTS_N_INSNS (1), /* cost of an add instruction */
2088 /* On all chips taken into consideration, lea is 2 cycles or more. With
2089 that cost, however, our current implementation of synth_mult results in
2090 the use of unnecessary temporary registers, causing regressions on several
2091 SPECfp benchmarks. */
2092 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2093 COSTS_N_INSNS (1), /* variable shift costs */
2094 COSTS_N_INSNS (1), /* constant shift costs */
2095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2096 COSTS_N_INSNS (4), /* HI */
2097 COSTS_N_INSNS (3), /* SI */
2098 COSTS_N_INSNS (4), /* DI */
2099 COSTS_N_INSNS (2)}, /* other */
2100 0, /* cost of multiply per each bit set */
2101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2102 COSTS_N_INSNS (26), /* HI */
2103 COSTS_N_INSNS (42), /* SI */
2104 COSTS_N_INSNS (74), /* DI */
2105 COSTS_N_INSNS (74)}, /* other */
2106 COSTS_N_INSNS (1), /* cost of movsx */
2107 COSTS_N_INSNS (1), /* cost of movzx */
2108 8, /* "large" insn */
2109 17, /* MOVE_RATIO */
2110 4, /* cost for loading QImode using movzbl */
2111 {4, 4, 4}, /* cost of loading integer registers
2112 in QImode, HImode and SImode.
2113 Relative to reg-reg move (2). */
2114 {4, 4, 4}, /* cost of storing integer registers */
2115 4, /* cost of reg,reg fld/fst */
2116 {12, 12, 12}, /* cost of loading fp registers
2117 in SFmode, DFmode and XFmode */
2118 {6, 6, 8}, /* cost of storing fp registers
2119 in SFmode, DFmode and XFmode */
2120 2, /* cost of moving MMX register */
2121 {8, 8}, /* cost of loading MMX registers
2122 in SImode and DImode */
2123 {8, 8}, /* cost of storing MMX registers
2124 in SImode and DImode */
2125 2, /* cost of moving SSE register */
2126 {8, 8, 8}, /* cost of loading SSE registers
2127 in SImode, DImode and TImode */
2128 {8, 8, 8}, /* cost of storing SSE registers
2129 in SImode, DImode and TImode */
2130 5, /* MMX or SSE register to integer */
2131 64, /* size of l1 cache. */
2132 512, /* size of l2 cache. */
2133 64, /* size of prefetch block */
2134 6, /* number of parallel prefetches */
2135 /* FIXME: perhaps a more appropriate value is 5. */
2136 3, /* Branch cost */
2137 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2138 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2139 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2140 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2141 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2142 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2143 core_memcpy,
2144 core_memset,
2145 1, /* scalar_stmt_cost. */
2146 1, /* scalar load_cost. */
2147 1, /* scalar_store_cost. */
2148 1, /* vec_stmt_cost. */
2149 1, /* vec_to_scalar_cost. */
2150 1, /* scalar_to_vec_cost. */
2151 1, /* vec_align_load_cost. */
2152 2, /* vec_unalign_load_cost. */
2153 1, /* vec_store_cost. */
2154 3, /* cond_taken_branch_cost. */
2155 1, /* cond_not_taken_branch_cost. */
2159 /* Set by -mtune. */
2160 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2162 /* Set by -mtune or -Os. */
2163 const struct processor_costs *ix86_cost = &pentium_cost;
2165 /* Processor feature/optimization bitmasks. */
2166 #define m_386 (1U<<PROCESSOR_I386)
2167 #define m_486 (1U<<PROCESSOR_I486)
2168 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2169 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2170 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2171 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2172 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2173 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2174 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2175 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2176 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2177 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2178 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2179 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2180 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2181 #define m_KNL (1U<<PROCESSOR_KNL)
2182 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2183 #define m_INTEL (1U<<PROCESSOR_INTEL)
2185 #define m_GEODE (1U<<PROCESSOR_GEODE)
2186 #define m_K6 (1U<<PROCESSOR_K6)
2187 #define m_K6_GEODE (m_K6 | m_GEODE)
2188 #define m_K8 (1U<<PROCESSOR_K8)
2189 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2190 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2191 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2192 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2193 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2194 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2195 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2196 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2197 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2198 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2199 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2200 #define m_BTVER (m_BTVER1 | m_BTVER2)
2201 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2202 | m_ZNVER1)
2204 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
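/* A condensed sketch of how the m_* masks above are consumed: a selector
   such as (m_CORE_ALL | m_GENERIC) is just a bitmask over processor
   numbers, so testing whether a tuning applies to the CPU chosen by
   -mtune reduces to a single AND against 1U << ix86_tune (declared
   further below); this mirrors how ix86_tune_features is filled in from
   initial_ix86_tune_features during option processing.  */
#if 0
  bool tuning_applies = ((m_CORE_ALL | m_GENERIC) & (1U << ix86_tune)) != 0;
#endif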
2206 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2207 #undef DEF_TUNE
2208 #define DEF_TUNE(tune, name, selector) name,
2209 #include "x86-tune.def"
2210 #undef DEF_TUNE
2213 /* Feature tests against the various tunings. */
2214 unsigned char ix86_tune_features[X86_TUNE_LAST];
2216 /* Feature tests against the various tunings used to create ix86_tune_features
2217 based on the processor mask. */
2218 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2219 #undef DEF_TUNE
2220 #define DEF_TUNE(tune, name, selector) selector,
2221 #include "x86-tune.def"
2222 #undef DEF_TUNE
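/* The two tables above are built with the "X macro" pattern: x86-tune.def
   contains one DEF_TUNE (tune, name, selector) line per tuning knob, and
   each include expands those lines differently by redefining DEF_TUNE
   first.  A minimal standalone sketch of the same pattern, using a
   hypothetical colors.def containing lines like
   DEF_COLOR (COLOR_RED, "red"):  */
#if 0
#define DEF_COLOR(id, name) id,
enum color {
#include "colors.def"
  COLOR_LAST
};
#undef DEF_COLOR

#define DEF_COLOR(id, name) name,
static const char *const color_names[] = {
#include "colors.def"
};
#undef DEF_COLOR
#endif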
2225 /* Feature tests against the various architecture variations. */
2226 unsigned char ix86_arch_features[X86_ARCH_LAST];
2228 /* Feature tests against the various architecture variations, used to create
2229 ix86_arch_features based on the processor mask. */
2230 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2231 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2232 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2234 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2235 ~m_386,
2237 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2238 ~(m_386 | m_486),
2240 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2241 ~m_386,
2243 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2244 ~m_386,
2247 /* If the average insn count for a single function invocation is
2248 lower than this constant, emit fast (but longer) prologue and
2249 epilogue code. */
2250 #define FAST_PROLOGUE_INSN_COUNT 20
2252 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2253 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2254 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2255 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2257 /* Array of the smallest class containing reg number REGNO, indexed by
2258 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2260 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2262 /* ax, dx, cx, bx */
2263 AREG, DREG, CREG, BREG,
2264 /* si, di, bp, sp */
2265 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2266 /* FP registers */
2267 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2268 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2269 /* arg pointer */
2270 NON_Q_REGS,
2271 /* flags, fpsr, fpcr, frame */
2272 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2273 /* SSE registers */
2274 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2275 SSE_REGS, SSE_REGS,
2276 /* MMX registers */
2277 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2278 MMX_REGS, MMX_REGS,
2279 /* REX registers */
2280 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2281 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2282 /* SSE REX registers */
2283 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2284 SSE_REGS, SSE_REGS,
2285 /* AVX-512 SSE registers */
2286 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2287 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2288 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2289 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2290 /* Mask registers. */
2291 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2292 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2293 /* MPX bound registers */
2294 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2297 /* The "default" register map used in 32bit mode. */
2299 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2301 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2302 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2303 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2304 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2305 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2306 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2307 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2308 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2309 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2310 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2311 101, 102, 103, 104, /* bound registers */
2314 /* The "default" register map used in 64bit mode. */
2316 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2318 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2319 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2320 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2321 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2322 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2323 8,9,10,11,12,13,14,15, /* extended integer registers */
2324 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2325 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2326 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2327 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2328 126, 127, 128, 129, /* bound registers */
2331 /* Define the register numbers to be used in Dwarf debugging information.
2332 The SVR4 reference port C compiler uses the following register numbers
2333 in its Dwarf output code:
2334 0 for %eax (gcc regno = 0)
2335 1 for %ecx (gcc regno = 2)
2336 2 for %edx (gcc regno = 1)
2337 3 for %ebx (gcc regno = 3)
2338 4 for %esp (gcc regno = 7)
2339 5 for %ebp (gcc regno = 6)
2340 6 for %esi (gcc regno = 4)
2341 7 for %edi (gcc regno = 5)
2342 The following three DWARF register numbers are never generated by
2343 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2344 believes these numbers have these meanings.
2345 8 for %eip (no gcc equivalent)
2346 9 for %eflags (gcc regno = 17)
2347 10 for %trapno (no gcc equivalent)
2348 It is not at all clear how we should number the FP stack registers
2349 for the x86 architecture. If the version of SDB on x86/svr4 were
2350 a bit less brain dead with respect to floating-point then we would
2351 have a precedent to follow with respect to DWARF register numbers
2352 for x86 FP registers, but the SDB on x86/svr4 is so completely
2353 broken with respect to FP registers that it is hardly worth thinking
2354 of it as something to strive for compatibility with.
2355 The version of x86/svr4 SDB I have at the moment does (partially)
2356 seem to believe that DWARF register number 11 is associated with
2357 the x86 register %st(0), but that's about all. Higher DWARF
2358 register numbers don't seem to be associated with anything in
2359 particular, and even for DWARF regno 11, SDB only seems to under-
2360 stand that it should say that a variable lives in %st(0) (when
2361 asked via an `=' command) if we said it was in DWARF regno 11,
2362 but SDB still prints garbage when asked for the value of the
2363 variable in question (via a `/' command).
2364 (Also note that the labels SDB prints for various FP stack regs
2365 when doing an `x' command are all wrong.)
2366 Note that these problems generally don't affect the native SVR4
2367 C compiler because it doesn't allow the use of -O with -g and
2368 because when it is *not* optimizing, it allocates a memory
2369 location for each floating-point variable, and the memory
2370 location is what gets described in the DWARF AT_location
2371 attribute for the variable in question.
2372 Regardless of the severe mental illness of the x86/svr4 SDB, we
2373 do something sensible here and we use the following DWARF
2374 register numbers. Note that these are all stack-top-relative
2375 numbers.
2376 11 for %st(0) (gcc regno = 8)
2377 12 for %st(1) (gcc regno = 9)
2378 13 for %st(2) (gcc regno = 10)
2379 14 for %st(3) (gcc regno = 11)
2380 15 for %st(4) (gcc regno = 12)
2381 16 for %st(5) (gcc regno = 13)
2382 17 for %st(6) (gcc regno = 14)
2383 18 for %st(7) (gcc regno = 15)
2385 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2387 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2388 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2389 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2390 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2391 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2392 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2393 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2394 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2395 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2396 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2397 101, 102, 103, 104, /* bound registers */
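/* A small worked example of how the maps above are read (they are
   consulted via target macros such as DBX_REGISTER_NUMBER when emitting
   debug info): the index is GCC's hard register number and the value is
   the debug-format register number.  %esp is GCC regno 7 and %ebp is
   GCC regno 6, as noted in the comment above.  */
#if 0
  int esp_dwarf_regno = svr4_dbx_register_map[7];	/* yields 4  */
  int ebp_dwarf_regno = svr4_dbx_register_map[6];	/* yields 5  */
#endif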
2400 /* Define parameter passing and return registers. */
2402 static int const x86_64_int_parameter_registers[6] =
2404 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2407 static int const x86_64_ms_abi_int_parameter_registers[4] =
2409 CX_REG, DX_REG, R8_REG, R9_REG
2412 static int const x86_64_int_return_registers[4] =
2414 AX_REG, DX_REG, DI_REG, SI_REG
2417 /* Additional registers that are clobbered by SYSV calls. */
2419 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2421 SI_REG, DI_REG,
2422 XMM6_REG, XMM7_REG,
2423 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2424 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
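/* A sketch of what the parameter-register arrays above mean in practice
   for a hypothetical call f (a, b, c, d) with integer arguments: under
   the SysV ABI the arguments travel in %rdi, %rsi, %rdx, %rcx (then %r8,
   %r9), while under the MS ABI they travel in %rcx, %rdx, %r8, %r9.  */
#if 0
extern long f (long a, long b, long c, long d);
/* SysV ABI:  a -> %rdi   b -> %rsi   c -> %rdx   d -> %rcx
   MS ABI:    a -> %rcx   b -> %rdx   c -> %r8    d -> %r9   */
#endif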
2427 /* Define the structure for the machine field in struct function. */
2429 struct GTY(()) stack_local_entry {
2430 unsigned short mode;
2431 unsigned short n;
2432 rtx rtl;
2433 struct stack_local_entry *next;
2436 /* Structure describing stack frame layout.
2437 Stack grows downward:
2439 [arguments]
2440 <- ARG_POINTER
2441 saved pc
2443 saved static chain if ix86_static_chain_on_stack
2445 saved frame pointer if frame_pointer_needed
2446 <- HARD_FRAME_POINTER
2447 [saved regs]
2448 <- regs_save_offset
2449 [padding0]
2451 [saved SSE regs]
2452 <- sse_regs_save_offset
2453 [padding1] |
2454 | <- FRAME_POINTER
2455 [va_arg registers] |
2457 [frame] |
2459 [padding2] | = to_allocate
2460 <- STACK_POINTER
2462 struct ix86_frame
2464 int nsseregs;
2465 int nregs;
2466 int va_arg_size;
2467 int red_zone_size;
2468 int outgoing_arguments_size;
2470 /* The offsets relative to ARG_POINTER. */
2471 HOST_WIDE_INT frame_pointer_offset;
2472 HOST_WIDE_INT hard_frame_pointer_offset;
2473 HOST_WIDE_INT stack_pointer_offset;
2474 HOST_WIDE_INT hfp_save_offset;
2475 HOST_WIDE_INT reg_save_offset;
2476 HOST_WIDE_INT sse_reg_save_offset;
2478 /* When save_regs_using_mov is set, emit prologue using
2479 move instead of push instructions. */
2480 bool save_regs_using_mov;
2483 /* Which CPU we are scheduling for. */
2484 enum attr_cpu ix86_schedule;
2486 /* Which CPU we are optimizing for. */
2487 enum processor_type ix86_tune;
2489 /* Which instruction set architecture to use. */
2490 enum processor_type ix86_arch;
2492 /* True if processor has SSE prefetch instruction. */
2493 unsigned char x86_prefetch_sse;
2495 /* -mstackrealign option */
2496 static const char ix86_force_align_arg_pointer_string[]
2497 = "force_align_arg_pointer";
2499 static rtx (*ix86_gen_leave) (void);
2500 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2501 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2502 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2503 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2504 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2505 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2506 static rtx (*ix86_gen_clzero) (rtx);
2507 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2508 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2509 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2510 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2511 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2512 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2514 /* Preferred alignment for stack boundary in bits. */
2515 unsigned int ix86_preferred_stack_boundary;
2517 /* Alignment for incoming stack boundary in bits specified at
2518 command line. */
2519 static unsigned int ix86_user_incoming_stack_boundary;
2521 /* Default alignment for incoming stack boundary in bits. */
2522 static unsigned int ix86_default_incoming_stack_boundary;
2524 /* Alignment for incoming stack boundary in bits. */
2525 unsigned int ix86_incoming_stack_boundary;
2527 /* Calling-ABI-specific va_list type nodes. */
2528 static GTY(()) tree sysv_va_list_type_node;
2529 static GTY(()) tree ms_va_list_type_node;
2531 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2532 char internal_label_prefix[16];
2533 int internal_label_prefix_len;
2535 /* Fence to use after loop using movnt. */
2536 tree x86_mfence;
2538 /* Register class used for passing a given 64-bit part of the argument.
2539 These represent classes as documented by the psABI, with the exception of
2540 the SSESF and SSEDF classes, which are basically the SSE class, except that
2541 gcc will use SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2543 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2544 whenever possible (the upper half does contain padding). */
2545 enum x86_64_reg_class
2547 X86_64_NO_CLASS,
2548 X86_64_INTEGER_CLASS,
2549 X86_64_INTEGERSI_CLASS,
2550 X86_64_SSE_CLASS,
2551 X86_64_SSESF_CLASS,
2552 X86_64_SSEDF_CLASS,
2553 X86_64_SSEUP_CLASS,
2554 X86_64_X87_CLASS,
2555 X86_64_X87UP_CLASS,
2556 X86_64_COMPLEX_X87_CLASS,
2557 X86_64_MEMORY_CLASS
2560 #define MAX_CLASSES 8
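/* A sketch of the classification the enum above describes, following the
   x86-64 psABI (the actual work is done by classify_argument later in
   this file): each eightbyte of a small aggregate gets a class, and the
   class decides which register file carries it.  struct s below is a
   hypothetical example.  */
#if 0
struct s { long i; double d; };		/* 16 bytes, two eightbytes.  */
/* First eightbyte (i):  X86_64_INTEGER_CLASS -> passed in a GPR.
   Second eightbyte (d): X86_64_SSE_CLASS     -> passed in an XMM reg.  */
#endif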
2562 /* Table of constants used by fldpi, fldln2, etc.... */
2563 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2564 static bool ext_80387_constants_init = 0;
2567 static struct machine_function * ix86_init_machine_status (void);
2568 static rtx ix86_function_value (const_tree, const_tree, bool);
2569 static bool ix86_function_value_regno_p (const unsigned int);
2570 static unsigned int ix86_function_arg_boundary (machine_mode,
2571 const_tree);
2572 static rtx ix86_static_chain (const_tree, bool);
2573 static int ix86_function_regparm (const_tree, const_tree);
2574 static void ix86_compute_frame_layout (struct ix86_frame *);
2575 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2576 rtx, rtx, int);
2577 static void ix86_add_new_builtins (HOST_WIDE_INT);
2578 static tree ix86_canonical_va_list_type (tree);
2579 static void predict_jump (int);
2580 static unsigned int split_stack_prologue_scratch_regno (void);
2581 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2583 enum ix86_function_specific_strings
2585 IX86_FUNCTION_SPECIFIC_ARCH,
2586 IX86_FUNCTION_SPECIFIC_TUNE,
2587 IX86_FUNCTION_SPECIFIC_MAX
2590 static char *ix86_target_string (HOST_WIDE_INT, int, int, const char *,
2591 const char *, enum fpmath_unit, bool);
2592 static void ix86_function_specific_save (struct cl_target_option *,
2593 struct gcc_options *opts);
2594 static void ix86_function_specific_restore (struct gcc_options *opts,
2595 struct cl_target_option *);
2596 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2597 static void ix86_function_specific_print (FILE *, int,
2598 struct cl_target_option *);
2599 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2600 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2601 struct gcc_options *,
2602 struct gcc_options *,
2603 struct gcc_options *);
2604 static bool ix86_can_inline_p (tree, tree);
2605 static void ix86_set_current_function (tree);
2606 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2608 static enum calling_abi ix86_function_abi (const_tree);
2611 #ifndef SUBTARGET32_DEFAULT_CPU
2612 #define SUBTARGET32_DEFAULT_CPU "i386"
2613 #endif
2615 /* Whether -mtune= or -march= were specified */
2616 static int ix86_tune_defaulted;
2617 static int ix86_arch_specified;
2619 /* Vectorization library interface and handlers. */
2620 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2622 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2623 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2625 /* Processor target table, indexed by processor number */
2626 struct ptt
2628 const char *const name; /* processor name */
2629 const struct processor_costs *cost; /* Processor costs */
2630 const int align_loop; /* Default alignments. */
2631 const int align_loop_max_skip;
2632 const int align_jump;
2633 const int align_jump_max_skip;
2634 const int align_func;
2637 /* This table must be in sync with enum processor_type in i386.h. */
2638 static const struct ptt processor_target_table[PROCESSOR_max] =
2640 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2641 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2642 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2643 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2644 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2645 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2646 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2647 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2648 {"core2", &core_cost, 16, 10, 16, 10, 16},
2649 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2650 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2651 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2652 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2653 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2654 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2655 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2656 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2657 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2658 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2659 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2660 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2661 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2662 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2663 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2664 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2665 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2666 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2667 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2668 {"znver1", &znver1_cost, 16, 10, 16, 7, 11}
2671 static unsigned int
2672 rest_of_handle_insert_vzeroupper (void)
2674 int i;
2676 /* vzeroupper instructions are inserted immediately after reload to
2677 account for possible spills from 256bit registers. The pass
2678 reuses the mode switching infrastructure by re-running the mode insertion
2679 pass, so disable entities that have already been processed. */
2680 for (i = 0; i < MAX_386_ENTITIES; i++)
2681 ix86_optimize_mode_switching[i] = 0;
2683 ix86_optimize_mode_switching[AVX_U128] = 1;
2685 /* Call optimize_mode_switching. */
2686 g->get_passes ()->execute_pass_mode_switching ();
2687 return 0;
2690 /* Return true if INSN uses or defines a hard register.
2691 Hard register uses in a memory address are ignored.
2692 Clobbers and flags definitions are ignored. */
2694 static bool
2695 has_non_address_hard_reg (rtx_insn *insn)
2697 df_ref ref;
2698 FOR_EACH_INSN_DEF (ref, insn)
2699 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2700 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2701 && DF_REF_REGNO (ref) != FLAGS_REG)
2702 return true;
2704 FOR_EACH_INSN_USE (ref, insn)
2705 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2706 return true;
2708 return false;
2711 /* Check whether comparison INSN may be transformed
2712 into a vector comparison. Currently we transform
2713 only zero checks that look like:
2715 (set (reg:CCZ 17 flags)
2716 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2717 (subreg:SI (reg:DI x) 0))
2718 (const_int 0 [0]))) */
2720 static bool
2721 convertible_comparison_p (rtx_insn *insn)
2723 if (!TARGET_SSE4_1)
2724 return false;
2726 rtx def_set = single_set (insn);
2728 gcc_assert (def_set);
2730 rtx src = SET_SRC (def_set);
2731 rtx dst = SET_DEST (def_set);
2733 gcc_assert (GET_CODE (src) == COMPARE);
2735 if (GET_CODE (dst) != REG
2736 || REGNO (dst) != FLAGS_REG
2737 || GET_MODE (dst) != CCZmode)
2738 return false;
2740 rtx op1 = XEXP (src, 0);
2741 rtx op2 = XEXP (src, 1);
2743 if (op2 != CONST0_RTX (GET_MODE (op2)))
2744 return false;
2746 if (GET_CODE (op1) != IOR)
2747 return false;
2749 op2 = XEXP (op1, 1);
2750 op1 = XEXP (op1, 0);
2752 if (!SUBREG_P (op1)
2753 || !SUBREG_P (op2)
2754 || GET_MODE (op1) != SImode
2755 || GET_MODE (op2) != SImode
2756 || ((SUBREG_BYTE (op1) != 0
2757 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
2758 && (SUBREG_BYTE (op2) != 0
2759 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
2760 return false;
2762 op1 = SUBREG_REG (op1);
2763 op2 = SUBREG_REG (op2);
2765 if (op1 != op2
2766 || !REG_P (op1)
2767 || GET_MODE (op1) != DImode)
2768 return false;
2770 return true;
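/* A sketch of the source-level shape that produces the RTL pattern
   checked above: on a 32-bit target a DImode zero test is expanded as an
   IOR of the value's two SImode halves compared against zero.  The
   TARGET_SSE4_1 requirement is presumably because the converted form
   relies on ptest.  x and f below are hypothetical.  */
#if 0
long long x;
int f (void) { return x == 0; }		/* ior of the two SImode subregs vs 0 */
#endif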
2773 /* The DImode version of scalar_to_vector_candidate_p. */
2775 static bool
2776 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
2778 rtx def_set = single_set (insn);
2780 if (!def_set)
2781 return false;
2783 if (has_non_address_hard_reg (insn))
2784 return false;
2786 rtx src = SET_SRC (def_set);
2787 rtx dst = SET_DEST (def_set);
2789 if (GET_CODE (src) == COMPARE)
2790 return convertible_comparison_p (insn);
2792 /* We are interested in DImode promotion only. */
2793 if ((GET_MODE (src) != DImode
2794 && !CONST_INT_P (src))
2795 || GET_MODE (dst) != DImode)
2796 return false;
2798 if (!REG_P (dst) && !MEM_P (dst))
2799 return false;
2801 switch (GET_CODE (src))
2803 case PLUS:
2804 case MINUS:
2805 case IOR:
2806 case XOR:
2807 case AND:
2808 break;
2810 case REG:
2811 return true;
2813 case MEM:
2814 case CONST_INT:
2815 return REG_P (dst);
2817 default:
2818 return false;
2821 if (!REG_P (XEXP (src, 0))
2822 && !MEM_P (XEXP (src, 0))
2823 && !CONST_INT_P (XEXP (src, 0))
2824 /* Check for andnot case. */
2825 && (GET_CODE (src) != AND
2826 || GET_CODE (XEXP (src, 0)) != NOT
2827 || !REG_P (XEXP (XEXP (src, 0), 0))))
2828 return false;
2830 if (!REG_P (XEXP (src, 1))
2831 && !MEM_P (XEXP (src, 1))
2832 && !CONST_INT_P (XEXP (src, 1)))
2833 return false;
2835 if ((GET_MODE (XEXP (src, 0)) != DImode
2836 && !CONST_INT_P (XEXP (src, 0)))
2837 || (GET_MODE (XEXP (src, 1)) != DImode
2838 && !CONST_INT_P (XEXP (src, 1))))
2839 return false;
2841 return true;
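/* A sketch of a typical DImode candidate accepted by the checks above on
   a 32-bit target: a single PLUS/MINUS/IOR/XOR/AND (or plain move) on
   DImode operands, which the pass may later rewrite to use one SSE
   register instead of operating on the two 32-bit halves.  a, b and g
   below are hypothetical.  */
#if 0
unsigned long long a, b;
void g (void) { a |= b; }
#endif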
2844 /* The TImode version of scalar_to_vector_candidate_p. */
2846 static bool
2847 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2849 rtx def_set = single_set (insn);
2851 if (!def_set)
2852 return false;
2854 if (has_non_address_hard_reg (insn))
2855 return false;
2857 rtx src = SET_SRC (def_set);
2858 rtx dst = SET_DEST (def_set);
2860 /* Only TImode loads and stores are allowed. */
2861 if (GET_MODE (dst) != TImode)
2862 return false;
2864 if (MEM_P (dst))
2866 /* Check for a store. The memory must be aligned, or unaligned stores
2867 must be optimal. Only support stores from a register, a standard SSE
2868 constant, or a CONST_WIDE_INT generated from a piecewise store.
2870 ??? Verify the performance impact before enabling CONST_INT for
2871 __int128 stores. */
2872 if (misaligned_operand (dst, TImode)
2873 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2874 return false;
2876 switch (GET_CODE (src))
2878 default:
2879 return false;
2881 case REG:
2882 case CONST_WIDE_INT:
2883 return true;
2885 case CONST_INT:
2886 return standard_sse_constant_p (src, TImode);
2889 else if (MEM_P (src))
2891 /* Check for a load. The memory must be aligned, or unaligned loads
2892 must be optimal. */
2893 return (REG_P (dst)
2894 && (!misaligned_operand (src, TImode)
2895 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2898 return false;
2901 /* Return true if INSN may be converted into a vector
2902 instruction. */
2904 static bool
2905 scalar_to_vector_candidate_p (rtx_insn *insn)
2907 if (TARGET_64BIT)
2908 return timode_scalar_to_vector_candidate_p (insn);
2909 else
2910 return dimode_scalar_to_vector_candidate_p (insn);
2913 /* The DImode version of remove_non_convertible_regs. */
2915 static void
2916 dimode_remove_non_convertible_regs (bitmap candidates)
2918 bitmap_iterator bi;
2919 unsigned id;
2920 bitmap regs = BITMAP_ALLOC (NULL);
2922 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2924 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
2925 rtx reg = SET_DEST (def_set);
2927 if (!REG_P (reg)
2928 || bitmap_bit_p (regs, REGNO (reg))
2929 || HARD_REGISTER_P (reg))
2930 continue;
2932 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
2933 def;
2934 def = DF_REF_NEXT_REG (def))
2936 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2938 if (dump_file)
2939 fprintf (dump_file,
2940 "r%d has non convertible definition in insn %d\n",
2941 REGNO (reg), DF_REF_INSN_UID (def));
2943 bitmap_set_bit (regs, REGNO (reg));
2944 break;
2949 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2951 for (df_ref def = DF_REG_DEF_CHAIN (id);
2952 def;
2953 def = DF_REF_NEXT_REG (def))
2954 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2956 if (dump_file)
2957 fprintf (dump_file, "Removing insn %d from candidates list\n",
2958 DF_REF_INSN_UID (def));
2960 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2964 BITMAP_FREE (regs);
2967 /* For a register REGNO, scan instructions for its defs and uses.
2968 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2970 static void
2971 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2972 unsigned int regno)
2974 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2975 def;
2976 def = DF_REF_NEXT_REG (def))
2978 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2980 if (dump_file)
2981 fprintf (dump_file,
2982 "r%d has non convertible def in insn %d\n",
2983 regno, DF_REF_INSN_UID (def));
2985 bitmap_set_bit (regs, regno);
2986 break;
2990 for (df_ref ref = DF_REG_USE_CHAIN (regno);
2991 ref;
2992 ref = DF_REF_NEXT_REG (ref))
2994 /* Debug instructions are skipped. */
2995 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2996 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2998 if (dump_file)
2999 fprintf (dump_file,
3000 "r%d has non convertible use in insn %d\n",
3001 regno, DF_REF_INSN_UID (ref));
3003 bitmap_set_bit (regs, regno);
3004 break;
3009 /* The TImode version of remove_non_convertible_regs. */
3011 static void
3012 timode_remove_non_convertible_regs (bitmap candidates)
3014 bitmap_iterator bi;
3015 unsigned id;
3016 bitmap regs = BITMAP_ALLOC (NULL);
3018 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3020 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3021 rtx dest = SET_DEST (def_set);
3022 rtx src = SET_SRC (def_set);
3024 if ((!REG_P (dest)
3025 || bitmap_bit_p (regs, REGNO (dest))
3026 || HARD_REGISTER_P (dest))
3027 && (!REG_P (src)
3028 || bitmap_bit_p (regs, REGNO (src))
3029 || HARD_REGISTER_P (src)))
3030 continue;
3032 if (REG_P (dest))
3033 timode_check_non_convertible_regs (candidates, regs,
3034 REGNO (dest));
3036 if (REG_P (src))
3037 timode_check_non_convertible_regs (candidates, regs,
3038 REGNO (src));
3041 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3043 for (df_ref def = DF_REG_DEF_CHAIN (id);
3044 def;
3045 def = DF_REF_NEXT_REG (def))
3046 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3048 if (dump_file)
3049 fprintf (dump_file, "Removing insn %d from candidates list\n",
3050 DF_REF_INSN_UID (def));
3052 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3055 for (df_ref ref = DF_REG_USE_CHAIN (id);
3056 ref;
3057 ref = DF_REF_NEXT_REG (ref))
3058 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3060 if (dump_file)
3061 fprintf (dump_file, "Removing insn %d from candidates list\n",
3062 DF_REF_INSN_UID (ref));
3064 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3068 BITMAP_FREE (regs);
3071 /* For a given bitmap of insn UIDs, scan all instructions and
3072 remove an insn from CANDIDATES if it has both convertible
3073 and non-convertible definitions.
3075 All insns in a bitmap are conversion candidates according to
3076 scalar_to_vector_candidate_p. Currently it implies all insns
3077 are single_set. */
3079 static void
3080 remove_non_convertible_regs (bitmap candidates)
3082 if (TARGET_64BIT)
3083 timode_remove_non_convertible_regs (candidates);
3084 else
3085 dimode_remove_non_convertible_regs (candidates);
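/* A scalar chain is a connected set of candidate insns, built from a
   seed insn by following def-use and use-def links.  Registers that are
   also defined or used outside the chain are collected in DEFS_CONV and
   need both a scalar and a vector copy.  As an illustrative sketch for
   the 32-bit (DImode) case, an insn like

     (set (reg:DI 100) (and:DI (reg:DI 101) (reg:DI 102)))

   is rewritten to operate on V2DImode subregs of the same pseudos, with
   conversion copies emitted for pseudos that are still used in scalar
   mode.  */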
3088 class scalar_chain
3090 public:
3091 scalar_chain ();
3092 virtual ~scalar_chain ();
3094 static unsigned max_id;
3096 /* ID of a chain. */
3097 unsigned int chain_id;
3098 /* A queue of instructions to be included into a chain. */
3099 bitmap queue;
3100 /* Instructions included into a chain. */
3101 bitmap insns;
3102 /* All registers defined by a chain. */
3103 bitmap defs;
3104 /* Registers used in both vector and scalar modes. */
3105 bitmap defs_conv;
3107 void build (bitmap candidates, unsigned insn_uid);
3108 virtual int compute_convert_gain () = 0;
3109 int convert ();
3111 protected:
3112 void add_to_queue (unsigned insn_uid);
3113 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3115 private:
3116 void add_insn (bitmap candidates, unsigned insn_uid);
3117 void analyze_register_chain (bitmap candidates, df_ref ref);
3118 virtual void mark_dual_mode_def (df_ref def) = 0;
3119 virtual void convert_insn (rtx_insn *insn) = 0;
3120 virtual void convert_registers () = 0;
3123 class dimode_scalar_chain : public scalar_chain
3125 public:
3126 int compute_convert_gain ();
3127 private:
3128 void mark_dual_mode_def (df_ref def);
3129 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3130 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3131 void convert_insn (rtx_insn *insn);
3132 void convert_op (rtx *op, rtx_insn *insn);
3133 void convert_reg (unsigned regno);
3134 void make_vector_copies (unsigned regno);
3135 void convert_registers ();
3136 int vector_const_cost (rtx exp);
3139 class timode_scalar_chain : public scalar_chain
3141 public:
3142 /* Converting from TImode to V1TImode is always faster. */
3143 int compute_convert_gain () { return 1; }
3145 private:
3146 void mark_dual_mode_def (df_ref def);
3147 void fix_debug_reg_uses (rtx reg);
3148 void convert_insn (rtx_insn *insn);
3149 /* We don't convert registers to a different size. */
3150 void convert_registers () {}
3153 unsigned scalar_chain::max_id = 0;
3155 /* Initialize new chain. */
3157 scalar_chain::scalar_chain ()
3159 chain_id = ++max_id;
3161 if (dump_file)
3162 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3164 bitmap_obstack_initialize (NULL);
3165 insns = BITMAP_ALLOC (NULL);
3166 defs = BITMAP_ALLOC (NULL);
3167 defs_conv = BITMAP_ALLOC (NULL);
3168 queue = NULL;
3171 /* Free chain's data. */
3173 scalar_chain::~scalar_chain ()
3175 BITMAP_FREE (insns);
3176 BITMAP_FREE (defs);
3177 BITMAP_FREE (defs_conv);
3178 bitmap_obstack_release (NULL);
3181 /* Add an instruction to the chain's queue. */
3183 void
3184 scalar_chain::add_to_queue (unsigned insn_uid)
3186 if (bitmap_bit_p (insns, insn_uid)
3187 || bitmap_bit_p (queue, insn_uid))
3188 return;
3190 if (dump_file)
3191 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3192 insn_uid, chain_id);
3193 bitmap_set_bit (queue, insn_uid);
3196 /* For DImode conversion, mark register defined by DEF as requiring
3197 conversion. */
3199 void
3200 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3202 gcc_assert (DF_REF_REG_DEF_P (def));
3204 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3205 return;
3207 if (dump_file)
3208 fprintf (dump_file,
3209 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3210 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3212 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3215 /* For TImode conversion, it is unused. */
3217 void
3218 timode_scalar_chain::mark_dual_mode_def (df_ref)
3220 gcc_unreachable ();
3223 /* Check REF's chain to add new insns into a queue
3224 and find registers requiring conversion. */
3226 void
3227 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3229 df_link *chain;
3231 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3232 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3233 add_to_queue (DF_REF_INSN_UID (ref));
3235 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3237 unsigned uid = DF_REF_INSN_UID (chain->ref);
3239 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3240 continue;
3242 if (!DF_REF_REG_MEM_P (chain->ref))
3244 if (bitmap_bit_p (insns, uid))
3245 continue;
3247 if (bitmap_bit_p (candidates, uid))
3249 add_to_queue (uid);
3250 continue;
3254 if (DF_REF_REG_DEF_P (chain->ref))
3256 if (dump_file)
3257 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3258 DF_REF_REGNO (chain->ref), uid);
3259 mark_dual_mode_def (chain->ref);
3261 else
3263 if (dump_file)
3264 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3265 DF_REF_REGNO (chain->ref), uid);
3266 mark_dual_mode_def (ref);
3271 /* Add instruction into a chain. */
3273 void
3274 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3276 if (bitmap_bit_p (insns, insn_uid))
3277 return;
3279 if (dump_file)
3280 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3282 bitmap_set_bit (insns, insn_uid);
3284 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3285 rtx def_set = single_set (insn);
3286 if (def_set && REG_P (SET_DEST (def_set))
3287 && !HARD_REGISTER_P (SET_DEST (def_set)))
3288 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3290 df_ref ref;
3291 df_ref def;
3292 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3293 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3294 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3295 def;
3296 def = DF_REF_NEXT_REG (def))
3297 analyze_register_chain (candidates, def);
3298 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3299 if (!DF_REF_REG_MEM_P (ref))
3300 analyze_register_chain (candidates, ref);
3303 /* Build new chain starting from insn INSN_UID recursively
3304 adding all dependent uses and definitions. */
3306 void
3307 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3309 queue = BITMAP_ALLOC (NULL);
3310 bitmap_set_bit (queue, insn_uid);
3312 if (dump_file)
3313 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3315 while (!bitmap_empty_p (queue))
3317 insn_uid = bitmap_first_set_bit (queue);
3318 bitmap_clear_bit (queue, insn_uid);
3319 bitmap_clear_bit (candidates, insn_uid);
3320 add_insn (candidates, insn_uid);
3323 if (dump_file)
3325 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3326 fprintf (dump_file, " insns: ");
3327 dump_bitmap (dump_file, insns);
3328 if (!bitmap_empty_p (defs_conv))
3330 bitmap_iterator bi;
3331 unsigned id;
3332 const char *comma = "";
3333 fprintf (dump_file, " defs to convert: ");
3334 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3336 fprintf (dump_file, "%sr%d", comma, id);
3337 comma = ", ";
3339 fprintf (dump_file, "\n");
3343 BITMAP_FREE (queue);
3346 /* Return the cost of building a vector constant
3347 instead of using a scalar one. */
3350 dimode_scalar_chain::vector_const_cost (rtx exp)
3352 gcc_assert (CONST_INT_P (exp));
3354 if (standard_sse_constant_p (exp, V2DImode))
3355 return COSTS_N_INSNS (1);
3356 return ix86_cost->sse_load[1];
3359 /* Compute a gain for chain conversion. */
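/* The gain is the sum, over all insns in the chain, of the scalar cost
   minus the vector cost of that insn, reduced by the cost of the
   SSE<->integer moves needed for every register that must live in both
   modes (one per definition).  The chain is converted only when the
   total is positive (see convert_scalars_to_vector).  */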
3362 dimode_scalar_chain::compute_convert_gain ()
3364 bitmap_iterator bi;
3365 unsigned insn_uid;
3366 int gain = 0;
3367 int cost = 0;
3369 if (dump_file)
3370 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3372 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3374 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3375 rtx def_set = single_set (insn);
3376 rtx src = SET_SRC (def_set);
3377 rtx dst = SET_DEST (def_set);
3379 if (REG_P (src) && REG_P (dst))
3380 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3381 else if (REG_P (src) && MEM_P (dst))
3382 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3383 else if (MEM_P (src) && REG_P (dst))
3384 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3385 else if (GET_CODE (src) == PLUS
3386 || GET_CODE (src) == MINUS
3387 || GET_CODE (src) == IOR
3388 || GET_CODE (src) == XOR
3389 || GET_CODE (src) == AND)
3391 gain += ix86_cost->add;
3392 if (CONST_INT_P (XEXP (src, 0)))
3393 gain -= vector_const_cost (XEXP (src, 0));
3394 if (CONST_INT_P (XEXP (src, 1)))
3395 gain -= vector_const_cost (XEXP (src, 1));
3397 else if (GET_CODE (src) == COMPARE)
3399 /* Assume comparison cost is the same. */
3401 else if (GET_CODE (src) == CONST_INT)
3403 if (REG_P (dst))
3404 gain += COSTS_N_INSNS (2);
3405 else if (MEM_P (dst))
3406 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3407 gain -= vector_const_cost (src);
3409 else
3410 gcc_unreachable ();
3413 if (dump_file)
3414 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3416 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3417 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3419 if (dump_file)
3420 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3422 gain -= cost;
3424 if (dump_file)
3425 fprintf (dump_file, " Total gain: %d\n", gain);
3427 return gain;
3430 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3433 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3435 if (x == reg)
3436 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3438 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3439 int i, j;
3440 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3442 if (fmt[i] == 'e')
3443 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3444 else if (fmt[i] == 'E')
3445 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3446 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3447 reg, new_reg);
3450 return x;
3453 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3455 void
3456 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3457 rtx reg, rtx new_reg)
3459 replace_with_subreg (single_set (insn), reg, new_reg);
3462 /* Insert generated conversion instruction sequence INSNS
3463 after instruction AFTER.  A new BB may be required if the
3464 instruction has an EH region attached. */
3466 void
3467 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3469 if (!control_flow_insn_p (after))
3471 emit_insn_after (insns, after);
3472 return;
3475 basic_block bb = BLOCK_FOR_INSN (after);
3476 edge e = find_fallthru_edge (bb->succs);
3477 gcc_assert (e);
3479 basic_block new_bb = split_edge (e);
3480 emit_insn_after (insns, BB_HEAD (new_bb));
3483 /* Make vector copies for all definitions of register REGNO
3484 and replace its uses in the chain. */
3486 void
3487 dimode_scalar_chain::make_vector_copies (unsigned regno)
3489 rtx reg = regno_reg_rtx[regno];
3490 rtx vreg = gen_reg_rtx (DImode);
3491 df_ref ref;
3493 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3494 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3496 rtx_insn *insn = DF_REF_INSN (ref);
3498 start_sequence ();
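/* Assemble the vector copy from the two 32-bit halves of REG: with
   SSE4.1, movd of the low half followed by pinsrd of the high half;
   otherwise, if inter-unit moves to vector registers are preferred,
   two movd's combined with punpckldq; as a last resort go through a
   DImode stack slot.  */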
3499 if (TARGET_SSE4_1)
3501 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3502 CONST0_RTX (V4SImode),
3503 gen_rtx_SUBREG (SImode, reg, 0)));
3504 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3505 gen_rtx_SUBREG (V4SImode, vreg, 0),
3506 gen_rtx_SUBREG (SImode, reg, 4),
3507 GEN_INT (2)));
3509 else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
3511 rtx tmp = gen_reg_rtx (DImode);
3512 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3513 CONST0_RTX (V4SImode),
3514 gen_rtx_SUBREG (SImode, reg, 0)));
3515 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3516 CONST0_RTX (V4SImode),
3517 gen_rtx_SUBREG (SImode, reg, 4)));
3518 emit_insn (gen_vec_interleave_lowv4si
3519 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3520 gen_rtx_SUBREG (V4SImode, vreg, 0),
3521 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3523 else
3525 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3526 emit_move_insn (adjust_address (tmp, SImode, 0),
3527 gen_rtx_SUBREG (SImode, reg, 0));
3528 emit_move_insn (adjust_address (tmp, SImode, 4),
3529 gen_rtx_SUBREG (SImode, reg, 4));
3530 emit_move_insn (vreg, tmp);
3532 rtx_insn *seq = get_insns ();
3533 end_sequence ();
3534 emit_conversion_insns (seq, insn);
3536 if (dump_file)
3537 fprintf (dump_file,
3538 " Copied r%d to a vector register r%d for insn %d\n",
3539 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3542 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3543 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3545 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
3547 if (dump_file)
3548 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3549 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3553 /* Convert all definitions of register REGNO
3554 and fix its uses.  Scalar copies may be created
3555 if the register is used in a non-convertible insn. */
3557 void
3558 dimode_scalar_chain::convert_reg (unsigned regno)
3560 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3561 rtx reg = regno_reg_rtx[regno];
3562 rtx scopy = NULL_RTX;
3563 df_ref ref;
3564 bitmap conv;
3566 conv = BITMAP_ALLOC (NULL);
3567 bitmap_copy (conv, insns);
3569 if (scalar_copy)
3570 scopy = gen_reg_rtx (DImode);
3572 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3574 rtx_insn *insn = DF_REF_INSN (ref);
3575 rtx def_set = single_set (insn);
3576 rtx src = SET_SRC (def_set);
3577 rtx reg = DF_REF_REG (ref);
3579 if (!MEM_P (src))
3581 replace_with_subreg_in_insn (insn, reg, reg);
3582 bitmap_clear_bit (conv, INSN_UID (insn));
3585 if (scalar_copy)
3587 rtx vcopy = gen_reg_rtx (V2DImode);
3589 start_sequence ();
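/* Copy the 64-bit value back to a scalar register: if moves from
   vector registers are preferred, extract the low 32 bits, shift the
   vector right by 32 and extract the high 32 bits; otherwise go
   through a DImode stack slot.  */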
3590 if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
3592 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3593 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3594 gen_rtx_SUBREG (SImode, vcopy, 0));
3595 emit_move_insn (vcopy,
3596 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3597 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3598 gen_rtx_SUBREG (SImode, vcopy, 0));
3600 else
3602 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3603 emit_move_insn (tmp, reg);
3604 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3605 adjust_address (tmp, SImode, 0));
3606 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3607 adjust_address (tmp, SImode, 4));
3609 rtx_insn *seq = get_insns ();
3610 end_sequence ();
3611 emit_conversion_insns (seq, insn);
3613 if (dump_file)
3614 fprintf (dump_file,
3615 " Copied r%d to a scalar register r%d for insn %d\n",
3616 regno, REGNO (scopy), INSN_UID (insn));
3620 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3621 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3623 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3625 rtx def_set = single_set (DF_REF_INSN (ref));
3626 if (!MEM_P (SET_DEST (def_set))
3627 || !REG_P (SET_SRC (def_set)))
3628 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
3629 bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
3632 /* Skip debug insns and uninitialized uses. */
3633 else if (DF_REF_CHAIN (ref)
3634 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
3636 gcc_assert (scopy);
3637 replace_rtx (DF_REF_INSN (ref), reg, scopy);
3638 df_insn_rescan (DF_REF_INSN (ref));
3641 BITMAP_FREE (conv);
3644 /* Convert operand OP in INSN.  Memory operands and
3645 uninitialized registers are handled here;
3646 all other register uses are converted during
3647 register conversion. */
3649 void
3650 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
3652 *op = copy_rtx_if_shared (*op);
3654 if (GET_CODE (*op) == NOT)
3656 convert_op (&XEXP (*op, 0), insn);
3657 PUT_MODE (*op, V2DImode);
3659 else if (MEM_P (*op))
3661 rtx tmp = gen_reg_rtx (DImode);
3663 emit_insn_before (gen_move_insn (tmp, *op), insn);
3664 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
3666 if (dump_file)
3667 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
3668 INSN_UID (insn), REGNO (tmp));
3670 else if (REG_P (*op))
3672 /* The register use may not have been converted if
3673 this register has no definition.  Otherwise it
3674 should have been converted in convert_reg. */
3675 df_ref ref;
3676 FOR_EACH_INSN_USE (ref, insn)
3677 if (DF_REF_REGNO (ref) == REGNO (*op))
3679 gcc_assert (!DF_REF_CHAIN (ref));
3680 break;
3682 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
3684 else if (CONST_INT_P (*op))
3686 rtx vec_cst;
3687 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
3689 /* Prefer all ones vector in case of -1. */
3690 if (constm1_operand (*op, GET_MODE (*op)))
3691 vec_cst = CONSTM1_RTX (V2DImode);
3692 else
3693 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
3694 gen_rtvec (2, *op, const0_rtx));
3696 if (!standard_sse_constant_p (vec_cst, V2DImode))
3698 start_sequence ();
3699 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
3700 rtx_insn *seq = get_insns ();
3701 end_sequence ();
3702 emit_insn_before (seq, insn);
3705 emit_insn_before (gen_move_insn (tmp, vec_cst), insn);
3706 *op = tmp;
3708 else
3710 gcc_assert (SUBREG_P (*op));
3711 gcc_assert (GET_MODE (*op) == V2DImode);
3715 /* Convert INSN to vector mode. */
3717 void
3718 dimode_scalar_chain::convert_insn (rtx_insn *insn)
3720 rtx def_set = single_set (insn);
3721 rtx src = SET_SRC (def_set);
3722 rtx dst = SET_DEST (def_set);
3723 rtx subreg;
3725 if (MEM_P (dst) && !REG_P (src))
3727 /* There are no scalar integer instructions and therefore
3728 temporary register usage is required. */
3729 rtx tmp = gen_reg_rtx (DImode);
3730 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
3731 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
3734 switch (GET_CODE (src))
3736 case PLUS:
3737 case MINUS:
3738 case IOR:
3739 case XOR:
3740 case AND:
3741 convert_op (&XEXP (src, 0), insn);
3742 convert_op (&XEXP (src, 1), insn);
3743 PUT_MODE (src, V2DImode);
3744 break;
3746 case MEM:
3747 if (!REG_P (dst))
3748 convert_op (&src, insn);
3749 break;
3751 case REG:
3752 if (!MEM_P (dst))
3753 convert_op (&src, insn);
3754 break;
3756 case SUBREG:
3757 gcc_assert (GET_MODE (src) == V2DImode);
3758 break;
3760 case COMPARE:
3761 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
3763 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
3764 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
3766 if (REG_P (src))
3767 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
3768 else
3769 subreg = copy_rtx_if_shared (src);
3770 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
3771 copy_rtx_if_shared (subreg),
3772 copy_rtx_if_shared (subreg)),
3773 insn);
3774 dst = gen_rtx_REG (CCmode, FLAGS_REG);
3775 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
3776 copy_rtx_if_shared (src)),
3777 UNSPEC_PTEST);
3778 break;
3780 case CONST_INT:
3781 convert_op (&src, insn);
3782 break;
3784 default:
3785 gcc_unreachable ();
3788 SET_SRC (def_set) = src;
3789 SET_DEST (def_set) = dst;
3791 /* Drop possible dead definitions. */
3792 PATTERN (insn) = def_set;
3794 INSN_CODE (insn) = -1;
3795 recog_memoized (insn);
3796 df_insn_rescan (insn);
3799 /* Fix uses of converted REG in debug insns. */
3801 void
3802 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
3804 if (!flag_var_tracking)
3805 return;
3807 df_ref ref;
3808 for (ref = DF_REG_USE_CHAIN (REGNO (reg));
3809 ref;
3810 ref = DF_REF_NEXT_REG (ref))
3812 rtx_insn *insn = DF_REF_INSN (ref);
3813 if (DEBUG_INSN_P (insn))
3815 /* It may be a debug insn with a TImode variable in
3816 register. */
3817 rtx val = PATTERN (insn);
3818 if (GET_MODE (val) != TImode)
3819 continue;
3820 gcc_assert (GET_CODE (val) == VAR_LOCATION);
3821 rtx loc = PAT_VAR_LOCATION_LOC (val);
3822 /* It may have been converted to TImode already. */
3823 if (GET_MODE (loc) == TImode)
3824 continue;
3825 gcc_assert (REG_P (loc)
3826 && GET_MODE (loc) == V1TImode);
3827 /* Convert the V1TImode register, which has been updated by a
3828 previous SET insn, to a TImode SUBREG. */
3829 PAT_VAR_LOCATION_LOC (val) = gen_rtx_SUBREG (TImode, loc, 0);
3830 df_insn_rescan (insn);
3835 /* Convert INSN from TImode to V1TImode. */
3837 void
3838 timode_scalar_chain::convert_insn (rtx_insn *insn)
3840 rtx def_set = single_set (insn);
3841 rtx src = SET_SRC (def_set);
3842 rtx dst = SET_DEST (def_set);
3844 switch (GET_CODE (dst))
3846 case REG:
3848 rtx tmp = find_reg_equal_equiv_note (insn);
3849 if (tmp)
3850 PUT_MODE (XEXP (tmp, 0), V1TImode);
3851 PUT_MODE (dst, V1TImode);
3852 fix_debug_reg_uses (dst);
3854 break;
3855 case MEM:
3856 PUT_MODE (dst, V1TImode);
3857 break;
3859 default:
3860 gcc_unreachable ();
3863 switch (GET_CODE (src))
3865 case REG:
3866 PUT_MODE (src, V1TImode);
3867 /* Call fix_debug_reg_uses only if SRC is never defined. */
3868 if (!DF_REG_DEF_CHAIN (REGNO (src)))
3869 fix_debug_reg_uses (src);
3870 break;
3872 case MEM:
3873 PUT_MODE (src, V1TImode);
3874 break;
3876 case CONST_WIDE_INT:
3877 if (NONDEBUG_INSN_P (insn))
3879 /* Since there are no instructions to store a 128-bit constant,
3880 a temporary register is required. */
3881 rtx tmp = gen_reg_rtx (V1TImode);
3882 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
3883 src = validize_mem (force_const_mem (V1TImode, src));
3884 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3885 dst = tmp;
3887 break;
3889 case CONST_INT:
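/* standard_sse_constant_p returns 1 for an all-zeros constant and 2
   for an all-ones constant; map those onto the corresponding vector
   constants.  */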
3890 switch (standard_sse_constant_p (src, TImode))
3892 case 1:
3893 src = CONST0_RTX (GET_MODE (dst));
3894 break;
3895 case 2:
3896 src = CONSTM1_RTX (GET_MODE (dst));
3897 break;
3898 default:
3899 gcc_unreachable ();
3901 if (NONDEBUG_INSN_P (insn))
3903 rtx tmp = gen_reg_rtx (V1TImode);
3904 /* Since there are no instructions to store a standard SSE
3905 constant, a temporary register is required. */
3906 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3907 dst = tmp;
3909 break;
3911 default:
3912 gcc_unreachable ();
3915 SET_SRC (def_set) = src;
3916 SET_DEST (def_set) = dst;
3918 /* Drop possible dead definitions. */
3919 PATTERN (insn) = def_set;
3921 INSN_CODE (insn) = -1;
3922 recog_memoized (insn);
3923 df_insn_rescan (insn);
3926 void
3927 dimode_scalar_chain::convert_registers ()
3929 bitmap_iterator bi;
3930 unsigned id;
3932 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
3933 convert_reg (id);
3935 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
3936 make_vector_copies (id);
3939 /* Convert the whole chain, creating the required register
3940 conversions and copies. */
3943 scalar_chain::convert ()
3945 bitmap_iterator bi;
3946 unsigned id;
3947 int converted_insns = 0;
3949 if (!dbg_cnt (stv_conversion))
3950 return 0;
3952 if (dump_file)
3953 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
3955 convert_registers ();
3957 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
3959 convert_insn (DF_INSN_UID_GET (id)->insn);
3960 converted_insns++;
3963 return converted_insns;
3966 /* Main STV pass function. Find and convert scalar
3967 instructions into vector mode when profitable. */
3969 static unsigned int
3970 convert_scalars_to_vector ()
3972 basic_block bb;
3973 bitmap candidates;
3974 int converted_insns = 0;
3976 bitmap_obstack_initialize (NULL);
3977 candidates = BITMAP_ALLOC (NULL);
3979 calculate_dominance_info (CDI_DOMINATORS);
3980 df_set_flags (DF_DEFER_INSN_RESCAN);
3981 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
3982 df_md_add_problem ();
3983 df_analyze ();
3985 /* Find all instructions we want to convert into vector mode. */
3986 if (dump_file)
3987 fprintf (dump_file, "Searching for mode conversion candidates...\n");
3989 FOR_EACH_BB_FN (bb, cfun)
3991 rtx_insn *insn;
3992 FOR_BB_INSNS (bb, insn)
3993 if (scalar_to_vector_candidate_p (insn))
3995 if (dump_file)
3996 fprintf (dump_file, " insn %d is marked as a candidate\n",
3997 INSN_UID (insn));
3999 bitmap_set_bit (candidates, INSN_UID (insn));
4003 remove_non_convertible_regs (candidates);
4005 if (bitmap_empty_p (candidates))
4006 if (dump_file)
4007 fprintf (dump_file, "There are no candidates for optimization.\n");
4009 while (!bitmap_empty_p (candidates))
4011 unsigned uid = bitmap_first_set_bit (candidates);
4012 scalar_chain *chain;
4014 if (TARGET_64BIT)
4015 chain = new timode_scalar_chain;
4016 else
4017 chain = new dimode_scalar_chain;
4019 /* Find instructions chain we want to convert to vector mode.
4020 Check all uses and definitions to estimate all required
4021 conversions. */
4022 chain->build (candidates, uid);
4024 if (chain->compute_convert_gain () > 0)
4025 converted_insns += chain->convert ();
4026 else
4027 if (dump_file)
4028 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4029 chain->chain_id);
4031 delete chain;
4034 if (dump_file)
4035 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4037 BITMAP_FREE (candidates);
4038 bitmap_obstack_release (NULL);
4039 df_process_deferred_rescans ();
4041 /* Conversion means we may have 128-bit register spills/fills
4042 which require an aligned stack. */
4043 if (converted_insns)
4045 if (crtl->stack_alignment_needed < 128)
4046 crtl->stack_alignment_needed = 128;
4047 if (crtl->stack_alignment_estimated < 128)
4048 crtl->stack_alignment_estimated = 128;
4051 return 0;
4054 namespace {
4056 const pass_data pass_data_insert_vzeroupper =
4058 RTL_PASS, /* type */
4059 "vzeroupper", /* name */
4060 OPTGROUP_NONE, /* optinfo_flags */
4061 TV_MACH_DEP, /* tv_id */
4062 0, /* properties_required */
4063 0, /* properties_provided */
4064 0, /* properties_destroyed */
4065 0, /* todo_flags_start */
4066 TODO_df_finish, /* todo_flags_finish */
4069 class pass_insert_vzeroupper : public rtl_opt_pass
4071 public:
4072 pass_insert_vzeroupper(gcc::context *ctxt)
4073 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4076 /* opt_pass methods: */
4077 virtual bool gate (function *)
4079 return TARGET_AVX && !TARGET_AVX512F
4080 && TARGET_VZEROUPPER && flag_expensive_optimizations
4081 && !optimize_size;
4084 virtual unsigned int execute (function *)
4086 return rest_of_handle_insert_vzeroupper ();
4089 }; // class pass_insert_vzeroupper
4091 const pass_data pass_data_stv =
4093 RTL_PASS, /* type */
4094 "stv", /* name */
4095 OPTGROUP_NONE, /* optinfo_flags */
4096 TV_MACH_DEP, /* tv_id */
4097 0, /* properties_required */
4098 0, /* properties_provided */
4099 0, /* properties_destroyed */
4100 0, /* todo_flags_start */
4101 TODO_df_finish, /* todo_flags_finish */
4104 class pass_stv : public rtl_opt_pass
4106 public:
4107 pass_stv (gcc::context *ctxt)
4108 : rtl_opt_pass (pass_data_stv, ctxt)
4111 /* opt_pass methods: */
4112 virtual bool gate (function *)
4114 return TARGET_STV && TARGET_SSE2 && optimize > 1;
4117 virtual unsigned int execute (function *)
4119 return convert_scalars_to_vector ();
4122 }; // class pass_stv
4124 } // anon namespace
4126 rtl_opt_pass *
4127 make_pass_insert_vzeroupper (gcc::context *ctxt)
4129 return new pass_insert_vzeroupper (ctxt);
4132 rtl_opt_pass *
4133 make_pass_stv (gcc::context *ctxt)
4135 return new pass_stv (ctxt);
4138 /* Return true if a red-zone is in use. */
4140 bool
4141 ix86_using_red_zone (void)
4143 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4146 /* Return a string that documents the current -m options. The caller is
4147 responsible for freeing the string. */
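/* The returned string is a space separated option list, e.g.
   (illustrative only) "-march=foo -mtune=bar -m64 -mfpmath=sse"; when
   ADD_NL_P is set, unrecognized ISA and flag bits are summarized as
   "(other isa: ...)" and "(other flags: ...)" entries and long lines
   are wrapped.  */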
4149 static char *
4150 ix86_target_string (HOST_WIDE_INT isa, int flags, int ix86_flags,
4151 const char *arch, const char *tune,
4152 enum fpmath_unit fpmath, bool add_nl_p)
4154 struct ix86_target_opts
4156 const char *option; /* option string */
4157 HOST_WIDE_INT mask; /* isa mask options */
4160 /* This table is ordered so that options like -msse4.2 that imply
4161 preceding options match those first. */
4162 static struct ix86_target_opts isa_opts[] =
4164 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4165 { "-mfma", OPTION_MASK_ISA_FMA },
4166 { "-mxop", OPTION_MASK_ISA_XOP },
4167 { "-mlwp", OPTION_MASK_ISA_LWP },
4168 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4169 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4170 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4171 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4172 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4173 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4174 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4175 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4176 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4177 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4178 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4179 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4180 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4181 { "-msse3", OPTION_MASK_ISA_SSE3 },
4182 { "-msse2", OPTION_MASK_ISA_SSE2 },
4183 { "-msse", OPTION_MASK_ISA_SSE },
4184 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4185 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4186 { "-mmmx", OPTION_MASK_ISA_MMX },
4187 { "-mabm", OPTION_MASK_ISA_ABM },
4188 { "-mbmi", OPTION_MASK_ISA_BMI },
4189 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4190 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4191 { "-mhle", OPTION_MASK_ISA_HLE },
4192 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4193 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4194 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4195 { "-madx", OPTION_MASK_ISA_ADX },
4196 { "-mtbm", OPTION_MASK_ISA_TBM },
4197 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4198 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4199 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4200 { "-maes", OPTION_MASK_ISA_AES },
4201 { "-msha", OPTION_MASK_ISA_SHA },
4202 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4203 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4204 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4205 { "-mf16c", OPTION_MASK_ISA_F16C },
4206 { "-mrtm", OPTION_MASK_ISA_RTM },
4207 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4208 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4209 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4210 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4211 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4212 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4213 { "-mmpx", OPTION_MASK_ISA_MPX },
4214 { "-mclwb", OPTION_MASK_ISA_CLWB },
4215 { "-mpcommit", OPTION_MASK_ISA_PCOMMIT },
4216 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4217 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4218 { "-mpku", OPTION_MASK_ISA_PKU },
4221 /* Flag options. */
4222 static struct ix86_target_opts flag_opts[] =
4224 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4225 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4226 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4227 { "-m80387", MASK_80387 },
4228 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4229 { "-malign-double", MASK_ALIGN_DOUBLE },
4230 { "-mcld", MASK_CLD },
4231 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4232 { "-mieee-fp", MASK_IEEE_FP },
4233 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4234 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4235 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4236 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4237 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4238 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4239 { "-mno-red-zone", MASK_NO_RED_ZONE },
4240 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4241 { "-mrecip", MASK_RECIP },
4242 { "-mrtd", MASK_RTD },
4243 { "-msseregparm", MASK_SSEREGPARM },
4244 { "-mstack-arg-probe", MASK_STACK_PROBE },
4245 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4246 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4247 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4248 { "-mvzeroupper", MASK_VZEROUPPER },
4249 { "-mstv", MASK_STV},
4250 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
4251 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
4252 { "-mprefer-avx128", MASK_PREFER_AVX128},
4255 /* Additional flag options. */
4256 static struct ix86_target_opts ix86_flag_opts[] =
4258 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4261 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts)
4262 + ARRAY_SIZE (ix86_flag_opts) + 6][2];
4264 char isa_other[40];
4265 char target_other[40];
4266 char ix86_target_other[40];
4267 unsigned num = 0;
4268 unsigned i, j;
4269 char *ret;
4270 char *ptr;
4271 size_t len;
4272 size_t line_len;
4273 size_t sep_len;
4274 const char *abi;
4276 memset (opts, '\0', sizeof (opts));
4278 /* Add -march= option. */
4279 if (arch)
4281 opts[num][0] = "-march=";
4282 opts[num++][1] = arch;
4285 /* Add -mtune= option. */
4286 if (tune)
4288 opts[num][0] = "-mtune=";
4289 opts[num++][1] = tune;
4292 /* Add -m32/-m64/-mx32. */
4293 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4295 if ((isa & OPTION_MASK_ABI_64) != 0)
4296 abi = "-m64";
4297 else
4298 abi = "-mx32";
4299 isa &= ~ (OPTION_MASK_ISA_64BIT
4300 | OPTION_MASK_ABI_64
4301 | OPTION_MASK_ABI_X32);
4303 else
4304 abi = "-m32";
4305 opts[num++][0] = abi;
4307 /* Pick out the options in isa options. */
4308 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4310 if ((isa & isa_opts[i].mask) != 0)
4312 opts[num++][0] = isa_opts[i].option;
4313 isa &= ~ isa_opts[i].mask;
4317 if (isa && add_nl_p)
4319 opts[num++][0] = isa_other;
4320 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
4321 isa);
4324 /* Add flag options. */
4325 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4327 if ((flags & flag_opts[i].mask) != 0)
4329 opts[num++][0] = flag_opts[i].option;
4330 flags &= ~ flag_opts[i].mask;
4334 if (flags && add_nl_p)
4336 opts[num++][0] = target_other;
4337 sprintf (target_other, "(other flags: %#x)", flags);
4340 /* Add additional flag options. */
4341 for (i = 0; i < ARRAY_SIZE (ix86_flag_opts); i++)
4343 if ((ix86_flags & ix86_flag_opts[i].mask) != 0)
4345 opts[num++][0] = ix86_flag_opts[i].option;
4346 ix86_flags &= ~ ix86_flag_opts[i].mask;
4350 if (ix86_flags && add_nl_p)
4352 opts[num++][0] = ix86_target_other;
4353 sprintf (ix86_target_other, "(other flags: %#x)", ix86_flags);
4356 /* Add -fpmath= option. */
4357 if (fpmath)
4359 opts[num][0] = "-mfpmath=";
4360 switch ((int) fpmath)
4362 case FPMATH_387:
4363 opts[num++][1] = "387";
4364 break;
4366 case FPMATH_SSE:
4367 opts[num++][1] = "sse";
4368 break;
4370 case FPMATH_387 | FPMATH_SSE:
4371 opts[num++][1] = "sse+387";
4372 break;
4374 default:
4375 gcc_unreachable ();
4379 /* Any options? */
4380 if (num == 0)
4381 return NULL;
4383 gcc_assert (num < ARRAY_SIZE (opts));
4385 /* Size the string. */
4386 len = 0;
4387 sep_len = (add_nl_p) ? 3 : 1;
4388 for (i = 0; i < num; i++)
4390 len += sep_len;
4391 for (j = 0; j < 2; j++)
4392 if (opts[i][j])
4393 len += strlen (opts[i][j]);
4396 /* Build the string. */
4397 ret = ptr = (char *) xmalloc (len);
4398 line_len = 0;
4400 for (i = 0; i < num; i++)
4402 size_t len2[2];
4404 for (j = 0; j < 2; j++)
4405 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4407 if (i != 0)
4409 *ptr++ = ' ';
4410 line_len++;
4412 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4414 *ptr++ = '\\';
4415 *ptr++ = '\n';
4416 line_len = 0;
4420 for (j = 0; j < 2; j++)
4421 if (opts[i][j])
4423 memcpy (ptr, opts[i][j], len2[j]);
4424 ptr += len2[j];
4425 line_len += len2[j];
4429 *ptr = '\0';
4430 gcc_assert (ret + len >= ptr);
4432 return ret;
4435 /* Return true if profiling code should be emitted before
4436 the prologue, false otherwise.
4437 Note: For x86 with "hotfix" it is sorried. */
4438 static bool
4439 ix86_profile_before_prologue (void)
4441 return flag_fentry != 0;
4444 /* Function that is callable from the debugger to print the current
4445 options. */
4446 void ATTRIBUTE_UNUSED
4447 ix86_debug_options (void)
4449 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
4450 ix86_target_flags,
4451 ix86_arch_string, ix86_tune_string,
4452 ix86_fpmath, true);
4454 if (opts)
4456 fprintf (stderr, "%s\n\n", opts);
4457 free (opts);
4459 else
4460 fputs ("<no options>\n\n", stderr);
4462 return;
4465 /* Return true if T is one of the bytes we should avoid with
4466 -fmitigate-rop. */
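/* These are the single-byte and imm16 forms of the near and far RET
   instructions (0xc3/0xc2 and 0xcb/0xca).  */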
4468 static bool
4469 ix86_rop_should_change_byte_p (int t)
4471 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4474 static const char *stringop_alg_names[] = {
4475 #define DEF_ENUM
4476 #define DEF_ALG(alg, name) #name,
4477 #include "stringop.def"
4478 #undef DEF_ENUM
4479 #undef DEF_ALG
4482 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4483 The string is of the following form (or comma separated list of it):
4485 strategy_alg:max_size:[align|noalign]
4487 where the full size range for the strategy is either [0, max_size] or
4488 [min_size, max_size], in which min_size is the max_size + 1 of the
4489 preceding range. The last size range must have max_size == -1.
4491 Examples:
4494 -mmemcpy-strategy=libcall:-1:noalign
4496 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
4500 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
4502 This is to tell the compiler to use the following strategy for memset
4503 1) when the expected size is between [1, 16], use rep_8byte strategy;
4504 2) when the size is between [17, 2048], use vector_loop;
4505 3) when the size is > 2048, use libcall. */
4507 struct stringop_size_range
4509 int max;
4510 stringop_alg alg;
4511 bool noalign;
4514 static void
4515 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
4517 const struct stringop_algs *default_algs;
4518 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
4519 char *curr_range_str, *next_range_str;
4520 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
4521 int i = 0, n = 0;
4523 if (is_memset)
4524 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
4525 else
4526 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
4528 curr_range_str = strategy_str;
4532 int maxs;
4533 char alg_name[128];
4534 char align[16];
4535 next_range_str = strchr (curr_range_str, ',');
4536 if (next_range_str)
4537 *next_range_str++ = '\0';
4539 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
4540 alg_name, &maxs, align))
4542 error ("wrong argument %qs to option %qs", curr_range_str, opt);
4543 return;
4546 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
4548 error ("size ranges of option %qs should be increasing", opt);
4549 return;
4552 for (i = 0; i < last_alg; i++)
4553 if (!strcmp (alg_name, stringop_alg_names[i]))
4554 break;
4556 if (i == last_alg)
4558 error ("wrong strategy name %qs specified for option %qs",
4559 alg_name, opt);
4561 auto_vec <const char *> candidates;
4562 for (i = 0; i < last_alg; i++)
4563 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
4564 candidates.safe_push (stringop_alg_names[i]);
4566 char *s;
4567 const char *hint
4568 = candidates_list_and_hint (alg_name, s, candidates);
4569 if (hint)
4570 inform (input_location,
4571 "valid arguments to %qs are: %s; did you mean %qs?",
4572 opt, s, hint);
4573 else
4574 inform (input_location, "valid arguments to %qs are: %s",
4575 opt, s);
4576 XDELETEVEC (s);
4577 return;
4580 if ((stringop_alg) i == rep_prefix_8_byte
4581 && !TARGET_64BIT)
4583 /* rep; movq isn't available in 32-bit code. */
4584 error ("strategy name %qs specified for option %qs "
4585 "not supported for 32-bit code", alg_name, opt);
4586 return;
4589 input_ranges[n].max = maxs;
4590 input_ranges[n].alg = (stringop_alg) i;
4591 if (!strcmp (align, "align"))
4592 input_ranges[n].noalign = false;
4593 else if (!strcmp (align, "noalign"))
4594 input_ranges[n].noalign = true;
4595 else
4597 error ("unknown alignment %qs specified for option %qs", align, opt);
4598 return;
4600 n++;
4601 curr_range_str = next_range_str;
4603 while (curr_range_str);
4605 if (input_ranges[n - 1].max != -1)
4607 error ("the max value for the last size range should be -1"
4608 " for option %qs", opt);
4609 return;
4612 if (n > MAX_STRINGOP_ALGS)
4614 error ("too many size ranges specified in option %qs", opt);
4615 return;
4618 /* Now override the default algs array. */
4619 for (i = 0; i < n; i++)
4621 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
4622 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
4623 = input_ranges[i].alg;
4624 *const_cast<int *>(&default_algs->size[i].noalign)
4625 = input_ranges[i].noalign;
4630 /* Parse the -mtune-ctrl= option. When DUMP is true,
4631 print the features that are explicitly set. */
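/* The argument is a comma separated list of tuning feature names from
   ix86_tune_feature_names, each optionally prefixed with '^' to clear
   the feature instead of setting it, e.g. (with illustrative feature
   names) -mtune-ctrl=use_incdec,^use_sahf.  */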
4633 static void
4634 parse_mtune_ctrl_str (bool dump)
4636 if (!ix86_tune_ctrl_string)
4637 return;
4639 char *next_feature_string = NULL;
4640 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
4641 char *orig = curr_feature_string;
4642 int i;
4645 bool clear = false;
4647 next_feature_string = strchr (curr_feature_string, ',');
4648 if (next_feature_string)
4649 *next_feature_string++ = '\0';
4650 if (*curr_feature_string == '^')
4652 curr_feature_string++;
4653 clear = true;
4655 for (i = 0; i < X86_TUNE_LAST; i++)
4657 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
4659 ix86_tune_features[i] = !clear;
4660 if (dump)
4661 fprintf (stderr, "Explicitly %s feature %s\n",
4662 clear ? "clear" : "set", ix86_tune_feature_names[i]);
4663 break;
4666 if (i == X86_TUNE_LAST)
4667 error ("Unknown parameter to option -mtune-ctrl: %s",
4668 clear ? curr_feature_string - 1 : curr_feature_string);
4669 curr_feature_string = next_feature_string;
4671 while (curr_feature_string);
4672 free (orig);
4675 /* Helper function to set ix86_tune_features. IX86_TUNE is the
4676 processor type. */
4678 static void
4679 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
4681 unsigned int ix86_tune_mask = 1u << ix86_tune;
4682 int i;
4684 for (i = 0; i < X86_TUNE_LAST; ++i)
4686 if (ix86_tune_no_default)
4687 ix86_tune_features[i] = 0;
4688 else
4689 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4692 if (dump)
4694 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
4695 for (i = 0; i < X86_TUNE_LAST; i++)
4696 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
4697 ix86_tune_features[i] ? "on" : "off");
4700 parse_mtune_ctrl_str (dump);
4704 /* Default align_* from the processor table. */
4706 static void
4707 ix86_default_align (struct gcc_options *opts)
4709 if (opts->x_align_loops == 0)
4711 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
4712 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
4714 if (opts->x_align_jumps == 0)
4716 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
4717 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
4719 if (opts->x_align_functions == 0)
4721 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
4725 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
4727 static void
4728 ix86_override_options_after_change (void)
4730 ix86_default_align (&global_options);
4733 /* Override various settings based on options. If MAIN_ARGS_P, the
4734 options are from the command line, otherwise they are from
4735 attributes. Return true if there's an error related to march
4736 option. */
4738 static bool
4739 ix86_option_override_internal (bool main_args_p,
4740 struct gcc_options *opts,
4741 struct gcc_options *opts_set)
4743 int i;
4744 unsigned int ix86_arch_mask;
4745 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
4747 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
4748 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
4749 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
4750 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
4751 #define PTA_AES (HOST_WIDE_INT_1 << 4)
4752 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
4753 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
4754 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
4755 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
4756 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
4757 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
4758 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
4759 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
4760 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
4761 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
4762 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
4763 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
4764 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
4765 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
4766 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
4767 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
4768 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
4769 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
4770 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
4771 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
4772 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
4773 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
4774 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
4775 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
4776 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
4777 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
4778 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
4779 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
4780 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
4781 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
4782 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
4783 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
4784 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
4785 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
4786 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
4787 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
4788 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
4789 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
4790 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
4791 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
4792 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
4793 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
4794 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
4795 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
4796 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
4797 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
4798 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
4799 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
4800 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
4801 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
4802 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
4803 #define PTA_PCOMMIT (HOST_WIDE_INT_1 << 56)
4804 #define PTA_MWAITX (HOST_WIDE_INT_1 << 57)
4805 #define PTA_CLZERO (HOST_WIDE_INT_1 << 58)
4806 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 59)
4807 #define PTA_PKU (HOST_WIDE_INT_1 << 60)
4809 #define PTA_CORE2 \
4810 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
4811 | PTA_CX16 | PTA_FXSR)
4812 #define PTA_NEHALEM \
4813 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
4814 #define PTA_WESTMERE \
4815 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
4816 #define PTA_SANDYBRIDGE \
4817 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
4818 #define PTA_IVYBRIDGE \
4819 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
4820 #define PTA_HASWELL \
4821 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
4822 | PTA_FMA | PTA_MOVBE | PTA_HLE)
4823 #define PTA_BROADWELL \
4824 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
4825 #define PTA_SKYLAKE \
4826 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
4827 #define PTA_SKYLAKE_AVX512 \
4828 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
4829 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
4830 #define PTA_KNL \
4831 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
4832 #define PTA_BONNELL \
4833 (PTA_CORE2 | PTA_MOVBE)
4834 #define PTA_SILVERMONT \
4835 (PTA_WESTMERE | PTA_MOVBE)
4837 /* If this reaches 64, we need to widen the struct pta flags below. */
4839 static struct pta
4841 const char *const name; /* processor name or nickname. */
4842 const enum processor_type processor;
4843 const enum attr_cpu schedule;
4844 const unsigned HOST_WIDE_INT flags;
4846 const processor_alias_table[] =
4848 {"i386", PROCESSOR_I386, CPU_NONE, 0},
4849 {"i486", PROCESSOR_I486, CPU_NONE, 0},
4850 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4851 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4852 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
4853 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
4854 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
4855 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4856 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4857 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4858 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4859 PTA_MMX | PTA_SSE | PTA_FXSR},
4860 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4861 PTA_MMX | PTA_SSE | PTA_FXSR},
4862 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4863 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4864 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4865 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4866 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4867 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4868 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
4869 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4870 PTA_MMX | PTA_SSE | PTA_FXSR},
4871 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4872 PTA_MMX | PTA_SSE | PTA_FXSR},
4873 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4874 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4875 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
4876 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
4877 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
4878 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4879 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
4880 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4881 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
4882 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4883 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
4884 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
4885 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4886 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4887 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
4888 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4889 PTA_SANDYBRIDGE},
4890 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4891 PTA_SANDYBRIDGE},
4892 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4893 PTA_IVYBRIDGE},
4894 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4895 PTA_IVYBRIDGE},
4896 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4897 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4898 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
4899 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
4900 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
4901 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4902 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4903 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4904 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4905 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
4906 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
4907 {"geode", PROCESSOR_GEODE, CPU_GEODE,
4908 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4909 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
4910 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4911 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4912 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
4913 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4914 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
4915 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4916 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
4917 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4918 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
4919 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4920 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
4921 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4922 {"x86-64", PROCESSOR_K8, CPU_K8,
4923 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4924 {"eden-x2", PROCESSOR_K8, CPU_K8,
4925 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4926 {"nano", PROCESSOR_K8, CPU_K8,
4927 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4928 | PTA_SSSE3 | PTA_FXSR},
4929 {"nano-1000", PROCESSOR_K8, CPU_K8,
4930 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4931 | PTA_SSSE3 | PTA_FXSR},
4932 {"nano-2000", PROCESSOR_K8, CPU_K8,
4933 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4934 | PTA_SSSE3 | PTA_FXSR},
4935 {"nano-3000", PROCESSOR_K8, CPU_K8,
4936 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4937 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4938 {"nano-x2", PROCESSOR_K8, CPU_K8,
4939 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4940 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4941 {"eden-x4", PROCESSOR_K8, CPU_K8,
4942 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4943 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4944 {"nano-x4", PROCESSOR_K8, CPU_K8,
4945 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4946 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4947 {"k8", PROCESSOR_K8, CPU_K8,
4948 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4949 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4950 {"k8-sse3", PROCESSOR_K8, CPU_K8,
4951 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4952 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4953 {"opteron", PROCESSOR_K8, CPU_K8,
4954 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4955 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4956 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
4957 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4958 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4959 {"athlon64", PROCESSOR_K8, CPU_K8,
4960 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4961 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4962 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
4963 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4964 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4965 {"athlon-fx", PROCESSOR_K8, CPU_K8,
4966 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4967 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4968 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
4969 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
4970 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
4971 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
4972 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
4973 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
4974 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
4975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4976 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4977 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4978 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
4979 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
4980 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4981 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4982 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4983 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
4984 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
4985 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
4986 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4987 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4988 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4989 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
4990 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
4991 | PTA_XSAVEOPT | PTA_FSGSBASE},
4992 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
4993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4994 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4995 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
4996 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
4997 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
4998 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
4999 | PTA_MOVBE | PTA_MWAITX},
5000 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5001 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5002 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5003 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5004 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5005 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5006 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5007 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5008 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5009 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5010 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5011 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5012 | PTA_FXSR | PTA_XSAVE},
5013 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5014 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5015 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5016 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5017 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5018 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5020 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5021 PTA_64BIT
5022 | PTA_HLE /* flags are only used for -march switch. */ },
5025 /* -mrecip options. */
5026 static struct
5028 const char *string; /* option name */
5029 unsigned int mask; /* mask bits to set */
5031 const recip_options[] =
5033 { "all", RECIP_MASK_ALL },
5034 { "none", RECIP_MASK_NONE },
5035 { "div", RECIP_MASK_DIV },
5036 { "sqrt", RECIP_MASK_SQRT },
5037 { "vec-div", RECIP_MASK_VEC_DIV },
5038 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5041 int const pta_size = ARRAY_SIZE (processor_alias_table);
5043 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5044 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5045 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5046 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5047 #ifdef TARGET_BI_ARCH
5048 else
5050 #if TARGET_BI_ARCH == 1
5051 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5052 is on and OPTION_MASK_ABI_X32 is off. We turn off
5053 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5054 -mx32. */
5055 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5056 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5057 #else
5058 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5059 on and OPTION_MASK_ABI_64 is off. We turn off
5060 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5061 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5062 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5063 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5064 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5065 #endif
5066 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5067 && TARGET_IAMCU_P (opts->x_target_flags))
5068 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5069 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5071 #endif
5073 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5075 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5076 OPTION_MASK_ABI_64 for TARGET_X32. */
5077 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5078 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5080 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5081 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5082 | OPTION_MASK_ABI_X32
5083 | OPTION_MASK_ABI_64);
5084 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5086 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5087 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5088 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5089 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5093 SUBTARGET_OVERRIDE_OPTIONS;
5094 #endif
5096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5097 SUBSUBTARGET_OVERRIDE_OPTIONS;
5098 #endif
5100 /* -fPIC is the default for x86_64 Darwin (Mach-O). */
5101 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5102 opts->x_flag_pic = 2;
5104 /* Need to check -mtune=generic first. */
5105 if (opts->x_ix86_tune_string)
5107 /* As special support for cross compilers we read -mtune=native
5108 as -mtune=generic. With native compilers we won't see
5109 -mtune=native, as it will already have been rewritten by the driver. */
5110 if (!strcmp (opts->x_ix86_tune_string, "native"))
5112 opts->x_ix86_tune_string = "generic";
5114 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5115 warning (OPT_Wdeprecated,
5116 main_args_p
5117 ? "%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5118 "or %<-mtune=generic%> instead as appropriate"
5119 : "%<target(\"tune=x86-64\")%> is deprecated; use "
5120 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%> "
5121 "instead as appropriate");
5123 else
5125 if (opts->x_ix86_arch_string)
5126 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5127 if (!opts->x_ix86_tune_string)
5129 opts->x_ix86_tune_string
5130 = processor_target_table[TARGET_CPU_DEFAULT].name;
5131 ix86_tune_defaulted = 1;
5134 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5135 or defaulted. We need to use a sensible tune option. */
5136 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5138 opts->x_ix86_tune_string = "generic";
5142 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5143 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5145 /* rep; movq isn't available in 32-bit code. */
5146 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5147 opts->x_ix86_stringop_alg = no_stringop;
5150 if (!opts->x_ix86_arch_string)
5151 opts->x_ix86_arch_string
5152 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5153 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5154 else
5155 ix86_arch_specified = 1;
5157 if (opts_set->x_ix86_pmode)
5159 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5160 && opts->x_ix86_pmode == PMODE_SI)
5161 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5162 && opts->x_ix86_pmode == PMODE_DI))
5163 error ("address mode %qs not supported in the %s bit mode",
5164 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5165 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5167 else
5168 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5169 ? PMODE_DI : PMODE_SI;
5171 if (!opts_set->x_ix86_abi)
5172 opts->x_ix86_abi = DEFAULT_ABI;
5174 /* For targets using the MS ABI enable ms-extensions, if not
5175 explicitly turned off. For the non-MS ABI we turn this
5176 option off. */
5177 if (!opts_set->x_flag_ms_extensions)
5178 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
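/* Validate -mcmodel= against the selected bitness and PIC setting, or
   pick a sensible default code model when none was given.  */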
5180 if (opts_set->x_ix86_cmodel)
5182 switch (opts->x_ix86_cmodel)
5184 case CM_SMALL:
5185 case CM_SMALL_PIC:
5186 if (opts->x_flag_pic)
5187 opts->x_ix86_cmodel = CM_SMALL_PIC;
5188 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5189 error ("code model %qs not supported in the %s bit mode",
5190 "small", "32");
5191 break;
5193 case CM_MEDIUM:
5194 case CM_MEDIUM_PIC:
5195 if (opts->x_flag_pic)
5196 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5197 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5198 error ("code model %qs not supported in the %s bit mode",
5199 "medium", "32");
5200 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5201 error ("code model %qs not supported in x32 mode",
5202 "medium");
5203 break;
5205 case CM_LARGE:
5206 case CM_LARGE_PIC:
5207 if (opts->x_flag_pic)
5208 opts->x_ix86_cmodel = CM_LARGE_PIC;
5209 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5210 error ("code model %qs not supported in the %s bit mode",
5211 "large", "32");
5212 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5213 error ("code model %qs not supported in x32 mode",
5214 "large");
5215 break;
5217 case CM_32:
5218 if (opts->x_flag_pic)
5219 error ("code model %s does not support PIC mode", "32");
5220 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5221 error ("code model %qs not supported in the %s bit mode",
5222 "32", "64");
5223 break;
5225 case CM_KERNEL:
5226 if (opts->x_flag_pic)
5228 error ("code model %s does not support PIC mode", "kernel");
5229 opts->x_ix86_cmodel = CM_32;
5231 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5232 error ("code model %qs not supported in the %s bit mode",
5233 "kernel", "32");
5234 break;
5236 default:
5237 gcc_unreachable ();
5240 else
5242 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5243 use of rip-relative addressing. This eliminates fixups that
5244 would otherwise be needed if this object is to be placed in a
5245 DLL, and is essentially just as efficient as direct addressing. */
5246 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5247 && (TARGET_RDOS || TARGET_PECOFF))
5248 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5249 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5250 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5251 else
5252 opts->x_ix86_cmodel = CM_32;
5254 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5256 error ("-masm=intel not supported in this configuration");
5257 opts->x_ix86_asm_dialect = ASM_ATT;
5259 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5260 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5261 sorry ("%i-bit mode not compiled in",
5262 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
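/* Look up -march= in the processor alias table and enable every ISA
   extension implied by the selected architecture, unless the user has
   explicitly overridden that extension.  */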
5264 for (i = 0; i < pta_size; i++)
5265 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5267 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5269 error (main_args_p
5270 ? "%<generic%> CPU can be used only for %<-mtune=%> switch"
5271 : "%<generic%> CPU can be used only for "
5272 "%<target(\"tune=\")%> attribute");
5273 return false;
5275 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5277 error (main_args_p
5278 ? "%<intel%> CPU can be used only for %<-mtune=%> switch"
5279 : "%<intel%> CPU can be used only for "
5280 "%<target(\"tune=\")%> attribute");
5281 return false;
5284 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5285 && !(processor_alias_table[i].flags & PTA_64BIT))
5287 error ("CPU you selected does not support x86-64 "
5288 "instruction set");
5289 return false;
5292 ix86_schedule = processor_alias_table[i].schedule;
5293 ix86_arch = processor_alias_table[i].processor;
5294 /* Default cpu tuning to the architecture. */
5295 ix86_tune = ix86_arch;
5297 if (processor_alias_table[i].flags & PTA_MMX
5298 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5299 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5300 if (processor_alias_table[i].flags & PTA_3DNOW
5301 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5302 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5303 if (processor_alias_table[i].flags & PTA_3DNOW_A
5304 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5305 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5306 if (processor_alias_table[i].flags & PTA_SSE
5307 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5308 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5309 if (processor_alias_table[i].flags & PTA_SSE2
5310 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5311 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5312 if (processor_alias_table[i].flags & PTA_SSE3
5313 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5314 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5315 if (processor_alias_table[i].flags & PTA_SSSE3
5316 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5317 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5318 if (processor_alias_table[i].flags & PTA_SSE4_1
5319 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5320 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5321 if (processor_alias_table[i].flags & PTA_SSE4_2
5322 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5323 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5324 if (processor_alias_table[i].flags & PTA_AVX
5325 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5326 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5327 if (processor_alias_table[i].flags & PTA_AVX2
5328 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5329 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5330 if (processor_alias_table[i].flags & PTA_FMA
5331 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5332 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5333 if (processor_alias_table[i].flags & PTA_SSE4A
5334 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5335 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5336 if (processor_alias_table[i].flags & PTA_FMA4
5337 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5338 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5339 if (processor_alias_table[i].flags & PTA_XOP
5340 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5341 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5342 if (processor_alias_table[i].flags & PTA_LWP
5343 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5344 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5345 if (processor_alias_table[i].flags & PTA_ABM
5346 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5347 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5348 if (processor_alias_table[i].flags & PTA_BMI
5349 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5350 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5351 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5352 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5353 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5354 if (processor_alias_table[i].flags & PTA_TBM
5355 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5356 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5357 if (processor_alias_table[i].flags & PTA_BMI2
5358 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5359 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5360 if (processor_alias_table[i].flags & PTA_CX16
5361 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5362 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5363 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5364 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5365 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5366 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5367 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5368 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5369 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5370 if (processor_alias_table[i].flags & PTA_MOVBE
5371 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5372 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5373 if (processor_alias_table[i].flags & PTA_AES
5374 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5375 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
5376 if (processor_alias_table[i].flags & PTA_SHA
5377 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5378 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5379 if (processor_alias_table[i].flags & PTA_PCLMUL
5380 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5381 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5382 if (processor_alias_table[i].flags & PTA_FSGSBASE
5383 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5384 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5385 if (processor_alias_table[i].flags & PTA_RDRND
5386 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5387 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5388 if (processor_alias_table[i].flags & PTA_F16C
5389 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5390 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5391 if (processor_alias_table[i].flags & PTA_RTM
5392 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5393 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5394 if (processor_alias_table[i].flags & PTA_HLE
5395 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5396 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5397 if (processor_alias_table[i].flags & PTA_PRFCHW
5398 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5399 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5400 if (processor_alias_table[i].flags & PTA_RDSEED
5401 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5402 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5403 if (processor_alias_table[i].flags & PTA_ADX
5404 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5405 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5406 if (processor_alias_table[i].flags & PTA_FXSR
5407 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5408 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5409 if (processor_alias_table[i].flags & PTA_XSAVE
5410 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5411 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5412 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5413 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5414 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5415 if (processor_alias_table[i].flags & PTA_AVX512F
5416 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5417 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5418 if (processor_alias_table[i].flags & PTA_AVX512ER
5419 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5420 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5421 if (processor_alias_table[i].flags & PTA_AVX512PF
5422 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5423 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5424 if (processor_alias_table[i].flags & PTA_AVX512CD
5425 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5426 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5427 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5428 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5429 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5430 if (processor_alias_table[i].flags & PTA_PCOMMIT
5431 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCOMMIT))
5432 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCOMMIT;
5433 if (processor_alias_table[i].flags & PTA_CLWB
5434 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5435 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5436 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5437 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5438 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5439 if (processor_alias_table[i].flags & PTA_CLZERO
5440 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5441 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5442 if (processor_alias_table[i].flags & PTA_XSAVEC
5443 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5444 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5445 if (processor_alias_table[i].flags & PTA_XSAVES
5446 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5447 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5448 if (processor_alias_table[i].flags & PTA_AVX512DQ
5449 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5450 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5451 if (processor_alias_table[i].flags & PTA_AVX512BW
5452 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5453 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5454 if (processor_alias_table[i].flags & PTA_AVX512VL
5455 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5456 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5457 if (processor_alias_table[i].flags & PTA_MPX
5458 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5459 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5460 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5461 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5462 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5463 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5464 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5465 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5466 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5467 x86_prefetch_sse = true;
5468 if (processor_alias_table[i].flags & PTA_MWAITX
5469 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5470 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5471 if (processor_alias_table[i].flags & PTA_PKU
5472 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5473 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
5475 /* Don't enable x87 instructions if only
5476 general registers are allowed. */
5477 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
5478 && !(opts_set->x_target_flags & MASK_80387))
5480 if (processor_alias_table[i].flags & PTA_NO_80387)
5481 opts->x_target_flags &= ~MASK_80387;
5482 else
5483 opts->x_target_flags |= MASK_80387;
5485 break;
5488 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
5489 error ("Intel MPX does not support x32");
5494 if (i == pta_size)
5496 error (main_args_p
5497 ? "bad value (%qs) for %<-march=%> switch"
5498 : "bad value (%qs) for %<target(\"arch=\")%> attribute",
5499 opts->x_ix86_arch_string);
5501 auto_vec <const char *> candidates;
5502 for (i = 0; i < pta_size; i++)
5503 if (strcmp (processor_alias_table[i].name, "generic")
5504 && strcmp (processor_alias_table[i].name, "intel")
5505 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5506 || (processor_alias_table[i].flags & PTA_64BIT)))
5507 candidates.safe_push (processor_alias_table[i].name);
5509 char *s;
5510 const char *hint
5511 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
5512 if (hint)
5513 inform (input_location,
5514 main_args_p
5515 ? "valid arguments to %<-march=%> switch are: "
5516 "%s; did you mean %qs?"
5517 : "valid arguments to %<target(\"arch=\")%> attribute are: "
5518 "%s; did you mean %qs?", s, hint);
5519 else
5520 inform (input_location,
5521 main_args_p
5522 ? "valid arguments to %<-march=%> switch are: %s"
5523 : "valid arguments to %<target(\"arch=\")%> attribute are: %s",
5525 XDELETEVEC (s);
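/* Derive the per-architecture feature flags for the selected -march.  */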
5528 ix86_arch_mask = 1u << ix86_arch;
5529 for (i = 0; i < X86_ARCH_LAST; ++i)
5530 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
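/* Now resolve -mtune= against the same processor alias table.  */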
5532 for (i = 0; i < pta_size; i++)
5533 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
5535 ix86_schedule = processor_alias_table[i].schedule;
5536 ix86_tune = processor_alias_table[i].processor;
5537 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5539 if (!(processor_alias_table[i].flags & PTA_64BIT))
5541 if (ix86_tune_defaulted)
5543 opts->x_ix86_tune_string = "x86-64";
5544 for (i = 0; i < pta_size; i++)
5545 if (! strcmp (opts->x_ix86_tune_string,
5546 processor_alias_table[i].name))
5547 break;
5548 ix86_schedule = processor_alias_table[i].schedule;
5549 ix86_tune = processor_alias_table[i].processor;
5551 else
5552 error ("CPU you selected does not support x86-64 "
5553 "instruction set");
5556 /* Intel CPUs have always interpreted SSE prefetch instructions as
5557 NOPs; so, we can enable SSE prefetch instructions even when
5558 -mtune (rather than -march) points us to a processor that has them.
5559 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
5560 higher processors. */
5561 if (TARGET_CMOV
5562 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
5563 x86_prefetch_sse = true;
5564 break;
5567 if (ix86_tune_specified && i == pta_size)
5569 error (main_args_p
5570 ? "bad value (%qs) for %<-mtune=%> switch"
5571 : "bad value (%qs) for %<target(\"tune=\")%> attribute",
5572 opts->x_ix86_tune_string);
5574 auto_vec <const char *> candidates;
5575 for (i = 0; i < pta_size; i++)
5576 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5577 || (processor_alias_table[i].flags & PTA_64BIT))
5578 candidates.safe_push (processor_alias_table[i].name);
5580 char *s;
5581 const char *hint
5582 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
5583 if (hint)
5584 inform (input_location,
5585 main_args_p
5586 ? "valid arguments to %<-mtune=%> switch are: "
5587 "%s; did you mean %qs?"
5588 : "valid arguments to %<target(\"tune=\")%> attribute are: "
5589 "%s; did you mean %qs?", s, hint);
5590 else
5591 inform (input_location,
5592 main_args_p
5593 ? "valid arguments to %<-mtune=%> switch are: %s"
5594 : "valid arguments to %<target(\"tune=\")%> attribute are: %s",
5596 XDELETEVEC (s);
5599 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
5601 #ifndef USE_IX86_FRAME_POINTER
5602 #define USE_IX86_FRAME_POINTER 0
5603 #endif
5605 #ifndef USE_X86_64_FRAME_POINTER
5606 #define USE_X86_64_FRAME_POINTER 0
5607 #endif
5609 /* Set the default values for switches whose default depends on TARGET_64BIT
5610 in case they weren't overwritten by command line options. */
5611 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5613 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5614 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
5615 if (opts->x_flag_asynchronous_unwind_tables
5616 && !opts_set->x_flag_unwind_tables
5617 && TARGET_64BIT_MS_ABI)
5618 opts->x_flag_unwind_tables = 1;
5619 if (opts->x_flag_asynchronous_unwind_tables == 2)
5620 opts->x_flag_unwind_tables
5621 = opts->x_flag_asynchronous_unwind_tables = 1;
5622 if (opts->x_flag_pcc_struct_return == 2)
5623 opts->x_flag_pcc_struct_return = 0;
5625 else
5627 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5628 opts->x_flag_omit_frame_pointer
5629 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
5630 if (opts->x_flag_asynchronous_unwind_tables == 2)
5631 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
5632 if (opts->x_flag_pcc_struct_return == 2)
5634 /* Intel MCU psABI specifies that -freg-struct-return should
5635 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
5636 we check -miamcu so that -freg-struct-return is always
5637 turned on if -miamcu is used. */
5638 if (TARGET_IAMCU_P (opts->x_target_flags))
5639 opts->x_flag_pcc_struct_return = 0;
5640 else
5641 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
5645 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5646 /* TODO: ix86_cost should be chosen at instruction or function granularity
5647 so for cold code we use size_cost even in !optimize_size compilation. */
5648 if (opts->x_optimize_size)
5649 ix86_cost = &ix86_size_cost;
5650 else
5651 ix86_cost = ix86_tune_cost;
5653 /* Arrange to set up i386_stack_locals for all functions. */
5654 init_machine_status = ix86_init_machine_status;
5656 /* Validate -mregparm= value. */
5657 if (opts_set->x_ix86_regparm)
5659 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5660 warning (0, "-mregparm is ignored in 64-bit mode");
5661 else if (TARGET_IAMCU_P (opts->x_target_flags))
5662 warning (0, "-mregparm is ignored for Intel MCU psABI");
5663 if (opts->x_ix86_regparm > REGPARM_MAX)
5665 error ("-mregparm=%d is not between 0 and %d",
5666 opts->x_ix86_regparm, REGPARM_MAX);
5667 opts->x_ix86_regparm = 0;
5670 if (TARGET_IAMCU_P (opts->x_target_flags)
5671 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
5672 opts->x_ix86_regparm = REGPARM_MAX;
5674 /* Default align_* from the processor table. */
5675 ix86_default_align (opts);
5677 /* Provide default for -mbranch-cost= value. */
5678 if (!opts_set->x_ix86_branch_cost)
5679 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
5681 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5683 opts->x_target_flags
5684 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
5686 /* Enable by default the SSE and MMX builtins. Do allow the user to
5687 explicitly disable any of these. In particular, disabling SSE and
5688 MMX for kernel code is extremely useful. */
5689 if (!ix86_arch_specified)
5690 opts->x_ix86_isa_flags
5691 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
5692 | TARGET_SUBTARGET64_ISA_DEFAULT)
5693 & ~opts->x_ix86_isa_flags_explicit);
5695 if (TARGET_RTD_P (opts->x_target_flags))
5696 warning (0,
5697 main_args_p ? "%<-mrtd%> is ignored in 64bit mode"
5698 : "%<target(\"rtd\")%> is ignored in 64bit mode");
5700 else
5702 opts->x_target_flags
5703 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
5705 if (!ix86_arch_specified)
5706 opts->x_ix86_isa_flags
5707 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
5709 /* The i386 ABI does not specify a red zone. It still makes sense to use one
5710 when the programmer takes care to keep the stack from being destroyed. */
5711 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
5712 opts->x_target_flags |= MASK_NO_RED_ZONE;
5715 /* Keep nonleaf frame pointers. */
5716 if (opts->x_flag_omit_frame_pointer)
5717 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
5718 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
5719 opts->x_flag_omit_frame_pointer = 1;
5721 /* If we're doing fast math, we don't care about comparison order
5722 wrt NaNs. This lets us use a shorter comparison sequence. */
5723 if (opts->x_flag_finite_math_only)
5724 opts->x_target_flags &= ~MASK_IEEE_FP;
5726 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
5727 since the insns won't need emulation. */
5728 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
5729 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
5731 /* Likewise, if the target doesn't have a 387, or we've specified
5732 software floating point, don't use 387 inline intrinsics. */
5733 if (!TARGET_80387_P (opts->x_target_flags))
5734 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
5736 /* Turn on MMX builtins for -msse. */
5737 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
5738 opts->x_ix86_isa_flags
5739 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
5741 /* Enable SSE prefetch. */
5742 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
5743 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
5744 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
5745 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
5746 x86_prefetch_sse = true;
5748 /* Enable popcnt instruction for -msse4.2 or -mabm. */
5749 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
5750 || TARGET_ABM_P (opts->x_ix86_isa_flags))
5751 opts->x_ix86_isa_flags
5752 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
5754 /* Enable lzcnt instruction for -mabm. */
5755 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
5756 opts->x_ix86_isa_flags
5757 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
5759 /* Validate -mpreferred-stack-boundary= value or default it to
5760 PREFERRED_STACK_BOUNDARY_DEFAULT. */
5761 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
5762 if (opts_set->x_ix86_preferred_stack_boundary_arg)
5764 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5765 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
5766 int max = (TARGET_SEH ? 4 : 12);
5768 if (opts->x_ix86_preferred_stack_boundary_arg < min
5769 || opts->x_ix86_preferred_stack_boundary_arg > max)
5771 if (min == max)
5772 error ("-mpreferred-stack-boundary is not supported "
5773 "for this target");
5774 else
5775 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
5776 opts->x_ix86_preferred_stack_boundary_arg, min, max);
5778 else
5779 ix86_preferred_stack_boundary
5780 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
5783 /* Set the default value for -mstackrealign. */
5784 if (opts->x_ix86_force_align_arg_pointer == -1)
5785 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
5787 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
5789 /* Validate -mincoming-stack-boundary= value or default it to
5790 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
5791 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
5792 if (opts_set->x_ix86_incoming_stack_boundary_arg)
5794 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
5796 if (opts->x_ix86_incoming_stack_boundary_arg < min
5797 || opts->x_ix86_incoming_stack_boundary_arg > 12)
5798 error ("-mincoming-stack-boundary=%d is not between %d and 12",
5799 opts->x_ix86_incoming_stack_boundary_arg, min);
5800 else
5802 ix86_user_incoming_stack_boundary
5803 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
5804 ix86_incoming_stack_boundary
5805 = ix86_user_incoming_stack_boundary;
5809 #ifndef NO_PROFILE_COUNTERS
5810 if (flag_nop_mcount)
5811 error ("-mnop-mcount is not compatible with this target");
5812 #endif
5813 if (flag_nop_mcount && flag_pic)
5814 error ("-mnop-mcount is not implemented for -fPIC");
5816 /* Accept -msseregparm only if at least SSE support is enabled. */
5817 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
5818 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
5819 error (main_args_p
5820 ? "%<-msseregparm%> used without SSE enabled"
5821 : "%<target(\"sseregparm\")%> used without SSE enabled");
5823 if (opts_set->x_ix86_fpmath)
5825 if (opts->x_ix86_fpmath & FPMATH_SSE)
5827 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
5829 if (TARGET_80387_P (opts->x_target_flags))
5831 warning (0, "SSE instruction set disabled, using 387 arithmetics");
5832 opts->x_ix86_fpmath = FPMATH_387;
5835 else if ((opts->x_ix86_fpmath & FPMATH_387)
5836 && !TARGET_80387_P (opts->x_target_flags))
5838 warning (0, "387 instruction set disabled, using SSE arithmetics");
5839 opts->x_ix86_fpmath = FPMATH_SSE;
5843 /* For all chips supporting SSE2, -mfpmath=sse performs better than
5844 -mfpmath=387. The latter is nevertheless the default on many targets,
5845 since the extra 80-bit precision of temporaries is considered part of the ABI.
5846 Override the default at least for -ffast-math.
5847 TODO: -mfpmath=both seems to produce similarly performing code with
5848 slightly smaller binaries. It is however not clear if register allocation
5849 is ready for this setting.
5850 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
5851 codegen. We may switch to 387 with -ffast-math for size-optimized
5852 functions. */
5853 else if (fast_math_flags_set_p (&global_options)
5854 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
5855 opts->x_ix86_fpmath = FPMATH_SSE;
5856 else
5857 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
5859 /* Use an external vectorized library when vectorizing intrinsics. */
5860 if (opts_set->x_ix86_veclibabi_type)
5861 switch (opts->x_ix86_veclibabi_type)
5863 case ix86_veclibabi_type_svml:
5864 ix86_veclib_handler = ix86_veclibabi_svml;
5865 break;
5867 case ix86_veclibabi_type_acml:
5868 ix86_veclib_handler = ix86_veclibabi_acml;
5869 break;
5871 default:
5872 gcc_unreachable ();
5875 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
5876 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5877 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5879 /* If stack probes are required, the space used for large function
5880 arguments on the stack must also be probed, so enable
5881 -maccumulate-outgoing-args so this happens in the prologue. */
5882 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
5883 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5885 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5886 warning (0,
5887 main_args_p
5888 ? "stack probing requires %<-maccumulate-outgoing-args%> "
5889 "for correctness"
5890 : "stack probing requires "
5891 "%<target(\"accumulate-outgoing-args\")%> for correctness");
5892 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5895 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
5896 so enable -maccumulate-outgoing-args when %ebp is fixed. */
5897 if (fixed_regs[BP_REG]
5898 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5900 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5901 warning (0,
5902 main_args_p
5903 ? "fixed ebp register requires %<-maccumulate-outgoing-args%>"
5904 : "fixed ebp register requires "
5905 "%<target(\"accumulate-outgoing-args\")%>");
5906 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5909 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
5911 char *p;
5912 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
5913 p = strchr (internal_label_prefix, 'X');
5914 internal_label_prefix_len = p - internal_label_prefix;
5915 *p = '\0';
5918 /* When no scheduling description is available, disable the scheduler pass
5919 so it won't slow down compilation or make x87 code slower. */
5920 if (!TARGET_SCHEDULE)
5921 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
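/* Seed the prefetch and cache-size params from the tuning cost tables.  */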
5923 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
5924 ix86_tune_cost->simultaneous_prefetches,
5925 opts->x_param_values,
5926 opts_set->x_param_values);
5927 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
5928 ix86_tune_cost->prefetch_block,
5929 opts->x_param_values,
5930 opts_set->x_param_values);
5931 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
5932 ix86_tune_cost->l1_cache_size,
5933 opts->x_param_values,
5934 opts_set->x_param_values);
5935 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
5936 ix86_tune_cost->l2_cache_size,
5937 opts->x_param_values,
5938 opts_set->x_param_values);
5940 /* Restrict number of if-converted SET insns to 1. */
5941 if (TARGET_ONE_IF_CONV_INSN)
5942 maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
5944 opts->x_param_values,
5945 opts_set->x_param_values);
5947 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
5948 if (opts->x_flag_prefetch_loop_arrays < 0
5949 && HAVE_prefetch
5950 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
5951 && !opts->x_optimize_size
5952 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
5953 opts->x_flag_prefetch_loop_arrays = 1;
5955 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
5956 can be optimized to ap = __builtin_next_arg (0). */
5957 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
5958 targetm.expand_builtin_va_start = NULL;
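/* Pick the Pmode-specific variants of the RTL generator functions
   (stack adjustment and probing, TLS, monitor/clzero, ...) used in the
   rest of this file.  */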
5960 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5962 ix86_gen_leave = gen_leave_rex64;
5963 if (Pmode == DImode)
5965 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
5966 ix86_gen_tls_local_dynamic_base_64
5967 = gen_tls_local_dynamic_base_64_di;
5969 else
5971 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
5972 ix86_gen_tls_local_dynamic_base_64
5973 = gen_tls_local_dynamic_base_64_si;
5976 else
5977 ix86_gen_leave = gen_leave;
5979 if (Pmode == DImode)
5981 ix86_gen_add3 = gen_adddi3;
5982 ix86_gen_sub3 = gen_subdi3;
5983 ix86_gen_sub3_carry = gen_subdi3_carry;
5984 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
5985 ix86_gen_andsp = gen_anddi3;
5986 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
5987 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
5988 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
5989 ix86_gen_monitor = gen_sse3_monitor_di;
5990 ix86_gen_monitorx = gen_monitorx_di;
5991 ix86_gen_clzero = gen_clzero_di;
5993 else
5995 ix86_gen_add3 = gen_addsi3;
5996 ix86_gen_sub3 = gen_subsi3;
5997 ix86_gen_sub3_carry = gen_subsi3_carry;
5998 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
5999 ix86_gen_andsp = gen_andsi3;
6000 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6001 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6002 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6003 ix86_gen_monitor = gen_sse3_monitor_si;
6004 ix86_gen_monitorx = gen_monitorx_si;
6005 ix86_gen_clzero = gen_clzero_si;
6008 #ifdef USE_IX86_CLD
6009 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6010 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6011 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6012 #endif
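/* Decide whether profiling should use __fentry__: it is not supported
   for 32-bit PIC code, it is required for SEH targets, and otherwise
   the default follows PROFILE_BEFORE_PROLOGUE.  */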
6014 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6016 if (opts->x_flag_fentry > 0)
6017 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6018 "with -fpic");
6019 opts->x_flag_fentry = 0;
6021 else if (TARGET_SEH)
6023 if (opts->x_flag_fentry == 0)
6024 sorry ("-mno-fentry isn%'t compatible with SEH");
6025 opts->x_flag_fentry = 1;
6027 else if (opts->x_flag_fentry < 0)
6029 #if defined(PROFILE_BEFORE_PROLOGUE)
6030 opts->x_flag_fentry = 1;
6031 #else
6032 opts->x_flag_fentry = 0;
6033 #endif
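/* Enable vzeroupper insertion and the scalar-to-vector (STV) pass by
   default unless the user specified them explicitly.  */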
6036 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6037 opts->x_target_flags |= MASK_VZEROUPPER;
6038 if (!(opts_set->x_target_flags & MASK_STV))
6039 opts->x_target_flags |= MASK_STV;
6040 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6041 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
6042 stack realignment would be an extra cost the pass doesn't take into
6043 account, and the pass can't realign the stack itself. */
6044 if (ix86_preferred_stack_boundary < 128
6045 || ix86_incoming_stack_boundary < 128
6046 || opts->x_ix86_force_align_arg_pointer)
6047 opts->x_target_flags &= ~MASK_STV;
6048 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6049 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6050 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6051 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6052 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6053 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6054 /* Enable 128-bit AVX instruction generation
6055 for the auto-vectorizer. */
6056 if (TARGET_AVX128_OPTIMAL
6057 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6058 opts->x_target_flags |= MASK_PREFER_AVX128;
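/* Parse the comma-separated -mrecip= string; a token prefixed with '!'
   clears the corresponding mask bits instead of setting them.  */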
6060 if (opts->x_ix86_recip_name)
6062 char *p = ASTRDUP (opts->x_ix86_recip_name);
6063 char *q;
6064 unsigned int mask, i;
6065 bool invert;
6067 while ((q = strtok (p, ",")) != NULL)
6069 p = NULL;
6070 if (*q == '!')
6072 invert = true;
6073 q++;
6075 else
6076 invert = false;
6078 if (!strcmp (q, "default"))
6079 mask = RECIP_MASK_ALL;
6080 else
6082 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6083 if (!strcmp (q, recip_options[i].string))
6085 mask = recip_options[i].mask;
6086 break;
6089 if (i == ARRAY_SIZE (recip_options))
6091 error ("unknown option for -mrecip=%s", q);
6092 invert = false;
6093 mask = RECIP_MASK_NONE;
6097 opts->x_recip_mask_explicit |= mask;
6098 if (invert)
6099 opts->x_recip_mask &= ~mask;
6100 else
6101 opts->x_recip_mask |= mask;
6105 if (TARGET_RECIP_P (opts->x_target_flags))
6106 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6107 else if (opts_set->x_target_flags & MASK_RECIP)
6108 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6110 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6111 for 64-bit Bionic. Also default long double to 64-bit for Intel
6112 MCU psABI. */
6113 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6114 && !(opts_set->x_target_flags
6115 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6116 opts->x_target_flags |= (TARGET_64BIT
6117 ? MASK_LONG_DOUBLE_128
6118 : MASK_LONG_DOUBLE_64);
6120 /* Only one of them can be active. */
6121 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6122 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6124 /* Save the initial options in case the user uses function-specific
6125 options later. */
6126 if (main_args_p)
6127 target_option_default_node = target_option_current_node
6128 = build_target_option_node (opts);
6130 /* Handle stack protector */
6131 if (!opts_set->x_ix86_stack_protector_guard)
6132 opts->x_ix86_stack_protector_guard
6133 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6135 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6136 if (opts->x_ix86_tune_memcpy_strategy)
6138 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6139 ix86_parse_stringop_strategy_string (str, false);
6140 free (str);
6143 if (opts->x_ix86_tune_memset_strategy)
6145 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6146 ix86_parse_stringop_strategy_string (str, true);
6147 free (str);
6150 return true;
6153 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6155 static void
6156 ix86_option_override (void)
6158 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
6159 struct register_pass_info insert_vzeroupper_info
6160 = { pass_insert_vzeroupper, "reload",
6161 1, PASS_POS_INSERT_AFTER
6163 opt_pass *pass_stv = make_pass_stv (g);
6164 struct register_pass_info stv_info_dimode
6165 = { pass_stv, "combine",
6166 1, PASS_POS_INSERT_AFTER
6168 /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
6169 CONSTM1_RTX generated by the STV pass can be CSEed. */
6170 struct register_pass_info stv_info_timode
6171 = { pass_stv, "cse2",
6172 1, PASS_POS_INSERT_BEFORE
6175 ix86_option_override_internal (true, &global_options, &global_options_set);
6178 /* This needs to be done at start up. It's convenient to do it here. */
6179 register_pass (&insert_vzeroupper_info);
6180 register_pass (TARGET_64BIT ? &stv_info_timode : &stv_info_dimode);
6183 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6184 static char *
6185 ix86_offload_options (void)
6187 if (TARGET_LP64)
6188 return xstrdup ("-foffload-abi=lp64");
6189 return xstrdup ("-foffload-abi=ilp32");
6192 /* Update register usage after having seen the compiler flags. */
6194 static void
6195 ix86_conditional_register_usage (void)
6197 int i, c_mask;
6199 /* If there are no caller-saved registers, preserve all registers,
6200 except fixed_regs and registers used for the function return value,
6201 since aggregate_value_p checks call_used_regs[regno] on the return
6202 value. */
6203 if (cfun && cfun->machine->no_caller_saved_registers)
6204 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6205 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6206 call_used_regs[i] = 0;
6208 /* For 32-bit targets, squash the REX registers. */
6209 if (! TARGET_64BIT)
6211 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6212 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6213 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6214 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6215 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6216 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6219 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6220 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6222 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6224 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6226 /* Set/reset conditionally defined registers from
6227 CALL_USED_REGISTERS initializer. */
6228 if (call_used_regs[i] > 1)
6229 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6231 /* Calculate the CLOBBERED_REGS register set as the call-used
6232 registers of the GENERAL_REGS register set. */
6233 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6234 && call_used_regs[i])
6235 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6238 /* If MMX is disabled, squash the registers. */
6239 if (! TARGET_MMX)
6240 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6241 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6242 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6244 /* If SSE is disabled, squash the registers. */
6245 if (! TARGET_SSE)
6246 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6247 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6248 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6250 /* If the FPU is disabled, squash the registers. */
6251 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6252 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6253 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6254 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6256 /* If AVX512F is disabled, squash the registers. */
6257 if (! TARGET_AVX512F)
6259 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6260 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6262 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6263 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6266 /* If MPX is disabled, squash the registers. */
6267 if (! TARGET_MPX)
6268 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6269 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6273 /* Save the current options */
6275 static void
6276 ix86_function_specific_save (struct cl_target_option *ptr,
6277 struct gcc_options *opts)
6279 ptr->arch = ix86_arch;
6280 ptr->schedule = ix86_schedule;
6281 ptr->prefetch_sse = x86_prefetch_sse;
6282 ptr->tune = ix86_tune;
6283 ptr->branch_cost = ix86_branch_cost;
6284 ptr->tune_defaulted = ix86_tune_defaulted;
6285 ptr->arch_specified = ix86_arch_specified;
6286 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6287 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6288 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6289 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6290 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6291 ptr->x_ix86_abi = opts->x_ix86_abi;
6292 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6293 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6294 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6295 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6296 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6297 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6298 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6299 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6300 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6301 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6302 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6303 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6304 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6305 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6306 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6307 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6308 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6309 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6310 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6311 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6313 /* The fields are char but the variables are not; make sure the
6314 values fit in the fields. */
6315 gcc_assert (ptr->arch == ix86_arch);
6316 gcc_assert (ptr->schedule == ix86_schedule);
6317 gcc_assert (ptr->tune == ix86_tune);
6318 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6321 /* Restore the current options */
6323 static void
6324 ix86_function_specific_restore (struct gcc_options *opts,
6325 struct cl_target_option *ptr)
6327 enum processor_type old_tune = ix86_tune;
6328 enum processor_type old_arch = ix86_arch;
6329 unsigned int ix86_arch_mask;
6330 int i;
6332 /* We don't change -fPIC. */
6333 opts->x_flag_pic = flag_pic;
6335 ix86_arch = (enum processor_type) ptr->arch;
6336 ix86_schedule = (enum attr_cpu) ptr->schedule;
6337 ix86_tune = (enum processor_type) ptr->tune;
6338 x86_prefetch_sse = ptr->prefetch_sse;
6339 opts->x_ix86_branch_cost = ptr->branch_cost;
6340 ix86_tune_defaulted = ptr->tune_defaulted;
6341 ix86_arch_specified = ptr->arch_specified;
6342 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6343 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6344 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6345 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6346 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6347 opts->x_ix86_abi = ptr->x_ix86_abi;
6348 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6349 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6350 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6351 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6352 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6353 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6354 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6355 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6356 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6357 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6358 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6359 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6360 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6361 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6362 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6363 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6364 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6365 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6366 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6367 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6368 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6369 /* TODO: ix86_cost should be chosen at instruction or function granularity
6370 so for cold code we use size_cost even in !optimize_size compilation. */
6371 if (opts->x_optimize_size)
6372 ix86_cost = &ix86_size_cost;
6373 else
6374 ix86_cost = ix86_tune_cost;
6376 /* Recreate the arch feature tests if the arch changed */
6377 if (old_arch != ix86_arch)
6379 ix86_arch_mask = 1u << ix86_arch;
6380 for (i = 0; i < X86_ARCH_LAST; ++i)
6381 ix86_arch_features[i]
6382 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6385 /* Recreate the tune optimization tests */
6386 if (old_tune != ix86_tune)
6387 set_ix86_tune_features (ix86_tune, false);
6390 /* Adjust target options after streaming them in. This is mainly about
6391 reconciling them with global options. */
6393 static void
6394 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6396 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6397 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6398 for PIC, or error out. */
6399 if (flag_pic)
6400 switch (ptr->x_ix86_cmodel)
6402 case CM_SMALL:
6403 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6404 break;
6406 case CM_MEDIUM:
6407 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6408 break;
6410 case CM_LARGE:
6411 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6412 break;
6414 case CM_KERNEL:
6415 error ("code model %s does not support PIC mode", "kernel");
6416 break;
6418 default:
6419 break;
6421 else
6422 switch (ptr->x_ix86_cmodel)
6424 case CM_SMALL_PIC:
6425 ptr->x_ix86_cmodel = CM_SMALL;
6426 break;
6428 case CM_MEDIUM_PIC:
6429 ptr->x_ix86_cmodel = CM_MEDIUM;
6430 break;
6432 case CM_LARGE_PIC:
6433 ptr->x_ix86_cmodel = CM_LARGE;
6434 break;
6436 default:
6437 break;
6441 /* Print the current options */
6443 static void
6444 ix86_function_specific_print (FILE *file, int indent,
6445 struct cl_target_option *ptr)
6447 char *target_string
6448 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
6449 ptr->x_ix86_target_flags, NULL, NULL,
6450 ptr->x_ix86_fpmath, false);
6452 gcc_assert (ptr->arch < PROCESSOR_max);
6453 fprintf (file, "%*sarch = %d (%s)\n",
6454 indent, "",
6455 ptr->arch, processor_target_table[ptr->arch].name);
6457 gcc_assert (ptr->tune < PROCESSOR_max);
6458 fprintf (file, "%*stune = %d (%s)\n",
6459 indent, "",
6460 ptr->tune, processor_target_table[ptr->tune].name);
6462 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6464 if (target_string)
6466 fprintf (file, "%*s%s\n", indent, "", target_string);
6467 free (target_string);
6472 /* Inner function to process the attribute((target(...))), take an argument and
6473 set the current options from the argument. If we have a list, recursively go
6474 over the list. */
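/* Illustrative only (hypothetical user code, not part of this file): a
   declaration such as
     int foo (int) __attribute__((target("avx2,no-sse4a,fpmath=sse")));
   arrives here as a STRING_CST (or a TREE_LIST of them); each
   comma-separated token is looked up in the attrs[] table below, and a
   leading "no-" clears the option instead of setting it. */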
6476 static bool
6477 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6478 struct gcc_options *opts,
6479 struct gcc_options *opts_set,
6480 struct gcc_options *enum_opts_set)
6482 char *next_optstr;
6483 bool ret = true;
6485 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6486 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6487 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6488 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6489 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6491 enum ix86_opt_type
6493 ix86_opt_unknown,
6494 ix86_opt_yes,
6495 ix86_opt_no,
6496 ix86_opt_str,
6497 ix86_opt_enum,
6498 ix86_opt_isa
6501 static const struct
6503 const char *string;
6504 size_t len;
6505 enum ix86_opt_type type;
6506 int opt;
6507 int mask;
6508 } attrs[] = {
6509 /* isa options */
6510 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
6511 IX86_ATTR_ISA ("abm", OPT_mabm),
6512 IX86_ATTR_ISA ("bmi", OPT_mbmi),
6513 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
6514 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
6515 IX86_ATTR_ISA ("tbm", OPT_mtbm),
6516 IX86_ATTR_ISA ("aes", OPT_maes),
6517 IX86_ATTR_ISA ("sha", OPT_msha),
6518 IX86_ATTR_ISA ("avx", OPT_mavx),
6519 IX86_ATTR_ISA ("avx2", OPT_mavx2),
6520 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
6521 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
6522 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
6523 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
6524 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
6525 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
6526 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
6527 IX86_ATTR_ISA ("mmx", OPT_mmmx),
6528 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
6529 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
6530 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
6531 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
6532 IX86_ATTR_ISA ("sse", OPT_msse),
6533 IX86_ATTR_ISA ("sse2", OPT_msse2),
6534 IX86_ATTR_ISA ("sse3", OPT_msse3),
6535 IX86_ATTR_ISA ("sse4", OPT_msse4),
6536 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
6537 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
6538 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
6539 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
6540 IX86_ATTR_ISA ("fma4", OPT_mfma4),
6541 IX86_ATTR_ISA ("fma", OPT_mfma),
6542 IX86_ATTR_ISA ("xop", OPT_mxop),
6543 IX86_ATTR_ISA ("lwp", OPT_mlwp),
6544 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
6545 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
6546 IX86_ATTR_ISA ("f16c", OPT_mf16c),
6547 IX86_ATTR_ISA ("rtm", OPT_mrtm),
6548 IX86_ATTR_ISA ("hle", OPT_mhle),
6549 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
6550 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
6551 IX86_ATTR_ISA ("adx", OPT_madx),
6552 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
6553 IX86_ATTR_ISA ("xsave", OPT_mxsave),
6554 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
6555 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
6556 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
6557 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
6558 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
6559 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
6560 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
6561 IX86_ATTR_ISA ("clwb", OPT_mclwb),
6562 IX86_ATTR_ISA ("pcommit", OPT_mpcommit),
6563 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
6564 IX86_ATTR_ISA ("clzero", OPT_mclzero),
6565 IX86_ATTR_ISA ("pku", OPT_mpku),
6567 /* enum options */
6568 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
6570 /* string options */
6571 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
6572 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
6574 /* flag options */
6575 IX86_ATTR_YES ("cld",
6576 OPT_mcld,
6577 MASK_CLD),
6579 IX86_ATTR_NO ("fancy-math-387",
6580 OPT_mfancy_math_387,
6581 MASK_NO_FANCY_MATH_387),
6583 IX86_ATTR_YES ("ieee-fp",
6584 OPT_mieee_fp,
6585 MASK_IEEE_FP),
6587 IX86_ATTR_YES ("inline-all-stringops",
6588 OPT_minline_all_stringops,
6589 MASK_INLINE_ALL_STRINGOPS),
6591 IX86_ATTR_YES ("inline-stringops-dynamically",
6592 OPT_minline_stringops_dynamically,
6593 MASK_INLINE_STRINGOPS_DYNAMICALLY),
6595 IX86_ATTR_NO ("align-stringops",
6596 OPT_mno_align_stringops,
6597 MASK_NO_ALIGN_STRINGOPS),
6599 IX86_ATTR_YES ("recip",
6600 OPT_mrecip,
6601 MASK_RECIP),
6605 /* If this is a list, recurse to get the options. */
6606 if (TREE_CODE (args) == TREE_LIST)
6608 bool ret = true;
6610 for (; args; args = TREE_CHAIN (args))
6611 if (TREE_VALUE (args)
6612 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
6613 p_strings, opts, opts_set,
6614 enum_opts_set))
6615 ret = false;
6617 return ret;
6620 else if (TREE_CODE (args) != STRING_CST)
6622 error ("attribute %<target%> argument not a string");
6623 return false;
6626 /* Handle multiple arguments separated by commas. */
6627 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
6629 while (next_optstr && *next_optstr != '\0')
6631 char *p = next_optstr;
6632 char *orig_p = p;
6633 char *comma = strchr (next_optstr, ',');
6634 const char *opt_string;
6635 size_t len, opt_len;
6636 int opt;
6637 bool opt_set_p;
6638 char ch;
6639 unsigned i;
6640 enum ix86_opt_type type = ix86_opt_unknown;
6641 int mask = 0;
6643 if (comma)
6645 *comma = '\0';
6646 len = comma - next_optstr;
6647 next_optstr = comma + 1;
6649 else
6651 len = strlen (p);
6652 next_optstr = NULL;
6655 /* Recognize no-xxx. */
6656 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
6658 opt_set_p = false;
6659 p += 3;
6660 len -= 3;
6662 else
6663 opt_set_p = true;
6665 /* Find the option. */
6666 ch = *p;
6667 opt = N_OPTS;
6668 for (i = 0; i < ARRAY_SIZE (attrs); i++)
6670 type = attrs[i].type;
6671 opt_len = attrs[i].len;
6672 if (ch == attrs[i].string[0]
6673 && ((type != ix86_opt_str && type != ix86_opt_enum)
6674 ? len == opt_len
6675 : len > opt_len)
6676 && memcmp (p, attrs[i].string, opt_len) == 0)
6678 opt = attrs[i].opt;
6679 mask = attrs[i].mask;
6680 opt_string = attrs[i].string;
6681 break;
6685 /* Process the option. */
6686 if (opt == N_OPTS)
6688 error ("attribute(target(\"%s\")) is unknown", orig_p);
6689 ret = false;
6692 else if (type == ix86_opt_isa)
6694 struct cl_decoded_option decoded;
6696 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
6697 ix86_handle_option (opts, opts_set,
6698 &decoded, input_location);
6701 else if (type == ix86_opt_yes || type == ix86_opt_no)
6703 if (type == ix86_opt_no)
6704 opt_set_p = !opt_set_p;
6706 if (opt_set_p)
6707 opts->x_target_flags |= mask;
6708 else
6709 opts->x_target_flags &= ~mask;
6712 else if (type == ix86_opt_str)
6714 if (p_strings[opt])
6716 error ("option(\"%s\") was already specified", opt_string);
6717 ret = false;
6719 else
6720 p_strings[opt] = xstrdup (p + opt_len);
6723 else if (type == ix86_opt_enum)
6725 bool arg_ok;
6726 int value;
6728 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
6729 if (arg_ok)
6730 set_option (opts, enum_opts_set, opt, value,
6731 p + opt_len, DK_UNSPECIFIED, input_location,
6732 global_dc);
6733 else
6735 error ("attribute(target(\"%s\")) is unknown", orig_p);
6736 ret = false;
6740 else
6741 gcc_unreachable ();
6744 return ret;
6747 /* Release allocated strings. */
6748 static void
6749 release_options_strings (char **option_strings)
6751 /* Free up memory allocated to hold the strings */
6752 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
6753 free (option_strings[i]);
6756 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
6758 tree
6759 ix86_valid_target_attribute_tree (tree args,
6760 struct gcc_options *opts,
6761 struct gcc_options *opts_set)
6763 const char *orig_arch_string = opts->x_ix86_arch_string;
6764 const char *orig_tune_string = opts->x_ix86_tune_string;
6765 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
6766 int orig_tune_defaulted = ix86_tune_defaulted;
6767 int orig_arch_specified = ix86_arch_specified;
6768 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
6769 tree t = NULL_TREE;
6770 struct cl_target_option *def
6771 = TREE_TARGET_OPTION (target_option_default_node);
6772 struct gcc_options enum_opts_set;
6774 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
6776 /* Process each of the options on the chain. */
6777 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
6778 opts_set, &enum_opts_set))
6779 return error_mark_node;
6781 /* If the changed options are different from the default, rerun
6782 ix86_option_override_internal, and then save the options away.
6783 The string options are attribute options, and will be undone
6784 when we copy the save structure. */
6785 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
6786 || opts->x_target_flags != def->x_target_flags
6787 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
6788 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
6789 || enum_opts_set.x_ix86_fpmath)
6791 /* If we are using the default tune= or arch=, undo the string assigned,
6792 and use the default. */
6793 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
6795 opts->x_ix86_arch_string
6796 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
6798 /* If arch= is set, clear all bits in x_ix86_isa_flags,
6799 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
6800 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
6801 | OPTION_MASK_ABI_64
6802 | OPTION_MASK_ABI_X32
6803 | OPTION_MASK_CODE16);
6806 else if (!orig_arch_specified)
6807 opts->x_ix86_arch_string = NULL;
6809 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
6810 opts->x_ix86_tune_string
6811 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
6812 else if (orig_tune_defaulted)
6813 opts->x_ix86_tune_string = NULL;
6815 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
6816 if (enum_opts_set.x_ix86_fpmath)
6817 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6818 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6819 && TARGET_SSE_P (opts->x_ix86_isa_flags))
6821 if (TARGET_80387_P (opts->x_target_flags))
6822 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
6823 | FPMATH_387);
6824 else
6825 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
6826 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6829 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
6830 bool r = ix86_option_override_internal (false, opts, opts_set);
6831 if (!r)
6833 release_options_strings (option_strings);
6834 return error_mark_node;
6837 /* Add any builtin functions with the new isa if any. */
6838 ix86_add_new_builtins (opts->x_ix86_isa_flags);
6840 /* Save the current options unless we are validating options for
6841 #pragma. */
6842 t = build_target_option_node (opts);
6844 opts->x_ix86_arch_string = orig_arch_string;
6845 opts->x_ix86_tune_string = orig_tune_string;
6846 opts_set->x_ix86_fpmath = orig_fpmath_set;
6848 release_options_strings (option_strings);
6851 return t;
6854 /* Hook to validate attribute((target("string"))). */
6856 static bool
6857 ix86_valid_target_attribute_p (tree fndecl,
6858 tree ARG_UNUSED (name),
6859 tree args,
6860 int ARG_UNUSED (flags))
6862 struct gcc_options func_options;
6863 tree new_target, new_optimize;
6864 bool ret = true;
6866 /* attribute((target("default"))) does nothing, beyond
6867 affecting multi-versioning. */
6868 if (TREE_VALUE (args)
6869 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
6870 && TREE_CHAIN (args) == NULL_TREE
6871 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
6872 return true;
6874 tree old_optimize = build_optimization_node (&global_options);
6876 /* Get the optimization options of the current function. */
6877 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
6879 if (!func_optimize)
6880 func_optimize = old_optimize;
6882 /* Init func_options. */
6883 memset (&func_options, 0, sizeof (func_options));
6884 init_options_struct (&func_options, NULL);
6885 lang_hooks.init_options_struct (&func_options);
6887 cl_optimization_restore (&func_options,
6888 TREE_OPTIMIZATION (func_optimize));
6890 /* Initialize func_options to the default before its target options can
6891 be set. */
6892 cl_target_option_restore (&func_options,
6893 TREE_TARGET_OPTION (target_option_default_node));
6895 new_target = ix86_valid_target_attribute_tree (args, &func_options,
6896 &global_options_set);
6898 new_optimize = build_optimization_node (&func_options);
6900 if (new_target == error_mark_node)
6901 ret = false;
6903 else if (fndecl && new_target)
6905 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
6907 if (old_optimize != new_optimize)
6908 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
6911 finalize_options_struct (&func_options);
6913 return ret;
6917 /* Hook to determine if one function can safely inline another. */
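/* For example (hypothetical): a caller compiled with target("avx2") may
   inline a callee marked target("sse4.2"), because the callee's ISA bits
   are a subset of the caller's, but not the other way around. */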
6919 static bool
6920 ix86_can_inline_p (tree caller, tree callee)
6922 bool ret = false;
6923 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
6924 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
6926 /* If callee has no option attributes, then it is ok to inline. */
6927 if (!callee_tree)
6928 ret = true;
6930 /* If caller has no option attributes, but callee does then it is not ok to
6931 inline. */
6932 else if (!caller_tree)
6933 ret = false;
6935 else
6937 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
6938 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
6940 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
6941 function can inline an SSE2 function but an SSE2 function can't inline
6942 an SSE4 function. */
6943 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
6944 != callee_opts->x_ix86_isa_flags)
6945 ret = false;
6947 /* See if we have the same non-isa options. */
6948 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
6949 ret = false;
6951 /* See if arch, tune, etc. are the same. */
6952 else if (caller_opts->arch != callee_opts->arch)
6953 ret = false;
6955 else if (caller_opts->tune != callee_opts->tune)
6956 ret = false;
6958 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
6959 ret = false;
6961 else if (caller_opts->branch_cost != callee_opts->branch_cost)
6962 ret = false;
6964 else
6965 ret = true;
6968 return ret;
6972 /* Remember the last target of ix86_set_current_function. */
6973 static GTY(()) tree ix86_previous_fndecl;
6975 /* Set target globals to the default (or current #pragma GCC target
6976 if active). Invalidate the ix86_previous_fndecl cache. */
6978 void
6979 ix86_reset_previous_fndecl (void)
6981 tree new_tree = target_option_current_node;
6982 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6983 if (TREE_TARGET_GLOBALS (new_tree))
6984 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6985 else if (new_tree == target_option_default_node)
6986 restore_target_globals (&default_target_globals);
6987 else
6988 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
6989 ix86_previous_fndecl = NULL_TREE;
6992 /* Set the func_type field from the function FNDECL. */
6994 static void
6995 ix86_set_func_type (tree fndecl)
6997 if (cfun->machine->func_type == TYPE_UNKNOWN)
6999 if (lookup_attribute ("interrupt",
7000 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7002 int nargs = 0;
7003 for (tree arg = DECL_ARGUMENTS (fndecl);
7004 arg;
7005 arg = TREE_CHAIN (arg))
7006 nargs++;
7007 cfun->machine->no_caller_saved_registers = true;
7008 cfun->machine->func_type
7009 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7011 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7013 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7014 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7015 sorry ("Only DWARF debug format is supported for interrupt "
7016 "service routine.");
7018 else
7020 cfun->machine->func_type = TYPE_NORMAL;
7021 if (lookup_attribute ("no_caller_saved_registers",
7022 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7023 cfun->machine->no_caller_saved_registers = true;
7028 /* Establish appropriate back-end context for processing the function
7029 FNDECL. The argument might be NULL to indicate processing at top
7030 level, outside of any function scope. */
7031 static void
7032 ix86_set_current_function (tree fndecl)
7034 /* Only change the context if the function changes. This hook is called
7035 several times in the course of compiling a function, and we don't want to
7036 slow things down too much or call target_reinit when it isn't safe. */
7037 if (fndecl == ix86_previous_fndecl)
7039 /* There may be 2 function bodies for the same function FNDECL,
7040 one is extern inline and one isn't. Call ix86_set_func_type
7041 to set the func_type field. */
7042 if (fndecl != NULL_TREE)
7043 ix86_set_func_type (fndecl);
7044 return;
7047 tree old_tree;
7048 if (ix86_previous_fndecl == NULL_TREE)
7049 old_tree = target_option_current_node;
7050 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7051 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7052 else
7053 old_tree = target_option_default_node;
7055 if (fndecl == NULL_TREE)
7057 if (old_tree != target_option_current_node)
7058 ix86_reset_previous_fndecl ();
7059 return;
7062 ix86_set_func_type (fndecl);
7064 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7065 if (new_tree == NULL_TREE)
7066 new_tree = target_option_default_node;
7068 if (old_tree != new_tree)
7070 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7071 if (TREE_TARGET_GLOBALS (new_tree))
7072 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7073 else if (new_tree == target_option_default_node)
7074 restore_target_globals (&default_target_globals);
7075 else
7076 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7078 ix86_previous_fndecl = fndecl;
7080 static bool prev_no_caller_saved_registers;
7082 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7083 Avoid expensive re-initialization of init_regs each time we switch
7084 function context. */
7085 if (TARGET_64BIT
7086 && (call_used_regs[SI_REG]
7087 == (cfun->machine->call_abi == MS_ABI)))
7088 reinit_regs ();
7089 /* Need to re-initialize init_regs if caller-saved registers are
7090 changed. */
7091 else if (prev_no_caller_saved_registers
7092 != cfun->machine->no_caller_saved_registers)
7093 reinit_regs ();
7095 if (cfun->machine->func_type != TYPE_NORMAL
7096 || cfun->machine->no_caller_saved_registers)
7098 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7099 may change processor state. */
7100 const char *isa;
7101 if (TARGET_MPX)
7102 isa = "MPX";
7103 else if (TARGET_SSE)
7104 isa = "SSE";
7105 else if (TARGET_MMX)
7106 isa = "MMX/3Dnow";
7107 else if (TARGET_80387)
7108 isa = "80387";
7109 else
7110 isa = NULL;
7111 if (isa != NULL)
7113 if (cfun->machine->func_type != TYPE_NORMAL)
7114 sorry ("%s instructions aren't allowed in %s service routine",
7115 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7116 ? "exception" : "interrupt"));
7117 else
7118 sorry ("%s instructions aren't allowed in function with "
7119 "no_caller_saved_registers attribute", isa);
7120 /* Don't issue the same error twice. */
7121 cfun->machine->func_type = TYPE_NORMAL;
7122 cfun->machine->no_caller_saved_registers = false;
7126 prev_no_caller_saved_registers
7127 = cfun->machine->no_caller_saved_registers;
7131 /* Return true if this goes in large data/bss. */
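/* Rough summary of the rule applied below: only -mcmodel=medium (PIC or
   not) uses large sections at all; global objects larger than
   -mlarge-data-threshold (ix86_section_threshold), or of unknown size,
   then go into .ldata/.lbss so the linker may place them outside the
   2 GiB small-data range. */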
7133 static bool
7134 ix86_in_large_data_p (tree exp)
7136 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7137 return false;
7139 if (exp == NULL_TREE)
7140 return false;
7142 /* Functions are never large data. */
7143 if (TREE_CODE (exp) == FUNCTION_DECL)
7144 return false;
7146 /* Automatic variables are never large data. */
7147 if (TREE_CODE (exp) == VAR_DECL && !is_global_var (exp))
7148 return false;
7150 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
7152 const char *section = DECL_SECTION_NAME (exp);
7153 if (strcmp (section, ".ldata") == 0
7154 || strcmp (section, ".lbss") == 0)
7155 return true;
7156 return false;
7158 else
7160 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7162 /* If this is an incomplete type with size 0, then we can't put it
7163 in data because it might be too big when completed. Also,
7164 int_size_in_bytes returns -1 if the size can vary or is larger than
7165 an integer, in which case it is also safer to assume that it goes in
7166 large data. */
7167 if (size <= 0 || size > ix86_section_threshold)
7168 return true;
7171 return false;
7174 /* i386-specific section flag to mark large sections. */
7175 #define SECTION_LARGE SECTION_MACH_DEP
7177 /* Switch to the appropriate section for output of DECL.
7178 DECL is either a `VAR_DECL' node or a constant of some sort.
7179 RELOC indicates whether forming the initial value of DECL requires
7180 link-time relocations. */
7182 ATTRIBUTE_UNUSED static section *
7183 x86_64_elf_select_section (tree decl, int reloc,
7184 unsigned HOST_WIDE_INT align)
7186 if (ix86_in_large_data_p (decl))
7188 const char *sname = NULL;
7189 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7190 switch (categorize_decl_for_section (decl, reloc))
7192 case SECCAT_DATA:
7193 sname = ".ldata";
7194 break;
7195 case SECCAT_DATA_REL:
7196 sname = ".ldata.rel";
7197 break;
7198 case SECCAT_DATA_REL_LOCAL:
7199 sname = ".ldata.rel.local";
7200 break;
7201 case SECCAT_DATA_REL_RO:
7202 sname = ".ldata.rel.ro";
7203 break;
7204 case SECCAT_DATA_REL_RO_LOCAL:
7205 sname = ".ldata.rel.ro.local";
7206 break;
7207 case SECCAT_BSS:
7208 sname = ".lbss";
7209 flags |= SECTION_BSS;
7210 break;
7211 case SECCAT_RODATA:
7212 case SECCAT_RODATA_MERGE_STR:
7213 case SECCAT_RODATA_MERGE_STR_INIT:
7214 case SECCAT_RODATA_MERGE_CONST:
7215 sname = ".lrodata";
7216 flags &= ~SECTION_WRITE;
7217 break;
7218 case SECCAT_SRODATA:
7219 case SECCAT_SDATA:
7220 case SECCAT_SBSS:
7221 gcc_unreachable ();
7222 case SECCAT_TEXT:
7223 case SECCAT_TDATA:
7224 case SECCAT_TBSS:
7225 /* We don't split these for the medium model. Place them into
7226 default sections and hope for the best. */
7227 break;
7229 if (sname)
7231 /* We might get called with string constants, but get_named_section
7232 doesn't like them as they are not DECLs. Also, we need to set
7233 flags in that case. */
7234 if (!DECL_P (decl))
7235 return get_section (sname, flags, NULL);
7236 return get_named_section (decl, sname, reloc);
7239 return default_elf_select_section (decl, reloc, align);
7242 /* Select a set of attributes for section NAME based on the properties
7243 of DECL and whether or not RELOC indicates that DECL's initializer
7244 might contain runtime relocations. */
7246 static unsigned int ATTRIBUTE_UNUSED
7247 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7249 unsigned int flags = default_section_type_flags (decl, name, reloc);
7251 if (ix86_in_large_data_p (decl))
7252 flags |= SECTION_LARGE;
7254 if (decl == NULL_TREE
7255 && (strcmp (name, ".ldata.rel.ro") == 0
7256 || strcmp (name, ".ldata.rel.ro.local") == 0))
7257 flags |= SECTION_RELRO;
7259 if (strcmp (name, ".lbss") == 0
7260 || strncmp (name, ".lbss.", 6) == 0
7261 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7262 flags |= SECTION_BSS;
7264 return flags;
7267 /* Build up a unique section name, expressed as a
7268 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7269 RELOC indicates whether the initial value of EXP requires
7270 link-time relocations. */
7272 static void ATTRIBUTE_UNUSED
7273 x86_64_elf_unique_section (tree decl, int reloc)
7275 if (ix86_in_large_data_p (decl))
7277 const char *prefix = NULL;
7278 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7279 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7281 switch (categorize_decl_for_section (decl, reloc))
7283 case SECCAT_DATA:
7284 case SECCAT_DATA_REL:
7285 case SECCAT_DATA_REL_LOCAL:
7286 case SECCAT_DATA_REL_RO:
7287 case SECCAT_DATA_REL_RO_LOCAL:
7288 prefix = one_only ? ".ld" : ".ldata";
7289 break;
7290 case SECCAT_BSS:
7291 prefix = one_only ? ".lb" : ".lbss";
7292 break;
7293 case SECCAT_RODATA:
7294 case SECCAT_RODATA_MERGE_STR:
7295 case SECCAT_RODATA_MERGE_STR_INIT:
7296 case SECCAT_RODATA_MERGE_CONST:
7297 prefix = one_only ? ".lr" : ".lrodata";
7298 break;
7299 case SECCAT_SRODATA:
7300 case SECCAT_SDATA:
7301 case SECCAT_SBSS:
7302 gcc_unreachable ();
7303 case SECCAT_TEXT:
7304 case SECCAT_TDATA:
7305 case SECCAT_TBSS:
7306 /* We don't split these for the medium model. Place them into
7307 default sections and hope for the best. */
7308 break;
7310 if (prefix)
7312 const char *name, *linkonce;
7313 char *string;
7315 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7316 name = targetm.strip_name_encoding (name);
7318 /* If we're using one_only, then there needs to be a .gnu.linkonce
7319 prefix to the section name. */
7320 linkonce = one_only ? ".gnu.linkonce" : "";
7322 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7324 set_decl_section_name (decl, string);
7325 return;
7328 default_unique_section (decl, reloc);
7331 #ifdef COMMON_ASM_OP
7333 #ifndef LARGECOMM_SECTION_ASM_OP
7334 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7335 #endif
7337 /* This says how to output assembler code to declare an
7338 uninitialized external linkage data object.
7340 For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
7341 for large objects. */
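/* Sketch of the intended output (illustration only; the exact threshold
   comes from -mlarge-data-threshold): a 1 MiB common object "buf" under
   -mcmodel=medium is emitted roughly as
	.largecomm	buf,1048576,32
   instead of the usual
	.comm	buf,1048576,32  */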
7342 void
7343 x86_elf_aligned_decl_common (FILE *file, tree decl,
7344 const char *name, unsigned HOST_WIDE_INT size,
7345 int align)
7347 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7348 && size > (unsigned int)ix86_section_threshold)
7350 switch_to_section (get_named_section (decl, ".lbss", 0));
7351 fputs (LARGECOMM_SECTION_ASM_OP, file);
7353 else
7354 fputs (COMMON_ASM_OP, file);
7355 assemble_name (file, name);
7356 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7357 size, align / BITS_PER_UNIT);
7359 #endif
7361 /* Utility function for targets to use in implementing
7362 ASM_OUTPUT_ALIGNED_BSS. */
7364 void
7365 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7366 unsigned HOST_WIDE_INT size, int align)
7368 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7369 && size > (unsigned int)ix86_section_threshold)
7370 switch_to_section (get_named_section (decl, ".lbss", 0));
7371 else
7372 switch_to_section (bss_section);
7373 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7374 #ifdef ASM_DECLARE_OBJECT_NAME
7375 last_assemble_variable_decl = decl;
7376 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7377 #else
7378 /* The standard thing is to just output a label for the object. */
7379 ASM_OUTPUT_LABEL (file, name);
7380 #endif /* ASM_DECLARE_OBJECT_NAME */
7381 ASM_OUTPUT_SKIP (file, size ? size : 1);
7384 /* Decide whether we must probe the stack before any space allocation
7385 on this target. It's essentially TARGET_STACK_PROBE except when
7386 -fstack-check causes the stack to be already probed differently. */
7388 bool
7389 ix86_target_stack_probe (void)
7391 /* Do not probe the stack twice if static stack checking is enabled. */
7392 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7393 return false;
7395 return TARGET_STACK_PROBE;
7398 /* Decide whether we can make a sibling call to a function. DECL is the
7399 declaration of the function being targeted by the call and EXP is the
7400 CALL_EXPR representing the call. */
7402 static bool
7403 ix86_function_ok_for_sibcall (tree decl, tree exp)
7405 tree type, decl_or_type;
7406 rtx a, b;
7407 bool bind_global = decl && !targetm.binds_local_p (decl);
7409 /* Sibling call isn't OK if there are no caller-saved registers
7410 since all registers must be preserved before return. */
7411 if (cfun->machine->no_caller_saved_registers)
7412 return false;
7414 /* If we are generating position-independent code, we cannot sibcall
7415 optimize direct calls to global functions, as the PLT requires
7416 %ebx be live. (Darwin does not have a PLT.) */
7417 if (!TARGET_MACHO
7418 && !TARGET_64BIT
7419 && flag_pic
7420 && flag_plt
7421 && bind_global)
7422 return false;
7424 /* If we need to align the outgoing stack, then sibcalling would
7425 unalign the stack, which may break the called function. */
7426 if (ix86_minimum_incoming_stack_boundary (true)
7427 < PREFERRED_STACK_BOUNDARY)
7428 return false;
7430 if (decl)
7432 decl_or_type = decl;
7433 type = TREE_TYPE (decl);
7435 else
7437 /* We're looking at the CALL_EXPR, we need the type of the function. */
7438 type = CALL_EXPR_FN (exp); /* pointer expression */
7439 type = TREE_TYPE (type); /* pointer type */
7440 type = TREE_TYPE (type); /* function type */
7441 decl_or_type = type;
7444 /* Check that the return value locations are the same. For example,
7445 if we are returning floats on the 80387 register stack, we cannot
7446 make a sibcall from a function that doesn't return a float to a
7447 function that does or, conversely, from a function that does return
7448 a float to a function that doesn't; the necessary stack adjustment
7449 would not be executed. This is also the place we notice
7450 differences in the return value ABI. Note that it is ok for one
7451 of the functions to have a void return type as long as the return
7452 value of the other is passed in a register. */
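/* A rough sketch of the 387 hazard described above (hypothetical code):
     float g (void);
     void f (void) { g (); }
   If f sibcalled g, g would return with its value in %st(0); the pop of
   the FP stack that f would normally perform after the call never
   happens, leaving the 387 stack unbalanced for f's caller. */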
7453 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7454 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7455 cfun->decl, false);
7456 if (STACK_REG_P (a) || STACK_REG_P (b))
7458 if (!rtx_equal_p (a, b))
7459 return false;
7461 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7462 ;
7463 else if (!rtx_equal_p (a, b))
7464 return false;
7466 if (TARGET_64BIT)
7468 /* The SYSV ABI has more call-clobbered registers;
7469 disallow sibcalls from MS to SYSV. */
7470 if (cfun->machine->call_abi == MS_ABI
7471 && ix86_function_type_abi (type) == SYSV_ABI)
7472 return false;
7474 else
7476 /* If this call is indirect, we'll need to be able to use a
7477 call-clobbered register for the address of the target function.
7478 Make sure that all such registers are not used for passing
7479 parameters. Note that DLLIMPORT functions and calls to global
7480 functions via the GOT slot are indirect. */
7481 if (!decl
7482 || (bind_global && flag_pic && !flag_plt)
7483 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
7485 /* Check if regparm >= 3 since arg_reg_available is set to
7486 false if regparm == 0. If regparm is 1 or 2, there is
7487 always a call-clobbered register available.
7489 ??? The symbol indirect call doesn't need a call-clobbered
7490 register. But we don't know if this is a symbol indirect
7491 call or not here. */
7492 if (ix86_function_regparm (type, NULL) >= 3
7493 && !cfun->machine->arg_reg_available)
7494 return false;
7498 /* Otherwise okay. That also includes certain types of indirect calls. */
7499 return true;
7502 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
7503 and "sseregparm" calling convention attributes;
7504 arguments as in struct attribute_spec.handler. */
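/* Purely illustrative (hypothetical user code): declarations such as
     int f (int, int, int) __attribute__((regparm(3)));
     int g (int, int) __attribute__((fastcall));
   are accepted, while combinations such as fastcall together with
   regparm on the same type are rejected by the checks below. */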
7506 static tree
7507 ix86_handle_cconv_attribute (tree *node, tree name,
7508 tree args,
7509 int,
7510 bool *no_add_attrs)
7512 if (TREE_CODE (*node) != FUNCTION_TYPE
7513 && TREE_CODE (*node) != METHOD_TYPE
7514 && TREE_CODE (*node) != FIELD_DECL
7515 && TREE_CODE (*node) != TYPE_DECL)
7517 warning (OPT_Wattributes, "%qE attribute only applies to functions",
7518 name);
7519 *no_add_attrs = true;
7520 return NULL_TREE;
7523 /* Can combine regparm with all attributes but fastcall, and thiscall. */
7524 if (is_attribute_p ("regparm", name))
7526 tree cst;
7528 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7530 error ("fastcall and regparm attributes are not compatible");
7533 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7535 error ("regparm and thiscall attributes are not compatible");
7538 cst = TREE_VALUE (args);
7539 if (TREE_CODE (cst) != INTEGER_CST)
7541 warning (OPT_Wattributes,
7542 "%qE attribute requires an integer constant argument",
7543 name);
7544 *no_add_attrs = true;
7546 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
7548 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
7549 name, REGPARM_MAX);
7550 *no_add_attrs = true;
7553 return NULL_TREE;
7556 if (TARGET_64BIT)
7558 /* Do not warn when emulating the MS ABI. */
7559 if ((TREE_CODE (*node) != FUNCTION_TYPE
7560 && TREE_CODE (*node) != METHOD_TYPE)
7561 || ix86_function_type_abi (*node) != MS_ABI)
7562 warning (OPT_Wattributes, "%qE attribute ignored",
7563 name);
7564 *no_add_attrs = true;
7565 return NULL_TREE;
7568 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
7569 if (is_attribute_p ("fastcall", name))
7571 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7573 error ("fastcall and cdecl attributes are not compatible");
7575 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7577 error ("fastcall and stdcall attributes are not compatible");
7579 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
7581 error ("fastcall and regparm attributes are not compatible");
7583 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7585 error ("fastcall and thiscall attributes are not compatible");
7589 /* Can combine stdcall with fastcall (redundant), regparm and
7590 sseregparm. */
7591 else if (is_attribute_p ("stdcall", name))
7593 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7595 error ("stdcall and cdecl attributes are not compatible");
7597 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7599 error ("stdcall and fastcall attributes are not compatible");
7601 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7603 error ("stdcall and thiscall attributes are not compatible");
7607 /* Can combine cdecl with regparm and sseregparm. */
7608 else if (is_attribute_p ("cdecl", name))
7610 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7612 error ("stdcall and cdecl attributes are not compatible");
7614 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7616 error ("fastcall and cdecl attributes are not compatible");
7618 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7620 error ("cdecl and thiscall attributes are not compatible");
7623 else if (is_attribute_p ("thiscall", name))
7625 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
7626 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
7627 name);
7628 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7630 error ("stdcall and thiscall attributes are not compatible");
7632 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7634 error ("fastcall and thiscall attributes are not compatible");
7636 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7638 error ("cdecl and thiscall attributes are not compatible");
7642 /* Can combine sseregparm with all attributes. */
7644 return NULL_TREE;
7647 /* The transactional memory builtins are implicitly regparm or fastcall
7648 depending on the ABI. Override the generic do-nothing attribute that
7649 these builtins were declared with, and replace it with one of the two
7650 attributes that we expect elsewhere. */
7652 static tree
7653 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
7654 int flags, bool *no_add_attrs)
7656 tree alt;
7658 /* In no case do we want to add the placeholder attribute. */
7659 *no_add_attrs = true;
7661 /* The 64-bit ABI is unchanged for transactional memory. */
7662 if (TARGET_64BIT)
7663 return NULL_TREE;
7665 /* ??? Is there a better way to validate 32-bit Windows? We have
7666 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
7667 if (CHECK_STACK_LIMIT > 0)
7668 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
7669 else
7671 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
7672 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
7674 decl_attributes (node, alt, flags);
7676 return NULL_TREE;
7679 /* This function determines from TYPE the calling-convention. */
7681 unsigned int
7682 ix86_get_callcvt (const_tree type)
7684 unsigned int ret = 0;
7685 bool is_stdarg;
7686 tree attrs;
7688 if (TARGET_64BIT)
7689 return IX86_CALLCVT_CDECL;
7691 attrs = TYPE_ATTRIBUTES (type);
7692 if (attrs != NULL_TREE)
7694 if (lookup_attribute ("cdecl", attrs))
7695 ret |= IX86_CALLCVT_CDECL;
7696 else if (lookup_attribute ("stdcall", attrs))
7697 ret |= IX86_CALLCVT_STDCALL;
7698 else if (lookup_attribute ("fastcall", attrs))
7699 ret |= IX86_CALLCVT_FASTCALL;
7700 else if (lookup_attribute ("thiscall", attrs))
7701 ret |= IX86_CALLCVT_THISCALL;
7703 /* Regparm isn't allowed for thiscall and fastcall. */
7704 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
7706 if (lookup_attribute ("regparm", attrs))
7707 ret |= IX86_CALLCVT_REGPARM;
7708 if (lookup_attribute ("sseregparm", attrs))
7709 ret |= IX86_CALLCVT_SSEREGPARM;
7712 if (IX86_BASE_CALLCVT(ret) != 0)
7713 return ret;
7716 is_stdarg = stdarg_p (type);
7717 if (TARGET_RTD && !is_stdarg)
7718 return IX86_CALLCVT_STDCALL | ret;
7720 if (ret != 0
7721 || is_stdarg
7722 || TREE_CODE (type) != METHOD_TYPE
7723 || ix86_function_type_abi (type) != MS_ABI)
7724 return IX86_CALLCVT_CDECL | ret;
7726 return IX86_CALLCVT_THISCALL;
7729 /* Return 0 if the attributes for two types are incompatible, 1 if they
7730 are compatible, and 2 if they are nearly compatible (which causes a
7731 warning to be generated). */
7733 static int
7734 ix86_comp_type_attributes (const_tree type1, const_tree type2)
7736 unsigned int ccvt1, ccvt2;
7738 if (TREE_CODE (type1) != FUNCTION_TYPE
7739 && TREE_CODE (type1) != METHOD_TYPE)
7740 return 1;
7742 ccvt1 = ix86_get_callcvt (type1);
7743 ccvt2 = ix86_get_callcvt (type2);
7744 if (ccvt1 != ccvt2)
7745 return 0;
7746 if (ix86_function_regparm (type1, NULL)
7747 != ix86_function_regparm (type2, NULL))
7748 return 0;
7750 return 1;
7753 /* Return the regparm value for a function with the indicated TYPE and DECL.
7754 DECL may be NULL when calling a function indirectly
7755 or considering a libcall. */
7757 static int
7758 ix86_function_regparm (const_tree type, const_tree decl)
7760 tree attr;
7761 int regparm;
7762 unsigned int ccvt;
7764 if (TARGET_64BIT)
7765 return (ix86_function_type_abi (type) == SYSV_ABI
7766 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
7767 ccvt = ix86_get_callcvt (type);
7768 regparm = ix86_regparm;
7770 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
7772 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
7773 if (attr)
7775 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
7776 return regparm;
7779 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7780 return 2;
7781 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7782 return 1;
7784 /* Use register calling convention for local functions when possible. */
7785 if (decl
7786 && TREE_CODE (decl) == FUNCTION_DECL)
7788 cgraph_node *target = cgraph_node::get (decl);
7789 if (target)
7790 target = target->function_symbol ();
7792 /* Caller and callee must agree on the calling convention, so
7793 checking just optimize here would mean that with
7794 __attribute__((optimize (...))) the caller could use the regparm convention
7795 and the callee not, or vice versa. Instead look at whether the callee
7796 is optimized or not. */
7797 if (target && opt_for_fn (target->decl, optimize)
7798 && !(profile_flag && !flag_fentry))
7800 cgraph_local_info *i = &target->local;
7801 if (i && i->local && i->can_change_signature)
7803 int local_regparm, globals = 0, regno;
7805 /* Make sure no regparm register is taken by a
7806 fixed register variable. */
7807 for (local_regparm = 0; local_regparm < REGPARM_MAX;
7808 local_regparm++)
7809 if (fixed_regs[local_regparm])
7810 break;
7812 /* We don't want to use regparm(3) for nested functions as
7813 these use a static chain pointer in the third argument. */
7814 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
7815 local_regparm = 2;
7817 /* Save a register for the split stack. */
7818 if (local_regparm == 3 && flag_split_stack)
7819 local_regparm = 2;
7821 /* Each fixed register usage increases register pressure,
7822 so fewer registers should be used for argument passing.
7823 This functionality can be overridden by an explicit
7824 regparm value. */
7825 for (regno = AX_REG; regno <= DI_REG; regno++)
7826 if (fixed_regs[regno])
7827 globals++;
7829 local_regparm
7830 = globals < local_regparm ? local_regparm - globals : 0;
7832 if (local_regparm > regparm)
7833 regparm = local_regparm;
7838 return regparm;
7841 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
7842 DFmode (2) arguments in SSE registers for a function with the
7843 indicated TYPE and DECL. DECL may be NULL when calling a function
7844 indirectly or considering a libcall. Return -1 if any FP parameter
7845 should be rejected by error. This is used in situations where we imply
7846 the SSE calling convention but the function is called from another
7847 function with SSE disabled. Otherwise return 0. */
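/* A hypothetical illustration: with
     double f (double x) __attribute__((sseregparm));
   X is passed in %xmm0 on ia32, provided SSE/SSE2 is enabled; if SSE is
   disabled the attribute is diagnosed by the error below instead. */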
7849 static int
7850 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
7852 gcc_assert (!TARGET_64BIT);
7854 /* Use SSE registers to pass SFmode and DFmode arguments if requested
7855 by the sseregparm attribute. */
7856 if (TARGET_SSEREGPARM
7857 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
7859 if (!TARGET_SSE)
7861 if (warn)
7863 if (decl)
7864 error ("calling %qD with attribute sseregparm without "
7865 "SSE/SSE2 enabled", decl);
7866 else
7867 error ("calling %qT with attribute sseregparm without "
7868 "SSE/SSE2 enabled", type);
7870 return 0;
7873 return 2;
7876 if (!decl)
7877 return 0;
7879 cgraph_node *target = cgraph_node::get (decl);
7880 if (target)
7881 target = target->function_symbol ();
7883 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
7884 (and DFmode for SSE2) arguments in SSE registers. */
7885 if (target
7886 /* TARGET_SSE_MATH */
7887 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
7888 && opt_for_fn (target->decl, optimize)
7889 && !(profile_flag && !flag_fentry))
7891 cgraph_local_info *i = &target->local;
7892 if (i && i->local && i->can_change_signature)
7894 /* Refuse to produce wrong code when a local function with SSE enabled
7895 is called from an SSE-disabled function.
7896 FIXME: We need a way to detect these cases across ltrans partitions
7897 and avoid using SSE calling conventions on local functions called
7898 from functions with SSE disabled. For now at least delay the
7899 warning until we know we are going to produce wrong code.
7900 See PR66047. */
7901 if (!TARGET_SSE && warn)
7902 return -1;
7903 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
7904 ->x_ix86_isa_flags) ? 2 : 1;
7908 return 0;
7911 /* Return true if EAX is live at the start of the function. Used by
7912 ix86_expand_prologue to determine if we need special help before
7913 calling allocate_stack_worker. */
7915 static bool
7916 ix86_eax_live_at_start_p (void)
7918 /* Cheat. Don't bother working forward from ix86_function_regparm
7919 to the function type to whether an actual argument is located in
7920 eax. Instead just look at cfg info, which is still close enough
7921 to correct at this point. This gives false positives for broken
7922 functions that might use uninitialized data that happens to be
7923 allocated in eax, but who cares? */
7924 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
7927 static bool
7928 ix86_keep_aggregate_return_pointer (tree fntype)
7930 tree attr;
7932 if (!TARGET_64BIT)
7934 attr = lookup_attribute ("callee_pop_aggregate_return",
7935 TYPE_ATTRIBUTES (fntype));
7936 if (attr)
7937 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
7939 /* For the 32-bit MS ABI the default is to keep the aggregate
7940 return pointer. */
7941 if (ix86_function_type_abi (fntype) == MS_ABI)
7942 return true;
7944 return KEEP_AGGREGATE_RETURN_POINTER != 0;
7947 /* Value is the number of bytes of arguments automatically
7948 popped when returning from a subroutine call.
7949 FUNDECL is the declaration node of the function (as a tree),
7950 FUNTYPE is the data type of the function (as a tree),
7951 or for a library call it is an identifier node for the subroutine name.
7952 SIZE is the number of bytes of arguments passed on the stack.
7954 On the 80386, the RTD insn may be used to pop them if the number
7955 of args is fixed, but if the number is variable then the caller
7956 must pop them all. RTD can't be used for library calls now
7957 because the library is compiled with the Unix compiler.
7958 Use of RTD is a selectable option, since it is incompatible with
7959 standard Unix calling sequences. If the option is not selected,
7960 the caller must always pop the args.
7962 The attribute stdcall is equivalent to RTD on a per module basis. */
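/* For illustration (hypothetical declaration):
     int __attribute__((stdcall)) f (int a, int b);
   makes the callee pop its 8 bytes of stack arguments (a "ret $8"), so
   this hook returns SIZE for it; a stdarg or plain cdecl function
   returns 0 and the caller pops. */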
7964 static int
7965 ix86_return_pops_args (tree fundecl, tree funtype, int size)
7967 unsigned int ccvt;
7969 /* None of the 64-bit ABIs pop arguments. */
7970 if (TARGET_64BIT)
7971 return 0;
7973 ccvt = ix86_get_callcvt (funtype);
7975 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
7976 | IX86_CALLCVT_THISCALL)) != 0
7977 && ! stdarg_p (funtype))
7978 return size;
7980 /* Lose any fake structure return argument if it is passed on the stack. */
7981 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
7982 && !ix86_keep_aggregate_return_pointer (funtype))
7984 int nregs = ix86_function_regparm (funtype, fundecl);
7985 if (nregs == 0)
7986 return GET_MODE_SIZE (Pmode);
7989 return 0;
7992 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
7994 static bool
7995 ix86_legitimate_combined_insn (rtx_insn *insn)
7997 /* Check operand constraints in case hard registers were propagated
7998 into insn pattern. This check prevents combine pass from
7999 generating insn patterns with invalid hard register operands.
8000 These invalid insns can eventually confuse reload to error out
8001 with a spill failure. See also PRs 46829 and 46843. */
8002 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
8004 int i;
8006 extract_insn (insn);
8007 preprocess_constraints (insn);
8009 int n_operands = recog_data.n_operands;
8010 int n_alternatives = recog_data.n_alternatives;
8011 for (i = 0; i < n_operands; i++)
8013 rtx op = recog_data.operand[i];
8014 machine_mode mode = GET_MODE (op);
8015 const operand_alternative *op_alt;
8016 int offset = 0;
8017 bool win;
8018 int j;
8020 /* A unary operator may be accepted by the predicate, but it
8021 is irrelevant for matching constraints. */
8022 if (UNARY_P (op))
8023 op = XEXP (op, 0);
8025 if (SUBREG_P (op))
8027 if (REG_P (SUBREG_REG (op))
8028 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8029 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8030 GET_MODE (SUBREG_REG (op)),
8031 SUBREG_BYTE (op),
8032 GET_MODE (op));
8033 op = SUBREG_REG (op);
8036 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8037 continue;
8039 op_alt = recog_op_alt;
8041 /* Operand has no constraints, anything is OK. */
8042 win = !n_alternatives;
8044 alternative_mask preferred = get_preferred_alternatives (insn);
8045 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8047 if (!TEST_BIT (preferred, j))
8048 continue;
8049 if (op_alt[i].anything_ok
8050 || (op_alt[i].matches != -1
8051 && operands_match_p
8052 (recog_data.operand[i],
8053 recog_data.operand[op_alt[i].matches]))
8054 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8056 win = true;
8057 break;
8061 if (!win)
8062 return false;
8066 return true;
8069 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
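/* ASan maps memory as Shadow = (Addr >> 3) + Offset; the values below
   are the customary offsets: 0x7fff8000 for LP64 targets generally,
   1 << 44 for LP64 Darwin, and 1 << 29 for non-LP64 targets. */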
8071 static unsigned HOST_WIDE_INT
8072 ix86_asan_shadow_offset (void)
8074 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8075 : HOST_WIDE_INT_C (0x7fff8000))
8076 : (HOST_WIDE_INT_1 << 29);
8079 /* Argument support functions. */
8081 /* Return true when register REGNO may be used to pass function parameters. */
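/* Summary for reference (the authoritative tables are
   x86_64_int_parameter_registers and friends used below): ia32 regparm
   uses %eax/%edx/%ecx; 64-bit SysV uses %rdi, %rsi, %rdx, %rcx, %r8, %r9
   plus %xmm0-%xmm7; 64-bit MS uses %rcx, %rdx, %r8, %r9 plus %xmm0-%xmm3. */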
8082 bool
8083 ix86_function_arg_regno_p (int regno)
8085 int i;
8086 enum calling_abi call_abi;
8087 const int *parm_regs;
8089 if (TARGET_MPX && BND_REGNO_P (regno))
8090 return true;
8092 if (!TARGET_64BIT)
8094 if (TARGET_MACHO)
8095 return (regno < REGPARM_MAX
8096 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8097 else
8098 return (regno < REGPARM_MAX
8099 || (TARGET_MMX && MMX_REGNO_P (regno)
8100 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8101 || (TARGET_SSE && SSE_REGNO_P (regno)
8102 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8105 if (TARGET_SSE && SSE_REGNO_P (regno)
8106 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8107 return true;
8109 /* TODO: The function should depend on the current function ABI, but
8110 builtins.c would need updating then. Therefore we use the
8111 default ABI. */
8112 call_abi = ix86_cfun_abi ();
8114 /* RAX is used as hidden argument to va_arg functions. */
8115 if (call_abi == SYSV_ABI && regno == AX_REG)
8116 return true;
8118 if (call_abi == MS_ABI)
8119 parm_regs = x86_64_ms_abi_int_parameter_registers;
8120 else
8121 parm_regs = x86_64_int_parameter_registers;
8123 for (i = 0; i < (call_abi == MS_ABI
8124 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8125 if (regno == parm_regs[i])
8126 return true;
8127 return false;
8130 /* Return true if we do not know how to pass TYPE solely in registers. */
8132 static bool
8133 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8135 if (must_pass_in_stack_var_size_or_pad (mode, type))
8136 return true;
8138 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8139 The layout_type routine is crafty and tries to trick us into passing
8140 currently unsupported vector types on the stack by using TImode. */
8141 return (!TARGET_64BIT && mode == TImode
8142 && type && TREE_CODE (type) != VECTOR_TYPE);
8145 /* Return the size, in bytes, of the area reserved for arguments passed
8146 in registers for the function represented by FNDECL, depending on the
8147 ABI format used. */
8148 int
8149 ix86_reg_parm_stack_space (const_tree fndecl)
8151 enum calling_abi call_abi = SYSV_ABI;
8152 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8153 call_abi = ix86_function_abi (fndecl);
8154 else
8155 call_abi = ix86_function_type_abi (fndecl);
8156 if (TARGET_64BIT && call_abi == MS_ABI)
8157 return 32;
8158 return 0;
8161 /* We add this as a workaround in order to use the libc_has_function
8162 hook in i386.md. */
8163 bool
8164 ix86_libc_has_function (enum function_class fn_class)
8166 return targetm.libc_has_function (fn_class);
8169 /* Return SYSV_ABI or MS_ABI depending on FNTYPE,
8170 specifying the call ABI used. */
8171 enum calling_abi
8172 ix86_function_type_abi (const_tree fntype)
8174 enum calling_abi abi = ix86_abi;
8176 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8177 return abi;
8179 if (abi == SYSV_ABI
8180 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8182 if (TARGET_X32)
8183 error ("X32 does not support ms_abi attribute");
8185 abi = MS_ABI;
8187 else if (abi == MS_ABI
8188 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8189 abi = SYSV_ABI;
8191 return abi;
8194 static enum calling_abi
8195 ix86_function_abi (const_tree fndecl)
8197 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8200 /* Return SYSV_ABI or MS_ABI depending on cfun,
8201 specifying the call ABI used. */
8202 enum calling_abi
8203 ix86_cfun_abi (void)
8205 return cfun ? cfun->machine->call_abi : ix86_abi;
8208 static bool
8209 ix86_function_ms_hook_prologue (const_tree fn)
8211 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8213 if (decl_function_context (fn) != NULL_TREE)
8214 error_at (DECL_SOURCE_LOCATION (fn),
8215 "ms_hook_prologue is not compatible with nested function");
8216 else
8217 return true;
8219 return false;
8222 /* Write the extra assembler code needed to declare a function properly. */
8224 void
8225 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8226 tree decl)
8228 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8230 if (is_ms_hook)
8232 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8233 unsigned int filler_cc = 0xcccccccc;
8235 for (i = 0; i < filler_count; i += 4)
8236 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8239 #ifdef SUBTARGET_ASM_UNWIND_INIT
8240 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8241 #endif
8243 ASM_OUTPUT_LABEL (asm_out_file, fname);
8245 /* Output magic byte marker, if hot-patch attribute is set. */
8246 if (is_ms_hook)
8248 if (TARGET_64BIT)
8250 /* leaq [%rsp + 0], %rsp */
8251 asm_fprintf (asm_out_file, ASM_BYTE
8252 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8254 else
8256 /* movl.s %edi, %edi
8257 push %ebp
8258 movl.s %esp, %ebp */
8259 asm_fprintf (asm_out_file, ASM_BYTE
8260 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8265 /* regclass.c */
8266 extern void init_regs (void);
8268 /* Implementation of the call ABI switching target hook. The call
8269 register sets specific to FNDECL are selected. See also
8270 ix86_conditional_register_usage for more details. */
8271 void
8272 ix86_call_abi_override (const_tree fndecl)
8274 cfun->machine->call_abi = ix86_function_abi (fndecl);
8277 /* Return true if a pseudo register should be created and used to hold
8278 the GOT address for PIC code. */
8279 bool
8280 ix86_use_pseudo_pic_reg (void)
8282 if ((TARGET_64BIT
8283 && (ix86_cmodel == CM_SMALL_PIC
8284 || TARGET_PECOFF))
8285 || !flag_pic)
8286 return false;
8287 return true;
8290 /* Initialize large model PIC register. */
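/* Roughly, this emits (sketch; register names are placeholders and the
   PIC register is still a pseudo at this point):
   .LPIC:
	lea	.LPIC(%rip), %pic
	movabs	$_GLOBAL_OFFSET_TABLE_-.LPIC, %tmp
	add	%tmp, %pic
   leaving the GOT address in the PIC register. */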
8292 static void
8293 ix86_init_large_pic_reg (unsigned int tmp_regno)
8295 rtx_code_label *label;
8296 rtx tmp_reg;
8298 gcc_assert (Pmode == DImode);
8299 label = gen_label_rtx ();
8300 emit_label (label);
8301 LABEL_PRESERVE_P (label) = 1;
8302 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8303 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8304 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8305 label));
8306 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8307 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8308 pic_offset_table_rtx, tmp_reg));
8311 /* Create and initialize PIC register if required. */
8312 static void
8313 ix86_init_pic_reg (void)
8315 edge entry_edge;
8316 rtx_insn *seq;
8318 if (!ix86_use_pseudo_pic_reg ())
8319 return;
8321 start_sequence ();
8323 if (TARGET_64BIT)
8325 if (ix86_cmodel == CM_LARGE_PIC)
8326 ix86_init_large_pic_reg (R11_REG);
8327 else
8328 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8330 else
8332 /* If there is a future mcount call in the function it is more profitable
8333 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8334 rtx reg = crtl->profile
8335 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8336 : pic_offset_table_rtx;
8337 rtx_insn *insn = emit_insn (gen_set_got (reg));
8338 RTX_FRAME_RELATED_P (insn) = 1;
8339 if (crtl->profile)
8340 emit_move_insn (pic_offset_table_rtx, reg);
8341 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8344 seq = get_insns ();
8345 end_sequence ();
8347 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8348 insert_insn_on_edge (seq, entry_edge);
8349 commit_one_edge_insertion (entry_edge);
8352 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8353 for a call to a function whose data type is FNTYPE.
8354 For a library call, FNTYPE is 0. */
8356 void
8357 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8358 tree fntype, /* tree ptr for function decl */
8359 rtx libname, /* SYMBOL_REF of library name or 0 */
8360 tree fndecl,
8361 int caller)
8363 struct cgraph_local_info *i = NULL;
8364 struct cgraph_node *target = NULL;
8366 memset (cum, 0, sizeof (*cum));
8368 if (fndecl)
8370 target = cgraph_node::get (fndecl);
8371 if (target)
8373 target = target->function_symbol ();
8374 i = cgraph_node::local_info (target->decl);
8375 cum->call_abi = ix86_function_abi (target->decl);
8377 else
8378 cum->call_abi = ix86_function_abi (fndecl);
8380 else
8381 cum->call_abi = ix86_function_type_abi (fntype);
8383 cum->caller = caller;
8385 /* Set up the number of registers to use for passing arguments. */
8386 cum->nregs = ix86_regparm;
8387 if (TARGET_64BIT)
8389 cum->nregs = (cum->call_abi == SYSV_ABI
8390 ? X86_64_REGPARM_MAX
8391 : X86_64_MS_REGPARM_MAX);
8393 if (TARGET_SSE)
8395 cum->sse_nregs = SSE_REGPARM_MAX;
8396 if (TARGET_64BIT)
8398 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8399 ? X86_64_SSE_REGPARM_MAX
8400 : X86_64_MS_SSE_REGPARM_MAX);
8403 if (TARGET_MMX)
8404 cum->mmx_nregs = MMX_REGPARM_MAX;
8405 cum->warn_avx512f = true;
8406 cum->warn_avx = true;
8407 cum->warn_sse = true;
8408 cum->warn_mmx = true;
8410 /* Because the type might mismatch between caller and callee, we need to
8411 use the actual type of the function for local calls.
8412 FIXME: cgraph_analyze can be told to actually record if a function uses
8413 va_start, so for local functions maybe_vaarg can be made more aggressive,
8414 helping K&R code.
8415 FIXME: once the type system is fixed, we won't need this code anymore. */
8416 if (i && i->local && i->can_change_signature)
8417 fntype = TREE_TYPE (target->decl);
8418 cum->stdarg = stdarg_p (fntype);
8419 cum->maybe_vaarg = (fntype
8420 ? (!prototype_p (fntype) || stdarg_p (fntype))
8421 : !libname);
8423 cum->bnd_regno = FIRST_BND_REG;
8424 cum->bnds_in_bt = 0;
8425 cum->force_bnd_pass = 0;
8426 cum->decl = fndecl;
8428 if (!TARGET_64BIT)
8430 /* If there are variable arguments, then we won't pass anything
8431 in registers in 32-bit mode. */
8432 if (stdarg_p (fntype))
8434 cum->nregs = 0;
8435 /* Since in 32-bit mode variable arguments are always passed on
8436 the stack, there is a scratch register available for an indirect
8437 sibcall. */
8438 cfun->machine->arg_reg_available = true;
8439 cum->sse_nregs = 0;
8440 cum->mmx_nregs = 0;
8441 cum->warn_avx512f = false;
8442 cum->warn_avx = false;
8443 cum->warn_sse = false;
8444 cum->warn_mmx = false;
8445 return;
8448 /* Use ecx and edx registers if function has fastcall attribute,
8449 else look for regparm information. */
8450 if (fntype)
8452 unsigned int ccvt = ix86_get_callcvt (fntype);
8453 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8455 cum->nregs = 1;
8456 cum->fastcall = 1; /* Same first register as in fastcall. */
8458 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8460 cum->nregs = 2;
8461 cum->fastcall = 1;
8463 else
8464 cum->nregs = ix86_function_regparm (fntype, fndecl);
8467 /* Set up the number of SSE registers used for passing SFmode
8468 and DFmode arguments. Warn for mismatching ABI. */
8469 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8472 cfun->machine->arg_reg_available = (cum->nregs > 0);
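/* Illustrative summary (added commentary): for a 32-bit function with the
   fastcall attribute the code above leaves cum->nregs == 2 and
   cum->fastcall == 1, so the first two integer arguments travel in
   %ecx/%edx; for a 64-bit SYSV_ABI call it starts out with
   X86_64_REGPARM_MAX integer registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9)
   and X86_64_SSE_REGPARM_MAX SSE registers (%xmm0-%xmm7).  */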
8475 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8476 But in the case of vector types, it is some vector mode.
8478 When we have only some of our vector isa extensions enabled, then there
8479 are some modes for which vector_mode_supported_p is false. For these
8480 modes, the generic vector support in gcc will choose some non-vector mode
8481 in order to implement the type. By computing the natural mode, we'll
8482 select the proper ABI location for the operand and not depend on whatever
8483 the middle-end decides to do with these vector types.
8485 The middle-end can't deal with vector types larger than 16 bytes. In this
8486 case, we return the original mode and warn about the ABI change if CUM isn't
8487 NULL.
8489 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
8490 available for the function return value. */
8492 static machine_mode
8493 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
8494 bool in_return)
8496 machine_mode mode = TYPE_MODE (type);
8498 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
8500 HOST_WIDE_INT size = int_size_in_bytes (type);
8501 if ((size == 8 || size == 16 || size == 32 || size == 64)
8502 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
8503 && TYPE_VECTOR_SUBPARTS (type) > 1)
8505 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
8507 /* There are no XFmode vector modes. */
8508 if (innermode == XFmode)
8509 return mode;
8511 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
8512 mode = MIN_MODE_VECTOR_FLOAT;
8513 else
8514 mode = MIN_MODE_VECTOR_INT;
8516 /* Get the mode which has this inner mode and number of units. */
8517 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
8518 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
8519 && GET_MODE_INNER (mode) == innermode)
8521 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
8523 static bool warnedavx512f;
8524 static bool warnedavx512f_ret;
8526 if (cum && cum->warn_avx512f && !warnedavx512f)
8528 if (warning (OPT_Wpsabi, "AVX512F vector argument "
8529 "without AVX512F enabled changes the ABI"))
8530 warnedavx512f = true;
8532 else if (in_return && !warnedavx512f_ret)
8534 if (warning (OPT_Wpsabi, "AVX512F vector return "
8535 "without AVX512F enabled changes the ABI"))
8536 warnedavx512f_ret = true;
8539 return TYPE_MODE (type);
8541 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
8543 static bool warnedavx;
8544 static bool warnedavx_ret;
8546 if (cum && cum->warn_avx && !warnedavx)
8548 if (warning (OPT_Wpsabi, "AVX vector argument "
8549 "without AVX enabled changes the ABI"))
8550 warnedavx = true;
8552 else if (in_return && !warnedavx_ret)
8554 if (warning (OPT_Wpsabi, "AVX vector return "
8555 "without AVX enabled changes the ABI"))
8556 warnedavx_ret = true;
8559 return TYPE_MODE (type);
8561 else if (((size == 8 && TARGET_64BIT) || size == 16)
8562 && !TARGET_SSE
8563 && !TARGET_IAMCU)
8565 static bool warnedsse;
8566 static bool warnedsse_ret;
8568 if (cum && cum->warn_sse && !warnedsse)
8570 if (warning (OPT_Wpsabi, "SSE vector argument "
8571 "without SSE enabled changes the ABI"))
8572 warnedsse = true;
8574 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
8576 if (warning (OPT_Wpsabi, "SSE vector return "
8577 "without SSE enabled changes the ABI"))
8578 warnedsse_ret = true;
8581 else if ((size == 8 && !TARGET_64BIT)
8582 && (!cfun
8583 || cfun->machine->func_type == TYPE_NORMAL)
8584 && !TARGET_MMX
8585 && !TARGET_IAMCU)
8587 static bool warnedmmx;
8588 static bool warnedmmx_ret;
8590 if (cum && cum->warn_mmx && !warnedmmx)
8592 if (warning (OPT_Wpsabi, "MMX vector argument "
8593 "without MMX enabled changes the ABI"))
8594 warnedmmx = true;
8596 else if (in_return && !warnedmmx_ret)
8598 if (warning (OPT_Wpsabi, "MMX vector return "
8599 "without MMX enabled changes the ABI"))
8600 warnedmmx_ret = true;
8603 return mode;
8606 gcc_unreachable ();
8610 return mode;
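/* Illustrative example (added commentary, not part of the original code):

     typedef int v4si __attribute__ ((vector_size (16)));

   When a v4si argument is compiled without -msse, the middle end does not
   pick a vector mode for it, but the loop above still finds V4SImode, so
   the argument keeps the ABI slot it would have with SSE enabled; the
   -Wpsabi warning emitted above records the ABI implications.  A 32-byte
   vector without -mavx instead keeps the mode chosen by the middle end
   (the early return of TYPE_MODE above), again with a warning.  */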
8613 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
8614 this may not agree with the mode that the type system has chosen for the
8615 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
8616 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
8618 static rtx
8619 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
8620 unsigned int regno)
8622 rtx tmp;
8624 if (orig_mode != BLKmode)
8625 tmp = gen_rtx_REG (orig_mode, regno);
8626 else
8628 tmp = gen_rtx_REG (mode, regno);
8629 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
8630 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
8633 return tmp;
8636 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
8637 of this code is to classify each 8bytes of incoming argument by the register
8638 class and assign registers accordingly. */
8640 /* Return the union class of CLASS1 and CLASS2.
8641 See the x86-64 PS ABI for details. */
8643 static enum x86_64_reg_class
8644 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
8646 /* Rule #1: If both classes are equal, this is the resulting class. */
8647 if (class1 == class2)
8648 return class1;
8650 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
8651 the other class. */
8652 if (class1 == X86_64_NO_CLASS)
8653 return class2;
8654 if (class2 == X86_64_NO_CLASS)
8655 return class1;
8657 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
8658 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
8659 return X86_64_MEMORY_CLASS;
8661 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
8662 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
8663 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
8664 return X86_64_INTEGERSI_CLASS;
8665 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
8666 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
8667 return X86_64_INTEGER_CLASS;
8669 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
8670 MEMORY is used. */
8671 if (class1 == X86_64_X87_CLASS
8672 || class1 == X86_64_X87UP_CLASS
8673 || class1 == X86_64_COMPLEX_X87_CLASS
8674 || class2 == X86_64_X87_CLASS
8675 || class2 == X86_64_X87UP_CLASS
8676 || class2 == X86_64_COMPLEX_X87_CLASS)
8677 return X86_64_MEMORY_CLASS;
8679 /* Rule #6: Otherwise class SSE is used. */
8680 return X86_64_SSE_CLASS;
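/* Worked example (added commentary): for

     struct s { int i; float f; };

   both fields share one eightbyte; the int contributes an integer class
   and the float an SSE class, and rule #4 merges them to an integer class,
   so the whole struct is passed in a general-purpose register.  Two floats
   sharing an eightbyte would merge to SSE by rule #6 and travel in an SSE
   register instead.  */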
8683 /* Classify the argument of type TYPE and mode MODE.
8684 CLASSES will be filled by the register class used to pass each word
8685 of the operand. The number of words is returned. In case the parameter
8686 should be passed in memory, 0 is returned. As a special case for zero
8687 sized containers, classes[0] will be NO_CLASS and 1 is returned.
8689 BIT_OFFSET is used internally for handling records; it specifies the
8690 offset in bits modulo 512 to avoid overflow cases.
8692 See the x86-64 PS ABI for details.
8695 static int
8696 classify_argument (machine_mode mode, const_tree type,
8697 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
8699 HOST_WIDE_INT bytes =
8700 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8701 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
8703 /* Variable sized entities are always passed/returned in memory. */
8704 if (bytes < 0)
8705 return 0;
8707 if (mode != VOIDmode
8708 && targetm.calls.must_pass_in_stack (mode, type))
8709 return 0;
8711 if (type && AGGREGATE_TYPE_P (type))
8713 int i;
8714 tree field;
8715 enum x86_64_reg_class subclasses[MAX_CLASSES];
8717 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
8718 if (bytes > 64)
8719 return 0;
8721 for (i = 0; i < words; i++)
8722 classes[i] = X86_64_NO_CLASS;
8724 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
8725 signal the memory class, so handle this as a special case. */
8726 if (!words)
8728 classes[0] = X86_64_NO_CLASS;
8729 return 1;
8732 /* Classify each field of record and merge classes. */
8733 switch (TREE_CODE (type))
8735 case RECORD_TYPE:
8736 /* And now merge the fields of structure. */
8737 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8739 if (TREE_CODE (field) == FIELD_DECL)
8741 int num;
8743 if (TREE_TYPE (field) == error_mark_node)
8744 continue;
8746 /* Bitfields are always classified as integer. Handle them
8747 early, since later code would consider them to be
8748 misaligned integers. */
8749 if (DECL_BIT_FIELD (field))
8751 for (i = (int_bit_position (field)
8752 + (bit_offset % 64)) / 8 / 8;
8753 i < ((int_bit_position (field) + (bit_offset % 64))
8754 + tree_to_shwi (DECL_SIZE (field))
8755 + 63) / 8 / 8; i++)
8756 classes[i] =
8757 merge_classes (X86_64_INTEGER_CLASS,
8758 classes[i]);
8760 else
8762 int pos;
8764 type = TREE_TYPE (field);
8766 /* Flexible array member is ignored. */
8767 if (TYPE_MODE (type) == BLKmode
8768 && TREE_CODE (type) == ARRAY_TYPE
8769 && TYPE_SIZE (type) == NULL_TREE
8770 && TYPE_DOMAIN (type) != NULL_TREE
8771 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
8772 == NULL_TREE))
8774 static bool warned;
8776 if (!warned && warn_psabi)
8778 warned = true;
8779 inform (input_location,
8780 "the ABI of passing struct with"
8781 " a flexible array member has"
8782 " changed in GCC 4.4");
8784 continue;
8786 num = classify_argument (TYPE_MODE (type), type,
8787 subclasses,
8788 (int_bit_position (field)
8789 + bit_offset) % 512);
8790 if (!num)
8791 return 0;
8792 pos = (int_bit_position (field)
8793 + (bit_offset % 64)) / 8 / 8;
8794 for (i = 0; i < num && (i + pos) < words; i++)
8795 classes[i + pos] =
8796 merge_classes (subclasses[i], classes[i + pos]);
8800 break;
8802 case ARRAY_TYPE:
8803 /* Arrays are handled as small records. */
8805 int num;
8806 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
8807 TREE_TYPE (type), subclasses, bit_offset);
8808 if (!num)
8809 return 0;
8811 /* The partial classes are now full classes. */
8812 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
8813 subclasses[0] = X86_64_SSE_CLASS;
8814 if (subclasses[0] == X86_64_INTEGERSI_CLASS
8815 && !((bit_offset % 64) == 0 && bytes == 4))
8816 subclasses[0] = X86_64_INTEGER_CLASS;
8818 for (i = 0; i < words; i++)
8819 classes[i] = subclasses[i % num];
8821 break;
8823 case UNION_TYPE:
8824 case QUAL_UNION_TYPE:
8825 /* Unions are similar to RECORD_TYPE but offset is always 0.
8827 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8829 if (TREE_CODE (field) == FIELD_DECL)
8831 int num;
8833 if (TREE_TYPE (field) == error_mark_node)
8834 continue;
8836 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
8837 TREE_TYPE (field), subclasses,
8838 bit_offset);
8839 if (!num)
8840 return 0;
8841 for (i = 0; i < num && i < words; i++)
8842 classes[i] = merge_classes (subclasses[i], classes[i]);
8845 break;
8847 default:
8848 gcc_unreachable ();
8851 if (words > 2)
8853 /* When size > 16 bytes, if the first one isn't
8854 X86_64_SSE_CLASS or any other ones aren't
8855 X86_64_SSEUP_CLASS, everything should be passed in
8856 memory. */
8857 if (classes[0] != X86_64_SSE_CLASS)
8858 return 0;
8860 for (i = 1; i < words; i++)
8861 if (classes[i] != X86_64_SSEUP_CLASS)
8862 return 0;
8865 /* Final merger cleanup. */
8866 for (i = 0; i < words; i++)
8868 /* If one class is MEMORY, everything should be passed in
8869 memory. */
8870 if (classes[i] == X86_64_MEMORY_CLASS)
8871 return 0;
8873 /* The X86_64_SSEUP_CLASS should be always preceded by
8874 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
8875 if (classes[i] == X86_64_SSEUP_CLASS
8876 && classes[i - 1] != X86_64_SSE_CLASS
8877 && classes[i - 1] != X86_64_SSEUP_CLASS)
8879 /* The first one should never be X86_64_SSEUP_CLASS. */
8880 gcc_assert (i != 0);
8881 classes[i] = X86_64_SSE_CLASS;
8884 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
8885 everything should be passed in memory. */
8886 if (classes[i] == X86_64_X87UP_CLASS
8887 && (classes[i - 1] != X86_64_X87_CLASS))
8889 static bool warned;
8891 /* The first one should never be X86_64_X87UP_CLASS. */
8892 gcc_assert (i != 0);
8893 if (!warned && warn_psabi)
8895 warned = true;
8896 inform (input_location,
8897 "the ABI of passing union with long double"
8898 " has changed in GCC 4.4");
8900 return 0;
8903 return words;
8906 /* Compute the alignment needed. We align all types to their natural
8907 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
8908 if (mode != VOIDmode && mode != BLKmode)
8910 int mode_alignment = GET_MODE_BITSIZE (mode);
8912 if (mode == XFmode)
8913 mode_alignment = 128;
8914 else if (mode == XCmode)
8915 mode_alignment = 256;
8916 if (COMPLEX_MODE_P (mode))
8917 mode_alignment /= 2;
8918 /* Misaligned fields are always returned in memory. */
8919 if (bit_offset % mode_alignment)
8920 return 0;
8923 /* For V1xx modes, just use the base mode. */
8924 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
8925 && GET_MODE_UNIT_SIZE (mode) == bytes)
8926 mode = GET_MODE_INNER (mode);
8928 /* Classification of atomic types. */
8929 switch (mode)
8931 case SDmode:
8932 case DDmode:
8933 classes[0] = X86_64_SSE_CLASS;
8934 return 1;
8935 case TDmode:
8936 classes[0] = X86_64_SSE_CLASS;
8937 classes[1] = X86_64_SSEUP_CLASS;
8938 return 2;
8939 case DImode:
8940 case SImode:
8941 case HImode:
8942 case QImode:
8943 case CSImode:
8944 case CHImode:
8945 case CQImode:
8947 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
8949 /* Analyze last 128 bits only. */
8950 size = (size - 1) & 0x7f;
8952 if (size < 32)
8954 classes[0] = X86_64_INTEGERSI_CLASS;
8955 return 1;
8957 else if (size < 64)
8959 classes[0] = X86_64_INTEGER_CLASS;
8960 return 1;
8962 else if (size < 64+32)
8964 classes[0] = X86_64_INTEGER_CLASS;
8965 classes[1] = X86_64_INTEGERSI_CLASS;
8966 return 2;
8968 else if (size < 64+64)
8970 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8971 return 2;
8973 else
8974 gcc_unreachable ();
8976 case CDImode:
8977 case TImode:
8978 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8979 return 2;
8980 case COImode:
8981 case OImode:
8982 /* OImode shouldn't be used directly. */
8983 gcc_unreachable ();
8984 case CTImode:
8985 return 0;
8986 case SFmode:
8987 if (!(bit_offset % 64))
8988 classes[0] = X86_64_SSESF_CLASS;
8989 else
8990 classes[0] = X86_64_SSE_CLASS;
8991 return 1;
8992 case DFmode:
8993 classes[0] = X86_64_SSEDF_CLASS;
8994 return 1;
8995 case XFmode:
8996 classes[0] = X86_64_X87_CLASS;
8997 classes[1] = X86_64_X87UP_CLASS;
8998 return 2;
8999 case TFmode:
9000 classes[0] = X86_64_SSE_CLASS;
9001 classes[1] = X86_64_SSEUP_CLASS;
9002 return 2;
9003 case SCmode:
9004 classes[0] = X86_64_SSE_CLASS;
9005 if (!(bit_offset % 64))
9006 return 1;
9007 else
9009 static bool warned;
9011 if (!warned && warn_psabi)
9013 warned = true;
9014 inform (input_location,
9015 "the ABI of passing structure with complex float"
9016 " member has changed in GCC 4.4");
9018 classes[1] = X86_64_SSESF_CLASS;
9019 return 2;
9021 case DCmode:
9022 classes[0] = X86_64_SSEDF_CLASS;
9023 classes[1] = X86_64_SSEDF_CLASS;
9024 return 2;
9025 case XCmode:
9026 classes[0] = X86_64_COMPLEX_X87_CLASS;
9027 return 1;
9028 case TCmode:
9029 /* This mode is larger than 16 bytes. */
9030 return 0;
9031 case V8SFmode:
9032 case V8SImode:
9033 case V32QImode:
9034 case V16HImode:
9035 case V4DFmode:
9036 case V4DImode:
9037 classes[0] = X86_64_SSE_CLASS;
9038 classes[1] = X86_64_SSEUP_CLASS;
9039 classes[2] = X86_64_SSEUP_CLASS;
9040 classes[3] = X86_64_SSEUP_CLASS;
9041 return 4;
9042 case V8DFmode:
9043 case V16SFmode:
9044 case V8DImode:
9045 case V16SImode:
9046 case V32HImode:
9047 case V64QImode:
9048 classes[0] = X86_64_SSE_CLASS;
9049 classes[1] = X86_64_SSEUP_CLASS;
9050 classes[2] = X86_64_SSEUP_CLASS;
9051 classes[3] = X86_64_SSEUP_CLASS;
9052 classes[4] = X86_64_SSEUP_CLASS;
9053 classes[5] = X86_64_SSEUP_CLASS;
9054 classes[6] = X86_64_SSEUP_CLASS;
9055 classes[7] = X86_64_SSEUP_CLASS;
9056 return 8;
9057 case V4SFmode:
9058 case V4SImode:
9059 case V16QImode:
9060 case V8HImode:
9061 case V2DFmode:
9062 case V2DImode:
9063 classes[0] = X86_64_SSE_CLASS;
9064 classes[1] = X86_64_SSEUP_CLASS;
9065 return 2;
9066 case V1TImode:
9067 case V1DImode:
9068 case V2SFmode:
9069 case V2SImode:
9070 case V4HImode:
9071 case V8QImode:
9072 classes[0] = X86_64_SSE_CLASS;
9073 return 1;
9074 case BLKmode:
9075 case VOIDmode:
9076 return 0;
9077 default:
9078 gcc_assert (VECTOR_MODE_P (mode));
9080 if (bytes > 16)
9081 return 0;
9083 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9085 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9086 classes[0] = X86_64_INTEGERSI_CLASS;
9087 else
9088 classes[0] = X86_64_INTEGER_CLASS;
9089 classes[1] = X86_64_INTEGER_CLASS;
9090 return 1 + (bytes > 8);
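/* Illustrative sketch (added commentary, not compiled): how a caller might
   use classify_argument; TYPE stands for any complete argument type.  */
#if 0
  enum x86_64_reg_class classes[MAX_CLASSES];
  int n = classify_argument (TYPE_MODE (type), type, classes, 0);
  /* For struct { double d; long l; }: n == 2 with
     classes[0] == X86_64_SSEDF_CLASS and classes[1] == X86_64_INTEGER_CLASS,
     so the value travels in one SSE and one general-purpose register.
     For struct { long double ld; }: n == 2 with X87 and X87UP classes,
     which examine_argument below turns into a memory pass for arguments.
     Aggregates larger than 64 bytes give n == 0 (memory) right away.  */
#endif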
9094 /* Examine the argument and set the number of registers required in each
9095 class. Return true iff the parameter should be passed in memory. */
9097 static bool
9098 examine_argument (machine_mode mode, const_tree type, int in_return,
9099 int *int_nregs, int *sse_nregs)
9101 enum x86_64_reg_class regclass[MAX_CLASSES];
9102 int n = classify_argument (mode, type, regclass, 0);
9104 *int_nregs = 0;
9105 *sse_nregs = 0;
9107 if (!n)
9108 return true;
9109 for (n--; n >= 0; n--)
9110 switch (regclass[n])
9112 case X86_64_INTEGER_CLASS:
9113 case X86_64_INTEGERSI_CLASS:
9114 (*int_nregs)++;
9115 break;
9116 case X86_64_SSE_CLASS:
9117 case X86_64_SSESF_CLASS:
9118 case X86_64_SSEDF_CLASS:
9119 (*sse_nregs)++;
9120 break;
9121 case X86_64_NO_CLASS:
9122 case X86_64_SSEUP_CLASS:
9123 break;
9124 case X86_64_X87_CLASS:
9125 case X86_64_X87UP_CLASS:
9126 case X86_64_COMPLEX_X87_CLASS:
9127 if (!in_return)
9128 return true;
9129 break;
9130 case X86_64_MEMORY_CLASS:
9131 gcc_unreachable ();
9134 return false;
9137 /* Construct container for the argument used by GCC interface. See
9138 FUNCTION_ARG for the detailed description. */
9140 static rtx
9141 construct_container (machine_mode mode, machine_mode orig_mode,
9142 const_tree type, int in_return, int nintregs, int nsseregs,
9143 const int *intreg, int sse_regno)
9145 /* The following variables hold the static issued_error state. */
9146 static bool issued_sse_arg_error;
9147 static bool issued_sse_ret_error;
9148 static bool issued_x87_ret_error;
9150 machine_mode tmpmode;
9151 int bytes =
9152 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9153 enum x86_64_reg_class regclass[MAX_CLASSES];
9154 int n;
9155 int i;
9156 int nexps = 0;
9157 int needed_sseregs, needed_intregs;
9158 rtx exp[MAX_CLASSES];
9159 rtx ret;
9161 n = classify_argument (mode, type, regclass, 0);
9162 if (!n)
9163 return NULL;
9164 if (examine_argument (mode, type, in_return, &needed_intregs,
9165 &needed_sseregs))
9166 return NULL;
9167 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9168 return NULL;
9170 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9171 some less clueful developer tries to use floating-point anyway. */
9172 if (needed_sseregs && !TARGET_SSE)
9174 if (in_return)
9176 if (!issued_sse_ret_error)
9178 error ("SSE register return with SSE disabled");
9179 issued_sse_ret_error = true;
9182 else if (!issued_sse_arg_error)
9184 error ("SSE register argument with SSE disabled");
9185 issued_sse_arg_error = true;
9187 return NULL;
9190 /* Likewise, error if the ABI requires us to return values in the
9191 x87 registers and the user specified -mno-80387. */
9192 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9193 for (i = 0; i < n; i++)
9194 if (regclass[i] == X86_64_X87_CLASS
9195 || regclass[i] == X86_64_X87UP_CLASS
9196 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9198 if (!issued_x87_ret_error)
9200 error ("x87 register return with x87 disabled");
9201 issued_x87_ret_error = true;
9203 return NULL;
9206 /* First construct simple cases. Avoid SCmode, since we want to use
9207 single register to pass this type. */
9208 if (n == 1 && mode != SCmode)
9209 switch (regclass[0])
9211 case X86_64_INTEGER_CLASS:
9212 case X86_64_INTEGERSI_CLASS:
9213 return gen_rtx_REG (mode, intreg[0]);
9214 case X86_64_SSE_CLASS:
9215 case X86_64_SSESF_CLASS:
9216 case X86_64_SSEDF_CLASS:
9217 if (mode != BLKmode)
9218 return gen_reg_or_parallel (mode, orig_mode,
9219 SSE_REGNO (sse_regno));
9220 break;
9221 case X86_64_X87_CLASS:
9222 case X86_64_COMPLEX_X87_CLASS:
9223 return gen_rtx_REG (mode, FIRST_STACK_REG);
9224 case X86_64_NO_CLASS:
9225 /* Zero sized array, struct or class. */
9226 return NULL;
9227 default:
9228 gcc_unreachable ();
9230 if (n == 2
9231 && regclass[0] == X86_64_SSE_CLASS
9232 && regclass[1] == X86_64_SSEUP_CLASS
9233 && mode != BLKmode)
9234 return gen_reg_or_parallel (mode, orig_mode,
9235 SSE_REGNO (sse_regno));
9236 if (n == 4
9237 && regclass[0] == X86_64_SSE_CLASS
9238 && regclass[1] == X86_64_SSEUP_CLASS
9239 && regclass[2] == X86_64_SSEUP_CLASS
9240 && regclass[3] == X86_64_SSEUP_CLASS
9241 && mode != BLKmode)
9242 return gen_reg_or_parallel (mode, orig_mode,
9243 SSE_REGNO (sse_regno));
9244 if (n == 8
9245 && regclass[0] == X86_64_SSE_CLASS
9246 && regclass[1] == X86_64_SSEUP_CLASS
9247 && regclass[2] == X86_64_SSEUP_CLASS
9248 && regclass[3] == X86_64_SSEUP_CLASS
9249 && regclass[4] == X86_64_SSEUP_CLASS
9250 && regclass[5] == X86_64_SSEUP_CLASS
9251 && regclass[6] == X86_64_SSEUP_CLASS
9252 && regclass[7] == X86_64_SSEUP_CLASS
9253 && mode != BLKmode)
9254 return gen_reg_or_parallel (mode, orig_mode,
9255 SSE_REGNO (sse_regno));
9256 if (n == 2
9257 && regclass[0] == X86_64_X87_CLASS
9258 && regclass[1] == X86_64_X87UP_CLASS)
9259 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9261 if (n == 2
9262 && regclass[0] == X86_64_INTEGER_CLASS
9263 && regclass[1] == X86_64_INTEGER_CLASS
9264 && (mode == CDImode || mode == TImode)
9265 && intreg[0] + 1 == intreg[1])
9266 return gen_rtx_REG (mode, intreg[0]);
9268 /* Otherwise figure out the entries of the PARALLEL. */
9269 for (i = 0; i < n; i++)
9271 int pos;
9273 switch (regclass[i])
9275 case X86_64_NO_CLASS:
9276 break;
9277 case X86_64_INTEGER_CLASS:
9278 case X86_64_INTEGERSI_CLASS:
9279 /* Merge TImodes on aligned occasions here too. */
9280 if (i * 8 + 8 > bytes)
9281 tmpmode
9282 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9283 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9284 tmpmode = SImode;
9285 else
9286 tmpmode = DImode;
9287 /* We've requested 24 bytes for which
9288 we don't have a mode. Use DImode. */
9289 if (tmpmode == BLKmode)
9290 tmpmode = DImode;
9291 exp [nexps++]
9292 = gen_rtx_EXPR_LIST (VOIDmode,
9293 gen_rtx_REG (tmpmode, *intreg),
9294 GEN_INT (i*8));
9295 intreg++;
9296 break;
9297 case X86_64_SSESF_CLASS:
9298 exp [nexps++]
9299 = gen_rtx_EXPR_LIST (VOIDmode,
9300 gen_rtx_REG (SFmode,
9301 SSE_REGNO (sse_regno)),
9302 GEN_INT (i*8));
9303 sse_regno++;
9304 break;
9305 case X86_64_SSEDF_CLASS:
9306 exp [nexps++]
9307 = gen_rtx_EXPR_LIST (VOIDmode,
9308 gen_rtx_REG (DFmode,
9309 SSE_REGNO (sse_regno)),
9310 GEN_INT (i*8));
9311 sse_regno++;
9312 break;
9313 case X86_64_SSE_CLASS:
9314 pos = i;
9315 switch (n)
9317 case 1:
9318 tmpmode = DImode;
9319 break;
9320 case 2:
9321 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9323 tmpmode = TImode;
9324 i++;
9326 else
9327 tmpmode = DImode;
9328 break;
9329 case 4:
9330 gcc_assert (i == 0
9331 && regclass[1] == X86_64_SSEUP_CLASS
9332 && regclass[2] == X86_64_SSEUP_CLASS
9333 && regclass[3] == X86_64_SSEUP_CLASS);
9334 tmpmode = OImode;
9335 i += 3;
9336 break;
9337 case 8:
9338 gcc_assert (i == 0
9339 && regclass[1] == X86_64_SSEUP_CLASS
9340 && regclass[2] == X86_64_SSEUP_CLASS
9341 && regclass[3] == X86_64_SSEUP_CLASS
9342 && regclass[4] == X86_64_SSEUP_CLASS
9343 && regclass[5] == X86_64_SSEUP_CLASS
9344 && regclass[6] == X86_64_SSEUP_CLASS
9345 && regclass[7] == X86_64_SSEUP_CLASS);
9346 tmpmode = XImode;
9347 i += 7;
9348 break;
9349 default:
9350 gcc_unreachable ();
9352 exp [nexps++]
9353 = gen_rtx_EXPR_LIST (VOIDmode,
9354 gen_rtx_REG (tmpmode,
9355 SSE_REGNO (sse_regno)),
9356 GEN_INT (pos*8));
9357 sse_regno++;
9358 break;
9359 default:
9360 gcc_unreachable ();
9364 /* Empty aligned struct, union or class. */
9365 if (nexps == 0)
9366 return NULL;
9368 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9369 for (i = 0; i < nexps; i++)
9370 XVECEXP (ret, 0, i) = exp [i];
9371 return ret;
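/* Illustrative result (added commentary): for an argument of type
   struct { double d; long l; }, classified above as SSEDF + INTEGER, the
   PARALLEL built here looks roughly like

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
		(expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte is taken from offset 0 into an SSE register and
   the second from offset 8 into a general-purpose register; the concrete
   registers depend on how many have already been used.  */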
9374 /* Update the data in CUM to advance over an argument of mode MODE
9375 and data type TYPE. (TYPE is null for libcalls where that information
9376 may not be available.)
9378 Return the number of integer registers advanced over. */
9380 static int
9381 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9382 const_tree type, HOST_WIDE_INT bytes,
9383 HOST_WIDE_INT words)
9385 int res = 0;
9386 bool error_p = false;
9388 if (TARGET_IAMCU)
9390 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9391 bytes in registers. */
9392 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9393 goto pass_in_reg;
9394 return res;
9397 switch (mode)
9399 default:
9400 break;
9402 case BLKmode:
9403 if (bytes < 0)
9404 break;
9405 /* FALLTHRU */
9407 case DImode:
9408 case SImode:
9409 case HImode:
9410 case QImode:
9411 pass_in_reg:
9412 cum->words += words;
9413 cum->nregs -= words;
9414 cum->regno += words;
9415 if (cum->nregs >= 0)
9416 res = words;
9417 if (cum->nregs <= 0)
9419 cum->nregs = 0;
9420 cfun->machine->arg_reg_available = false;
9421 cum->regno = 0;
9423 break;
9425 case OImode:
9426 /* OImode shouldn't be used directly. */
9427 gcc_unreachable ();
9429 case DFmode:
9430 if (cum->float_in_sse == -1)
9431 error_p = 1;
9432 if (cum->float_in_sse < 2)
9433 break;
9434 /* FALLTHRU */
9435 case SFmode:
9436 if (cum->float_in_sse == -1)
9437 error_p = 1;
9438 if (cum->float_in_sse < 1)
9439 break;
9440 /* FALLTHRU */
9442 case V8SFmode:
9443 case V8SImode:
9444 case V64QImode:
9445 case V32HImode:
9446 case V16SImode:
9447 case V8DImode:
9448 case V16SFmode:
9449 case V8DFmode:
9450 case V32QImode:
9451 case V16HImode:
9452 case V4DFmode:
9453 case V4DImode:
9454 case TImode:
9455 case V16QImode:
9456 case V8HImode:
9457 case V4SImode:
9458 case V2DImode:
9459 case V4SFmode:
9460 case V2DFmode:
9461 if (!type || !AGGREGATE_TYPE_P (type))
9463 cum->sse_words += words;
9464 cum->sse_nregs -= 1;
9465 cum->sse_regno += 1;
9466 if (cum->sse_nregs <= 0)
9468 cum->sse_nregs = 0;
9469 cum->sse_regno = 0;
9472 break;
9474 case V8QImode:
9475 case V4HImode:
9476 case V2SImode:
9477 case V2SFmode:
9478 case V1TImode:
9479 case V1DImode:
9480 if (!type || !AGGREGATE_TYPE_P (type))
9482 cum->mmx_words += words;
9483 cum->mmx_nregs -= 1;
9484 cum->mmx_regno += 1;
9485 if (cum->mmx_nregs <= 0)
9487 cum->mmx_nregs = 0;
9488 cum->mmx_regno = 0;
9491 break;
9493 if (error_p)
9495 cum->float_in_sse = 0;
9496 error ("calling %qD with SSE calling convention without "
9497 "SSE/SSE2 enabled", cum->decl);
9498 sorry ("this is a GCC bug that can be worked around by adding "
9499 "attribute used to function called");
9502 return res;
9505 static int
9506 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
9507 const_tree type, HOST_WIDE_INT words, bool named)
9509 int int_nregs, sse_nregs;
9511 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
9512 if (!named && (VALID_AVX512F_REG_MODE (mode)
9513 || VALID_AVX256_REG_MODE (mode)))
9514 return 0;
9516 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
9517 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
9519 cum->nregs -= int_nregs;
9520 cum->sse_nregs -= sse_nregs;
9521 cum->regno += int_nregs;
9522 cum->sse_regno += sse_nregs;
9523 return int_nregs;
9525 else
9527 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
9528 cum->words = ROUND_UP (cum->words, align);
9529 cum->words += words;
9530 return 0;
9534 static int
9535 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
9536 HOST_WIDE_INT words)
9538 /* Anything else should have been passed indirectly. */
9539 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
9541 cum->words += words;
9542 if (cum->nregs > 0)
9544 cum->nregs -= 1;
9545 cum->regno += 1;
9546 return 1;
9548 return 0;
9551 /* Update the data in CUM to advance over an argument of mode MODE and
9552 data type TYPE. (TYPE is null for libcalls where that information
9553 may not be available.) */
9555 static void
9556 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
9557 const_tree type, bool named)
9559 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9560 HOST_WIDE_INT bytes, words;
9561 int nregs;
9563 /* The argument of an interrupt handler is a special case and is
9564 handled in ix86_function_arg. */
9565 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9566 return;
9568 if (mode == BLKmode)
9569 bytes = int_size_in_bytes (type);
9570 else
9571 bytes = GET_MODE_SIZE (mode);
9572 words = CEIL (bytes, UNITS_PER_WORD);
9574 if (type)
9575 mode = type_natural_mode (type, NULL, false);
9577 if ((type && POINTER_BOUNDS_TYPE_P (type))
9578 || POINTER_BOUNDS_MODE_P (mode))
9580 /* If we pass bounds in BT then just update the remaining bounds count. */
9581 if (cum->bnds_in_bt)
9583 cum->bnds_in_bt--;
9584 return;
9587 /* Update the remaining number of bounds to force. */
9588 if (cum->force_bnd_pass)
9589 cum->force_bnd_pass--;
9591 cum->bnd_regno++;
9593 return;
9596 /* The first arg not going to Bounds Tables resets this counter. */
9597 cum->bnds_in_bt = 0;
9598 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
9599 the passed and received types do not match. If bounds do not follow an
9600 unnamed arg, still pretend the required number of bounds was passed. */
9601 if (cum->force_bnd_pass)
9603 cum->bnd_regno += cum->force_bnd_pass;
9604 cum->force_bnd_pass = 0;
9607 if (TARGET_64BIT)
9609 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9611 if (call_abi == MS_ABI)
9612 nregs = function_arg_advance_ms_64 (cum, bytes, words);
9613 else
9614 nregs = function_arg_advance_64 (cum, mode, type, words, named);
9616 else
9617 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
9619 /* For stdarg we expect bounds to be passed for each value passed
9620 in a register. */
9621 if (cum->stdarg)
9622 cum->force_bnd_pass = nregs;
9623 /* For pointers passed in memory we expect bounds passed in Bounds
9624 Table. */
9625 if (!nregs)
9626 cum->bnds_in_bt = chkp_type_bounds_count (type);
9629 /* Define where to put the arguments to a function.
9630 Value is zero to push the argument on the stack,
9631 or a hard register in which to store the argument.
9633 MODE is the argument's machine mode.
9634 TYPE is the data type of the argument (as a tree).
9635 This is null for libcalls where that information may
9636 not be available.
9637 CUM is a variable of type CUMULATIVE_ARGS which gives info about
9638 the preceding args and about the function being called.
9639 NAMED is nonzero if this argument is a named parameter
9640 (otherwise it is an extra parameter matching an ellipsis). */
9642 static rtx
9643 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9644 machine_mode orig_mode, const_tree type,
9645 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
9647 bool error_p = false;
9648 /* Avoid the AL settings for the Unix64 ABI. */
9649 if (mode == VOIDmode)
9650 return constm1_rtx;
9652 if (TARGET_IAMCU)
9654 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9655 bytes in registers. */
9656 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9657 goto pass_in_reg;
9658 return NULL_RTX;
9661 switch (mode)
9663 default:
9664 break;
9666 case BLKmode:
9667 if (bytes < 0)
9668 break;
9669 /* FALLTHRU */
9670 case DImode:
9671 case SImode:
9672 case HImode:
9673 case QImode:
9674 pass_in_reg:
9675 if (words <= cum->nregs)
9677 int regno = cum->regno;
9679 /* Fastcall allocates the first two DWORD (SImode) or
9680 smaller arguments to ECX and EDX if it isn't an
9681 aggregate type. */
9682 if (cum->fastcall)
9684 if (mode == BLKmode
9685 || mode == DImode
9686 || (type && AGGREGATE_TYPE_P (type)))
9687 break;
9689 /* ECX, not EAX, is the first allocated register. */
9690 if (regno == AX_REG)
9691 regno = CX_REG;
9693 return gen_rtx_REG (mode, regno);
9695 break;
9697 case DFmode:
9698 if (cum->float_in_sse == -1)
9699 error_p = 1;
9700 if (cum->float_in_sse < 2)
9701 break;
9702 /* FALLTHRU */
9703 case SFmode:
9704 if (cum->float_in_sse == -1)
9705 error_p = 1;
9706 if (cum->float_in_sse < 1)
9707 break;
9708 /* FALLTHRU */
9709 case TImode:
9710 /* In 32bit, we pass TImode in xmm registers. */
9711 case V16QImode:
9712 case V8HImode:
9713 case V4SImode:
9714 case V2DImode:
9715 case V4SFmode:
9716 case V2DFmode:
9717 if (!type || !AGGREGATE_TYPE_P (type))
9719 if (cum->sse_nregs)
9720 return gen_reg_or_parallel (mode, orig_mode,
9721 cum->sse_regno + FIRST_SSE_REG);
9723 break;
9725 case OImode:
9726 case XImode:
9727 /* OImode and XImode shouldn't be used directly. */
9728 gcc_unreachable ();
9730 case V64QImode:
9731 case V32HImode:
9732 case V16SImode:
9733 case V8DImode:
9734 case V16SFmode:
9735 case V8DFmode:
9736 case V8SFmode:
9737 case V8SImode:
9738 case V32QImode:
9739 case V16HImode:
9740 case V4DFmode:
9741 case V4DImode:
9742 if (!type || !AGGREGATE_TYPE_P (type))
9744 if (cum->sse_nregs)
9745 return gen_reg_or_parallel (mode, orig_mode,
9746 cum->sse_regno + FIRST_SSE_REG);
9748 break;
9750 case V8QImode:
9751 case V4HImode:
9752 case V2SImode:
9753 case V2SFmode:
9754 case V1TImode:
9755 case V1DImode:
9756 if (!type || !AGGREGATE_TYPE_P (type))
9758 if (cum->mmx_nregs)
9759 return gen_reg_or_parallel (mode, orig_mode,
9760 cum->mmx_regno + FIRST_MMX_REG);
9762 break;
9764 if (error_p)
9766 cum->float_in_sse = 0;
9767 error ("calling %qD with SSE calling convention without "
9768 "SSE/SSE2 enabled", cum->decl);
9769 sorry ("this is a GCC bug that can be worked around by adding "
9770 "attribute used to function called");
9773 return NULL_RTX;
9776 static rtx
9777 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9778 machine_mode orig_mode, const_tree type, bool named)
9780 /* Handle a hidden AL argument containing number of registers
9781 for varargs x86-64 functions. */
9782 if (mode == VOIDmode)
9783 return GEN_INT (cum->maybe_vaarg
9784 ? (cum->sse_nregs < 0
9785 ? X86_64_SSE_REGPARM_MAX
9786 : cum->sse_regno)
9787 : -1);
9789 switch (mode)
9791 default:
9792 break;
9794 case V8SFmode:
9795 case V8SImode:
9796 case V32QImode:
9797 case V16HImode:
9798 case V4DFmode:
9799 case V4DImode:
9800 case V16SFmode:
9801 case V16SImode:
9802 case V64QImode:
9803 case V32HImode:
9804 case V8DFmode:
9805 case V8DImode:
9806 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9807 if (!named)
9808 return NULL;
9809 break;
9812 return construct_container (mode, orig_mode, type, 0, cum->nregs,
9813 cum->sse_nregs,
9814 &x86_64_int_parameter_registers [cum->regno],
9815 cum->sse_regno);
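/* Illustrative note (added commentary): the VOIDmode case above is what
   makes a variadic call such as

     printf ("%f\n", 3.14);

   load %al with the number of vector registers actually used (here 1)
   before the call, as the x86-64 SYSV ABI requires; a prototyped,
   non-variadic callee gets -1 and no %al setup.  */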
9818 static rtx
9819 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9820 machine_mode orig_mode, bool named,
9821 HOST_WIDE_INT bytes)
9823 unsigned int regno;
9825 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
9826 We use the value -2 to specify that the current function call is MS ABI. */
9827 if (mode == VOIDmode)
9828 return GEN_INT (-2);
9830 /* If we've run out of registers, it goes on the stack. */
9831 if (cum->nregs == 0)
9832 return NULL_RTX;
9834 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
9836 /* Only floating point modes are passed in anything but integer regs. */
9837 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
9839 if (named)
9840 regno = cum->regno + FIRST_SSE_REG;
9841 else
9843 rtx t1, t2;
9845 /* Unnamed floating parameters are passed in both the
9846 SSE and integer registers. */
9847 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
9848 t2 = gen_rtx_REG (mode, regno);
9849 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
9850 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
9851 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
9854 /* Handle aggregate types passed in registers. */
9855 if (orig_mode == BLKmode)
9857 if (bytes > 0 && bytes <= 8)
9858 mode = (bytes > 4 ? DImode : SImode);
9859 if (mode == BLKmode)
9860 mode = DImode;
9863 return gen_reg_or_parallel (mode, orig_mode, regno);
9866 /* Return where to put the arguments to a function.
9867 Return zero to push the argument on the stack, or a hard register in which to store the argument.
9869 MODE is the argument's machine mode. TYPE is the data type of the
9870 argument. It is null for libcalls where that information may not be
9871 available. CUM gives information about the preceding args and about
9872 the function being called. NAMED is nonzero if this argument is a
9873 named parameter (otherwise it is an extra parameter matching an
9874 ellipsis). */
9876 static rtx
9877 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
9878 const_tree type, bool named)
9880 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9881 machine_mode mode = omode;
9882 HOST_WIDE_INT bytes, words;
9883 rtx arg;
9885 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9887 gcc_assert (type != NULL_TREE);
9888 if (POINTER_TYPE_P (type))
9890 /* This is the pointer argument. */
9891 gcc_assert (TYPE_MODE (type) == Pmode);
9892 if (cfun->machine->func_type == TYPE_INTERRUPT)
9893 /* -WORD(AP) in the current frame in interrupt handler. */
9894 arg = plus_constant (Pmode, arg_pointer_rtx,
9895 -UNITS_PER_WORD);
9896 else
9897 /* (AP) in the current frame in exception handler. */
9898 arg = arg_pointer_rtx;
9900 else
9902 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
9903 && TREE_CODE (type) == INTEGER_TYPE
9904 && TYPE_MODE (type) == word_mode);
9905 /* The integer argument is the error code at -WORD(AP) in
9906 the current frame in exception handler. */
9907 arg = gen_rtx_MEM (word_mode,
9908 plus_constant (Pmode,
9909 arg_pointer_rtx,
9910 -UNITS_PER_WORD));
9912 return arg;
9915 /* All pointer bounds arguments are handled separately here. */
9916 if ((type && POINTER_BOUNDS_TYPE_P (type))
9917 || POINTER_BOUNDS_MODE_P (mode))
9919 /* Return NULL if bounds are forced to go in Bounds Table. */
9920 if (cum->bnds_in_bt)
9921 arg = NULL;
9922 /* Return the next available bound reg if any. */
9923 else if (cum->bnd_regno <= LAST_BND_REG)
9924 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
9925 /* Return the next special slot number otherwise. */
9926 else
9927 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
9929 return arg;
9932 if (mode == BLKmode)
9933 bytes = int_size_in_bytes (type);
9934 else
9935 bytes = GET_MODE_SIZE (mode);
9936 words = CEIL (bytes, UNITS_PER_WORD);
9938 /* To simplify the code below, represent vector types with a vector mode
9939 even if MMX/SSE are not active. */
9940 if (type && TREE_CODE (type) == VECTOR_TYPE)
9941 mode = type_natural_mode (type, cum, false);
9943 if (TARGET_64BIT)
9945 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9947 if (call_abi == MS_ABI)
9948 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
9949 else
9950 arg = function_arg_64 (cum, mode, omode, type, named);
9952 else
9953 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
9955 return arg;
9958 /* A C expression that indicates when an argument must be passed by
9959 reference. If nonzero for an argument, a copy of that argument is
9960 made in memory and a pointer to the argument is passed instead of
9961 the argument itself. The pointer is passed in whatever way is
9962 appropriate for passing a pointer to that type. */
9964 static bool
9965 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
9966 const_tree type, bool)
9968 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9970 /* Bounds are never passed by reference. */
9971 if ((type && POINTER_BOUNDS_TYPE_P (type))
9972 || POINTER_BOUNDS_MODE_P (mode))
9973 return false;
9975 if (TARGET_64BIT)
9977 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9979 /* See Windows x64 Software Convention. */
9980 if (call_abi == MS_ABI)
9982 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
9984 if (type)
9986 /* Arrays are passed by reference. */
9987 if (TREE_CODE (type) == ARRAY_TYPE)
9988 return true;
9990 if (RECORD_OR_UNION_TYPE_P (type))
9992 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
9993 are passed by reference. */
9994 msize = int_size_in_bytes (type);
9998 /* __m128 is passed by reference. */
9999 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10001 else if (type && int_size_in_bytes (type) == -1)
10002 return true;
10005 return false;
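/* Illustrative examples (added commentary) for the MS_ABI rules above:
   an int, a pointer, or a struct of exactly 1, 2, 4 or 8 bytes is passed
   by value; __m128, a 16-byte struct, or a 12-byte struct is passed by
   reference, i.e. a copy is made and its address is passed in the
   argument slot instead.  */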
10008 /* Return true when TYPE should be 128bit aligned for 32bit argument
10009 passing ABI. XXX: This function is obsolete and is only used for
10010 checking psABI compatibility with previous versions of GCC. */
10012 static bool
10013 ix86_compat_aligned_value_p (const_tree type)
10015 machine_mode mode = TYPE_MODE (type);
10016 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10017 || mode == TDmode
10018 || mode == TFmode
10019 || mode == TCmode)
10020 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10021 return true;
10022 if (TYPE_ALIGN (type) < 128)
10023 return false;
10025 if (AGGREGATE_TYPE_P (type))
10027 /* Walk the aggregates recursively. */
10028 switch (TREE_CODE (type))
10030 case RECORD_TYPE:
10031 case UNION_TYPE:
10032 case QUAL_UNION_TYPE:
10034 tree field;
10036 /* Walk all the structure fields. */
10037 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10039 if (TREE_CODE (field) == FIELD_DECL
10040 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10041 return true;
10043 break;
10046 case ARRAY_TYPE:
10047 /* Just for use if some language passes arrays by value. */
10048 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10049 return true;
10050 break;
10052 default:
10053 gcc_unreachable ();
10056 return false;
10059 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10060 XXX: This function is obsolete and is only used for checking psABI
10061 compatibility with previous versions of GCC. */
10063 static unsigned int
10064 ix86_compat_function_arg_boundary (machine_mode mode,
10065 const_tree type, unsigned int align)
10067 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10068 natural boundaries. */
10069 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10071 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10072 make an exception for SSE modes since these require 128bit
10073 alignment.
10075 The handling here differs from field_alignment. ICC aligns MMX
10076 arguments to 4 byte boundaries, while structure fields are aligned
10077 to 8 byte boundaries. */
10078 if (!type)
10080 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10081 align = PARM_BOUNDARY;
10083 else
10085 if (!ix86_compat_aligned_value_p (type))
10086 align = PARM_BOUNDARY;
10089 if (align > BIGGEST_ALIGNMENT)
10090 align = BIGGEST_ALIGNMENT;
10091 return align;
10094 /* Return true when TYPE should be 128bit aligned for 32bit argument
10095 passing ABI. */
10097 static bool
10098 ix86_contains_aligned_value_p (const_tree type)
10100 machine_mode mode = TYPE_MODE (type);
10102 if (mode == XFmode || mode == XCmode)
10103 return false;
10105 if (TYPE_ALIGN (type) < 128)
10106 return false;
10108 if (AGGREGATE_TYPE_P (type))
10110 /* Walk the aggregates recursively. */
10111 switch (TREE_CODE (type))
10113 case RECORD_TYPE:
10114 case UNION_TYPE:
10115 case QUAL_UNION_TYPE:
10117 tree field;
10119 /* Walk all the structure fields. */
10120 for (field = TYPE_FIELDS (type);
10121 field;
10122 field = DECL_CHAIN (field))
10124 if (TREE_CODE (field) == FIELD_DECL
10125 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10126 return true;
10128 break;
10131 case ARRAY_TYPE:
10132 /* Just for use if some language passes arrays by value. */
10133 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10134 return true;
10135 break;
10137 default:
10138 gcc_unreachable ();
10141 else
10142 return TYPE_ALIGN (type) >= 128;
10144 return false;
10147 /* Gives the alignment boundary, in bits, of an argument with the
10148 specified mode and type. */
10150 static unsigned int
10151 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10153 unsigned int align;
10154 if (type)
10156 /* Since the main variant type is used for the call, convert the
10157 type to its main variant. */
10158 type = TYPE_MAIN_VARIANT (type);
10159 align = TYPE_ALIGN (type);
10161 else
10162 align = GET_MODE_ALIGNMENT (mode);
10163 if (align < PARM_BOUNDARY)
10164 align = PARM_BOUNDARY;
10165 else
10167 static bool warned;
10168 unsigned int saved_align = align;
10170 if (!TARGET_64BIT)
10172 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10173 if (!type)
10175 if (mode == XFmode || mode == XCmode)
10176 align = PARM_BOUNDARY;
10178 else if (!ix86_contains_aligned_value_p (type))
10179 align = PARM_BOUNDARY;
10181 if (align < 128)
10182 align = PARM_BOUNDARY;
10185 if (warn_psabi
10186 && !warned
10187 && align != ix86_compat_function_arg_boundary (mode, type,
10188 saved_align))
10190 warned = true;
10191 inform (input_location,
10192 "The ABI for passing parameters with %d-byte"
10193 " alignment has changed in GCC 4.6",
10194 align / BITS_PER_UNIT);
10198 return align;
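/* Illustrative examples (added commentary): with the 32-bit ABI a plain
   double argument ends up with PARM_BOUNDARY (word) alignment on the
   stack, while an __m128 or another 16-byte-aligned SSE value keeps its
   128-bit alignment; the -Wpsabi note above fires when this result differs
   from what ix86_compat_function_arg_boundary computed for GCC releases
   before 4.6.  */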
10201 /* Return true if N is a possible register number of function value. */
10203 static bool
10204 ix86_function_value_regno_p (const unsigned int regno)
10206 switch (regno)
10208 case AX_REG:
10209 return true;
10210 case DX_REG:
10211 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10212 case DI_REG:
10213 case SI_REG:
10214 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10216 case BND0_REG:
10217 case BND1_REG:
10218 return chkp_function_instrumented_p (current_function_decl);
10220 /* Complex values are returned in %st(0)/%st(1) pair. */
10221 case ST0_REG:
10222 case ST1_REG:
10223 /* TODO: The function should depend on current function ABI but
10224 builtins.c would need updating then. Therefore we use the
10225 default ABI. */
10226 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10227 return false;
10228 return TARGET_FLOAT_RETURNS_IN_80387;
10230 /* Complex values are returned in %xmm0/%xmm1 pair. */
10231 case XMM0_REG:
10232 case XMM1_REG:
10233 return TARGET_SSE;
10235 case MM0_REG:
10236 if (TARGET_MACHO || TARGET_64BIT)
10237 return false;
10238 return TARGET_MMX;
10241 return false;
10244 /* Define how to find the value returned by a function.
10245 VALTYPE is the data type of the value (as a tree).
10246 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10247 otherwise, FUNC is 0. */
10249 static rtx
10250 function_value_32 (machine_mode orig_mode, machine_mode mode,
10251 const_tree fntype, const_tree fn)
10253 unsigned int regno;
10255 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10256 we normally prevent this case when mmx is not available. However
10257 some ABIs may require the result to be returned like DImode. */
10258 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10259 regno = FIRST_MMX_REG;
10261 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10262 we prevent this case when sse is not available. However some ABIs
10263 may require the result to be returned like integer TImode. */
10264 else if (mode == TImode
10265 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10266 regno = FIRST_SSE_REG;
10268 /* 32-byte vector modes in %ymm0. */
10269 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10270 regno = FIRST_SSE_REG;
10272 /* 64-byte vector modes in %zmm0. */
10273 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10274 regno = FIRST_SSE_REG;
10276 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10277 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10278 regno = FIRST_FLOAT_REG;
10279 else
10280 /* Most things go in %eax. */
10281 regno = AX_REG;
10283 /* Override FP return register with %xmm0 for local functions when
10284 SSE math is enabled or for functions with sseregparm attribute. */
10285 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10287 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10288 if (sse_level == -1)
10290 error ("calling %qD with SSE calling convention without "
10291 "SSE/SSE2 enabled", fn);
10292 sorry ("this is a GCC bug that can be worked around by adding "
10293 "attribute used to function called");
10295 else if ((sse_level >= 1 && mode == SFmode)
10296 || (sse_level == 2 && mode == DFmode))
10297 regno = FIRST_SSE_REG;
10300 /* OImode shouldn't be used directly. */
10301 gcc_assert (mode != OImode);
10303 return gen_rtx_REG (orig_mode, regno);
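/* Illustrative summary (added commentary): with the 32-bit ABI a float or
   double normally comes back in %st(0); for local functions built with SSE
   math, or functions carrying the sseregparm attribute, the sse_level test
   above may redirect it to %xmm0.  Integers and other "most things" come
   back in %eax (a DImode value uses the %eax/%edx pair).  */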
10306 static rtx
10307 function_value_64 (machine_mode orig_mode, machine_mode mode,
10308 const_tree valtype)
10310 rtx ret;
10312 /* Handle libcalls, which don't provide a type node. */
10313 if (valtype == NULL)
10315 unsigned int regno;
10317 switch (mode)
10319 case SFmode:
10320 case SCmode:
10321 case DFmode:
10322 case DCmode:
10323 case TFmode:
10324 case SDmode:
10325 case DDmode:
10326 case TDmode:
10327 regno = FIRST_SSE_REG;
10328 break;
10329 case XFmode:
10330 case XCmode:
10331 regno = FIRST_FLOAT_REG;
10332 break;
10333 case TCmode:
10334 return NULL;
10335 default:
10336 regno = AX_REG;
10339 return gen_rtx_REG (mode, regno);
10341 else if (POINTER_TYPE_P (valtype))
10343 /* Pointers are always returned in word_mode. */
10344 mode = word_mode;
10347 ret = construct_container (mode, orig_mode, valtype, 1,
10348 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10349 x86_64_int_return_registers, 0);
10351 /* For zero sized structures, construct_container returns NULL, but we
10352 need to keep the rest of the compiler happy by returning a meaningful value. */
10353 if (!ret)
10354 ret = gen_rtx_REG (orig_mode, AX_REG);
10356 return ret;
10359 static rtx
10360 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10361 const_tree valtype)
10363 unsigned int regno = AX_REG;
10365 if (TARGET_SSE)
10367 switch (GET_MODE_SIZE (mode))
10369 case 16:
10370 if (valtype != NULL_TREE
10371 && !VECTOR_INTEGER_TYPE_P (valtype)
10373 && !INTEGRAL_TYPE_P (valtype)
10374 && !VECTOR_FLOAT_TYPE_P (valtype))
10375 break;
10376 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10377 && !COMPLEX_MODE_P (mode))
10378 regno = FIRST_SSE_REG;
10379 break;
10380 case 8:
10381 case 4:
10382 if (mode == SFmode || mode == DFmode)
10383 regno = FIRST_SSE_REG;
10384 break;
10385 default:
10386 break;
10389 return gen_rtx_REG (orig_mode, regno);
10392 static rtx
10393 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10394 machine_mode orig_mode, machine_mode mode)
10396 const_tree fn, fntype;
10398 fn = NULL_TREE;
10399 if (fntype_or_decl && DECL_P (fntype_or_decl))
10400 fn = fntype_or_decl;
10401 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10403 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10404 || POINTER_BOUNDS_MODE_P (mode))
10405 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10406 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10407 return function_value_ms_64 (orig_mode, mode, valtype);
10408 else if (TARGET_64BIT)
10409 return function_value_64 (orig_mode, mode, valtype);
10410 else
10411 return function_value_32 (orig_mode, mode, fntype, fn);
10414 static rtx
10415 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10417 machine_mode mode, orig_mode;
10419 orig_mode = TYPE_MODE (valtype);
10420 mode = type_natural_mode (valtype, NULL, true);
10421 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10424 /* Return an RTX representing a place where a function returns
10425 or receives pointer bounds, or NULL if no bounds are returned.
10427 VALTYPE is a data type of a value returned by the function.
10429 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10430 or FUNCTION_TYPE of the function.
10432 If OUTGOING is false, return a place in which the caller will
10433 see the return value. Otherwise, return a place where a
10434 function returns a value. */
10436 static rtx
10437 ix86_function_value_bounds (const_tree valtype,
10438 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10439 bool outgoing ATTRIBUTE_UNUSED)
10441 rtx res = NULL_RTX;
10443 if (BOUNDED_TYPE_P (valtype))
10444 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10445 else if (chkp_type_has_pointer (valtype))
10447 bitmap slots;
10448 rtx bounds[2];
10449 bitmap_iterator bi;
10450 unsigned i, bnd_no = 0;
10452 bitmap_obstack_initialize (NULL);
10453 slots = BITMAP_ALLOC (NULL);
10454 chkp_find_bound_slots (valtype, slots);
10456 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10458 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10459 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10460 gcc_assert (bnd_no < 2);
10461 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10464 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10466 BITMAP_FREE (slots);
10467 bitmap_obstack_release (NULL);
10469 else
10470 res = NULL_RTX;
10472 return res;
10475 /* Pointer function arguments and return values are promoted to
10476 word_mode for normal functions. */
10478 static machine_mode
10479 ix86_promote_function_mode (const_tree type, machine_mode mode,
10480 int *punsignedp, const_tree fntype,
10481 int for_return)
10483 if (cfun->machine->func_type == TYPE_NORMAL
10484 && type != NULL_TREE
10485 && POINTER_TYPE_P (type))
10487 *punsignedp = POINTERS_EXTEND_UNSIGNED;
10488 return word_mode;
10490 return default_promote_function_mode (type, mode, punsignedp, fntype,
10491 for_return);
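/* Illustrative note (added commentary): this matters mainly for -mx32,
   where pointers are 32 bits but word_mode is DImode; a pointer argument
   or return value of a normal function is then zero-extended
   (POINTERS_EXTEND_UNSIGNED) and handled in the full 64-bit register.
   Interrupt and exception handlers keep the default promotion.  */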
10494 /* Return true if a structure, union or array with MODE containing FIELD
10495 should be accessed using BLKmode. */
10497 static bool
10498 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
10500 /* Union with XFmode must be in BLKmode. */
10501 return (mode == XFmode
10502 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
10503 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
10507 ix86_libcall_value (machine_mode mode)
10509 return ix86_function_value_1 (NULL, NULL, mode, mode);
10512 /* Return true iff type is returned in memory. */
10514 static bool
10515 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
10517 #ifdef SUBTARGET_RETURN_IN_MEMORY
10518 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
10519 #else
10520 const machine_mode mode = type_natural_mode (type, NULL, true);
10521 HOST_WIDE_INT size;
10523 if (POINTER_BOUNDS_TYPE_P (type))
10524 return false;
10526 if (TARGET_64BIT)
10528 if (ix86_function_type_abi (fntype) == MS_ABI)
10530 size = int_size_in_bytes (type);
10532 /* __m128 is returned in xmm0. */
10533 if ((!type || VECTOR_INTEGER_TYPE_P (type)
10534 || INTEGRAL_TYPE_P (type)
10535 || VECTOR_FLOAT_TYPE_P (type))
10536 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10537 && !COMPLEX_MODE_P (mode)
10538 && (GET_MODE_SIZE (mode) == 16 || size == 16))
10539 return false;
10541 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
10542 return size != 1 && size != 2 && size != 4 && size != 8;
10544 else
10546 int needed_intregs, needed_sseregs;
10548 return examine_argument (mode, type, 1,
10549 &needed_intregs, &needed_sseregs);
10552 else
10554 size = int_size_in_bytes (type);
10556 /* Intel MCU psABI returns scalars and aggregates no larger than 8
10557 bytes in registers. */
10558 if (TARGET_IAMCU)
10559 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
10561 if (mode == BLKmode)
10562 return true;
10564 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
10565 return false;
10567 if (VECTOR_MODE_P (mode) || mode == TImode)
10569 /* User-created vectors small enough to fit in EAX. */
10570 if (size < 8)
10571 return false;
10573 /* Unless the ABI prescribes otherwise,
10574 MMX/3dNow values are returned in MM0 if available. */
10576 if (size == 8)
10577 return TARGET_VECT8_RETURNS || !TARGET_MMX;
10579 /* SSE values are returned in XMM0 if available. */
10580 if (size == 16)
10581 return !TARGET_SSE;
10583 /* AVX values are returned in YMM0 if available. */
10584 if (size == 32)
10585 return !TARGET_AVX;
10587 /* AVX512F values are returned in ZMM0 if available. */
10588 if (size == 64)
10589 return !TARGET_AVX512F;
10592 if (mode == XFmode)
10593 return false;
10595 if (size > 12)
10596 return true;
10598 /* OImode shouldn't be used directly. */
10599 gcc_assert (mode != OImode);
10601 return false;
10603 #endif
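/* Worked examples for the 64-bit SysV path above, following the psABI
   classification (illustration only):

     struct s1 { long a, b; };      -- 16 bytes, two INTEGER eightbytes
     struct s2 { long a, b, c; };   -- 24 bytes

   s1 is returned in registers (RAX:RDX), so this function returns false;
   s2 is classified MEMORY and is returned via a hidden pointer, so this
   function returns true.  */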
10607 /* Create the va_list data type. */
10609 static tree
10610 ix86_build_builtin_va_list_64 (void)
10612 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
10614 record = lang_hooks.types.make_type (RECORD_TYPE);
10615 type_decl = build_decl (BUILTINS_LOCATION,
10616 TYPE_DECL, get_identifier ("__va_list_tag"), record);
10618 f_gpr = build_decl (BUILTINS_LOCATION,
10619 FIELD_DECL, get_identifier ("gp_offset"),
10620 unsigned_type_node);
10621 f_fpr = build_decl (BUILTINS_LOCATION,
10622 FIELD_DECL, get_identifier ("fp_offset"),
10623 unsigned_type_node);
10624 f_ovf = build_decl (BUILTINS_LOCATION,
10625 FIELD_DECL, get_identifier ("overflow_arg_area"),
10626 ptr_type_node);
10627 f_sav = build_decl (BUILTINS_LOCATION,
10628 FIELD_DECL, get_identifier ("reg_save_area"),
10629 ptr_type_node);
10631 va_list_gpr_counter_field = f_gpr;
10632 va_list_fpr_counter_field = f_fpr;
10634 DECL_FIELD_CONTEXT (f_gpr) = record;
10635 DECL_FIELD_CONTEXT (f_fpr) = record;
10636 DECL_FIELD_CONTEXT (f_ovf) = record;
10637 DECL_FIELD_CONTEXT (f_sav) = record;
10639 TYPE_STUB_DECL (record) = type_decl;
10640 TYPE_NAME (record) = type_decl;
10641 TYPE_FIELDS (record) = f_gpr;
10642 DECL_CHAIN (f_gpr) = f_fpr;
10643 DECL_CHAIN (f_fpr) = f_ovf;
10644 DECL_CHAIN (f_ovf) = f_sav;
10646 layout_type (record);
10648 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
10649 NULL_TREE, TYPE_ATTRIBUTES (record));
10651 /* The correct type is an array type of one element. */
10652 return build_array_type (record, build_index_type (size_zero_node));
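/* For reference, the record built above matches the layout required by the
   SysV AMD64 psABI; in C terms the result is roughly (sketch only):

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;    -- byte offset of next GP arg in save area
       unsigned int fp_offset;    -- byte offset of next SSE arg in save area
       void *overflow_arg_area;   -- next stack-passed argument
       void *reg_save_area;       -- start of the register save area
     } __builtin_va_list[1];  */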
10655 /* Set up the builtin va_list data type and, for 64-bit, the additional
10656 calling convention specific va_list data types. */
10658 static tree
10659 ix86_build_builtin_va_list (void)
10661 if (TARGET_64BIT)
10663 /* Initialize ABI specific va_list builtin types.
10665 In lto1, we can encounter two va_list types:
10666 - one as a result of the type-merge across TUs, and
10667 - the one constructed here.
10668 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
10669 a type identity check in canonical_va_list_type based on
10670 TYPE_MAIN_VARIANT (which we used to have) will not work.
10671 Instead, we tag each va_list_type_node with its unique attribute, and
10672 look for the attribute in the type identity check in
10673 canonical_va_list_type.
10675 Tagging sysv_va_list_type_node directly with the attribute is
10676 problematic since it's an array of one record, which will decay into a
10677 pointer to record when used as a parameter (see build_va_arg comments for
10678 an example), dropping the attribute in the process. So we tag the
10679 record instead. */
10681 /* For SYSV_ABI we use an array of one record. */
10682 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
10684 /* For MS_ABI we use plain pointer to argument area. */
10685 tree char_ptr_type = build_pointer_type (char_type_node);
10686 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
10687 TYPE_ATTRIBUTES (char_ptr_type));
10688 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
10690 return ((ix86_abi == MS_ABI)
10691 ? ms_va_list_type_node
10692 : sysv_va_list_type_node);
10694 else
10696 /* For i386 we use plain pointer to argument area. */
10697 return build_pointer_type (char_type_node);
10701 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
10703 static void
10704 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
10706 rtx save_area, mem;
10707 alias_set_type set;
10708 int i, max;
10710 /* GPR size of varargs save area. */
10711 if (cfun->va_list_gpr_size)
10712 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
10713 else
10714 ix86_varargs_gpr_size = 0;
10716 /* FPR size of varargs save area. We don't need it if we don't pass
10717 anything in SSE registers. */
10718 if (TARGET_SSE && cfun->va_list_fpr_size)
10719 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
10720 else
10721 ix86_varargs_fpr_size = 0;
10723 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
10724 return;
10726 save_area = frame_pointer_rtx;
10727 set = get_varargs_alias_set ();
10729 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10730 if (max > X86_64_REGPARM_MAX)
10731 max = X86_64_REGPARM_MAX;
10733 for (i = cum->regno; i < max; i++)
10735 mem = gen_rtx_MEM (word_mode,
10736 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
10737 MEM_NOTRAP_P (mem) = 1;
10738 set_mem_alias_set (mem, set);
10739 emit_move_insn (mem,
10740 gen_rtx_REG (word_mode,
10741 x86_64_int_parameter_registers[i]));
10744 if (ix86_varargs_fpr_size)
10746 machine_mode smode;
10747 rtx_code_label *label;
10748 rtx test;
10750 /* Now emit code to save SSE registers. The AX parameter contains the number
10751 of SSE parameter registers used to call this function, though all we
10752 actually check here is the zero/non-zero status. */
10754 label = gen_label_rtx ();
10755 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
10756 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
10757 label));
10759 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
10760 we used movdqa (i.e. TImode) instead? Perhaps even better would
10761 be if we could determine the real mode of the data, via a hook
10762 into pass_stdarg. Ignore all that for now. */
10763 smode = V4SFmode;
10764 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
10765 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
10767 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
10768 if (max > X86_64_SSE_REGPARM_MAX)
10769 max = X86_64_SSE_REGPARM_MAX;
10771 for (i = cum->sse_regno; i < max; ++i)
10773 mem = plus_constant (Pmode, save_area,
10774 i * 16 + ix86_varargs_gpr_size);
10775 mem = gen_rtx_MEM (smode, mem);
10776 MEM_NOTRAP_P (mem) = 1;
10777 set_mem_alias_set (mem, set);
10778 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
10780 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
10783 emit_label (label);
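/* Resulting layout of the register save area filled in above, assuming the
   usual X86_64_REGPARM_MAX == 6 and X86_64_SSE_REGPARM_MAX == 8 (sketch):

     bytes   0 ..  47 : rdi, rsi, rdx, rcx, r8, r9   (6 x 8 bytes)
     bytes  48 .. 175 : xmm0 .. xmm7                 (8 x 16 bytes)

   The gp_offset and fp_offset fields of the va_list index into this
   area.  */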
10787 static void
10788 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
10790 alias_set_type set = get_varargs_alias_set ();
10791 int i;
10793 /* Reset to zero, as there might have been a sysv va_arg used
10794 before. */
10795 ix86_varargs_gpr_size = 0;
10796 ix86_varargs_fpr_size = 0;
10798 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
10800 rtx reg, mem;
10802 mem = gen_rtx_MEM (Pmode,
10803 plus_constant (Pmode, virtual_incoming_args_rtx,
10804 i * UNITS_PER_WORD));
10805 MEM_NOTRAP_P (mem) = 1;
10806 set_mem_alias_set (mem, set);
10808 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
10809 emit_move_insn (mem, reg);
10813 static void
10814 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10815 tree type, int *, int no_rtl)
10817 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10818 CUMULATIVE_ARGS next_cum;
10819 tree fntype;
10821 /* This argument doesn't appear to be used anymore, which is good,
10822 because the old code here didn't suppress rtl generation. */
10823 gcc_assert (!no_rtl);
10825 if (!TARGET_64BIT)
10826 return;
10828 fntype = TREE_TYPE (current_function_decl);
10830 /* For varargs, we do not want to skip the dummy va_dcl argument.
10831 For stdargs, we do want to skip the last named argument. */
10832 next_cum = *cum;
10833 if (stdarg_p (fntype))
10834 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10835 true);
10837 if (cum->call_abi == MS_ABI)
10838 setup_incoming_varargs_ms_64 (&next_cum);
10839 else
10840 setup_incoming_varargs_64 (&next_cum);
10843 static void
10844 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
10845 enum machine_mode mode,
10846 tree type,
10847 int *pretend_size ATTRIBUTE_UNUSED,
10848 int no_rtl)
10850 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10851 CUMULATIVE_ARGS next_cum;
10852 tree fntype;
10853 rtx save_area;
10854 int bnd_reg, i, max;
10856 gcc_assert (!no_rtl);
10858 /* Do nothing if we use plain pointer to argument area. */
10859 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
10860 return;
10862 fntype = TREE_TYPE (current_function_decl);
10864 /* For varargs, we do not want to skip the dummy va_dcl argument.
10865 For stdargs, we do want to skip the last named argument. */
10866 next_cum = *cum;
10867 if (stdarg_p (fntype))
10868 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10869 true);
10870 save_area = frame_pointer_rtx;
10872 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10873 if (max > X86_64_REGPARM_MAX)
10874 max = X86_64_REGPARM_MAX;
10876 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
10877 if (chkp_function_instrumented_p (current_function_decl))
10878 for (i = cum->regno; i < max; i++)
10880 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
10881 rtx ptr = gen_rtx_REG (Pmode,
10882 x86_64_int_parameter_registers[i]);
10883 rtx bounds;
10885 if (bnd_reg <= LAST_BND_REG)
10886 bounds = gen_rtx_REG (BNDmode, bnd_reg);
10887 else
10889 rtx ldx_addr =
10890 plus_constant (Pmode, arg_pointer_rtx,
10891 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
10892 bounds = gen_reg_rtx (BNDmode);
10893 emit_insn (BNDmode == BND64mode
10894 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
10895 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
10898 emit_insn (BNDmode == BND64mode
10899 ? gen_bnd64_stx (addr, ptr, bounds)
10900 : gen_bnd32_stx (addr, ptr, bounds));
10902 bnd_reg++;
10907 /* Return true if TYPE is a va_list that is represented as a plain char pointer. */
10909 static bool
10910 is_va_list_char_pointer (tree type)
10912 tree canonic;
10914 /* For 32-bit it is always true. */
10915 if (!TARGET_64BIT)
10916 return true;
10917 canonic = ix86_canonical_va_list_type (type);
10918 return (canonic == ms_va_list_type_node
10919 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
10922 /* Implement va_start. */
10924 static void
10925 ix86_va_start (tree valist, rtx nextarg)
10927 HOST_WIDE_INT words, n_gpr, n_fpr;
10928 tree f_gpr, f_fpr, f_ovf, f_sav;
10929 tree gpr, fpr, ovf, sav, t;
10930 tree type;
10931 rtx ovf_rtx;
10933 if (flag_split_stack
10934 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10936 unsigned int scratch_regno;
10938 /* When we are splitting the stack, we can't refer to the stack
10939 arguments using internal_arg_pointer, because they may be on
10940 the old stack. The split stack prologue will arrange to
10941 leave a pointer to the old stack arguments in a scratch
10942 register, which we here copy to a pseudo-register. The split
10943 stack prologue can't set the pseudo-register directly because
10944 it (the prologue) runs before any registers have been saved. */
10946 scratch_regno = split_stack_prologue_scratch_regno ();
10947 if (scratch_regno != INVALID_REGNUM)
10949 rtx reg;
10950 rtx_insn *seq;
10952 reg = gen_reg_rtx (Pmode);
10953 cfun->machine->split_stack_varargs_pointer = reg;
10955 start_sequence ();
10956 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
10957 seq = get_insns ();
10958 end_sequence ();
10960 push_topmost_sequence ();
10961 emit_insn_after (seq, entry_of_function ());
10962 pop_topmost_sequence ();
10966 /* Only the 64-bit target needs something special. */
10967 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10969 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10970 std_expand_builtin_va_start (valist, nextarg);
10971 else
10973 rtx va_r, next;
10975 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
10976 next = expand_binop (ptr_mode, add_optab,
10977 cfun->machine->split_stack_varargs_pointer,
10978 crtl->args.arg_offset_rtx,
10979 NULL_RTX, 0, OPTAB_LIB_WIDEN);
10980 convert_move (va_r, next, 0);
10982 /* Store zero bounds for va_list. */
10983 if (chkp_function_instrumented_p (current_function_decl))
10984 chkp_expand_bounds_reset_for_mem (valist,
10985 make_tree (TREE_TYPE (valist),
10986 next));
10989 return;
10992 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10993 f_fpr = DECL_CHAIN (f_gpr);
10994 f_ovf = DECL_CHAIN (f_fpr);
10995 f_sav = DECL_CHAIN (f_ovf);
10997 valist = build_simple_mem_ref (valist);
10998 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
10999 /* The following should be folded into the MEM_REF offset. */
11000 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11001 f_gpr, NULL_TREE);
11002 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11003 f_fpr, NULL_TREE);
11004 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11005 f_ovf, NULL_TREE);
11006 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11007 f_sav, NULL_TREE);
11009 /* Count number of gp and fp argument registers used. */
11010 words = crtl->args.info.words;
11011 n_gpr = crtl->args.info.regno;
11012 n_fpr = crtl->args.info.sse_regno;
11014 if (cfun->va_list_gpr_size)
11016 type = TREE_TYPE (gpr);
11017 t = build2 (MODIFY_EXPR, type,
11018 gpr, build_int_cst (type, n_gpr * 8));
11019 TREE_SIDE_EFFECTS (t) = 1;
11020 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11023 if (TARGET_SSE && cfun->va_list_fpr_size)
11025 type = TREE_TYPE (fpr);
11026 t = build2 (MODIFY_EXPR, type, fpr,
11027 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11028 TREE_SIDE_EFFECTS (t) = 1;
11029 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
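  /* Worked example (illustration): for "void f (int a, double b, ...)" we
     have n_gpr == 1 and n_fpr == 1, so the code above initializes
     gp_offset = 1 * 8 = 8 and fp_offset = 1 * 16 + 8 * 6 = 64, assuming
     the usual X86_64_REGPARM_MAX of 6.  */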
11032 /* Find the overflow area. */
11033 type = TREE_TYPE (ovf);
11034 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11035 ovf_rtx = crtl->args.internal_arg_pointer;
11036 else
11037 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11038 t = make_tree (type, ovf_rtx);
11039 if (words != 0)
11040 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11042 /* Store zero bounds for overflow area pointer. */
11043 if (chkp_function_instrumented_p (current_function_decl))
11044 chkp_expand_bounds_reset_for_mem (ovf, t);
11046 t = build2 (MODIFY_EXPR, type, ovf, t);
11047 TREE_SIDE_EFFECTS (t) = 1;
11048 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11050 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11052 /* Find the register save area.
11053 The function prologue saves it right above the stack frame. */
11054 type = TREE_TYPE (sav);
11055 t = make_tree (type, frame_pointer_rtx);
11056 if (!ix86_varargs_gpr_size)
11057 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11059 /* Store zero bounds for save area pointer. */
11060 if (chkp_function_instrumented_p (current_function_decl))
11061 chkp_expand_bounds_reset_for_mem (sav, t);
11063 t = build2 (MODIFY_EXPR, type, sav, t);
11064 TREE_SIDE_EFFECTS (t) = 1;
11065 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11069 /* Implement va_arg. */
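/* Roughly, for a small integer TYPE the code below emits the equivalent of
   the psABI va_arg algorithm (sketch in C, not the actual gimple):

     if (ap->gp_offset >= 6 * 8)
       goto overflow;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto done;
   overflow:
     addr = ap->overflow_arg_area;            -- aligned first if needed
     ap->overflow_arg_area = addr + 8;
   done:
     result = *(TYPE *) addr;

   SSE-class types use fp_offset and 16-byte slots instead, and types that
   cannot be passed in registers go straight to the overflow area.  */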
11071 static tree
11072 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11073 gimple_seq *post_p)
11075 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11076 tree f_gpr, f_fpr, f_ovf, f_sav;
11077 tree gpr, fpr, ovf, sav, t;
11078 int size, rsize;
11079 tree lab_false, lab_over = NULL_TREE;
11080 tree addr, t2;
11081 rtx container;
11082 int indirect_p = 0;
11083 tree ptrtype;
11084 machine_mode nat_mode;
11085 unsigned int arg_boundary;
11087 /* Only the 64-bit target needs something special. */
11088 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11089 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11091 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11092 f_fpr = DECL_CHAIN (f_gpr);
11093 f_ovf = DECL_CHAIN (f_fpr);
11094 f_sav = DECL_CHAIN (f_ovf);
11096 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11097 valist, f_gpr, NULL_TREE);
11099 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11100 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11101 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11103 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11104 if (indirect_p)
11105 type = build_pointer_type (type);
11106 size = int_size_in_bytes (type);
11107 rsize = CEIL (size, UNITS_PER_WORD);
11109 nat_mode = type_natural_mode (type, NULL, false);
11110 switch (nat_mode)
11112 case V8SFmode:
11113 case V8SImode:
11114 case V32QImode:
11115 case V16HImode:
11116 case V4DFmode:
11117 case V4DImode:
11118 case V16SFmode:
11119 case V16SImode:
11120 case V64QImode:
11121 case V32HImode:
11122 case V8DFmode:
11123 case V8DImode:
11124 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
11125 if (!TARGET_64BIT_MS_ABI)
11127 container = NULL;
11128 break;
11130 /* FALLTHRU */
11132 default:
11133 container = construct_container (nat_mode, TYPE_MODE (type),
11134 type, 0, X86_64_REGPARM_MAX,
11135 X86_64_SSE_REGPARM_MAX, intreg,
11137 break;
11140 /* Pull the value out of the saved registers. */
11142 addr = create_tmp_var (ptr_type_node, "addr");
11144 if (container)
11146 int needed_intregs, needed_sseregs;
11147 bool need_temp;
11148 tree int_addr, sse_addr;
11150 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11151 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11153 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11155 need_temp = (!REG_P (container)
11156 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11157 || TYPE_ALIGN (type) > 128));
11159 /* In case we are passing a structure, verify that it forms a consecutive
11160 block on the register save area. If not, we need to do moves. */
11161 if (!need_temp && !REG_P (container))
11163 /* Verify that all registers are strictly consecutive. */
11164 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11166 int i;
11168 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11170 rtx slot = XVECEXP (container, 0, i);
11171 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11172 || INTVAL (XEXP (slot, 1)) != i * 16)
11173 need_temp = true;
11176 else
11178 int i;
11180 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11182 rtx slot = XVECEXP (container, 0, i);
11183 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11184 || INTVAL (XEXP (slot, 1)) != i * 8)
11185 need_temp = true;
11189 if (!need_temp)
11191 int_addr = addr;
11192 sse_addr = addr;
11194 else
11196 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11197 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11200 /* First ensure that we fit completely in registers. */
11201 if (needed_intregs)
11203 t = build_int_cst (TREE_TYPE (gpr),
11204 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11205 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11206 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11207 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11208 gimplify_and_add (t, pre_p);
11210 if (needed_sseregs)
11212 t = build_int_cst (TREE_TYPE (fpr),
11213 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11214 + X86_64_REGPARM_MAX * 8);
11215 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11216 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11217 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11218 gimplify_and_add (t, pre_p);
11221 /* Compute index to start of area used for integer regs. */
11222 if (needed_intregs)
11224 /* int_addr = gpr + sav; */
11225 t = fold_build_pointer_plus (sav, gpr);
11226 gimplify_assign (int_addr, t, pre_p);
11228 if (needed_sseregs)
11230 /* sse_addr = fpr + sav; */
11231 t = fold_build_pointer_plus (sav, fpr);
11232 gimplify_assign (sse_addr, t, pre_p);
11234 if (need_temp)
11236 int i, prev_size = 0;
11237 tree temp = create_tmp_var (type, "va_arg_tmp");
11239 /* addr = &temp; */
11240 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11241 gimplify_assign (addr, t, pre_p);
11243 for (i = 0; i < XVECLEN (container, 0); i++)
11245 rtx slot = XVECEXP (container, 0, i);
11246 rtx reg = XEXP (slot, 0);
11247 machine_mode mode = GET_MODE (reg);
11248 tree piece_type;
11249 tree addr_type;
11250 tree daddr_type;
11251 tree src_addr, src;
11252 int src_offset;
11253 tree dest_addr, dest;
11254 int cur_size = GET_MODE_SIZE (mode);
11256 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11257 prev_size = INTVAL (XEXP (slot, 1));
11258 if (prev_size + cur_size > size)
11260 cur_size = size - prev_size;
11261 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11262 if (mode == BLKmode)
11263 mode = QImode;
11265 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11266 if (mode == GET_MODE (reg))
11267 addr_type = build_pointer_type (piece_type);
11268 else
11269 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11270 true);
11271 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11272 true);
11274 if (SSE_REGNO_P (REGNO (reg)))
11276 src_addr = sse_addr;
11277 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11279 else
11281 src_addr = int_addr;
11282 src_offset = REGNO (reg) * 8;
11284 src_addr = fold_convert (addr_type, src_addr);
11285 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11287 dest_addr = fold_convert (daddr_type, addr);
11288 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11289 if (cur_size == GET_MODE_SIZE (mode))
11291 src = build_va_arg_indirect_ref (src_addr);
11292 dest = build_va_arg_indirect_ref (dest_addr);
11294 gimplify_assign (dest, src, pre_p);
11296 else
11298 tree copy
11299 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11300 3, dest_addr, src_addr,
11301 size_int (cur_size));
11302 gimplify_and_add (copy, pre_p);
11304 prev_size += cur_size;
11308 if (needed_intregs)
11310 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11311 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11312 gimplify_assign (gpr, t, pre_p);
11315 if (needed_sseregs)
11317 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11318 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11319 gimplify_assign (unshare_expr (fpr), t, pre_p);
11322 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11324 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11327 /* ... otherwise out of the overflow area. */
11329 /* When we align a parameter on the stack for the caller, if the parameter
11330 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11331 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee
11332 here with the caller. */
11333 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11334 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11335 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11337 /* Care for on-stack alignment if needed. */
11338 if (arg_boundary <= 64 || size == 0)
11339 t = ovf;
11340 else
11342 HOST_WIDE_INT align = arg_boundary / 8;
11343 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11344 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11345 build_int_cst (TREE_TYPE (t), -align));
11348 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11349 gimplify_assign (addr, t, pre_p);
11351 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11352 gimplify_assign (unshare_expr (ovf), t, pre_p);
11354 if (container)
11355 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11357 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11358 addr = fold_convert (ptrtype, addr);
11360 if (indirect_p)
11361 addr = build_va_arg_indirect_ref (addr);
11362 return build_va_arg_indirect_ref (addr);
11365 /* Return true if OPNUM's MEM should be matched
11366 in movabs* patterns. */
11368 bool
11369 ix86_check_movabs (rtx insn, int opnum)
11371 rtx set, mem;
11373 set = PATTERN (insn);
11374 if (GET_CODE (set) == PARALLEL)
11375 set = XVECEXP (set, 0, 0);
11376 gcc_assert (GET_CODE (set) == SET);
11377 mem = XEXP (set, opnum);
11378 while (SUBREG_P (mem))
11379 mem = SUBREG_REG (mem);
11380 gcc_assert (MEM_P (mem));
11381 return volatile_ok || !MEM_VOLATILE_P (mem);
11384 /* Return false if INSN contains a MEM with a non-default address space. */
11385 bool
11386 ix86_check_no_addr_space (rtx insn)
11388 subrtx_var_iterator::array_type array;
11389 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11391 rtx x = *iter;
11392 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11393 return false;
11395 return true;
11398 /* Initialize the table of extra 80387 mathematical constants. */
11400 static void
11401 init_ext_80387_constants (void)
11403 static const char * cst[5] =
11405 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11406 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11407 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11408 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11409 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11411 int i;
11413 for (i = 0; i < 5; i++)
11415 real_from_string (&ext_80387_constants_table[i], cst[i]);
11416 /* Ensure each constant is rounded to XFmode precision. */
11417 real_convert (&ext_80387_constants_table[i],
11418 XFmode, &ext_80387_constants_table[i]);
11421 ext_80387_constants_init = 1;
11424 /* Return non-zero if the constant is something that
11425 can be loaded with a special instruction. */
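/* Summary of the return values, derived from the code below (0 means no
   special instruction applies, -1 means not an 80387 constant at all):

     1  fldz    0.0           6  fldl2t  log2(10)
     2  fld1    1.0           7  fldpi   pi
     3  fldlg2  log10(2)      8  -0.0    (split as fldz; fchs)
     4  fldln2  ln(2)         9  -1.0    (split as fld1; fchs)
     5  fldl2e  log2(e)  */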
11428 standard_80387_constant_p (rtx x)
11430 machine_mode mode = GET_MODE (x);
11432 const REAL_VALUE_TYPE *r;
11434 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11435 return -1;
11437 if (x == CONST0_RTX (mode))
11438 return 1;
11439 if (x == CONST1_RTX (mode))
11440 return 2;
11442 r = CONST_DOUBLE_REAL_VALUE (x);
11444 /* For XFmode constants, try to find a special 80387 instruction when
11445 optimizing for size or on those CPUs that benefit from them. */
11446 if (mode == XFmode
11447 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11449 int i;
11451 if (! ext_80387_constants_init)
11452 init_ext_80387_constants ();
11454 for (i = 0; i < 5; i++)
11455 if (real_identical (r, &ext_80387_constants_table[i]))
11456 return i + 3;
11459 /* Load of the constant -0.0 or -1.0 will be split as
11460 fldz;fchs or fld1;fchs sequence. */
11461 if (real_isnegzero (r))
11462 return 8;
11463 if (real_identical (r, &dconstm1))
11464 return 9;
11466 return 0;
11469 /* Return the opcode of the special instruction to be used to load
11470 the constant X. */
11472 const char *
11473 standard_80387_constant_opcode (rtx x)
11475 switch (standard_80387_constant_p (x))
11477 case 1:
11478 return "fldz";
11479 case 2:
11480 return "fld1";
11481 case 3:
11482 return "fldlg2";
11483 case 4:
11484 return "fldln2";
11485 case 5:
11486 return "fldl2e";
11487 case 6:
11488 return "fldl2t";
11489 case 7:
11490 return "fldpi";
11491 case 8:
11492 case 9:
11493 return "#";
11494 default:
11495 gcc_unreachable ();
11499 /* Return the CONST_DOUBLE representing the 80387 constant that is
11500 loaded by the specified special instruction. The argument IDX
11501 matches the return value from standard_80387_constant_p. */
11504 standard_80387_constant_rtx (int idx)
11506 int i;
11508 if (! ext_80387_constants_init)
11509 init_ext_80387_constants ();
11511 switch (idx)
11513 case 3:
11514 case 4:
11515 case 5:
11516 case 6:
11517 case 7:
11518 i = idx - 3;
11519 break;
11521 default:
11522 gcc_unreachable ();
11525 return const_double_from_real_value (ext_80387_constants_table[i],
11526 XFmode);
11529 /* Return 1 if X is all zero bits and 2 if X is all one bits in a
11530 supported SSE/AVX vector mode; return 0 otherwise. */
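/* These two constants are "standard" because they can be materialized
   without a memory load: all-zeros with a register self-XOR
   (xorps/pxor/vpxord) and all-ones with a register self-compare (pcmpeqd,
   or vpternlogd with immediate 0xFF for 512-bit registers), as emitted by
   standard_sse_constant_opcode below.  */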
11533 standard_sse_constant_p (rtx x, machine_mode pred_mode)
11535 machine_mode mode;
11537 if (!TARGET_SSE)
11538 return 0;
11540 mode = GET_MODE (x);
11542 if (x == const0_rtx || const0_operand (x, mode))
11543 return 1;
11545 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11547 /* VOIDmode integer constant, get mode from the predicate. */
11548 if (mode == VOIDmode)
11549 mode = pred_mode;
11551 switch (GET_MODE_SIZE (mode))
11553 case 64:
11554 if (TARGET_AVX512F)
11555 return 2;
11556 break;
11557 case 32:
11558 if (TARGET_AVX2)
11559 return 2;
11560 break;
11561 case 16:
11562 if (TARGET_SSE2)
11563 return 2;
11564 break;
11565 case 0:
11566 /* VOIDmode */
11567 gcc_unreachable ();
11568 default:
11569 break;
11573 return 0;
11576 /* Return the opcode of the special instruction to be used to load
11577 the constant X. */
11579 const char *
11580 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
11582 machine_mode mode;
11584 gcc_assert (TARGET_SSE);
11586 mode = GET_MODE (x);
11588 if (x == const0_rtx || const0_operand (x, mode))
11590 switch (get_attr_mode (insn))
11592 case MODE_XI:
11593 return "vpxord\t%g0, %g0, %g0";
11594 case MODE_OI:
11595 return (TARGET_AVX512VL
11596 ? "vpxord\t%x0, %x0, %x0"
11597 : "vpxor\t%x0, %x0, %x0");
11598 case MODE_TI:
11599 return (TARGET_AVX512VL
11600 ? "vpxord\t%t0, %t0, %t0"
11601 : "%vpxor\t%0, %d0");
11603 case MODE_V8DF:
11604 return (TARGET_AVX512DQ
11605 ? "vxorpd\t%g0, %g0, %g0"
11606 : "vpxorq\t%g0, %g0, %g0");
11607 case MODE_V4DF:
11608 return "vxorpd\t%x0, %x0, %x0";
11609 case MODE_V2DF:
11610 return "%vxorpd\t%0, %d0";
11612 case MODE_V16SF:
11613 return (TARGET_AVX512DQ
11614 ? "vxorps\t%g0, %g0, %g0"
11615 : "vpxord\t%g0, %g0, %g0");
11616 case MODE_V8SF:
11617 return "vxorps\t%x0, %x0, %x0";
11618 case MODE_V4SF:
11619 return "%vxorps\t%0, %d0";
11621 default:
11622 gcc_unreachable ();
11625 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11627 enum attr_mode insn_mode = get_attr_mode (insn);
11629 switch (insn_mode)
11631 case MODE_XI:
11632 case MODE_V8DF:
11633 case MODE_V16SF:
11634 gcc_assert (TARGET_AVX512F);
11635 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
11637 case MODE_OI:
11638 case MODE_V4DF:
11639 case MODE_V8SF:
11640 gcc_assert (TARGET_AVX2);
11641 /* FALLTHRU */
11642 case MODE_TI:
11643 case MODE_V2DF:
11644 case MODE_V4SF:
11645 gcc_assert (TARGET_SSE2);
11646 return (TARGET_AVX
11647 ? "vpcmpeqd\t%0, %0, %0"
11648 : "pcmpeqd\t%0, %0");
11650 default:
11651 gcc_unreachable ();
11655 gcc_unreachable ();
11658 /* Returns true if INSN can be transformed from a memory load
11659 to a supported FP constant load. */
11661 bool
11662 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
11664 rtx src = find_constant_src (insn);
11666 gcc_assert (REG_P (dst));
11668 if (src == NULL
11669 || (SSE_REGNO_P (REGNO (dst))
11670 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
11671 || (STACK_REGNO_P (REGNO (dst))
11672 && standard_80387_constant_p (src) < 1))
11673 return false;
11675 return true;
11678 /* Returns true if OP contains a symbol reference. */
11680 bool
11681 symbolic_reference_mentioned_p (rtx op)
11683 const char *fmt;
11684 int i;
11686 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
11687 return true;
11689 fmt = GET_RTX_FORMAT (GET_CODE (op));
11690 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
11692 if (fmt[i] == 'E')
11694 int j;
11696 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
11697 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
11698 return true;
11701 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
11702 return true;
11705 return false;
11708 /* Return true if it is appropriate to emit `ret' instructions in the
11709 body of a function. Do this only if the epilogue is simple, needing a
11710 couple of insns. Prior to reloading, we can't tell how many registers
11711 must be saved, so return false then. Return false if there is no frame
11712 marker to de-allocate. */
11714 bool
11715 ix86_can_use_return_insn_p (void)
11717 struct ix86_frame frame;
11719 /* Don't use `ret' instruction in interrupt handler. */
11720 if (! reload_completed
11721 || frame_pointer_needed
11722 || cfun->machine->func_type != TYPE_NORMAL)
11723 return 0;
11725 /* Don't allow more than 32k pop, since that's all we can do
11726 with one instruction. */
11727 if (crtl->args.pops_args && crtl->args.size >= 32768)
11728 return 0;
11730 ix86_compute_frame_layout (&frame);
11731 return (frame.stack_pointer_offset == UNITS_PER_WORD
11732 && (frame.nregs + frame.nsseregs) == 0);
11735 /* Value should be nonzero if functions must have frame pointers.
11736 Zero means the frame pointer need not be set up (and parms may
11737 be accessed via the stack pointer) in functions that seem suitable. */
11739 static bool
11740 ix86_frame_pointer_required (void)
11742 /* If we accessed previous frames, then the generated code expects
11743 to be able to access the saved ebp value in our frame. */
11744 if (cfun->machine->accesses_prev_frame)
11745 return true;
11747 /* Several x86 OSes need a frame pointer for other reasons,
11748 usually pertaining to setjmp. */
11749 if (SUBTARGET_FRAME_POINTER_REQUIRED)
11750 return true;
11752 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
11753 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
11754 return true;
11756 /* For Win64 SEH, very large frames need a frame pointer, as the maximum
11757 stack allocation is 4GB. */
11758 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
11759 return true;
11761 /* SSE saves require frame-pointer when stack is misaligned. */
11762 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
11763 return true;
11765 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
11766 turns off the frame pointer by default. Turn it back on now if
11767 we've not got a leaf function. */
11768 if (TARGET_OMIT_LEAF_FRAME_POINTER
11769 && (!crtl->is_leaf
11770 || ix86_current_function_calls_tls_descriptor))
11771 return true;
11773 if (crtl->profile && !flag_fentry)
11774 return true;
11776 return false;
11779 /* Record that the current function accesses previous call frames. */
11781 void
11782 ix86_setup_frame_addresses (void)
11784 cfun->machine->accesses_prev_frame = 1;
11787 #ifndef USE_HIDDEN_LINKONCE
11788 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
11789 # define USE_HIDDEN_LINKONCE 1
11790 # else
11791 # define USE_HIDDEN_LINKONCE 0
11792 # endif
11793 #endif
11795 static int pic_labels_used;
11797 /* Fills in the label name that should be used for a pc thunk for
11798 the given register. */
11800 static void
11801 get_pc_thunk_name (char name[32], unsigned int regno)
11803 gcc_assert (!TARGET_64BIT);
11805 if (USE_HIDDEN_LINKONCE)
11806 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11807 else
11808 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
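/* For example, the thunk for %ebx is named "__x86.get_pc_thunk.bx"; as
   emitted by ix86_code_end below, its body is simply (sketch):

     __x86.get_pc_thunk.bx:
       movl (%esp), %ebx
       ret  */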
11812 /* This function generates code for -fpic that loads %ebx with
11813 the return address of the caller and then returns. */
11815 static void
11816 ix86_code_end (void)
11818 rtx xops[2];
11819 int regno;
11821 for (regno = AX_REG; regno <= SP_REG; regno++)
11823 char name[32];
11824 tree decl;
11826 if (!(pic_labels_used & (1 << regno)))
11827 continue;
11829 get_pc_thunk_name (name, regno);
11831 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11832 get_identifier (name),
11833 build_function_type_list (void_type_node, NULL_TREE));
11834 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11835 NULL_TREE, void_type_node);
11836 TREE_PUBLIC (decl) = 1;
11837 TREE_STATIC (decl) = 1;
11838 DECL_IGNORED_P (decl) = 1;
11840 #if TARGET_MACHO
11841 if (TARGET_MACHO)
11843 switch_to_section (darwin_sections[text_coal_section]);
11844 fputs ("\t.weak_definition\t", asm_out_file);
11845 assemble_name (asm_out_file, name);
11846 fputs ("\n\t.private_extern\t", asm_out_file);
11847 assemble_name (asm_out_file, name);
11848 putc ('\n', asm_out_file);
11849 ASM_OUTPUT_LABEL (asm_out_file, name);
11850 DECL_WEAK (decl) = 1;
11852 else
11853 #endif
11854 if (USE_HIDDEN_LINKONCE)
11856 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11858 targetm.asm_out.unique_section (decl, 0);
11859 switch_to_section (get_named_section (decl, NULL, 0));
11861 targetm.asm_out.globalize_label (asm_out_file, name);
11862 fputs ("\t.hidden\t", asm_out_file);
11863 assemble_name (asm_out_file, name);
11864 putc ('\n', asm_out_file);
11865 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11867 else
11869 switch_to_section (text_section);
11870 ASM_OUTPUT_LABEL (asm_out_file, name);
11873 DECL_INITIAL (decl) = make_node (BLOCK);
11874 current_function_decl = decl;
11875 allocate_struct_function (decl, false);
11876 init_function_start (decl);
11877 first_function_block_is_cold = false;
11878 /* Make sure unwind info is emitted for the thunk if needed. */
11879 final_start_function (emit_barrier (), asm_out_file, 1);
11881 /* Pad stack IP move with 4 instructions (two NOPs count
11882 as one instruction). */
11883 if (TARGET_PAD_SHORT_FUNCTION)
11885 int i = 8;
11887 while (i--)
11888 fputs ("\tnop\n", asm_out_file);
11891 xops[0] = gen_rtx_REG (Pmode, regno);
11892 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11893 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11894 output_asm_insn ("%!ret", NULL);
11895 final_end_function ();
11896 init_insn_lengths ();
11897 free_after_compilation (cfun);
11898 set_cfun (NULL);
11899 current_function_decl = NULL;
11902 if (flag_split_stack)
11903 file_end_indicate_split_stack ();
11906 /* Emit code for the SET_GOT patterns. */
11908 const char *
11909 output_set_got (rtx dest, rtx label)
11911 rtx xops[3];
11913 xops[0] = dest;
11915 if (TARGET_VXWORKS_RTP && flag_pic)
11917 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11918 xops[2] = gen_rtx_MEM (Pmode,
11919 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11920 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11922 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11923 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11924 an unadorned address. */
11925 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11926 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11927 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11928 return "";
11931 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11933 if (flag_pic)
11935 char name[32];
11936 get_pc_thunk_name (name, REGNO (dest));
11937 pic_labels_used |= 1 << REGNO (dest);
11939 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11940 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11941 output_asm_insn ("%!call\t%X2", xops);
11943 #if TARGET_MACHO
11944 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11945 This is what will be referenced by the Mach-O PIC subsystem. */
11946 if (machopic_should_output_picbase_label () || !label)
11947 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11949 /* When we are restoring the pic base at the site of a nonlocal label,
11950 and we decided to emit the pic base above, we will still output a
11951 local label used for calculating the correction offset (even though
11952 the offset will be 0 in that case). */
11953 if (label)
11954 targetm.asm_out.internal_label (asm_out_file, "L",
11955 CODE_LABEL_NUMBER (label));
11956 #endif
11958 else
11960 if (TARGET_MACHO)
11961 /* We don't need a pic base, we're not producing pic. */
11962 gcc_unreachable ();
11964 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11965 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11966 targetm.asm_out.internal_label (asm_out_file, "L",
11967 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11970 if (!TARGET_MACHO)
11971 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11973 return "";
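/* For the common non-VxWorks, non-Mach-O PIC case this emits the classic
   ia32 GOT setup sequence (sketch, AT&T syntax, with %ebx as the PIC
   register):

     call  __x86.get_pc_thunk.bx
     addl  $_GLOBAL_OFFSET_TABLE_, %ebx  */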
11976 /* Generate a "push" pattern for input ARG. */
11978 static rtx
11979 gen_push (rtx arg)
11981 struct machine_function *m = cfun->machine;
11983 if (m->fs.cfa_reg == stack_pointer_rtx)
11984 m->fs.cfa_offset += UNITS_PER_WORD;
11985 m->fs.sp_offset += UNITS_PER_WORD;
11987 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11988 arg = gen_rtx_REG (word_mode, REGNO (arg));
11990 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11991 gen_rtx_PRE_DEC (Pmode,
11992 stack_pointer_rtx)),
11993 arg);
11996 /* Generate a "pop" pattern for input ARG. */
11998 static rtx
11999 gen_pop (rtx arg)
12001 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12002 arg = gen_rtx_REG (word_mode, REGNO (arg));
12004 return gen_rtx_SET (arg,
12005 gen_rtx_MEM (word_mode,
12006 gen_rtx_POST_INC (Pmode,
12007 stack_pointer_rtx)));
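/* In RTL terms the patterns produced by gen_push and gen_pop are
   (64-bit sketch, where both word_mode and Pmode are DImode):

     push:  (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI src))
     pop:   (set (reg:DI dst) (mem:DI (post_inc:DI (reg:DI sp))))  */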
12010 /* Return >= 0 if there is an unused call-clobbered register available
12011 for the entire function. */
12013 static unsigned int
12014 ix86_select_alt_pic_regnum (void)
12016 if (ix86_use_pseudo_pic_reg ())
12017 return INVALID_REGNUM;
12019 if (crtl->is_leaf
12020 && !crtl->profile
12021 && !ix86_current_function_calls_tls_descriptor)
12023 int i, drap;
12024 /* Can't use the same register for both PIC and DRAP. */
12025 if (crtl->drap_reg)
12026 drap = REGNO (crtl->drap_reg);
12027 else
12028 drap = -1;
12029 for (i = 2; i >= 0; --i)
12030 if (i != drap && !df_regs_ever_live_p (i))
12031 return i;
12034 return INVALID_REGNUM;
12037 /* Return true if REGNO is used by the epilogue. */
12039 bool
12040 ix86_epilogue_uses (int regno)
12042 /* If there are no caller-saved registers, we preserve all registers,
12043 except for MMX and x87 registers which aren't supported when saving
12044 and restoring registers. Don't explicitly save SP register since
12045 it is always preserved. */
12046 return (epilogue_completed
12047 && cfun->machine->no_caller_saved_registers
12048 && !fixed_regs[regno]
12049 && !STACK_REGNO_P (regno)
12050 && !MMX_REGNO_P (regno));
12053 /* Return nonzero if register REGNO can be used as a scratch register
12054 in peephole2. */
12056 static bool
12057 ix86_hard_regno_scratch_ok (unsigned int regno)
12059 /* If there are no caller-saved registers, we can't use any register
12060 as a scratch register after epilogue and use REGNO as scratch
12061 register only if it has been used before to avoid saving and
12062 restoring it. */
12063 return (!cfun->machine->no_caller_saved_registers
12064 || (!epilogue_completed
12065 && df_regs_ever_live_p (regno)));
12068 /* Return TRUE if we need to save REGNO. */
12070 static bool
12071 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
12073 /* If there are no caller-saved registers, we preserve all registers,
12074 except for MMX and x87 registers which aren't supported when saving
12075 and restoring registers. Don't explicitly save SP register since
12076 it is always preserved. */
12077 if (cfun->machine->no_caller_saved_registers)
12079 /* Don't preserve registers used for function return value. */
12080 rtx reg = crtl->return_rtx;
12081 if (reg)
12083 unsigned int i = REGNO (reg);
12084 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12085 while (nregs-- > 0)
12086 if ((i + nregs) == regno)
12087 return false;
12089 reg = crtl->return_bnd;
12090 if (reg)
12092 i = REGNO (reg);
12093 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12094 while (nregs-- > 0)
12095 if ((i + nregs) == regno)
12096 return false;
12100 return (df_regs_ever_live_p (regno)
12101 && !fixed_regs[regno]
12102 && !STACK_REGNO_P (regno)
12103 && !MMX_REGNO_P (regno)
12104 && (regno != HARD_FRAME_POINTER_REGNUM
12105 || !frame_pointer_needed));
12108 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12109 && pic_offset_table_rtx)
12111 if (ix86_use_pseudo_pic_reg ())
12113 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12114 _mcount in prologue. */
12115 if (!TARGET_64BIT && flag_pic && crtl->profile)
12116 return true;
12118 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12119 || crtl->profile
12120 || crtl->calls_eh_return
12121 || crtl->uses_const_pool
12122 || cfun->has_nonlocal_label)
12123 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12126 if (crtl->calls_eh_return && maybe_eh_return)
12128 unsigned i;
12129 for (i = 0; ; i++)
12131 unsigned test = EH_RETURN_DATA_REGNO (i);
12132 if (test == INVALID_REGNUM)
12133 break;
12134 if (test == regno)
12135 return true;
12139 if (crtl->drap_reg
12140 && regno == REGNO (crtl->drap_reg)
12141 && !cfun->machine->no_drap_save_restore)
12142 return true;
12144 return (df_regs_ever_live_p (regno)
12145 && !call_used_regs[regno]
12146 && !fixed_regs[regno]
12147 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12150 /* Return the number of saved general purpose registers. */
12152 static int
12153 ix86_nsaved_regs (void)
12155 int nregs = 0;
12156 int regno;
12158 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12159 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12160 nregs ++;
12161 return nregs;
12164 /* Return number of saved SSE registers. */
12166 static int
12167 ix86_nsaved_sseregs (void)
12169 int nregs = 0;
12170 int regno;
12172 if (!TARGET_64BIT_MS_ABI)
12173 return 0;
12174 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12175 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12176 nregs ++;
12177 return nregs;
12180 /* Given FROM and TO register numbers, say whether this elimination is
12181 allowed. If stack alignment is needed, we can only replace argument
12182 pointer with hard frame pointer, or replace frame pointer with stack
12183 pointer. Otherwise, frame pointer elimination is automatically
12184 handled and all other eliminations are valid. */
12186 static bool
12187 ix86_can_eliminate (const int from, const int to)
12189 if (stack_realign_fp)
12190 return ((from == ARG_POINTER_REGNUM
12191 && to == HARD_FRAME_POINTER_REGNUM)
12192 || (from == FRAME_POINTER_REGNUM
12193 && to == STACK_POINTER_REGNUM));
12194 else
12195 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12198 /* Return the offset between two registers, one to be eliminated, and the other
12199 its replacement, at the start of a routine. */
12201 HOST_WIDE_INT
12202 ix86_initial_elimination_offset (int from, int to)
12204 struct ix86_frame frame;
12205 ix86_compute_frame_layout (&frame);
12207 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12208 return frame.hard_frame_pointer_offset;
12209 else if (from == FRAME_POINTER_REGNUM
12210 && to == HARD_FRAME_POINTER_REGNUM)
12211 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12212 else
12214 gcc_assert (to == STACK_POINTER_REGNUM);
12216 if (from == ARG_POINTER_REGNUM)
12217 return frame.stack_pointer_offset;
12219 gcc_assert (from == FRAME_POINTER_REGNUM);
12220 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12224 /* In a dynamically-aligned function, we can't know the offset from
12225 stack pointer to frame pointer, so we must ensure that setjmp
12226 eliminates fp against the hard fp (%ebp) rather than trying to
12227 index from %esp up to the top of the frame across a gap that is
12228 of unknown (at compile-time) size. */
12229 static rtx
12230 ix86_builtin_setjmp_frame_value (void)
12232 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12235 /* When using -fsplit-stack, the allocation routines set a field in
12236 the TCB to the bottom of the stack plus this much space, measured
12237 in bytes. */
12239 #define SPLIT_STACK_AVAILABLE 256
12241 /* Fill structure ix86_frame about frame of currently computed function. */
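/* The offsets computed below follow this ordering, from the CFA downwards
   (sketch; each area may be empty or padded for alignment):

     return address, optional static chain, saved frame pointer
                                          -> hard_frame_pointer_offset
     GP register save area                -> reg_save_offset
     SSE register save area (Win64 only)  -> sse_reg_save_offset
     va_arg register save area
                                          -> frame_pointer_offset
     local variables
     outgoing argument area               -> stack_pointer_offset  */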
12243 static void
12244 ix86_compute_frame_layout (struct ix86_frame *frame)
12246 unsigned HOST_WIDE_INT stack_alignment_needed;
12247 HOST_WIDE_INT offset;
12248 unsigned HOST_WIDE_INT preferred_alignment;
12249 HOST_WIDE_INT size = get_frame_size ();
12250 HOST_WIDE_INT to_allocate;
12252 frame->nregs = ix86_nsaved_regs ();
12253 frame->nsseregs = ix86_nsaved_sseregs ();
12255 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12256 except for function prologues, leaf functions and when the default
12257 incoming stack boundary is overridden at the command line or via the
12258 force_align_arg_pointer attribute. */
12259 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12260 && (!crtl->is_leaf || cfun->calls_alloca != 0
12261 || ix86_current_function_calls_tls_descriptor
12262 || ix86_incoming_stack_boundary < 128))
12264 crtl->preferred_stack_boundary = 128;
12265 crtl->stack_alignment_needed = 128;
12268 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12269 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12271 gcc_assert (!size || stack_alignment_needed);
12272 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12273 gcc_assert (preferred_alignment <= stack_alignment_needed);
12275 /* For SEH we have to limit the amount of code movement into the prologue.
12276 At present we do this via a BLOCKAGE, at which point there's very little
12277 scheduling that can be done, which means that there's very little point
12278 in doing anything except PUSHs. */
12279 if (TARGET_SEH)
12280 cfun->machine->use_fast_prologue_epilogue = false;
12282 /* During reload iteration the number of registers saved can change.
12283 Recompute the value as needed. Do not recompute when the number of registers
12284 didn't change, as reload makes multiple calls to the function and does not
12285 expect the decision to change within a single iteration. */
12286 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
12287 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
12289 int count = frame->nregs;
12290 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12292 cfun->machine->use_fast_prologue_epilogue_nregs = count;
12294 /* The fast prologue uses move instead of push to save registers. This
12295 is significantly longer, but also executes faster as modern hardware
12296 can execute the moves in parallel, but can't do that for push/pop.
12298 Be careful about choosing which prologue to emit: when the function takes
12299 many instructions to execute, we may as well use the slow version, and
12300 likewise when the function is known to be outside a hot spot (this is
12301 known with feedback only). Weight the size of the function by the number
12302 of registers to save, as it is cheap to use one or two push instructions
12303 but very slow to use many of them. */
12304 if (count)
12305 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12306 if (node->frequency < NODE_FREQUENCY_NORMAL
12307 || (flag_branch_probabilities
12308 && node->frequency < NODE_FREQUENCY_HOT))
12309 cfun->machine->use_fast_prologue_epilogue = false;
12310 else
12311 cfun->machine->use_fast_prologue_epilogue
12312 = !expensive_function_p (count);
12315 frame->save_regs_using_mov
12316 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
12317 /* If static stack checking is enabled and done with probes,
12318 the registers need to be saved before allocating the frame. */
12319 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12321 /* Skip return address. */
12322 offset = UNITS_PER_WORD;
12324 /* Skip pushed static chain. */
12325 if (ix86_static_chain_on_stack)
12326 offset += UNITS_PER_WORD;
12328 /* Skip saved base pointer. */
12329 if (frame_pointer_needed)
12330 offset += UNITS_PER_WORD;
12331 frame->hfp_save_offset = offset;
12333 /* The traditional frame pointer location is at the top of the frame. */
12334 frame->hard_frame_pointer_offset = offset;
12336 /* Register save area */
12337 offset += frame->nregs * UNITS_PER_WORD;
12338 frame->reg_save_offset = offset;
12340 /* On SEH target, registers are pushed just before the frame pointer
12341 location. */
12342 if (TARGET_SEH)
12343 frame->hard_frame_pointer_offset = offset;
12345 /* Align and set SSE register save area. */
12346 if (frame->nsseregs)
12348 /* The only ABI that has saved SSE registers (Win64) also has a
12349 16-byte aligned default stack, and thus we don't need to be
12350 within the re-aligned local stack frame to save them. In case the
12351 incoming stack boundary is aligned to less than 16 bytes,
12352 unaligned moves of SSE registers will be emitted, so there is
12353 no point in rounding up the SSE register save area outside the
12354 re-aligned local stack frame to 16 bytes. */
12355 if (ix86_incoming_stack_boundary >= 128)
12356 offset = ROUND_UP (offset, 16);
12357 offset += frame->nsseregs * 16;
12359 frame->sse_reg_save_offset = offset;
12361 /* The re-aligned stack starts here. Values before this point are not
12362 directly comparable with values below this point. In order to make
12363 sure that no value happens to be the same before and after, force
12364 the alignment computation below to add a non-zero value. */
12365 if (stack_realign_fp)
12366 offset = ROUND_UP (offset, stack_alignment_needed);
12368 /* Va-arg area */
12369 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12370 offset += frame->va_arg_size;
12372 /* Align start of frame for local function. */
12373 if (stack_realign_fp
12374 || offset != frame->sse_reg_save_offset
12375 || size != 0
12376 || !crtl->is_leaf
12377 || cfun->calls_alloca
12378 || ix86_current_function_calls_tls_descriptor)
12379 offset = ROUND_UP (offset, stack_alignment_needed);
12381 /* Frame pointer points here. */
12382 frame->frame_pointer_offset = offset;
12384 offset += size;
12386 /* Add outgoing arguments area. Can be skipped if we eliminated
12387 all the function calls as dead code.
12388 Skipping is however impossible when the function calls alloca. The alloca
12389 expander assumes that the last crtl->outgoing_args_size bytes
12390 of the stack frame are unused. */
12391 if (ACCUMULATE_OUTGOING_ARGS
12392 && (!crtl->is_leaf || cfun->calls_alloca
12393 || ix86_current_function_calls_tls_descriptor))
12395 offset += crtl->outgoing_args_size;
12396 frame->outgoing_arguments_size = crtl->outgoing_args_size;
12398 else
12399 frame->outgoing_arguments_size = 0;
12401 /* Align stack boundary. Only needed if we're calling another function
12402 or using alloca. */
12403 if (!crtl->is_leaf || cfun->calls_alloca
12404 || ix86_current_function_calls_tls_descriptor)
12405 offset = ROUND_UP (offset, preferred_alignment);
12407 /* We've reached end of stack frame. */
12408 frame->stack_pointer_offset = offset;
12410 /* Size prologue needs to allocate. */
12411 to_allocate = offset - frame->sse_reg_save_offset;
12413 if ((!to_allocate && frame->nregs <= 1)
12414 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
12415 frame->save_regs_using_mov = false;
12417 if (ix86_using_red_zone ()
12418 && crtl->sp_is_unchanging
12419 && crtl->is_leaf
12420 && !ix86_pc_thunk_call_expanded
12421 && !ix86_current_function_calls_tls_descriptor)
12423 frame->red_zone_size = to_allocate;
12424 if (frame->save_regs_using_mov)
12425 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
12426 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
12427 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
12429 else
12430 frame->red_zone_size = 0;
12431 frame->stack_pointer_offset -= frame->red_zone_size;
12433 /* The SEH frame pointer location is near the bottom of the frame.
12434 This is enforced by the fact that the difference between the
12435 stack pointer and the frame pointer is limited to 240 bytes in
12436 the unwind data structure. */
12437 if (TARGET_SEH)
12439 HOST_WIDE_INT diff;
12441 /* If we can leave the frame pointer where it is, do so. Also, return
12442 the establisher frame for __builtin_frame_address (0). */
12443 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
12444 if (diff <= SEH_MAX_FRAME_SIZE
12445 && (diff > 240 || (diff & 15) != 0)
12446 && !crtl->accesses_prior_frames)
12448 /* Ideally we'd determine what portion of the local stack frame
12449 (within the constraint of the lowest 240) is most heavily used.
12450 But without that complication, simply bias the frame pointer
12451 by 128 bytes so as to maximize the amount of the local stack
12452 frame that is addressable with 8-bit offsets. */
12453 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
12458 /* This is semi-inlined memory_address_length, but simplified
12459 since we know that we're always dealing with reg+offset, and
12460 to avoid having to create and discard all that rtl. */
12462 static inline int
12463 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
12465 int len = 4;
12467 if (offset == 0)
12469 /* EBP and R13 cannot be encoded without an offset. */
12470 len = (regno == BP_REG || regno == R13_REG);
12472 else if (IN_RANGE (offset, -128, 127))
12473 len = 1;
12475 /* ESP and R12 must be encoded with a SIB byte. */
12476 if (regno == SP_REG || regno == R12_REG)
12477 len++;
12479 return len;
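/* Illustrative results of the length computation above, counting only the
   extra displacement and SIB bytes:
     (BP_REG,   0) -> 1   %ebp and %r13 always need at least a disp8
     (SP_REG,   0) -> 1   no displacement, but a SIB byte is required
     (BP_REG, -64) -> 1   fits in a disp8
     (SP_REG, -64) -> 2   disp8 plus the SIB byte
     (BP_REG, 512) -> 4   needs a full disp32
     (SP_REG, 512) -> 5   disp32 plus the SIB byte  */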
12482 /* Return an RTX that points to CFA_OFFSET within the stack frame.
12483 The valid base registers are taken from CFUN->MACHINE->FS. */
12485 static rtx
12486 choose_baseaddr (HOST_WIDE_INT cfa_offset)
12488 const struct machine_function *m = cfun->machine;
12489 rtx base_reg = NULL;
12490 HOST_WIDE_INT base_offset = 0;
12492 if (m->use_fast_prologue_epilogue)
12494 /* Choose the base register most likely to allow the most scheduling
12495 opportunities. Generally FP is valid throughout the function,
12496 while DRAP must be reloaded within the epilogue. But prefer either
12497 of them over the SP, which has a larger encoding. */
12499 if (m->fs.fp_valid)
12501 base_reg = hard_frame_pointer_rtx;
12502 base_offset = m->fs.fp_offset - cfa_offset;
12504 else if (m->fs.drap_valid)
12506 base_reg = crtl->drap_reg;
12507 base_offset = 0 - cfa_offset;
12509 else if (m->fs.sp_valid)
12511 base_reg = stack_pointer_rtx;
12512 base_offset = m->fs.sp_offset - cfa_offset;
12515 else
12517 HOST_WIDE_INT toffset;
12518 int len = 16, tlen;
12520 /* Choose the base register with the smallest address encoding.
12521 With a tie, choose FP > DRAP > SP. */
12522 if (m->fs.sp_valid)
12524 base_reg = stack_pointer_rtx;
12525 base_offset = m->fs.sp_offset - cfa_offset;
12526 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12528 if (m->fs.drap_valid)
12530 toffset = 0 - cfa_offset;
12531 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12532 if (tlen <= len)
12534 base_reg = crtl->drap_reg;
12535 base_offset = toffset;
12536 len = tlen;
12539 if (m->fs.fp_valid)
12541 toffset = m->fs.fp_offset - cfa_offset;
12542 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12543 if (tlen <= len)
12545 base_reg = hard_frame_pointer_rtx;
12546 base_offset = toffset;
12547 len = tlen;
12551 gcc_assert (base_reg != NULL);
12553 return plus_constant (Pmode, base_reg, base_offset);
12556 /* Emit code to save registers in the prologue. */
12558 static void
12559 ix86_emit_save_regs (void)
12561 unsigned int regno;
12562 rtx_insn *insn;
12564 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12565 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12567 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12568 RTX_FRAME_RELATED_P (insn) = 1;
12572 /* Emit a single register save at CFA - CFA_OFFSET. */
12574 static void
12575 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12576 HOST_WIDE_INT cfa_offset)
12578 struct machine_function *m = cfun->machine;
12579 rtx reg = gen_rtx_REG (mode, regno);
12580 rtx mem, addr, base, insn;
12581 unsigned int align;
12583 addr = choose_baseaddr (cfa_offset);
12584 mem = gen_frame_mem (mode, addr);
12586 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
12587 align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
12588 set_mem_align (mem, align);
12590 insn = emit_insn (gen_rtx_SET (mem, reg));
12591 RTX_FRAME_RELATED_P (insn) = 1;
12593 base = addr;
12594 if (GET_CODE (base) == PLUS)
12595 base = XEXP (base, 0);
12596 gcc_checking_assert (REG_P (base));
12598 /* When saving registers into a re-aligned local stack frame, avoid
12599 any tricky guessing by dwarf2out. */
12600 if (m->fs.realigned)
12602 gcc_checking_assert (stack_realign_drap);
12604 if (regno == REGNO (crtl->drap_reg))
12606 /* A bit of a hack. We force the DRAP register to be saved in
12607 the re-aligned stack frame, which provides us with a copy
12608 of the CFA that will last past the prologue. Install it. */
12609 gcc_checking_assert (cfun->machine->fs.fp_valid);
12610 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12611 cfun->machine->fs.fp_offset - cfa_offset);
12612 mem = gen_rtx_MEM (mode, addr);
12613 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12615 else
12617 /* The frame pointer is a stable reference within the
12618 aligned frame. Use it. */
12619 gcc_checking_assert (cfun->machine->fs.fp_valid);
12620 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12621 cfun->machine->fs.fp_offset - cfa_offset);
12622 mem = gen_rtx_MEM (mode, addr);
12623 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12627 /* The memory may not be relative to the current CFA register,
12628 which means that we may need to generate a new pattern for
12629 use by the unwind info. */
12630 else if (base != m->fs.cfa_reg)
12632 addr = plus_constant (Pmode, m->fs.cfa_reg,
12633 m->fs.cfa_offset - cfa_offset);
12634 mem = gen_rtx_MEM (mode, addr);
12635 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12639 /* Emit code to save registers using MOV insns.
12640 First register is stored at CFA - CFA_OFFSET. */
12641 static void
12642 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12644 unsigned int regno;
12646 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12647 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12649 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12650 cfa_offset -= UNITS_PER_WORD;
12654 /* Emit code to save SSE registers using MOV insns.
12655 First register is stored at CFA - CFA_OFFSET. */
12656 static void
12657 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12659 unsigned int regno;
12661 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12662 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12664 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12665 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12669 static GTY(()) rtx queued_cfa_restores;
12671 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
12672 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12673 Don't add the note if the previously saved value will be left untouched
12674 within the stack red-zone until return, as unwinders can find the same value
12675 in the register and on the stack. */
12677 static void
12678 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12680 if (!crtl->shrink_wrapped
12681 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12682 return;
12684 if (insn)
12686 add_reg_note (insn, REG_CFA_RESTORE, reg);
12687 RTX_FRAME_RELATED_P (insn) = 1;
12689 else
12690 queued_cfa_restores
12691 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12694 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12696 static void
12697 ix86_add_queued_cfa_restore_notes (rtx insn)
12699 rtx last;
12700 if (!queued_cfa_restores)
12701 return;
12702 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12704 XEXP (last, 1) = REG_NOTES (insn);
12705 REG_NOTES (insn) = queued_cfa_restores;
12706 queued_cfa_restores = NULL_RTX;
12707 RTX_FRAME_RELATED_P (insn) = 1;
12710 /* Expand prologue or epilogue stack adjustment.
12711 The pattern exists to put a dependency on all ebp-based memory accesses.
12712 STYLE should be negative if instructions should be marked as frame related,
12713 zero if the %r11 register is live and cannot be freely used, and positive
12714 otherwise. */
12716 static void
12717 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12718 int style, bool set_cfa)
12720 struct machine_function *m = cfun->machine;
12721 rtx insn;
12722 bool add_frame_related_expr = false;
12724 if (Pmode == SImode)
12725 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12726 else if (x86_64_immediate_operand (offset, DImode))
12727 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12728 else
12730 rtx tmp;
12731 /* r11 is used by indirect sibcall return as well, set before the
12732 epilogue and used after the epilogue. */
12733 if (style)
12734 tmp = gen_rtx_REG (DImode, R11_REG);
12735 else
12737 gcc_assert (src != hard_frame_pointer_rtx
12738 && dest != hard_frame_pointer_rtx);
12739 tmp = hard_frame_pointer_rtx;
12741 insn = emit_insn (gen_rtx_SET (tmp, offset));
12742 if (style < 0)
12743 add_frame_related_expr = true;
12745 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12748 insn = emit_insn (insn);
12749 if (style >= 0)
12750 ix86_add_queued_cfa_restore_notes (insn);
12752 if (set_cfa)
12754 rtx r;
12756 gcc_assert (m->fs.cfa_reg == src);
12757 m->fs.cfa_offset += INTVAL (offset);
12758 m->fs.cfa_reg = dest;
12760 r = gen_rtx_PLUS (Pmode, src, offset);
12761 r = gen_rtx_SET (dest, r);
12762 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12763 RTX_FRAME_RELATED_P (insn) = 1;
12765 else if (style < 0)
12767 RTX_FRAME_RELATED_P (insn) = 1;
12768 if (add_frame_related_expr)
12770 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12771 r = gen_rtx_SET (dest, r);
12772 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12776 if (dest == stack_pointer_rtx)
12778 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12779 bool valid = m->fs.sp_valid;
12781 if (src == hard_frame_pointer_rtx)
12783 valid = m->fs.fp_valid;
12784 ooffset = m->fs.fp_offset;
12786 else if (src == crtl->drap_reg)
12788 valid = m->fs.drap_valid;
12789 ooffset = 0;
12791 else
12793 /* Else there are two possibilities: SP itself, which we set
12794 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
12795 taken care of by hand along the eh_return path. */
12796 gcc_checking_assert (src == stack_pointer_rtx
12797 || offset == const0_rtx);
12800 m->fs.sp_offset = ooffset - INTVAL (offset);
12801 m->fs.sp_valid = valid;
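/* For instance, the prologue below allocates the local frame with
   pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
   GEN_INT (-allocate), -1, ...); the negative STYLE marks the adjustment
   as frame related, while epilogue callers pass a nonnegative STYLE so
   that any queued REG_CFA_RESTORE notes get attached to the adjustment
   insn.  */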
12805 /* Find an available register to be used as the dynamic realign argument
12806 pointer register. Such a register will be written in the prologue and
12807 used at the beginning of the body, so it must not be
12808 1. a parameter passing register.
12809 2. the GOT pointer.
12810 We reuse the static-chain register if it is available. Otherwise, we
12811 use DI for i386 and R13 for x86-64. We chose R13 since it has a
12812 shorter encoding.
12814 Return: the regno of the chosen register. */
12816 static unsigned int
12817 find_drap_reg (void)
12819 tree decl = cfun->decl;
12821 /* Always use callee-saved register if there are no caller-saved
12822 registers. */
12823 if (TARGET_64BIT)
12825 /* Use R13 for a nested function or a function that needs a static chain.
12826 Since a function with a tail call may use any caller-saved
12827 register in the epilogue, DRAP must not use a caller-saved
12828 register in that case. */
12829 if (DECL_STATIC_CHAIN (decl)
12830 || cfun->machine->no_caller_saved_registers
12831 || crtl->tail_call_emit)
12832 return R13_REG;
12834 return R10_REG;
12836 else
12838 /* Use DI for a nested function or a function that needs a static chain.
12839 Since a function with a tail call may use any caller-saved
12840 register in the epilogue, DRAP must not use a caller-saved
12841 register in that case. */
12842 if (DECL_STATIC_CHAIN (decl)
12843 || cfun->machine->no_caller_saved_registers
12844 || crtl->tail_call_emit)
12845 return DI_REG;
12847 /* Reuse static chain register if it isn't used for parameter
12848 passing. */
12849 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12851 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12852 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12853 return CX_REG;
12855 return DI_REG;
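/* Summary of the choices made above: in 64-bit mode the DRAP register is
   R13 when the function has a static chain, uses no_caller_saved_registers
   or emits a tail call, and R10 otherwise; in 32-bit mode it is DI in
   those same cases, CX when regparm <= 2 and the calling convention is
   neither fastcall nor thiscall, and DI as the fallback.  */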
12859 /* Handle a "force_align_arg_pointer" attribute. */
12861 static tree
12862 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12863 tree, int, bool *no_add_attrs)
12865 if (TREE_CODE (*node) != FUNCTION_TYPE
12866 && TREE_CODE (*node) != METHOD_TYPE
12867 && TREE_CODE (*node) != FIELD_DECL
12868 && TREE_CODE (*node) != TYPE_DECL)
12870 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12871 name);
12872 *no_add_attrs = true;
12875 return NULL_TREE;
12878 /* Return minimum incoming stack alignment. */
12880 static unsigned int
12881 ix86_minimum_incoming_stack_boundary (bool sibcall)
12883 unsigned int incoming_stack_boundary;
12885 /* Stack of interrupt handler is always aligned to MIN_STACK_BOUNDARY. */
12887 if (cfun->machine->func_type != TYPE_NORMAL)
12888 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12889 /* Prefer the one specified at command line. */
12890 else if (ix86_user_incoming_stack_boundary)
12891 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12892 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12893 if -mstackrealign is used, this isn't a sibcall check, and the
12894 estimated stack alignment is 128 bits. */
12895 else if (!sibcall
12896 && ix86_force_align_arg_pointer
12897 && crtl->stack_alignment_estimated == 128)
12898 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12899 else
12900 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12902 /* Incoming stack alignment can be changed on individual functions
12903 via force_align_arg_pointer attribute. We use the smallest
12904 incoming stack boundary. */
12905 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12906 && lookup_attribute (ix86_force_align_arg_pointer_string,
12907 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12908 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12910 /* The incoming stack frame has to be aligned at least at
12911 parm_stack_boundary. */
12912 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12913 incoming_stack_boundary = crtl->parm_stack_boundary;
12915 /* The stack at the entry of main is aligned by the runtime. We use the
12916 smallest incoming stack boundary. */
12917 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12918 && DECL_NAME (current_function_decl)
12919 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12920 && DECL_FILE_SCOPE_P (current_function_decl))
12921 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12923 return incoming_stack_boundary;
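/* In short, the boundary chosen above is MIN_STACK_BOUNDARY for interrupt
   handlers, the user-specified value when one was given, MIN_STACK_BOUNDARY
   again for the -mstackrealign case described above, and the default
   otherwise; it is then lowered for the force_align_arg_pointer attribute,
   raised to crtl->parm_stack_boundary if needed, and capped at
   MAIN_STACK_BOUNDARY for main itself.  */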
12926 /* Update incoming stack boundary and estimated stack alignment. */
12928 static void
12929 ix86_update_stack_boundary (void)
12931 ix86_incoming_stack_boundary
12932 = ix86_minimum_incoming_stack_boundary (false);
12934 /* x86_64 varargs needs 16-byte stack alignment for the register save
12935 area. */
12936 if (TARGET_64BIT
12937 && cfun->stdarg
12938 && crtl->stack_alignment_estimated < 128)
12939 crtl->stack_alignment_estimated = 128;
12941 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12942 if (ix86_tls_descriptor_calls_expanded_in_cfun
12943 && crtl->preferred_stack_boundary < 128)
12944 crtl->preferred_stack_boundary = 128;
12947 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12948 needed or an rtx for DRAP otherwise. */
12950 static rtx
12951 ix86_get_drap_rtx (void)
12953 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
12954 crtl->need_drap = true;
12956 if (stack_realign_drap)
12958 /* Assign DRAP to vDRAP and return vDRAP. */
12959 unsigned int regno = find_drap_reg ();
12960 rtx drap_vreg;
12961 rtx arg_ptr;
12962 rtx_insn *seq, *insn;
12964 arg_ptr = gen_rtx_REG (Pmode, regno);
12965 crtl->drap_reg = arg_ptr;
12967 start_sequence ();
12968 drap_vreg = copy_to_reg (arg_ptr);
12969 seq = get_insns ();
12970 end_sequence ();
12972 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12973 if (!optimize)
12975 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12976 RTX_FRAME_RELATED_P (insn) = 1;
12978 return drap_vreg;
12980 else
12981 return NULL;
12984 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12986 static rtx
12987 ix86_internal_arg_pointer (void)
12989 return virtual_incoming_args_rtx;
12992 struct scratch_reg {
12993 rtx reg;
12994 bool saved;
12997 /* Return a short-lived scratch register for use on function entry.
12998 In 32-bit mode, it is valid only after the registers are saved
12999 in the prologue. This register must be released by means of
13000 release_scratch_register_on_entry once it is dead. */
13002 static void
13003 get_scratch_register_on_entry (struct scratch_reg *sr)
13005 int regno;
13007 sr->saved = false;
13009 if (TARGET_64BIT)
13011 /* We always use R11 in 64-bit mode. */
13012 regno = R11_REG;
13014 else
13016 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13017 bool fastcall_p
13018 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13019 bool thiscall_p
13020 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13021 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13022 int regparm = ix86_function_regparm (fntype, decl);
13023 int drap_regno
13024 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13026 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13027 for the static chain register. */
13028 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13029 && drap_regno != AX_REG)
13030 regno = AX_REG;
13031 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13032 for the static chain register. */
13033 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13034 regno = AX_REG;
13035 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13036 regno = DX_REG;
13037 /* ecx is the static chain register. */
13038 else if (regparm < 3 && !fastcall_p && !thiscall_p
13039 && !static_chain_p
13040 && drap_regno != CX_REG)
13041 regno = CX_REG;
13042 else if (ix86_save_reg (BX_REG, true))
13043 regno = BX_REG;
13044 /* esi is the static chain register. */
13045 else if (!(regparm == 3 && static_chain_p)
13046 && ix86_save_reg (SI_REG, true))
13047 regno = SI_REG;
13048 else if (ix86_save_reg (DI_REG, true))
13049 regno = DI_REG;
13050 else
13052 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13053 sr->saved = true;
13057 sr->reg = gen_rtx_REG (Pmode, regno);
13058 if (sr->saved)
13060 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13061 RTX_FRAME_RELATED_P (insn) = 1;
13065 /* Release a scratch register obtained from the preceding function. */
13067 static void
13068 release_scratch_register_on_entry (struct scratch_reg *sr)
13070 if (sr->saved)
13072 struct machine_function *m = cfun->machine;
13073 rtx x, insn = emit_insn (gen_pop (sr->reg));
13075 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13076 RTX_FRAME_RELATED_P (insn) = 1;
13077 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13078 x = gen_rtx_SET (stack_pointer_rtx, x);
13079 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13080 m->fs.sp_offset -= UNITS_PER_WORD;
13084 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13086 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13088 static void
13089 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13091 /* We skip the probe for the first interval + a small dope of 4 words and
13092 probe that many bytes past the specified size to maintain a protection
13093 area at the bottom of the stack. */
13094 const int dope = 4 * UNITS_PER_WORD;
13095 rtx size_rtx = GEN_INT (size), last;
13097 /* See if we have a constant small number of probes to generate. If so,
13098 that's the easy case. The run-time loop is made up of 9 insns in the
13099 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13100 for n # of intervals. */
13101 if (size <= 4 * PROBE_INTERVAL)
13103 HOST_WIDE_INT i, adjust;
13104 bool first_probe = true;
13106 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13107 values of N from 1 until it exceeds SIZE. If only one probe is
13108 needed, this will not generate any code. Then adjust and probe
13109 to PROBE_INTERVAL + SIZE. */
13110 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13112 if (first_probe)
13114 adjust = 2 * PROBE_INTERVAL + dope;
13115 first_probe = false;
13117 else
13118 adjust = PROBE_INTERVAL;
13120 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13121 plus_constant (Pmode, stack_pointer_rtx,
13122 -adjust)));
13123 emit_stack_probe (stack_pointer_rtx);
13126 if (first_probe)
13127 adjust = size + PROBE_INTERVAL + dope;
13128 else
13129 adjust = size + PROBE_INTERVAL - i;
13131 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13132 plus_constant (Pmode, stack_pointer_rtx,
13133 -adjust)));
13134 emit_stack_probe (stack_pointer_rtx);
13136 /* Adjust back to account for the additional first interval. */
13137 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13138 plus_constant (Pmode, stack_pointer_rtx,
13139 PROBE_INTERVAL + dope)));
13142 /* Otherwise, do the same as above, but in a loop. Note that we must be
13143 extra careful with variables wrapping around because we might be at
13144 the very top (or the very bottom) of the address space and we have
13145 to be able to handle this case properly; in particular, we use an
13146 equality test for the loop condition. */
13147 else
13149 HOST_WIDE_INT rounded_size;
13150 struct scratch_reg sr;
13152 get_scratch_register_on_entry (&sr);
13155 /* Step 1: round SIZE to the previous multiple of the interval. */
13157 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13160 /* Step 2: compute initial and final value of the loop counter. */
13162 /* SP = SP_0 + PROBE_INTERVAL. */
13163 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13164 plus_constant (Pmode, stack_pointer_rtx,
13165 - (PROBE_INTERVAL + dope))));
13167 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13168 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13169 emit_insn (gen_rtx_SET (sr.reg,
13170 plus_constant (Pmode, stack_pointer_rtx,
13171 -rounded_size)));
13172 else
13174 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13175 emit_insn (gen_rtx_SET (sr.reg,
13176 gen_rtx_PLUS (Pmode, sr.reg,
13177 stack_pointer_rtx)));
13181 /* Step 3: the loop
13185 SP = SP + PROBE_INTERVAL
13186 probe at SP
13188 while (SP != LAST_ADDR)
13190 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13191 values of N from 1 until it is equal to ROUNDED_SIZE. */
13193 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13196 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13197 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13199 if (size != rounded_size)
13201 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13202 plus_constant (Pmode, stack_pointer_rtx,
13203 rounded_size - size)));
13204 emit_stack_probe (stack_pointer_rtx);
13207 /* Adjust back to account for the additional first interval. */
13208 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13209 plus_constant (Pmode, stack_pointer_rtx,
13210 PROBE_INTERVAL + dope)));
13212 release_scratch_register_on_entry (&sr);
13215 /* Even if the stack pointer isn't the CFA register, we need to correctly
13216 describe the adjustments made to it, in particular differentiate the
13217 frame-related ones from the frame-unrelated ones. */
13218 if (size > 0)
13220 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13221 XVECEXP (expr, 0, 0)
13222 = gen_rtx_SET (stack_pointer_rtx,
13223 plus_constant (Pmode, stack_pointer_rtx, -size));
13224 XVECEXP (expr, 0, 1)
13225 = gen_rtx_SET (stack_pointer_rtx,
13226 plus_constant (Pmode, stack_pointer_rtx,
13227 PROBE_INTERVAL + dope + size));
13228 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13229 RTX_FRAME_RELATED_P (last) = 1;
13231 cfun->machine->fs.sp_offset += size;
13234 /* Make sure nothing is scheduled before we are done. */
13235 emit_insn (gen_blockage ());
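/* Worked example of the constant-size case above, assuming the default
   PROBE_INTERVAL of 4096 (STACK_CHECK_PROBE_INTERVAL_EXP == 12) and the
   64-bit dope of 32 bytes: for SIZE == 8192 the unrolled sequence drops
   SP by 2*4096 + 32 == 8224 and probes, drops it by the remaining
   8192 + 4096 - 8192 == 4096 and probes again (successive probes stay one
   PROBE_INTERVAL apart), and the final adjustment adds back 4096 + 32,
   leaving SP exactly SIZE lower.  */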
13238 /* Adjust the stack pointer up to REG while probing it. */
13240 const char *
13241 output_adjust_stack_and_probe (rtx reg)
13243 static int labelno = 0;
13244 char loop_lab[32];
13245 rtx xops[2];
13247 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13249 /* Loop. */
13250 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13252 /* SP = SP + PROBE_INTERVAL. */
13253 xops[0] = stack_pointer_rtx;
13254 xops[1] = GEN_INT (PROBE_INTERVAL);
13255 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13257 /* Probe at SP. */
13258 xops[1] = const0_rtx;
13259 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13261 /* Test if SP == LAST_ADDR. */
13262 xops[0] = stack_pointer_rtx;
13263 xops[1] = reg;
13264 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13266 /* Branch. */
13267 fputs ("\tjne\t", asm_out_file);
13268 assemble_name_raw (asm_out_file, loop_lab);
13269 fputc ('\n', asm_out_file);
13271 return "";
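/* As a rough sketch, in 64-bit AT&T syntax with the default 4096-byte
   PROBE_INTERVAL and %r11 as the scratch LAST_ADDR register, the loop
   emitted above reads approximately:
       .LPSRL0:
           subq    $4096, %rsp
           orq     $0, (%rsp)
           cmpq    %r11, %rsp
           jne     .LPSRL0
   (exact registers and operand sizes depend on the target and options).  */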
13274 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13275 inclusive. These are offsets from the current stack pointer. */
13277 static void
13278 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13280 /* See if we have a constant small number of probes to generate. If so,
13281 that's the easy case. The run-time loop is made up of 6 insns in the
13282 generic case while the compile-time loop is made up of n insns for n #
13283 of intervals. */
13284 if (size <= 6 * PROBE_INTERVAL)
13286 HOST_WIDE_INT i;
13288 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13289 it exceeds SIZE. If only one probe is needed, this will not
13290 generate any code. Then probe at FIRST + SIZE. */
13291 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13292 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13293 -(first + i)));
13295 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13296 -(first + size)));
13299 /* Otherwise, do the same as above, but in a loop. Note that we must be
13300 extra careful with variables wrapping around because we might be at
13301 the very top (or the very bottom) of the address space and we have
13302 to be able to handle this case properly; in particular, we use an
13303 equality test for the loop condition. */
13304 else
13306 HOST_WIDE_INT rounded_size, last;
13307 struct scratch_reg sr;
13309 get_scratch_register_on_entry (&sr);
13312 /* Step 1: round SIZE to the previous multiple of the interval. */
13314 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13317 /* Step 2: compute initial and final value of the loop counter. */
13319 /* TEST_OFFSET = FIRST. */
13320 emit_move_insn (sr.reg, GEN_INT (-first));
13322 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13323 last = first + rounded_size;
13326 /* Step 3: the loop
13330 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13331 probe at TEST_ADDR
13333 while (TEST_ADDR != LAST_ADDR)
13335 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13336 until it is equal to ROUNDED_SIZE. */
13338 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13341 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13342 that SIZE is equal to ROUNDED_SIZE. */
13344 if (size != rounded_size)
13345 emit_stack_probe (plus_constant (Pmode,
13346 gen_rtx_PLUS (Pmode,
13347 stack_pointer_rtx,
13348 sr.reg),
13349 rounded_size - size));
13351 release_scratch_register_on_entry (&sr);
13354 /* Make sure nothing is scheduled before we are done. */
13355 emit_insn (gen_blockage ());
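/* Numeric sketch of the small-size branch above, again assuming a
   4096-byte PROBE_INTERVAL: with FIRST == 8192 and SIZE == 10000 the
   unrolled sequence probes at sp - 12288, sp - 16384 and finally
   sp - 18192 (FIRST + SIZE), without moving the stack pointer; larger
   sizes fall through to the loop form driven by the scratch register.  */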
13358 /* Probe a range of stack addresses from REG to END, inclusive. These are
13359 offsets from the current stack pointer. */
13361 const char *
13362 output_probe_stack_range (rtx reg, rtx end)
13364 static int labelno = 0;
13365 char loop_lab[32];
13366 rtx xops[3];
13368 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13370 /* Loop. */
13371 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13373 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13374 xops[0] = reg;
13375 xops[1] = GEN_INT (PROBE_INTERVAL);
13376 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13378 /* Probe at TEST_ADDR. */
13379 xops[0] = stack_pointer_rtx;
13380 xops[1] = reg;
13381 xops[2] = const0_rtx;
13382 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13384 /* Test if TEST_ADDR == LAST_ADDR. */
13385 xops[0] = reg;
13386 xops[1] = end;
13387 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13389 /* Branch. */
13390 fputs ("\tjne\t", asm_out_file);
13391 assemble_name_raw (asm_out_file, loop_lab);
13392 fputc ('\n', asm_out_file);
13394 return "";
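/* Roughly, with a 4096-byte PROBE_INTERVAL and %r11 holding the running
   (negative) test offset, the 64-bit AT&T output of the loop above looks
   like:
       .LPSRL1:
           subq    $4096, %r11
           orq     $0, (%rsp,%r11)
           cmpq    $-LAST, %r11
           jne     .LPSRL1
   where LAST is FIRST + ROUNDED_SIZE in the caller; the exact operand
   forms follow the templates printed above.  */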
13397 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
13398 to be generated in correct form. */
13399 static void
13400 ix86_finalize_stack_realign_flags (void)
13402 /* Check if stack realignment is really needed after reload, and
13403 store the result in cfun. */
13404 unsigned int incoming_stack_boundary
13405 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13406 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13407 unsigned int stack_realign
13408 = (incoming_stack_boundary
13409 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13410 ? crtl->max_used_stack_slot_alignment
13411 : crtl->stack_alignment_needed));
13413 if (crtl->stack_realign_finalized)
13415 /* After stack_realign_needed is finalized, we can no longer
13416 change it. */
13417 gcc_assert (crtl->stack_realign_needed == stack_realign);
13418 return;
13421 /* If the only reason for frame_pointer_needed is that we conservatively
13422 assumed stack realignment might be needed, but in the end nothing that
13423 needed the stack alignment had been spilled, clear frame_pointer_needed
13424 and say we don't need stack realignment. */
13425 if (stack_realign
13426 && frame_pointer_needed
13427 && crtl->is_leaf
13428 && flag_omit_frame_pointer
13429 && crtl->sp_is_unchanging
13430 && !ix86_current_function_calls_tls_descriptor
13431 && !crtl->accesses_prior_frames
13432 && !cfun->calls_alloca
13433 && !crtl->calls_eh_return
13434 /* See ira_setup_eliminable_regset for the rationale. */
13435 && !(STACK_CHECK_MOVING_SP
13436 && flag_stack_check
13437 && flag_exceptions
13438 && cfun->can_throw_non_call_exceptions)
13439 && !ix86_frame_pointer_required ()
13440 && get_frame_size () == 0
13441 && ix86_nsaved_sseregs () == 0
13442 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13444 HARD_REG_SET set_up_by_prologue, prologue_used;
13445 basic_block bb;
13447 CLEAR_HARD_REG_SET (prologue_used);
13448 CLEAR_HARD_REG_SET (set_up_by_prologue);
13449 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13450 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13451 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13452 HARD_FRAME_POINTER_REGNUM);
13453 FOR_EACH_BB_FN (bb, cfun)
13455 rtx_insn *insn;
13456 FOR_BB_INSNS (bb, insn)
13457 if (NONDEBUG_INSN_P (insn)
13458 && requires_stack_frame_p (insn, prologue_used,
13459 set_up_by_prologue))
13461 crtl->stack_realign_needed = stack_realign;
13462 crtl->stack_realign_finalized = true;
13463 return;
13467 /* If drap has been set, but it actually isn't live at the start
13468 of the function, there is no reason to set it up. */
13469 if (crtl->drap_reg)
13471 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13472 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
13474 crtl->drap_reg = NULL_RTX;
13475 crtl->need_drap = false;
13478 else
13479 cfun->machine->no_drap_save_restore = true;
13481 frame_pointer_needed = false;
13482 stack_realign = false;
13483 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13484 crtl->stack_alignment_needed = incoming_stack_boundary;
13485 crtl->stack_alignment_estimated = incoming_stack_boundary;
13486 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13487 crtl->preferred_stack_boundary = incoming_stack_boundary;
13488 df_finish_pass (true);
13489 df_scan_alloc (NULL);
13490 df_scan_blocks ();
13491 df_compute_regs_ever_live (true);
13492 df_analyze ();
13495 crtl->stack_realign_needed = stack_realign;
13496 crtl->stack_realign_finalized = true;
13499 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13501 static void
13502 ix86_elim_entry_set_got (rtx reg)
13504 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13505 rtx_insn *c_insn = BB_HEAD (bb);
13506 if (!NONDEBUG_INSN_P (c_insn))
13507 c_insn = next_nonnote_nondebug_insn (c_insn);
13508 if (c_insn && NONJUMP_INSN_P (c_insn))
13510 rtx pat = PATTERN (c_insn);
13511 if (GET_CODE (pat) == PARALLEL)
13513 rtx vec = XVECEXP (pat, 0, 0);
13514 if (GET_CODE (vec) == SET
13515 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13516 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13517 delete_insn (c_insn);
13522 /* Expand the prologue into a bunch of separate insns. */
13524 void
13525 ix86_expand_prologue (void)
13527 struct machine_function *m = cfun->machine;
13528 rtx insn, t;
13529 struct ix86_frame frame;
13530 HOST_WIDE_INT allocate;
13531 bool int_registers_saved;
13532 bool sse_registers_saved;
13533 rtx static_chain = NULL_RTX;
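/* Roughly, the code below proceeds as follows: finalize the realignment
   flags and compute the frame layout; handle the ms_hook or
   static-chain-on-stack entry sequences; set up DRAP and re-align the
   stack for stack_realign_drap; push and establish the frame pointer;
   save the integer registers with pushes or moves; re-align for
   stack_realign_fp; probe and allocate the remaining frame; finish any
   delayed frame-pointer setup and remaining integer/SSE saves; and emit
   the SET_GOT, blockage and SEH housekeeping at the end.  */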
13535 ix86_finalize_stack_realign_flags ();
13537 /* DRAP should not coexist with stack_realign_fp */
13538 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13540 memset (&m->fs, 0, sizeof (m->fs));
13542 /* Initialize CFA state for before the prologue. */
13543 m->fs.cfa_reg = stack_pointer_rtx;
13544 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13546 /* Track SP offset to the CFA. We continue tracking this after we've
13547 swapped the CFA register away from SP. In the case of re-alignment
13548 this is fudged; we're interested in offsets within the local frame. */
13549 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13550 m->fs.sp_valid = true;
13552 ix86_compute_frame_layout (&frame);
13554 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13556 /* We should have already generated an error for any use of
13557 ms_hook on a nested function. */
13558 gcc_checking_assert (!ix86_static_chain_on_stack);
13560 /* Check if profiling is active and we shall use the profiling-before-prologue
13561 variant. If so, issue a sorry. */
13562 if (crtl->profile && flag_fentry != 0)
13563 sorry ("ms_hook_prologue attribute isn%'t compatible "
13564 "with -mfentry for 32-bit");
13566 /* In ix86_asm_output_function_label we emitted:
13567 8b ff movl.s %edi,%edi
13568 55 push %ebp
13569 8b ec movl.s %esp,%ebp
13571 This matches the hookable function prologue in Win32 API
13572 functions in Microsoft Windows XP Service Pack 2 and newer.
13573 Wine uses this to enable Windows apps to hook the Win32 API
13574 functions provided by Wine.
13576 What that means is that we've already set up the frame pointer. */
13578 if (frame_pointer_needed
13579 && !(crtl->drap_reg && crtl->stack_realign_needed))
13581 rtx push, mov;
13583 /* We've decided to use the frame pointer already set up.
13584 Describe this to the unwinder by pretending that both
13585 push and mov insns happen right here.
13587 Putting the unwind info here at the end of the ms_hook
13588 is done so that we can make absolutely certain we get
13589 the required byte sequence at the start of the function,
13590 rather than relying on an assembler that can produce
13591 the exact encoding required.
13593 However, it does mean (in the unpatched case) that we have
13594 a one-insn window where the asynchronous unwind info is
13595 incorrect. On the other hand, if we placed the unwind info at
13596 its correct location we would have incorrect unwind info
13597 in the patched case. This is probably all moot since
13598 I don't expect Wine to generate dwarf2 unwind info for the
13599 system libraries that use this feature. */
13601 insn = emit_insn (gen_blockage ());
13603 push = gen_push (hard_frame_pointer_rtx);
13604 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13605 stack_pointer_rtx);
13606 RTX_FRAME_RELATED_P (push) = 1;
13607 RTX_FRAME_RELATED_P (mov) = 1;
13609 RTX_FRAME_RELATED_P (insn) = 1;
13610 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13611 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13613 /* Note that gen_push incremented m->fs.cfa_offset, even
13614 though we didn't emit the push insn here. */
13615 m->fs.cfa_reg = hard_frame_pointer_rtx;
13616 m->fs.fp_offset = m->fs.cfa_offset;
13617 m->fs.fp_valid = true;
13619 else
13621 /* The frame pointer is not needed so pop %ebp again.
13622 This leaves us with a pristine state. */
13623 emit_insn (gen_pop (hard_frame_pointer_rtx));
13627 /* The first insn of a function that accepts its static chain on the
13628 stack is to push the register that would be filled in by a direct
13629 call. This insn will be skipped by the trampoline. */
13630 else if (ix86_static_chain_on_stack)
13632 static_chain = ix86_static_chain (cfun->decl, false);
13633 insn = emit_insn (gen_push (static_chain));
13634 emit_insn (gen_blockage ());
13636 /* We don't want to interpret this push insn as a register save,
13637 only as a stack adjustment. The real copy of the register as
13638 a save will be done later, if needed. */
13639 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13640 t = gen_rtx_SET (stack_pointer_rtx, t);
13641 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13642 RTX_FRAME_RELATED_P (insn) = 1;
13645 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13646 DRAP is needed and stack realignment is really needed after reload. */
13647 if (stack_realign_drap)
13649 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13651 /* Can't use DRAP in interrupt function. */
13652 if (cfun->machine->func_type != TYPE_NORMAL)
13653 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13654 "in interrupt service routine. This may be worked "
13655 "around by avoiding functions with aggregate return.");
13657 /* Only need to push parameter pointer reg if it is caller saved. */
13658 if (!call_used_regs[REGNO (crtl->drap_reg)])
13660 /* Push arg pointer reg */
13661 insn = emit_insn (gen_push (crtl->drap_reg));
13662 RTX_FRAME_RELATED_P (insn) = 1;
13665 /* Grab the argument pointer. */
13666 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13667 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13668 RTX_FRAME_RELATED_P (insn) = 1;
13669 m->fs.cfa_reg = crtl->drap_reg;
13670 m->fs.cfa_offset = 0;
13672 /* Align the stack. */
13673 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13674 stack_pointer_rtx,
13675 GEN_INT (-align_bytes)));
13676 RTX_FRAME_RELATED_P (insn) = 1;
13678 /* Replicate the return address on the stack so that return
13679 address can be reached via (argp - 1) slot. This is needed
13680 to implement macro RETURN_ADDR_RTX and intrinsic function
13681 expand_builtin_return_addr etc. */
13682 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13683 t = gen_frame_mem (word_mode, t);
13684 insn = emit_insn (gen_push (t));
13685 RTX_FRAME_RELATED_P (insn) = 1;
13687 /* For the purposes of frame and register save area addressing,
13688 we've started over with a new frame. */
13689 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13690 m->fs.realigned = true;
13692 if (static_chain)
13694 /* Replicate static chain on the stack so that static chain
13695 can be reached via (argp - 2) slot. This is needed for
13696 nested function with stack realignment. */
13697 insn = emit_insn (gen_push (static_chain));
13698 RTX_FRAME_RELATED_P (insn) = 1;
13702 int_registers_saved = (frame.nregs == 0);
13703 sse_registers_saved = (frame.nsseregs == 0);
13705 if (frame_pointer_needed && !m->fs.fp_valid)
13707 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13708 slower on all targets. Also sdb doesn't like it. */
13709 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13710 RTX_FRAME_RELATED_P (insn) = 1;
13712 /* Push registers now, before setting the frame pointer
13713 on SEH target. */
13714 if (!int_registers_saved
13715 && TARGET_SEH
13716 && !frame.save_regs_using_mov)
13718 ix86_emit_save_regs ();
13719 int_registers_saved = true;
13720 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13723 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13725 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13726 RTX_FRAME_RELATED_P (insn) = 1;
13728 if (m->fs.cfa_reg == stack_pointer_rtx)
13729 m->fs.cfa_reg = hard_frame_pointer_rtx;
13730 m->fs.fp_offset = m->fs.sp_offset;
13731 m->fs.fp_valid = true;
13735 if (!int_registers_saved)
13737 /* If saving registers via PUSH, do so now. */
13738 if (!frame.save_regs_using_mov)
13740 ix86_emit_save_regs ();
13741 int_registers_saved = true;
13742 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13745 /* When using red zone we may start register saving before allocating
13746 the stack frame saving one cycle of the prologue. However, avoid
13747 doing this if we have to probe the stack; at least on x86_64 the
13748 stack probe can turn into a call that clobbers a red zone location. */
13749 else if (ix86_using_red_zone ()
13750 && (! TARGET_STACK_PROBE
13751 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13753 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13754 int_registers_saved = true;
13758 if (stack_realign_fp)
13760 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13761 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13763 /* The computation of the size of the re-aligned stack frame means
13764 that we must allocate the size of the register save area before
13765 performing the actual alignment. Otherwise we cannot guarantee
13766 that there's enough storage above the realignment point. */
13767 if (m->fs.sp_offset != frame.sse_reg_save_offset)
13768 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13769 GEN_INT (m->fs.sp_offset
13770 - frame.sse_reg_save_offset),
13771 -1, false);
13773 /* Align the stack. */
13774 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13775 stack_pointer_rtx,
13776 GEN_INT (-align_bytes)));
13778 /* For the purposes of register save area addressing, the stack
13779 pointer is no longer valid. As for the value of sp_offset,
13780 see ix86_compute_frame_layout, which we need to match in order
13781 to pass verification of stack_pointer_offset at the end. */
13782 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13783 m->fs.sp_valid = false;
13786 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13788 if (flag_stack_usage_info)
13790 /* We start to count from ARG_POINTER. */
13791 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13793 /* If it was realigned, take into account the fake frame. */
13794 if (stack_realign_drap)
13796 if (ix86_static_chain_on_stack)
13797 stack_size += UNITS_PER_WORD;
13799 if (!call_used_regs[REGNO (crtl->drap_reg)])
13800 stack_size += UNITS_PER_WORD;
13802 /* This over-estimates by 1 minimal-stack-alignment-unit but
13803 mitigates that by counting in the new return address slot. */
13804 current_function_dynamic_stack_size
13805 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13808 current_function_static_stack_size = stack_size;
13811 /* On SEH target with very large frame size, allocate an area to save
13812 SSE registers (as the very large allocation won't be described). */
13813 if (TARGET_SEH
13814 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13815 && !sse_registers_saved)
13817 HOST_WIDE_INT sse_size =
13818 frame.sse_reg_save_offset - frame.reg_save_offset;
13820 gcc_assert (int_registers_saved);
13822 /* No need to do stack checking as the area will be immediately
13823 written. */
13824 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13825 GEN_INT (-sse_size), -1,
13826 m->fs.cfa_reg == stack_pointer_rtx);
13827 allocate -= sse_size;
13828 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13829 sse_registers_saved = true;
13832 /* The stack has already been decremented by the instruction calling us
13833 so probe if the size is non-negative to preserve the protection area. */
13834 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
13836 /* We expect the registers to be saved when probes are used. */
13837 gcc_assert (int_registers_saved);
13839 if (STACK_CHECK_MOVING_SP)
13841 if (!(crtl->is_leaf && !cfun->calls_alloca
13842 && allocate <= PROBE_INTERVAL))
13844 ix86_adjust_stack_and_probe (allocate);
13845 allocate = 0;
13848 else
13850 HOST_WIDE_INT size = allocate;
13852 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13853 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
13855 if (TARGET_STACK_PROBE)
13857 if (crtl->is_leaf && !cfun->calls_alloca)
13859 if (size > PROBE_INTERVAL)
13860 ix86_emit_probe_stack_range (0, size);
13862 else
13863 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
13865 else
13867 if (crtl->is_leaf && !cfun->calls_alloca)
13869 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
13870 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
13871 size - STACK_CHECK_PROTECT);
13873 else
13874 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
13879 if (allocate == 0)
13881 else if (!ix86_target_stack_probe ()
13882 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13884 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13885 GEN_INT (-allocate), -1,
13886 m->fs.cfa_reg == stack_pointer_rtx);
13888 else
13890 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13891 rtx r10 = NULL;
13892 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13893 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13894 bool eax_live = ix86_eax_live_at_start_p ();
13895 bool r10_live = false;
13897 if (TARGET_64BIT)
13898 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13900 if (eax_live)
13902 insn = emit_insn (gen_push (eax));
13903 allocate -= UNITS_PER_WORD;
13904 /* Note that SEH directives need to continue tracking the stack
13905 pointer even after the frame pointer has been set up. */
13906 if (sp_is_cfa_reg || TARGET_SEH)
13908 if (sp_is_cfa_reg)
13909 m->fs.cfa_offset += UNITS_PER_WORD;
13910 RTX_FRAME_RELATED_P (insn) = 1;
13911 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13912 gen_rtx_SET (stack_pointer_rtx,
13913 plus_constant (Pmode, stack_pointer_rtx,
13914 -UNITS_PER_WORD)));
13918 if (r10_live)
13920 r10 = gen_rtx_REG (Pmode, R10_REG);
13921 insn = emit_insn (gen_push (r10));
13922 allocate -= UNITS_PER_WORD;
13923 if (sp_is_cfa_reg || TARGET_SEH)
13925 if (sp_is_cfa_reg)
13926 m->fs.cfa_offset += UNITS_PER_WORD;
13927 RTX_FRAME_RELATED_P (insn) = 1;
13928 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13929 gen_rtx_SET (stack_pointer_rtx,
13930 plus_constant (Pmode, stack_pointer_rtx,
13931 -UNITS_PER_WORD)));
13935 emit_move_insn (eax, GEN_INT (allocate));
13936 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13938 /* Use the fact that AX still contains ALLOCATE. */
13939 adjust_stack_insn = (Pmode == DImode
13940 ? gen_pro_epilogue_adjust_stack_di_sub
13941 : gen_pro_epilogue_adjust_stack_si_sub);
13943 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13944 stack_pointer_rtx, eax));
13946 if (sp_is_cfa_reg || TARGET_SEH)
13948 if (sp_is_cfa_reg)
13949 m->fs.cfa_offset += allocate;
13950 RTX_FRAME_RELATED_P (insn) = 1;
13951 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13952 gen_rtx_SET (stack_pointer_rtx,
13953 plus_constant (Pmode, stack_pointer_rtx,
13954 -allocate)));
13956 m->fs.sp_offset += allocate;
13958 /* Use stack_pointer_rtx for relative addressing so that code
13959 works for realigned stack, too. */
13960 if (r10_live && eax_live)
13962 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13963 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13964 gen_frame_mem (word_mode, t));
13965 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13966 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13967 gen_frame_mem (word_mode, t));
13969 else if (eax_live || r10_live)
13971 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13972 emit_move_insn (gen_rtx_REG (word_mode,
13973 (eax_live ? AX_REG : R10_REG)),
13974 gen_frame_mem (word_mode, t));
13977 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13979 /* If we haven't already set up the frame pointer, do so now. */
13980 if (frame_pointer_needed && !m->fs.fp_valid)
13982 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13983 GEN_INT (frame.stack_pointer_offset
13984 - frame.hard_frame_pointer_offset));
13985 insn = emit_insn (insn);
13986 RTX_FRAME_RELATED_P (insn) = 1;
13987 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13989 if (m->fs.cfa_reg == stack_pointer_rtx)
13990 m->fs.cfa_reg = hard_frame_pointer_rtx;
13991 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13992 m->fs.fp_valid = true;
13995 if (!int_registers_saved)
13996 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13997 if (!sse_registers_saved)
13998 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14000 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
14001 in the prologue. */
14002 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14004 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14005 insn = emit_insn (gen_set_got (pic));
14006 RTX_FRAME_RELATED_P (insn) = 1;
14007 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14008 emit_insn (gen_prologue_use (pic));
14009 /* Delete the already emitted SET_GOT if it exists and is allocated to
14010 REAL_PIC_OFFSET_TABLE_REGNUM. */
14011 ix86_elim_entry_set_got (pic);
14014 if (crtl->drap_reg && !crtl->stack_realign_needed)
14016 /* vDRAP is set up, but after reload it turns out stack realignment
14017 isn't necessary; here we emit prologue code to set up DRAP
14018 without the stack realignment adjustment. */
14019 t = choose_baseaddr (0);
14020 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14023 /* Prevent instructions from being scheduled into register save push
14024 sequence when access to the redzone area is done through frame pointer.
14025 The offset between the frame pointer and the stack pointer is calculated
14026 relative to the value of the stack pointer at the end of the function
14027 prologue, and moving instructions that access redzone area via frame
14028 pointer inside push sequence violates this assumption. */
14029 if (frame_pointer_needed && frame.red_zone_size)
14030 emit_insn (gen_memory_blockage ());
14032 /* SEH requires that the prologue end within 256 bytes of the start of
14033 the function. Prevent instruction schedules that would extend that.
14034 Further, prevent alloca modifications to the stack pointer from being
14035 combined with prologue modifications. */
14036 if (TARGET_SEH)
14037 emit_insn (gen_prologue_use (stack_pointer_rtx));
14040 /* Emit code to restore REG using a POP insn. */
14042 static void
14043 ix86_emit_restore_reg_using_pop (rtx reg)
14045 struct machine_function *m = cfun->machine;
14046 rtx_insn *insn = emit_insn (gen_pop (reg));
14048 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14049 m->fs.sp_offset -= UNITS_PER_WORD;
14051 if (m->fs.cfa_reg == crtl->drap_reg
14052 && REGNO (reg) == REGNO (crtl->drap_reg))
14054 /* Previously we'd represented the CFA as an expression
14055 like *(%ebp - 8). We've just popped that value from
14056 the stack, which means we need to reset the CFA to
14057 the drap register. This will remain until we restore
14058 the stack pointer. */
14059 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14060 RTX_FRAME_RELATED_P (insn) = 1;
14062 /* This means that the DRAP register is valid for addressing too. */
14063 m->fs.drap_valid = true;
14064 return;
14067 if (m->fs.cfa_reg == stack_pointer_rtx)
14069 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14070 x = gen_rtx_SET (stack_pointer_rtx, x);
14071 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14072 RTX_FRAME_RELATED_P (insn) = 1;
14074 m->fs.cfa_offset -= UNITS_PER_WORD;
14077 /* When the frame pointer is the CFA, and we pop it, we are
14078 swapping back to the stack pointer as the CFA. This happens
14079 for stack frames that don't allocate other data, so we assume
14080 the stack pointer is now pointing at the return address, i.e.
14081 the function entry state, which makes the offset be 1 word. */
14082 if (reg == hard_frame_pointer_rtx)
14084 m->fs.fp_valid = false;
14085 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14087 m->fs.cfa_reg = stack_pointer_rtx;
14088 m->fs.cfa_offset -= UNITS_PER_WORD;
14090 add_reg_note (insn, REG_CFA_DEF_CFA,
14091 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14092 GEN_INT (m->fs.cfa_offset)));
14093 RTX_FRAME_RELATED_P (insn) = 1;
14098 /* Emit code to restore saved registers using POP insns. */
14100 static void
14101 ix86_emit_restore_regs_using_pop (void)
14103 unsigned int regno;
14105 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14106 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false))
14107 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14110 /* Emit code and notes for the LEAVE instruction. */
14112 static void
14113 ix86_emit_leave (void)
14115 struct machine_function *m = cfun->machine;
14116 rtx_insn *insn = emit_insn (ix86_gen_leave ());
14118 ix86_add_queued_cfa_restore_notes (insn);
14120 gcc_assert (m->fs.fp_valid);
14121 m->fs.sp_valid = true;
14122 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14123 m->fs.fp_valid = false;
14125 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14127 m->fs.cfa_reg = stack_pointer_rtx;
14128 m->fs.cfa_offset = m->fs.sp_offset;
14130 add_reg_note (insn, REG_CFA_DEF_CFA,
14131 plus_constant (Pmode, stack_pointer_rtx,
14132 m->fs.sp_offset));
14133 RTX_FRAME_RELATED_P (insn) = 1;
14135 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14136 m->fs.fp_offset);
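/* Recall that LEAVE is equivalent to "mov %rbp, %rsp; pop %rbp" (or the
   32-bit forms), which is why the code above revalidates SP at
   fp_offset - UNITS_PER_WORD and invalidates the frame pointer.  */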
14139 /* Emit code to restore saved registers using MOV insns.
14140 First register is restored from CFA - CFA_OFFSET. */
14141 static void
14142 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14143 bool maybe_eh_return)
14145 struct machine_function *m = cfun->machine;
14146 unsigned int regno;
14148 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14149 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14151 rtx reg = gen_rtx_REG (word_mode, regno);
14152 rtx mem;
14153 rtx_insn *insn;
14155 mem = choose_baseaddr (cfa_offset);
14156 mem = gen_frame_mem (word_mode, mem);
14157 insn = emit_move_insn (reg, mem);
14159 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14161 /* Previously we'd represented the CFA as an expression
14162 like *(%ebp - 8). We've just popped that value from
14163 the stack, which means we need to reset the CFA to
14164 the drap register. This will remain until we restore
14165 the stack pointer. */
14166 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14167 RTX_FRAME_RELATED_P (insn) = 1;
14169 /* This means that the DRAP register is valid for addressing. */
14170 m->fs.drap_valid = true;
14172 else
14173 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14175 cfa_offset -= UNITS_PER_WORD;
14179 /* Emit code to restore saved SSE registers using MOV insns.
14180 First register is restored from CFA - CFA_OFFSET. */
14181 static void
14182 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14183 bool maybe_eh_return)
14185 unsigned int regno;
14187 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14188 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14190 rtx reg = gen_rtx_REG (V4SFmode, regno);
14191 rtx mem;
14192 unsigned int align;
14194 mem = choose_baseaddr (cfa_offset);
14195 mem = gen_rtx_MEM (V4SFmode, mem);
14197 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
14198 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
14199 set_mem_align (mem, align);
14200 emit_insn (gen_rtx_SET (reg, mem));
14202 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14204 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14208 /* Restore function stack, frame, and registers. */
14210 void
14211 ix86_expand_epilogue (int style)
14213 struct machine_function *m = cfun->machine;
14214 struct machine_frame_state frame_state_save = m->fs;
14215 struct ix86_frame frame;
14216 bool restore_regs_via_mov;
14217 bool using_drap;
14219 ix86_finalize_stack_realign_flags ();
14220 ix86_compute_frame_layout (&frame);
14222 m->fs.sp_valid = (!frame_pointer_needed
14223 || (crtl->sp_is_unchanging
14224 && !stack_realign_fp));
14225 gcc_assert (!m->fs.sp_valid
14226 || m->fs.sp_offset == frame.stack_pointer_offset);
14228 /* The FP must be valid exactly when a frame pointer is needed. */
14229 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14230 gcc_assert (!m->fs.fp_valid
14231 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14233 /* We must have *some* valid pointer to the stack frame. */
14234 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14236 /* The DRAP is never valid at this point. */
14237 gcc_assert (!m->fs.drap_valid);
14239 /* See the comment about red zone and frame
14240 pointer usage in ix86_expand_prologue. */
14241 if (frame_pointer_needed && frame.red_zone_size)
14242 emit_insn (gen_memory_blockage ());
14244 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14245 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14247 /* Determine the CFA offset of the end of the red-zone. */
14248 m->fs.red_zone_offset = 0;
14249 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14251 /* The red-zone begins below the return address. */
14252 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
14254 /* When the register save area is in the aligned portion of
14255 the stack, determine the maximum runtime displacement that
14256 matches up with the aligned frame. */
14257 if (stack_realign_drap)
14258 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14259 + UNITS_PER_WORD);
14262 /* Special care must be taken for the normal return case of a function
14263 using eh_return: the eax and edx registers are marked as saved, but
14264 not restored along this path. Adjust the save location to match. */
14265 if (crtl->calls_eh_return && style != 2)
14266 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
14268 /* EH_RETURN requires the use of moves to function properly. */
14269 if (crtl->calls_eh_return)
14270 restore_regs_via_mov = true;
14271 /* SEH requires the use of pops to identify the epilogue. */
14272 else if (TARGET_SEH)
14273 restore_regs_via_mov = false;
14274 /* If we're only restoring one register and sp is not valid, then
14275 use a move instruction to restore the register, since it's
14276 less work than reloading sp and popping the register. */
14277 else if (!m->fs.sp_valid && frame.nregs <= 1)
14278 restore_regs_via_mov = true;
14279 else if (TARGET_EPILOGUE_USING_MOVE
14280 && cfun->machine->use_fast_prologue_epilogue
14281 && (frame.nregs > 1
14282 || m->fs.sp_offset != frame.reg_save_offset))
14283 restore_regs_via_mov = true;
14284 else if (frame_pointer_needed
14285 && !frame.nregs
14286 && m->fs.sp_offset != frame.reg_save_offset)
14287 restore_regs_via_mov = true;
14288 else if (frame_pointer_needed
14289 && TARGET_USE_LEAVE
14290 && cfun->machine->use_fast_prologue_epilogue
14291 && frame.nregs == 1)
14292 restore_regs_via_mov = true;
14293 else
14294 restore_regs_via_mov = false;
14296 if (restore_regs_via_mov || frame.nsseregs)
14298 /* Ensure that the entire register save area is addressable via
14299 the stack pointer, if we will restore via sp. */
14300 if (TARGET_64BIT
14301 && m->fs.sp_offset > 0x7fffffff
14302 && !(m->fs.fp_valid || m->fs.drap_valid)
14303 && (frame.nsseregs + frame.nregs) != 0)
14305 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14306 GEN_INT (m->fs.sp_offset
14307 - frame.sse_reg_save_offset),
14308 style,
14309 m->fs.cfa_reg == stack_pointer_rtx);
14313 /* If there are any SSE registers to restore, then we have to do it
14314 via moves, since there's obviously no pop for SSE regs. */
14315 if (frame.nsseregs)
14316 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14317 style == 2);
14319 if (restore_regs_via_mov)
14321 rtx t;
14323 if (frame.nregs)
14324 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14326 /* eh_return epilogues need %ecx added to the stack pointer. */
14327 if (style == 2)
14329 rtx sa = EH_RETURN_STACKADJ_RTX;
14330 rtx_insn *insn;
14332 /* %ecx can't be used for both DRAP register and eh_return. */
14333 if (crtl->drap_reg)
14334 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14336 /* regparm nested functions don't work with eh_return. */
14337 gcc_assert (!ix86_static_chain_on_stack);
14339 if (frame_pointer_needed)
14341 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14342 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14343 emit_insn (gen_rtx_SET (sa, t));
14345 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14346 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14348 /* Note that we use SA as a temporary CFA, as the return
14349 address is at the proper place relative to it. We
14350 pretend this happens at the FP restore insn because
14351 prior to this insn the FP would be stored at the wrong
14352 offset relative to SA, and after this insn we have no
14353 other reasonable register to use for the CFA. We don't
14354 bother resetting the CFA to the SP for the duration of
14355 the return insn. */
14356 add_reg_note (insn, REG_CFA_DEF_CFA,
14357 plus_constant (Pmode, sa, UNITS_PER_WORD));
14358 ix86_add_queued_cfa_restore_notes (insn);
14359 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14360 RTX_FRAME_RELATED_P (insn) = 1;
14362 m->fs.cfa_reg = sa;
14363 m->fs.cfa_offset = UNITS_PER_WORD;
14364 m->fs.fp_valid = false;
14366 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14367 const0_rtx, style, false);
14369 else
14371 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14372 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14373 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14374 ix86_add_queued_cfa_restore_notes (insn);
14376 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14377 if (m->fs.cfa_offset != UNITS_PER_WORD)
14379 m->fs.cfa_offset = UNITS_PER_WORD;
14380 add_reg_note (insn, REG_CFA_DEF_CFA,
14381 plus_constant (Pmode, stack_pointer_rtx,
14382 UNITS_PER_WORD));
14383 RTX_FRAME_RELATED_P (insn) = 1;
14386 m->fs.sp_offset = UNITS_PER_WORD;
14387 m->fs.sp_valid = true;
14390 else
14392 /* SEH requires that the function end with (1) a stack adjustment
14393 if necessary, (2) a sequence of pops, and (3) a return or
14394 jump instruction. Prevent insns from the function body from
14395 being scheduled into this sequence. */
14396 if (TARGET_SEH)
14398 /* Prevent a catch region from being adjacent to the standard
14399 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14400 several other flags that would be interesting to test are
14401 set up yet. */
14402 if (flag_non_call_exceptions)
14403 emit_insn (gen_nops (const1_rtx));
14404 else
14405 emit_insn (gen_blockage ());
14408 /* The first step is to deallocate the stack frame so that we can
14409 pop the registers. Also do it on SEH targets for a very large
14410 frame, as the emitted instructions aren't allowed by the ABI in
14411 epilogues. */
14412 if (!m->fs.sp_valid
14413 || (TARGET_SEH
14414 && (m->fs.sp_offset - frame.reg_save_offset
14415 >= SEH_MAX_FRAME_SIZE)))
14417 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14418 GEN_INT (m->fs.fp_offset
14419 - frame.reg_save_offset),
14420 style, false);
14422 else if (m->fs.sp_offset != frame.reg_save_offset)
14424 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14425 GEN_INT (m->fs.sp_offset
14426 - frame.reg_save_offset),
14427 style,
14428 m->fs.cfa_reg == stack_pointer_rtx);
14431 ix86_emit_restore_regs_using_pop ();
14434 /* If we used a frame pointer and haven't already got rid of it,
14435 then do so now. */
14436 if (m->fs.fp_valid)
14438 /* If the stack pointer is valid and pointing at the frame
14439 pointer store address, then we only need a pop. */
14440 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
14441 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14442 /* Leave results in shorter dependency chains on CPUs that are
14443 able to grok it fast. */
14444 else if (TARGET_USE_LEAVE
14445 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14446 || !cfun->machine->use_fast_prologue_epilogue)
14447 ix86_emit_leave ();
14448 else
14450 pro_epilogue_adjust_stack (stack_pointer_rtx,
14451 hard_frame_pointer_rtx,
14452 const0_rtx, style, !using_drap);
14453 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14457 if (using_drap)
14459 int param_ptr_offset = UNITS_PER_WORD;
14460 rtx_insn *insn;
14462 gcc_assert (stack_realign_drap);
14464 if (ix86_static_chain_on_stack)
14465 param_ptr_offset += UNITS_PER_WORD;
14466 if (!call_used_regs[REGNO (crtl->drap_reg)])
14467 param_ptr_offset += UNITS_PER_WORD;
14469 insn = emit_insn (gen_rtx_SET
14470 (stack_pointer_rtx,
14471 gen_rtx_PLUS (Pmode,
14472 crtl->drap_reg,
14473 GEN_INT (-param_ptr_offset))));
14474 m->fs.cfa_reg = stack_pointer_rtx;
14475 m->fs.cfa_offset = param_ptr_offset;
14476 m->fs.sp_offset = param_ptr_offset;
14477 m->fs.realigned = false;
14479 add_reg_note (insn, REG_CFA_DEF_CFA,
14480 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14481 GEN_INT (param_ptr_offset)));
14482 RTX_FRAME_RELATED_P (insn) = 1;
14484 if (!call_used_regs[REGNO (crtl->drap_reg)])
14485 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14488 /* At this point the stack pointer must be valid, and we must have
14489 restored all of the registers. We may not have deallocated the
14490 entire stack frame. We've delayed this until now because it may
14491 be possible to merge the local stack deallocation with the
14492 deallocation forced by ix86_static_chain_on_stack. */
14493 gcc_assert (m->fs.sp_valid);
14494 gcc_assert (!m->fs.fp_valid);
14495 gcc_assert (!m->fs.realigned);
14496 if (m->fs.sp_offset != UNITS_PER_WORD)
14498 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14499 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14500 style, true);
14502 else
14503 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14505 /* Sibcall epilogues don't want a return instruction. */
14506 if (style == 0)
14508 m->fs = frame_state_save;
14509 return;
14512 if (cfun->machine->func_type != TYPE_NORMAL)
14514 /* Return with the "IRET" instruction from the interrupt handler.
14515 Pop the 'ERROR_CODE' off the stack before the 'IRET'
14516 instruction in an exception handler. */
14517 if (cfun->machine->func_type == TYPE_EXCEPTION)
14519 rtx r = plus_constant (Pmode, stack_pointer_rtx,
14520 UNITS_PER_WORD);
14521 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
14523 emit_jump_insn (gen_interrupt_return ());
14525 else if (crtl->args.pops_args && crtl->args.size)
14527 rtx popc = GEN_INT (crtl->args.pops_args);
14529 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14530 address, do explicit add, and jump indirectly to the caller. */
14532 if (crtl->args.pops_args >= 65536)
14534 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14535 rtx_insn *insn;
14537 /* There is no "pascal" calling convention in any 64bit ABI. */
14538 gcc_assert (!TARGET_64BIT);
14540 insn = emit_insn (gen_pop (ecx));
14541 m->fs.cfa_offset -= UNITS_PER_WORD;
14542 m->fs.sp_offset -= UNITS_PER_WORD;
14544 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14545 x = gen_rtx_SET (stack_pointer_rtx, x);
14546 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14547 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14548 RTX_FRAME_RELATED_P (insn) = 1;
14550 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14551 popc, -1, true);
14552 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14554 else
14555 emit_jump_insn (gen_simple_return_pop_internal (popc));
14557 else
14558 emit_jump_insn (gen_simple_return_internal ());
14560 /* Restore the state back to the state from the prologue,
14561 so that it's correct for the next epilogue. */
14562 m->fs = frame_state_save;
14565 /* Reset from the function's potential modifications. */
14567 static void
14568 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
14570 if (pic_offset_table_rtx
14571 && !ix86_use_pseudo_pic_reg ())
14572 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14573 #if TARGET_MACHO
14574 /* Mach-O doesn't support labels at the end of objects, so if
14575 it looks like we might want one, insert a NOP. */
14577 rtx_insn *insn = get_last_insn ();
14578 rtx_insn *deleted_debug_label = NULL;
14579 while (insn
14580 && NOTE_P (insn)
14581 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14583 /* If only NOTE_INSN_DELETED_DEBUG_LABEL notes are present, don't
14584 insert a nop; instead set their CODE_LABEL_NUMBER to -1,
14585 otherwise there would be code generation differences
14586 between -g and -g0. */
14587 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14588 deleted_debug_label = insn;
14589 insn = PREV_INSN (insn);
14591 if (insn
14592 && (LABEL_P (insn)
14593 || (NOTE_P (insn)
14594 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
14595 fputs ("\tnop\n", file);
14596 else if (deleted_debug_label)
14597 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14598 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14599 CODE_LABEL_NUMBER (insn) = -1;
14601 #endif
14605 /* Return a scratch register to use in the split stack prologue. The
14606 split stack prologue is used for -fsplit-stack. It consists of the first
14607 instructions in the function, emitted even before the regular prologue.
14608 The scratch register can be any caller-saved register which is not
14609 used for parameters or for the static chain. */
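/* In summary, the choices made below are: 64-bit code uses %r11;
   fastcall functions use %eax (and cannot support a static chain);
   thiscall functions use %edx, or %eax when a static chain is live;
   otherwise %ecx is used, falling back to %edx for nested functions,
   with sorry () diagnostics for the register-starved combinations.  */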
14611 static unsigned int
14612 split_stack_prologue_scratch_regno (void)
14614 if (TARGET_64BIT)
14615 return R11_REG;
14616 else
14618 bool is_fastcall, is_thiscall;
14619 int regparm;
14621 is_fastcall = (lookup_attribute ("fastcall",
14622 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14623 != NULL);
14624 is_thiscall = (lookup_attribute ("thiscall",
14625 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14626 != NULL);
14627 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14629 if (is_fastcall)
14631 if (DECL_STATIC_CHAIN (cfun->decl))
14633 sorry ("-fsplit-stack does not support fastcall with "
14634 "nested function");
14635 return INVALID_REGNUM;
14637 return AX_REG;
14639 else if (is_thiscall)
14641 if (!DECL_STATIC_CHAIN (cfun->decl))
14642 return DX_REG;
14643 return AX_REG;
14645 else if (regparm < 3)
14647 if (!DECL_STATIC_CHAIN (cfun->decl))
14648 return CX_REG;
14649 else
14651 if (regparm >= 2)
14653 sorry ("-fsplit-stack does not support 2 register "
14654 "parameters for a nested function");
14655 return INVALID_REGNUM;
14657 return DX_REG;
14660 else
14662 /* FIXME: We could make this work by pushing a register
14663 around the addition and comparison. */
14664 sorry ("-fsplit-stack does not support 3 register parameters");
14665 return INVALID_REGNUM;
14670 /* A SYMBOL_REF for the function which allocates new stack space for
14671 -fsplit-stack. */
14673 static GTY(()) rtx split_stack_fn;
14675 /* A SYMBOL_REF for the variant of __morestack used with the
14676 large code model. */
14678 static GTY(()) rtx split_stack_fn_large;
14680 /* Handle -fsplit-stack. These are the first instructions in the
14681 function, even before the regular prologue. */
14683 void
14684 ix86_expand_split_stack_prologue (void)
14686 struct ix86_frame frame;
14687 HOST_WIDE_INT allocate;
14688 unsigned HOST_WIDE_INT args_size;
14689 rtx_code_label *label;
14690 rtx limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
14691 rtx scratch_reg = NULL_RTX;
14692 rtx_code_label *varargs_label = NULL;
14693 rtx fn;
14695 gcc_assert (flag_split_stack && reload_completed);
14697 ix86_finalize_stack_realign_flags ();
14698 ix86_compute_frame_layout (&frame);
14699 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14701 /* This is the label we will branch to if we have enough stack
14702 space. We expect the basic block reordering pass to reverse this
14703 branch if optimizing, so that we branch in the unlikely case. */
14704 label = gen_label_rtx ();
14706 /* We need to compare the stack pointer minus the frame size with
14707 the stack boundary in the TCB. The stack boundary always gives
14708 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14709 can compare directly. Otherwise we need to do an addition. */
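/* For orientation only: in the common 64-bit case with a large frame,
   the sequence emitted below looks roughly like

	lea	-ALLOCATE(%rsp), %r11
	cmp	%fs:<stack boundary>, %r11	# the UNSPEC_STACK_CHECK mem
	jae	.Lenough
	mov	$ALLOCATE, %r10
	mov	$ARGS_SIZE, %r11
	call	__morestack
	ret					# the split_stack_return unspec
   .Lenough:
	...regular prologue and body...

   The scratch register, operand forms and the limit operand vary with
   the target and code model; this is only an illustration of the
   shape of the sequence.  */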
14711 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
14712 UNSPEC_STACK_CHECK);
14713 limit = gen_rtx_CONST (Pmode, limit);
14714 limit = gen_rtx_MEM (Pmode, limit);
14715 if (allocate < SPLIT_STACK_AVAILABLE)
14716 current = stack_pointer_rtx;
14717 else
14719 unsigned int scratch_regno;
14720 rtx offset;
14722 /* We need a scratch register to hold the stack pointer minus
14723 the required frame size. Since this is the very start of the
14724 function, the scratch register can be any caller-saved
14725 register which is not used for parameters. */
14726 offset = GEN_INT (- allocate);
14727 scratch_regno = split_stack_prologue_scratch_regno ();
14728 if (scratch_regno == INVALID_REGNUM)
14729 return;
14730 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14731 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14733 /* We don't use ix86_gen_add3 in this case because it will
14734 want to split to lea, but when not optimizing the insn
14735 will not be split after this point. */
14736 emit_insn (gen_rtx_SET (scratch_reg,
14737 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14738 offset)));
14740 else
14742 emit_move_insn (scratch_reg, offset);
14743 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14744 stack_pointer_rtx));
14746 current = scratch_reg;
14749 ix86_expand_branch (GEU, current, limit, label);
14750 jump_insn = get_last_insn ();
14751 JUMP_LABEL (jump_insn) = label;
14753 /* Mark the jump as very likely to be taken. */
14754 add_int_reg_note (jump_insn, REG_BR_PROB,
14755 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
14757 if (split_stack_fn == NULL_RTX)
14759 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14760 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14762 fn = split_stack_fn;
14764 /* Get more stack space. We pass in the desired stack space and the
14765 size of the arguments to copy to the new stack. In 32-bit mode
14766 we push the parameters; __morestack will return on a new stack
14767 anyhow. In 64-bit mode we pass the parameters in r10 and
14768 r11. */
14769 allocate_rtx = GEN_INT (allocate);
14770 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14771 call_fusage = NULL_RTX;
14772 if (TARGET_64BIT)
14774 rtx reg10, reg11;
14776 reg10 = gen_rtx_REG (Pmode, R10_REG);
14777 reg11 = gen_rtx_REG (Pmode, R11_REG);
14779 /* If this function uses a static chain, it will be in %r10.
14780 Preserve it across the call to __morestack. */
14781 if (DECL_STATIC_CHAIN (cfun->decl))
14783 rtx rax;
14785 rax = gen_rtx_REG (word_mode, AX_REG);
14786 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14787 use_reg (&call_fusage, rax);
14790 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14791 && !TARGET_PECOFF)
14793 HOST_WIDE_INT argval;
14795 gcc_assert (Pmode == DImode);
14796 /* When using the large model we need to load the address
14797 into a register, and we've run out of registers. So we
14798 switch to a different calling convention, and we call a
14799 different function: __morestack_large. We pass the
14800 argument size in the upper 32 bits of r10 and pass the
14801 frame size in the lower 32 bits. */
14802 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14803 gcc_assert ((args_size & 0xffffffff) == args_size);
14805 if (split_stack_fn_large == NULL_RTX)
14807 split_stack_fn_large =
14808 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14809 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14811 if (ix86_cmodel == CM_LARGE_PIC)
14813 rtx_code_label *label;
14814 rtx x;
14816 label = gen_label_rtx ();
14817 emit_label (label);
14818 LABEL_PRESERVE_P (label) = 1;
14819 emit_insn (gen_set_rip_rex64 (reg10, label));
14820 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14821 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14822 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14823 UNSPEC_GOT);
14824 x = gen_rtx_CONST (Pmode, x);
14825 emit_move_insn (reg11, x);
14826 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14827 x = gen_const_mem (Pmode, x);
14828 emit_move_insn (reg11, x);
14830 else
14831 emit_move_insn (reg11, split_stack_fn_large);
14833 fn = reg11;
14835 argval = ((args_size << 16) << 16) + allocate;
14836 emit_move_insn (reg10, GEN_INT (argval));
14838 else
14840 emit_move_insn (reg10, allocate_rtx);
14841 emit_move_insn (reg11, GEN_INT (args_size));
14842 use_reg (&call_fusage, reg11);
14845 use_reg (&call_fusage, reg10);
14847 else
14849 emit_insn (gen_push (GEN_INT (args_size)));
14850 emit_insn (gen_push (allocate_rtx));
14852 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14853 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14854 NULL_RTX, false);
14855 add_function_usage_to (call_insn, call_fusage);
14857 /* In order to make call/return prediction work right, we now need
14858 to execute a return instruction. See
14859 libgcc/config/i386/morestack.S for the details on how this works.
14861 For flow purposes gcc must not see this as a return
14862 instruction--we need control flow to continue at the subsequent
14863 label. Therefore, we use an unspec. */
14864 gcc_assert (crtl->args.pops_args < 65536);
14865 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14867 /* If we are in 64-bit mode and this function uses a static chain,
14868 we saved %r10 in %rax before calling __morestack. */
14869 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14870 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14871 gen_rtx_REG (word_mode, AX_REG));
14873 /* If this function calls va_start, we need to store a pointer to
14874 the arguments on the old stack, because they may not have been
14875 all copied to the new stack. At this point the old stack can be
14876 found at the frame pointer value used by __morestack, because
14877 __morestack has set that up before calling back to us. Here we
14878 store that pointer in a scratch register, and in
14879 ix86_expand_prologue we store the scratch register in a stack
14880 slot. */
14881 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14883 unsigned int scratch_regno;
14884 rtx frame_reg;
14885 int words;
14887 scratch_regno = split_stack_prologue_scratch_regno ();
14888 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14889 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14891 /* 64-bit:
14892 fp -> old fp value
14893 return address within this function
14894 return address of caller of this function
14895 stack arguments
14896 So we add three words to get to the stack arguments.
14898 32-bit:
14899 fp -> old fp value
14900 return address within this function
14901 first argument to __morestack
14902 second argument to __morestack
14903 return address of caller of this function
14904 stack arguments
14905 So we add five words to get to the stack arguments. */
14907 words = TARGET_64BIT ? 3 : 5;
14908 emit_insn (gen_rtx_SET (scratch_reg,
14909 gen_rtx_PLUS (Pmode, frame_reg,
14910 GEN_INT (words * UNITS_PER_WORD))));
14912 varargs_label = gen_label_rtx ();
14913 emit_jump_insn (gen_jump (varargs_label));
14914 JUMP_LABEL (get_last_insn ()) = varargs_label;
14916 emit_barrier ();
14919 emit_label (label);
14920 LABEL_NUSES (label) = 1;
14922 /* If this function calls va_start, we now have to set the scratch
14923 register for the case where we do not call __morestack. In this
14924 case we need to set it based on the stack pointer. */
14925 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14927 emit_insn (gen_rtx_SET (scratch_reg,
14928 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14929 GEN_INT (UNITS_PER_WORD))));
14931 emit_label (varargs_label);
14932 LABEL_NUSES (varargs_label) = 1;
14936 /* We may have to tell the dataflow pass that the split stack prologue
14937 is initializing a scratch register. */
14939 static void
14940 ix86_live_on_entry (bitmap regs)
14942 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14944 gcc_assert (flag_split_stack);
14945 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14949 /* Extract the parts of an RTL expression that is a valid memory address
14950 for an instruction. Return 0 if the structure of the address is
14951 grossly off. Return -1 if the address contains ASHIFT, so it is not
14952 strictly valid, but is still used for computing the length of the lea instruction. */
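/* For example, the address (plus (plus (mult (reg B) (const_int 4))
   (reg A)) (const_int 8)), i.e. 8(%rA,%rB,4) in AT&T syntax,
   decomposes into base = A, index = B, scale = 4 and disp = 8.  */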
14955 ix86_decompose_address (rtx addr, struct ix86_address *out)
14957 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14958 rtx base_reg, index_reg;
14959 HOST_WIDE_INT scale = 1;
14960 rtx scale_rtx = NULL_RTX;
14961 rtx tmp;
14962 int retval = 1;
14963 addr_space_t seg = ADDR_SPACE_GENERIC;
14965 /* Allow zero-extended SImode addresses;
14966 they will be emitted with the addr32 prefix. */
14967 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14969 if (GET_CODE (addr) == ZERO_EXTEND
14970 && GET_MODE (XEXP (addr, 0)) == SImode)
14972 addr = XEXP (addr, 0);
14973 if (CONST_INT_P (addr))
14974 return 0;
14976 else if (GET_CODE (addr) == AND
14977 && const_32bit_mask (XEXP (addr, 1), DImode))
14979 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14980 if (addr == NULL_RTX)
14981 return 0;
14983 if (CONST_INT_P (addr))
14984 return 0;
14988 /* Allow SImode subregs of DImode addresses;
14989 they will be emitted with the addr32 prefix. */
14990 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14992 if (SUBREG_P (addr)
14993 && GET_MODE (SUBREG_REG (addr)) == DImode)
14995 addr = SUBREG_REG (addr);
14996 if (CONST_INT_P (addr))
14997 return 0;
15001 if (REG_P (addr))
15002 base = addr;
15003 else if (SUBREG_P (addr))
15005 if (REG_P (SUBREG_REG (addr)))
15006 base = addr;
15007 else
15008 return 0;
15010 else if (GET_CODE (addr) == PLUS)
15012 rtx addends[4], op;
15013 int n = 0, i;
15015 op = addr;
15018 if (n >= 4)
15019 return 0;
15020 addends[n++] = XEXP (op, 1);
15021 op = XEXP (op, 0);
15023 while (GET_CODE (op) == PLUS);
15024 if (n >= 4)
15025 return 0;
15026 addends[n] = op;
15028 for (i = n; i >= 0; --i)
15030 op = addends[i];
15031 switch (GET_CODE (op))
15033 case MULT:
15034 if (index)
15035 return 0;
15036 index = XEXP (op, 0);
15037 scale_rtx = XEXP (op, 1);
15038 break;
15040 case ASHIFT:
15041 if (index)
15042 return 0;
15043 index = XEXP (op, 0);
15044 tmp = XEXP (op, 1);
15045 if (!CONST_INT_P (tmp))
15046 return 0;
15047 scale = INTVAL (tmp);
15048 if ((unsigned HOST_WIDE_INT) scale > 3)
15049 return 0;
15050 scale = 1 << scale;
15051 break;
15053 case ZERO_EXTEND:
15054 op = XEXP (op, 0);
15055 if (GET_CODE (op) != UNSPEC)
15056 return 0;
15057 /* FALLTHRU */
15059 case UNSPEC:
15060 if (XINT (op, 1) == UNSPEC_TP
15061 && TARGET_TLS_DIRECT_SEG_REFS
15062 && seg == ADDR_SPACE_GENERIC)
15063 seg = DEFAULT_TLS_SEG_REG;
15064 else
15065 return 0;
15066 break;
15068 case SUBREG:
15069 if (!REG_P (SUBREG_REG (op)))
15070 return 0;
15071 /* FALLTHRU */
15073 case REG:
15074 if (!base)
15075 base = op;
15076 else if (!index)
15077 index = op;
15078 else
15079 return 0;
15080 break;
15082 case CONST:
15083 case CONST_INT:
15084 case SYMBOL_REF:
15085 case LABEL_REF:
15086 if (disp)
15087 return 0;
15088 disp = op;
15089 break;
15091 default:
15092 return 0;
15096 else if (GET_CODE (addr) == MULT)
15098 index = XEXP (addr, 0); /* index*scale */
15099 scale_rtx = XEXP (addr, 1);
15101 else if (GET_CODE (addr) == ASHIFT)
15103 /* We're called for lea too, which implements ashift on occasion. */
15104 index = XEXP (addr, 0);
15105 tmp = XEXP (addr, 1);
15106 if (!CONST_INT_P (tmp))
15107 return 0;
15108 scale = INTVAL (tmp);
15109 if ((unsigned HOST_WIDE_INT) scale > 3)
15110 return 0;
15111 scale = 1 << scale;
15112 retval = -1;
15114 else
15115 disp = addr; /* displacement */
15117 if (index)
15119 if (REG_P (index))
15121 else if (SUBREG_P (index)
15122 && REG_P (SUBREG_REG (index)))
15124 else
15125 return 0;
15128 /* Extract the integral value of scale. */
15129 if (scale_rtx)
15131 if (!CONST_INT_P (scale_rtx))
15132 return 0;
15133 scale = INTVAL (scale_rtx);
15136 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15137 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15139 /* Avoid useless 0 displacement. */
15140 if (disp == const0_rtx && (base || index))
15141 disp = NULL_RTX;
15143 /* Allow arg pointer and stack pointer as index if there is no scaling. */
15144 if (base_reg && index_reg && scale == 1
15145 && (index_reg == arg_pointer_rtx
15146 || index_reg == frame_pointer_rtx
15147 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
15149 std::swap (base, index);
15150 std::swap (base_reg, index_reg);
15153 /* Special case: %ebp cannot be encoded as a base without a displacement.
15154 Similarly %r13. */
15155 if (!disp
15156 && base_reg
15157 && (base_reg == hard_frame_pointer_rtx
15158 || base_reg == frame_pointer_rtx
15159 || base_reg == arg_pointer_rtx
15160 || (REG_P (base_reg)
15161 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
15162 || REGNO (base_reg) == R13_REG))))
15163 disp = const0_rtx;
15165 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
15166 Avoid this by transforming it to [%esi+0].
15167 Reload calls address legitimization without cfun defined, so we need
15168 to test cfun for being non-NULL. */
15169 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15170 && base_reg && !index_reg && !disp
15171 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
15172 disp = const0_rtx;
15174 /* Special case: encode reg+reg instead of reg*2. */
15175 if (!base && index && scale == 2)
15176 base = index, base_reg = index_reg, scale = 1;
15178 /* Special case: scaling cannot be encoded without base or displacement. */
15179 if (!base && !disp && index && scale != 1)
15180 disp = const0_rtx;
15182 out->base = base;
15183 out->index = index;
15184 out->disp = disp;
15185 out->scale = scale;
15186 out->seg = seg;
15188 return retval;
15191 /* Return the cost of the memory address x.
15192 For i386, it is better to use a complex address than let gcc copy
15193 the address into a reg and make a new pseudo. But not if the address
15194 requires two regs - that would mean more pseudos with longer
15195 lifetimes. */
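/* For example, before register allocation a purely constant address
   costs 1, an address using a single pseudo register costs 2, and an
   address using two pseudos costs 3, prior to the K6 adjustment
   below.  */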
15196 static int
15197 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15199 struct ix86_address parts;
15200 int cost = 1;
15201 int ok = ix86_decompose_address (x, &parts);
15203 gcc_assert (ok);
15205 if (parts.base && SUBREG_P (parts.base))
15206 parts.base = SUBREG_REG (parts.base);
15207 if (parts.index && SUBREG_P (parts.index))
15208 parts.index = SUBREG_REG (parts.index);
15210 /* Attempt to minimize the number of registers in the address by increasing
15211 the address cost for each used register. We don't increase the address cost
15212 for "pic_offset_table_rtx". When a memop with "pic_offset_table_rtx"
15213 is not invariant itself, it most likely means that the base or index is not
15214 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15215 which is not profitable for x86. */
15216 if (parts.base
15217 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15218 && (current_pass->type == GIMPLE_PASS
15219 || !pic_offset_table_rtx
15220 || !REG_P (parts.base)
15221 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15222 cost++;
15224 if (parts.index
15225 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15226 && (current_pass->type == GIMPLE_PASS
15227 || !pic_offset_table_rtx
15228 || !REG_P (parts.index)
15229 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15230 cost++;
15232 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15233 since its predecode logic can't detect the length of such instructions
15234 and they degenerate to vector decoding. Increase the cost of such
15235 addresses here. The penalty is at least 2 cycles. It may be worthwhile
15236 to split such addresses or even refuse them altogether.
15238 The following addressing modes are affected:
15239 [base+scale*index]
15240 [scale*index+disp]
15241 [base+index]
15243 The first and last cases may be avoidable by explicitly coding the zero in
15244 the memory address, but I don't have an AMD-K6 machine handy to check this
15245 theory. */
15247 if (TARGET_K6
15248 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15249 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15250 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15251 cost += 10;
15253 return cost;
15256 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
15257 this is used to form addresses to local data when -fPIC is in
15258 use. */
15260 static bool
15261 darwin_local_data_pic (rtx disp)
15263 return (GET_CODE (disp) == UNSPEC
15264 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15267 /* True if operand X should be loaded from GOT. */
15269 bool
15270 ix86_force_load_from_GOT_p (rtx x)
15272 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15273 && !TARGET_PECOFF && !TARGET_MACHO
15274 && !flag_plt && !flag_pic
15275 && ix86_cmodel != CM_LARGE
15276 && GET_CODE (x) == SYMBOL_REF
15277 && SYMBOL_REF_FUNCTION_P (x)
15278 && !SYMBOL_REF_LOCAL_P (x));
15281 /* Determine if a given RTX is a valid constant. We already know this
15282 satisfies CONSTANT_P. */
15284 static bool
15285 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15287 /* Pointer bounds constants are not valid. */
15288 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15289 return false;
15291 switch (GET_CODE (x))
15293 case CONST:
15294 x = XEXP (x, 0);
15296 if (GET_CODE (x) == PLUS)
15298 if (!CONST_INT_P (XEXP (x, 1)))
15299 return false;
15300 x = XEXP (x, 0);
15303 if (TARGET_MACHO && darwin_local_data_pic (x))
15304 return true;
15306 /* Only some unspecs are valid as "constants". */
15307 if (GET_CODE (x) == UNSPEC)
15308 switch (XINT (x, 1))
15310 case UNSPEC_GOT:
15311 case UNSPEC_GOTOFF:
15312 case UNSPEC_PLTOFF:
15313 return TARGET_64BIT;
15314 case UNSPEC_TPOFF:
15315 case UNSPEC_NTPOFF:
15316 x = XVECEXP (x, 0, 0);
15317 return (GET_CODE (x) == SYMBOL_REF
15318 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15319 case UNSPEC_DTPOFF:
15320 x = XVECEXP (x, 0, 0);
15321 return (GET_CODE (x) == SYMBOL_REF
15322 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15323 default:
15324 return false;
15327 /* We must have drilled down to a symbol. */
15328 if (GET_CODE (x) == LABEL_REF)
15329 return true;
15330 if (GET_CODE (x) != SYMBOL_REF)
15331 return false;
15332 /* FALLTHRU */
15334 case SYMBOL_REF:
15335 /* TLS symbols are never valid. */
15336 if (SYMBOL_REF_TLS_MODEL (x))
15337 return false;
15339 /* DLLIMPORT symbols are never valid. */
15340 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15341 && SYMBOL_REF_DLLIMPORT_P (x))
15342 return false;
15344 #if TARGET_MACHO
15345 /* mdynamic-no-pic */
15346 if (MACHO_DYNAMIC_NO_PIC_P)
15347 return machopic_symbol_defined_p (x);
15348 #endif
15350 /* External function address should be loaded
15351 via the GOT slot to avoid PLT. */
15352 if (ix86_force_load_from_GOT_p (x))
15353 return false;
15355 break;
15357 CASE_CONST_SCALAR_INT:
15358 switch (mode)
15360 case TImode:
15361 if (TARGET_64BIT)
15362 return true;
15363 /* FALLTHRU */
15364 case OImode:
15365 case XImode:
15366 if (!standard_sse_constant_p (x, mode))
15367 return false;
15368 default:
15369 break;
15371 break;
15373 case CONST_VECTOR:
15374 if (!standard_sse_constant_p (x, mode))
15375 return false;
15377 default:
15378 break;
15381 /* Otherwise we handle everything else in the move patterns. */
15382 return true;
15385 /* Determine if it's legal to put X into the constant pool. This
15386 is not possible for the address of thread-local symbols, which
15387 is checked above. */
15389 static bool
15390 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15392 /* We can put any immediate constant in memory. */
15393 switch (GET_CODE (x))
15395 CASE_CONST_ANY:
15396 return false;
15398 default:
15399 break;
15402 return !ix86_legitimate_constant_p (mode, x);
15405 /* Nonzero if the symbol is marked as dllimport or as a stub variable,
15406 otherwise zero. */
15408 static bool
15409 is_imported_p (rtx x)
15411 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15412 || GET_CODE (x) != SYMBOL_REF)
15413 return false;
15415 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15419 /* Nonzero if the constant value X is a legitimate general operand
15420 when generating PIC code. It is given that flag_pic is on and
15421 that X satisfies CONSTANT_P. */
15423 bool
15424 legitimate_pic_operand_p (rtx x)
15426 rtx inner;
15428 switch (GET_CODE (x))
15430 case CONST:
15431 inner = XEXP (x, 0);
15432 if (GET_CODE (inner) == PLUS
15433 && CONST_INT_P (XEXP (inner, 1)))
15434 inner = XEXP (inner, 0);
15436 /* Only some unspecs are valid as "constants". */
15437 if (GET_CODE (inner) == UNSPEC)
15438 switch (XINT (inner, 1))
15440 case UNSPEC_GOT:
15441 case UNSPEC_GOTOFF:
15442 case UNSPEC_PLTOFF:
15443 return TARGET_64BIT;
15444 case UNSPEC_TPOFF:
15445 x = XVECEXP (inner, 0, 0);
15446 return (GET_CODE (x) == SYMBOL_REF
15447 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15448 case UNSPEC_MACHOPIC_OFFSET:
15449 return legitimate_pic_address_disp_p (x);
15450 default:
15451 return false;
15453 /* FALLTHRU */
15455 case SYMBOL_REF:
15456 case LABEL_REF:
15457 return legitimate_pic_address_disp_p (x);
15459 default:
15460 return true;
15464 /* Determine if a given CONST RTX is a valid memory displacement
15465 in PIC mode. */
15467 bool
15468 legitimate_pic_address_disp_p (rtx disp)
15470 bool saw_plus;
15472 /* In 64bit mode we can allow direct addresses of symbols and labels
15473 when they are not dynamic symbols. */
15474 if (TARGET_64BIT)
15476 rtx op0 = disp, op1;
15478 switch (GET_CODE (disp))
15480 case LABEL_REF:
15481 return true;
15483 case CONST:
15484 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15485 break;
15486 op0 = XEXP (XEXP (disp, 0), 0);
15487 op1 = XEXP (XEXP (disp, 0), 1);
15488 if (!CONST_INT_P (op1)
15489 || INTVAL (op1) >= 16*1024*1024
15490 || INTVAL (op1) < -16*1024*1024)
15491 break;
15492 if (GET_CODE (op0) == LABEL_REF)
15493 return true;
15494 if (GET_CODE (op0) == CONST
15495 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15496 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15497 return true;
15498 if (GET_CODE (op0) == UNSPEC
15499 && XINT (op0, 1) == UNSPEC_PCREL)
15500 return true;
15501 if (GET_CODE (op0) != SYMBOL_REF)
15502 break;
15503 /* FALLTHRU */
15505 case SYMBOL_REF:
15506 /* TLS references should always be enclosed in UNSPEC.
15507 A dllimported symbol always needs to be resolved. */
15508 if (SYMBOL_REF_TLS_MODEL (op0)
15509 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15510 return false;
15512 if (TARGET_PECOFF)
15514 if (is_imported_p (op0))
15515 return true;
15517 if (SYMBOL_REF_FAR_ADDR_P (op0)
15518 || !SYMBOL_REF_LOCAL_P (op0))
15519 break;
15521 /* Function symbols need to be resolved only for
15522 the large model.
15523 For the small model we don't need to resolve anything
15524 here. */
15525 if ((ix86_cmodel != CM_LARGE_PIC
15526 && SYMBOL_REF_FUNCTION_P (op0))
15527 || ix86_cmodel == CM_SMALL_PIC)
15528 return true;
15529 /* Non-external symbols don't need to be resolved for
15530 the large and medium models. */
15531 if ((ix86_cmodel == CM_LARGE_PIC
15532 || ix86_cmodel == CM_MEDIUM_PIC)
15533 && !SYMBOL_REF_EXTERNAL_P (op0))
15534 return true;
15536 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15537 && (SYMBOL_REF_LOCAL_P (op0)
15538 || (HAVE_LD_PIE_COPYRELOC
15539 && flag_pie
15540 && !SYMBOL_REF_WEAK (op0)
15541 && !SYMBOL_REF_FUNCTION_P (op0)))
15542 && ix86_cmodel != CM_LARGE_PIC)
15543 return true;
15544 break;
15546 default:
15547 break;
15550 if (GET_CODE (disp) != CONST)
15551 return false;
15552 disp = XEXP (disp, 0);
15554 if (TARGET_64BIT)
15556 /* It is unsafe to allow PLUS expressions here; this limits the allowed
15557 distance of GOT table references. We should not need these anyway. */
15558 if (GET_CODE (disp) != UNSPEC
15559 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15560 && XINT (disp, 1) != UNSPEC_GOTOFF
15561 && XINT (disp, 1) != UNSPEC_PCREL
15562 && XINT (disp, 1) != UNSPEC_PLTOFF))
15563 return false;
15565 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15566 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15567 return false;
15568 return true;
15571 saw_plus = false;
15572 if (GET_CODE (disp) == PLUS)
15574 if (!CONST_INT_P (XEXP (disp, 1)))
15575 return false;
15576 disp = XEXP (disp, 0);
15577 saw_plus = true;
15580 if (TARGET_MACHO && darwin_local_data_pic (disp))
15581 return true;
15583 if (GET_CODE (disp) != UNSPEC)
15584 return false;
15586 switch (XINT (disp, 1))
15588 case UNSPEC_GOT:
15589 if (saw_plus)
15590 return false;
15591 /* We need to check for both symbols and labels because VxWorks loads
15592 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15593 details. */
15594 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15595 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15596 case UNSPEC_GOTOFF:
15597 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15598 While the ABI also specifies a 32bit relocation, we don't produce it in
15599 the small PIC model at all. */
15600 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15601 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15602 && !TARGET_64BIT)
15603 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15604 return false;
15605 case UNSPEC_GOTTPOFF:
15606 case UNSPEC_GOTNTPOFF:
15607 case UNSPEC_INDNTPOFF:
15608 if (saw_plus)
15609 return false;
15610 disp = XVECEXP (disp, 0, 0);
15611 return (GET_CODE (disp) == SYMBOL_REF
15612 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15613 case UNSPEC_NTPOFF:
15614 disp = XVECEXP (disp, 0, 0);
15615 return (GET_CODE (disp) == SYMBOL_REF
15616 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15617 case UNSPEC_DTPOFF:
15618 disp = XVECEXP (disp, 0, 0);
15619 return (GET_CODE (disp) == SYMBOL_REF
15620 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15623 return false;
15626 /* Determine if op is a suitable RTX for an address register.
15627 Return the naked register if a register or a register subreg is
15628 found, otherwise return NULL_RTX. */
15630 static rtx
15631 ix86_validate_address_register (rtx op)
15633 machine_mode mode = GET_MODE (op);
15635 /* Only SImode or DImode registers can form the address. */
15636 if (mode != SImode && mode != DImode)
15637 return NULL_RTX;
15639 if (REG_P (op))
15640 return op;
15641 else if (SUBREG_P (op))
15643 rtx reg = SUBREG_REG (op);
15645 if (!REG_P (reg))
15646 return NULL_RTX;
15648 mode = GET_MODE (reg);
15650 /* Don't allow SUBREGs that span more than a word. It can
15651 lead to spill failures when the register is one word out
15652 of a two word structure. */
15653 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15654 return NULL_RTX;
15656 /* Allow only SUBREGs of non-eliminable hard registers. */
15657 if (register_no_elim_operand (reg, mode))
15658 return reg;
15661 /* Op is not a register. */
15662 return NULL_RTX;
15665 /* Recognizes RTL expressions that are valid memory addresses for an
15666 instruction. The MODE argument is the machine mode for the MEM
15667 expression that wants to use this address.
15669 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15670 convert common non-canonical forms to canonical form so that they will
15671 be recognized. */
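/* The accepted shape is the usual x86 form

	[segment:] [base] [+ index * {1, 2, 4, 8}] [+ disp]

   subject to the register, mode and displacement checks below.  */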
15673 static bool
15674 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15676 struct ix86_address parts;
15677 rtx base, index, disp;
15678 HOST_WIDE_INT scale;
15679 addr_space_t seg;
15681 if (ix86_decompose_address (addr, &parts) <= 0)
15682 /* Decomposition failed. */
15683 return false;
15685 base = parts.base;
15686 index = parts.index;
15687 disp = parts.disp;
15688 scale = parts.scale;
15689 seg = parts.seg;
15691 /* Validate base register. */
15692 if (base)
15694 rtx reg = ix86_validate_address_register (base);
15696 if (reg == NULL_RTX)
15697 return false;
15699 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15700 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15701 /* Base is not valid. */
15702 return false;
15705 /* Validate index register. */
15706 if (index)
15708 rtx reg = ix86_validate_address_register (index);
15710 if (reg == NULL_RTX)
15711 return false;
15713 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15714 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15715 /* Index is not valid. */
15716 return false;
15719 /* Index and base should have the same mode. */
15720 if (base && index
15721 && GET_MODE (base) != GET_MODE (index))
15722 return false;
15724 /* Address override works only on the (%reg) part of %fs:(%reg). */
15725 if (seg != ADDR_SPACE_GENERIC
15726 && ((base && GET_MODE (base) != word_mode)
15727 || (index && GET_MODE (index) != word_mode)))
15728 return false;
15730 /* Validate scale factor. */
15731 if (scale != 1)
15733 if (!index)
15734 /* Scale without index. */
15735 return false;
15737 if (scale != 2 && scale != 4 && scale != 8)
15738 /* Scale is not a valid multiplier. */
15739 return false;
15742 /* Validate displacement. */
15743 if (disp)
15745 if (GET_CODE (disp) == CONST
15746 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15747 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15748 switch (XINT (XEXP (disp, 0), 1))
15750 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15751 when used. While the ABI also specifies 32bit relocations, we
15752 don't produce them at all and use IP-relative addressing instead.
15753 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15754 should be loaded via the GOT. */
15755 case UNSPEC_GOT:
15756 if (!TARGET_64BIT
15757 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15758 goto is_legitimate_pic;
15759 /* FALLTHRU */
15760 case UNSPEC_GOTOFF:
15761 gcc_assert (flag_pic);
15762 if (!TARGET_64BIT)
15763 goto is_legitimate_pic;
15765 /* 64bit address unspec. */
15766 return false;
15768 case UNSPEC_GOTPCREL:
15769 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15770 goto is_legitimate_pic;
15771 /* FALLTHRU */
15772 case UNSPEC_PCREL:
15773 gcc_assert (flag_pic);
15774 goto is_legitimate_pic;
15776 case UNSPEC_GOTTPOFF:
15777 case UNSPEC_GOTNTPOFF:
15778 case UNSPEC_INDNTPOFF:
15779 case UNSPEC_NTPOFF:
15780 case UNSPEC_DTPOFF:
15781 break;
15783 case UNSPEC_STACK_CHECK:
15784 gcc_assert (flag_split_stack);
15785 break;
15787 default:
15788 /* Invalid address unspec. */
15789 return false;
15792 else if (SYMBOLIC_CONST (disp)
15793 && (flag_pic
15794 || (TARGET_MACHO
15795 #if TARGET_MACHO
15796 && MACHOPIC_INDIRECT
15797 && !machopic_operand_p (disp)
15798 #endif
15802 is_legitimate_pic:
15803 if (TARGET_64BIT && (index || base))
15805 /* foo@dtpoff(%rX) is ok. */
15806 if (GET_CODE (disp) != CONST
15807 || GET_CODE (XEXP (disp, 0)) != PLUS
15808 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15809 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15810 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15811 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15812 /* Non-constant pic memory reference. */
15813 return false;
15815 else if ((!TARGET_MACHO || flag_pic)
15816 && ! legitimate_pic_address_disp_p (disp))
15817 /* Displacement is an invalid pic construct. */
15818 return false;
15819 #if TARGET_MACHO
15820 else if (MACHO_DYNAMIC_NO_PIC_P
15821 && !ix86_legitimate_constant_p (Pmode, disp))
15822 /* displacement must be referenced via a non_lazy_pointer */
15823 return false;
15824 #endif
15826 /* This code used to verify that a symbolic pic displacement
15827 includes the pic_offset_table_rtx register.
15829 While this is a good idea, unfortunately these constructs may
15830 be created by the "adds using lea" optimization for incorrect
15831 code like:
15833 int a;
15834 int foo(int i)
15836 return *(&a+i);
15839 This code is nonsensical, but results in addressing the
15840 GOT table with a pic_offset_table_rtx base. We can't
15841 just refuse it easily, since it gets matched by the
15842 "addsi3" pattern, which later gets split to lea in the
15843 case the output register differs from the input. While this
15844 could be handled by a separate addsi pattern for this case
15845 that never results in lea, disabling this test seems to be
15846 the easier and correct fix for the crash. */
15848 else if (GET_CODE (disp) != LABEL_REF
15849 && !CONST_INT_P (disp)
15850 && (GET_CODE (disp) != CONST
15851 || !ix86_legitimate_constant_p (Pmode, disp))
15852 && (GET_CODE (disp) != SYMBOL_REF
15853 || !ix86_legitimate_constant_p (Pmode, disp)))
15854 /* Displacement is not constant. */
15855 return false;
15856 else if (TARGET_64BIT
15857 && !x86_64_immediate_operand (disp, VOIDmode))
15858 /* Displacement is out of range. */
15859 return false;
15860 /* In x32 mode, constant addresses are sign extended to 64bit, so
15861 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15862 else if (TARGET_X32 && !(index || base)
15863 && CONST_INT_P (disp)
15864 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15865 return false;
15868 /* Everything looks valid. */
15869 return true;
15872 /* Determine if a given RTX is a valid constant address. */
15874 bool
15875 constant_address_p (rtx x)
15877 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15880 /* Return a unique alias set for the GOT. */
15882 static alias_set_type
15883 ix86_GOT_alias_set (void)
15885 static alias_set_type set = -1;
15886 if (set == -1)
15887 set = new_alias_set ();
15888 return set;
15891 /* Return a legitimate reference for ORIG (an address) using the
15892 register REG. If REG is 0, a new pseudo is generated.
15894 There are two types of references that must be handled:
15896 1. Global data references must load the address from the GOT, via
15897 the PIC reg. An insn is emitted to do this load, and the reg is
15898 returned.
15900 2. Static data references, constant pool addresses, and code labels
15901 compute the address as an offset from the GOT, whose base is in
15902 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15903 differentiate them from global data objects. The returned
15904 address is the PIC reg + an unspec constant.
15906 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15907 reg also appears in the address. */
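/* For illustration (32-bit code with %ebx as the PIC register):
   case 2 typically yields an address of the form sym@GOTOFF(%ebx),
   while case 1 becomes a load such as "movl sym@GOT(%ebx), %reg".
   The 64-bit small-model path below instead uses sym@GOTPCREL(%rip)
   loads.  */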
15909 static rtx
15910 legitimize_pic_address (rtx orig, rtx reg)
15912 rtx addr = orig;
15913 rtx new_rtx = orig;
15915 #if TARGET_MACHO
15916 if (TARGET_MACHO && !TARGET_64BIT)
15918 if (reg == 0)
15919 reg = gen_reg_rtx (Pmode);
15920 /* Use the generic Mach-O PIC machinery. */
15921 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15923 #endif
15925 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15927 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15928 if (tmp)
15929 return tmp;
15932 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15933 new_rtx = addr;
15934 else if ((!TARGET_64BIT
15935 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15936 && !TARGET_PECOFF
15937 && gotoff_operand (addr, Pmode))
15939 /* This symbol may be referenced via a displacement
15940 from the PIC base address (@GOTOFF). */
15941 if (GET_CODE (addr) == CONST)
15942 addr = XEXP (addr, 0);
15944 if (GET_CODE (addr) == PLUS)
15946 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15947 UNSPEC_GOTOFF);
15948 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15950 else
15951 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15953 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15955 if (TARGET_64BIT)
15956 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15958 if (reg != 0)
15960 gcc_assert (REG_P (reg));
15961 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15962 new_rtx, reg, 1, OPTAB_DIRECT);
15964 else
15965 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15967 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15968 /* We can't use @GOTOFF for text labels
15969 on VxWorks, see gotoff_operand. */
15970 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15972 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15973 if (tmp)
15974 return tmp;
15976 /* For x64 PE-COFF there is no GOT table,
15977 so we use the address directly. */
15978 if (TARGET_64BIT && TARGET_PECOFF)
15980 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15981 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15983 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15985 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15986 UNSPEC_GOTPCREL);
15987 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15988 new_rtx = gen_const_mem (Pmode, new_rtx);
15989 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15991 else
15993 /* This symbol must be referenced via a load
15994 from the Global Offset Table (@GOT). */
15995 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15996 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15997 if (TARGET_64BIT)
15998 new_rtx = force_reg (Pmode, new_rtx);
15999 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16000 new_rtx = gen_const_mem (Pmode, new_rtx);
16001 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16004 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16006 else
16008 if (CONST_INT_P (addr)
16009 && !x86_64_immediate_operand (addr, VOIDmode))
16010 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16011 else if (GET_CODE (addr) == CONST)
16013 addr = XEXP (addr, 0);
16015 /* We must match stuff we generate before. Assume the only
16016 unspecs that can get here are ours. Not that we could do
16017 anything with them anyway.... */
16018 if (GET_CODE (addr) == UNSPEC
16019 || (GET_CODE (addr) == PLUS
16020 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16021 return orig;
16022 gcc_assert (GET_CODE (addr) == PLUS);
16025 if (GET_CODE (addr) == PLUS)
16027 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16029 /* Check first to see if this is a constant
16030 offset from a @GOTOFF symbol reference. */
16031 if (!TARGET_PECOFF
16032 && gotoff_operand (op0, Pmode)
16033 && CONST_INT_P (op1))
16035 if (!TARGET_64BIT)
16037 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16038 UNSPEC_GOTOFF);
16039 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16040 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16042 if (reg != 0)
16044 gcc_assert (REG_P (reg));
16045 new_rtx = expand_simple_binop (Pmode, PLUS,
16046 pic_offset_table_rtx,
16047 new_rtx, reg, 1,
16048 OPTAB_DIRECT);
16050 else
16051 new_rtx
16052 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16054 else
16056 if (INTVAL (op1) < -16*1024*1024
16057 || INTVAL (op1) >= 16*1024*1024)
16059 if (!x86_64_immediate_operand (op1, Pmode))
16060 op1 = force_reg (Pmode, op1);
16062 new_rtx
16063 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16067 else
16069 rtx base = legitimize_pic_address (op0, reg);
16070 machine_mode mode = GET_MODE (base);
16071 new_rtx
16072 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16074 if (CONST_INT_P (new_rtx))
16076 if (INTVAL (new_rtx) < -16*1024*1024
16077 || INTVAL (new_rtx) >= 16*1024*1024)
16079 if (!x86_64_immediate_operand (new_rtx, mode))
16080 new_rtx = force_reg (mode, new_rtx);
16082 new_rtx
16083 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16085 else
16086 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16088 else
16090 /* For %rip addressing, we have to use
16091 just disp32, not base nor index. */
16092 if (TARGET_64BIT
16093 && (GET_CODE (base) == SYMBOL_REF
16094 || GET_CODE (base) == LABEL_REF))
16095 base = force_reg (mode, base);
16096 if (GET_CODE (new_rtx) == PLUS
16097 && CONSTANT_P (XEXP (new_rtx, 1)))
16099 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16100 new_rtx = XEXP (new_rtx, 1);
16102 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16107 return new_rtx;
16110 /* Load the thread pointer. If TO_REG is true, force it into a register. */
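/* The UNSPEC_TP generated here ends up as an access through the TLS
   segment register, e.g. %fs:0 in 64-bit and %gs:0 in 32-bit code on
   GNU/Linux (see DEFAULT_TLS_SEG_REG); other targets may use a
   different segment.  */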
16112 static rtx
16113 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16115 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16117 if (GET_MODE (tp) != tp_mode)
16119 gcc_assert (GET_MODE (tp) == SImode);
16120 gcc_assert (tp_mode == DImode);
16122 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16125 if (to_reg)
16126 tp = copy_to_mode_reg (tp_mode, tp);
16128 return tp;
16131 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16133 static GTY(()) rtx ix86_tls_symbol;
16135 static rtx
16136 ix86_tls_get_addr (void)
16138 if (!ix86_tls_symbol)
16140 const char *sym
16141 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16142 ? "___tls_get_addr" : "__tls_get_addr");
16144 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16147 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16149 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16150 UNSPEC_PLTOFF);
16151 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16152 gen_rtx_CONST (Pmode, unspec));
16155 return ix86_tls_symbol;
16158 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16160 static GTY(()) rtx ix86_tls_module_base_symbol;
16163 ix86_tls_module_base (void)
16165 if (!ix86_tls_module_base_symbol)
16167 ix86_tls_module_base_symbol
16168 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16170 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16171 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16174 return ix86_tls_module_base_symbol;
16177 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16178 false if we expect this to be used for a memory address and true if
16179 we expect to load the address into a register. */
16181 static rtx
16182 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16184 rtx dest, base, off;
16185 rtx pic = NULL_RTX, tp = NULL_RTX;
16186 machine_mode tp_mode = Pmode;
16187 int type;
16189 /* Fall back to the global dynamic model if the toolchain cannot support
16190 local dynamic. */
16191 if (TARGET_SUN_TLS && !TARGET_64BIT
16192 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16193 && model == TLS_MODEL_LOCAL_DYNAMIC)
16194 model = TLS_MODEL_GLOBAL_DYNAMIC;
16196 switch (model)
16198 case TLS_MODEL_GLOBAL_DYNAMIC:
16199 dest = gen_reg_rtx (Pmode);
16201 if (!TARGET_64BIT)
16203 if (flag_pic && !TARGET_PECOFF)
16204 pic = pic_offset_table_rtx;
16205 else
16207 pic = gen_reg_rtx (Pmode);
16208 emit_insn (gen_set_got (pic));
16212 if (TARGET_GNU2_TLS)
16214 if (TARGET_64BIT)
16215 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16216 else
16217 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16219 tp = get_thread_pointer (Pmode, true);
16220 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16222 if (GET_MODE (x) != Pmode)
16223 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16225 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16227 else
16229 rtx caddr = ix86_tls_get_addr ();
16231 if (TARGET_64BIT)
16233 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16234 rtx_insn *insns;
16236 start_sequence ();
16237 emit_call_insn
16238 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16239 insns = get_insns ();
16240 end_sequence ();
16242 if (GET_MODE (x) != Pmode)
16243 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16245 RTL_CONST_CALL_P (insns) = 1;
16246 emit_libcall_block (insns, dest, rax, x);
16248 else
16249 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16251 break;
16253 case TLS_MODEL_LOCAL_DYNAMIC:
16254 base = gen_reg_rtx (Pmode);
16256 if (!TARGET_64BIT)
16258 if (flag_pic)
16259 pic = pic_offset_table_rtx;
16260 else
16262 pic = gen_reg_rtx (Pmode);
16263 emit_insn (gen_set_got (pic));
16267 if (TARGET_GNU2_TLS)
16269 rtx tmp = ix86_tls_module_base ();
16271 if (TARGET_64BIT)
16272 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16273 else
16274 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16276 tp = get_thread_pointer (Pmode, true);
16277 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16278 gen_rtx_MINUS (Pmode, tmp, tp));
16280 else
16282 rtx caddr = ix86_tls_get_addr ();
16284 if (TARGET_64BIT)
16286 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16287 rtx_insn *insns;
16288 rtx eqv;
16290 start_sequence ();
16291 emit_call_insn
16292 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16293 insns = get_insns ();
16294 end_sequence ();
16296 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16297 share the LD_BASE result with other LD model accesses. */
16298 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16299 UNSPEC_TLS_LD_BASE);
16301 RTL_CONST_CALL_P (insns) = 1;
16302 emit_libcall_block (insns, base, rax, eqv);
16304 else
16305 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16308 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16309 off = gen_rtx_CONST (Pmode, off);
16311 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16313 if (TARGET_GNU2_TLS)
16315 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16317 if (GET_MODE (x) != Pmode)
16318 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16320 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16322 break;
16324 case TLS_MODEL_INITIAL_EXEC:
16325 if (TARGET_64BIT)
16327 if (TARGET_SUN_TLS && !TARGET_X32)
16329 /* The Sun linker took the AMD64 TLS spec literally
16330 and can only handle %rax as destination of the
16331 initial executable code sequence. */
16333 dest = gen_reg_rtx (DImode);
16334 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16335 return dest;
16338 /* Generate DImode references to avoid %fs:(%reg32)
16339 problems and the linker IE->LE relaxation bug. */
16340 tp_mode = DImode;
16341 pic = NULL;
16342 type = UNSPEC_GOTNTPOFF;
16344 else if (flag_pic)
16346 pic = pic_offset_table_rtx;
16347 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16349 else if (!TARGET_ANY_GNU_TLS)
16351 pic = gen_reg_rtx (Pmode);
16352 emit_insn (gen_set_got (pic));
16353 type = UNSPEC_GOTTPOFF;
16355 else
16357 pic = NULL;
16358 type = UNSPEC_INDNTPOFF;
16361 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16362 off = gen_rtx_CONST (tp_mode, off);
16363 if (pic)
16364 off = gen_rtx_PLUS (tp_mode, pic, off);
16365 off = gen_const_mem (tp_mode, off);
16366 set_mem_alias_set (off, ix86_GOT_alias_set ());
16368 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16370 base = get_thread_pointer (tp_mode,
16371 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16372 off = force_reg (tp_mode, off);
16373 return gen_rtx_PLUS (tp_mode, base, off);
16375 else
16377 base = get_thread_pointer (Pmode, true);
16378 dest = gen_reg_rtx (Pmode);
16379 emit_insn (ix86_gen_sub3 (dest, base, off));
16381 break;
16383 case TLS_MODEL_LOCAL_EXEC:
16384 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16385 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16386 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16387 off = gen_rtx_CONST (Pmode, off);
16389 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16391 base = get_thread_pointer (Pmode,
16392 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16393 return gen_rtx_PLUS (Pmode, base, off);
16395 else
16397 base = get_thread_pointer (Pmode, true);
16398 dest = gen_reg_rtx (Pmode);
16399 emit_insn (ix86_gen_sub3 (dest, base, off));
16401 break;
16403 default:
16404 gcc_unreachable ();
16407 return dest;
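/* A hedged illustration, not part of the function above: the TLS models
   dispatched on here are the ones a user selects with -ftls-model= or
   the tls_model attribute.  The declarations below are invented for the
   example:

       __thread int counter;
       static __thread int cache
	 __attribute__ ((tls_model ("local-exec")));

   The first typically gets the global-dynamic model under -fPIC (or
   initial-exec when linked into an executable); the second is forced to
   local-exec and ends up as a %fs/%gs-relative access.  */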
16410 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16411 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16412 unique refptr-DECL symbol corresponding to symbol DECL. */
16414 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16416 static inline hashval_t hash (tree_map *m) { return m->hash; }
16417 static inline bool
16418 equal (tree_map *a, tree_map *b)
16420 return a->base.from == b->base.from;
16423 static int
16424 keep_cache_entry (tree_map *&m)
16426 return ggc_marked_p (m->base.from);
16430 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16432 static tree
16433 get_dllimport_decl (tree decl, bool beimport)
16435 struct tree_map *h, in;
16436 const char *name;
16437 const char *prefix;
16438 size_t namelen, prefixlen;
16439 char *imp_name;
16440 tree to;
16441 rtx rtl;
16443 if (!dllimport_map)
16444 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16446 in.hash = htab_hash_pointer (decl);
16447 in.base.from = decl;
16448 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16449 h = *loc;
16450 if (h)
16451 return h->to;
16453 *loc = h = ggc_alloc<tree_map> ();
16454 h->hash = in.hash;
16455 h->base.from = decl;
16456 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16457 VAR_DECL, NULL, ptr_type_node);
16458 DECL_ARTIFICIAL (to) = 1;
16459 DECL_IGNORED_P (to) = 1;
16460 DECL_EXTERNAL (to) = 1;
16461 TREE_READONLY (to) = 1;
16463 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16464 name = targetm.strip_name_encoding (name);
16465 if (beimport)
16466 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16467 ? "*__imp_" : "*__imp__";
16468 else
16469 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16470 namelen = strlen (name);
16471 prefixlen = strlen (prefix);
16472 imp_name = (char *) alloca (namelen + prefixlen + 1);
16473 memcpy (imp_name, prefix, prefixlen);
16474 memcpy (imp_name + prefixlen, name, namelen + 1);
16476 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16477 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16478 SET_SYMBOL_REF_DECL (rtl, to);
16479 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16480 if (!beimport)
16482 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16483 #ifdef SUB_TARGET_RECORD_STUB
16484 SUB_TARGET_RECORD_STUB (name);
16485 #endif
16488 rtl = gen_const_mem (Pmode, rtl);
16489 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16491 SET_DECL_RTL (to, rtl);
16492 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16494 return to;
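/* A hedged sketch of what this corresponds to at the source level; the
   declaration below is invented for the example.  On PE-COFF targets a
   reference to

       extern int foo __attribute__ ((dllimport));

   is not made to "foo" directly but through the import pointer provided
   by the linker ("__imp_foo" or "__imp__foo", depending on the user
   label prefix selected above), and get_dllimport_decl materializes an
   artificial VAR_DECL for that pointer so the access can be expressed
   as an ordinary MEM through it.  */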
16497 /* Expand SYMBOL into its corresponding far-address symbol.
16498 WANT_REG is true if we require the result be a register. */
16500 static rtx
16501 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16503 tree imp_decl;
16504 rtx x;
16506 gcc_assert (SYMBOL_REF_DECL (symbol));
16507 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16509 x = DECL_RTL (imp_decl);
16510 if (want_reg)
16511 x = force_reg (Pmode, x);
16512 return x;
16515 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16516 true if we require the result be a register. */
16518 static rtx
16519 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16521 tree imp_decl;
16522 rtx x;
16524 gcc_assert (SYMBOL_REF_DECL (symbol));
16525 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16527 x = DECL_RTL (imp_decl);
16528 if (want_reg)
16529 x = force_reg (Pmode, x);
16530 return x;
16533 /* Expand ADDR into its corresponding dllimport or refptr symbol.  INREG
16534 is true if we require the result be a register. */
16536 static rtx
16537 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16539 if (!TARGET_PECOFF)
16540 return NULL_RTX;
16542 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16544 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16545 return legitimize_dllimport_symbol (addr, inreg);
16546 if (GET_CODE (addr) == CONST
16547 && GET_CODE (XEXP (addr, 0)) == PLUS
16548 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16549 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16551 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16552 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16556 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16557 return NULL_RTX;
16558 if (GET_CODE (addr) == SYMBOL_REF
16559 && !is_imported_p (addr)
16560 && SYMBOL_REF_EXTERNAL_P (addr)
16561 && SYMBOL_REF_DECL (addr))
16562 return legitimize_pe_coff_extern_decl (addr, inreg);
16564 if (GET_CODE (addr) == CONST
16565 && GET_CODE (XEXP (addr, 0)) == PLUS
16566 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16567 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16568 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16569 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16571 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16572 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16574 return NULL_RTX;
16577 /* Try machine-dependent ways of modifying an illegitimate address
16578 to be legitimate. If we find one, return the new, valid address.
16579 This function is used in only one place: `memory_address' in explow.c.
16581 OLDX is the address as it was before break_out_memory_refs was called.
16582 In some cases it is useful to look at this to decide what needs to be done.
16584 It is always safe for this function to do nothing. It exists to recognize
16585 opportunities to optimize the output.
16587 For the 80386, we handle X+REG by loading X into a register R and
16588 using R+REG. R will go in a general reg and indexing will be used.
16589 However, if REG is a broken-out memory address or multiplication,
16590 nothing needs to be done because REG can certainly go in a general reg.
16592 When -fpic is used, special handling is needed for symbolic references.
16593 See comments by legitimize_pic_address in i386.c for details. */
16595 static rtx
16596 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16598 bool changed = false;
16599 unsigned log;
16601 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16602 if (log)
16603 return legitimize_tls_address (x, (enum tls_model) log, false);
16604 if (GET_CODE (x) == CONST
16605 && GET_CODE (XEXP (x, 0)) == PLUS
16606 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16607 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16609 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16610 (enum tls_model) log, false);
16611 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16614 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16616 rtx tmp = legitimize_pe_coff_symbol (x, true);
16617 if (tmp)
16618 return tmp;
16621 if (flag_pic && SYMBOLIC_CONST (x))
16622 return legitimize_pic_address (x, 0);
16624 #if TARGET_MACHO
16625 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16626 return machopic_indirect_data_reference (x, 0);
16627 #endif
16629 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
16630 if (GET_CODE (x) == ASHIFT
16631 && CONST_INT_P (XEXP (x, 1))
16632 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16634 changed = true;
16635 log = INTVAL (XEXP (x, 1));
16636 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16637 GEN_INT (1 << log));
16640 if (GET_CODE (x) == PLUS)
16642 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16644 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16645 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16646 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16648 changed = true;
16649 log = INTVAL (XEXP (XEXP (x, 0), 1));
16650 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16651 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16652 GEN_INT (1 << log));
16655 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16656 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16657 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16659 changed = true;
16660 log = INTVAL (XEXP (XEXP (x, 1), 1));
16661 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16662 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16663 GEN_INT (1 << log));
16666 /* Put multiply first if it isn't already. */
16667 if (GET_CODE (XEXP (x, 1)) == MULT)
16669 std::swap (XEXP (x, 0), XEXP (x, 1));
16670 changed = true;
16673 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16674 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16675 created by virtual register instantiation, register elimination, and
16676 similar optimizations. */
16677 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16679 changed = true;
16680 x = gen_rtx_PLUS (Pmode,
16681 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16682 XEXP (XEXP (x, 1), 0)),
16683 XEXP (XEXP (x, 1), 1));
16686 /* Canonicalize
16687 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16688 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16689 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16690 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16691 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16692 && CONSTANT_P (XEXP (x, 1)))
16694 rtx constant;
16695 rtx other = NULL_RTX;
16697 if (CONST_INT_P (XEXP (x, 1)))
16699 constant = XEXP (x, 1);
16700 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16702 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16704 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16705 other = XEXP (x, 1);
16707 else
16708 constant = 0;
16710 if (constant)
16712 changed = true;
16713 x = gen_rtx_PLUS (Pmode,
16714 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16715 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16716 plus_constant (Pmode, other,
16717 INTVAL (constant)));
16721 if (changed && ix86_legitimate_address_p (mode, x, false))
16722 return x;
16724 if (GET_CODE (XEXP (x, 0)) == MULT)
16726 changed = true;
16727 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16730 if (GET_CODE (XEXP (x, 1)) == MULT)
16732 changed = true;
16733 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16736 if (changed
16737 && REG_P (XEXP (x, 1))
16738 && REG_P (XEXP (x, 0)))
16739 return x;
16741 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16743 changed = true;
16744 x = legitimize_pic_address (x, 0);
16747 if (changed && ix86_legitimate_address_p (mode, x, false))
16748 return x;
16750 if (REG_P (XEXP (x, 0)))
16752 rtx temp = gen_reg_rtx (Pmode);
16753 rtx val = force_operand (XEXP (x, 1), temp);
16754 if (val != temp)
16756 val = convert_to_mode (Pmode, val, 1);
16757 emit_move_insn (temp, val);
16760 XEXP (x, 1) = temp;
16761 return x;
16764 else if (REG_P (XEXP (x, 1)))
16766 rtx temp = gen_reg_rtx (Pmode);
16767 rtx val = force_operand (XEXP (x, 0), temp);
16768 if (val != temp)
16770 val = convert_to_mode (Pmode, val, 1);
16771 emit_move_insn (temp, val);
16774 XEXP (x, 0) = temp;
16775 return x;
16779 return x;
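/* A hedged illustration of the shift-to-multiply canonicalization above;
   the pseudo register numbers are invented for the example.  An address
   such as

       (plus:SI (ashift:SI (reg:SI 90) (const_int 2)) (reg:SI 91))

   is rewritten into

       (plus:SI (mult:SI (reg:SI 90) (const_int 4)) (reg:SI 91))

   so that it matches the base + index*scale form accepted by
   ix86_legitimate_address_p.  */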
16782 /* Print an integer constant expression in assembler syntax. Addition
16783 and subtraction are the only arithmetic that may appear in these
16784 expressions. FILE is the stdio stream to write to, X is the rtx, and
16785 CODE is the operand print code from the output string. */
16787 static void
16788 output_pic_addr_const (FILE *file, rtx x, int code)
16790 char buf[256];
16792 switch (GET_CODE (x))
16794 case PC:
16795 gcc_assert (flag_pic);
16796 putc ('.', file);
16797 break;
16799 case SYMBOL_REF:
16800 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16801 output_addr_const (file, x);
16802 else
16804 const char *name = XSTR (x, 0);
16806 /* Mark the decl as referenced so that cgraph will
16807 output the function. */
16808 if (SYMBOL_REF_DECL (x))
16809 mark_decl_referenced (SYMBOL_REF_DECL (x));
16811 #if TARGET_MACHO
16812 if (MACHOPIC_INDIRECT
16813 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16814 name = machopic_indirection_name (x, /*stub_p=*/true);
16815 #endif
16816 assemble_name (file, name);
16818 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16819 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16820 fputs ("@PLT", file);
16821 break;
16823 case LABEL_REF:
16824 x = XEXP (x, 0);
16825 /* FALLTHRU */
16826 case CODE_LABEL:
16827 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16828 assemble_name (asm_out_file, buf);
16829 break;
16831 case CONST_INT:
16832 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16833 break;
16835 case CONST:
16836 /* This used to output parentheses around the expression,
16837 but that does not work on the 386 (either ATT or BSD assembler). */
16838 output_pic_addr_const (file, XEXP (x, 0), code);
16839 break;
16841 case CONST_DOUBLE:
16842 /* We can't handle floating point constants;
16843 TARGET_PRINT_OPERAND must handle them. */
16844 output_operand_lossage ("floating constant misused");
16845 break;
16847 case PLUS:
16848 /* Some assemblers need integer constants to appear first. */
16849 if (CONST_INT_P (XEXP (x, 0)))
16851 output_pic_addr_const (file, XEXP (x, 0), code);
16852 putc ('+', file);
16853 output_pic_addr_const (file, XEXP (x, 1), code);
16855 else
16857 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16858 output_pic_addr_const (file, XEXP (x, 1), code);
16859 putc ('+', file);
16860 output_pic_addr_const (file, XEXP (x, 0), code);
16862 break;
16864 case MINUS:
16865 if (!TARGET_MACHO)
16866 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16867 output_pic_addr_const (file, XEXP (x, 0), code);
16868 putc ('-', file);
16869 output_pic_addr_const (file, XEXP (x, 1), code);
16870 if (!TARGET_MACHO)
16871 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16872 break;
16874 case UNSPEC:
16875 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
16877 bool f = i386_asm_output_addr_const_extra (file, x);
16878 gcc_assert (f);
16879 break;
16882 gcc_assert (XVECLEN (x, 0) == 1);
16883 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16884 switch (XINT (x, 1))
16886 case UNSPEC_GOT:
16887 fputs ("@GOT", file);
16888 break;
16889 case UNSPEC_GOTOFF:
16890 fputs ("@GOTOFF", file);
16891 break;
16892 case UNSPEC_PLTOFF:
16893 fputs ("@PLTOFF", file);
16894 break;
16895 case UNSPEC_PCREL:
16896 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16897 "(%rip)" : "[rip]", file);
16898 break;
16899 case UNSPEC_GOTPCREL:
16900 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16901 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16902 break;
16903 case UNSPEC_GOTTPOFF:
16904 /* FIXME: This might be @TPOFF in Sun ld too. */
16905 fputs ("@gottpoff", file);
16906 break;
16907 case UNSPEC_TPOFF:
16908 fputs ("@tpoff", file);
16909 break;
16910 case UNSPEC_NTPOFF:
16911 if (TARGET_64BIT)
16912 fputs ("@tpoff", file);
16913 else
16914 fputs ("@ntpoff", file);
16915 break;
16916 case UNSPEC_DTPOFF:
16917 fputs ("@dtpoff", file);
16918 break;
16919 case UNSPEC_GOTNTPOFF:
16920 if (TARGET_64BIT)
16921 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16922 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16923 else
16924 fputs ("@gotntpoff", file);
16925 break;
16926 case UNSPEC_INDNTPOFF:
16927 fputs ("@indntpoff", file);
16928 break;
16929 #if TARGET_MACHO
16930 case UNSPEC_MACHOPIC_OFFSET:
16931 putc ('-', file);
16932 machopic_output_function_base_name (file);
16933 break;
16934 #endif
16935 default:
16936 output_operand_lossage ("invalid UNSPEC as operand");
16937 break;
16939 break;
16941 default:
16942 output_operand_lossage ("invalid expression as operand");
16946 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16947 We need to emit DTP-relative relocations. */
16949 static void ATTRIBUTE_UNUSED
16950 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16952 fputs (ASM_LONG, file);
16953 output_addr_const (file, x);
16954 fputs ("@dtpoff", file);
16955 switch (size)
16957 case 4:
16958 break;
16959 case 8:
16960 fputs (", 0", file);
16961 break;
16962 default:
16963 gcc_unreachable ();
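/* For illustration, derived from the code above with an invented symbol
   name: for SIZE == 4 this emits

       .long   foo@dtpoff

   and for SIZE == 8

       .long   foo@dtpoff, 0

   i.e. the DTP-relative offset padded out to eight bytes.  */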
16967 /* Return true if X is a representation of the PIC register. This copes
16968 with calls from ix86_find_base_term, where the register might have
16969 been replaced by a cselib value. */
16971 static bool
16972 ix86_pic_register_p (rtx x)
16974 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16975 return (pic_offset_table_rtx
16976 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16977 else if (!REG_P (x))
16978 return false;
16979 else if (pic_offset_table_rtx)
16981 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16982 return true;
16983 if (HARD_REGISTER_P (x)
16984 && !HARD_REGISTER_P (pic_offset_table_rtx)
16985 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16986 return true;
16987 return false;
16989 else
16990 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16993 /* Helper function for ix86_delegitimize_address.
16994 Attempt to delegitimize TLS local-exec accesses. */
16996 static rtx
16997 ix86_delegitimize_tls_address (rtx orig_x)
16999 rtx x = orig_x, unspec;
17000 struct ix86_address addr;
17002 if (!TARGET_TLS_DIRECT_SEG_REFS)
17003 return orig_x;
17004 if (MEM_P (x))
17005 x = XEXP (x, 0);
17006 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17007 return orig_x;
17008 if (ix86_decompose_address (x, &addr) == 0
17009 || addr.seg != DEFAULT_TLS_SEG_REG
17010 || addr.disp == NULL_RTX
17011 || GET_CODE (addr.disp) != CONST)
17012 return orig_x;
17013 unspec = XEXP (addr.disp, 0);
17014 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17015 unspec = XEXP (unspec, 0);
17016 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17017 return orig_x;
17018 x = XVECEXP (unspec, 0, 0);
17019 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17020 if (unspec != XEXP (addr.disp, 0))
17021 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17022 if (addr.index)
17024 rtx idx = addr.index;
17025 if (addr.scale != 1)
17026 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17027 x = gen_rtx_PLUS (Pmode, idx, x);
17029 if (addr.base)
17030 x = gen_rtx_PLUS (Pmode, addr.base, x);
17031 if (MEM_P (orig_x))
17032 x = replace_equiv_address_nv (orig_x, x);
17033 return x;
17036 /* In the name of slightly smaller debug output, and to cater to
17037 general assembler lossage, recognize PIC+GOTOFF and turn it back
17038 into a direct symbol reference.
17040 On Darwin, this is necessary to avoid a crash, because Darwin
17041 has a different PIC label for each routine but the DWARF debugging
17042 information is not associated with any particular routine, so it's
17043 necessary to remove references to the PIC label from RTL stored by
17044 the DWARF output code. */
17046 static rtx
17047 ix86_delegitimize_address (rtx x)
17049 rtx orig_x = delegitimize_mem_from_attrs (x);
17050 /* addend is NULL or some rtx if x is something+GOTOFF where
17051 something doesn't include the PIC register. */
17052 rtx addend = NULL_RTX;
17053 /* reg_addend is NULL or a multiple of some register. */
17054 rtx reg_addend = NULL_RTX;
17055 /* const_addend is NULL or a const_int. */
17056 rtx const_addend = NULL_RTX;
17057 /* This is the result, or NULL. */
17058 rtx result = NULL_RTX;
17060 x = orig_x;
17062 if (MEM_P (x))
17063 x = XEXP (x, 0);
17065 if (TARGET_64BIT)
17067 if (GET_CODE (x) == CONST
17068 && GET_CODE (XEXP (x, 0)) == PLUS
17069 && GET_MODE (XEXP (x, 0)) == Pmode
17070 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17071 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17072 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17074 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17075 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17076 if (MEM_P (orig_x))
17077 x = replace_equiv_address_nv (orig_x, x);
17078 return x;
17081 if (GET_CODE (x) == CONST
17082 && GET_CODE (XEXP (x, 0)) == UNSPEC
17083 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17084 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17085 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17087 x = XVECEXP (XEXP (x, 0), 0, 0);
17088 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17090 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17091 if (x == NULL_RTX)
17092 return orig_x;
17094 return x;
17097 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17098 return ix86_delegitimize_tls_address (orig_x);
17100 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17101 and -mcmodel=medium -fpic. */
17104 if (GET_CODE (x) != PLUS
17105 || GET_CODE (XEXP (x, 1)) != CONST)
17106 return ix86_delegitimize_tls_address (orig_x);
17108 if (ix86_pic_register_p (XEXP (x, 0)))
17109 /* %ebx + GOT/GOTOFF */
17111 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17113 /* %ebx + %reg * scale + GOT/GOTOFF */
17114 reg_addend = XEXP (x, 0);
17115 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17116 reg_addend = XEXP (reg_addend, 1);
17117 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17118 reg_addend = XEXP (reg_addend, 0);
17119 else
17121 reg_addend = NULL_RTX;
17122 addend = XEXP (x, 0);
17125 else
17126 addend = XEXP (x, 0);
17128 x = XEXP (XEXP (x, 1), 0);
17129 if (GET_CODE (x) == PLUS
17130 && CONST_INT_P (XEXP (x, 1)))
17132 const_addend = XEXP (x, 1);
17133 x = XEXP (x, 0);
17136 if (GET_CODE (x) == UNSPEC
17137 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17138 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17139 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17140 && !MEM_P (orig_x) && !addend)))
17141 result = XVECEXP (x, 0, 0);
17143 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17144 && !MEM_P (orig_x))
17145 result = XVECEXP (x, 0, 0);
17147 if (! result)
17148 return ix86_delegitimize_tls_address (orig_x);
17150 if (const_addend)
17151 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17152 if (reg_addend)
17153 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17154 if (addend)
17156 /* If the rest of original X doesn't involve the PIC register, add
17157 addend and subtract pic_offset_table_rtx. This can happen e.g.
17158 for code like:
17159 leal (%ebx, %ecx, 4), %ecx
17161 movl foo@GOTOFF(%ecx), %edx
17162 in which case we return (%ecx - %ebx) + foo
17163 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17164 and reload has completed. */
17165 if (pic_offset_table_rtx
17166 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17167 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17168 pic_offset_table_rtx),
17169 result);
17170 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
17172 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17173 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17174 result = gen_rtx_PLUS (Pmode, tmp, result);
17176 else
17177 return orig_x;
17179 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17181 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17182 if (result == NULL_RTX)
17183 return orig_x;
17185 return result;
17188 /* If X is a machine specific address (i.e. a symbol or label being
17189 referenced as a displacement from the GOT implemented using an
17190 UNSPEC), then return the base term. Otherwise return X. */
17193 ix86_find_base_term (rtx x)
17195 rtx term;
17197 if (TARGET_64BIT)
17199 if (GET_CODE (x) != CONST)
17200 return x;
17201 term = XEXP (x, 0);
17202 if (GET_CODE (term) == PLUS
17203 && CONST_INT_P (XEXP (term, 1)))
17204 term = XEXP (term, 0);
17205 if (GET_CODE (term) != UNSPEC
17206 || (XINT (term, 1) != UNSPEC_GOTPCREL
17207 && XINT (term, 1) != UNSPEC_PCREL))
17208 return x;
17210 return XVECEXP (term, 0, 0);
17213 return ix86_delegitimize_address (x);
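/* A hedged example of the 64-bit case above; the RTL is invented for
   the illustration.  For

       (const:DI (plus:DI (unspec:DI [(symbol_ref:DI ("sym"))]
				     UNSPEC_GOTPCREL)
			  (const_int 4)))

   the returned base term is the inner (symbol_ref:DI ("sym")).  */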
17216 static void
17217 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17218 bool fp, FILE *file)
17220 const char *suffix;
17222 if (mode == CCFPmode || mode == CCFPUmode)
17224 code = ix86_fp_compare_code_to_integer (code);
17225 mode = CCmode;
17227 if (reverse)
17228 code = reverse_condition (code);
17230 switch (code)
17232 case EQ:
17233 switch (mode)
17235 case CCAmode:
17236 suffix = "a";
17237 break;
17238 case CCCmode:
17239 suffix = "c";
17240 break;
17241 case CCOmode:
17242 suffix = "o";
17243 break;
17244 case CCPmode:
17245 suffix = "p";
17246 break;
17247 case CCSmode:
17248 suffix = "s";
17249 break;
17250 default:
17251 suffix = "e";
17252 break;
17254 break;
17255 case NE:
17256 switch (mode)
17258 case CCAmode:
17259 suffix = "na";
17260 break;
17261 case CCCmode:
17262 suffix = "nc";
17263 break;
17264 case CCOmode:
17265 suffix = "no";
17266 break;
17267 case CCPmode:
17268 suffix = "np";
17269 break;
17270 case CCSmode:
17271 suffix = "ns";
17272 break;
17273 default:
17274 suffix = "ne";
17275 break;
17277 break;
17278 case GT:
17279 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17280 suffix = "g";
17281 break;
17282 case GTU:
17283 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17284 Those same assemblers have the same but opposite lossage on cmov. */
17285 if (mode == CCmode)
17286 suffix = fp ? "nbe" : "a";
17287 else
17288 gcc_unreachable ();
17289 break;
17290 case LT:
17291 switch (mode)
17293 case CCNOmode:
17294 case CCGOCmode:
17295 suffix = "s";
17296 break;
17298 case CCmode:
17299 case CCGCmode:
17300 suffix = "l";
17301 break;
17303 default:
17304 gcc_unreachable ();
17306 break;
17307 case LTU:
17308 if (mode == CCmode)
17309 suffix = "b";
17310 else if (mode == CCCmode)
17311 suffix = fp ? "b" : "c";
17312 else
17313 gcc_unreachable ();
17314 break;
17315 case GE:
17316 switch (mode)
17318 case CCNOmode:
17319 case CCGOCmode:
17320 suffix = "ns";
17321 break;
17323 case CCmode:
17324 case CCGCmode:
17325 suffix = "ge";
17326 break;
17328 default:
17329 gcc_unreachable ();
17331 break;
17332 case GEU:
17333 if (mode == CCmode)
17334 suffix = "nb";
17335 else if (mode == CCCmode)
17336 suffix = fp ? "nb" : "nc";
17337 else
17338 gcc_unreachable ();
17339 break;
17340 case LE:
17341 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17342 suffix = "le";
17343 break;
17344 case LEU:
17345 if (mode == CCmode)
17346 suffix = "be";
17347 else
17348 gcc_unreachable ();
17349 break;
17350 case UNORDERED:
17351 suffix = fp ? "u" : "p";
17352 break;
17353 case ORDERED:
17354 suffix = fp ? "nu" : "np";
17355 break;
17356 default:
17357 gcc_unreachable ();
17359 fputs (suffix, file);
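/* A hedged example of the mapping above; the RTL fragment is invented.
   For (eq (reg:CCZ FLAGS_REG) (const_int 0)) printed through the 'C'
   operand code the suffix is "e", so a template such as "set%C0"
   assembles as "sete"; with the reversing 'c' code it becomes
   "setne".  */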
17362 /* Print the name of register X to FILE based on its machine mode and number.
17363 If CODE is 'w', pretend the mode is HImode.
17364 If CODE is 'b', pretend the mode is QImode.
17365 If CODE is 'k', pretend the mode is SImode.
17366 If CODE is 'q', pretend the mode is DImode.
17367 If CODE is 'x', pretend the mode is V4SFmode.
17368 If CODE is 't', pretend the mode is V8SFmode.
17369 If CODE is 'g', pretend the mode is V16SFmode.
17370 If CODE is 'h', pretend the reg is the 'high' byte register.
17371 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
17372 If CODE is 'd', duplicate the operand for an AVX instruction.
17375 void
17376 print_reg (rtx x, int code, FILE *file)
17378 const char *reg;
17379 int msize;
17380 unsigned int regno;
17381 bool duplicated;
17383 if (ASSEMBLER_DIALECT == ASM_ATT)
17384 putc ('%', file);
17386 if (x == pc_rtx)
17388 gcc_assert (TARGET_64BIT);
17389 fputs ("rip", file);
17390 return;
17393 if (code == 'y' && STACK_TOP_P (x))
17395 fputs ("st(0)", file);
17396 return;
17399 if (code == 'w')
17400 msize = 2;
17401 else if (code == 'b')
17402 msize = 1;
17403 else if (code == 'k')
17404 msize = 4;
17405 else if (code == 'q')
17406 msize = 8;
17407 else if (code == 'h')
17408 msize = 0;
17409 else if (code == 'x')
17410 msize = 16;
17411 else if (code == 't')
17412 msize = 32;
17413 else if (code == 'g')
17414 msize = 64;
17415 else
17416 msize = GET_MODE_SIZE (GET_MODE (x));
17418 regno = true_regnum (x);
17420 gcc_assert (regno != ARG_POINTER_REGNUM
17421 && regno != FRAME_POINTER_REGNUM
17422 && regno != FPSR_REG
17423 && regno != FPCR_REG);
17425 if (regno == FLAGS_REG)
17427 output_operand_lossage ("invalid use of asm flag output");
17428 return;
17431 duplicated = code == 'd' && TARGET_AVX;
17433 switch (msize)
17435 case 8:
17436 case 4:
17437 if (LEGACY_INT_REGNO_P (regno))
17438 putc (msize == 8 && TARGET_64BIT ? 'r' : 'e', file);
17439 /* FALLTHRU */
17440 case 16:
17441 case 12:
17442 case 2:
17443 normal:
17444 reg = hi_reg_name[regno];
17445 break;
17446 case 1:
17447 if (regno >= ARRAY_SIZE (qi_reg_name))
17448 goto normal;
17449 reg = qi_reg_name[regno];
17450 break;
17451 case 0:
17452 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17453 goto normal;
17454 reg = qi_high_reg_name[regno];
17455 break;
17456 case 32:
17457 case 64:
17458 if (SSE_REGNO_P (regno))
17460 gcc_assert (!duplicated);
17461 putc (msize == 32 ? 'y' : 'z', file);
17462 reg = hi_reg_name[regno] + 1;
17463 break;
17465 goto normal;
17466 default:
17467 gcc_unreachable ();
17470 fputs (reg, file);
17472 /* Irritatingly, AMD extended registers use a
17473 different naming convention: "r%d[bwd]". */
17474 if (REX_INT_REGNO_P (regno))
17476 gcc_assert (TARGET_64BIT);
17477 switch (msize)
17479 case 0:
17480 error ("extended registers have no high halves");
17481 break;
17482 case 1:
17483 putc ('b', file);
17484 break;
17485 case 2:
17486 putc ('w', file);
17487 break;
17488 case 4:
17489 putc ('d', file);
17490 break;
17491 case 8:
17492 /* no suffix */
17493 break;
17494 default:
17495 error ("unsupported operand size for extended register");
17496 break;
17498 return;
17501 if (duplicated)
17503 if (ASSEMBLER_DIALECT == ASM_ATT)
17504 fprintf (file, ", %%%s", reg);
17505 else
17506 fprintf (file, ", %s", reg);
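/* For illustration, not emitted by the code above: for the ax register,
   print_reg with CODE 'b', 'w', 'k' and 'q' prints "al", "ax", "eax"
   and "rax" respectively ("%al" etc. in AT&T syntax), while 'h'
   selects the high-byte name "ah".  */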
17510 /* Meaning of CODE:
17511 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17512 C -- print opcode suffix for set/cmov insn.
17513 c -- like C, but print reversed condition
17514 F,f -- likewise, but for floating-point.
17515 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17516 otherwise nothing
17517 R -- print embedded rounding and sae.
17518 r -- print only sae.
17519 z -- print the opcode suffix for the size of the current operand.
17520 Z -- likewise, with special suffixes for x87 instructions.
17521 * -- print a star (in certain assembler syntax)
17522 A -- print an absolute memory reference.
17523 E -- print address with DImode register names if TARGET_64BIT.
17524 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17525 s -- print a shift double count, followed by the assembler's argument
17526 delimiter.
17527 b -- print the QImode name of the register for the indicated operand.
17528 %b0 would print %al if operands[0] is reg 0.
17529 w -- likewise, print the HImode name of the register.
17530 k -- likewise, print the SImode name of the register.
17531 q -- likewise, print the DImode name of the register.
17532 x -- likewise, print the V4SFmode name of the register.
17533 t -- likewise, print the V8SFmode name of the register.
17534 g -- likewise, print the V16SFmode name of the register.
17535 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17536 y -- print "st(0)" instead of "st" as a register.
17537 d -- print duplicated register operand for AVX instruction.
17538 D -- print condition for SSE cmp instruction.
17539 P -- if PIC, print an @PLT suffix.
17540 p -- print raw symbol name.
17541 X -- don't print any sort of PIC '@' suffix for a symbol.
17542 & -- print some in-use local-dynamic symbol name.
17543 H -- print a memory address offset by 8; used for sse high-parts
17544 Y -- print condition for XOP pcom* instruction.
17545 + -- print a branch hint as 'cs' or 'ds' prefix
17546 ; -- print a semicolon (after prefixes due to bug in older gas).
17547 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17548 @ -- print a segment register of thread base pointer load
17549 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17550 ! -- print MPX prefix for jxx/call/ret instructions if required.
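/* A hedged user-level illustration, not part of this file: several of
   the codes above are also honored as operand modifiers in GCC
   extended asm, so they can be exercised directly from C.  The snippet
   below is invented for the example and kept out of the build with
   #if 0.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned long word = 0x1234;
  unsigned char byte;

  /* %k0 prints the 32-bit name of BYTE's register (e.g. %eax) and
     %b1 prints the 8-bit name of WORD's register (e.g. %al).  */
  __asm__ ("movzbl %b1, %k0" : "=r" (byte) : "q" (word));
  printf ("low byte: 0x%02x\n", byte);
  return 0;
}
#endif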
17553 void
17554 ix86_print_operand (FILE *file, rtx x, int code)
17556 if (code)
17558 switch (code)
17560 case 'A':
17561 switch (ASSEMBLER_DIALECT)
17563 case ASM_ATT:
17564 putc ('*', file);
17565 break;
17567 case ASM_INTEL:
17568 /* Intel syntax. For absolute addresses, registers should not
17569 be surrounded by brackets. */
17570 if (!REG_P (x))
17572 putc ('[', file);
17573 ix86_print_operand (file, x, 0);
17574 putc (']', file);
17575 return;
17577 break;
17579 default:
17580 gcc_unreachable ();
17583 ix86_print_operand (file, x, 0);
17584 return;
17586 case 'E':
17587 /* Wrap address in an UNSPEC to declare special handling. */
17588 if (TARGET_64BIT)
17589 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17591 output_address (VOIDmode, x);
17592 return;
17594 case 'L':
17595 if (ASSEMBLER_DIALECT == ASM_ATT)
17596 putc ('l', file);
17597 return;
17599 case 'W':
17600 if (ASSEMBLER_DIALECT == ASM_ATT)
17601 putc ('w', file);
17602 return;
17604 case 'B':
17605 if (ASSEMBLER_DIALECT == ASM_ATT)
17606 putc ('b', file);
17607 return;
17609 case 'Q':
17610 if (ASSEMBLER_DIALECT == ASM_ATT)
17611 putc ('l', file);
17612 return;
17614 case 'S':
17615 if (ASSEMBLER_DIALECT == ASM_ATT)
17616 putc ('s', file);
17617 return;
17619 case 'T':
17620 if (ASSEMBLER_DIALECT == ASM_ATT)
17621 putc ('t', file);
17622 return;
17624 case 'O':
17625 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17626 if (ASSEMBLER_DIALECT != ASM_ATT)
17627 return;
17629 switch (GET_MODE_SIZE (GET_MODE (x)))
17631 case 2:
17632 putc ('w', file);
17633 break;
17635 case 4:
17636 putc ('l', file);
17637 break;
17639 case 8:
17640 putc ('q', file);
17641 break;
17643 default:
17644 output_operand_lossage
17645 ("invalid operand size for operand code 'O'");
17646 return;
17649 putc ('.', file);
17650 #endif
17651 return;
17653 case 'z':
17654 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17656 /* Opcodes don't get size suffixes if using Intel syntax. */
17657 if (ASSEMBLER_DIALECT == ASM_INTEL)
17658 return;
17660 switch (GET_MODE_SIZE (GET_MODE (x)))
17662 case 1:
17663 putc ('b', file);
17664 return;
17666 case 2:
17667 putc ('w', file);
17668 return;
17670 case 4:
17671 putc ('l', file);
17672 return;
17674 case 8:
17675 putc ('q', file);
17676 return;
17678 default:
17679 output_operand_lossage
17680 ("invalid operand size for operand code 'z'");
17681 return;
17685 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17686 warning
17687 (0, "non-integer operand used with operand code 'z'");
17688 /* FALLTHRU */
17690 case 'Z':
17691 /* 387 opcodes don't get size suffixes if using Intel syntax. */
17692 if (ASSEMBLER_DIALECT == ASM_INTEL)
17693 return;
17695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17697 switch (GET_MODE_SIZE (GET_MODE (x)))
17699 case 2:
17700 #ifdef HAVE_AS_IX86_FILDS
17701 putc ('s', file);
17702 #endif
17703 return;
17705 case 4:
17706 putc ('l', file);
17707 return;
17709 case 8:
17710 #ifdef HAVE_AS_IX86_FILDQ
17711 putc ('q', file);
17712 #else
17713 fputs ("ll", file);
17714 #endif
17715 return;
17717 default:
17718 break;
17721 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17723 /* 387 opcodes don't get size suffixes
17724 if the operands are registers. */
17725 if (STACK_REG_P (x))
17726 return;
17728 switch (GET_MODE_SIZE (GET_MODE (x)))
17730 case 4:
17731 putc ('s', file);
17732 return;
17734 case 8:
17735 putc ('l', file);
17736 return;
17738 case 12:
17739 case 16:
17740 putc ('t', file);
17741 return;
17743 default:
17744 break;
17747 else
17749 output_operand_lossage
17750 ("invalid operand type used with operand code 'Z'");
17751 return;
17754 output_operand_lossage
17755 ("invalid operand size for operand code 'Z'");
17756 return;
17758 case 'd':
17759 case 'b':
17760 case 'w':
17761 case 'k':
17762 case 'q':
17763 case 'h':
17764 case 't':
17765 case 'g':
17766 case 'y':
17767 case 'x':
17768 case 'X':
17769 case 'P':
17770 case 'p':
17771 break;
17773 case 's':
17774 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17776 ix86_print_operand (file, x, 0);
17777 fputs (", ", file);
17779 return;
17781 case 'Y':
17782 switch (GET_CODE (x))
17784 case NE:
17785 fputs ("neq", file);
17786 break;
17787 case EQ:
17788 fputs ("eq", file);
17789 break;
17790 case GE:
17791 case GEU:
17792 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17793 break;
17794 case GT:
17795 case GTU:
17796 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17797 break;
17798 case LE:
17799 case LEU:
17800 fputs ("le", file);
17801 break;
17802 case LT:
17803 case LTU:
17804 fputs ("lt", file);
17805 break;
17806 case UNORDERED:
17807 fputs ("unord", file);
17808 break;
17809 case ORDERED:
17810 fputs ("ord", file);
17811 break;
17812 case UNEQ:
17813 fputs ("ueq", file);
17814 break;
17815 case UNGE:
17816 fputs ("nlt", file);
17817 break;
17818 case UNGT:
17819 fputs ("nle", file);
17820 break;
17821 case UNLE:
17822 fputs ("ule", file);
17823 break;
17824 case UNLT:
17825 fputs ("ult", file);
17826 break;
17827 case LTGT:
17828 fputs ("une", file);
17829 break;
17830 default:
17831 output_operand_lossage ("operand is not a condition code, "
17832 "invalid operand code 'Y'");
17833 return;
17835 return;
17837 case 'D':
17838 /* Little bit of braindamage here. The SSE compare instructions
17839 use completely different names for the comparisons than do the
17840 fp conditional moves. */
17841 switch (GET_CODE (x))
17843 case UNEQ:
17844 if (TARGET_AVX)
17846 fputs ("eq_us", file);
17847 break;
17849 /* FALLTHRU */
17850 case EQ:
17851 fputs ("eq", file);
17852 break;
17853 case UNLT:
17854 if (TARGET_AVX)
17856 fputs ("nge", file);
17857 break;
17859 /* FALLTHRU */
17860 case LT:
17861 fputs ("lt", file);
17862 break;
17863 case UNLE:
17864 if (TARGET_AVX)
17866 fputs ("ngt", file);
17867 break;
17869 /* FALLTHRU */
17870 case LE:
17871 fputs ("le", file);
17872 break;
17873 case UNORDERED:
17874 fputs ("unord", file);
17875 break;
17876 case LTGT:
17877 if (TARGET_AVX)
17879 fputs ("neq_oq", file);
17880 break;
17882 /* FALLTHRU */
17883 case NE:
17884 fputs ("neq", file);
17885 break;
17886 case GE:
17887 if (TARGET_AVX)
17889 fputs ("ge", file);
17890 break;
17892 /* FALLTHRU */
17893 case UNGE:
17894 fputs ("nlt", file);
17895 break;
17896 case GT:
17897 if (TARGET_AVX)
17899 fputs ("gt", file);
17900 break;
17902 /* FALLTHRU */
17903 case UNGT:
17904 fputs ("nle", file);
17905 break;
17906 case ORDERED:
17907 fputs ("ord", file);
17908 break;
17909 default:
17910 output_operand_lossage ("operand is not a condition code, "
17911 "invalid operand code 'D'");
17912 return;
17914 return;
17916 case 'F':
17917 case 'f':
17918 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17919 if (ASSEMBLER_DIALECT == ASM_ATT)
17920 putc ('.', file);
17921 gcc_fallthrough ();
17922 #endif
17924 case 'C':
17925 case 'c':
17926 if (!COMPARISON_P (x))
17928 output_operand_lossage ("operand is not a condition code, "
17929 "invalid operand code '%c'", code);
17930 return;
17932 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17933 code == 'c' || code == 'f',
17934 code == 'F' || code == 'f',
17935 file);
17936 return;
17938 case 'H':
17939 if (!offsettable_memref_p (x))
17941 output_operand_lossage ("operand is not an offsettable memory "
17942 "reference, invalid operand code 'H'");
17943 return;
17945 /* It doesn't actually matter what mode we use here, as we're
17946 only going to use this for printing. */
17947 x = adjust_address_nv (x, DImode, 8);
17948 /* Output 'qword ptr' for intel assembler dialect. */
17949 if (ASSEMBLER_DIALECT == ASM_INTEL)
17950 code = 'q';
17951 break;
17953 case 'K':
17954 gcc_assert (CONST_INT_P (x));
17956 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17957 #ifdef HAVE_AS_IX86_HLE
17958 fputs ("xacquire ", file);
17959 #else
17960 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17961 #endif
17962 else if (INTVAL (x) & IX86_HLE_RELEASE)
17963 #ifdef HAVE_AS_IX86_HLE
17964 fputs ("xrelease ", file);
17965 #else
17966 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17967 #endif
17968 /* We do not want to print the value of the operand. */
17969 return;
17971 case 'N':
17972 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17973 fputs ("{z}", file);
17974 return;
17976 case 'r':
17977 gcc_assert (CONST_INT_P (x));
17978 gcc_assert (INTVAL (x) == ROUND_SAE);
17980 if (ASSEMBLER_DIALECT == ASM_INTEL)
17981 fputs (", ", file);
17983 fputs ("{sae}", file);
17985 if (ASSEMBLER_DIALECT == ASM_ATT)
17986 fputs (", ", file);
17988 return;
17990 case 'R':
17991 gcc_assert (CONST_INT_P (x));
17993 if (ASSEMBLER_DIALECT == ASM_INTEL)
17994 fputs (", ", file);
17996 switch (INTVAL (x))
17998 case ROUND_NEAREST_INT | ROUND_SAE:
17999 fputs ("{rn-sae}", file);
18000 break;
18001 case ROUND_NEG_INF | ROUND_SAE:
18002 fputs ("{rd-sae}", file);
18003 break;
18004 case ROUND_POS_INF | ROUND_SAE:
18005 fputs ("{ru-sae}", file);
18006 break;
18007 case ROUND_ZERO | ROUND_SAE:
18008 fputs ("{rz-sae}", file);
18009 break;
18010 default:
18011 gcc_unreachable ();
18014 if (ASSEMBLER_DIALECT == ASM_ATT)
18015 fputs (", ", file);
18017 return;
18019 case '*':
18020 if (ASSEMBLER_DIALECT == ASM_ATT)
18021 putc ('*', file);
18022 return;
18024 case '&':
18026 const char *name = get_some_local_dynamic_name ();
18027 if (name == NULL)
18028 output_operand_lossage ("'%%&' used without any "
18029 "local dynamic TLS references");
18030 else
18031 assemble_name (file, name);
18032 return;
18035 case '+':
18037 rtx x;
18039 if (!optimize
18040 || optimize_function_for_size_p (cfun)
18041 || !TARGET_BRANCH_PREDICTION_HINTS)
18042 return;
18044 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18045 if (x)
18047 int pred_val = XINT (x, 0);
18049 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18050 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18052 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18053 bool cputaken
18054 = final_forward_branch_p (current_output_insn) == 0;
18056 /* Emit hints only when the default branch prediction
18057 heuristics would fail. */
18058 if (taken != cputaken)
18060 /* We use 3e (DS) prefix for taken branches and
18061 2e (CS) prefix for not taken branches. */
18062 if (taken)
18063 fputs ("ds ; ", file);
18064 else
18065 fputs ("cs ; ", file);
18069 return;
18072 case ';':
18073 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18074 putc (';', file);
18075 #endif
18076 return;
18078 case '@':
18079 if (ASSEMBLER_DIALECT == ASM_ATT)
18080 putc ('%', file);
18082 /* The kernel uses a different segment register for performance
18083 reasons; a system call would not have to trash the userspace
18084 segment register, which would be expensive. */
18085 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
18086 fputs ("fs", file);
18087 else
18088 fputs ("gs", file);
18089 return;
18091 case '~':
18092 putc (TARGET_AVX2 ? 'i' : 'f', file);
18093 return;
18095 case '^':
18096 if (TARGET_64BIT && Pmode != word_mode)
18097 fputs ("addr32 ", file);
18098 return;
18100 case '!':
18101 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18102 fputs ("bnd ", file);
18103 return;
18105 default:
18106 output_operand_lossage ("invalid operand code '%c'", code);
18110 if (REG_P (x))
18111 print_reg (x, code, file);
18113 else if (MEM_P (x))
18115 rtx addr = XEXP (x, 0);
18117 /* No `byte ptr' prefix for call instructions ... */
18118 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18120 machine_mode mode = GET_MODE (x);
18121 const char *size;
18123 /* Check for explicit size override codes. */
18124 if (code == 'b')
18125 size = "BYTE";
18126 else if (code == 'w')
18127 size = "WORD";
18128 else if (code == 'k')
18129 size = "DWORD";
18130 else if (code == 'q')
18131 size = "QWORD";
18132 else if (code == 'x')
18133 size = "XMMWORD";
18134 else if (code == 't')
18135 size = "YMMWORD";
18136 else if (code == 'g')
18137 size = "ZMMWORD";
18138 else if (mode == BLKmode)
18139 /* ... or BLKmode operands, when not overridden. */
18140 size = NULL;
18141 else
18142 switch (GET_MODE_SIZE (mode))
18144 case 1: size = "BYTE"; break;
18145 case 2: size = "WORD"; break;
18146 case 4: size = "DWORD"; break;
18147 case 8: size = "QWORD"; break;
18148 case 12: size = "TBYTE"; break;
18149 case 16:
18150 if (mode == XFmode)
18151 size = "TBYTE";
18152 else
18153 size = "XMMWORD";
18154 break;
18155 case 32: size = "YMMWORD"; break;
18156 case 64: size = "ZMMWORD"; break;
18157 default:
18158 gcc_unreachable ();
18160 if (size)
18162 fputs (size, file);
18163 fputs (" PTR ", file);
18167 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18168 output_operand_lossage ("invalid constraints for operand");
18169 else
18170 ix86_print_operand_address_as
18171 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18174 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18176 long l;
18178 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18180 if (ASSEMBLER_DIALECT == ASM_ATT)
18181 putc ('$', file);
18182 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18183 if (code == 'q')
18184 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18185 (unsigned long long) (int) l);
18186 else
18187 fprintf (file, "0x%08x", (unsigned int) l);
18190 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18192 long l[2];
18194 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18196 if (ASSEMBLER_DIALECT == ASM_ATT)
18197 putc ('$', file);
18198 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18201 /* These float cases don't actually occur as immediate operands. */
18202 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18204 char dstr[30];
18206 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18207 fputs (dstr, file);
18210 else
18212 /* We have patterns that allow zero sets of memory, for instance.
18213 In 64-bit mode, we should probably support all 8-byte vectors,
18214 since we can in fact encode that into an immediate. */
18215 if (GET_CODE (x) == CONST_VECTOR)
18217 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18218 x = const0_rtx;
18221 if (code != 'P' && code != 'p')
18223 if (CONST_INT_P (x))
18225 if (ASSEMBLER_DIALECT == ASM_ATT)
18226 putc ('$', file);
18228 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18229 || GET_CODE (x) == LABEL_REF)
18231 if (ASSEMBLER_DIALECT == ASM_ATT)
18232 putc ('$', file);
18233 else
18234 fputs ("OFFSET FLAT:", file);
18237 if (CONST_INT_P (x))
18238 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18239 else if (flag_pic || MACHOPIC_INDIRECT)
18240 output_pic_addr_const (file, x, code);
18241 else
18242 output_addr_const (file, x);
18246 static bool
18247 ix86_print_operand_punct_valid_p (unsigned char code)
18249 return (code == '@' || code == '*' || code == '+' || code == '&'
18250 || code == ';' || code == '~' || code == '^' || code == '!');
18253 /* Print a memory operand whose address is ADDR. */
18255 static void
18256 ix86_print_operand_address_as (FILE *file, rtx addr,
18257 addr_space_t as, bool no_rip)
18259 struct ix86_address parts;
18260 rtx base, index, disp;
18261 int scale;
18262 int ok;
18263 bool vsib = false;
18264 int code = 0;
18266 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18268 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18269 gcc_assert (parts.index == NULL_RTX);
18270 parts.index = XVECEXP (addr, 0, 1);
18271 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18272 addr = XVECEXP (addr, 0, 0);
18273 vsib = true;
18275 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18277 gcc_assert (TARGET_64BIT);
18278 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18279 code = 'q';
18281 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18283 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18284 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18285 if (parts.base != NULL_RTX)
18287 parts.index = parts.base;
18288 parts.scale = 1;
18290 parts.base = XVECEXP (addr, 0, 0);
18291 addr = XVECEXP (addr, 0, 0);
18293 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18295 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18296 gcc_assert (parts.index == NULL_RTX);
18297 parts.index = XVECEXP (addr, 0, 1);
18298 addr = XVECEXP (addr, 0, 0);
18300 else
18301 ok = ix86_decompose_address (addr, &parts);
18303 gcc_assert (ok);
18305 base = parts.base;
18306 index = parts.index;
18307 disp = parts.disp;
18308 scale = parts.scale;
18310 if (ADDR_SPACE_GENERIC_P (as))
18311 as = parts.seg;
18312 else
18313 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18315 if (!ADDR_SPACE_GENERIC_P (as))
18317 const char *string;
18319 if (as == ADDR_SPACE_SEG_FS)
18320 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18321 else if (as == ADDR_SPACE_SEG_GS)
18322 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18323 else
18324 gcc_unreachable ();
18325 fputs (string, file);
18328 /* Use the one-byte-shorter RIP-relative addressing in 64-bit mode. */
18329 if (TARGET_64BIT && !base && !index && !no_rip)
18331 rtx symbol = disp;
18333 if (GET_CODE (disp) == CONST
18334 && GET_CODE (XEXP (disp, 0)) == PLUS
18335 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18336 symbol = XEXP (XEXP (disp, 0), 0);
18338 if (GET_CODE (symbol) == LABEL_REF
18339 || (GET_CODE (symbol) == SYMBOL_REF
18340 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18341 base = pc_rtx;
18344 if (!base && !index)
18346 /* A displacement-only address requires special attention. */
18347 if (CONST_INT_P (disp))
18349 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
18350 fputs ("ds:", file);
18351 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18353 /* Load the external function address via the GOT slot to avoid PLT. */
18354 else if (GET_CODE (disp) == CONST
18355 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18356 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18357 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18358 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18359 output_pic_addr_const (file, disp, 0);
18360 else if (flag_pic)
18361 output_pic_addr_const (file, disp, 0);
18362 else
18363 output_addr_const (file, disp);
18365 else
18367 /* Print SImode register names to force addr32 prefix. */
18368 if (SImode_address_operand (addr, VOIDmode))
18370 if (flag_checking)
18372 gcc_assert (TARGET_64BIT);
18373 switch (GET_CODE (addr))
18375 case SUBREG:
18376 gcc_assert (GET_MODE (addr) == SImode);
18377 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18378 break;
18379 case ZERO_EXTEND:
18380 case AND:
18381 gcc_assert (GET_MODE (addr) == DImode);
18382 break;
18383 default:
18384 gcc_unreachable ();
18387 gcc_assert (!code);
18388 code = 'k';
18390 else if (code == 0
18391 && TARGET_X32
18392 && disp
18393 && CONST_INT_P (disp)
18394 && INTVAL (disp) < -16*1024*1024)
18396 /* X32 runs in 64-bit mode, where displacement, DISP, in
18397 address DISP(%r64), is encoded as 32-bit immediate sign-
18398 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18399 address is %r64 + 0xffffffffbffffd00. When %r64 <
18400 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18401 which is invalid for x32. The correct address is %r64
18402 - 0x40000300 == 0xf7ffdd64. To properly encode
18403 -0x40000300(%r64) for x32, we zero-extend negative
18404 displacement by forcing addr32 prefix which truncates
18405 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18406 zero-extend all negative displacements, including -1(%rsp).
18407 However, for small negative displacements, sign-extension
18408 won't cause overflow. We only zero-extend negative
18409 displacements if they are < -16*1024*1024, which is also used
18410 to check legitimate address displacements for PIC. */
18411 code = 'k';
18414 if (ASSEMBLER_DIALECT == ASM_ATT)
18416 if (disp)
18418 if (flag_pic)
18419 output_pic_addr_const (file, disp, 0);
18420 else if (GET_CODE (disp) == LABEL_REF)
18421 output_asm_label (disp);
18422 else
18423 output_addr_const (file, disp);
18426 putc ('(', file);
18427 if (base)
18428 print_reg (base, code, file);
18429 if (index)
18431 putc (',', file);
18432 print_reg (index, vsib ? 0 : code, file);
18433 if (scale != 1 || vsib)
18434 fprintf (file, ",%d", scale);
18436 putc (')', file);
18438 else
18440 rtx offset = NULL_RTX;
18442 if (disp)
18444 /* Pull out the offset of a symbol; print any symbol itself. */
18445 if (GET_CODE (disp) == CONST
18446 && GET_CODE (XEXP (disp, 0)) == PLUS
18447 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18449 offset = XEXP (XEXP (disp, 0), 1);
18450 disp = gen_rtx_CONST (VOIDmode,
18451 XEXP (XEXP (disp, 0), 0));
18454 if (flag_pic)
18455 output_pic_addr_const (file, disp, 0);
18456 else if (GET_CODE (disp) == LABEL_REF)
18457 output_asm_label (disp);
18458 else if (CONST_INT_P (disp))
18459 offset = disp;
18460 else
18461 output_addr_const (file, disp);
18464 putc ('[', file);
18465 if (base)
18467 print_reg (base, code, file);
18468 if (offset)
18470 if (INTVAL (offset) >= 0)
18471 putc ('+', file);
18472 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18475 else if (offset)
18476 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18477 else
18478 putc ('0', file);
18480 if (index)
18482 putc ('+', file);
18483 print_reg (index, vsib ? 0 : code, file);
18484 if (scale != 1 || vsib)
18485 fprintf (file, "*%d", scale);
18487 putc (']', file);
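/* Illustrative sketch (not part of the original file): assuming a base of
   %rbp, an index of %rax, scale 4 and displacement -4, the two dialect
   branches above would print the address roughly as

       -4(%rbp,%rax,4)      AT&T dialect
       [rbp-4+rax*4]        Intel dialect

   with the exact register spelling further controlled by the print_reg
   modifier CODE ('k', 'q', ...) chosen earlier in this function.  */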
18492 static void
18493 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18495 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18498 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18500 static bool
18501 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18503 rtx op;
18505 if (GET_CODE (x) != UNSPEC)
18506 return false;
18508 op = XVECEXP (x, 0, 0);
18509 switch (XINT (x, 1))
18511 case UNSPEC_GOTTPOFF:
18512 output_addr_const (file, op);
18513 /* FIXME: This might be @TPOFF in Sun ld. */
18514 fputs ("@gottpoff", file);
18515 break;
18516 case UNSPEC_TPOFF:
18517 output_addr_const (file, op);
18518 fputs ("@tpoff", file);
18519 break;
18520 case UNSPEC_NTPOFF:
18521 output_addr_const (file, op);
18522 if (TARGET_64BIT)
18523 fputs ("@tpoff", file);
18524 else
18525 fputs ("@ntpoff", file);
18526 break;
18527 case UNSPEC_DTPOFF:
18528 output_addr_const (file, op);
18529 fputs ("@dtpoff", file);
18530 break;
18531 case UNSPEC_GOTNTPOFF:
18532 output_addr_const (file, op);
18533 if (TARGET_64BIT)
18534 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18535 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18536 else
18537 fputs ("@gotntpoff", file);
18538 break;
18539 case UNSPEC_INDNTPOFF:
18540 output_addr_const (file, op);
18541 fputs ("@indntpoff", file);
18542 break;
18543 #if TARGET_MACHO
18544 case UNSPEC_MACHOPIC_OFFSET:
18545 output_addr_const (file, op);
18546 putc ('-', file);
18547 machopic_output_function_base_name (file);
18548 break;
18549 #endif
18551 case UNSPEC_STACK_CHECK:
18553 int offset;
18555 gcc_assert (flag_split_stack);
18557 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
18558 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
18559 #else
18560 gcc_unreachable ();
18561 #endif
18563 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
18565 break;
18567 default:
18568 return false;
18571 return true;
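/* Illustrative sketch (not part of the original file): for a hypothetical
   TLS symbol x, the cases above emit relocation suffixes such as

       x@gottpoff             UNSPEC_GOTTPOFF
       x@tpoff / x@ntpoff     UNSPEC_NTPOFF (64-bit / 32-bit)
       x@gottpoff(%rip)       UNSPEC_GOTNTPOFF, 64-bit AT&T dialect

   which the assembler turns into the corresponding TLS relocations.  */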
18574 /* Split one or more double-mode RTL references into pairs of half-mode
18575 references. The RTL can be REG, offsettable MEM, integer constant, or
18576 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18577 split and "num" is its length. lo_half and hi_half are output arrays
18578 that parallel "operands". */
18580 void
18581 split_double_mode (machine_mode mode, rtx operands[],
18582 int num, rtx lo_half[], rtx hi_half[])
18584 machine_mode half_mode;
18585 unsigned int byte;
18587 switch (mode)
18589 case TImode:
18590 half_mode = DImode;
18591 break;
18592 case DImode:
18593 half_mode = SImode;
18594 break;
18595 default:
18596 gcc_unreachable ();
18599 byte = GET_MODE_SIZE (half_mode);
18601 while (num--)
18603 rtx op = operands[num];
18605 /* simplify_subreg refuses to split volatile memory addresses,
18606 but we still have to handle them. */
18607 if (MEM_P (op))
18609 lo_half[num] = adjust_address (op, half_mode, 0);
18610 hi_half[num] = adjust_address (op, half_mode, byte);
18612 else
18614 lo_half[num] = simplify_gen_subreg (half_mode, op,
18615 GET_MODE (op) == VOIDmode
18616 ? mode : GET_MODE (op), 0);
18617 hi_half[num] = simplify_gen_subreg (half_mode, op,
18618 GET_MODE (op) == VOIDmode
18619 ? mode : GET_MODE (op), byte);
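/* Hypothetical caller sketch (not part of the original file): splitting a
   single DImode operand OP into its SImode halves might look like

     rtx ops[1] = { op };
     rtx lo[1], hi[1];
     split_double_mode (DImode, ops, 1, lo, hi);

   after which lo[0] is the SImode half at byte offset 0 and hi[0] the half
   at byte offset 4, taken from adjust_address for a MEM and from
   simplify_gen_subreg otherwise, as in the code above.  */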
18624 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18625 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18626 is the expression of the binary operation. The output may either be
18627 emitted here, or returned to the caller, like all output_* functions.
18629 There is no guarantee that the operands are the same mode, as they
18630 might be within FLOAT or FLOAT_EXTEND expressions. */
18632 #ifndef SYSV386_COMPAT
18633 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18634 wants to fix the assemblers because that causes incompatibility
18635 with gcc. No-one wants to fix gcc because that causes
18636 incompatibility with assemblers... You can use the option of
18637 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18638 #define SYSV386_COMPAT 1
18639 #endif
18641 const char *
18642 output_387_binary_op (rtx insn, rtx *operands)
18644 static char buf[40];
18645 const char *p;
18646 const char *ssep;
18647 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
18649 /* Even if we do not want to check the inputs, this documents the input
18650 constraints, which helps in understanding the following code. */
18651 if (flag_checking)
18653 if (STACK_REG_P (operands[0])
18654 && ((REG_P (operands[1])
18655 && REGNO (operands[0]) == REGNO (operands[1])
18656 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18657 || (REG_P (operands[2])
18658 && REGNO (operands[0]) == REGNO (operands[2])
18659 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18660 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18661 ; /* ok */
18662 else
18663 gcc_assert (is_sse);
18666 switch (GET_CODE (operands[3]))
18668 case PLUS:
18669 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18670 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18671 p = "fiadd";
18672 else
18673 p = "fadd";
18674 ssep = "vadd";
18675 break;
18677 case MINUS:
18678 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18679 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18680 p = "fisub";
18681 else
18682 p = "fsub";
18683 ssep = "vsub";
18684 break;
18686 case MULT:
18687 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18688 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18689 p = "fimul";
18690 else
18691 p = "fmul";
18692 ssep = "vmul";
18693 break;
18695 case DIV:
18696 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18697 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18698 p = "fidiv";
18699 else
18700 p = "fdiv";
18701 ssep = "vdiv";
18702 break;
18704 default:
18705 gcc_unreachable ();
18708 if (is_sse)
18710 if (TARGET_AVX)
18712 strcpy (buf, ssep);
18713 if (GET_MODE (operands[0]) == SFmode)
18714 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
18715 else
18716 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
18718 else
18720 strcpy (buf, ssep + 1);
18721 if (GET_MODE (operands[0]) == SFmode)
18722 strcat (buf, "ss\t{%2, %0|%0, %2}");
18723 else
18724 strcat (buf, "sd\t{%2, %0|%0, %2}");
18726 return buf;
18728 strcpy (buf, p);
18730 switch (GET_CODE (operands[3]))
18732 case MULT:
18733 case PLUS:
18734 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18735 std::swap (operands[1], operands[2]);
18737 /* We now know operands[0] == operands[1]. */
18739 if (MEM_P (operands[2]))
18741 p = "%Z2\t%2";
18742 break;
18745 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18747 if (STACK_TOP_P (operands[0]))
18748 /* How is it that we are storing to a dead operand[2]?
18749 Well, presumably operands[1] is dead too. We can't
18750 store the result to st(0) as st(0) gets popped on this
18751 instruction. Instead store to operands[2] (which I
18752 think has to be st(1)). st(1) will be popped later.
18753 gcc <= 2.8.1 didn't have this check and generated
18754 assembly code that the Unixware assembler rejected. */
18755 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18756 else
18757 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18758 break;
18761 if (STACK_TOP_P (operands[0]))
18762 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18763 else
18764 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18765 break;
18767 case MINUS:
18768 case DIV:
18769 if (MEM_P (operands[1]))
18771 p = "r%Z1\t%1";
18772 break;
18775 if (MEM_P (operands[2]))
18777 p = "%Z2\t%2";
18778 break;
18781 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18783 #if SYSV386_COMPAT
18784 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18785 derived assemblers, confusingly reverse the direction of
18786 the operation for fsub{r} and fdiv{r} when the
18787 destination register is not st(0). The Intel assembler
18788 doesn't have this brain damage. Read !SYSV386_COMPAT to
18789 figure out what the hardware really does. */
18790 if (STACK_TOP_P (operands[0]))
18791 p = "{p\t%0, %2|rp\t%2, %0}";
18792 else
18793 p = "{rp\t%2, %0|p\t%0, %2}";
18794 #else
18795 if (STACK_TOP_P (operands[0]))
18796 /* As above for fmul/fadd, we can't store to st(0). */
18797 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18798 else
18799 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18800 #endif
18801 break;
18804 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18806 #if SYSV386_COMPAT
18807 if (STACK_TOP_P (operands[0]))
18808 p = "{rp\t%0, %1|p\t%1, %0}";
18809 else
18810 p = "{p\t%1, %0|rp\t%0, %1}";
18811 #else
18812 if (STACK_TOP_P (operands[0]))
18813 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18814 else
18815 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18816 #endif
18817 break;
18820 if (STACK_TOP_P (operands[0]))
18822 if (STACK_TOP_P (operands[1]))
18823 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18824 else
18825 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18826 break;
18828 else if (STACK_TOP_P (operands[1]))
18830 #if SYSV386_COMPAT
18831 p = "{\t%1, %0|r\t%0, %1}";
18832 #else
18833 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18834 #endif
18836 else
18838 #if SYSV386_COMPAT
18839 p = "{r\t%2, %0|\t%0, %2}";
18840 #else
18841 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18842 #endif
18844 break;
18846 default:
18847 gcc_unreachable ();
18850 strcat (buf, p);
18851 return buf;
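/* Illustrative sketch (not part of the original file): for a PLUS in SFmode
   the code above builds templates along the lines of

       vaddss\t{%2, %1, %0|%0, %1, %2}    AVX three-operand form
       addss\t{%2, %0|%0, %2}             SSE two-operand form (the leading
                                          'v' of "vadd" stripped via ssep + 1)
       fadd%Z2\t%2                        x87 with a memory operands[2]

   where the {att|intel} braces select the dialect-specific operand order.  */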
18854 /* Return needed mode for entity in optimize_mode_switching pass. */
18856 static int
18857 ix86_dirflag_mode_needed (rtx_insn *insn)
18859 if (CALL_P (insn))
18861 if (cfun->machine->func_type == TYPE_NORMAL)
18862 return X86_DIRFLAG_ANY;
18863 else
18864 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18865 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18868 if (recog_memoized (insn) < 0)
18869 return X86_DIRFLAG_ANY;
18871 if (get_attr_type (insn) == TYPE_STR)
18873 /* Emit cld instruction if stringops are used in the function. */
18874 if (cfun->machine->func_type == TYPE_NORMAL)
18875 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18876 else
18877 return X86_DIRFLAG_RESET;
18880 return X86_DIRFLAG_ANY;
18883 /* Check if a 256bit AVX register is referenced inside of EXP. */
18885 static bool
18886 ix86_check_avx256_register (const_rtx exp)
18888 if (SUBREG_P (exp))
18889 exp = SUBREG_REG (exp);
18891 return (REG_P (exp)
18892 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
18895 /* Return needed mode for entity in optimize_mode_switching pass. */
18897 static int
18898 ix86_avx_u128_mode_needed (rtx_insn *insn)
18900 if (CALL_P (insn))
18902 rtx link;
18904 /* Needed mode is set to AVX_U128_CLEAN if there are
18905 no 256bit modes used in function arguments. */
18906 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18907 link;
18908 link = XEXP (link, 1))
18910 if (GET_CODE (XEXP (link, 0)) == USE)
18912 rtx arg = XEXP (XEXP (link, 0), 0);
18914 if (ix86_check_avx256_register (arg))
18915 return AVX_U128_DIRTY;
18919 return AVX_U128_CLEAN;
18922 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
18923 changes state only when a 256bit register is written to, but we need
18924 to prevent the compiler from moving the optimal insertion point above
18925 an eventual read from a 256bit register. */
18926 subrtx_iterator::array_type array;
18927 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18928 if (ix86_check_avx256_register (*iter))
18929 return AVX_U128_DIRTY;
18931 return AVX_U128_ANY;
18934 /* Return mode that i387 must be switched into
18935 prior to the execution of insn. */
18937 static int
18938 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18940 enum attr_i387_cw mode;
18942 /* The mode UNINITIALIZED is used to store the control word after a
18943 function call or ASM pattern. The mode ANY specifies that the function
18944 has no requirements on the control word and makes no changes in the
18945 bits we are interested in. */
18947 if (CALL_P (insn)
18948 || (NONJUMP_INSN_P (insn)
18949 && (asm_noperands (PATTERN (insn)) >= 0
18950 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18951 return I387_CW_UNINITIALIZED;
18953 if (recog_memoized (insn) < 0)
18954 return I387_CW_ANY;
18956 mode = get_attr_i387_cw (insn);
18958 switch (entity)
18960 case I387_TRUNC:
18961 if (mode == I387_CW_TRUNC)
18962 return mode;
18963 break;
18965 case I387_FLOOR:
18966 if (mode == I387_CW_FLOOR)
18967 return mode;
18968 break;
18970 case I387_CEIL:
18971 if (mode == I387_CW_CEIL)
18972 return mode;
18973 break;
18975 case I387_MASK_PM:
18976 if (mode == I387_CW_MASK_PM)
18977 return mode;
18978 break;
18980 default:
18981 gcc_unreachable ();
18984 return I387_CW_ANY;
18987 /* Return mode that entity must be switched into
18988 prior to the execution of insn. */
18990 static int
18991 ix86_mode_needed (int entity, rtx_insn *insn)
18993 switch (entity)
18995 case X86_DIRFLAG:
18996 return ix86_dirflag_mode_needed (insn);
18997 case AVX_U128:
18998 return ix86_avx_u128_mode_needed (insn);
18999 case I387_TRUNC:
19000 case I387_FLOOR:
19001 case I387_CEIL:
19002 case I387_MASK_PM:
19003 return ix86_i387_mode_needed (entity, insn);
19004 default:
19005 gcc_unreachable ();
19007 return 0;
19010 /* Check if a 256bit AVX register is referenced in stores. */
19012 static void
19013 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
19015 if (ix86_check_avx256_register (dest))
19017 bool *used = (bool *) data;
19018 *used = true;
19022 /* Calculate mode of upper 128bit AVX registers after the insn. */
19024 static int
19025 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19027 rtx pat = PATTERN (insn);
19029 if (vzeroupper_operation (pat, VOIDmode)
19030 || vzeroall_operation (pat, VOIDmode))
19031 return AVX_U128_CLEAN;
19033 /* We know that the state is clean after a CALL insn if no 256bit
19034 register is used for the function return value. */
19035 if (CALL_P (insn))
19037 bool avx_reg256_found = false;
19038 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
19040 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19043 /* Otherwise, return current mode. Remember that if insn
19044 references AVX 256bit registers, the mode was already changed
19045 to DIRTY from MODE_NEEDED. */
19046 return mode;
19049 /* Return the mode that an insn results in. */
19051 static int
19052 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19054 switch (entity)
19056 case X86_DIRFLAG:
19057 return mode;
19058 case AVX_U128:
19059 return ix86_avx_u128_mode_after (mode, insn);
19060 case I387_TRUNC:
19061 case I387_FLOOR:
19062 case I387_CEIL:
19063 case I387_MASK_PM:
19064 return mode;
19065 default:
19066 gcc_unreachable ();
19070 static int
19071 ix86_dirflag_mode_entry (void)
19073 /* For TARGET_CLD or in the interrupt handler we can't assume
19074 direction flag state at function entry. */
19075 if (TARGET_CLD
19076 || cfun->machine->func_type != TYPE_NORMAL)
19077 return X86_DIRFLAG_ANY;
19079 return X86_DIRFLAG_RESET;
19082 static int
19083 ix86_avx_u128_mode_entry (void)
19085 tree arg;
19087 /* Entry mode is set to AVX_U128_DIRTY if there are
19088 256bit modes used in function arguments. */
19089 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19090 arg = TREE_CHAIN (arg))
19092 rtx incoming = DECL_INCOMING_RTL (arg);
19094 if (incoming && ix86_check_avx256_register (incoming))
19095 return AVX_U128_DIRTY;
19098 return AVX_U128_CLEAN;
19101 /* Return a mode that ENTITY is assumed to be
19102 switched to at function entry. */
19104 static int
19105 ix86_mode_entry (int entity)
19107 switch (entity)
19109 case X86_DIRFLAG:
19110 return ix86_dirflag_mode_entry ();
19111 case AVX_U128:
19112 return ix86_avx_u128_mode_entry ();
19113 case I387_TRUNC:
19114 case I387_FLOOR:
19115 case I387_CEIL:
19116 case I387_MASK_PM:
19117 return I387_CW_ANY;
19118 default:
19119 gcc_unreachable ();
19123 static int
19124 ix86_avx_u128_mode_exit (void)
19126 rtx reg = crtl->return_rtx;
19128 /* Exit mode is set to AVX_U128_DIRTY if there are
19129 256bit modes used in the function return register. */
19130 if (reg && ix86_check_avx256_register (reg))
19131 return AVX_U128_DIRTY;
19133 return AVX_U128_CLEAN;
19136 /* Return a mode that ENTITY is assumed to be
19137 switched to at function exit. */
19139 static int
19140 ix86_mode_exit (int entity)
19142 switch (entity)
19144 case X86_DIRFLAG:
19145 return X86_DIRFLAG_ANY;
19146 case AVX_U128:
19147 return ix86_avx_u128_mode_exit ();
19148 case I387_TRUNC:
19149 case I387_FLOOR:
19150 case I387_CEIL:
19151 case I387_MASK_PM:
19152 return I387_CW_ANY;
19153 default:
19154 gcc_unreachable ();
19158 static int
19159 ix86_mode_priority (int, int n)
19161 return n;
19164 /* Output code to initialize control word copies used by trunc?f?i and
19165 rounding patterns. CURRENT_MODE is set to current control word,
19166 while NEW_MODE is set to new control word. */
19168 static void
19169 emit_i387_cw_initialization (int mode)
19171 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19172 rtx new_mode;
19174 enum ix86_stack_slot slot;
19176 rtx reg = gen_reg_rtx (HImode);
19178 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19179 emit_move_insn (reg, copy_rtx (stored_mode));
19181 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19182 || optimize_insn_for_size_p ())
19184 switch (mode)
19186 case I387_CW_TRUNC:
19187 /* round toward zero (truncate) */
19188 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19189 slot = SLOT_CW_TRUNC;
19190 break;
19192 case I387_CW_FLOOR:
19193 /* round down toward -oo */
19194 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19195 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19196 slot = SLOT_CW_FLOOR;
19197 break;
19199 case I387_CW_CEIL:
19200 /* round up toward +oo */
19201 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19202 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19203 slot = SLOT_CW_CEIL;
19204 break;
19206 case I387_CW_MASK_PM:
19207 /* mask precision exception for nearbyint() */
19208 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19209 slot = SLOT_CW_MASK_PM;
19210 break;
19212 default:
19213 gcc_unreachable ();
19216 else
19218 switch (mode)
19220 case I387_CW_TRUNC:
19221 /* round toward zero (truncate) */
19222 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19223 slot = SLOT_CW_TRUNC;
19224 break;
19226 case I387_CW_FLOOR:
19227 /* round down toward -oo */
19228 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19229 slot = SLOT_CW_FLOOR;
19230 break;
19232 case I387_CW_CEIL:
19233 /* round up toward +oo */
19234 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19235 slot = SLOT_CW_CEIL;
19236 break;
19238 case I387_CW_MASK_PM:
19239 /* mask precision exception for nearbyint() */
19240 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19241 slot = SLOT_CW_MASK_PM;
19242 break;
19244 default:
19245 gcc_unreachable ();
19249 gcc_assert (slot < MAX_386_STACK_LOCALS);
19251 new_mode = assign_386_stack_local (HImode, slot);
19252 emit_move_insn (new_mode, reg);
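/* Background sketch (not part of the original file): the constants ORed in
   above target the x87 control word fields.  Bits 10-11 are the rounding
   control, so 0x0c00 selects round-toward-zero (truncate), 0x0400 round
   down and 0x0800 round up after the field has been cleared, while 0x0020
   sets the PM bit that masks the precision exception for nearbyint().  */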
19255 /* Emit vzeroupper. */
19257 void
19258 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19260 int i;
19262 /* Cancel automatic vzeroupper insertion if there are
19263 live call-saved SSE registers at the insertion point. */
19265 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19266 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19267 return;
19269 if (TARGET_64BIT)
19270 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19271 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19272 return;
19274 emit_insn (gen_avx_vzeroupper ());
19279 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19280 is the set of hard registers live at the point where the insn(s)
19281 are to be inserted. */
19283 static void
19284 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19285 HARD_REG_SET regs_live)
19287 switch (entity)
19289 case X86_DIRFLAG:
19290 if (mode == X86_DIRFLAG_RESET)
19291 emit_insn (gen_cld ());
19292 break;
19293 case AVX_U128:
19294 if (mode == AVX_U128_CLEAN)
19295 ix86_avx_emit_vzeroupper (regs_live);
19296 break;
19297 case I387_TRUNC:
19298 case I387_FLOOR:
19299 case I387_CEIL:
19300 case I387_MASK_PM:
19301 if (mode != I387_CW_ANY
19302 && mode != I387_CW_UNINITIALIZED)
19303 emit_i387_cw_initialization (mode);
19304 break;
19305 default:
19306 gcc_unreachable ();
19310 /* Output code for INSN to convert a float to a signed int. OPERANDS
19311 are the insn operands. The output may be [HSD]Imode and the input
19312 operand may be [SDX]Fmode. */
19314 const char *
19315 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19317 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19318 int dimode_p = GET_MODE (operands[0]) == DImode;
19319 int round_mode = get_attr_i387_cw (insn);
19321 /* Jump through a hoop or two for DImode, since the hardware has no
19322 non-popping instruction. We used to do this a different way, but
19323 that was somewhat fragile and broke with post-reload splitters. */
19324 if ((dimode_p || fisttp) && !stack_top_dies)
19325 output_asm_insn ("fld\t%y1", operands);
19327 gcc_assert (STACK_TOP_P (operands[1]));
19328 gcc_assert (MEM_P (operands[0]));
19329 gcc_assert (GET_MODE (operands[1]) != TFmode);
19331 if (fisttp)
19332 output_asm_insn ("fisttp%Z0\t%0", operands);
19333 else
19335 if (round_mode != I387_CW_ANY)
19336 output_asm_insn ("fldcw\t%3", operands);
19337 if (stack_top_dies || dimode_p)
19338 output_asm_insn ("fistp%Z0\t%0", operands);
19339 else
19340 output_asm_insn ("fist%Z0\t%0", operands);
19341 if (round_mode != I387_CW_ANY)
19342 output_asm_insn ("fldcw\t%2", operands);
19345 return "";
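/* Illustrative sketch (not part of the original file): for a DImode result
   with a non-ANY rounding mode, the emitted sequence is roughly

       fld      %y1     (duplicates the value when the stack top is still live)
       fldcw    %3      (switch to the rounding control word)
       fistp%Z0 %0
       fldcw    %2      (restore the saved control word)

   where operands[3] appears to hold the new control word and operands[2]
   the saved original.  */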
19348 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19349 have the values zero or one, indicates the ffreep insn's operand
19350 from the OPERANDS array. */
19352 static const char *
19353 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19355 if (TARGET_USE_FFREEP)
19356 #ifdef HAVE_AS_IX86_FFREEP
19357 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19358 #else
19360 static char retval[32];
19361 int regno = REGNO (operands[opno]);
19363 gcc_assert (STACK_REGNO_P (regno));
19365 regno -= FIRST_STACK_REG;
19367 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19368 return retval;
19370 #endif
19372 return opno ? "fstp\t%y1" : "fstp\t%y0";
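/* Encoding sketch (not part of the original file): for st(1) the snprintf
   above produces ASM_SHORT "0xc1df"; stored little-endian, that 16-bit value
   is the byte pair 0xdf 0xc1, i.e. the machine encoding of "ffreep %st(1)"
   (opcode 0xdf 0xc0+i), used when the assembler lacks the mnemonic.  */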
19376 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19377 should be used. UNORDERED_P is true when fucom should be used. */
19379 const char *
19380 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
19382 int stack_top_dies;
19383 rtx cmp_op0, cmp_op1;
19384 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
19386 if (eflags_p)
19388 cmp_op0 = operands[0];
19389 cmp_op1 = operands[1];
19391 else
19393 cmp_op0 = operands[1];
19394 cmp_op1 = operands[2];
19397 if (is_sse)
19399 if (GET_MODE (operands[0]) == SFmode)
19400 if (unordered_p)
19401 return "%vucomiss\t{%1, %0|%0, %1}";
19402 else
19403 return "%vcomiss\t{%1, %0|%0, %1}";
19404 else
19405 if (unordered_p)
19406 return "%vucomisd\t{%1, %0|%0, %1}";
19407 else
19408 return "%vcomisd\t{%1, %0|%0, %1}";
19411 gcc_assert (STACK_TOP_P (cmp_op0));
19413 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19415 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
19417 if (stack_top_dies)
19419 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
19420 return output_387_ffreep (operands, 1);
19422 else
19423 return "ftst\n\tfnstsw\t%0";
19426 if (STACK_REG_P (cmp_op1)
19427 && stack_top_dies
19428 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
19429 && REGNO (cmp_op1) != FIRST_STACK_REG)
19431 /* If both the top of the 387 stack dies, and the other operand
19432 is also a stack register that dies, then this must be a
19433 `fcompp' float compare */
19435 if (eflags_p)
19437 /* There is no double popping fcomi variant. Fortunately,
19438 eflags is immune from the fstp's cc clobbering. */
19439 if (unordered_p)
19440 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
19441 else
19442 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
19443 return output_387_ffreep (operands, 0);
19445 else
19447 if (unordered_p)
19448 return "fucompp\n\tfnstsw\t%0";
19449 else
19450 return "fcompp\n\tfnstsw\t%0";
19453 else
19455 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
19457 static const char * const alt[16] =
19459 "fcom%Z2\t%y2\n\tfnstsw\t%0",
19460 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
19461 "fucom%Z2\t%y2\n\tfnstsw\t%0",
19462 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
19464 "ficom%Z2\t%y2\n\tfnstsw\t%0",
19465 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
19466 NULL,
19467 NULL,
19469 "fcomi\t{%y1, %0|%0, %y1}",
19470 "fcomip\t{%y1, %0|%0, %y1}",
19471 "fucomi\t{%y1, %0|%0, %y1}",
19472 "fucomip\t{%y1, %0|%0, %y1}",
19474 NULL,
19475 NULL,
19476 NULL,
19477 NULL
19480 int mask;
19481 const char *ret;
19483 mask = eflags_p << 3;
19484 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
19485 mask |= unordered_p << 1;
19486 mask |= stack_top_dies;
19488 gcc_assert (mask < 16);
19489 ret = alt[mask];
19490 gcc_assert (ret);
19492 return ret;
19496 void
19497 ix86_output_addr_vec_elt (FILE *file, int value)
19499 const char *directive = ASM_LONG;
19501 #ifdef ASM_QUAD
19502 if (TARGET_LP64)
19503 directive = ASM_QUAD;
19504 #else
19505 gcc_assert (!TARGET_64BIT);
19506 #endif
19508 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19511 void
19512 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19514 const char *directive = ASM_LONG;
19516 #ifdef ASM_QUAD
19517 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19518 directive = ASM_QUAD;
19519 #else
19520 gcc_assert (!TARGET_64BIT);
19521 #endif
19522 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19523 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19524 fprintf (file, "%s%s%d-%s%d\n",
19525 directive, LPREFIX, value, LPREFIX, rel);
19526 else if (HAVE_AS_GOTOFF_IN_DATA)
19527 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19528 #if TARGET_MACHO
19529 else if (TARGET_MACHO)
19531 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19532 machopic_output_function_base_name (file);
19533 putc ('\n', file);
19535 #endif
19536 else
19537 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19538 GOT_SYMBOL_NAME, LPREFIX, value);
19541 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19542 for the target. */
19544 void
19545 ix86_expand_clear (rtx dest)
19547 rtx tmp;
19549 /* We play register width games, which are only valid after reload. */
19550 gcc_assert (reload_completed);
19552 /* Avoid HImode and its attendant prefix byte. */
19553 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19554 dest = gen_rtx_REG (SImode, REGNO (dest));
19555 tmp = gen_rtx_SET (dest, const0_rtx);
19557 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19559 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19560 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19563 emit_insn (tmp);
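/* Illustrative sketch (not part of the original file): clearing a QImode
   destination such as %al is widened to %eax above, and the set is usually
   wrapped with a FLAGS_REG clobber so it matches the "xorl %eax, %eax"
   pattern; only TARGET_USE_MOV0 targets not optimizing for size get the
   plain "movl $0, %eax" form.  */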
19566 /* X is an unchanging MEM. If it is a constant pool reference, return
19567 the constant pool rtx, else NULL. */
19570 maybe_get_pool_constant (rtx x)
19572 x = ix86_delegitimize_address (XEXP (x, 0));
19574 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19575 return get_pool_constant (x);
19577 return NULL_RTX;
19580 void
19581 ix86_expand_move (machine_mode mode, rtx operands[])
19583 rtx op0, op1;
19584 rtx tmp, addend = NULL_RTX;
19585 enum tls_model model;
19587 op0 = operands[0];
19588 op1 = operands[1];
19590 switch (GET_CODE (op1))
19592 case CONST:
19593 tmp = XEXP (op1, 0);
19595 if (GET_CODE (tmp) != PLUS
19596 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19597 break;
19599 op1 = XEXP (tmp, 0);
19600 addend = XEXP (tmp, 1);
19601 /* FALLTHRU */
19603 case SYMBOL_REF:
19604 model = SYMBOL_REF_TLS_MODEL (op1);
19606 if (model)
19607 op1 = legitimize_tls_address (op1, model, true);
19608 else if (ix86_force_load_from_GOT_p (op1))
19610 /* Load the external function address via GOT slot to avoid PLT. */
19611 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19612 (TARGET_64BIT
19613 ? UNSPEC_GOTPCREL
19614 : UNSPEC_GOT));
19615 op1 = gen_rtx_CONST (Pmode, op1);
19616 op1 = gen_const_mem (Pmode, op1);
19617 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19619 else
19621 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19622 if (tmp)
19624 op1 = tmp;
19625 if (!addend)
19626 break;
19628 else
19630 op1 = operands[1];
19631 break;
19635 if (addend)
19637 op1 = force_operand (op1, NULL_RTX);
19638 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19639 op0, 1, OPTAB_DIRECT);
19641 else
19642 op1 = force_operand (op1, op0);
19644 if (op1 == op0)
19645 return;
19647 op1 = convert_to_mode (mode, op1, 1);
19649 default:
19650 break;
19653 if ((flag_pic || MACHOPIC_INDIRECT)
19654 && symbolic_operand (op1, mode))
19656 if (TARGET_MACHO && !TARGET_64BIT)
19658 #if TARGET_MACHO
19659 /* dynamic-no-pic */
19660 if (MACHOPIC_INDIRECT)
19662 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19663 ? op0 : gen_reg_rtx (Pmode);
19664 op1 = machopic_indirect_data_reference (op1, temp);
19665 if (MACHOPIC_PURE)
19666 op1 = machopic_legitimize_pic_address (op1, mode,
19667 temp == op1 ? 0 : temp);
19669 if (op0 != op1 && GET_CODE (op0) != MEM)
19671 rtx insn = gen_rtx_SET (op0, op1);
19672 emit_insn (insn);
19673 return;
19675 if (GET_CODE (op0) == MEM)
19676 op1 = force_reg (Pmode, op1);
19677 else
19679 rtx temp = op0;
19680 if (GET_CODE (temp) != REG)
19681 temp = gen_reg_rtx (Pmode);
19682 temp = legitimize_pic_address (op1, temp);
19683 if (temp == op0)
19684 return;
19685 op1 = temp;
19687 /* dynamic-no-pic */
19688 #endif
19690 else
19692 if (MEM_P (op0))
19693 op1 = force_reg (mode, op1);
19694 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19696 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19697 op1 = legitimize_pic_address (op1, reg);
19698 if (op0 == op1)
19699 return;
19700 op1 = convert_to_mode (mode, op1, 1);
19704 else
19706 if (MEM_P (op0)
19707 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19708 || !push_operand (op0, mode))
19709 && MEM_P (op1))
19710 op1 = force_reg (mode, op1);
19712 if (push_operand (op0, mode)
19713 && ! general_no_elim_operand (op1, mode))
19714 op1 = copy_to_mode_reg (mode, op1);
19716 /* Force large constants in 64bit compilation into register
19717 to get them CSEed. */
19718 if (can_create_pseudo_p ()
19719 && (mode == DImode) && TARGET_64BIT
19720 && immediate_operand (op1, mode)
19721 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19722 && !register_operand (op0, mode)
19723 && optimize)
19724 op1 = copy_to_mode_reg (mode, op1);
19726 if (can_create_pseudo_p ()
19727 && CONST_DOUBLE_P (op1))
19729 /* If we are loading a floating point constant to a register,
19730 force the value to memory now, since we'll get better code
19731 out of the back end. */
19733 op1 = validize_mem (force_const_mem (mode, op1));
19734 if (!register_operand (op0, mode))
19736 rtx temp = gen_reg_rtx (mode);
19737 emit_insn (gen_rtx_SET (temp, op1));
19738 emit_move_insn (op0, temp);
19739 return;
19744 emit_insn (gen_rtx_SET (op0, op1));
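/* Illustrative sketch (not part of the original file): when
   ix86_force_load_from_GOT_p is true in 64-bit mode, op1 becomes a constant
   MEM of an UNSPEC_GOTPCREL address, so for a hypothetical external
   function foo the final move is emitted along the lines of
   "movq foo@GOTPCREL(%rip), %reg", loading the address from the GOT slot
   and avoiding the PLT.  */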
19747 void
19748 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19750 rtx op0 = operands[0], op1 = operands[1];
19751 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
19752 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
19753 unsigned int align = (TARGET_IAMCU
19754 ? GET_MODE_BITSIZE (mode)
19755 : GET_MODE_ALIGNMENT (mode));
19757 if (push_operand (op0, VOIDmode))
19758 op0 = emit_move_resolve_push (mode, op0);
19760 /* Force constants other than zero into memory. We do not know how
19761 the instructions used to build constants modify the upper 64 bits
19762 of the register, once we have that information we may be able
19763 to handle some of them more efficiently. */
19764 if (can_create_pseudo_p ()
19765 && (CONSTANT_P (op1)
19766 || (SUBREG_P (op1)
19767 && CONSTANT_P (SUBREG_REG (op1))))
19768 && ((register_operand (op0, mode)
19769 && !standard_sse_constant_p (op1, mode))
19770 /* ix86_expand_vector_move_misalign() does not like constants. */
19771 || (SSE_REG_MODE_P (mode)
19772 && MEM_P (op0)
19773 && MEM_ALIGN (op0) < align)))
19775 if (SUBREG_P (op1))
19777 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19778 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19779 if (r)
19780 r = validize_mem (r);
19781 else
19782 r = force_reg (imode, SUBREG_REG (op1));
19783 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19785 else
19786 op1 = validize_mem (force_const_mem (mode, op1));
19789 /* We need to check memory alignment for SSE mode since attribute
19790 can make operands unaligned. */
19791 if (can_create_pseudo_p ()
19792 && SSE_REG_MODE_P (mode)
19793 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19794 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19796 rtx tmp[2];
19798 /* ix86_expand_vector_move_misalign() does not like both
19799 arguments in memory. */
19800 if (!register_operand (op0, mode)
19801 && !register_operand (op1, mode))
19802 op1 = force_reg (mode, op1);
19804 tmp[0] = op0; tmp[1] = op1;
19805 ix86_expand_vector_move_misalign (mode, tmp);
19806 return;
19809 /* Make operand1 a register if it isn't already. */
19810 if (can_create_pseudo_p ()
19811 && !register_operand (op0, mode)
19812 && !register_operand (op1, mode))
19814 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19815 return;
19818 emit_insn (gen_rtx_SET (op0, op1));
19821 /* Split 32-byte AVX unaligned load and store if needed. */
19823 static void
19824 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19826 rtx m;
19827 rtx (*extract) (rtx, rtx, rtx);
19828 machine_mode mode;
19830 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19831 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19833 emit_insn (gen_rtx_SET (op0, op1));
19834 return;
19837 rtx orig_op0 = NULL_RTX;
19838 mode = GET_MODE (op0);
19839 switch (GET_MODE_CLASS (mode))
19841 case MODE_VECTOR_INT:
19842 case MODE_INT:
19843 if (mode != V32QImode)
19845 if (!MEM_P (op0))
19847 orig_op0 = op0;
19848 op0 = gen_reg_rtx (V32QImode);
19850 else
19851 op0 = gen_lowpart (V32QImode, op0);
19852 op1 = gen_lowpart (V32QImode, op1);
19853 mode = V32QImode;
19855 break;
19856 case MODE_VECTOR_FLOAT:
19857 break;
19858 default:
19859 gcc_unreachable ();
19862 switch (mode)
19864 default:
19865 gcc_unreachable ();
19866 case V32QImode:
19867 extract = gen_avx_vextractf128v32qi;
19868 mode = V16QImode;
19869 break;
19870 case V8SFmode:
19871 extract = gen_avx_vextractf128v8sf;
19872 mode = V4SFmode;
19873 break;
19874 case V4DFmode:
19875 extract = gen_avx_vextractf128v4df;
19876 mode = V2DFmode;
19877 break;
19880 if (MEM_P (op1))
19882 rtx r = gen_reg_rtx (mode);
19883 m = adjust_address (op1, mode, 0);
19884 emit_move_insn (r, m);
19885 m = adjust_address (op1, mode, 16);
19886 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19887 emit_move_insn (op0, r);
19889 else if (MEM_P (op0))
19891 m = adjust_address (op0, mode, 0);
19892 emit_insn (extract (m, op1, const0_rtx));
19893 m = adjust_address (op0, mode, 16);
19894 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19896 else
19897 gcc_unreachable ();
19899 if (orig_op0)
19900 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
19903 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19904 straight to ix86_expand_vector_move. */
19905 /* Code generation for scalar reg-reg moves of single and double precision data:
19906 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19907 movaps reg, reg
19908 else
19909 movss reg, reg
19910 if (x86_sse_partial_reg_dependency == true)
19911 movapd reg, reg
19912 else
19913 movsd reg, reg
19915 Code generation for scalar loads of double precision data:
19916 if (x86_sse_split_regs == true)
19917 movlpd mem, reg (gas syntax)
19918 else
19919 movsd mem, reg
19921 Code generation for unaligned packed loads of single precision data
19922 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19923 if (x86_sse_unaligned_move_optimal)
19924 movups mem, reg
19926 if (x86_sse_partial_reg_dependency == true)
19928 xorps reg, reg
19929 movlps mem, reg
19930 movhps mem+8, reg
19932 else
19934 movlps mem, reg
19935 movhps mem+8, reg
19938 Code generation for unaligned packed loads of double precision data
19939 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19940 if (x86_sse_unaligned_move_optimal)
19941 movupd mem, reg
19943 if (x86_sse_split_regs == true)
19945 movlpd mem, reg
19946 movhpd mem+8, reg
19948 else
19950 movsd mem, reg
19951 movhpd mem+8, reg
19955 void
19956 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19958 rtx op0, op1, m;
19960 op0 = operands[0];
19961 op1 = operands[1];
19963 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19964 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19966 emit_insn (gen_rtx_SET (op0, op1));
19967 return;
19970 if (TARGET_AVX)
19972 if (GET_MODE_SIZE (mode) == 32)
19973 ix86_avx256_split_vector_move_misalign (op0, op1);
19974 else
19975 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19976 emit_insn (gen_rtx_SET (op0, op1));
19977 return;
19980 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19981 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19983 emit_insn (gen_rtx_SET (op0, op1));
19984 return;
19987 /* ??? If we have typed data, then it would appear that using
19988 movdqu is the only way to get unaligned data loaded with
19989 integer type. */
19990 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19992 emit_insn (gen_rtx_SET (op0, op1));
19993 return;
19996 if (MEM_P (op1))
19998 if (TARGET_SSE2 && mode == V2DFmode)
20000 rtx zero;
20002 /* When SSE registers are split into halves, we can avoid
20003 writing to the top half twice. */
20004 if (TARGET_SSE_SPLIT_REGS)
20006 emit_clobber (op0);
20007 zero = op0;
20009 else
20011 /* ??? Not sure about the best option for the Intel chips.
20012 The following would seem to satisfy; the register is
20013 entirely cleared, breaking the dependency chain. We
20014 then store to the upper half, with a dependency depth
20015 of one. A rumor has it that Intel recommends two movsd
20016 followed by an unpacklpd, but this is unconfirmed. And
20017 given that the dependency depth of the unpacklpd would
20018 still be one, I'm not sure why this would be better. */
20019 zero = CONST0_RTX (V2DFmode);
20022 m = adjust_address (op1, DFmode, 0);
20023 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20024 m = adjust_address (op1, DFmode, 8);
20025 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20027 else
20029 rtx t;
20031 if (mode != V4SFmode)
20032 t = gen_reg_rtx (V4SFmode);
20033 else
20034 t = op0;
20036 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20037 emit_move_insn (t, CONST0_RTX (V4SFmode));
20038 else
20039 emit_clobber (t);
20041 m = adjust_address (op1, V2SFmode, 0);
20042 emit_insn (gen_sse_loadlps (t, t, m));
20043 m = adjust_address (op1, V2SFmode, 8);
20044 emit_insn (gen_sse_loadhps (t, t, m));
20045 if (mode != V4SFmode)
20046 emit_move_insn (op0, gen_lowpart (mode, t));
20049 else if (MEM_P (op0))
20051 if (TARGET_SSE2 && mode == V2DFmode)
20053 m = adjust_address (op0, DFmode, 0);
20054 emit_insn (gen_sse2_storelpd (m, op1));
20055 m = adjust_address (op0, DFmode, 8);
20056 emit_insn (gen_sse2_storehpd (m, op1));
20058 else
20060 if (mode != V4SFmode)
20061 op1 = gen_lowpart (V4SFmode, op1);
20063 m = adjust_address (op0, V2SFmode, 0);
20064 emit_insn (gen_sse_storelps (m, op1));
20065 m = adjust_address (op0, V2SFmode, 8);
20066 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20069 else
20070 gcc_unreachable ();
20073 /* Helper function of ix86_fixup_binary_operands to canonicalize
20074 operand order. Returns true if the operands should be swapped. */
20076 static bool
20077 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20078 rtx operands[])
20080 rtx dst = operands[0];
20081 rtx src1 = operands[1];
20082 rtx src2 = operands[2];
20084 /* If the operation is not commutative, we can't do anything. */
20085 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
20086 return false;
20088 /* Highest priority is that src1 should match dst. */
20089 if (rtx_equal_p (dst, src1))
20090 return false;
20091 if (rtx_equal_p (dst, src2))
20092 return true;
20094 /* Next highest priority is that immediate constants come second. */
20095 if (immediate_operand (src2, mode))
20096 return false;
20097 if (immediate_operand (src1, mode))
20098 return true;
20100 /* Lowest priority is that memory references should come second. */
20101 if (MEM_P (src2))
20102 return false;
20103 if (MEM_P (src1))
20104 return true;
20106 return false;
20110 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20111 destination to use for the operation. If different from the true
20112 destination in operands[0], a copy operation will be required. */
20115 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20116 rtx operands[])
20118 rtx dst = operands[0];
20119 rtx src1 = operands[1];
20120 rtx src2 = operands[2];
20122 /* Canonicalize operand order. */
20123 if (ix86_swap_binary_operands_p (code, mode, operands))
20125 /* It is invalid to swap operands of different modes. */
20126 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20128 std::swap (src1, src2);
20131 /* Both source operands cannot be in memory. */
20132 if (MEM_P (src1) && MEM_P (src2))
20134 /* Optimization: Only read from memory once. */
20135 if (rtx_equal_p (src1, src2))
20137 src2 = force_reg (mode, src2);
20138 src1 = src2;
20140 else if (rtx_equal_p (dst, src1))
20141 src2 = force_reg (mode, src2);
20142 else
20143 src1 = force_reg (mode, src1);
20146 /* If the destination is memory, and we do not have matching source
20147 operands, do things in registers. */
20148 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20149 dst = gen_reg_rtx (mode);
20151 /* Source 1 cannot be a constant. */
20152 if (CONSTANT_P (src1))
20153 src1 = force_reg (mode, src1);
20155 /* Source 1 cannot be a non-matching memory. */
20156 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20157 src1 = force_reg (mode, src1);
20159 /* Improve address combine. */
20160 if (code == PLUS
20161 && GET_MODE_CLASS (mode) == MODE_INT
20162 && MEM_P (src2))
20163 src2 = force_reg (mode, src2);
20165 operands[1] = src1;
20166 operands[2] = src2;
20167 return dst;
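/* Illustrative sketch (not part of the original file): given commutative
   operands { dst = (reg:SI r), src1 = (mem:SI m), src2 = (reg:SI r) }, the
   swap above makes src1 match dst; src1 is then never left as a constant or
   a non-matching memory, and for an integer PLUS a memory src2 is also
   forced into a register to help address combining.  */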
20170 /* Similarly, but assume that the destination has already been
20171 set up properly. */
20173 void
20174 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20175 machine_mode mode, rtx operands[])
20177 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20178 gcc_assert (dst == operands[0]);
20181 /* Attempt to expand a binary operator. Make the expansion closer to the
20182 actual machine than just general_operand, which would allow 3 separate
20183 memory references (one output, two input) in a single insn. */
20185 void
20186 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20187 rtx operands[])
20189 rtx src1, src2, dst, op, clob;
20191 dst = ix86_fixup_binary_operands (code, mode, operands);
20192 src1 = operands[1];
20193 src2 = operands[2];
20195 /* Emit the instruction. */
20197 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20199 if (reload_completed
20200 && code == PLUS
20201 && !rtx_equal_p (dst, src1))
20203 /* This is going to be an LEA; avoid splitting it later. */
20204 emit_insn (op);
20206 else
20208 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20209 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20212 /* Fix up the destination if needed. */
20213 if (dst != operands[0])
20214 emit_move_insn (operands[0], dst);
20217 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20218 the given OPERANDS. */
20220 void
20221 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20222 rtx operands[])
20224 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20225 if (SUBREG_P (operands[1]))
20227 op1 = operands[1];
20228 op2 = operands[2];
20230 else if (SUBREG_P (operands[2]))
20232 op1 = operands[2];
20233 op2 = operands[1];
20235 /* Optimize (__m128i) d | (__m128i) e and similar code
20236 when d and e are float vectors into float vector logical
20237 insn. In C/C++ without using intrinsics there is no other way
20238 to express vector logical operation on float vectors than
20239 to cast them temporarily to integer vectors. */
20240 if (op1
20241 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20242 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20243 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20244 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20245 && SUBREG_BYTE (op1) == 0
20246 && (GET_CODE (op2) == CONST_VECTOR
20247 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20248 && SUBREG_BYTE (op2) == 0))
20249 && can_create_pseudo_p ())
20251 rtx dst;
20252 switch (GET_MODE (SUBREG_REG (op1)))
20254 case V4SFmode:
20255 case V8SFmode:
20256 case V16SFmode:
20257 case V2DFmode:
20258 case V4DFmode:
20259 case V8DFmode:
20260 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20261 if (GET_CODE (op2) == CONST_VECTOR)
20263 op2 = gen_lowpart (GET_MODE (dst), op2);
20264 op2 = force_reg (GET_MODE (dst), op2);
20266 else
20268 op1 = operands[1];
20269 op2 = SUBREG_REG (operands[2]);
20270 if (!vector_operand (op2, GET_MODE (dst)))
20271 op2 = force_reg (GET_MODE (dst), op2);
20273 op1 = SUBREG_REG (op1);
20274 if (!vector_operand (op1, GET_MODE (dst)))
20275 op1 = force_reg (GET_MODE (dst), op1);
20276 emit_insn (gen_rtx_SET (dst,
20277 gen_rtx_fmt_ee (code, GET_MODE (dst),
20278 op1, op2)));
20279 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20280 return;
20281 default:
20282 break;
20285 if (!vector_operand (operands[1], mode))
20286 operands[1] = force_reg (mode, operands[1]);
20287 if (!vector_operand (operands[2], mode))
20288 operands[2] = force_reg (mode, operands[2]);
20289 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20290 emit_insn (gen_rtx_SET (operands[0],
20291 gen_rtx_fmt_ee (code, mode, operands[1],
20292 operands[2])));
20295 /* Return TRUE or FALSE depending on whether the binary operator meets the
20296 appropriate constraints. */
20298 bool
20299 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20300 rtx operands[3])
20302 rtx dst = operands[0];
20303 rtx src1 = operands[1];
20304 rtx src2 = operands[2];
20306 /* Both source operands cannot be in memory. */
20307 if (MEM_P (src1) && MEM_P (src2))
20308 return false;
20310 /* Canonicalize operand order for commutative operators. */
20311 if (ix86_swap_binary_operands_p (code, mode, operands))
20312 std::swap (src1, src2);
20314 /* If the destination is memory, we must have a matching source operand. */
20315 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20316 return false;
20318 /* Source 1 cannot be a constant. */
20319 if (CONSTANT_P (src1))
20320 return false;
20322 /* Source 1 cannot be a non-matching memory. */
20323 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20324 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20325 return (code == AND
20326 && (mode == HImode
20327 || mode == SImode
20328 || (TARGET_64BIT && mode == DImode))
20329 && satisfies_constraint_L (src2));
20331 return true;
20334 /* Attempt to expand a unary operator. Make the expansion closer to the
20335 actual machine than just general_operand, which would allow 2 separate
20336 memory references (one output, one input) in a single insn. */
20338 void
20339 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20340 rtx operands[])
20342 bool matching_memory = false;
20343 rtx src, dst, op, clob;
20345 dst = operands[0];
20346 src = operands[1];
20348 /* If the destination is memory, and we do not have matching source
20349 operands, do things in registers. */
20350 if (MEM_P (dst))
20352 if (rtx_equal_p (dst, src))
20353 matching_memory = true;
20354 else
20355 dst = gen_reg_rtx (mode);
20358 /* When source operand is memory, destination must match. */
20359 if (MEM_P (src) && !matching_memory)
20360 src = force_reg (mode, src);
20362 /* Emit the instruction. */
20364 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20366 if (code == NOT)
20367 emit_insn (op);
20368 else
20370 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20371 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20374 /* Fix up the destination if needed. */
20375 if (dst != operands[0])
20376 emit_move_insn (operands[0], dst);
20379 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20380 divisor are within the range [0-255]. */
20382 void
20383 ix86_split_idivmod (machine_mode mode, rtx operands[],
20384 bool signed_p)
20386 rtx_code_label *end_label, *qimode_label;
20387 rtx insn, div, mod;
20388 rtx scratch, tmp0, tmp1, tmp2;
20389 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20390 rtx (*gen_zero_extend) (rtx, rtx);
20391 rtx (*gen_test_ccno_1) (rtx, rtx);
20393 switch (mode)
20395 case SImode:
20396 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20397 gen_test_ccno_1 = gen_testsi_ccno_1;
20398 gen_zero_extend = gen_zero_extendqisi2;
20399 break;
20400 case DImode:
20401 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20402 gen_test_ccno_1 = gen_testdi_ccno_1;
20403 gen_zero_extend = gen_zero_extendqidi2;
20404 break;
20405 default:
20406 gcc_unreachable ();
20409 end_label = gen_label_rtx ();
20410 qimode_label = gen_label_rtx ();
20412 scratch = gen_reg_rtx (mode);
20414 /* Use 8bit unsigned divmod if dividend and divisor are within
20415 the range [0-255]. */
20416 emit_move_insn (scratch, operands[2]);
20417 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20418 scratch, 1, OPTAB_DIRECT);
20419 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20420 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20421 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20422 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20423 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20424 pc_rtx);
20425 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20426 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20427 JUMP_LABEL (insn) = qimode_label;
20429 /* Generate original signed/unsigned divmod. */
20430 div = gen_divmod4_1 (operands[0], operands[1],
20431 operands[2], operands[3]);
20432 emit_insn (div);
20434 /* Branch to the end. */
20435 emit_jump_insn (gen_jump (end_label));
20436 emit_barrier ();
20438 /* Generate 8bit unsigned divide. */
20439 emit_label (qimode_label);
20440 /* Don't use operands[0] for result of 8bit divide since not all
20441 registers support QImode ZERO_EXTRACT. */
20442 tmp0 = lowpart_subreg (HImode, scratch, mode);
20443 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20444 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20445 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20447 if (signed_p)
20449 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
20450 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
20452 else
20454 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
20455 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
20458 /* Extract remainder from AH. */
20459 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
20460 if (REG_P (operands[1]))
20461 insn = emit_move_insn (operands[1], tmp1);
20462 else
20464 /* Need a new scratch register since the old one has result
20465 of 8bit divide. */
20466 scratch = gen_reg_rtx (mode);
20467 emit_move_insn (scratch, tmp1);
20468 insn = emit_move_insn (operands[1], scratch);
20470 set_unique_reg_note (insn, REG_EQUAL, mod);
20472 /* Zero extend quotient from AL. */
20473 tmp1 = gen_lowpart (QImode, tmp0);
20474 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20475 set_unique_reg_note (insn, REG_EQUAL, div);
20477 emit_label (end_label);
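/* Illustrative sketch (not part of the original file, register and label
   names hypothetical): the emitted code has roughly this shape

       movl    dividend, scratch
       orl     divisor, scratch
       testl   $-0x100, scratch       # any bit above bit 7 set?
       je      .Lqimode
       <full 32/64-bit (i)div sequence>
       jmp     .Lend
   .Lqimode:
       <16-bit by 8-bit unsigned divide: AL = quotient, AH = remainder>
       <AL zero-extended into operands[0], AH copied into operands[1]>
   .Lend:  */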
20480 #define LEA_MAX_STALL (3)
20481 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20483 /* Increase given DISTANCE in half-cycles according to
20484 dependencies between PREV and NEXT instructions.
20485 Add 1 half-cycle if there is no dependency and
20486 go to the next cycle if there is some dependency. */
20488 static unsigned int
20489 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20491 df_ref def, use;
20493 if (!prev || !next)
20494 return distance + (distance & 1) + 2;
20496 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20497 return distance + 1;
20499 FOR_EACH_INSN_USE (use, next)
20500 FOR_EACH_INSN_DEF (def, prev)
20501 if (!DF_REF_IS_ARTIFICIAL (def)
20502 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20503 return distance + (distance & 1) + 2;
20505 return distance + 1;
20508 /* Function checks if instruction INSN defines register number
20509 REGNO1 or REGNO2. */
20511 static bool
20512 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20513 rtx_insn *insn)
20515 df_ref def;
20517 FOR_EACH_INSN_DEF (def, insn)
20518 if (DF_REF_REG_DEF_P (def)
20519 && !DF_REF_IS_ARTIFICIAL (def)
20520 && (regno1 == DF_REF_REGNO (def)
20521 || regno2 == DF_REF_REGNO (def)))
20522 return true;
20524 return false;
20527 /* Function checks if instruction INSN uses register number
20528 REGNO as a part of address expression. */
20530 static bool
20531 insn_uses_reg_mem (unsigned int regno, rtx insn)
20533 df_ref use;
20535 FOR_EACH_INSN_USE (use, insn)
20536 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20537 return true;
20539 return false;
20542 /* Search backward for non-agu definition of register number REGNO1
20543 or register number REGNO2 in basic block starting from instruction
20544 START up to head of basic block or instruction INSN.
20546 Set *FOUND to true if a definition was found
20547 and to false otherwise.
20549 The distance in half-cycles between START and the found instruction or
20550 the head of the BB is added to DISTANCE and returned. */
20552 static int
20553 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20554 rtx_insn *insn, int distance,
20555 rtx_insn *start, bool *found)
20557 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20558 rtx_insn *prev = start;
20559 rtx_insn *next = NULL;
20561 *found = false;
20563 while (prev
20564 && prev != insn
20565 && distance < LEA_SEARCH_THRESHOLD)
20567 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20569 distance = increase_distance (prev, next, distance);
20570 if (insn_defines_reg (regno1, regno2, prev))
20572 if (recog_memoized (prev) < 0
20573 || get_attr_type (prev) != TYPE_LEA)
20575 *found = true;
20576 return distance;
20580 next = prev;
20582 if (prev == BB_HEAD (bb))
20583 break;
20585 prev = PREV_INSN (prev);
20588 return distance;
20591 /* Search backward for non-agu definition of register number REGNO1
20592 or register number REGNO2 in INSN's basic block until
20593 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20594 2. Reach neighbor BBs boundary, or
20595 3. Reach agu definition.
20596 Returns the distance between the non-agu definition point and INSN.
20597 If no definition point, returns -1. */
20599 static int
20600 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20601 rtx_insn *insn)
20603 basic_block bb = BLOCK_FOR_INSN (insn);
20604 int distance = 0;
20605 bool found = false;
20607 if (insn != BB_HEAD (bb))
20608 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20609 distance, PREV_INSN (insn),
20610 &found);
20612 if (!found && distance < LEA_SEARCH_THRESHOLD)
20614 edge e;
20615 edge_iterator ei;
20616 bool simple_loop = false;
20618 FOR_EACH_EDGE (e, ei, bb->preds)
20619 if (e->src == bb)
20621 simple_loop = true;
20622 break;
20625 if (simple_loop)
20626 distance = distance_non_agu_define_in_bb (regno1, regno2,
20627 insn, distance,
20628 BB_END (bb), &found);
20629 else
20631 int shortest_dist = -1;
20632 bool found_in_bb = false;
20634 FOR_EACH_EDGE (e, ei, bb->preds)
20636 int bb_dist
20637 = distance_non_agu_define_in_bb (regno1, regno2,
20638 insn, distance,
20639 BB_END (e->src),
20640 &found_in_bb);
20641 if (found_in_bb)
20643 if (shortest_dist < 0)
20644 shortest_dist = bb_dist;
20645 else if (bb_dist > 0)
20646 shortest_dist = MIN (bb_dist, shortest_dist);
20648 found = true;
20652 distance = shortest_dist;
20656 /* get_attr_type may modify recog data. We want to make sure
20657 that recog data is valid for instruction INSN, on which
20658 distance_non_agu_define is called. INSN is unchanged here. */
20659 extract_insn_cached (insn);
20661 if (!found)
20662 return -1;
20664 return distance >> 1;
20667 /* Return the distance in half-cycles between INSN and the next
20668 insn that uses register number REGNO in a memory address, added
20669 to DISTANCE. Return -1 if REGNO is set.
20671 Set *FOUND to true if a register usage was found and
20672 to false otherwise.
20673 Set *REDEFINED to true if a register redefinition was
20674 found and to false otherwise. */
20676 static int
20677 distance_agu_use_in_bb (unsigned int regno,
20678 rtx_insn *insn, int distance, rtx_insn *start,
20679 bool *found, bool *redefined)
20681 basic_block bb = NULL;
20682 rtx_insn *next = start;
20683 rtx_insn *prev = NULL;
20685 *found = false;
20686 *redefined = false;
20688 if (start != NULL_RTX)
20690 bb = BLOCK_FOR_INSN (start);
20691 if (start != BB_HEAD (bb))
20692 /* If insn and start belong to the same bb, set prev to insn,
20693 so the call to increase_distance will increase the distance
20694 between insns by 1. */
20695 prev = insn;
20698 while (next
20699 && next != insn
20700 && distance < LEA_SEARCH_THRESHOLD)
20702 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20704 distance = increase_distance(prev, next, distance);
20705 if (insn_uses_reg_mem (regno, next))
20707 /* Return DISTANCE if OP0 is used in memory
20708 address in NEXT. */
20709 *found = true;
20710 return distance;
20713 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20715 /* Return -1 if OP0 is set in NEXT. */
20716 *redefined = true;
20717 return -1;
20720 prev = next;
20723 if (next == BB_END (bb))
20724 break;
20726 next = NEXT_INSN (next);
20729 return distance;
20732 /* Return the distance between INSN and the next insn that uses
20733 register number REGNO0 in a memory address. Return -1 if no such
20734 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20736 static int
20737 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20739 basic_block bb = BLOCK_FOR_INSN (insn);
20740 int distance = 0;
20741 bool found = false;
20742 bool redefined = false;
20744 if (insn != BB_END (bb))
20745 distance = distance_agu_use_in_bb (regno0, insn, distance,
20746 NEXT_INSN (insn),
20747 &found, &redefined);
20749 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20751 edge e;
20752 edge_iterator ei;
20753 bool simple_loop = false;
20755 FOR_EACH_EDGE (e, ei, bb->succs)
20756 if (e->dest == bb)
20758 simple_loop = true;
20759 break;
20762 if (simple_loop)
20763 distance = distance_agu_use_in_bb (regno0, insn,
20764 distance, BB_HEAD (bb),
20765 &found, &redefined);
20766 else
20768 int shortest_dist = -1;
20769 bool found_in_bb = false;
20770 bool redefined_in_bb = false;
20772 FOR_EACH_EDGE (e, ei, bb->succs)
20774 int bb_dist
20775 = distance_agu_use_in_bb (regno0, insn,
20776 distance, BB_HEAD (e->dest),
20777 &found_in_bb, &redefined_in_bb);
20778 if (found_in_bb)
20780 if (shortest_dist < 0)
20781 shortest_dist = bb_dist;
20782 else if (bb_dist > 0)
20783 shortest_dist = MIN (bb_dist, shortest_dist);
20785 found = true;
20789 distance = shortest_dist;
20793 if (!found || redefined)
20794 return -1;
20796 return distance >> 1;
20799 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
20800 there is a dilemma of choosing LEA or ADD.
20801 Negative value: ADD is preferred over LEA
20802 Zero: Neutral
20803 Positive value: LEA is preferred over ADD. */
20804 #define IX86_LEA_PRIORITY 0
20806 /* Return true if using lea INSN has a performance advantage
20807 over a sequence of instructions. The instruction sequence has
20808 SPLIT_COST cycles higher latency than the lea latency. */
20810 static bool
20811 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20812 unsigned int regno2, int split_cost, bool has_scale)
20814 int dist_define, dist_use;
20816 /* For Silvermont, if a 2-source or 3-source LEA is used for
20817 non-destructive destination purposes, or because the
20818 ability to use SCALE is wanted, the use of LEA is justified. */
20819 if (TARGET_SILVERMONT || TARGET_INTEL)
20821 if (has_scale)
20822 return true;
20823 if (split_cost < 1)
20824 return false;
20825 if (regno0 == regno1 || regno0 == regno2)
20826 return false;
20827 return true;
20830 dist_define = distance_non_agu_define (regno1, regno2, insn);
20831 dist_use = distance_agu_use (regno0, insn);
20833 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20835 /* If there is no non-AGU operand definition, no AGU
20836 operand usage and the split cost is 0, then both the lea
20837 and non-lea variants have the same priority. Currently
20838 we prefer lea for 64-bit code and non-lea for 32-bit
20839 code. */
20840 if (dist_use < 0 && split_cost == 0)
20841 return TARGET_64BIT || IX86_LEA_PRIORITY;
20842 else
20843 return true;
20846 /* The longer the definition distance, the more preferable lea is.
20847 Here we adjust it to take into account the splitting cost and
20848 lea priority. */
20849 dist_define += split_cost + IX86_LEA_PRIORITY;
20851 /* If there is no use in a memory address then we just check
20852 that the split cost exceeds the AGU stall. */
20853 if (dist_use < 0)
20854 return dist_define > LEA_MAX_STALL;
20856 /* If this insn has both a backward non-agu dependence and a forward
20857 agu dependence, the one with the shorter distance takes effect. */
20858 return dist_define >= dist_use;
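/* Illustrative example of the final comparison above (numbers are
   hypothetical): if the last non-AGU definition of a source register is
   2 half-cycles before INSN (dist_define = 2), split_cost is 1 and
   IX86_LEA_PRIORITY is 0, then dist_define becomes 2 + 1 + 0 = 3.  With
   the first AGU use of the destination 4 half-cycles after INSN
   (dist_use = 4), 3 >= 4 is false and splitting the lea wins; with
   dist_use = 2 the lea would be kept.  */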
20861 /* Return true if it is legal to clobber flags by INSN and
20862 false otherwise. */
20864 static bool
20865 ix86_ok_to_clobber_flags (rtx_insn *insn)
20867 basic_block bb = BLOCK_FOR_INSN (insn);
20868 df_ref use;
20869 bitmap live;
20871 while (insn)
20873 if (NONDEBUG_INSN_P (insn))
20875 FOR_EACH_INSN_USE (use, insn)
20876 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20877 return false;
20879 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20880 return true;
20883 if (insn == BB_END (bb))
20884 break;
20886 insn = NEXT_INSN (insn);
20889 live = df_get_live_out(bb);
20890 return !REGNO_REG_SET_P (live, FLAGS_REG);
20893 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20894 move and add to avoid AGU stalls. */
20896 bool
20897 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20899 unsigned int regno0, regno1, regno2;
20901 /* Check if we need to optimize. */
20902 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20903 return false;
20905 /* Check it is correct to split here. */
20906 if (!ix86_ok_to_clobber_flags(insn))
20907 return false;
20909 regno0 = true_regnum (operands[0]);
20910 regno1 = true_regnum (operands[1]);
20911 regno2 = true_regnum (operands[2]);
20913 /* We only need to split adds with a non-destructive
20914 destination operand. */
20915 if (regno0 == regno1 || regno0 == regno2)
20916 return false;
20917 else
20918 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20921 /* Return true if we should emit lea instruction instead of mov
20922 instruction. */
20924 bool
20925 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20927 unsigned int regno0, regno1;
20929 /* Check if we need to optimize. */
20930 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20931 return false;
20933 /* Use lea for reg to reg moves only. */
20934 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20935 return false;
20937 regno0 = true_regnum (operands[0]);
20938 regno1 = true_regnum (operands[1]);
20940 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20943 /* Return true if we need to split lea into a sequence of
20944 instructions to avoid AGU stalls. */
20946 bool
20947 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20949 unsigned int regno0, regno1, regno2;
20950 int split_cost;
20951 struct ix86_address parts;
20952 int ok;
20954 /* Check we need to optimize. */
20955 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20956 return false;
20958 /* The "at least two components" test below might not catch simple
20959 move or zero extension insns if parts.base is non-NULL and parts.disp
20960 is const0_rtx as the only components in the address, e.g. if the
20961 register is %rbp or %r13. As this test is much cheaper and moves or
20962 zero extensions are the common case, do this check first. */
20963 if (REG_P (operands[1])
20964 || (SImode_address_operand (operands[1], VOIDmode)
20965 && REG_P (XEXP (operands[1], 0))))
20966 return false;
20968 /* Check if it is OK to split here. */
20969 if (!ix86_ok_to_clobber_flags (insn))
20970 return false;
20972 ok = ix86_decompose_address (operands[1], &parts);
20973 gcc_assert (ok);
20975 /* There should be at least two components in the address. */
20976 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20977 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20978 return false;
20980 /* We should not split into add if a non-legitimate PIC
20981 operand is used as the displacement. */
20982 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20983 return false;
20985 regno0 = true_regnum (operands[0]);
20986 regno1 = INVALID_REGNUM;
20987 regno2 = INVALID_REGNUM;
20989 if (parts.base)
20990 regno1 = true_regnum (parts.base);
20991 if (parts.index)
20992 regno2 = true_regnum (parts.index);
20994 split_cost = 0;
20996 /* Compute how many cycles we will add to the execution time
20997 if we split the lea into a sequence of instructions. */
20998 if (parts.base || parts.index)
21000 /* Have to use a mov instruction if the non-destructive
21001 destination form is used. */
21002 if (regno1 != regno0 && regno2 != regno0)
21003 split_cost += 1;
21005 /* Have to add index to base if both exist. */
21006 if (parts.base && parts.index)
21007 split_cost += 1;
21009 /* Have to use shift and adds if scale is 2 or greater. */
21010 if (parts.scale > 1)
21012 if (regno0 != regno1)
21013 split_cost += 1;
21014 else if (regno2 == regno0)
21015 split_cost += 4;
21016 else
21017 split_cost += parts.scale;
21020 /* Have to use add instruction with immediate if
21021 disp is non zero. */
21022 if (parts.disp && parts.disp != const0_rtx)
21023 split_cost += 1;
21025 /* Subtract the price of lea. */
21026 split_cost -= 1;
21029 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21030 parts.scale > 1);
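/* Worked example of the SPLIT_COST computed above (hypothetical insn):
   for "lea 4(%rbx,%rcx,2), %rax", with %rax distinct from both %rbx and
   %rcx, the cost is
     +1  mov for the non-destructive destination
     +1  add of the index to the base
     +1  shift for the scale (regno0 != regno1)
     +1  add of the displacement
     -1  minus the lea that is removed
   giving split_cost = 3, which ix86_lea_outperforms then weighs against
   the AGU stall distances.  */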
21033 /* Emit x86 binary operand CODE in mode MODE, where the first operand
21034 matches destination. RTX includes clobber of FLAGS_REG. */
21036 static void
21037 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21038 rtx dst, rtx src)
21040 rtx op, clob;
21042 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21043 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21045 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
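/* The pattern emitted above has the shape
     (parallel [(set DST (CODE:MODE DST SRC))
                (clobber (reg:CC FLAGS_REG))])
   i.e. an ordinary two-operand ALU operation that also clobbers the
   flags register, which is why callers first verify that the flags are
   dead (see ix86_ok_to_clobber_flags).  */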
21048 /* Return true if regno1 def is nearest to the insn. */
21050 static bool
21051 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21053 rtx_insn *prev = insn;
21054 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21056 if (insn == start)
21057 return false;
21058 while (prev && prev != start)
21060 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21062 prev = PREV_INSN (prev);
21063 continue;
21065 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21066 return true;
21067 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21068 return false;
21069 prev = PREV_INSN (prev);
21072 /* None of the regs is defined in the bb. */
21073 return false;
21076 /* Split lea instructions into a sequence of instructions
21077 which are executed on ALU to avoid AGU stalls.
21078 It is assumed that it is allowed to clobber flags register
21079 at lea position. */
21081 void
21082 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21084 unsigned int regno0, regno1, regno2;
21085 struct ix86_address parts;
21086 rtx target, tmp;
21087 int ok, adds;
21089 ok = ix86_decompose_address (operands[1], &parts);
21090 gcc_assert (ok);
21092 target = gen_lowpart (mode, operands[0]);
21094 regno0 = true_regnum (target);
21095 regno1 = INVALID_REGNUM;
21096 regno2 = INVALID_REGNUM;
21098 if (parts.base)
21100 parts.base = gen_lowpart (mode, parts.base);
21101 regno1 = true_regnum (parts.base);
21104 if (parts.index)
21106 parts.index = gen_lowpart (mode, parts.index);
21107 regno2 = true_regnum (parts.index);
21110 if (parts.disp)
21111 parts.disp = gen_lowpart (mode, parts.disp);
21113 if (parts.scale > 1)
21115 /* Case r1 = r1 + ... */
21116 if (regno1 == regno0)
21118 /* If we have the case r1 = r1 + C * r2 then we
21119 would have to use multiplication, which is very
21120 expensive. Assume the cost model is wrong if we
21121 have such a case here. */
21122 gcc_assert (regno2 != regno0);
21124 for (adds = parts.scale; adds > 0; adds--)
21125 ix86_emit_binop (PLUS, mode, target, parts.index);
21127 else
21129 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21130 if (regno0 != regno2)
21131 emit_insn (gen_rtx_SET (target, parts.index));
21133 /* Use shift for scaling. */
21134 ix86_emit_binop (ASHIFT, mode, target,
21135 GEN_INT (exact_log2 (parts.scale)));
21137 if (parts.base)
21138 ix86_emit_binop (PLUS, mode, target, parts.base);
21140 if (parts.disp && parts.disp != const0_rtx)
21141 ix86_emit_binop (PLUS, mode, target, parts.disp);
21144 else if (!parts.base && !parts.index)
21146 gcc_assert(parts.disp);
21147 emit_insn (gen_rtx_SET (target, parts.disp));
21149 else
21151 if (!parts.base)
21153 if (regno0 != regno2)
21154 emit_insn (gen_rtx_SET (target, parts.index));
21156 else if (!parts.index)
21158 if (regno0 != regno1)
21159 emit_insn (gen_rtx_SET (target, parts.base));
21161 else
21163 if (regno0 == regno1)
21164 tmp = parts.index;
21165 else if (regno0 == regno2)
21166 tmp = parts.base;
21167 else
21169 rtx tmp1;
21171 /* Find the better operand for the SET instruction, depending
21172 on which definition is farther from the insn. */
21173 if (find_nearest_reg_def (insn, regno1, regno2))
21174 tmp = parts.index, tmp1 = parts.base;
21175 else
21176 tmp = parts.base, tmp1 = parts.index;
21178 emit_insn (gen_rtx_SET (target, tmp));
21180 if (parts.disp && parts.disp != const0_rtx)
21181 ix86_emit_binop (PLUS, mode, target, parts.disp);
21183 ix86_emit_binop (PLUS, mode, target, tmp1);
21184 return;
21187 ix86_emit_binop (PLUS, mode, target, tmp);
21190 if (parts.disp && parts.disp != const0_rtx)
21191 ix86_emit_binop (PLUS, mode, target, parts.disp);
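/* As an illustrative example of the splitting above, a three-component
   address such as "lea 8(%rbx,%rcx,4), %rax" (scale > 1, destination
   distinct from base and index) is expanded roughly into
        mov  %rcx, %rax         ; copy the index
        sal  $2, %rax           ; shift for the scale
        add  %rbx, %rax         ; add the base
        add  $8, %rax           ; add the displacement
   so the address computation runs on the ALU instead of the AGU.  */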
21195 /* Return true if it is OK to optimize an ADD operation into an LEA
21196 operation to avoid flag register consumption. For most processors,
21197 ADD is faster than LEA. For processors like BONNELL, if the
21198 destination register of the LEA holds an actual address which will be
21199 used soon, LEA is better; otherwise ADD is better. */
21201 bool
21202 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21204 unsigned int regno0 = true_regnum (operands[0]);
21205 unsigned int regno1 = true_regnum (operands[1]);
21206 unsigned int regno2 = true_regnum (operands[2]);
21208 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21209 if (regno0 != regno1 && regno0 != regno2)
21210 return true;
21212 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21213 return false;
21215 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21218 /* Return true if destination reg of SET_BODY is shift count of
21219 USE_BODY. */
21221 static bool
21222 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21224 rtx set_dest;
21225 rtx shift_rtx;
21226 int i;
21228 /* Retrieve destination of SET_BODY. */
21229 switch (GET_CODE (set_body))
21231 case SET:
21232 set_dest = SET_DEST (set_body);
21233 if (!set_dest || !REG_P (set_dest))
21234 return false;
21235 break;
21236 case PARALLEL:
21237 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21238 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21239 use_body))
21240 return true;
21241 default:
21242 return false;
21243 break;
21246 /* Retrieve shift count of USE_BODY. */
21247 switch (GET_CODE (use_body))
21249 case SET:
21250 shift_rtx = XEXP (use_body, 1);
21251 break;
21252 case PARALLEL:
21253 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21254 if (ix86_dep_by_shift_count_body (set_body,
21255 XVECEXP (use_body, 0, i)))
21256 return true;
21257 default:
21258 return false;
21259 break;
21262 if (shift_rtx
21263 && (GET_CODE (shift_rtx) == ASHIFT
21264 || GET_CODE (shift_rtx) == LSHIFTRT
21265 || GET_CODE (shift_rtx) == ASHIFTRT
21266 || GET_CODE (shift_rtx) == ROTATE
21267 || GET_CODE (shift_rtx) == ROTATERT))
21269 rtx shift_count = XEXP (shift_rtx, 1);
21271 /* Return true if shift count is dest of SET_BODY. */
21272 if (REG_P (shift_count))
21274 /* Add a check since this can be invoked before register
21275 allocation by the pre-reload scheduler. */
21276 if (reload_completed
21277 && true_regnum (set_dest) == true_regnum (shift_count))
21278 return true;
21279 else if (REGNO(set_dest) == REGNO(shift_count))
21280 return true;
21284 return false;
21287 /* Return true if destination reg of SET_INSN is shift count of
21288 USE_INSN. */
21290 bool
21291 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21293 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21294 PATTERN (use_insn));
21297 /* Return TRUE or FALSE depending on whether the unary operator meets the
21298 appropriate constraints. */
21300 bool
21301 ix86_unary_operator_ok (enum rtx_code,
21302 machine_mode,
21303 rtx operands[2])
21305 /* If one of operands is memory, source and destination must match. */
21306 if ((MEM_P (operands[0])
21307 || MEM_P (operands[1]))
21308 && ! rtx_equal_p (operands[0], operands[1]))
21309 return false;
21310 return true;
21313 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21314 are ok, keeping in mind the possible movddup alternative. */
21316 bool
21317 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21319 if (MEM_P (operands[0]))
21320 return rtx_equal_p (operands[0], operands[1 + high]);
21321 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21322 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21323 return true;
21326 /* Post-reload splitter for converting an SF or DFmode value in an
21327 SSE register into an unsigned SImode. */
21329 void
21330 ix86_split_convert_uns_si_sse (rtx operands[])
21332 machine_mode vecmode;
21333 rtx value, large, zero_or_two31, input, two31, x;
21335 large = operands[1];
21336 zero_or_two31 = operands[2];
21337 input = operands[3];
21338 two31 = operands[4];
21339 vecmode = GET_MODE (large);
21340 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21342 /* Load up the value into the low element. We must ensure that the other
21343 elements are valid floats -- zero is the easiest such value. */
21344 if (MEM_P (input))
21346 if (vecmode == V4SFmode)
21347 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21348 else
21349 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21351 else
21353 input = gen_rtx_REG (vecmode, REGNO (input));
21354 emit_move_insn (value, CONST0_RTX (vecmode));
21355 if (vecmode == V4SFmode)
21356 emit_insn (gen_sse_movss (value, value, input));
21357 else
21358 emit_insn (gen_sse2_movsd (value, value, input));
21361 emit_move_insn (large, two31);
21362 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21364 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21365 emit_insn (gen_rtx_SET (large, x));
21367 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21368 emit_insn (gen_rtx_SET (zero_or_two31, x));
21370 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21371 emit_insn (gen_rtx_SET (value, x));
21373 large = gen_rtx_REG (V4SImode, REGNO (large));
21374 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21376 x = gen_rtx_REG (V4SImode, REGNO (value));
21377 if (vecmode == V4SFmode)
21378 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21379 else
21380 emit_insn (gen_sse2_cvttpd2dq (x, value));
21381 value = x;
21383 emit_insn (gen_xorv4si3 (value, value, large));
21386 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21387 Expects the 64-bit DImode to be supplied in a pair of integral
21388 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21389 -mfpmath=sse, !optimize_size only. */
21391 void
21392 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21394 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21395 rtx int_xmm, fp_xmm;
21396 rtx biases, exponents;
21397 rtx x;
21399 int_xmm = gen_reg_rtx (V4SImode);
21400 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21401 emit_insn (gen_movdi_to_sse (int_xmm, input));
21402 else if (TARGET_SSE_SPLIT_REGS)
21404 emit_clobber (int_xmm);
21405 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21407 else
21409 x = gen_reg_rtx (V2DImode);
21410 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21411 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21414 x = gen_rtx_CONST_VECTOR (V4SImode,
21415 gen_rtvec (4, GEN_INT (0x43300000UL),
21416 GEN_INT (0x45300000UL),
21417 const0_rtx, const0_rtx));
21418 exponents = validize_mem (force_const_mem (V4SImode, x));
21420 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21421 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21423 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21424 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21425 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21426 (0x1.0p84 + double(fp_value_hi_xmm)).
21427 Note these exponents differ by 32. */
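/* A worked example of the bias trick (illustrative): for the 64-bit
   input 0x0000000100000002 (hi = 1, lo = 2) the two doubles built by
   the interleave are
     0x4330000000000002 = 0x1.0p52 + 2.0
     0x4530000000000001 = 0x1.0p84 + 1.0 * 0x1.0p32
   Subtracting the 0x1.0p52 and 0x1.0p84 biases leaves 2.0 and
   4294967296.0, whose sum 4294967298.0 is exactly the unsigned value of
   the input.  */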
21429 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21431 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21432 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21433 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21434 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21435 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21436 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21437 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21438 biases = validize_mem (force_const_mem (V2DFmode, biases));
21439 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21441 /* Add the upper and lower DFmode values together. */
21442 if (TARGET_SSE3)
21443 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21444 else
21446 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21447 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21448 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21451 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21454 /* Not used, but eases macroization of patterns. */
21455 void
21456 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21458 gcc_unreachable ();
21461 /* Convert an unsigned SImode value into a DFmode. Only currently used
21462 for SSE, but applicable anywhere. */
21464 void
21465 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21467 REAL_VALUE_TYPE TWO31r;
21468 rtx x, fp;
21470 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21471 NULL, 1, OPTAB_DIRECT);
21473 fp = gen_reg_rtx (DFmode);
21474 emit_insn (gen_floatsidf2 (fp, x));
21476 real_ldexp (&TWO31r, &dconst1, 31);
21477 x = const_double_from_real_value (TWO31r, DFmode);
21479 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21480 if (x != target)
21481 emit_move_insn (target, x);
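/* Worked example (illustrative): converting the unsigned value
   0xffffffff, the PLUS above adds -2^31 and wraps to the signed value
   0x7fffffff = 2147483647, which converts exactly to 2147483647.0;
   adding back 2^31 = 2147483648.0 yields 4294967295.0, the desired
   result.  */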
21484 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21485 32-bit mode; otherwise we have a direct convert instruction. */
21487 void
21488 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21490 REAL_VALUE_TYPE TWO32r;
21491 rtx fp_lo, fp_hi, x;
21493 fp_lo = gen_reg_rtx (DFmode);
21494 fp_hi = gen_reg_rtx (DFmode);
21496 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21498 real_ldexp (&TWO32r, &dconst1, 32);
21499 x = const_double_from_real_value (TWO32r, DFmode);
21500 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21502 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21504 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21505 0, OPTAB_DIRECT);
21506 if (x != target)
21507 emit_move_insn (target, x);
21510 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21511 For x86_32, -mfpmath=sse, !optimize_size only. */
21512 void
21513 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21515 REAL_VALUE_TYPE ONE16r;
21516 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21518 real_ldexp (&ONE16r, &dconst1, 16);
21519 x = const_double_from_real_value (ONE16r, SFmode);
21520 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21521 NULL, 0, OPTAB_DIRECT);
21522 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21523 NULL, 0, OPTAB_DIRECT);
21524 fp_hi = gen_reg_rtx (SFmode);
21525 fp_lo = gen_reg_rtx (SFmode);
21526 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21527 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21528 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21529 0, OPTAB_DIRECT);
21530 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21531 0, OPTAB_DIRECT);
21532 if (!rtx_equal_p (target, fp_hi))
21533 emit_move_insn (target, fp_hi);
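/* The expansion above computes, in effect,
     (float) (x >> 16) * 65536.0f + (float) (x & 0xffff)
   Both halves fit in 16 bits, so the signed conversions and the multiply
   by 2^16 are exact; only the final addition rounds.  For example
   (illustrative), 0x80000001 gives 32768.0f * 65536.0f + 1.0f
   = 2147483649.0, which rounds to the SFmode value 2147483648.0f.  */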
21536 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21537 a vector of unsigned ints VAL to vector of floats TARGET. */
21539 void
21540 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21542 rtx tmp[8];
21543 REAL_VALUE_TYPE TWO16r;
21544 machine_mode intmode = GET_MODE (val);
21545 machine_mode fltmode = GET_MODE (target);
21546 rtx (*cvt) (rtx, rtx);
21548 if (intmode == V4SImode)
21549 cvt = gen_floatv4siv4sf2;
21550 else
21551 cvt = gen_floatv8siv8sf2;
21552 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21553 tmp[0] = force_reg (intmode, tmp[0]);
21554 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21555 OPTAB_DIRECT);
21556 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21557 NULL_RTX, 1, OPTAB_DIRECT);
21558 tmp[3] = gen_reg_rtx (fltmode);
21559 emit_insn (cvt (tmp[3], tmp[1]));
21560 tmp[4] = gen_reg_rtx (fltmode);
21561 emit_insn (cvt (tmp[4], tmp[2]));
21562 real_ldexp (&TWO16r, &dconst1, 16);
21563 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21564 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21565 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21566 OPTAB_DIRECT);
21567 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21568 OPTAB_DIRECT);
21569 if (tmp[7] != target)
21570 emit_move_insn (target, tmp[7]);
21573 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21574 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21575 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21576 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21579 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21581 REAL_VALUE_TYPE TWO31r;
21582 rtx two31r, tmp[4];
21583 machine_mode mode = GET_MODE (val);
21584 machine_mode scalarmode = GET_MODE_INNER (mode);
21585 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21586 rtx (*cmp) (rtx, rtx, rtx, rtx);
21587 int i;
21589 for (i = 0; i < 3; i++)
21590 tmp[i] = gen_reg_rtx (mode);
21591 real_ldexp (&TWO31r, &dconst1, 31);
21592 two31r = const_double_from_real_value (TWO31r, scalarmode);
21593 two31r = ix86_build_const_vector (mode, 1, two31r);
21594 two31r = force_reg (mode, two31r);
21595 switch (mode)
21597 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21598 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21599 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21600 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21601 default: gcc_unreachable ();
21603 tmp[3] = gen_rtx_LE (mode, two31r, val);
21604 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21605 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21606 0, OPTAB_DIRECT);
21607 if (intmode == V4SImode || TARGET_AVX2)
21608 *xorp = expand_simple_binop (intmode, ASHIFT,
21609 gen_lowpart (intmode, tmp[0]),
21610 GEN_INT (31), NULL_RTX, 0,
21611 OPTAB_DIRECT);
21612 else
21614 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21615 two31 = ix86_build_const_vector (intmode, 1, two31);
21616 *xorp = expand_simple_binop (intmode, AND,
21617 gen_lowpart (intmode, tmp[0]),
21618 two31, NULL_RTX, 0,
21619 OPTAB_DIRECT);
21621 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21622 0, OPTAB_DIRECT);
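/* Illustrative example of the adjustment above: for a lane holding
   3e9 (>= 2^31) the masked compare selects 2^31, so the returned value
   is 3e9 - 2^31 = 852516352.0, which a signed truncation can handle;
   the corresponding lane of *XORP is 0x80000000, and xoring it into the
   truncated integer restores 3000000000.  Lanes below 2^31 subtract 0
   and are xored with 0.  */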
21625 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21626 then replicate the value for all elements of the vector
21627 register. */
21630 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21632 int i, n_elt;
21633 rtvec v;
21634 machine_mode scalar_mode;
21636 switch (mode)
21638 case V64QImode:
21639 case V32QImode:
21640 case V16QImode:
21641 case V32HImode:
21642 case V16HImode:
21643 case V8HImode:
21644 case V16SImode:
21645 case V8SImode:
21646 case V4SImode:
21647 case V8DImode:
21648 case V4DImode:
21649 case V2DImode:
21650 gcc_assert (vect);
21651 /* FALLTHRU */
21652 case V16SFmode:
21653 case V8SFmode:
21654 case V4SFmode:
21655 case V8DFmode:
21656 case V4DFmode:
21657 case V2DFmode:
21658 n_elt = GET_MODE_NUNITS (mode);
21659 v = rtvec_alloc (n_elt);
21660 scalar_mode = GET_MODE_INNER (mode);
21662 RTVEC_ELT (v, 0) = value;
21664 for (i = 1; i < n_elt; ++i)
21665 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21667 return gen_rtx_CONST_VECTOR (mode, v);
21669 default:
21670 gcc_unreachable ();
21674 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21675 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21676 for an SSE register. If VECT is true, then replicate the mask for
21677 all elements of the vector register. If INVERT is true, then create
21678 a mask excluding the sign bit. */
21681 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21683 machine_mode vec_mode, imode;
21684 wide_int w;
21685 rtx mask, v;
21687 switch (mode)
21689 case V16SImode:
21690 case V16SFmode:
21691 case V8SImode:
21692 case V4SImode:
21693 case V8SFmode:
21694 case V4SFmode:
21695 vec_mode = mode;
21696 imode = SImode;
21697 break;
21699 case V8DImode:
21700 case V4DImode:
21701 case V2DImode:
21702 case V8DFmode:
21703 case V4DFmode:
21704 case V2DFmode:
21705 vec_mode = mode;
21706 imode = DImode;
21707 break;
21709 case TImode:
21710 case TFmode:
21711 vec_mode = VOIDmode;
21712 imode = TImode;
21713 break;
21715 default:
21716 gcc_unreachable ();
21719 machine_mode inner_mode = GET_MODE_INNER (mode);
21720 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21721 GET_MODE_BITSIZE (inner_mode));
21722 if (invert)
21723 w = wi::bit_not (w);
21725 /* Force this value into the low part of a fp vector constant. */
21726 mask = immed_wide_int_const (w, imode);
21727 mask = gen_lowpart (inner_mode, mask);
21729 if (vec_mode == VOIDmode)
21730 return force_reg (inner_mode, mask);
21732 v = ix86_build_const_vector (vec_mode, vect, mask);
21733 return force_reg (vec_mode, v);
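/* For example (illustrative), for V4SFmode this returns the vector
   constant { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } when
   INVERT is false (used to flip or copy sign bits) and
   { 0x7fffffff, ... } when INVERT is true (used to clear the sign bit
   for ABS).  */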
21736 /* Generate code for floating point ABS or NEG. */
21738 void
21739 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21740 rtx operands[])
21742 rtx mask, set, dst, src;
21743 bool use_sse = false;
21744 bool vector_mode = VECTOR_MODE_P (mode);
21745 machine_mode vmode = mode;
21747 if (vector_mode)
21748 use_sse = true;
21749 else if (mode == TFmode)
21750 use_sse = true;
21751 else if (TARGET_SSE_MATH)
21753 use_sse = SSE_FLOAT_MODE_P (mode);
21754 if (mode == SFmode)
21755 vmode = V4SFmode;
21756 else if (mode == DFmode)
21757 vmode = V2DFmode;
21760 /* NEG and ABS performed with SSE use bitwise mask operations.
21761 Create the appropriate mask now. */
21762 if (use_sse)
21763 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21764 else
21765 mask = NULL_RTX;
21767 dst = operands[0];
21768 src = operands[1];
21770 set = gen_rtx_fmt_e (code, mode, src);
21771 set = gen_rtx_SET (dst, set);
21773 if (mask)
21775 rtx use, clob;
21776 rtvec par;
21778 use = gen_rtx_USE (VOIDmode, mask);
21779 if (vector_mode)
21780 par = gen_rtvec (2, set, use);
21781 else
21783 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21784 par = gen_rtvec (3, set, use, clob);
21786 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21788 else
21789 emit_insn (set);
21792 /* Expand a copysign operation. Special case operand 0 being a constant. */
21794 void
21795 ix86_expand_copysign (rtx operands[])
21797 machine_mode mode, vmode;
21798 rtx dest, op0, op1, mask, nmask;
21800 dest = operands[0];
21801 op0 = operands[1];
21802 op1 = operands[2];
21804 mode = GET_MODE (dest);
21806 if (mode == SFmode)
21807 vmode = V4SFmode;
21808 else if (mode == DFmode)
21809 vmode = V2DFmode;
21810 else
21811 vmode = mode;
21813 if (CONST_DOUBLE_P (op0))
21815 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21817 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21818 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21820 if (mode == SFmode || mode == DFmode)
21822 if (op0 == CONST0_RTX (mode))
21823 op0 = CONST0_RTX (vmode);
21824 else
21826 rtx v = ix86_build_const_vector (vmode, false, op0);
21828 op0 = force_reg (vmode, v);
21831 else if (op0 != CONST0_RTX (mode))
21832 op0 = force_reg (mode, op0);
21834 mask = ix86_build_signbit_mask (vmode, 0, 0);
21836 if (mode == SFmode)
21837 copysign_insn = gen_copysignsf3_const;
21838 else if (mode == DFmode)
21839 copysign_insn = gen_copysigndf3_const;
21840 else
21841 copysign_insn = gen_copysigntf3_const;
21843 emit_insn (copysign_insn (dest, op0, op1, mask));
21845 else
21847 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21849 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21850 mask = ix86_build_signbit_mask (vmode, 0, 0);
21852 if (mode == SFmode)
21853 copysign_insn = gen_copysignsf3_var;
21854 else if (mode == DFmode)
21855 copysign_insn = gen_copysigndf3_var;
21856 else
21857 copysign_insn = gen_copysigntf3_var;
21859 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21863 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21864 be a constant, and so has already been expanded into a vector constant. */
21866 void
21867 ix86_split_copysign_const (rtx operands[])
21869 machine_mode mode, vmode;
21870 rtx dest, op0, mask, x;
21872 dest = operands[0];
21873 op0 = operands[1];
21874 mask = operands[3];
21876 mode = GET_MODE (dest);
21877 vmode = GET_MODE (mask);
21879 dest = lowpart_subreg (vmode, dest, mode);
21880 x = gen_rtx_AND (vmode, dest, mask);
21881 emit_insn (gen_rtx_SET (dest, x));
21883 if (op0 != CONST0_RTX (vmode))
21885 x = gen_rtx_IOR (vmode, dest, op0);
21886 emit_insn (gen_rtx_SET (dest, x));
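/* The split above is the usual bit-mask form of copysign: the result is
   (sign_source & sign_mask) | abs_constant, i.e. the magnitude comes
   from the constant operand and only the sign bit is taken from the
   variable operand.  */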
21890 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21891 so we have to do two masks. */
21893 void
21894 ix86_split_copysign_var (rtx operands[])
21896 machine_mode mode, vmode;
21897 rtx dest, scratch, op0, op1, mask, nmask, x;
21899 dest = operands[0];
21900 scratch = operands[1];
21901 op0 = operands[2];
21902 op1 = operands[3];
21903 nmask = operands[4];
21904 mask = operands[5];
21906 mode = GET_MODE (dest);
21907 vmode = GET_MODE (mask);
21909 if (rtx_equal_p (op0, op1))
21911 /* Shouldn't happen often (it's useless, obviously), but when it does
21912 we'd generate incorrect code if we continue below. */
21913 emit_move_insn (dest, op0);
21914 return;
21917 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21919 gcc_assert (REGNO (op1) == REGNO (scratch));
21921 x = gen_rtx_AND (vmode, scratch, mask);
21922 emit_insn (gen_rtx_SET (scratch, x));
21924 dest = mask;
21925 op0 = lowpart_subreg (vmode, op0, mode);
21926 x = gen_rtx_NOT (vmode, dest);
21927 x = gen_rtx_AND (vmode, x, op0);
21928 emit_insn (gen_rtx_SET (dest, x));
21930 else
21932 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21934 x = gen_rtx_AND (vmode, scratch, mask);
21936 else /* alternative 2,4 */
21938 gcc_assert (REGNO (mask) == REGNO (scratch));
21939 op1 = lowpart_subreg (vmode, op1, mode);
21940 x = gen_rtx_AND (vmode, scratch, op1);
21942 emit_insn (gen_rtx_SET (scratch, x));
21944 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21946 dest = lowpart_subreg (vmode, op0, mode);
21947 x = gen_rtx_AND (vmode, dest, nmask);
21949 else /* alternative 3,4 */
21951 gcc_assert (REGNO (nmask) == REGNO (dest));
21952 dest = nmask;
21953 op0 = lowpart_subreg (vmode, op0, mode);
21954 x = gen_rtx_AND (vmode, dest, op0);
21956 emit_insn (gen_rtx_SET (dest, x));
21959 x = gen_rtx_IOR (vmode, dest, scratch);
21960 emit_insn (gen_rtx_SET (dest, x));
21963 /* Return TRUE or FALSE depending on whether the first SET in INSN
21964 has source and destination with matching CC modes, and that the
21965 CC mode is at least as constrained as REQ_MODE. */
21967 bool
21968 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21970 rtx set;
21971 machine_mode set_mode;
21973 set = PATTERN (insn);
21974 if (GET_CODE (set) == PARALLEL)
21975 set = XVECEXP (set, 0, 0);
21976 gcc_assert (GET_CODE (set) == SET);
21977 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21979 set_mode = GET_MODE (SET_DEST (set));
21980 switch (set_mode)
21982 case CCNOmode:
21983 if (req_mode != CCNOmode
21984 && (req_mode != CCmode
21985 || XEXP (SET_SRC (set), 1) != const0_rtx))
21986 return false;
21987 break;
21988 case CCmode:
21989 if (req_mode == CCGCmode)
21990 return false;
21991 /* FALLTHRU */
21992 case CCGCmode:
21993 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21994 return false;
21995 /* FALLTHRU */
21996 case CCGOCmode:
21997 if (req_mode == CCZmode)
21998 return false;
21999 /* FALLTHRU */
22000 case CCZmode:
22001 break;
22003 case CCAmode:
22004 case CCCmode:
22005 case CCOmode:
22006 case CCPmode:
22007 case CCSmode:
22008 if (set_mode != req_mode)
22009 return false;
22010 break;
22012 default:
22013 gcc_unreachable ();
22016 return GET_MODE (SET_SRC (set)) == set_mode;
22019 /* Generate insn patterns to do an integer compare of OPERANDS. */
22021 static rtx
22022 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22024 machine_mode cmpmode;
22025 rtx tmp, flags;
22027 cmpmode = SELECT_CC_MODE (code, op0, op1);
22028 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22030 /* This is very simple, but making the interface the same as in the
22031 FP case makes the rest of the code easier. */
22032 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22033 emit_insn (gen_rtx_SET (flags, tmp));
22035 /* Return the test that should be put into the flags user, i.e.
22036 the bcc, scc, or cmov instruction. */
22037 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22040 /* Figure out whether to use ordered or unordered fp comparisons.
22041 Return the appropriate mode to use. */
22043 machine_mode
22044 ix86_fp_compare_mode (enum rtx_code)
22046 /* ??? In order to make all comparisons reversible, we do all comparisons
22047 non-trapping when compiling for IEEE. Once gcc is able to distinguish
22048 the trapping and nontrapping forms of all comparisons, we can make inequality
22049 comparisons trapping again, since that results in better code when using
22050 FCOM based compares. */
22051 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
22054 machine_mode
22055 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22057 machine_mode mode = GET_MODE (op0);
22059 if (SCALAR_FLOAT_MODE_P (mode))
22061 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22062 return ix86_fp_compare_mode (code);
22065 switch (code)
22067 /* Only zero flag is needed. */
22068 case EQ: /* ZF=0 */
22069 case NE: /* ZF!=0 */
22070 return CCZmode;
22071 /* Codes needing carry flag. */
22072 case GEU: /* CF=0 */
22073 case LTU: /* CF=1 */
22074 /* Detect overflow checks. They need just the carry flag. */
22075 if (GET_CODE (op0) == PLUS
22076 && (rtx_equal_p (op1, XEXP (op0, 0))
22077 || rtx_equal_p (op1, XEXP (op0, 1))))
22078 return CCCmode;
22079 else
22080 return CCmode;
22081 case GTU: /* CF=0 & ZF=0 */
22082 case LEU: /* CF=1 | ZF=1 */
22083 return CCmode;
22084 /* Codes possibly doable only with sign flag when
22085 comparing against zero. */
22086 case GE: /* SF=OF or SF=0 */
22087 case LT: /* SF<>OF or SF=1 */
22088 if (op1 == const0_rtx)
22089 return CCGOCmode;
22090 else
22091 /* For the other cases the carry flag is not required. */
22092 return CCGCmode;
22093 /* Codes doable only with the sign flag when comparing
22094 against zero, but for which we lack a jump instruction,
22095 so we need to use relational tests against overflow,
22096 which thus needs to be zero. */
22097 case GT: /* ZF=0 & SF=OF */
22098 case LE: /* ZF=1 | SF<>OF */
22099 if (op1 == const0_rtx)
22100 return CCNOmode;
22101 else
22102 return CCGCmode;
22103 /* The strcmp pattern does (use flags) and combine may ask us for the proper
22104 mode. */
22105 case USE:
22106 return CCmode;
22107 default:
22108 gcc_unreachable ();
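/* Example of the mapping above (illustrative): for an overflow check
   such as "if (a + b < a)" combine presents (ltu (plus a b) a), which
   matches the PLUS test and yields CCCmode, so only the carry flag of
   the addition has to be preserved; a plain unsigned "a < b" compare
   gets CCmode instead.  */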
22112 /* Return the fixed registers used for condition codes. */
22114 static bool
22115 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22117 *p1 = FLAGS_REG;
22118 *p2 = FPSR_REG;
22119 return true;
22122 /* If two condition code modes are compatible, return a condition code
22123 mode which is compatible with both. Otherwise, return
22124 VOIDmode. */
22126 static machine_mode
22127 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22129 if (m1 == m2)
22130 return m1;
22132 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22133 return VOIDmode;
22135 if ((m1 == CCGCmode && m2 == CCGOCmode)
22136 || (m1 == CCGOCmode && m2 == CCGCmode))
22137 return CCGCmode;
22139 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
22140 return m2;
22141 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
22142 return m1;
22144 switch (m1)
22146 default:
22147 gcc_unreachable ();
22149 case CCmode:
22150 case CCGCmode:
22151 case CCGOCmode:
22152 case CCNOmode:
22153 case CCAmode:
22154 case CCCmode:
22155 case CCOmode:
22156 case CCPmode:
22157 case CCSmode:
22158 case CCZmode:
22159 switch (m2)
22161 default:
22162 return VOIDmode;
22164 case CCmode:
22165 case CCGCmode:
22166 case CCGOCmode:
22167 case CCNOmode:
22168 case CCAmode:
22169 case CCCmode:
22170 case CCOmode:
22171 case CCPmode:
22172 case CCSmode:
22173 case CCZmode:
22174 return CCmode;
22177 case CCFPmode:
22178 case CCFPUmode:
22179 /* These are only compatible with themselves, which we already
22180 checked above. */
22181 return VOIDmode;
22186 /* Return a comparison we can do and that it is equivalent to
22187 swap_condition (code) apart possibly from orderedness.
22188 But, never change orderedness if TARGET_IEEE_FP, returning
22189 UNKNOWN in that case if necessary. */
22191 static enum rtx_code
22192 ix86_fp_swap_condition (enum rtx_code code)
22194 switch (code)
22196 case GT: /* GTU - CF=0 & ZF=0 */
22197 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22198 case GE: /* GEU - CF=0 */
22199 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22200 case UNLT: /* LTU - CF=1 */
22201 return TARGET_IEEE_FP ? UNKNOWN : GT;
22202 case UNLE: /* LEU - CF=1 | ZF=1 */
22203 return TARGET_IEEE_FP ? UNKNOWN : GE;
22204 default:
22205 return swap_condition (code);
22209 /* Return the cost of comparison CODE using the best strategy for performance.
22210 All the following functions use the number of instructions as a cost metric.
22211 In the future this should be tweaked to compute bytes for optimize_size and
22212 to take into account the performance of various instructions on various CPUs. */
22214 static int
22215 ix86_fp_comparison_cost (enum rtx_code code)
22217 int arith_cost;
22219 /* The cost of code using bit-twiddling on %ah. */
22220 switch (code)
22222 case UNLE:
22223 case UNLT:
22224 case LTGT:
22225 case GT:
22226 case GE:
22227 case UNORDERED:
22228 case ORDERED:
22229 case UNEQ:
22230 arith_cost = 4;
22231 break;
22232 case LT:
22233 case NE:
22234 case EQ:
22235 case UNGE:
22236 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22237 break;
22238 case LE:
22239 case UNGT:
22240 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22241 break;
22242 default:
22243 gcc_unreachable ();
22246 switch (ix86_fp_comparison_strategy (code))
22248 case IX86_FPCMP_COMI:
22249 return arith_cost > 4 ? 3 : 2;
22250 case IX86_FPCMP_SAHF:
22251 return arith_cost > 4 ? 4 : 3;
22252 default:
22253 return arith_cost;
22257 /* Return the strategy to use for floating-point comparisons. We assume that
22258 fcomi is always preferable where available, since that is also true when looking at size
22259 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22261 enum ix86_fpcmp_strategy
22262 ix86_fp_comparison_strategy (enum rtx_code)
22264 /* Do fcomi/sahf based test when profitable. */
22266 if (TARGET_CMOVE)
22267 return IX86_FPCMP_COMI;
22269 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22270 return IX86_FPCMP_SAHF;
22272 return IX86_FPCMP_ARITH;
22275 /* Swap, force into registers, or otherwise massage the two operands
22276 to a fp comparison. The operands are updated in place; the new
22277 comparison code is returned. */
22279 static enum rtx_code
22280 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22282 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
22283 rtx op0 = *pop0, op1 = *pop1;
22284 machine_mode op_mode = GET_MODE (op0);
22285 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22287 /* All of the unordered compare instructions only work on registers.
22288 The same is true of the fcomi compare instructions. The XFmode
22289 compare instructions require registers except when comparing
22290 against zero or when converting operand 1 from fixed point to
22291 floating point. */
22293 if (!is_sse
22294 && (fpcmp_mode == CCFPUmode
22295 || (op_mode == XFmode
22296 && ! (standard_80387_constant_p (op0) == 1
22297 || standard_80387_constant_p (op1) == 1)
22298 && GET_CODE (op1) != FLOAT)
22299 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22301 op0 = force_reg (op_mode, op0);
22302 op1 = force_reg (op_mode, op1);
22304 else
22306 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22307 things around if they appear profitable, otherwise force op0
22308 into a register. */
22310 if (standard_80387_constant_p (op0) == 0
22311 || (MEM_P (op0)
22312 && ! (standard_80387_constant_p (op1) == 0
22313 || MEM_P (op1))))
22315 enum rtx_code new_code = ix86_fp_swap_condition (code);
22316 if (new_code != UNKNOWN)
22318 std::swap (op0, op1);
22319 code = new_code;
22323 if (!REG_P (op0))
22324 op0 = force_reg (op_mode, op0);
22326 if (CONSTANT_P (op1))
22328 int tmp = standard_80387_constant_p (op1);
22329 if (tmp == 0)
22330 op1 = validize_mem (force_const_mem (op_mode, op1));
22331 else if (tmp == 1)
22333 if (TARGET_CMOVE)
22334 op1 = force_reg (op_mode, op1);
22336 else
22337 op1 = force_reg (op_mode, op1);
22341 /* Try to rearrange the comparison to make it cheaper. */
22342 if (ix86_fp_comparison_cost (code)
22343 > ix86_fp_comparison_cost (swap_condition (code))
22344 && (REG_P (op1) || can_create_pseudo_p ()))
22346 std::swap (op0, op1);
22347 code = swap_condition (code);
22348 if (!REG_P (op0))
22349 op0 = force_reg (op_mode, op0);
22352 *pop0 = op0;
22353 *pop1 = op1;
22354 return code;
22357 /* Convert comparison codes we use to represent FP comparison to integer
22358 code that will result in proper branch. Return UNKNOWN if no such code
22359 is available. */
22361 enum rtx_code
22362 ix86_fp_compare_code_to_integer (enum rtx_code code)
22364 switch (code)
22366 case GT:
22367 return GTU;
22368 case GE:
22369 return GEU;
22370 case ORDERED:
22371 case UNORDERED:
22372 return code;
22373 break;
22374 case UNEQ:
22375 return EQ;
22376 break;
22377 case UNLT:
22378 return LTU;
22379 break;
22380 case UNLE:
22381 return LEU;
22382 break;
22383 case LTGT:
22384 return NE;
22385 break;
22386 default:
22387 return UNKNOWN;
22391 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22393 static rtx
22394 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22396 machine_mode fpcmp_mode, intcmp_mode;
22397 rtx tmp, tmp2;
22399 fpcmp_mode = ix86_fp_compare_mode (code);
22400 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22402 /* Do fcomi/sahf based test when profitable. */
22403 switch (ix86_fp_comparison_strategy (code))
22405 case IX86_FPCMP_COMI:
22406 intcmp_mode = fpcmp_mode;
22407 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22408 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22409 emit_insn (tmp);
22410 break;
22412 case IX86_FPCMP_SAHF:
22413 intcmp_mode = fpcmp_mode;
22414 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22415 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22417 if (!scratch)
22418 scratch = gen_reg_rtx (HImode);
22419 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22420 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22421 break;
22423 case IX86_FPCMP_ARITH:
22424 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22425 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22426 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22427 if (!scratch)
22428 scratch = gen_reg_rtx (HImode);
22429 emit_insn (gen_rtx_SET (scratch, tmp2));
22431 /* In the unordered case, we have to check C2 for NaNs, which
22432 doesn't happen to work out to anything nice combination-wise.
22433 So do some bit twiddling on the value we've got in AH to come
22434 up with an appropriate set of condition codes. */
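/* As a reminder of the constants used below: the x87 status-word
   condition bits end up in the high byte of the fnstsw result (AH) as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40; C2 is set for unordered (NaN)
   results, so 0x45 masks all three bits and the 0x44 / 0x40 / 0x05 /
   0x04 / 0x01 constants test the combinations each comparison code
   needs.  */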
22436 intcmp_mode = CCNOmode;
22437 switch (code)
22439 case GT:
22440 case UNGT:
22441 if (code == GT || !TARGET_IEEE_FP)
22443 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22444 code = EQ;
22446 else
22448 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22449 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22450 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22451 intcmp_mode = CCmode;
22452 code = GEU;
22454 break;
22455 case LT:
22456 case UNLT:
22457 if (code == LT && TARGET_IEEE_FP)
22459 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22460 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22461 intcmp_mode = CCmode;
22462 code = EQ;
22464 else
22466 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
22467 code = NE;
22469 break;
22470 case GE:
22471 case UNGE:
22472 if (code == GE || !TARGET_IEEE_FP)
22474 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
22475 code = EQ;
22477 else
22479 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22480 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
22481 code = NE;
22483 break;
22484 case LE:
22485 case UNLE:
22486 if (code == LE && TARGET_IEEE_FP)
22488 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22489 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22490 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22491 intcmp_mode = CCmode;
22492 code = LTU;
22494 else
22496 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22497 code = NE;
22499 break;
22500 case EQ:
22501 case UNEQ:
22502 if (code == EQ && TARGET_IEEE_FP)
22504 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22505 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22506 intcmp_mode = CCmode;
22507 code = EQ;
22509 else
22511 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22512 code = NE;
22514 break;
22515 case NE:
22516 case LTGT:
22517 if (code == NE && TARGET_IEEE_FP)
22519 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22520 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
22521 GEN_INT (0x40)));
22522 code = NE;
22524 else
22526 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22527 code = EQ;
22529 break;
22531 case UNORDERED:
22532 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22533 code = NE;
22534 break;
22535 case ORDERED:
22536 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22537 code = EQ;
22538 break;
22540 default:
22541 gcc_unreachable ();
22543 break;
22545 default:
22546 gcc_unreachable();
22549 /* Return the test that should be put into the flags user, i.e.
22550 the bcc, scc, or cmov instruction. */
22551 return gen_rtx_fmt_ee (code, VOIDmode,
22552 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22553 const0_rtx);
22556 static rtx
22557 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22559 rtx ret;
22561 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22562 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22564 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22566 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22567 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22569 else
22570 ret = ix86_expand_int_compare (code, op0, op1);
22572 return ret;
22575 void
22576 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22578 machine_mode mode = GET_MODE (op0);
22579 rtx tmp;
22581 /* Handle the special case of a vector comparison with a boolean result;
22582 transform it using the ptest instruction. */
22583 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22585 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22586 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22588 gcc_assert (code == EQ || code == NE);
22589 /* Generate an XOR since we can't check that one operand is the zero vector. */
22590 tmp = gen_reg_rtx (mode);
22591 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22592 tmp = gen_lowpart (p_mode, tmp);
22593 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22594 gen_rtx_UNSPEC (CCmode,
22595 gen_rtvec (2, tmp, tmp),
22596 UNSPEC_PTEST)));
22597 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22598 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22599 gen_rtx_LABEL_REF (VOIDmode, label),
22600 pc_rtx);
22601 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22602 return;
22605 switch (mode)
22607 case SFmode:
22608 case DFmode:
22609 case XFmode:
22610 case QImode:
22611 case HImode:
22612 case SImode:
22613 simple:
22614 tmp = ix86_expand_compare (code, op0, op1);
22615 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22616 gen_rtx_LABEL_REF (VOIDmode, label),
22617 pc_rtx);
22618 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22619 return;
22621 case DImode:
22622 if (TARGET_64BIT)
22623 goto simple;
22624 /* For a 32-bit target, DImode comparison may be performed in
22625 SSE registers. To allow this we must avoid the split into
22626 SImode, which is achieved by doing the xor in DImode and then
22627 comparing against zero (a form recognized by the
22628 STV pass). We don't compare using xor when optimizing
22629 for size. */
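/* Illustrative example: on -m32 with -mstv, "if (x == y)" for DImode
   x, y is rewritten here as "if ((x ^ y) == 0)"; the DImode xor and
   the compare against zero can then be kept in an SSE register by the
   STV pass instead of being split into two SImode compare+branch
   sequences (sketch of the intent, not a literal dump).  */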
22630 if (!optimize_insn_for_size_p ()
22631 && TARGET_STV
22632 && (code == EQ || code == NE))
22634 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22635 op1 = const0_rtx;
22637 /* FALLTHRU */
22638 case TImode:
22639 /* Expand DImode/TImode branch into multiple compare+branch. */
22641 rtx lo[2], hi[2];
22642 rtx_code_label *label2;
22643 enum rtx_code code1, code2, code3;
22644 machine_mode submode;
22646 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22648 std::swap (op0, op1);
22649 code = swap_condition (code);
22652 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22653 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22655 submode = mode == DImode ? SImode : DImode;
22657 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22658 avoid two branches. This costs one extra insn, so disable when
22659 optimizing for size. */
22661 if ((code == EQ || code == NE)
22662 && (!optimize_insn_for_size_p ()
22663 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22665 rtx xor0, xor1;
22667 xor1 = hi[0];
22668 if (hi[1] != const0_rtx)
22669 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22670 NULL_RTX, 0, OPTAB_WIDEN);
22672 xor0 = lo[0];
22673 if (lo[1] != const0_rtx)
22674 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22675 NULL_RTX, 0, OPTAB_WIDEN);
22677 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22678 NULL_RTX, 0, OPTAB_WIDEN);
22680 ix86_expand_branch (code, tmp, const0_rtx, label);
22681 return;
22684 /* Otherwise, if we are doing a less-than or greater-than-or-equal
22685 comparison, op1 is a constant and the low word is zero, then we
22686 can just examine the high word. Similarly for a low word of -1
22687 and a less-than-or-equal or greater-than comparison. */
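/* Illustrative example (assuming a 32-bit target, so DImode is a
   register pair): "x < 0x500000000ULL" has lo(op1) == 0, so it is
   equivalent to "hi(x) < 5" and only the high word needs to be
   compared.  */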
22689 if (CONST_INT_P (hi[1]))
22690 switch (code)
22692 case LT: case LTU: case GE: case GEU:
22693 if (lo[1] == const0_rtx)
22695 ix86_expand_branch (code, hi[0], hi[1], label);
22696 return;
22698 break;
22699 case LE: case LEU: case GT: case GTU:
22700 if (lo[1] == constm1_rtx)
22702 ix86_expand_branch (code, hi[0], hi[1], label);
22703 return;
22705 break;
22706 default:
22707 break;
22710 /* Otherwise, we need two or three jumps. */
22712 label2 = gen_label_rtx ();
22714 code1 = code;
22715 code2 = swap_condition (code);
22716 code3 = unsigned_condition (code);
22718 switch (code)
22720 case LT: case GT: case LTU: case GTU:
22721 break;
22723 case LE: code1 = LT; code2 = GT; break;
22724 case GE: code1 = GT; code2 = LT; break;
22725 case LEU: code1 = LTU; code2 = GTU; break;
22726 case GEU: code1 = GTU; code2 = LTU; break;
22728 case EQ: code1 = UNKNOWN; code2 = NE; break;
22729 case NE: code2 = UNKNOWN; break;
22731 default:
22732 gcc_unreachable ();
22736 * a < b =>
22737 * if (hi(a) < hi(b)) goto true;
22738 * if (hi(a) > hi(b)) goto false;
22739 * if (lo(a) < lo(b)) goto true;
22740 * false:
22743 if (code1 != UNKNOWN)
22744 ix86_expand_branch (code1, hi[0], hi[1], label);
22745 if (code2 != UNKNOWN)
22746 ix86_expand_branch (code2, hi[0], hi[1], label2);
22748 ix86_expand_branch (code3, lo[0], lo[1], label);
22750 if (code2 != UNKNOWN)
22751 emit_label (label2);
22752 return;
22755 default:
22756 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22757 goto simple;
22761 /* Split branch based on floating point condition. */
22762 void
22763 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
22764 rtx target1, rtx target2, rtx tmp)
22766 rtx condition;
22767 rtx i;
22769 if (target2 != pc_rtx)
22771 std::swap (target1, target2);
22772 code = reverse_condition_maybe_unordered (code);
22775 condition = ix86_expand_fp_compare (code, op1, op2,
22776 tmp);
22778 i = emit_jump_insn (gen_rtx_SET
22779 (pc_rtx,
22780 gen_rtx_IF_THEN_ELSE (VOIDmode,
22781 condition, target1, target2)));
22782 if (split_branch_probability >= 0)
22783 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
22786 void
22787 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22789 rtx ret;
22791 gcc_assert (GET_MODE (dest) == QImode);
22793 ret = ix86_expand_compare (code, op0, op1);
22794 PUT_MODE (ret, QImode);
22795 emit_insn (gen_rtx_SET (dest, ret));
22798 /* Expand comparison setting or clearing carry flag. Return true when
22799 successful and set pop for the operation. */
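/* Illustrative examples of the conversions performed below (sketch):
     a == 0    ->  (unsigned) a < 1		(LTU, borrow = carry flag)
     a >  7U   ->  a >= 8U			(GEU)
     a >= 0    ->  (unsigned) a < 0x80000000	(LTU)
   so the caller can consume the result with sbb/adc style sequences.  */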
22800 static bool
22801 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22803 machine_mode mode =
22804 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22806 /* Do not handle double-mode compares that go through the special path. */
22807 if (mode == (TARGET_64BIT ? TImode : DImode))
22808 return false;
22810 if (SCALAR_FLOAT_MODE_P (mode))
22812 rtx compare_op;
22813 rtx_insn *compare_seq;
22815 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22817 /* Shortcut: the following common codes never translate
22818 into carry-flag compares. */
22819 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22820 || code == ORDERED || code == UNORDERED)
22821 return false;
22823 /* These comparisons require the zero flag; swap the operands so they won't. */
22824 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22825 && !TARGET_IEEE_FP)
22827 std::swap (op0, op1);
22828 code = swap_condition (code);
22831 /* Try to expand the comparison and verify that we end up with
22832 a carry-flag-based comparison. This fails only when we decide
22833 to expand the comparison using arithmetic, which is not a
22834 common scenario. */
22835 start_sequence ();
22836 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22837 compare_seq = get_insns ();
22838 end_sequence ();
22840 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
22841 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
22842 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22843 else
22844 code = GET_CODE (compare_op);
22846 if (code != LTU && code != GEU)
22847 return false;
22849 emit_insn (compare_seq);
22850 *pop = compare_op;
22851 return true;
22854 if (!INTEGRAL_MODE_P (mode))
22855 return false;
22857 switch (code)
22859 case LTU:
22860 case GEU:
22861 break;
22863 /* Convert a==0 into (unsigned)a<1. */
22864 case EQ:
22865 case NE:
22866 if (op1 != const0_rtx)
22867 return false;
22868 op1 = const1_rtx;
22869 code = (code == EQ ? LTU : GEU);
22870 break;
22872 /* Convert a>b into b<a or a>=b+1. */
22873 case GTU:
22874 case LEU:
22875 if (CONST_INT_P (op1))
22877 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22878 /* Bail out on overflow. We can still swap the operands, but that
22879 would force loading the constant into a register. */
22880 if (op1 == const0_rtx
22881 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22882 return false;
22883 code = (code == GTU ? GEU : LTU);
22885 else
22887 std::swap (op0, op1);
22888 code = (code == GTU ? LTU : GEU);
22890 break;
22892 /* Convert a>=0 into (unsigned)a<0x80000000. */
22893 case LT:
22894 case GE:
22895 if (mode == DImode || op1 != const0_rtx)
22896 return false;
22897 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22898 code = (code == LT ? GEU : LTU);
22899 break;
22900 case LE:
22901 case GT:
22902 if (mode == DImode || op1 != constm1_rtx)
22903 return false;
22904 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22905 code = (code == LE ? GEU : LTU);
22906 break;
22908 default:
22909 return false;
22911 /* Swapping the operands may cause a constant to appear as the first operand. */
22912 if (!nonimmediate_operand (op0, VOIDmode))
22914 if (!can_create_pseudo_p ())
22915 return false;
22916 op0 = force_reg (mode, op0);
22918 *pop = ix86_expand_compare (code, op0, op1);
22919 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22920 return true;
22923 bool
22924 ix86_expand_int_movcc (rtx operands[])
22926 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22927 rtx_insn *compare_seq;
22928 rtx compare_op;
22929 machine_mode mode = GET_MODE (operands[0]);
22930 bool sign_bit_compare_p = false;
22931 rtx op0 = XEXP (operands[1], 0);
22932 rtx op1 = XEXP (operands[1], 1);
22934 if (GET_MODE (op0) == TImode
22935 || (GET_MODE (op0) == DImode
22936 && !TARGET_64BIT))
22937 return false;
22939 start_sequence ();
22940 compare_op = ix86_expand_compare (code, op0, op1);
22941 compare_seq = get_insns ();
22942 end_sequence ();
22944 compare_code = GET_CODE (compare_op);
22946 if ((op1 == const0_rtx && (code == GE || code == LT))
22947 || (op1 == constm1_rtx && (code == GT || code == LE)))
22948 sign_bit_compare_p = true;
22950 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22951 HImode insns, we'd be swallowed in word prefix ops. */
22953 if ((mode != HImode || TARGET_FAST_PREFIX)
22954 && (mode != (TARGET_64BIT ? TImode : DImode))
22955 && CONST_INT_P (operands[2])
22956 && CONST_INT_P (operands[3]))
22958 rtx out = operands[0];
22959 HOST_WIDE_INT ct = INTVAL (operands[2]);
22960 HOST_WIDE_INT cf = INTVAL (operands[3]);
22961 HOST_WIDE_INT diff;
22963 diff = ct - cf;
22964 /* Sign-bit compares are better done using shifts than by using
22965 sbb. */
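/* E.g. for "dest = (x < 0) ? -1 : 0" a single arithmetic right shift
   ("sarl $31" style, via emit_store_flag with -1) suffices, with no
   compare at all (illustrative; the exact sequence depends on the
   constants handled below).  */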
22966 if (sign_bit_compare_p
22967 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22969 /* Detect overlap between destination and compare sources. */
22970 rtx tmp = out;
22972 if (!sign_bit_compare_p)
22974 rtx flags;
22975 bool fpcmp = false;
22977 compare_code = GET_CODE (compare_op);
22979 flags = XEXP (compare_op, 0);
22981 if (GET_MODE (flags) == CCFPmode
22982 || GET_MODE (flags) == CCFPUmode)
22984 fpcmp = true;
22985 compare_code
22986 = ix86_fp_compare_code_to_integer (compare_code);
22989 /* To simplify the rest of the code, restrict to the GEU case. */
22990 if (compare_code == LTU)
22992 std::swap (ct, cf);
22993 compare_code = reverse_condition (compare_code);
22994 code = reverse_condition (code);
22996 else
22998 if (fpcmp)
22999 PUT_CODE (compare_op,
23000 reverse_condition_maybe_unordered
23001 (GET_CODE (compare_op)));
23002 else
23003 PUT_CODE (compare_op,
23004 reverse_condition (GET_CODE (compare_op)));
23006 diff = ct - cf;
23008 if (reg_overlap_mentioned_p (out, op0)
23009 || reg_overlap_mentioned_p (out, op1))
23010 tmp = gen_reg_rtx (mode);
23012 if (mode == DImode)
23013 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23014 else
23015 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23016 flags, compare_op));
23018 else
23020 if (code == GT || code == GE)
23021 code = reverse_condition (code);
23022 else
23024 std::swap (ct, cf);
23025 diff = ct - cf;
23027 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23030 if (diff == 1)
23033 * cmpl op0,op1
23034 * sbbl dest,dest
23035 * [addl dest, ct]
23037 * Size 5 - 8.
23039 if (ct)
23040 tmp = expand_simple_binop (mode, PLUS,
23041 tmp, GEN_INT (ct),
23042 copy_rtx (tmp), 1, OPTAB_DIRECT);
23044 else if (cf == -1)
23047 * cmpl op0,op1
23048 * sbbl dest,dest
23049 * orl $ct, dest
23051 * Size 8.
23053 tmp = expand_simple_binop (mode, IOR,
23054 tmp, GEN_INT (ct),
23055 copy_rtx (tmp), 1, OPTAB_DIRECT);
23057 else if (diff == -1 && ct)
23060 * cmpl op0,op1
23061 * sbbl dest,dest
23062 * notl dest
23063 * [addl dest, cf]
23065 * Size 8 - 11.
23067 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23068 if (cf)
23069 tmp = expand_simple_binop (mode, PLUS,
23070 copy_rtx (tmp), GEN_INT (cf),
23071 copy_rtx (tmp), 1, OPTAB_DIRECT);
23073 else
23076 * cmpl op0,op1
23077 * sbbl dest,dest
23078 * [notl dest]
23079 * andl cf - ct, dest
23080 * [addl dest, ct]
23082 * Size 8 - 11.
23085 if (cf == 0)
23087 cf = ct;
23088 ct = 0;
23089 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23092 tmp = expand_simple_binop (mode, AND,
23093 copy_rtx (tmp),
23094 gen_int_mode (cf - ct, mode),
23095 copy_rtx (tmp), 1, OPTAB_DIRECT);
23096 if (ct)
23097 tmp = expand_simple_binop (mode, PLUS,
23098 copy_rtx (tmp), GEN_INT (ct),
23099 copy_rtx (tmp), 1, OPTAB_DIRECT);
23102 if (!rtx_equal_p (tmp, out))
23103 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23105 return true;
23108 if (diff < 0)
23110 machine_mode cmp_mode = GET_MODE (op0);
23111 enum rtx_code new_code;
23113 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23115 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23117 /* We may be reversing an unordered compare to a normal compare, which
23118 is not valid in general (we may convert a non-trapping condition
23119 into a trapping one); however, on i386 we currently
23120 emit all comparisons unordered. */
23121 new_code = reverse_condition_maybe_unordered (code);
23123 else
23124 new_code = ix86_reverse_condition (code, cmp_mode);
23125 if (new_code != UNKNOWN)
23127 std::swap (ct, cf);
23128 diff = -diff;
23129 code = new_code;
23133 compare_code = UNKNOWN;
23134 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23135 && CONST_INT_P (op1))
23137 if (op1 == const0_rtx
23138 && (code == LT || code == GE))
23139 compare_code = code;
23140 else if (op1 == constm1_rtx)
23142 if (code == LE)
23143 compare_code = LT;
23144 else if (code == GT)
23145 compare_code = GE;
23149 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23150 if (compare_code != UNKNOWN
23151 && GET_MODE (op0) == GET_MODE (out)
23152 && (cf == -1 || ct == -1))
23154 /* If the lea code below could be used, only optimize
23155 if it results in a 2-insn sequence. */
23157 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23158 || diff == 3 || diff == 5 || diff == 9)
23159 || (compare_code == LT && ct == -1)
23160 || (compare_code == GE && cf == -1))
23163 * notl op1 (if necessary)
23164 * sarl $31, op1
23165 * orl cf, op1
23167 if (ct != -1)
23169 cf = ct;
23170 ct = -1;
23171 code = reverse_condition (code);
23174 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23176 out = expand_simple_binop (mode, IOR,
23177 out, GEN_INT (cf),
23178 out, 1, OPTAB_DIRECT);
23179 if (out != operands[0])
23180 emit_move_insn (operands[0], out);
23182 return true;
23187 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23188 || diff == 3 || diff == 5 || diff == 9)
23189 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23190 && (mode != DImode
23191 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23194 * xorl dest,dest
23195 * cmpl op1,op2
23196 * setcc dest
23197 * lea cf(dest*(ct-cf)),dest
23199 * Size 14.
23201 * This also catches the degenerate setcc-only case.
23204 rtx tmp;
23205 int nops;
23207 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23209 nops = 0;
23210 /* On x86_64 the lea instruction operates on Pmode, so we need
23211 the arithmetic done in the proper mode to match. */
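/* Illustrative example: for ct = 5, cf = 2 (diff == 3) the setcc
   result in %eax is combined as "leal 2(%eax,%eax,2), %eax", i.e.
   dest = cf + dest * diff, in a single lea (sketch; the exact
   registers and addressing are chosen by force_operand below).  */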
23212 if (diff == 1)
23213 tmp = copy_rtx (out);
23214 else
23216 rtx out1;
23217 out1 = copy_rtx (out);
23218 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23219 nops++;
23220 if (diff & 1)
23222 tmp = gen_rtx_PLUS (mode, tmp, out1);
23223 nops++;
23226 if (cf != 0)
23228 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23229 nops++;
23231 if (!rtx_equal_p (tmp, out))
23233 if (nops == 1)
23234 out = force_operand (tmp, copy_rtx (out));
23235 else
23236 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23238 if (!rtx_equal_p (out, operands[0]))
23239 emit_move_insn (operands[0], copy_rtx (out));
23241 return true;
23245 * General case: Jumpful:
23246 * xorl dest,dest cmpl op1, op2
23247 * cmpl op1, op2 movl ct, dest
23248 * setcc dest jcc 1f
23249 * decl dest movl cf, dest
23250 * andl (cf-ct),dest 1:
23251 * addl ct,dest
23253 * Size 20. Size 14.
23255 * This is reasonably steep, but branch mispredict costs are
23256 * high on modern cpus, so consider failing only if optimizing
23257 * for space.
23260 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23261 && BRANCH_COST (optimize_insn_for_speed_p (),
23262 false) >= 2)
23264 if (cf == 0)
23266 machine_mode cmp_mode = GET_MODE (op0);
23267 enum rtx_code new_code;
23269 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23271 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23273 /* We may be reversing an unordered compare to a normal compare,
23274 which is not valid in general (we may convert a non-trapping
23275 condition into a trapping one); however, on i386 we currently
23276 emit all comparisons unordered. */
23277 new_code = reverse_condition_maybe_unordered (code);
23279 else
23281 new_code = ix86_reverse_condition (code, cmp_mode);
23282 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23283 compare_code = reverse_condition (compare_code);
23286 if (new_code != UNKNOWN)
23288 cf = ct;
23289 ct = 0;
23290 code = new_code;
23294 if (compare_code != UNKNOWN)
23296 /* notl op1 (if needed)
23297 sarl $31, op1
23298 andl (cf-ct), op1
23299 addl ct, op1
23301 For x < 0 (resp. x <= -1) there will be no notl,
23302 so if possible swap the constants to get rid of the
23303 complement.
23304 True/false will be -1/0 while code below (store flag
23305 followed by decrement) is 0/-1, so the constants need
23306 to be exchanged once more. */
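/* Illustrative shape of the sequence for a select like
   "dest = (x < 0) ? 7 : 3":

	sarl	$31, x		; -1 if x < 0, else 0
	andl	$4, x		; 4 == cf - ct after the constant swap
	addl	$3, x		; 7 or 3

   (sketch; notl only appears when the condition has to be
   complemented).  */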
23308 if (compare_code == GE || !cf)
23310 code = reverse_condition (code);
23311 compare_code = LT;
23313 else
23314 std::swap (ct, cf);
23316 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23318 else
23320 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23322 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23323 constm1_rtx,
23324 copy_rtx (out), 1, OPTAB_DIRECT);
23327 out = expand_simple_binop (mode, AND, copy_rtx (out),
23328 gen_int_mode (cf - ct, mode),
23329 copy_rtx (out), 1, OPTAB_DIRECT);
23330 if (ct)
23331 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23332 copy_rtx (out), 1, OPTAB_DIRECT);
23333 if (!rtx_equal_p (out, operands[0]))
23334 emit_move_insn (operands[0], copy_rtx (out));
23336 return true;
23340 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23342 /* Try a few things more with specific constants and a variable. */
23344 optab op;
23345 rtx var, orig_out, out, tmp;
23347 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23348 return false;
23350 /* If one of the two operands is an interesting constant, load a
23351 constant with the above and mask it in with a logical operation. */
23353 if (CONST_INT_P (operands[2]))
23355 var = operands[3];
23356 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23357 operands[3] = constm1_rtx, op = and_optab;
23358 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23359 operands[3] = const0_rtx, op = ior_optab;
23360 else
23361 return false;
23363 else if (CONST_INT_P (operands[3]))
23365 var = operands[2];
23366 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23367 operands[2] = constm1_rtx, op = and_optab;
23368 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
23369 operands[2] = const0_rtx, op = ior_optab;
23370 else
23371 return false;
23373 else
23374 return false;
23376 orig_out = operands[0];
23377 tmp = gen_reg_rtx (mode);
23378 operands[0] = tmp;
23380 /* Recurse to get the constant loaded. */
23381 if (!ix86_expand_int_movcc (operands))
23382 return false;
23384 /* Mask in the interesting variable. */
23385 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23386 OPTAB_WIDEN);
23387 if (!rtx_equal_p (out, orig_out))
23388 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23390 return true;
23394 * For comparison with above,
23396 * movl cf,dest
23397 * movl ct,tmp
23398 * cmpl op1,op2
23399 * cmovcc tmp,dest
23401 * Size 15.
23404 if (! nonimmediate_operand (operands[2], mode))
23405 operands[2] = force_reg (mode, operands[2]);
23406 if (! nonimmediate_operand (operands[3], mode))
23407 operands[3] = force_reg (mode, operands[3]);
23409 if (! register_operand (operands[2], VOIDmode)
23410 && (mode == QImode
23411 || ! register_operand (operands[3], VOIDmode)))
23412 operands[2] = force_reg (mode, operands[2]);
23414 if (mode == QImode
23415 && ! register_operand (operands[3], VOIDmode))
23416 operands[3] = force_reg (mode, operands[3]);
23418 emit_insn (compare_seq);
23419 emit_insn (gen_rtx_SET (operands[0],
23420 gen_rtx_IF_THEN_ELSE (mode,
23421 compare_op, operands[2],
23422 operands[3])));
23423 return true;
23426 /* Swap, force into registers, or otherwise massage the two operands
23427 to an sse comparison with a mask result. Thus we differ a bit from
23428 ix86_prepare_fp_compare_args which expects to produce a flags result.
23430 The DEST operand exists to help determine whether to commute commutative
23431 operators. The POP0/POP1 operands are updated in place. The new
23432 comparison code is returned, or UNKNOWN if not implementable. */
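/* Illustrative example (pre-AVX): "a > b" has no direct cmpps
   predicate, so the operands are swapped and the code becomes LT,
   i.e. a cmpltps with a and b exchanged; the caller only sees the
   swap through the updated operands and the returned code.  */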
23434 static enum rtx_code
23435 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23436 rtx *pop0, rtx *pop1)
23438 switch (code)
23440 case LTGT:
23441 case UNEQ:
23442 /* AVX supports all the needed comparisons. */
23443 if (TARGET_AVX)
23444 break;
23445 /* We have no LTGT as an operator. We could implement it with
23446 NE & ORDERED, but this requires an extra temporary. It's
23447 not clear that it's worth it. */
23448 return UNKNOWN;
23450 case LT:
23451 case LE:
23452 case UNGT:
23453 case UNGE:
23454 /* These are supported directly. */
23455 break;
23457 case EQ:
23458 case NE:
23459 case UNORDERED:
23460 case ORDERED:
23461 /* AVX has 3 operand comparisons, no need to swap anything. */
23462 if (TARGET_AVX)
23463 break;
23464 /* For commutative operators, try to canonicalize the destination
23465 operand to be first in the comparison - this helps reload to
23466 avoid extra moves. */
23467 if (!dest || !rtx_equal_p (dest, *pop1))
23468 break;
23469 /* FALLTHRU */
23471 case GE:
23472 case GT:
23473 case UNLE:
23474 case UNLT:
23475 /* These are not supported directly before AVX, and furthermore
23476 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23477 comparison operands to transform into something that is
23478 supported. */
23479 std::swap (*pop0, *pop1);
23480 code = swap_condition (code);
23481 break;
23483 default:
23484 gcc_unreachable ();
23487 return code;
23490 /* Detect conditional moves that exactly match min/max operational
23491 semantics. Note that this is IEEE safe, as long as we don't
23492 interchange the operands.
23494 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23495 and TRUE if the operation is successful and instructions are emitted. */
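/* Illustrative example: "d = (a < b) ? a : b" maps to minss/minsd
   (UNSPEC_IEEE_MIN), or to plain SMIN when -ffinite-math-only and
   -fno-signed-zeros make the operand order irrelevant; x86 min/max
   are not commutative for NaNs and -0.0, hence the care about not
   interchanging the operands.  */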
23497 static bool
23498 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23499 rtx cmp_op1, rtx if_true, rtx if_false)
23501 machine_mode mode;
23502 bool is_min;
23503 rtx tmp;
23505 if (code == LT)
23507 else if (code == UNGE)
23508 std::swap (if_true, if_false);
23509 else
23510 return false;
23512 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23513 is_min = true;
23514 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23515 is_min = false;
23516 else
23517 return false;
23519 mode = GET_MODE (dest);
23521 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23522 but MODE may be a vector mode and thus not appropriate. */
23523 if (!flag_finite_math_only || flag_signed_zeros)
23525 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23526 rtvec v;
23528 if_true = force_reg (mode, if_true);
23529 v = gen_rtvec (2, if_true, if_false);
23530 tmp = gen_rtx_UNSPEC (mode, v, u);
23532 else
23534 code = is_min ? SMIN : SMAX;
23535 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23538 emit_insn (gen_rtx_SET (dest, tmp));
23539 return true;
23542 /* Expand an sse vector comparison. Return the register with the result. */
23544 static rtx
23545 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23546 rtx op_true, rtx op_false)
23548 machine_mode mode = GET_MODE (dest);
23549 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23551 /* In the general case the result of a comparison can differ from the operands' type. */
23552 machine_mode cmp_mode;
23554 /* In AVX512F the result of comparison is an integer mask. */
23555 bool maskcmp = false;
23556 rtx x;
23558 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23560 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
23561 gcc_assert (cmp_mode != BLKmode);
23563 maskcmp = true;
23565 else
23566 cmp_mode = cmp_ops_mode;
23569 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23570 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23571 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23573 if (optimize
23574 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23575 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23576 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23578 /* Compare patterns for int modes are unspec in AVX512F only. */
23579 if (maskcmp && (code == GT || code == EQ))
23581 rtx (*gen)(rtx, rtx, rtx);
23583 switch (cmp_ops_mode)
23585 case V64QImode:
23586 gcc_assert (TARGET_AVX512BW);
23587 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23588 break;
23589 case V32HImode:
23590 gcc_assert (TARGET_AVX512BW);
23591 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23592 break;
23593 case V16SImode:
23594 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23595 break;
23596 case V8DImode:
23597 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23598 break;
23599 default:
23600 gen = NULL;
23603 if (gen)
23605 emit_insn (gen (dest, cmp_op0, cmp_op1));
23606 return dest;
23609 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23611 if (cmp_mode != mode && !maskcmp)
23613 x = force_reg (cmp_ops_mode, x);
23614 convert_move (dest, x, false);
23616 else
23617 emit_insn (gen_rtx_SET (dest, x));
23619 return dest;
23622 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23623 operations. This is used for both scalar and vector conditional moves. */
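/* Illustrative fallback sequence (pre-SSE4.1, sketch):

	t2 = op_true & cmp
	t3 = ~cmp & op_false
	dest = t3 | t2		; pand/pandn/por

   with SSE4.1+ a single blendv{ps,pd}/pblendvb is used instead, and
   the AVX-512 mask modes go through the vblendm* patterns.  */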
23625 void
23626 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23628 machine_mode mode = GET_MODE (dest);
23629 machine_mode cmpmode = GET_MODE (cmp);
23631 /* In AVX512F the result of comparison is an integer mask. */
23632 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23634 rtx t2, t3, x;
23636 /* If we have an integer mask and an FP value then we need
23637 to cast the mask to the FP mode. */
23638 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23640 cmp = force_reg (cmpmode, cmp);
23641 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23644 if (vector_all_ones_operand (op_true, mode)
23645 && rtx_equal_p (op_false, CONST0_RTX (mode))
23646 && !maskcmp)
23648 emit_insn (gen_rtx_SET (dest, cmp));
23650 else if (op_false == CONST0_RTX (mode)
23651 && !maskcmp)
23653 op_true = force_reg (mode, op_true);
23654 x = gen_rtx_AND (mode, cmp, op_true);
23655 emit_insn (gen_rtx_SET (dest, x));
23657 else if (op_true == CONST0_RTX (mode)
23658 && !maskcmp)
23660 op_false = force_reg (mode, op_false);
23661 x = gen_rtx_NOT (mode, cmp);
23662 x = gen_rtx_AND (mode, x, op_false);
23663 emit_insn (gen_rtx_SET (dest, x));
23665 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23666 && !maskcmp)
23668 op_false = force_reg (mode, op_false);
23669 x = gen_rtx_IOR (mode, cmp, op_false);
23670 emit_insn (gen_rtx_SET (dest, x));
23672 else if (TARGET_XOP
23673 && !maskcmp)
23675 op_true = force_reg (mode, op_true);
23677 if (!nonimmediate_operand (op_false, mode))
23678 op_false = force_reg (mode, op_false);
23680 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23681 op_true,
23682 op_false)));
23684 else
23686 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23687 rtx d = dest;
23689 if (!nonimmediate_operand (op_true, mode))
23690 op_true = force_reg (mode, op_true);
23692 op_false = force_reg (mode, op_false);
23694 switch (mode)
23696 case V4SFmode:
23697 if (TARGET_SSE4_1)
23698 gen = gen_sse4_1_blendvps;
23699 break;
23700 case V2DFmode:
23701 if (TARGET_SSE4_1)
23702 gen = gen_sse4_1_blendvpd;
23703 break;
23704 case V16QImode:
23705 case V8HImode:
23706 case V4SImode:
23707 case V2DImode:
23708 if (TARGET_SSE4_1)
23710 gen = gen_sse4_1_pblendvb;
23711 if (mode != V16QImode)
23712 d = gen_reg_rtx (V16QImode);
23713 op_false = gen_lowpart (V16QImode, op_false);
23714 op_true = gen_lowpart (V16QImode, op_true);
23715 cmp = gen_lowpart (V16QImode, cmp);
23717 break;
23718 case V8SFmode:
23719 if (TARGET_AVX)
23720 gen = gen_avx_blendvps256;
23721 break;
23722 case V4DFmode:
23723 if (TARGET_AVX)
23724 gen = gen_avx_blendvpd256;
23725 break;
23726 case V32QImode:
23727 case V16HImode:
23728 case V8SImode:
23729 case V4DImode:
23730 if (TARGET_AVX2)
23732 gen = gen_avx2_pblendvb;
23733 if (mode != V32QImode)
23734 d = gen_reg_rtx (V32QImode);
23735 op_false = gen_lowpart (V32QImode, op_false);
23736 op_true = gen_lowpart (V32QImode, op_true);
23737 cmp = gen_lowpart (V32QImode, cmp);
23739 break;
23741 case V64QImode:
23742 gen = gen_avx512bw_blendmv64qi;
23743 break;
23744 case V32HImode:
23745 gen = gen_avx512bw_blendmv32hi;
23746 break;
23747 case V16SImode:
23748 gen = gen_avx512f_blendmv16si;
23749 break;
23750 case V8DImode:
23751 gen = gen_avx512f_blendmv8di;
23752 break;
23753 case V8DFmode:
23754 gen = gen_avx512f_blendmv8df;
23755 break;
23756 case V16SFmode:
23757 gen = gen_avx512f_blendmv16sf;
23758 break;
23760 default:
23761 break;
23764 if (gen != NULL)
23766 emit_insn (gen (d, op_false, op_true, cmp));
23767 if (d != dest)
23768 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23770 else
23772 op_true = force_reg (mode, op_true);
23774 t2 = gen_reg_rtx (mode);
23775 if (optimize)
23776 t3 = gen_reg_rtx (mode);
23777 else
23778 t3 = dest;
23780 x = gen_rtx_AND (mode, op_true, cmp);
23781 emit_insn (gen_rtx_SET (t2, x));
23783 x = gen_rtx_NOT (mode, cmp);
23784 x = gen_rtx_AND (mode, x, op_false);
23785 emit_insn (gen_rtx_SET (t3, x));
23787 x = gen_rtx_IOR (mode, t3, t2);
23788 emit_insn (gen_rtx_SET (dest, x));
23793 /* Expand a floating-point conditional move. Return true if successful. */
23795 bool
23796 ix86_expand_fp_movcc (rtx operands[])
23798 machine_mode mode = GET_MODE (operands[0]);
23799 enum rtx_code code = GET_CODE (operands[1]);
23800 rtx tmp, compare_op;
23801 rtx op0 = XEXP (operands[1], 0);
23802 rtx op1 = XEXP (operands[1], 1);
23804 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23806 machine_mode cmode;
23808 /* Since we have no cmove for SSE registers, don't force bad register
23809 allocation just to gain access to it. Deny movcc when the
23810 comparison mode doesn't match the move mode. */
23811 cmode = GET_MODE (op0);
23812 if (cmode == VOIDmode)
23813 cmode = GET_MODE (op1);
23814 if (cmode != mode)
23815 return false;
23817 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23818 if (code == UNKNOWN)
23819 return false;
23821 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23822 operands[2], operands[3]))
23823 return true;
23825 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23826 operands[2], operands[3]);
23827 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23828 return true;
23831 if (GET_MODE (op0) == TImode
23832 || (GET_MODE (op0) == DImode
23833 && !TARGET_64BIT))
23834 return false;
23836 /* The floating point conditional move instructions don't directly
23837 support conditions resulting from a signed integer comparison. */
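/* Illustrative example (sketch): fcmov only understands flag
   combinations such as b/e/be/u and their negations, so a signed
   condition like LT is materialized first, e.g.

	setl	%al
	testb	%al, %al
	fcmovne	%st(1), %st

   which is what the ix86_expand_setcc path below produces.  */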
23839 compare_op = ix86_expand_compare (code, op0, op1);
23840 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23842 tmp = gen_reg_rtx (QImode);
23843 ix86_expand_setcc (tmp, code, op0, op1);
23845 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23848 emit_insn (gen_rtx_SET (operands[0],
23849 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23850 operands[2], operands[3])));
23852 return true;
23855 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23857 static int
23858 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23860 switch (code)
23862 case EQ:
23863 return 0;
23864 case LT:
23865 case LTU:
23866 return 1;
23867 case LE:
23868 case LEU:
23869 return 2;
23870 case NE:
23871 return 4;
23872 case GE:
23873 case GEU:
23874 return 5;
23875 case GT:
23876 case GTU:
23877 return 6;
23878 default:
23879 gcc_unreachable ();
23883 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23885 static int
23886 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23888 switch (code)
23890 case EQ:
23891 return 0x00;
23892 case NE:
23893 return 0x04;
23894 case GT:
23895 return 0x0e;
23896 case LE:
23897 return 0x02;
23898 case GE:
23899 return 0x0d;
23900 case LT:
23901 return 0x01;
23902 case UNLE:
23903 return 0x0a;
23904 case UNLT:
23905 return 0x09;
23906 case UNGE:
23907 return 0x05;
23908 case UNGT:
23909 return 0x06;
23910 case UNEQ:
23911 return 0x18;
23912 case LTGT:
23913 return 0x0c;
23914 case ORDERED:
23915 return 0x07;
23916 case UNORDERED:
23917 return 0x03;
23918 default:
23919 gcc_unreachable ();
23923 /* Return immediate value to be used in UNSPEC_PCMP
23924 for comparison CODE in MODE. */
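/* Illustrative examples (sketch): the value returned here becomes the
   imm8 of vpcmp{b,w,d,q} or vcmp{ps,pd}, e.g. LT on V16SImode yields
   "vpcmpd $1, ..." and UNORDERED on V16SFmode yields
   "vcmpps $3, ...".  */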
23926 static int
23927 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23929 if (FLOAT_MODE_P (mode))
23930 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23931 return ix86_int_cmp_code_to_pcmp_immediate (code);
23934 /* Expand AVX-512 vector comparison. */
23936 bool
23937 ix86_expand_mask_vec_cmp (rtx operands[])
23939 machine_mode mask_mode = GET_MODE (operands[0]);
23940 machine_mode cmp_mode = GET_MODE (operands[2]);
23941 enum rtx_code code = GET_CODE (operands[1]);
23942 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23943 int unspec_code;
23944 rtx unspec;
23946 switch (code)
23948 case LEU:
23949 case GTU:
23950 case GEU:
23951 case LTU:
23952 unspec_code = UNSPEC_UNSIGNED_PCMP;
23953 break;
23955 default:
23956 unspec_code = UNSPEC_PCMP;
23959 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23960 operands[3], imm),
23961 unspec_code);
23962 emit_insn (gen_rtx_SET (operands[0], unspec));
23964 return true;
23967 /* Expand fp vector comparison. */
23969 bool
23970 ix86_expand_fp_vec_cmp (rtx operands[])
23972 enum rtx_code code = GET_CODE (operands[1]);
23973 rtx cmp;
23975 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23976 &operands[2], &operands[3]);
23977 if (code == UNKNOWN)
23979 rtx temp;
23980 switch (GET_CODE (operands[1]))
23982 case LTGT:
23983 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23984 operands[3], NULL, NULL);
23985 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23986 operands[3], NULL, NULL);
23987 code = AND;
23988 break;
23989 case UNEQ:
23990 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23991 operands[3], NULL, NULL);
23992 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23993 operands[3], NULL, NULL);
23994 code = IOR;
23995 break;
23996 default:
23997 gcc_unreachable ();
23999 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24000 OPTAB_DIRECT);
24002 else
24003 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24004 operands[1], operands[2]);
24006 if (operands[0] != cmp)
24007 emit_move_insn (operands[0], cmp);
24009 return true;
24012 static rtx
24013 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24014 rtx op_true, rtx op_false, bool *negate)
24016 machine_mode data_mode = GET_MODE (dest);
24017 machine_mode mode = GET_MODE (cop0);
24018 rtx x;
24020 *negate = false;
24022 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24023 if (TARGET_XOP
24024 && (mode == V16QImode || mode == V8HImode
24025 || mode == V4SImode || mode == V2DImode))
24027 else
24029 /* Canonicalize the comparison to EQ, GT, GTU. */
24030 switch (code)
24032 case EQ:
24033 case GT:
24034 case GTU:
24035 break;
24037 case NE:
24038 case LE:
24039 case LEU:
24040 code = reverse_condition (code);
24041 *negate = true;
24042 break;
24044 case GE:
24045 case GEU:
24046 code = reverse_condition (code);
24047 *negate = true;
24048 /* FALLTHRU */
24050 case LT:
24051 case LTU:
24052 std::swap (cop0, cop1);
24053 code = swap_condition (code);
24054 break;
24056 default:
24057 gcc_unreachable ();
24060 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24061 if (mode == V2DImode)
24063 switch (code)
24065 case EQ:
24066 /* SSE4.1 supports EQ. */
24067 if (!TARGET_SSE4_1)
24068 return NULL;
24069 break;
24071 case GT:
24072 case GTU:
24073 /* SSE4.2 supports GT/GTU. */
24074 if (!TARGET_SSE4_2)
24075 return NULL;
24076 break;
24078 default:
24079 gcc_unreachable ();
24083 /* Unsigned parallel compare is not supported by the hardware.
24084 Play some tricks to turn this into a signed comparison
24085 against 0. */
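/* Illustrative example: unsigned "a > b" on V4SImode is rewritten as
   "(a - 0x80000000) > (b - 0x80000000)" signed, i.e. a psubd of the
   sign-bit mask on both operands followed by pcmpgtd; for the narrow
   modes below, "(a -us b) != 0" (psubusb/psubusw compared against
   zero) is used instead (sketch of the two cases handled below).  */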
24086 if (code == GTU)
24088 cop0 = force_reg (mode, cop0);
24090 switch (mode)
24092 case V16SImode:
24093 case V8DImode:
24094 case V8SImode:
24095 case V4DImode:
24096 case V4SImode:
24097 case V2DImode:
24099 rtx t1, t2, mask;
24100 rtx (*gen_sub3) (rtx, rtx, rtx);
24102 switch (mode)
24104 case V16SImode: gen_sub3 = gen_subv16si3; break;
24105 case V8DImode: gen_sub3 = gen_subv8di3; break;
24106 case V8SImode: gen_sub3 = gen_subv8si3; break;
24107 case V4DImode: gen_sub3 = gen_subv4di3; break;
24108 case V4SImode: gen_sub3 = gen_subv4si3; break;
24109 case V2DImode: gen_sub3 = gen_subv2di3; break;
24110 default:
24111 gcc_unreachable ();
24113 /* Subtract (-(INT MAX) - 1) from both operands to make
24114 them signed. */
24115 mask = ix86_build_signbit_mask (mode, true, false);
24116 t1 = gen_reg_rtx (mode);
24117 emit_insn (gen_sub3 (t1, cop0, mask));
24119 t2 = gen_reg_rtx (mode);
24120 emit_insn (gen_sub3 (t2, cop1, mask));
24122 cop0 = t1;
24123 cop1 = t2;
24124 code = GT;
24126 break;
24128 case V64QImode:
24129 case V32HImode:
24130 case V32QImode:
24131 case V16HImode:
24132 case V16QImode:
24133 case V8HImode:
24134 /* Perform a parallel unsigned saturating subtraction. */
24135 x = gen_reg_rtx (mode);
24136 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24137 cop1)));
24139 cop0 = x;
24140 cop1 = CONST0_RTX (mode);
24141 code = EQ;
24142 *negate = !*negate;
24143 break;
24145 default:
24146 gcc_unreachable ();
24151 if (*negate)
24152 std::swap (op_true, op_false);
24154 /* Allow the comparison to be done in one mode, but the movcc to
24155 happen in another mode. */
24156 if (data_mode == mode)
24158 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24159 op_true, op_false);
24161 else
24163 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24164 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24165 op_true, op_false);
24166 if (GET_MODE (x) == mode)
24167 x = gen_lowpart (data_mode, x);
24170 return x;
24173 /* Expand integer vector comparison. */
24175 bool
24176 ix86_expand_int_vec_cmp (rtx operands[])
24178 rtx_code code = GET_CODE (operands[1]);
24179 bool negate = false;
24180 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24181 operands[3], NULL, NULL, &negate);
24183 if (!cmp)
24184 return false;
24186 if (negate)
24187 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24188 CONST0_RTX (GET_MODE (cmp)),
24189 NULL, NULL, &negate);
24191 gcc_assert (!negate);
24193 if (operands[0] != cmp)
24194 emit_move_insn (operands[0], cmp);
24196 return true;
24199 /* Expand a floating-point vector conditional move; a vcond operation
24200 rather than a movcc operation. */
24202 bool
24203 ix86_expand_fp_vcond (rtx operands[])
24205 enum rtx_code code = GET_CODE (operands[3]);
24206 rtx cmp;
24208 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24209 &operands[4], &operands[5]);
24210 if (code == UNKNOWN)
24212 rtx temp;
24213 switch (GET_CODE (operands[3]))
24215 case LTGT:
24216 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24217 operands[5], operands[0], operands[0]);
24218 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24219 operands[5], operands[1], operands[2]);
24220 code = AND;
24221 break;
24222 case UNEQ:
24223 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24224 operands[5], operands[0], operands[0]);
24225 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24226 operands[5], operands[1], operands[2]);
24227 code = IOR;
24228 break;
24229 default:
24230 gcc_unreachable ();
24232 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24233 OPTAB_DIRECT);
24234 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24235 return true;
24238 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24239 operands[5], operands[1], operands[2]))
24240 return true;
24242 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24243 operands[1], operands[2]);
24244 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24245 return true;
24248 /* Expand a signed/unsigned integral vector conditional move. */
24250 bool
24251 ix86_expand_int_vcond (rtx operands[])
24253 machine_mode data_mode = GET_MODE (operands[0]);
24254 machine_mode mode = GET_MODE (operands[4]);
24255 enum rtx_code code = GET_CODE (operands[3]);
24256 bool negate = false;
24257 rtx x, cop0, cop1;
24259 cop0 = operands[4];
24260 cop1 = operands[5];
24262 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24263 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
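/* Illustrative example: for V4SImode, "x < 0 ? -1 : 0" becomes
   "psrad $31, x" and "x < 0 ? 1 : 0" becomes "psrld $31, x"
   (sketch; the element width determines the shift count below).  */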
24264 if ((code == LT || code == GE)
24265 && data_mode == mode
24266 && cop1 == CONST0_RTX (mode)
24267 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24268 && GET_MODE_UNIT_SIZE (data_mode) > 1
24269 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24270 && (GET_MODE_SIZE (data_mode) == 16
24271 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24273 rtx negop = operands[2 - (code == LT)];
24274 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24275 if (negop == CONST1_RTX (data_mode))
24277 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24278 operands[0], 1, OPTAB_DIRECT);
24279 if (res != operands[0])
24280 emit_move_insn (operands[0], res);
24281 return true;
24283 else if (GET_MODE_INNER (data_mode) != DImode
24284 && vector_all_ones_operand (negop, data_mode))
24286 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24287 operands[0], 0, OPTAB_DIRECT);
24288 if (res != operands[0])
24289 emit_move_insn (operands[0], res);
24290 return true;
24294 if (!nonimmediate_operand (cop1, mode))
24295 cop1 = force_reg (mode, cop1);
24296 if (!general_operand (operands[1], data_mode))
24297 operands[1] = force_reg (data_mode, operands[1]);
24298 if (!general_operand (operands[2], data_mode))
24299 operands[2] = force_reg (data_mode, operands[2]);
24301 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24302 operands[1], operands[2], &negate);
24304 if (!x)
24305 return false;
24307 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24308 operands[2-negate]);
24309 return true;
24312 /* AVX512F does support 64-byte integer vector operations,
24313 thus the longest vector we are faced with is V64QImode. */
24314 #define MAX_VECT_LEN 64
24316 struct expand_vec_perm_d
24318 rtx target, op0, op1;
24319 unsigned char perm[MAX_VECT_LEN];
24320 machine_mode vmode;
24321 unsigned char nelt;
24322 bool one_operand_p;
24323 bool testing_p;
24326 static bool
24327 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24328 struct expand_vec_perm_d *d)
24330 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24331 expanders, so the args are either in d, or in op0, op1 etc. */
24332 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24333 machine_mode maskmode = mode;
24334 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24336 switch (mode)
24338 case V8HImode:
24339 if (TARGET_AVX512VL && TARGET_AVX512BW)
24340 gen = gen_avx512vl_vpermi2varv8hi3;
24341 break;
24342 case V16HImode:
24343 if (TARGET_AVX512VL && TARGET_AVX512BW)
24344 gen = gen_avx512vl_vpermi2varv16hi3;
24345 break;
24346 case V64QImode:
24347 if (TARGET_AVX512VBMI)
24348 gen = gen_avx512bw_vpermi2varv64qi3;
24349 break;
24350 case V32HImode:
24351 if (TARGET_AVX512BW)
24352 gen = gen_avx512bw_vpermi2varv32hi3;
24353 break;
24354 case V4SImode:
24355 if (TARGET_AVX512VL)
24356 gen = gen_avx512vl_vpermi2varv4si3;
24357 break;
24358 case V8SImode:
24359 if (TARGET_AVX512VL)
24360 gen = gen_avx512vl_vpermi2varv8si3;
24361 break;
24362 case V16SImode:
24363 if (TARGET_AVX512F)
24364 gen = gen_avx512f_vpermi2varv16si3;
24365 break;
24366 case V4SFmode:
24367 if (TARGET_AVX512VL)
24369 gen = gen_avx512vl_vpermi2varv4sf3;
24370 maskmode = V4SImode;
24372 break;
24373 case V8SFmode:
24374 if (TARGET_AVX512VL)
24376 gen = gen_avx512vl_vpermi2varv8sf3;
24377 maskmode = V8SImode;
24379 break;
24380 case V16SFmode:
24381 if (TARGET_AVX512F)
24383 gen = gen_avx512f_vpermi2varv16sf3;
24384 maskmode = V16SImode;
24386 break;
24387 case V2DImode:
24388 if (TARGET_AVX512VL)
24389 gen = gen_avx512vl_vpermi2varv2di3;
24390 break;
24391 case V4DImode:
24392 if (TARGET_AVX512VL)
24393 gen = gen_avx512vl_vpermi2varv4di3;
24394 break;
24395 case V8DImode:
24396 if (TARGET_AVX512F)
24397 gen = gen_avx512f_vpermi2varv8di3;
24398 break;
24399 case V2DFmode:
24400 if (TARGET_AVX512VL)
24402 gen = gen_avx512vl_vpermi2varv2df3;
24403 maskmode = V2DImode;
24405 break;
24406 case V4DFmode:
24407 if (TARGET_AVX512VL)
24409 gen = gen_avx512vl_vpermi2varv4df3;
24410 maskmode = V4DImode;
24412 break;
24413 case V8DFmode:
24414 if (TARGET_AVX512F)
24416 gen = gen_avx512f_vpermi2varv8df3;
24417 maskmode = V8DImode;
24419 break;
24420 default:
24421 break;
24424 if (gen == NULL)
24425 return false;
24427 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24428 expanders, so the args are either in d, or in op0, op1 etc. */
24429 if (d)
24431 rtx vec[64];
24432 target = d->target;
24433 op0 = d->op0;
24434 op1 = d->op1;
24435 for (int i = 0; i < d->nelt; ++i)
24436 vec[i] = GEN_INT (d->perm[i]);
24437 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24440 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24441 return true;
24444 /* Expand a variable vector permutation. */
24446 void
24447 ix86_expand_vec_perm (rtx operands[])
24449 rtx target = operands[0];
24450 rtx op0 = operands[1];
24451 rtx op1 = operands[2];
24452 rtx mask = operands[3];
24453 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24454 machine_mode mode = GET_MODE (op0);
24455 machine_mode maskmode = GET_MODE (mask);
24456 int w, e, i;
24457 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24459 /* Number of elements in the vector. */
24460 w = GET_MODE_NUNITS (mode);
24461 e = GET_MODE_UNIT_SIZE (mode);
24462 gcc_assert (w <= 64);
24464 if (TARGET_AVX512F && one_operand_shuffle)
24466 rtx (*gen) (rtx, rtx, rtx) = NULL;
24467 switch (mode)
24469 case V16SImode:
24470 gen = gen_avx512f_permvarv16si;
24471 break;
24472 case V16SFmode:
24473 gen = gen_avx512f_permvarv16sf;
24474 break;
24475 case V8DImode:
24476 gen = gen_avx512f_permvarv8di;
24477 break;
24478 case V8DFmode:
24479 gen = gen_avx512f_permvarv8df;
24480 break;
24481 default:
24482 break;
24484 if (gen != NULL)
24486 emit_insn (gen (target, op0, mask));
24487 return;
24491 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24492 return;
24494 if (TARGET_AVX2)
24496 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24498 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24499 a constant shuffle operand. With a tiny bit of effort we can
24500 use VPERMD instead. A re-interpretation stall for V4DFmode is
24501 unfortunate but there's no avoiding it.
24502 Similarly for V16HImode we don't have instructions for variable
24503 shuffling, while for V32QImode we can use vpshufb; vpshufb;
24504 vpermq; vpor after preparing suitable masks. */
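/* Illustrative example: a V4DImode mask { A B C D } is turned into
   the V8SImode index vector { 2A, 2A+1, 2B, 2B+1, 2C, 2C+1, 2D, 2D+1 }
   by the replicate/double/add-one steps below, so VPERMD moves the
   two SImode halves of each requested DImode element.  */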
24506 if (mode == V16HImode)
24508 maskmode = mode = V32QImode;
24509 w = 32;
24510 e = 1;
24512 else
24514 maskmode = mode = V8SImode;
24515 w = 8;
24516 e = 4;
24518 t1 = gen_reg_rtx (maskmode);
24520 /* Replicate the low bits of the V4DImode mask into V8SImode:
24521 mask = { A B C D }
24522 t1 = { A A B B C C D D }. */
24523 for (i = 0; i < w / 2; ++i)
24524 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24525 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24526 vt = force_reg (maskmode, vt);
24527 mask = gen_lowpart (maskmode, mask);
24528 if (maskmode == V8SImode)
24529 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24530 else
24531 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24533 /* Multiply the shuffle indices by two. */
24534 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24535 OPTAB_DIRECT);
24537 /* Add one to the odd shuffle indices:
24538 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24539 for (i = 0; i < w / 2; ++i)
24541 vec[i * 2] = const0_rtx;
24542 vec[i * 2 + 1] = const1_rtx;
24544 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24545 vt = validize_mem (force_const_mem (maskmode, vt));
24546 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24547 OPTAB_DIRECT);
24549 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24550 operands[3] = mask = t1;
24551 target = gen_reg_rtx (mode);
24552 op0 = gen_lowpart (mode, op0);
24553 op1 = gen_lowpart (mode, op1);
24556 switch (mode)
24558 case V8SImode:
24559 /* The VPERMD and VPERMPS instructions already properly ignore
24560 the high bits of the shuffle elements. No need for us to
24561 perform an AND ourselves. */
24562 if (one_operand_shuffle)
24564 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24565 if (target != operands[0])
24566 emit_move_insn (operands[0],
24567 gen_lowpart (GET_MODE (operands[0]), target));
24569 else
24571 t1 = gen_reg_rtx (V8SImode);
24572 t2 = gen_reg_rtx (V8SImode);
24573 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24574 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24575 goto merge_two;
24577 return;
24579 case V8SFmode:
24580 mask = gen_lowpart (V8SImode, mask);
24581 if (one_operand_shuffle)
24582 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24583 else
24585 t1 = gen_reg_rtx (V8SFmode);
24586 t2 = gen_reg_rtx (V8SFmode);
24587 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24588 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24589 goto merge_two;
24591 return;
24593 case V4SImode:
24594 /* By combining the two 128-bit input vectors into one 256-bit
24595 input vector, we can use VPERMD and VPERMPS for the full
24596 two-operand shuffle. */
24597 t1 = gen_reg_rtx (V8SImode);
24598 t2 = gen_reg_rtx (V8SImode);
24599 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24600 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24601 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24602 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24603 return;
24605 case V4SFmode:
24606 t1 = gen_reg_rtx (V8SFmode);
24607 t2 = gen_reg_rtx (V8SImode);
24608 mask = gen_lowpart (V4SImode, mask);
24609 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24610 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24611 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24612 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24613 return;
24615 case V32QImode:
24616 t1 = gen_reg_rtx (V32QImode);
24617 t2 = gen_reg_rtx (V32QImode);
24618 t3 = gen_reg_rtx (V32QImode);
24619 vt2 = GEN_INT (-128);
24620 for (i = 0; i < 32; i++)
24621 vec[i] = vt2;
24622 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24623 vt = force_reg (V32QImode, vt);
24624 for (i = 0; i < 32; i++)
24625 vec[i] = i < 16 ? vt2 : const0_rtx;
24626 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24627 vt2 = force_reg (V32QImode, vt2);
24628 /* From mask create two adjusted masks, which contain the same
24629 bits as mask in the low 7 bits of each vector element.
24630 The first mask will have the most significant bit clear
24631 if it requests element from the same 128-bit lane
24632 and MSB set if it requests element from the other 128-bit lane.
24633 The second mask will have the opposite values of the MSB,
24634 and additionally will have its 128-bit lanes swapped.
24635 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24636 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24637 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24638 stands for other 12 bytes. */
24639 /* The bit that says whether an element is from the same lane or the
24640 other lane is bit 4, so shift it up by 3 to the MSB position. */
24641 t5 = gen_reg_rtx (V4DImode);
24642 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24643 GEN_INT (3)));
24644 /* Clear MSB bits from the mask just in case it had them set. */
24645 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24646 /* After this t1 will have MSB set for elements from other lane. */
24647 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24648 /* Clear bits other than MSB. */
24649 emit_insn (gen_andv32qi3 (t1, t1, vt));
24650 /* Or in the lower bits from mask into t3. */
24651 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24652 /* And invert MSB bits in t1, so MSB is set for elements from the same
24653 lane. */
24654 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24655 /* Swap 128-bit lanes in t3. */
24656 t6 = gen_reg_rtx (V4DImode);
24657 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24658 const2_rtx, GEN_INT (3),
24659 const0_rtx, const1_rtx));
24660 /* And or in the lower bits from mask into t1. */
24661 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24662 if (one_operand_shuffle)
24664 /* Each of these shuffles will put 0s in places where
24665 element from the other 128-bit lane is needed, otherwise
24666 will shuffle in the requested value. */
24667 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24668 gen_lowpart (V32QImode, t6)));
24669 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24670 /* For t3 the 128-bit lanes are swapped again. */
24671 t7 = gen_reg_rtx (V4DImode);
24672 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24673 const2_rtx, GEN_INT (3),
24674 const0_rtx, const1_rtx));
24675 /* And oring both together leads to the result. */
24676 emit_insn (gen_iorv32qi3 (target, t1,
24677 gen_lowpart (V32QImode, t7)));
24678 if (target != operands[0])
24679 emit_move_insn (operands[0],
24680 gen_lowpart (GET_MODE (operands[0]), target));
24681 return;
24684 t4 = gen_reg_rtx (V32QImode);
24685 /* Similar to the one_operand_shuffle code above, just
24686 repeated twice, once for each operand. The merge_two:
24687 code will merge the two results together. */
24688 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24689 gen_lowpart (V32QImode, t6)));
24690 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24691 gen_lowpart (V32QImode, t6)));
24692 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24693 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24694 t7 = gen_reg_rtx (V4DImode);
24695 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24696 const2_rtx, GEN_INT (3),
24697 const0_rtx, const1_rtx));
24698 t8 = gen_reg_rtx (V4DImode);
24699 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24700 const2_rtx, GEN_INT (3),
24701 const0_rtx, const1_rtx));
24702 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24703 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24704 t1 = t4;
24705 t2 = t3;
24706 goto merge_two;
24708 default:
24709 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24710 break;
24714 if (TARGET_XOP)
24716 /* The XOP VPPERM insn supports three inputs. By ignoring the
24717 one_operand_shuffle special case, we avoid creating another
24718 set of constant vectors in memory. */
24719 one_operand_shuffle = false;
24721 /* mask = mask & {2*w-1, ...} */
24722 vt = GEN_INT (2*w - 1);
24724 else
24726 /* mask = mask & {w-1, ...} */
24727 vt = GEN_INT (w - 1);
24730 for (i = 0; i < w; i++)
24731 vec[i] = vt;
24732 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24733 mask = expand_simple_binop (maskmode, AND, mask, vt,
24734 NULL_RTX, 0, OPTAB_DIRECT);
24736 /* For non-QImode operations, convert the word permutation control
24737 into a byte permutation control. */
24738 if (mode != V16QImode)
24740 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24741 GEN_INT (exact_log2 (e)),
24742 NULL_RTX, 0, OPTAB_DIRECT);
24744 /* Convert mask to vector of chars. */
24745 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24747 /* Replicate each of the input bytes into byte positions:
24748 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24749 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24750 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24751 for (i = 0; i < 16; ++i)
24752 vec[i] = GEN_INT (i/e * e);
24753 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24754 vt = validize_mem (force_const_mem (V16QImode, vt));
24755 if (TARGET_XOP)
24756 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24757 else
24758 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24760 /* Convert it into the byte positions by doing
24761 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...} where e = 16/w. */
24762 for (i = 0; i < 16; ++i)
24763 vec[i] = GEN_INT (i % e);
24764 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24765 vt = validize_mem (force_const_mem (V16QImode, vt));
24766 emit_insn (gen_addv16qi3 (mask, mask, vt));
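/* Editorial illustration (not part of the original code): a worked example of
   the control conversion above, assuming MASKMODE == V4SImode, so w == 4 and
   e == 4.  A word selector {1,0,3,2}:
     after the shift by log2(e):      {4,0,12,8}
     replicated into byte positions:  {4,4,4,4, 0,0,0,0, 12,12,12,12, 8,8,8,8}
     after adding {0,1,2,3, ...}:     {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}
   which is exactly the V16QImode pshufb control that selects dwords 1,0,3,2.  */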
24769 /* The actual shuffle operations all operate on V16QImode. */
24770 op0 = gen_lowpart (V16QImode, op0);
24771 op1 = gen_lowpart (V16QImode, op1);
24773 if (TARGET_XOP)
24775 if (GET_MODE (target) != V16QImode)
24776 target = gen_reg_rtx (V16QImode);
24777 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24778 if (target != operands[0])
24779 emit_move_insn (operands[0],
24780 gen_lowpart (GET_MODE (operands[0]), target));
24782 else if (one_operand_shuffle)
24784 if (GET_MODE (target) != V16QImode)
24785 target = gen_reg_rtx (V16QImode);
24786 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24787 if (target != operands[0])
24788 emit_move_insn (operands[0],
24789 gen_lowpart (GET_MODE (operands[0]), target));
24791 else
24793 rtx xops[6];
24794 bool ok;
24796 /* Shuffle the two input vectors independently. */
24797 t1 = gen_reg_rtx (V16QImode);
24798 t2 = gen_reg_rtx (V16QImode);
24799 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24800 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24802 merge_two:
24803 /* Then merge them together. The key is whether any given control
24804 element contained a bit set that indicates the second word. */
24805 mask = operands[3];
24806 vt = GEN_INT (w);
24807 if (maskmode == V2DImode && !TARGET_SSE4_1)
24809 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24810 more shuffle to convert the V2DI input mask into a V4SI
24811 input mask. At that point the masking that expand_int_vcond
24812 performs will work as desired. */
24813 rtx t3 = gen_reg_rtx (V4SImode);
24814 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24815 const0_rtx, const0_rtx,
24816 const2_rtx, const2_rtx));
24817 mask = t3;
24818 maskmode = V4SImode;
24819 e = w = 4;
24822 for (i = 0; i < w; i++)
24823 vec[i] = vt;
24824 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24825 vt = force_reg (maskmode, vt);
24826 mask = expand_simple_binop (maskmode, AND, mask, vt,
24827 NULL_RTX, 0, OPTAB_DIRECT);
24829 if (GET_MODE (target) != mode)
24830 target = gen_reg_rtx (mode);
24831 xops[0] = target;
24832 xops[1] = gen_lowpart (mode, t2);
24833 xops[2] = gen_lowpart (mode, t1);
24834 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24835 xops[4] = mask;
24836 xops[5] = vt;
24837 ok = ix86_expand_int_vcond (xops);
24838 gcc_assert (ok);
24839 if (target != operands[0])
24840 emit_move_insn (operands[0],
24841 gen_lowpart (GET_MODE (operands[0]), target));
24845 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
24846 true if we should do zero extension, else sign extension. HIGH_P is
24847 true if we want the N/2 high elements, else the low elements. */
24849 void
24850 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24852 machine_mode imode = GET_MODE (src);
24853 rtx tmp;
24855 if (TARGET_SSE4_1)
24857 rtx (*unpack)(rtx, rtx);
24858 rtx (*extract)(rtx, rtx) = NULL;
24859 machine_mode halfmode = BLKmode;
24861 switch (imode)
24863 case V64QImode:
24864 if (unsigned_p)
24865 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24866 else
24867 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24868 halfmode = V32QImode;
24869 extract
24870 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24871 break;
24872 case V32QImode:
24873 if (unsigned_p)
24874 unpack = gen_avx2_zero_extendv16qiv16hi2;
24875 else
24876 unpack = gen_avx2_sign_extendv16qiv16hi2;
24877 halfmode = V16QImode;
24878 extract
24879 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24880 break;
24881 case V32HImode:
24882 if (unsigned_p)
24883 unpack = gen_avx512f_zero_extendv16hiv16si2;
24884 else
24885 unpack = gen_avx512f_sign_extendv16hiv16si2;
24886 halfmode = V16HImode;
24887 extract
24888 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24889 break;
24890 case V16HImode:
24891 if (unsigned_p)
24892 unpack = gen_avx2_zero_extendv8hiv8si2;
24893 else
24894 unpack = gen_avx2_sign_extendv8hiv8si2;
24895 halfmode = V8HImode;
24896 extract
24897 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24898 break;
24899 case V16SImode:
24900 if (unsigned_p)
24901 unpack = gen_avx512f_zero_extendv8siv8di2;
24902 else
24903 unpack = gen_avx512f_sign_extendv8siv8di2;
24904 halfmode = V8SImode;
24905 extract
24906 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24907 break;
24908 case V8SImode:
24909 if (unsigned_p)
24910 unpack = gen_avx2_zero_extendv4siv4di2;
24911 else
24912 unpack = gen_avx2_sign_extendv4siv4di2;
24913 halfmode = V4SImode;
24914 extract
24915 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24916 break;
24917 case V16QImode:
24918 if (unsigned_p)
24919 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24920 else
24921 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24922 break;
24923 case V8HImode:
24924 if (unsigned_p)
24925 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24926 else
24927 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24928 break;
24929 case V4SImode:
24930 if (unsigned_p)
24931 unpack = gen_sse4_1_zero_extendv2siv2di2;
24932 else
24933 unpack = gen_sse4_1_sign_extendv2siv2di2;
24934 break;
24935 default:
24936 gcc_unreachable ();
24939 if (GET_MODE_SIZE (imode) >= 32)
24941 tmp = gen_reg_rtx (halfmode);
24942 emit_insn (extract (tmp, src));
24944 else if (high_p)
24946 /* Shift higher 8 bytes to lower 8 bytes. */
24947 tmp = gen_reg_rtx (V1TImode);
24948 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24949 GEN_INT (64)));
24950 tmp = gen_lowpart (imode, tmp);
24952 else
24953 tmp = src;
24955 emit_insn (unpack (dest, tmp));
24957 else
24959 rtx (*unpack)(rtx, rtx, rtx);
24961 switch (imode)
24963 case V16QImode:
24964 if (high_p)
24965 unpack = gen_vec_interleave_highv16qi;
24966 else
24967 unpack = gen_vec_interleave_lowv16qi;
24968 break;
24969 case V8HImode:
24970 if (high_p)
24971 unpack = gen_vec_interleave_highv8hi;
24972 else
24973 unpack = gen_vec_interleave_lowv8hi;
24974 break;
24975 case V4SImode:
24976 if (high_p)
24977 unpack = gen_vec_interleave_highv4si;
24978 else
24979 unpack = gen_vec_interleave_lowv4si;
24980 break;
24981 default:
24982 gcc_unreachable ();
24985 if (unsigned_p)
24986 tmp = force_reg (imode, CONST0_RTX (imode));
24987 else
24988 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24989 src, pc_rtx, pc_rtx);
24991 rtx tmp2 = gen_reg_rtx (imode);
24992 emit_insn (unpack (tmp2, src, tmp));
24993 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
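/* Editorial illustration (not part of the original code): for a V8HImode
   SRC = {a0,...,a7}, the SSE4.1 path above produces a V4SImode DEST of either
   {a0,a1,a2,a3} (HIGH_P false) or {a4,a5,a6,a7} (HIGH_P true), each element
   zero- or sign-extended according to UNSIGNED_P.  The pre-SSE4.1 path reaches
   the same result by interleaving SRC with either a zero vector or a vector of
   per-element sign masks.  */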
24997 /* Expand conditional increment or decrement using adc/sbb instructions.
24998 The default case using setcc followed by the conditional move can be
24999 done by generic code. */
25000 bool
25001 ix86_expand_int_addcc (rtx operands[])
25003 enum rtx_code code = GET_CODE (operands[1]);
25004 rtx flags;
25005 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25006 rtx compare_op;
25007 rtx val = const0_rtx;
25008 bool fpcmp = false;
25009 machine_mode mode;
25010 rtx op0 = XEXP (operands[1], 0);
25011 rtx op1 = XEXP (operands[1], 1);
25013 if (operands[3] != const1_rtx
25014 && operands[3] != constm1_rtx)
25015 return false;
25016 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25017 return false;
25018 code = GET_CODE (compare_op);
25020 flags = XEXP (compare_op, 0);
25022 if (GET_MODE (flags) == CCFPmode
25023 || GET_MODE (flags) == CCFPUmode)
25025 fpcmp = true;
25026 code = ix86_fp_compare_code_to_integer (code);
25029 if (code != LTU)
25031 val = constm1_rtx;
25032 if (fpcmp)
25033 PUT_CODE (compare_op,
25034 reverse_condition_maybe_unordered
25035 (GET_CODE (compare_op)));
25036 else
25037 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25040 mode = GET_MODE (operands[0]);
25042 /* Construct either adc or sbb insn. */
25043 if ((code == LTU) == (operands[3] == constm1_rtx))
25045 switch (mode)
25047 case QImode:
25048 insn = gen_subqi3_carry;
25049 break;
25050 case HImode:
25051 insn = gen_subhi3_carry;
25052 break;
25053 case SImode:
25054 insn = gen_subsi3_carry;
25055 break;
25056 case DImode:
25057 insn = gen_subdi3_carry;
25058 break;
25059 default:
25060 gcc_unreachable ();
25063 else
25065 switch (mode)
25067 case QImode:
25068 insn = gen_addqi3_carry;
25069 break;
25070 case HImode:
25071 insn = gen_addhi3_carry;
25072 break;
25073 case SImode:
25074 insn = gen_addsi3_carry;
25075 break;
25076 case DImode:
25077 insn = gen_adddi3_carry;
25078 break;
25079 default:
25080 gcc_unreachable ();
25083 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25085 return true;
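/* Editorial illustration (not part of the original code): for unsigned
   operands this expander turns, e.g.,

       x = x + (a < b);

   into a flag-setting compare followed by an add-with-carry, roughly

       cmpl  %ebx, %eax        # sets CF iff a < b
       adcl  $0, %ecx          # x += CF

   and the analogous "x - (a < b)" form uses sbb; no setcc or cmov is needed.  */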
25089 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25090 but works for floating point parameters and non-offsettable memories.
25091 For pushes, it returns just stack offsets; the values will be saved
25092 in the right order. At most four parts are generated. */
25094 static int
25095 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25097 int size;
25099 if (!TARGET_64BIT)
25100 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25101 else
25102 size = (GET_MODE_SIZE (mode) + 4) / 8;
25104 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25105 gcc_assert (size >= 2 && size <= 4);
25107 /* Optimize constant pool reference to immediates. This is used by fp
25108 moves, that force all constants to memory to allow combining. */
25109 if (MEM_P (operand) && MEM_READONLY_P (operand))
25111 rtx tmp = maybe_get_pool_constant (operand);
25112 if (tmp)
25113 operand = tmp;
25116 if (MEM_P (operand) && !offsettable_memref_p (operand))
25118 /* The only non-offsetable memories we handle are pushes. */
25119 int ok = push_operand (operand, VOIDmode);
25121 gcc_assert (ok);
25123 operand = copy_rtx (operand);
25124 PUT_MODE (operand, word_mode);
25125 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25126 return size;
25129 if (GET_CODE (operand) == CONST_VECTOR)
25131 machine_mode imode = int_mode_for_mode (mode);
25132 /* Caution: if we looked through a constant pool memory above,
25133 the operand may actually have a different mode now. That's
25134 ok, since we want to pun this all the way back to an integer. */
25135 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25136 gcc_assert (operand != NULL);
25137 mode = imode;
25140 if (!TARGET_64BIT)
25142 if (mode == DImode)
25143 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25144 else
25146 int i;
25148 if (REG_P (operand))
25150 gcc_assert (reload_completed);
25151 for (i = 0; i < size; i++)
25152 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25154 else if (offsettable_memref_p (operand))
25156 operand = adjust_address (operand, SImode, 0);
25157 parts[0] = operand;
25158 for (i = 1; i < size; i++)
25159 parts[i] = adjust_address (operand, SImode, 4 * i);
25161 else if (CONST_DOUBLE_P (operand))
25163 const REAL_VALUE_TYPE *r;
25164 long l[4];
25166 r = CONST_DOUBLE_REAL_VALUE (operand);
25167 switch (mode)
25169 case TFmode:
25170 real_to_target (l, r, mode);
25171 parts[3] = gen_int_mode (l[3], SImode);
25172 parts[2] = gen_int_mode (l[2], SImode);
25173 break;
25174 case XFmode:
25175 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25176 long double may not be 80-bit. */
25177 real_to_target (l, r, mode);
25178 parts[2] = gen_int_mode (l[2], SImode);
25179 break;
25180 case DFmode:
25181 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25182 break;
25183 default:
25184 gcc_unreachable ();
25186 parts[1] = gen_int_mode (l[1], SImode);
25187 parts[0] = gen_int_mode (l[0], SImode);
25189 else
25190 gcc_unreachable ();
25193 else
25195 if (mode == TImode)
25196 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25197 if (mode == XFmode || mode == TFmode)
25199 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25200 if (REG_P (operand))
25202 gcc_assert (reload_completed);
25203 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25204 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25206 else if (offsettable_memref_p (operand))
25208 operand = adjust_address (operand, DImode, 0);
25209 parts[0] = operand;
25210 parts[1] = adjust_address (operand, upper_mode, 8);
25212 else if (CONST_DOUBLE_P (operand))
25214 long l[4];
25216 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25218 /* real_to_target puts 32-bit pieces in each long. */
25219 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25220 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25221 << 32), DImode);
25223 if (upper_mode == SImode)
25224 parts[1] = gen_int_mode (l[2], SImode);
25225 else
25226 parts[1]
25227 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25228 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25229 << 32), DImode);
25231 else
25232 gcc_unreachable ();
25236 return size;
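/* Editorial illustration (not part of the original code): on a 32-bit target a
   DFmode constant such as 1.0 (0x3ff0000000000000) is split by the code above
   into two SImode immediates, parts[0] = 0x00000000 and parts[1] = 0x3ff00000,
   while an XFmode value yields three SImode parts and a TFmode value four.  */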
25239 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25240 All required insns are emitted directly; nothing is returned.
25241 Operands 2-5 are used to hold the destination parts and operands
25242 6-9 the source parts, in the order in which they must be moved. */
25244 void
25245 ix86_split_long_move (rtx operands[])
25247 rtx part[2][4];
25248 int nparts, i, j;
25249 int push = 0;
25250 int collisions = 0;
25251 machine_mode mode = GET_MODE (operands[0]);
25252 bool collisionparts[4];
25254 /* The DFmode expanders may ask us to move a double.
25255 For a 64-bit target this is a single move. By hiding the fact
25256 here we simplify the i386.md splitters. */
25257 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25259 /* Optimize constant pool reference to immediates. This is used by
25260 fp moves, that force all constants to memory to allow combining. */
25262 if (MEM_P (operands[1])
25263 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25264 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25265 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25266 if (push_operand (operands[0], VOIDmode))
25268 operands[0] = copy_rtx (operands[0]);
25269 PUT_MODE (operands[0], word_mode);
25271 else
25272 operands[0] = gen_lowpart (DImode, operands[0]);
25273 operands[1] = gen_lowpart (DImode, operands[1]);
25274 emit_move_insn (operands[0], operands[1]);
25275 return;
25278 /* The only non-offsettable memory we handle is push. */
25279 if (push_operand (operands[0], VOIDmode))
25280 push = 1;
25281 else
25282 gcc_assert (!MEM_P (operands[0])
25283 || offsettable_memref_p (operands[0]));
25285 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25286 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25288 /* When emitting push, take care for source operands on the stack. */
25289 if (push && MEM_P (operands[1])
25290 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25292 rtx src_base = XEXP (part[1][nparts - 1], 0);
25294 /* Compensate for the stack decrement by 4. */
25295 if (!TARGET_64BIT && nparts == 3
25296 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25297 src_base = plus_constant (Pmode, src_base, 4);
25299 /* src_base refers to the stack pointer and is
25300 automatically decreased by emitted push. */
25301 for (i = 0; i < nparts; i++)
25302 part[1][i] = change_address (part[1][i],
25303 GET_MODE (part[1][i]), src_base);
25306 /* We need to do copy in the right order in case an address register
25307 of the source overlaps the destination. */
25308 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25310 rtx tmp;
25312 for (i = 0; i < nparts; i++)
25314 collisionparts[i]
25315 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25316 if (collisionparts[i])
25317 collisions++;
25320 /* Collision in the middle part can be handled by reordering. */
25321 if (collisions == 1 && nparts == 3 && collisionparts [1])
25323 std::swap (part[0][1], part[0][2]);
25324 std::swap (part[1][1], part[1][2]);
25326 else if (collisions == 1
25327 && nparts == 4
25328 && (collisionparts [1] || collisionparts [2]))
25330 if (collisionparts [1])
25332 std::swap (part[0][1], part[0][2]);
25333 std::swap (part[1][1], part[1][2]);
25335 else
25337 std::swap (part[0][2], part[0][3]);
25338 std::swap (part[1][2], part[1][3]);
25342 /* If there are more collisions, we can't handle it by reordering.
25343 Do an lea to the last part and use only one colliding move. */
25344 else if (collisions > 1)
25346 rtx base, addr, tls_base = NULL_RTX;
25348 collisions = 1;
25350 base = part[0][nparts - 1];
25352 /* Handle the case when the last part isn't valid for lea.
25353 Happens in 64-bit mode storing the 12-byte XFmode. */
25354 if (GET_MODE (base) != Pmode)
25355 base = gen_rtx_REG (Pmode, REGNO (base));
25357 addr = XEXP (part[1][0], 0);
25358 if (TARGET_TLS_DIRECT_SEG_REFS)
25360 struct ix86_address parts;
25361 int ok = ix86_decompose_address (addr, &parts);
25362 gcc_assert (ok);
25363 if (parts.seg == DEFAULT_TLS_SEG_REG)
25365 /* It is not valid to use %gs: or %fs: in
25366 lea though, so we need to remove it from the
25367 address used for lea and add it to each individual
25368 memory load instead. */
25369 addr = copy_rtx (addr);
25370 rtx *x = &addr;
25371 while (GET_CODE (*x) == PLUS)
25373 for (i = 0; i < 2; i++)
25375 rtx u = XEXP (*x, i);
25376 if (GET_CODE (u) == ZERO_EXTEND)
25377 u = XEXP (u, 0);
25378 if (GET_CODE (u) == UNSPEC
25379 && XINT (u, 1) == UNSPEC_TP)
25381 tls_base = XEXP (*x, i);
25382 *x = XEXP (*x, 1 - i);
25383 break;
25386 if (tls_base)
25387 break;
25388 x = &XEXP (*x, 0);
25390 gcc_assert (tls_base);
25393 emit_insn (gen_rtx_SET (base, addr));
25394 if (tls_base)
25395 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
25396 part[1][0] = replace_equiv_address (part[1][0], base);
25397 for (i = 1; i < nparts; i++)
25399 if (tls_base)
25400 base = copy_rtx (base);
25401 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25402 part[1][i] = replace_equiv_address (part[1][i], tmp);
25407 if (push)
25409 if (!TARGET_64BIT)
25411 if (nparts == 3)
25413 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25414 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25415 stack_pointer_rtx, GEN_INT (-4)));
25416 emit_move_insn (part[0][2], part[1][2]);
25418 else if (nparts == 4)
25420 emit_move_insn (part[0][3], part[1][3]);
25421 emit_move_insn (part[0][2], part[1][2]);
25424 else
25426 /* In 64-bit mode we don't have a 32-bit push available. In case this is
25427 a register, that is OK - we will just use the larger counterpart. We also
25428 retype memory - this comes from an attempt to avoid a REX prefix on
25429 moving the second half of a TFmode value. */
25430 if (GET_MODE (part[1][1]) == SImode)
25432 switch (GET_CODE (part[1][1]))
25434 case MEM:
25435 part[1][1] = adjust_address (part[1][1], DImode, 0);
25436 break;
25438 case REG:
25439 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25440 break;
25442 default:
25443 gcc_unreachable ();
25446 if (GET_MODE (part[1][0]) == SImode)
25447 part[1][0] = part[1][1];
25450 emit_move_insn (part[0][1], part[1][1]);
25451 emit_move_insn (part[0][0], part[1][0]);
25452 return;
25455 /* Choose correct order to not overwrite the source before it is copied. */
25456 if ((REG_P (part[0][0])
25457 && REG_P (part[1][1])
25458 && (REGNO (part[0][0]) == REGNO (part[1][1])
25459 || (nparts == 3
25460 && REGNO (part[0][0]) == REGNO (part[1][2]))
25461 || (nparts == 4
25462 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25463 || (collisions > 0
25464 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25466 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25468 operands[2 + i] = part[0][j];
25469 operands[6 + i] = part[1][j];
25472 else
25474 for (i = 0; i < nparts; i++)
25476 operands[2 + i] = part[0][i];
25477 operands[6 + i] = part[1][i];
25481 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25482 if (optimize_insn_for_size_p ())
25484 for (j = 0; j < nparts - 1; j++)
25485 if (CONST_INT_P (operands[6 + j])
25486 && operands[6 + j] != const0_rtx
25487 && REG_P (operands[2 + j]))
25488 for (i = j; i < nparts - 1; i++)
25489 if (CONST_INT_P (operands[7 + i])
25490 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25491 operands[7 + i] = operands[2 + j];
25494 for (i = 0; i < nparts; i++)
25495 emit_move_insn (operands[2 + i], operands[6 + i]);
25497 return;
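/* Editorial illustration (not part of the original code): a DImode load such as
   "movl 4(%eax), %edx; movl (%eax), %eax" must be emitted high-part-first when
   the low destination register is also the address register, which is exactly
   the collision handling above; with more than one collision the address is
   first materialized with an lea.  */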
25500 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25501 left shift by a constant, either using a single shift or
25502 a sequence of add instructions. */
25504 static void
25505 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25507 rtx (*insn)(rtx, rtx, rtx);
25509 if (count == 1
25510 || (count * ix86_cost->add <= ix86_cost->shift_const
25511 && !optimize_insn_for_size_p ()))
25513 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25514 while (count-- > 0)
25515 emit_insn (insn (operand, operand, operand));
25517 else
25519 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25520 emit_insn (insn (operand, operand, GEN_INT (count)));
25524 void
25525 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25527 rtx (*gen_ashl3)(rtx, rtx, rtx);
25528 rtx (*gen_shld)(rtx, rtx, rtx);
25529 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25531 rtx low[2], high[2];
25532 int count;
25534 if (CONST_INT_P (operands[2]))
25536 split_double_mode (mode, operands, 2, low, high);
25537 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25539 if (count >= half_width)
25541 emit_move_insn (high[0], low[1]);
25542 emit_move_insn (low[0], const0_rtx);
25544 if (count > half_width)
25545 ix86_expand_ashl_const (high[0], count - half_width, mode);
25547 else
25549 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25551 if (!rtx_equal_p (operands[0], operands[1]))
25552 emit_move_insn (operands[0], operands[1]);
25554 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25555 ix86_expand_ashl_const (low[0], count, mode);
25557 return;
25560 split_double_mode (mode, operands, 1, low, high);
25562 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25564 if (operands[1] == const1_rtx)
25566 /* Assuming we've chosen QImode-capable registers, 1 << N
25567 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25568 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25570 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25572 ix86_expand_clear (low[0]);
25573 ix86_expand_clear (high[0]);
25574 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25576 d = gen_lowpart (QImode, low[0]);
25577 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25578 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25579 emit_insn (gen_rtx_SET (d, s));
25581 d = gen_lowpart (QImode, high[0]);
25582 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25583 s = gen_rtx_NE (QImode, flags, const0_rtx);
25584 emit_insn (gen_rtx_SET (d, s));
25587 /* Otherwise, we can get the same results by manually performing
25588 a bit extract operation on bit 5/6, and then performing the two
25589 shifts. The two methods of getting 0/1 into low/high are exactly
25590 the same size. Avoiding the shift in the bit extract case helps
25591 pentium4 a bit; no one else seems to care much either way. */
25592 else
25594 machine_mode half_mode;
25595 rtx (*gen_lshr3)(rtx, rtx, rtx);
25596 rtx (*gen_and3)(rtx, rtx, rtx);
25597 rtx (*gen_xor3)(rtx, rtx, rtx);
25598 HOST_WIDE_INT bits;
25599 rtx x;
25601 if (mode == DImode)
25603 half_mode = SImode;
25604 gen_lshr3 = gen_lshrsi3;
25605 gen_and3 = gen_andsi3;
25606 gen_xor3 = gen_xorsi3;
25607 bits = 5;
25609 else
25611 half_mode = DImode;
25612 gen_lshr3 = gen_lshrdi3;
25613 gen_and3 = gen_anddi3;
25614 gen_xor3 = gen_xordi3;
25615 bits = 6;
25618 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25619 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25620 else
25621 x = gen_lowpart (half_mode, operands[2]);
25622 emit_insn (gen_rtx_SET (high[0], x));
25624 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25625 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25626 emit_move_insn (low[0], high[0]);
25627 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25630 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25631 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25632 return;
25635 if (operands[1] == constm1_rtx)
25637 /* For -1 << N, we can avoid the shld instruction, because we
25638 know that we're shifting 0...31/63 ones into a -1. */
25639 emit_move_insn (low[0], constm1_rtx);
25640 if (optimize_insn_for_size_p ())
25641 emit_move_insn (high[0], low[0]);
25642 else
25643 emit_move_insn (high[0], constm1_rtx);
25645 else
25647 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25649 if (!rtx_equal_p (operands[0], operands[1]))
25650 emit_move_insn (operands[0], operands[1]);
25652 split_double_mode (mode, operands, 1, low, high);
25653 emit_insn (gen_shld (high[0], low[0], operands[2]));
25656 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25658 if (TARGET_CMOVE && scratch)
25660 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25661 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25663 ix86_expand_clear (scratch);
25664 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25666 else
25668 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25669 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25671 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
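/* Editorial sketch (not part of the original code): the constant-count case
   above is equivalent to the following plain C model of a 64-bit left shift
   built from 32-bit halves (hypothetical names; DImode on a 32-bit target; the
   variable-count case additionally uses shld plus a cmov/branch adjustment):

     void model_shl64 (unsigned int *hi, unsigned int *lo, unsigned count)
     {
       count &= 63;
       if (count >= 32)
         {
           *hi = *lo << (count - 32);
           *lo = 0;
         }
       else if (count > 0)
         {
           *hi = (*hi << count) | (*lo >> (32 - count));
           *lo <<= count;
         }
     }
*/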
25675 void
25676 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25678 rtx (*gen_ashr3)(rtx, rtx, rtx)
25679 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25680 rtx (*gen_shrd)(rtx, rtx, rtx);
25681 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25683 rtx low[2], high[2];
25684 int count;
25686 if (CONST_INT_P (operands[2]))
25688 split_double_mode (mode, operands, 2, low, high);
25689 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25691 if (count == GET_MODE_BITSIZE (mode) - 1)
25693 emit_move_insn (high[0], high[1]);
25694 emit_insn (gen_ashr3 (high[0], high[0],
25695 GEN_INT (half_width - 1)));
25696 emit_move_insn (low[0], high[0]);
25699 else if (count >= half_width)
25701 emit_move_insn (low[0], high[1]);
25702 emit_move_insn (high[0], low[0]);
25703 emit_insn (gen_ashr3 (high[0], high[0],
25704 GEN_INT (half_width - 1)));
25706 if (count > half_width)
25707 emit_insn (gen_ashr3 (low[0], low[0],
25708 GEN_INT (count - half_width)));
25710 else
25712 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25714 if (!rtx_equal_p (operands[0], operands[1]))
25715 emit_move_insn (operands[0], operands[1]);
25717 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25718 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25721 else
25723 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25725 if (!rtx_equal_p (operands[0], operands[1]))
25726 emit_move_insn (operands[0], operands[1]);
25728 split_double_mode (mode, operands, 1, low, high);
25730 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25731 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25733 if (TARGET_CMOVE && scratch)
25735 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25736 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25738 emit_move_insn (scratch, high[0]);
25739 emit_insn (gen_ashr3 (scratch, scratch,
25740 GEN_INT (half_width - 1)));
25741 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25742 scratch));
25744 else
25746 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25747 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25749 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
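/* Editorial sketch (not part of the original code): the constant-count case
   above corresponds to this C model of a 64-bit arithmetic right shift built
   from 32-bit halves (hypothetical names; assumes ">>" on a negative int is an
   arithmetic shift, as it is on x86, and that *hi >>= 31 replicates the sign):

     void model_sar64 (int *hi, unsigned int *lo, unsigned count)
     {
       count &= 63;
       if (count >= 32)
         {
           *lo = (unsigned int) (*hi >> (count - 32));
           *hi >>= 31;
         }
       else if (count > 0)
         {
           *lo = (*lo >> count) | ((unsigned int) *hi << (32 - count));
           *hi >>= count;
         }
     }
*/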
25754 void
25755 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25757 rtx (*gen_lshr3)(rtx, rtx, rtx)
25758 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25759 rtx (*gen_shrd)(rtx, rtx, rtx);
25760 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25762 rtx low[2], high[2];
25763 int count;
25765 if (CONST_INT_P (operands[2]))
25767 split_double_mode (mode, operands, 2, low, high);
25768 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25770 if (count >= half_width)
25772 emit_move_insn (low[0], high[1]);
25773 ix86_expand_clear (high[0]);
25775 if (count > half_width)
25776 emit_insn (gen_lshr3 (low[0], low[0],
25777 GEN_INT (count - half_width)));
25779 else
25781 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25783 if (!rtx_equal_p (operands[0], operands[1]))
25784 emit_move_insn (operands[0], operands[1]);
25786 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25787 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25790 else
25792 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25794 if (!rtx_equal_p (operands[0], operands[1]))
25795 emit_move_insn (operands[0], operands[1]);
25797 split_double_mode (mode, operands, 1, low, high);
25799 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25800 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25802 if (TARGET_CMOVE && scratch)
25804 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25805 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25807 ix86_expand_clear (scratch);
25808 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25809 scratch));
25811 else
25813 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25814 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25816 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25821 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
25822 static void
25823 predict_jump (int prob)
25825 rtx insn = get_last_insn ();
25826 gcc_assert (JUMP_P (insn));
25827 add_int_reg_note (insn, REG_BR_PROB, prob);
25830 /* Helper function for the string operations below. Test VARIABLE for whether
25831 it is aligned to VALUE bytes. If it is, jump to the returned label. */
25832 static rtx_code_label *
25833 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25835 rtx_code_label *label = gen_label_rtx ();
25836 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25837 if (GET_MODE (variable) == DImode)
25838 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25839 else
25840 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25841 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25842 1, label);
25843 if (epilogue)
25844 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25845 else
25846 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25847 return label;
25850 /* Decrease COUNTREG by VALUE. */
25851 static void
25852 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25854 rtx (*gen_add)(rtx, rtx, rtx)
25855 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25857 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25860 /* Zero extend possibly SImode EXP to Pmode register. */
25861 rtx
25862 ix86_zero_extend_to_Pmode (rtx exp)
25864 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25867 /* Divide COUNTREG by SCALE. */
25868 static rtx
25869 scale_counter (rtx countreg, int scale)
25871 rtx sc;
25873 if (scale == 1)
25874 return countreg;
25875 if (CONST_INT_P (countreg))
25876 return GEN_INT (INTVAL (countreg) / scale);
25877 gcc_assert (REG_P (countreg));
25879 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25880 GEN_INT (exact_log2 (scale)),
25881 NULL, 1, OPTAB_DIRECT);
25882 return sc;
25885 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25886 DImode for constant loop counts. */
25888 static machine_mode
25889 counter_mode (rtx count_exp)
25891 if (GET_MODE (count_exp) != VOIDmode)
25892 return GET_MODE (count_exp);
25893 if (!CONST_INT_P (count_exp))
25894 return Pmode;
25895 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25896 return DImode;
25897 return SImode;
25900 /* Copy the address to a Pmode register. This is used for x32 to
25901 truncate DImode TLS address to a SImode register. */
25903 static rtx
25904 ix86_copy_addr_to_reg (rtx addr)
25906 rtx reg;
25907 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25909 reg = copy_addr_to_reg (addr);
25910 REG_POINTER (reg) = 1;
25911 return reg;
25913 else
25915 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25916 reg = copy_to_mode_reg (DImode, addr);
25917 REG_POINTER (reg) = 1;
25918 return gen_rtx_SUBREG (SImode, reg, 0);
25922 /* When ISSETMEM is FALSE, output a simple loop that copies memory from SRCPTR
25923 to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is COUNT,
25924 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop that sets
25925 memory to VALUE (supposed to be in MODE).
25927 The size is rounded down to a whole number of chunks moved at once.
25928 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
25931 static void
25932 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25933 rtx destptr, rtx srcptr, rtx value,
25934 rtx count, machine_mode mode, int unroll,
25935 int expected_size, bool issetmem)
25937 rtx_code_label *out_label, *top_label;
25938 rtx iter, tmp;
25939 machine_mode iter_mode = counter_mode (count);
25940 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25941 rtx piece_size = GEN_INT (piece_size_n);
25942 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25943 rtx size;
25944 int i;
25946 top_label = gen_label_rtx ();
25947 out_label = gen_label_rtx ();
25948 iter = gen_reg_rtx (iter_mode);
25950 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25951 NULL, 1, OPTAB_DIRECT);
25952 /* Those two should combine. */
25953 if (piece_size == const1_rtx)
25955 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25956 true, out_label);
25957 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25959 emit_move_insn (iter, const0_rtx);
25961 emit_label (top_label);
25963 tmp = convert_modes (Pmode, iter_mode, iter, true);
25965 /* This assert could be relaxed - in that case we'd need to compute
25966 the smallest power of two contained in PIECE_SIZE_N and pass it to
25967 offset_address. */
25968 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25969 destmem = offset_address (destmem, tmp, piece_size_n);
25970 destmem = adjust_address (destmem, mode, 0);
25972 if (!issetmem)
25974 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25975 srcmem = adjust_address (srcmem, mode, 0);
25977 /* When unrolling for chips that reorder memory reads and writes,
25978 we can save registers by using a single temporary.
25979 Also, using four temporaries is overkill in 32-bit mode. */
25980 if (!TARGET_64BIT && 0)
25982 for (i = 0; i < unroll; i++)
25984 if (i)
25986 destmem =
25987 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25988 srcmem =
25989 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25991 emit_move_insn (destmem, srcmem);
25994 else
25996 rtx tmpreg[4];
25997 gcc_assert (unroll <= 4);
25998 for (i = 0; i < unroll; i++)
26000 tmpreg[i] = gen_reg_rtx (mode);
26001 if (i)
26003 srcmem =
26004 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26006 emit_move_insn (tmpreg[i], srcmem);
26008 for (i = 0; i < unroll; i++)
26010 if (i)
26012 destmem =
26013 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26015 emit_move_insn (destmem, tmpreg[i]);
26019 else
26020 for (i = 0; i < unroll; i++)
26022 if (i)
26023 destmem =
26024 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26025 emit_move_insn (destmem, value);
26028 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26029 true, OPTAB_LIB_WIDEN);
26030 if (tmp != iter)
26031 emit_move_insn (iter, tmp);
26033 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26034 true, top_label);
26035 if (expected_size != -1)
26037 expected_size /= GET_MODE_SIZE (mode) * unroll;
26038 if (expected_size == 0)
26039 predict_jump (0);
26040 else if (expected_size > REG_BR_PROB_BASE)
26041 predict_jump (REG_BR_PROB_BASE - 1);
26042 else
26043 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26045 else
26046 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26047 iter = ix86_zero_extend_to_Pmode (iter);
26048 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26049 true, OPTAB_LIB_WIDEN);
26050 if (tmp != destptr)
26051 emit_move_insn (destptr, tmp);
26052 if (!issetmem)
26054 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26055 true, OPTAB_LIB_WIDEN);
26056 if (tmp != srcptr)
26057 emit_move_insn (srcptr, tmp);
26059 emit_label (out_label);
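/* Editorial sketch (not part of the original code): for the memcpy case with
   MODE == SImode and UNROLL == 4 the code emitted above behaves roughly like
   this C outline (hypothetical names; the remaining tail of fewer than 16
   bytes is handled by the caller's epilogue):

     size = count & ~(unsigned) 15;
     for (iter = 0; iter < size; iter += 16)
       {
         int t0 = *(const int *) (src + iter);
         int t1 = *(const int *) (src + iter + 4);
         int t2 = *(const int *) (src + iter + 8);
         int t3 = *(const int *) (src + iter + 12);
         *(int *) (dst + iter) = t0;
         *(int *) (dst + iter + 4) = t1;
         *(int *) (dst + iter + 8) = t2;
         *(int *) (dst + iter + 12) = t3;
       }
     dst += size;
     src += size;
*/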
26062 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26063 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26064 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26065 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
26066 ORIG_VALUE is the original value passed to memset to fill the memory with.
26067 Other arguments have same meaning as for previous function. */
26069 static void
26070 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26071 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26072 rtx count,
26073 machine_mode mode, bool issetmem)
26075 rtx destexp;
26076 rtx srcexp;
26077 rtx countreg;
26078 HOST_WIDE_INT rounded_count;
26080 /* If possible, it is shorter to use rep movs.
26081 TODO: Maybe it is better to move this logic to decide_alg. */
26082 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26083 && (!issetmem || orig_value == const0_rtx))
26084 mode = SImode;
26086 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26087 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26089 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26090 GET_MODE_SIZE (mode)));
26091 if (mode != QImode)
26093 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26094 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26095 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26097 else
26098 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26099 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26101 rounded_count
26102 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26103 destmem = shallow_copy_rtx (destmem);
26104 set_mem_size (destmem, rounded_count);
26106 else if (MEM_SIZE_KNOWN_P (destmem))
26107 clear_mem_size (destmem);
26109 if (issetmem)
26111 value = force_reg (mode, gen_lowpart (mode, value));
26112 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26114 else
26116 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26117 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26118 if (mode != QImode)
26120 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26121 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26122 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26124 else
26125 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26126 if (CONST_INT_P (count))
26128 rounded_count
26129 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26130 srcmem = shallow_copy_rtx (srcmem);
26131 set_mem_size (srcmem, rounded_count);
26133 else
26135 if (MEM_SIZE_KNOWN_P (srcmem))
26136 clear_mem_size (srcmem);
26138 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26139 destexp, srcexp));
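/* Editorial illustration (not part of the original code): with MODE == SImode
   and a byte count in %ecx the memcpy case above boils down to

       shrl  $2, %ecx
       rep movsl

   with %esi/%edi advanced past the copied block on exit; DESTEXP/SRCEXP
   describe exactly those final pointer values for the rep_mov pattern.  */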
26143 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26144 DESTMEM.
26145 SRCMEM is passed by pointer so it can be updated on return.
26146 The return value is the updated DESTMEM. */
26147 static rtx
26148 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26149 HOST_WIDE_INT size_to_move)
26151 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26152 enum insn_code code;
26153 machine_mode move_mode;
26154 int piece_size, i;
26156 /* Find the widest mode in which we could perform moves.
26157 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and
26158 halve it until a move of that size is supported. */
26159 piece_size = 1 << floor_log2 (size_to_move);
26160 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26161 code = optab_handler (mov_optab, move_mode);
26162 while (code == CODE_FOR_nothing && piece_size > 1)
26164 piece_size >>= 1;
26165 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26166 code = optab_handler (mov_optab, move_mode);
26169 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26170 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26171 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26173 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26174 move_mode = mode_for_vector (word_mode, nunits);
26175 code = optab_handler (mov_optab, move_mode);
26176 if (code == CODE_FOR_nothing)
26178 move_mode = word_mode;
26179 piece_size = GET_MODE_SIZE (move_mode);
26180 code = optab_handler (mov_optab, move_mode);
26183 gcc_assert (code != CODE_FOR_nothing);
26185 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26186 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26188 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26189 gcc_assert (size_to_move % piece_size == 0);
26190 adjust = GEN_INT (piece_size);
26191 for (i = 0; i < size_to_move; i += piece_size)
26193 /* We move from memory to memory, so we'll need to do it via
26194 a temporary register. */
26195 tempreg = gen_reg_rtx (move_mode);
26196 emit_insn (GEN_FCN (code) (tempreg, src));
26197 emit_insn (GEN_FCN (code) (dst, tempreg));
26199 emit_move_insn (destptr,
26200 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26201 emit_move_insn (srcptr,
26202 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26204 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26205 piece_size);
26206 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26207 piece_size);
26210 /* Update DST and SRC rtx. */
26211 *srcmem = src;
26212 return dst;
26215 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26216 static void
26217 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26218 rtx destptr, rtx srcptr, rtx count, int max_size)
26220 rtx src, dest;
26221 if (CONST_INT_P (count))
26223 HOST_WIDE_INT countval = INTVAL (count);
26224 HOST_WIDE_INT epilogue_size = countval % max_size;
26225 int i;
26227 /* For now MAX_SIZE should be a power of 2. This assert could be
26228 relaxed, but it'll require a bit more complicated epilogue
26229 expanding. */
26230 gcc_assert ((max_size & (max_size - 1)) == 0);
26231 for (i = max_size; i >= 1; i >>= 1)
26233 if (epilogue_size & i)
26234 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26236 return;
26238 if (max_size > 8)
26240 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26241 count, 1, OPTAB_DIRECT);
26242 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26243 count, QImode, 1, 4, false);
26244 return;
26247 /* When there are stringops, we can cheaply increase dest and src pointers.
26248 Otherwise we save code size by maintaining offset (zero is readily
26249 available from preceding rep operation) and using x86 addressing modes.
26251 if (TARGET_SINGLE_STRINGOP)
26253 if (max_size > 4)
26255 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26256 src = change_address (srcmem, SImode, srcptr);
26257 dest = change_address (destmem, SImode, destptr);
26258 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26259 emit_label (label);
26260 LABEL_NUSES (label) = 1;
26262 if (max_size > 2)
26264 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26265 src = change_address (srcmem, HImode, srcptr);
26266 dest = change_address (destmem, HImode, destptr);
26267 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26268 emit_label (label);
26269 LABEL_NUSES (label) = 1;
26271 if (max_size > 1)
26273 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26274 src = change_address (srcmem, QImode, srcptr);
26275 dest = change_address (destmem, QImode, destptr);
26276 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26277 emit_label (label);
26278 LABEL_NUSES (label) = 1;
26281 else
26283 rtx offset = force_reg (Pmode, const0_rtx);
26284 rtx tmp;
26286 if (max_size > 4)
26288 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26289 src = change_address (srcmem, SImode, srcptr);
26290 dest = change_address (destmem, SImode, destptr);
26291 emit_move_insn (dest, src);
26292 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26293 true, OPTAB_LIB_WIDEN);
26294 if (tmp != offset)
26295 emit_move_insn (offset, tmp);
26296 emit_label (label);
26297 LABEL_NUSES (label) = 1;
26299 if (max_size > 2)
26301 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26302 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26303 src = change_address (srcmem, HImode, tmp);
26304 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26305 dest = change_address (destmem, HImode, tmp);
26306 emit_move_insn (dest, src);
26307 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26308 true, OPTAB_LIB_WIDEN);
26309 if (tmp != offset)
26310 emit_move_insn (offset, tmp);
26311 emit_label (label);
26312 LABEL_NUSES (label) = 1;
26314 if (max_size > 1)
26316 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26317 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26318 src = change_address (srcmem, QImode, tmp);
26319 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26320 dest = change_address (destmem, QImode, tmp);
26321 emit_move_insn (dest, src);
26322 emit_label (label);
26323 LABEL_NUSES (label) = 1;
26328 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26329 with value PROMOTED_VAL.
26330 The return value is the updated DESTMEM. */
26332 static rtx
26333 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26334 HOST_WIDE_INT size_to_move)
26336 rtx dst = destmem, adjust;
26337 enum insn_code code;
26338 machine_mode move_mode;
26339 int piece_size, i;
26341 /* Pick the mode in which to perform the stores: the mode of
26342 PROMOTED_VAL, narrowed when SIZE_TO_MOVE is smaller than the
26343 size of that mode. */
26344 move_mode = GET_MODE (promoted_val);
26345 if (move_mode == VOIDmode)
26346 move_mode = QImode;
26347 if (size_to_move < GET_MODE_SIZE (move_mode))
26349 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
26350 promoted_val = gen_lowpart (move_mode, promoted_val);
26352 piece_size = GET_MODE_SIZE (move_mode);
26353 code = optab_handler (mov_optab, move_mode);
26354 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26356 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26358 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26359 gcc_assert (size_to_move % piece_size == 0);
26360 adjust = GEN_INT (piece_size);
26361 for (i = 0; i < size_to_move; i += piece_size)
26363 if (piece_size <= GET_MODE_SIZE (word_mode))
26365 emit_insn (gen_strset (destptr, dst, promoted_val));
26366 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26367 piece_size);
26368 continue;
26371 emit_insn (GEN_FCN (code) (dst, promoted_val));
26373 emit_move_insn (destptr,
26374 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26376 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26377 piece_size);
26380 /* Update DST rtx. */
26381 return dst;
26383 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26384 static void
26385 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26386 rtx count, int max_size)
26388 count =
26389 expand_simple_binop (counter_mode (count), AND, count,
26390 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26391 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26392 gen_lowpart (QImode, value), count, QImode,
26393 1, max_size / 2, true);
26396 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26397 static void
26398 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26399 rtx count, int max_size)
26401 rtx dest;
26403 if (CONST_INT_P (count))
26405 HOST_WIDE_INT countval = INTVAL (count);
26406 HOST_WIDE_INT epilogue_size = countval % max_size;
26407 int i;
26409 /* For now MAX_SIZE should be a power of 2. This assert could be
26410 relaxed, but it'll require a bit more complicated epilogue
26411 expanding. */
26412 gcc_assert ((max_size & (max_size - 1)) == 0);
26413 for (i = max_size; i >= 1; i >>= 1)
26415 if (epilogue_size & i)
26417 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26418 destmem = emit_memset (destmem, destptr, vec_value, i);
26419 else
26420 destmem = emit_memset (destmem, destptr, value, i);
26423 return;
26425 if (max_size > 32)
26427 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26428 return;
26430 if (max_size > 16)
26432 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26433 if (TARGET_64BIT)
26435 dest = change_address (destmem, DImode, destptr);
26436 emit_insn (gen_strset (destptr, dest, value));
26437 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26438 emit_insn (gen_strset (destptr, dest, value));
26440 else
26442 dest = change_address (destmem, SImode, destptr);
26443 emit_insn (gen_strset (destptr, dest, value));
26444 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26445 emit_insn (gen_strset (destptr, dest, value));
26446 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26447 emit_insn (gen_strset (destptr, dest, value));
26448 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26449 emit_insn (gen_strset (destptr, dest, value));
26451 emit_label (label);
26452 LABEL_NUSES (label) = 1;
26454 if (max_size > 8)
26456 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26457 if (TARGET_64BIT)
26459 dest = change_address (destmem, DImode, destptr);
26460 emit_insn (gen_strset (destptr, dest, value));
26462 else
26464 dest = change_address (destmem, SImode, destptr);
26465 emit_insn (gen_strset (destptr, dest, value));
26466 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26467 emit_insn (gen_strset (destptr, dest, value));
26469 emit_label (label);
26470 LABEL_NUSES (label) = 1;
26472 if (max_size > 4)
26474 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26475 dest = change_address (destmem, SImode, destptr);
26476 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26477 emit_label (label);
26478 LABEL_NUSES (label) = 1;
26480 if (max_size > 2)
26482 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26483 dest = change_address (destmem, HImode, destptr);
26484 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26485 emit_label (label);
26486 LABEL_NUSES (label) = 1;
26488 if (max_size > 1)
26490 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26491 dest = change_address (destmem, QImode, destptr);
26492 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26493 emit_label (label);
26494 LABEL_NUSES (label) = 1;
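/* Editorial sketch (not part of the original code): for a variable COUNT and
   MAX_SIZE == 16 the epilogue emitted above is roughly

     if (count & 8)  store 8 bytes of VALUE;
     if (count & 4)  store 4 bytes;
     if (count & 2)  store 2 bytes;
     if (count & 1)  store 1 byte;

   each store advancing the destination pointer, so a handful of test-and-store
   sequences handles any residual of 0..15 bytes.  */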
26498 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough of
26499 DESTMEM to align it to DESIRED_ALIGNMENT. The original alignment is ALIGN.
26500 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26501 ignored.
26502 The return value is the updated DESTMEM. */
26503 static rtx
26504 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26505 rtx destptr, rtx srcptr, rtx value,
26506 rtx vec_value, rtx count, int align,
26507 int desired_alignment, bool issetmem)
26509 int i;
26510 for (i = 1; i < desired_alignment; i <<= 1)
26512 if (align <= i)
26514 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26515 if (issetmem)
26517 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26518 destmem = emit_memset (destmem, destptr, vec_value, i);
26519 else
26520 destmem = emit_memset (destmem, destptr, value, i);
26522 else
26523 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26524 ix86_adjust_counter (count, i);
26525 emit_label (label);
26526 LABEL_NUSES (label) = 1;
26527 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26530 return destmem;
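/* Editorial sketch (not part of the original code): with ALIGN == 1 and
   DESIRED_ALIGNMENT == 16 the prologue emitted above is roughly

     if (dest & 1)  copy/set 1 byte,  count -= 1;
     if (dest & 2)  copy/set 2 bytes, count -= 2;
     if (dest & 4)  copy/set 4 bytes, count -= 4;
     if (dest & 8)  copy/set 8 bytes, count -= 8;

   after which DESTPTR is 16-byte aligned and the alignment recorded on DESTMEM
   grows accordingly.  */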
26533 /* Test if COUNT&SIZE is nonzero and if so, expand movme
26534 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26535 and jump to DONE_LABEL. */
26536 static void
26537 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26538 rtx destptr, rtx srcptr,
26539 rtx value, rtx vec_value,
26540 rtx count, int size,
26541 rtx done_label, bool issetmem)
26543 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26544 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
26545 rtx modesize;
26546 int n;
26548 /* If we do not have vector value to copy, we must reduce size. */
26549 if (issetmem)
26551 if (!vec_value)
26553 if (GET_MODE (value) == VOIDmode && size > 8)
26554 mode = Pmode;
26555 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26556 mode = GET_MODE (value);
26558 else
26559 mode = GET_MODE (vec_value), value = vec_value;
26561 else
26563 /* Choose appropriate vector mode. */
26564 if (size >= 32)
26565 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26566 else if (size >= 16)
26567 mode = TARGET_SSE ? V16QImode : DImode;
26568 srcmem = change_address (srcmem, mode, srcptr);
26570 destmem = change_address (destmem, mode, destptr);
26571 modesize = GEN_INT (GET_MODE_SIZE (mode));
26572 gcc_assert (GET_MODE_SIZE (mode) <= size);
26573 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26575 if (issetmem)
26576 emit_move_insn (destmem, gen_lowpart (mode, value));
26577 else
26579 emit_move_insn (destmem, srcmem);
26580 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26582 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26585 destmem = offset_address (destmem, count, 1);
26586 destmem = offset_address (destmem, GEN_INT (-2 * size),
26587 GET_MODE_SIZE (mode));
26588 if (!issetmem)
26590 srcmem = offset_address (srcmem, count, 1);
26591 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26592 GET_MODE_SIZE (mode));
26594 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26596 if (issetmem)
26597 emit_move_insn (destmem, gen_lowpart (mode, value));
26598 else
26600 emit_move_insn (destmem, srcmem);
26601 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26603 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26605 emit_jump_insn (gen_jump (done_label));
26606 emit_barrier ();
26608 emit_label (label);
26609 LABEL_NUSES (label) = 1;
26612 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26613 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26614 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
26615 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26616 DONE_LABEL is a label after the whole copying sequence. The label is created
26617 on demand if *DONE_LABEL is NULL.
26618 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for
26619 new bounds after the initial copies.
26621 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26622 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26623 we will dispatch to a library call for large blocks.
26625 In pseudocode we do:
26627 if (COUNT < SIZE)
26629 Assume that SIZE is 4. Bigger sizes are handled analogously
26630 if (COUNT & 4)
26632 copy 4 bytes from SRCPTR to DESTPTR
26633 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26634 goto done_label
26636 if (!COUNT)
26637 goto done_label;
26638 copy 1 byte from SRCPTR to DESTPTR
26639 if (COUNT & 2)
26641 copy 2 bytes from SRCPTR to DESTPTR
26642 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26645 else
26647 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26648 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
26650 OLD_DESTPTR = DESTPTR;
26651 Align DESTPTR up to DESIRED_ALIGN
26652 SRCPTR += DESTPTR - OLD_DESTPTR
26653 COUNT -= DESTPTR - OLD_DESTPTR
26654 if (DYNAMIC_CHECK)
26655 Round COUNT down to multiple of SIZE
26656 << optional caller supplied zero size guard is here >>
26657 << optional caller supplied dynamic check is here >>
26658 << caller supplied main copy loop is here >>
26660 done_label:
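   /* Illustrative walk-through (assuming SIZE == 16 and unknown COUNT):
      blocks smaller than 16 bytes are finished entirely by the small-block
      expansion above and jump straight to done_label; larger blocks copy a
      possibly misaligned head (when extra alignment is wanted) and the last
      SIZE bytes, adjust COUNT and MIN_SIZE accordingly, and fall through to
      the caller-supplied main copy loop.  */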
26662 static void
26663 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26664 rtx *destptr, rtx *srcptr,
26665 machine_mode mode,
26666 rtx value, rtx vec_value,
26667 rtx *count,
26668 rtx_code_label **done_label,
26669 int size,
26670 int desired_align,
26671 int align,
26672 unsigned HOST_WIDE_INT *min_size,
26673 bool dynamic_check,
26674 bool issetmem)
26676 rtx_code_label *loop_label = NULL, *label;
26677 int n;
26678 rtx modesize;
26679 int prolog_size = 0;
26680 rtx mode_value;
26682 /* Choose the proper value to copy. */
26683 if (issetmem && VECTOR_MODE_P (mode))
26684 mode_value = vec_value;
26685 else
26686 mode_value = value;
26687 gcc_assert (GET_MODE_SIZE (mode) <= size);
26689 /* See if block is big or small, handle small blocks. */
26690 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26692 int size2 = size;
26693 loop_label = gen_label_rtx ();
26695 if (!*done_label)
26696 *done_label = gen_label_rtx ();
26698 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26699 1, loop_label);
26700 size2 >>= 1;
26702 /* Handle sizes > 3. */
26703 for (;size2 > 2; size2 >>= 1)
26704 expand_small_movmem_or_setmem (destmem, srcmem,
26705 *destptr, *srcptr,
26706 value, vec_value,
26707 *count,
26708 size2, *done_label, issetmem);
26709 /* Nothing to copy? Jump to DONE_LABEL if so */
26710 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26711 1, *done_label);
26713 /* Do a byte copy. */
26714 destmem = change_address (destmem, QImode, *destptr);
26715 if (issetmem)
26716 emit_move_insn (destmem, gen_lowpart (QImode, value));
26717 else
26719 srcmem = change_address (srcmem, QImode, *srcptr);
26720 emit_move_insn (destmem, srcmem);
26723 /* Handle sizes 2 and 3. */
26724 label = ix86_expand_aligntest (*count, 2, false);
26725 destmem = change_address (destmem, HImode, *destptr);
26726 destmem = offset_address (destmem, *count, 1);
26727 destmem = offset_address (destmem, GEN_INT (-2), 2);
26728 if (issetmem)
26729 emit_move_insn (destmem, gen_lowpart (HImode, value));
26730 else
26732 srcmem = change_address (srcmem, HImode, *srcptr);
26733 srcmem = offset_address (srcmem, *count, 1);
26734 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26735 emit_move_insn (destmem, srcmem);
26738 emit_label (label);
26739 LABEL_NUSES (label) = 1;
26740 emit_jump_insn (gen_jump (*done_label));
26741 emit_barrier ();
26743 else
26744 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26745 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26747 /* Start memcpy for COUNT >= SIZE. */
26748 if (loop_label)
26750 emit_label (loop_label);
26751 LABEL_NUSES (loop_label) = 1;
26754 /* Copy at least DESIRED_ALIGN - ALIGN bytes. */
26755 if (!issetmem)
26756 srcmem = change_address (srcmem, mode, *srcptr);
26757 destmem = change_address (destmem, mode, *destptr);
26758 modesize = GEN_INT (GET_MODE_SIZE (mode));
26759 for (n = 0; prolog_size < desired_align - align; n++)
26761 if (issetmem)
26762 emit_move_insn (destmem, mode_value);
26763 else
26765 emit_move_insn (destmem, srcmem);
26766 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26768 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26769 prolog_size += GET_MODE_SIZE (mode);
26773 /* Copy last SIZE bytes. */
26774 destmem = offset_address (destmem, *count, 1);
26775 destmem = offset_address (destmem,
26776 GEN_INT (-size - prolog_size),
26778 if (issetmem)
26779 emit_move_insn (destmem, mode_value);
26780 else
26782 srcmem = offset_address (srcmem, *count, 1);
26783 srcmem = offset_address (srcmem,
26784 GEN_INT (-size - prolog_size),
26786 emit_move_insn (destmem, srcmem);
26788 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26790 destmem = offset_address (destmem, modesize, 1);
26791 if (issetmem)
26792 emit_move_insn (destmem, mode_value);
26793 else
26795 srcmem = offset_address (srcmem, modesize, 1);
26796 emit_move_insn (destmem, srcmem);
26800 /* Align destination. */
26801 if (desired_align > 1 && desired_align > align)
26803 rtx saveddest = *destptr;
26805 gcc_assert (desired_align <= size);
26806 /* Align destptr up, placing it in a new register. */
26807 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26808 GEN_INT (prolog_size),
26809 NULL_RTX, 1, OPTAB_DIRECT);
26810 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26811 REG_POINTER (*destptr) = 1;
26812 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26813 GEN_INT (-desired_align),
26814 *destptr, 1, OPTAB_DIRECT);
26815 /* See how many bytes we skipped. */
26816 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26817 *destptr,
26818 saveddest, 1, OPTAB_DIRECT);
26819 /* Adjust srcptr and count. */
26820 if (!issetmem)
26821 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26822 saveddest, *srcptr, 1, OPTAB_DIRECT);
26823 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26824 saveddest, *count, 1, OPTAB_DIRECT);
26825 /* We copied at most size + prolog_size. */
26826 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26827 *min_size
26828 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26829 else
26830 *min_size = 0;
26832 /* Our loops always round down the block size, but for dispatching to the
26833 library we need the precise value. */
26834 if (dynamic_check)
26835 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26836 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26838 else
26840 gcc_assert (prolog_size == 0);
26841 /* Decrease count, so we won't end up copying the last word twice. */
26842 if (!CONST_INT_P (*count))
26843 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26844 constm1_rtx, *count, 1, OPTAB_DIRECT);
26845 else
26846 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26847 (unsigned HOST_WIDE_INT)size));
26848 if (*min_size)
26849 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26854 /* This function is like the previous one, except here we know how many bytes
26855 need to be copied. That allows us to update alignment not only of DST, which
26856 is returned, but also of SRC, which is passed as a pointer for that
26857 reason. */
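   /* For example (illustrative): with DESIRED_ALIGN == 8 and ALIGN_BYTES == 7
      the loop below emits a 1-byte, a 2-byte and a 4-byte piece, one per bit
      set in ALIGN_BYTES, each via emit_memset or emit_memmov.  */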
26858 static rtx
26859 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26860 rtx srcreg, rtx value, rtx vec_value,
26861 int desired_align, int align_bytes,
26862 bool issetmem)
26864 rtx src = NULL;
26865 rtx orig_dst = dst;
26866 rtx orig_src = NULL;
26867 int piece_size = 1;
26868 int copied_bytes = 0;
26870 if (!issetmem)
26872 gcc_assert (srcp != NULL);
26873 src = *srcp;
26874 orig_src = src;
26877 for (piece_size = 1;
26878 piece_size <= desired_align && copied_bytes < align_bytes;
26879 piece_size <<= 1)
26881 if (align_bytes & piece_size)
26883 if (issetmem)
26885 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26886 dst = emit_memset (dst, destreg, vec_value, piece_size);
26887 else
26888 dst = emit_memset (dst, destreg, value, piece_size);
26890 else
26891 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26892 copied_bytes += piece_size;
26895 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26896 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26897 if (MEM_SIZE_KNOWN_P (orig_dst))
26898 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26900 if (!issetmem)
26902 int src_align_bytes = get_mem_align_offset (src, desired_align
26903 * BITS_PER_UNIT);
26904 if (src_align_bytes >= 0)
26905 src_align_bytes = desired_align - src_align_bytes;
26906 if (src_align_bytes >= 0)
26908 unsigned int src_align;
26909 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26911 if ((src_align_bytes & (src_align - 1))
26912 == (align_bytes & (src_align - 1)))
26913 break;
26915 if (src_align > (unsigned int) desired_align)
26916 src_align = desired_align;
26917 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26918 set_mem_align (src, src_align * BITS_PER_UNIT);
26920 if (MEM_SIZE_KNOWN_P (orig_src))
26921 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26922 *srcp = src;
26925 return dst;
26928 /* Return true if ALG can be used in current context.
26929 Assume we expand memset if MEMSET is true. */
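   /* E.g. the rep-prefixed variants are rejected below when %ecx or %edi
      (plus %eax for memset, or %esi for memcpy) have been made fixed by the
      user, or when a non-default address space is involved; vector_loop
      requires SSE or AVX.  */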
26930 static bool
26931 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26933 if (alg == no_stringop)
26934 return false;
26935 if (alg == vector_loop)
26936 return TARGET_SSE || TARGET_AVX;
26937 /* Algorithms using the rep prefix want at least edi and ecx;
26938 additionally, memset wants eax and memcpy wants esi. Don't
26939 consider such algorithms if the user has appropriated those
26940 registers for their own purposes, or if we have a non-default
26941 address space, since some string insns cannot override the segment. */
26942 if (alg == rep_prefix_1_byte
26943 || alg == rep_prefix_4_byte
26944 || alg == rep_prefix_8_byte)
26946 if (have_as)
26947 return false;
26948 if (fixed_regs[CX_REG]
26949 || fixed_regs[DI_REG]
26950 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26951 return false;
26953 return true;
26956 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
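   /* Roughly, the selection below proceeds as follows: a user-specified
      algorithm (ix86_stringop_alg) wins when it is usable; when optimizing
      for size a rep-prefixed variant (or a plain loop if rep cannot be used)
      is chosen; tiny expected sizes use a byte loop; otherwise the per-size
      table from the active cost structure decides, with a recursive retry
      when only a libcall would remain but inlining is forced.  */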
26957 static enum stringop_alg
26958 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26959 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26960 bool memset, bool zero_memset, bool have_as,
26961 int *dynamic_check, bool *noalign, bool recur)
26963 const struct stringop_algs *algs;
26964 bool optimize_for_speed;
26965 int max = 0;
26966 const struct processor_costs *cost;
26967 int i;
26968 bool any_alg_usable_p = false;
26970 *noalign = false;
26971 *dynamic_check = -1;
26973 /* Even if the string operation call is cold, we still might spend a lot
26974 of time processing large blocks. */
26975 if (optimize_function_for_size_p (cfun)
26976 || (optimize_insn_for_size_p ()
26977 && (max_size < 256
26978 || (expected_size != -1 && expected_size < 256))))
26979 optimize_for_speed = false;
26980 else
26981 optimize_for_speed = true;
26983 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26984 if (memset)
26985 algs = &cost->memset[TARGET_64BIT != 0];
26986 else
26987 algs = &cost->memcpy[TARGET_64BIT != 0];
26989 /* Find the maximal size for a user-defined algorithm. */
26990 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26992 enum stringop_alg candidate = algs->size[i].alg;
26993 bool usable = alg_usable_p (candidate, memset, have_as);
26994 any_alg_usable_p |= usable;
26996 if (candidate != libcall && candidate && usable)
26997 max = algs->size[i].max;
27000 /* If the expected size is not known but the max size is small enough
27001 that the inline version is a win, set the expected size into
27002 the range. */
27003 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27004 && expected_size == -1)
27005 expected_size = min_size / 2 + max_size / 2;
27007 /* If user specified the algorithm, honor it if possible. */
27008 if (ix86_stringop_alg != no_stringop
27009 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27010 return ix86_stringop_alg;
27011 /* rep; movq or rep; movl is the smallest variant. */
27012 else if (!optimize_for_speed)
27014 *noalign = true;
27015 if (!count || (count & 3) || (memset && !zero_memset))
27016 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27017 ? rep_prefix_1_byte : loop_1_byte;
27018 else
27019 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27020 ? rep_prefix_4_byte : loop;
27022 /* Very tiny blocks are best handled via the loop; REP is expensive to
27023 set up. */
27024 else if (expected_size != -1 && expected_size < 4)
27025 return loop_1_byte;
27026 else if (expected_size != -1)
27028 enum stringop_alg alg = libcall;
27029 bool alg_noalign = false;
27030 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27032 /* We get here if the algorithms that were not libcall-based
27033 were rep-prefix based and we are unable to use rep prefixes
27034 based on global register usage. Break out of the loop and
27035 use the heuristic below. */
27036 if (algs->size[i].max == 0)
27037 break;
27038 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27040 enum stringop_alg candidate = algs->size[i].alg;
27042 if (candidate != libcall
27043 && alg_usable_p (candidate, memset, have_as))
27045 alg = candidate;
27046 alg_noalign = algs->size[i].noalign;
27048 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking the
27049 last non-libcall inline algorithm. */
27050 if (TARGET_INLINE_ALL_STRINGOPS)
27052 /* When the current size is best copied by a libcall,
27053 but we are still forced to inline, run the heuristic below
27054 that will pick code for medium-sized blocks. */
27055 if (alg != libcall)
27057 *noalign = alg_noalign;
27058 return alg;
27060 else if (!any_alg_usable_p)
27061 break;
27063 else if (alg_usable_p (candidate, memset, have_as))
27065 *noalign = algs->size[i].noalign;
27066 return candidate;
27071 /* When asked to inline the call anyway, try to pick a meaningful choice.
27072 We look for the maximal size of a block that is faster to copy by hand and
27073 take blocks of at most that size, guessing that the average size will
27074 be roughly half of the block.
27076 If this turns out to be bad, we might simply specify the preferred
27077 choice in ix86_costs. */
27078 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27079 && (algs->unknown_size == libcall
27080 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27082 enum stringop_alg alg;
27083 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27085 /* If there aren't any usable algorithms, or if we are already recursing,
27086 then recursing on smaller or equal sizes isn't going to
27087 find anything. Just return the simple byte-at-a-time copy loop. */
27088 if (!any_alg_usable_p || recur)
27090 /* Pick something reasonable. */
27091 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27092 *dynamic_check = 128;
27093 return loop_1_byte;
27095 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27096 zero_memset, have_as, dynamic_check, noalign, true);
27097 gcc_assert (*dynamic_check == -1);
27098 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27099 *dynamic_check = max;
27100 else
27101 gcc_assert (alg != libcall);
27102 return alg;
27104 return (alg_usable_p (algs->unknown_size, memset, have_as)
27105 ? algs->unknown_size : libcall);
27108 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27109 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
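   /* For instance, a vector_loop using 16-byte moves asks for 16-byte
      destination alignment here, unless we optimize for size or the expected
      size is tiny, in which case no extra alignment beyond ALIGN is
      requested.  */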
27110 static int
27111 decide_alignment (int align,
27112 enum stringop_alg alg,
27113 int expected_size,
27114 machine_mode move_mode)
27116 int desired_align = 0;
27118 gcc_assert (alg != no_stringop);
27120 if (alg == libcall)
27121 return 0;
27122 if (move_mode == VOIDmode)
27123 return 0;
27125 desired_align = GET_MODE_SIZE (move_mode);
27126 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
27127 copying a whole cache line at once. */
27128 if (TARGET_PENTIUMPRO
27129 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27130 desired_align = 8;
27132 if (optimize_size)
27133 desired_align = 1;
27134 if (desired_align < align)
27135 desired_align = align;
27136 if (expected_size != -1 && expected_size < 4)
27137 desired_align = align;
27139 return desired_align;
27143 /* Helper function for memset. For a QImode value 0xXY produce
27144 0xXYXYXYXY of the width specified by MODE. This is essentially
27145 a * 0x01010101, but we can do slightly better than
27146 synth_mult by unwinding the sequence by hand on CPUs with
27147 slow multiply. */
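   /* Worked example (illustrative only): for VAL == 0x5A and MODE == SImode
      the constant path computes 0x5A | 0x5A00 | 0x5A5A0000 == 0x5A5A5A5A;
      the register path reaches the same value with shift-by-8 and shift-by-16
      or-steps (plus a shift-by-32 step for DImode).  */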
27148 static rtx
27149 promote_duplicated_reg (machine_mode mode, rtx val)
27151 machine_mode valmode = GET_MODE (val);
27152 rtx tmp;
27153 int nops = mode == DImode ? 3 : 2;
27155 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27156 if (val == const0_rtx)
27157 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27158 if (CONST_INT_P (val))
27160 HOST_WIDE_INT v = INTVAL (val) & 255;
27162 v |= v << 8;
27163 v |= v << 16;
27164 if (mode == DImode)
27165 v |= (v << 16) << 16;
27166 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27169 if (valmode == VOIDmode)
27170 valmode = QImode;
27171 if (valmode != QImode)
27172 val = gen_lowpart (QImode, val);
27173 if (mode == QImode)
27174 return val;
27175 if (!TARGET_PARTIAL_REG_STALL)
27176 nops--;
27177 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27178 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27179 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27180 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27182 rtx reg = convert_modes (mode, QImode, val, true);
27183 tmp = promote_duplicated_reg (mode, const1_rtx);
27184 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27185 OPTAB_DIRECT);
27187 else
27189 rtx reg = convert_modes (mode, QImode, val, true);
27191 if (!TARGET_PARTIAL_REG_STALL)
27192 if (mode == SImode)
27193 emit_insn (gen_insvsi_1 (reg, reg));
27194 else
27195 emit_insn (gen_insvdi_1 (reg, reg));
27196 else
27198 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27199 NULL, 1, OPTAB_DIRECT);
27200 reg =
27201 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27203 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27204 NULL, 1, OPTAB_DIRECT);
27205 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27206 if (mode == SImode)
27207 return reg;
27208 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27209 NULL, 1, OPTAB_DIRECT);
27210 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27211 return reg;
27215 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27216 be needed by the main loop copying SIZE_NEEDED chunks and by the prologue getting
27217 alignment from ALIGN to DESIRED_ALIGN. */
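   /* E.g. a 64-bit memset with SIZE_NEEDED == 8 replicates VAL into DImode
      below; a copy needing only 2-byte chunks and no extra alignment gets an
      HImode replica; QImode values are passed through unchanged.  */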
27218 static rtx
27219 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27220 int align)
27222 rtx promoted_val;
27224 if (TARGET_64BIT
27225 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27226 promoted_val = promote_duplicated_reg (DImode, val);
27227 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27228 promoted_val = promote_duplicated_reg (SImode, val);
27229 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27230 promoted_val = promote_duplicated_reg (HImode, val);
27231 else
27232 promoted_val = val;
27234 return promoted_val;
27237 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
27238 operations when profitable. The code depends upon architecture, block size
27239 and alignment, but always has one of the following overall structures:
27241 Aligned move sequence:
27243 1) Prologue guard: Conditional that jumps up to epilogues for small
27244 blocks that can be handled by epilogue alone. This is faster
27245 but also needed for correctness, since the prologue assumes the block
27246 is larger than the desired alignment.
27248 Optional dynamic check for size and libcall for large
27249 blocks is emitted here too, with -minline-stringops-dynamically.
27251 2) Prologue: copy first few bytes in order to get destination
27252 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27253 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27254 copied. We emit either a jump tree on power of two sized
27255 blocks, or a byte loop.
27257 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27258 with specified algorithm.
27260 4) Epilogue: code copying tail of the block that is too small to be
27261 handled by main body (or up to size guarded by prologue guard).
27263 Misaligned move sequence
27265 1) misaligned move prologue/epilogue containing:
27266 a) Prologue handling small memory blocks and jumping to done_label
27267 (skipped if blocks are known to be large enough)
27268 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
27269 is needed, done as one possibly misaligned move
27270 (skipped if alignment is not needed)
27271 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27273 2) Zero size guard dispatching to done_label, if needed
27275 3) dispatch to library call, if needed,
27277 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27278 with specified algorithm. */
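   /* A rough outline of the generated code for a copy of unknown size using
      the aligned sequence (details vary with the chosen algorithm): compare
      COUNT against the epilogue threshold and jump to the epilogue for small
      blocks, optionally test against the dynamic-check threshold and call the
      library for huge blocks, align the destination, run the main loop in
      SIZE_NEEDED chunks, then mop up the remaining bytes in the epilogue.  */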
27279 bool
27280 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27281 rtx align_exp, rtx expected_align_exp,
27282 rtx expected_size_exp, rtx min_size_exp,
27283 rtx max_size_exp, rtx probable_max_size_exp,
27284 bool issetmem)
27286 rtx destreg;
27287 rtx srcreg = NULL;
27288 rtx_code_label *label = NULL;
27289 rtx tmp;
27290 rtx_code_label *jump_around_label = NULL;
27291 HOST_WIDE_INT align = 1;
27292 unsigned HOST_WIDE_INT count = 0;
27293 HOST_WIDE_INT expected_size = -1;
27294 int size_needed = 0, epilogue_size_needed;
27295 int desired_align = 0, align_bytes = 0;
27296 enum stringop_alg alg;
27297 rtx promoted_val = NULL;
27298 rtx vec_promoted_val = NULL;
27299 bool force_loopy_epilogue = false;
27300 int dynamic_check;
27301 bool need_zero_guard = false;
27302 bool noalign;
27303 machine_mode move_mode = VOIDmode;
27304 int unroll_factor = 1;
27305 /* TODO: Once value ranges are available, fill in proper data. */
27306 unsigned HOST_WIDE_INT min_size = 0;
27307 unsigned HOST_WIDE_INT max_size = -1;
27308 unsigned HOST_WIDE_INT probable_max_size = -1;
27309 bool misaligned_prologue_used = false;
27310 bool have_as;
27312 if (CONST_INT_P (align_exp))
27313 align = INTVAL (align_exp);
27314 /* i386 can do misaligned accesses at a reasonably increased cost. */
27315 if (CONST_INT_P (expected_align_exp)
27316 && INTVAL (expected_align_exp) > align)
27317 align = INTVAL (expected_align_exp);
27318 /* ALIGN is the minimum of destination and source alignment, but we care here
27319 just about destination alignment. */
27320 else if (!issetmem
27321 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27322 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27324 if (CONST_INT_P (count_exp))
27326 min_size = max_size = probable_max_size = count = expected_size
27327 = INTVAL (count_exp);
27328 /* When COUNT is 0, there is nothing to do. */
27329 if (!count)
27330 return true;
27332 else
27334 if (min_size_exp)
27335 min_size = INTVAL (min_size_exp);
27336 if (max_size_exp)
27337 max_size = INTVAL (max_size_exp);
27338 if (probable_max_size_exp)
27339 probable_max_size = INTVAL (probable_max_size_exp);
27340 if (CONST_INT_P (expected_size_exp))
27341 expected_size = INTVAL (expected_size_exp);
27344 /* Make sure we don't need to care about overflow later on. */
27345 if (count > (HOST_WIDE_INT_1U << 30))
27346 return false;
27348 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27349 if (!issetmem)
27350 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27352 /* Step 0: Decide on preferred algorithm, desired alignment and
27353 size of chunks to be copied by main loop. */
27354 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27355 issetmem,
27356 issetmem && val_exp == const0_rtx, have_as,
27357 &dynamic_check, &noalign, false);
27358 if (alg == libcall)
27359 return false;
27360 gcc_assert (alg != no_stringop);
27362 /* For now the vector version of memset is generated only for memory zeroing, as
27363 creating a promoted vector value is very cheap in this case. */
27364 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27365 alg = unrolled_loop;
27367 if (!count)
27368 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27369 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27370 if (!issetmem)
27371 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27373 unroll_factor = 1;
27374 move_mode = word_mode;
27375 switch (alg)
27377 case libcall:
27378 case no_stringop:
27379 case last_alg:
27380 gcc_unreachable ();
27381 case loop_1_byte:
27382 need_zero_guard = true;
27383 move_mode = QImode;
27384 break;
27385 case loop:
27386 need_zero_guard = true;
27387 break;
27388 case unrolled_loop:
27389 need_zero_guard = true;
27390 unroll_factor = (TARGET_64BIT ? 4 : 2);
27391 break;
27392 case vector_loop:
27393 need_zero_guard = true;
27394 unroll_factor = 4;
27395 /* Find the widest supported mode. */
27396 move_mode = word_mode;
27397 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
27398 != CODE_FOR_nothing)
27399 move_mode = GET_MODE_WIDER_MODE (move_mode);
27401 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27402 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27403 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27405 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27406 move_mode = mode_for_vector (word_mode, nunits);
27407 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27408 move_mode = word_mode;
27410 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27411 break;
27412 case rep_prefix_8_byte:
27413 move_mode = DImode;
27414 break;
27415 case rep_prefix_4_byte:
27416 move_mode = SImode;
27417 break;
27418 case rep_prefix_1_byte:
27419 move_mode = QImode;
27420 break;
27422 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27423 epilogue_size_needed = size_needed;
27425 /* If we are going to emit any library calls conditionally, make sure any
27426 pending stack adjustment happens before the first conditional branch,
27427 otherwise it would be emitted only before the library call and wouldn't
27428 happen on the other branches. */
27429 if (dynamic_check != -1)
27430 do_pending_stack_adjust ();
27432 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27433 if (!TARGET_ALIGN_STRINGOPS || noalign)
27434 align = desired_align;
27436 /* Step 1: Prologue guard. */
27438 /* Alignment code needs count to be in register. */
27439 if (CONST_INT_P (count_exp) && desired_align > align)
27441 if (INTVAL (count_exp) > desired_align
27442 && INTVAL (count_exp) > size_needed)
27444 align_bytes
27445 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27446 if (align_bytes <= 0)
27447 align_bytes = 0;
27448 else
27449 align_bytes = desired_align - align_bytes;
27451 if (align_bytes == 0)
27452 count_exp = force_reg (counter_mode (count_exp), count_exp);
27454 gcc_assert (desired_align >= 1 && align >= 1);
27456 /* Misaligned move sequences handle both the prologue and the epilogue at once.
27457 Default code generation results in smaller code for large alignments
27458 and also avoids redundant work when sizes are known precisely. */
27459 misaligned_prologue_used
27460 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27461 && MAX (desired_align, epilogue_size_needed) <= 32
27462 && desired_align <= epilogue_size_needed
27463 && ((desired_align > align && !align_bytes)
27464 || (!count && epilogue_size_needed > 1)));
27466 /* Do the cheap promotion to allow better CSE across the
27467 main loop and epilogue (i.e. one load of the big constant in
27468 front of all the code).
27469 For now the misaligned move sequences do not have a fast path
27470 without broadcasting. */
27471 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27473 if (alg == vector_loop)
27475 gcc_assert (val_exp == const0_rtx);
27476 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27477 promoted_val = promote_duplicated_reg_to_size (val_exp,
27478 GET_MODE_SIZE (word_mode),
27479 desired_align, align);
27481 else
27483 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27484 desired_align, align);
27487 /* Misaligned move sequences handle both prologues and epilogues at once.
27488 Default code generation results in smaller code for large alignments and
27489 also avoids redundant work when sizes are known precisely. */
27490 if (misaligned_prologue_used)
27492 /* The misaligned move prologue handles small blocks by itself. */
27493 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27494 (dst, src, &destreg, &srcreg,
27495 move_mode, promoted_val, vec_promoted_val,
27496 &count_exp,
27497 &jump_around_label,
27498 desired_align < align
27499 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27500 desired_align, align, &min_size, dynamic_check, issetmem);
27501 if (!issetmem)
27502 src = change_address (src, BLKmode, srcreg);
27503 dst = change_address (dst, BLKmode, destreg);
27504 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27505 epilogue_size_needed = 0;
27506 if (need_zero_guard
27507 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27509 /* It is possible that we copied enough so the main loop will not
27510 execute. */
27511 gcc_assert (size_needed > 1);
27512 if (jump_around_label == NULL_RTX)
27513 jump_around_label = gen_label_rtx ();
27514 emit_cmp_and_jump_insns (count_exp,
27515 GEN_INT (size_needed),
27516 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27517 if (expected_size == -1
27518 || expected_size < (desired_align - align) / 2 + size_needed)
27519 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27520 else
27521 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27524 /* Ensure that the alignment prologue won't copy past the end of the block. */
27525 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27527 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27528 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
27529 Make sure it is a power of 2. */
27530 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27532 /* To improve the performance of small blocks, we jump around the VAL
27533 promotion. This means that if the promoted VAL is not constant,
27534 we might not use it in the epilogue and have to use the byte
27535 loop variant. */
27536 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27537 force_loopy_epilogue = true;
27538 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27539 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27541 /* If the main algorithm works on QImode, no epilogue is needed.
27542 For small sizes just don't align anything. */
27543 if (size_needed == 1)
27544 desired_align = align;
27545 else
27546 goto epilogue;
27548 else if (!count
27549 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27551 label = gen_label_rtx ();
27552 emit_cmp_and_jump_insns (count_exp,
27553 GEN_INT (epilogue_size_needed),
27554 LTU, 0, counter_mode (count_exp), 1, label);
27555 if (expected_size == -1 || expected_size < epilogue_size_needed)
27556 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27557 else
27558 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27562 /* Emit code to decide at runtime whether a library call or inline code should be
27563 used. */
27564 if (dynamic_check != -1)
27566 if (!issetmem && CONST_INT_P (count_exp))
27568 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27570 emit_block_copy_via_libcall (dst, src, count_exp);
27571 count_exp = const0_rtx;
27572 goto epilogue;
27575 else
27577 rtx_code_label *hot_label = gen_label_rtx ();
27578 if (jump_around_label == NULL_RTX)
27579 jump_around_label = gen_label_rtx ();
27580 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27581 LEU, 0, counter_mode (count_exp),
27582 1, hot_label);
27583 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27584 if (issetmem)
27585 set_storage_via_libcall (dst, count_exp, val_exp);
27586 else
27587 emit_block_copy_via_libcall (dst, src, count_exp);
27588 emit_jump (jump_around_label);
27589 emit_label (hot_label);
27593 /* Step 2: Alignment prologue. */
27594 /* Do the expensive promotion once we branched off the small blocks. */
27595 if (issetmem && !promoted_val)
27596 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27597 desired_align, align);
27599 if (desired_align > align && !misaligned_prologue_used)
27601 if (align_bytes == 0)
27603 /* Except for the first move in the prologue, we no longer know
27604 the constant offset in the aliasing info. It doesn't seem worth
27605 the pain to maintain it for the first move, so throw away
27606 the info early. */
27607 dst = change_address (dst, BLKmode, destreg);
27608 if (!issetmem)
27609 src = change_address (src, BLKmode, srcreg);
27610 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27611 promoted_val, vec_promoted_val,
27612 count_exp, align, desired_align,
27613 issetmem);
27614 /* At most desired_align - align bytes are copied. */
27615 if (min_size < (unsigned)(desired_align - align))
27616 min_size = 0;
27617 else
27618 min_size -= desired_align - align;
27620 else
27622 /* If we know how many bytes need to be stored before dst is
27623 sufficiently aligned, maintain aliasing info accurately. */
27624 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27625 srcreg,
27626 promoted_val,
27627 vec_promoted_val,
27628 desired_align,
27629 align_bytes,
27630 issetmem);
27632 count_exp = plus_constant (counter_mode (count_exp),
27633 count_exp, -align_bytes);
27634 count -= align_bytes;
27635 min_size -= align_bytes;
27636 max_size -= align_bytes;
27638 if (need_zero_guard
27639 && min_size < (unsigned HOST_WIDE_INT) size_needed
27640 && (count < (unsigned HOST_WIDE_INT) size_needed
27641 || (align_bytes == 0
27642 && count < ((unsigned HOST_WIDE_INT) size_needed
27643 + desired_align - align))))
27645 /* It is possible that we copied enough so the main loop will not
27646 execute. */
27647 gcc_assert (size_needed > 1);
27648 if (label == NULL_RTX)
27649 label = gen_label_rtx ();
27650 emit_cmp_and_jump_insns (count_exp,
27651 GEN_INT (size_needed),
27652 LTU, 0, counter_mode (count_exp), 1, label);
27653 if (expected_size == -1
27654 || expected_size < (desired_align - align) / 2 + size_needed)
27655 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27656 else
27657 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27660 if (label && size_needed == 1)
27662 emit_label (label);
27663 LABEL_NUSES (label) = 1;
27664 label = NULL;
27665 epilogue_size_needed = 1;
27666 if (issetmem)
27667 promoted_val = val_exp;
27669 else if (label == NULL_RTX && !misaligned_prologue_used)
27670 epilogue_size_needed = size_needed;
27672 /* Step 3: Main loop. */
27674 switch (alg)
27676 case libcall:
27677 case no_stringop:
27678 case last_alg:
27679 gcc_unreachable ();
27680 case loop_1_byte:
27681 case loop:
27682 case unrolled_loop:
27683 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27684 count_exp, move_mode, unroll_factor,
27685 expected_size, issetmem);
27686 break;
27687 case vector_loop:
27688 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27689 vec_promoted_val, count_exp, move_mode,
27690 unroll_factor, expected_size, issetmem);
27691 break;
27692 case rep_prefix_8_byte:
27693 case rep_prefix_4_byte:
27694 case rep_prefix_1_byte:
27695 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27696 val_exp, count_exp, move_mode, issetmem);
27697 break;
27699 /* Properly adjust the offsets of the src and dest memory for aliasing. */
27700 if (CONST_INT_P (count_exp))
27702 if (!issetmem)
27703 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27704 (count / size_needed) * size_needed);
27705 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27706 (count / size_needed) * size_needed);
27708 else
27710 if (!issetmem)
27711 src = change_address (src, BLKmode, srcreg);
27712 dst = change_address (dst, BLKmode, destreg);
27715 /* Step 4: Epilogue to copy the remaining bytes. */
27716 epilogue:
27717 if (label)
27719 /* When the main loop is done, COUNT_EXP might hold the original count,
27720 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
27721 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
27722 bytes. Compensate if needed. */
27724 if (size_needed < epilogue_size_needed)
27726 tmp =
27727 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27728 GEN_INT (size_needed - 1), count_exp, 1,
27729 OPTAB_DIRECT);
27730 if (tmp != count_exp)
27731 emit_move_insn (count_exp, tmp);
27733 emit_label (label);
27734 LABEL_NUSES (label) = 1;
27737 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27739 if (force_loopy_epilogue)
27740 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27741 epilogue_size_needed);
27742 else
27744 if (issetmem)
27745 expand_setmem_epilogue (dst, destreg, promoted_val,
27746 vec_promoted_val, count_exp,
27747 epilogue_size_needed);
27748 else
27749 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27750 epilogue_size_needed);
27753 if (jump_around_label)
27754 emit_label (jump_around_label);
27755 return true;
27759 /* Expand the appropriate insns for doing strlen if not just doing
27760 repnz; scasb
27762 out = result, initialized with the start address
27763 align_rtx = alignment of the address.
27764 scratch = scratch register, initialized with the start address when
27765 not aligned, otherwise undefined
27767 This is just the body. It needs the initializations mentioned above and
27768 some address computing at the end. These things are done in i386.md. */
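   /* In outline (illustrative): advance OUT byte by byte until it is 4-byte
      aligned, checking each byte for the terminating zero; then scan a word
      at a time using the zero-byte test emitted further down; finally use the
      0x8080 masks (and a cmov or a branch) to locate the zero byte within the
      last word.  */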
27770 static void
27771 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27773 int align;
27774 rtx tmp;
27775 rtx_code_label *align_2_label = NULL;
27776 rtx_code_label *align_3_label = NULL;
27777 rtx_code_label *align_4_label = gen_label_rtx ();
27778 rtx_code_label *end_0_label = gen_label_rtx ();
27779 rtx mem;
27780 rtx tmpreg = gen_reg_rtx (SImode);
27781 rtx scratch = gen_reg_rtx (SImode);
27782 rtx cmp;
27784 align = 0;
27785 if (CONST_INT_P (align_rtx))
27786 align = INTVAL (align_rtx);
27788 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27790 /* Is there a known alignment and is it less than 4? */
27791 if (align < 4)
27793 rtx scratch1 = gen_reg_rtx (Pmode);
27794 emit_move_insn (scratch1, out);
27795 /* Is there a known alignment and is it not 2? */
27796 if (align != 2)
27798 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27799 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27801 /* Leave just the two lower bits. */
27802 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27803 NULL_RTX, 0, OPTAB_WIDEN);
27805 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27806 Pmode, 1, align_4_label);
27807 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27808 Pmode, 1, align_2_label);
27809 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27810 Pmode, 1, align_3_label);
27812 else
27814 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27815 check whether it is aligned to a 4-byte boundary. */
27817 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27818 NULL_RTX, 0, OPTAB_WIDEN);
27820 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27821 Pmode, 1, align_4_label);
27824 mem = change_address (src, QImode, out);
27826 /* Now compare the bytes. */
27828 /* Compare the first n unaligned bytes one byte at a time. */
27829 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27830 QImode, 1, end_0_label);
27832 /* Increment the address. */
27833 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27835 /* Not needed with an alignment of 2 */
27836 if (align != 2)
27838 emit_label (align_2_label);
27840 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27841 end_0_label);
27843 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27845 emit_label (align_3_label);
27848 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27849 end_0_label);
27851 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27854 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27855 align this loop; it only makes the program bigger and does not help to
27856 speed it up. */
27857 emit_label (align_4_label);
27859 mem = change_address (src, SImode, out);
27860 emit_move_insn (scratch, mem);
27861 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27863 /* This formula yields a nonzero result iff one of the bytes is zero.
27864 This saves three branches inside the loop and many cycles. */
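   /* Illustration of the test: for scratch == 0x41420042 (one zero byte),
      (scratch - 0x01010101) & ~scratch & 0x80808080 == 0x00008000, which is
      nonzero; for scratch == 0x41424344 (no zero byte) the same expression
      is 0.  */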
27866 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27867 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27868 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27869 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27870 gen_int_mode (0x80808080, SImode)));
27871 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27872 align_4_label);
27874 if (TARGET_CMOVE)
27876 rtx reg = gen_reg_rtx (SImode);
27877 rtx reg2 = gen_reg_rtx (Pmode);
27878 emit_move_insn (reg, tmpreg);
27879 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27881 /* If zero is not in the first two bytes, move two bytes forward. */
27882 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27883 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27884 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27885 emit_insn (gen_rtx_SET (tmpreg,
27886 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27887 reg,
27888 tmpreg)));
27889 /* Emit lea manually to avoid clobbering of flags. */
27890 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27892 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27893 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27894 emit_insn (gen_rtx_SET (out,
27895 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27896 reg2,
27897 out)));
27899 else
27901 rtx_code_label *end_2_label = gen_label_rtx ();
27902 /* Is zero in the first two bytes? */
27904 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27905 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27906 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27907 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27908 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27909 pc_rtx);
27910 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27911 JUMP_LABEL (tmp) = end_2_label;
27913 /* Not in the first two. Move two bytes forward. */
27914 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27915 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27917 emit_label (end_2_label);
27921 /* Avoid branch in fixing the byte. */
27922 tmpreg = gen_lowpart (QImode, tmpreg);
27923 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27924 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27925 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27926 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27928 emit_label (end_0_label);
27931 /* Expand strlen. */
27933 bool
27934 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27936 rtx addr, scratch1, scratch2, scratch3, scratch4;
27938 /* The generic case of the strlen expander is long. Avoid expanding it
27939 unless TARGET_INLINE_ALL_STRINGOPS. */
27941 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27942 && !TARGET_INLINE_ALL_STRINGOPS
27943 && !optimize_insn_for_size_p ()
27944 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27945 return false;
27947 addr = force_reg (Pmode, XEXP (src, 0));
27948 scratch1 = gen_reg_rtx (Pmode);
27950 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27951 && !optimize_insn_for_size_p ())
27953 /* It seems that some optimizers do not combine a call like
27954 foo (strlen (bar), strlen (bar));
27955 when the move and the subtraction are done here. The length is
27956 still calculated just once when these instructions are emitted
27957 inside output_strlen_unroll(). But since &bar[strlen (bar)] is
27958 often used, and this uses one fewer register for the lifetime of
27959 output_strlen_unroll(), this is better. */
27961 emit_move_insn (out, addr);
27963 ix86_expand_strlensi_unroll_1 (out, src, align);
27965 /* strlensi_unroll_1 returns the address of the zero at the end of
27966 the string, like memchr(), so compute the length by subtracting
27967 the start address. */
27968 emit_insn (ix86_gen_sub3 (out, out, addr));
27970 else
27972 rtx unspec;
27974 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27975 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27976 return false;
27977 /* Can't use this for non-default address spaces. */
27978 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27979 return false;
27981 scratch2 = gen_reg_rtx (Pmode);
27982 scratch3 = gen_reg_rtx (Pmode);
27983 scratch4 = force_reg (Pmode, constm1_rtx);
27985 emit_move_insn (scratch3, addr);
27986 eoschar = force_reg (QImode, eoschar);
27988 src = replace_equiv_address_nv (src, scratch3);
27990 /* If .md starts supporting :P, this can be done in .md. */
27991 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27992 scratch4), UNSPEC_SCAS);
27993 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27994 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27995 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27997 return true;
28000 /* For a given symbol (function), construct code to compute the address of its PLT
28001 entry in the large x86-64 PIC model. */
28002 static rtx
28003 construct_plt_address (rtx symbol)
28005 rtx tmp, unspec;
28007 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28008 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28009 gcc_assert (Pmode == DImode);
28011 tmp = gen_reg_rtx (Pmode);
28012 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28014 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28015 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28016 return tmp;
28020 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28021 rtx callarg2,
28022 rtx pop, bool sibcall)
28024 rtx vec[3];
28025 rtx use = NULL, call;
28026 unsigned int vec_len = 0;
28027 tree fndecl;
28029 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28031 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28032 if (fndecl
28033 && (lookup_attribute ("interrupt",
28034 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28035 error ("interrupt service routine can't be called directly");
28037 else
28038 fndecl = NULL_TREE;
28040 if (pop == const0_rtx)
28041 pop = NULL;
28042 gcc_assert (!TARGET_64BIT || !pop);
28044 if (TARGET_MACHO && !TARGET_64BIT)
28046 #if TARGET_MACHO
28047 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28048 fnaddr = machopic_indirect_call_target (fnaddr);
28049 #endif
28051 else
28053 /* Static functions and indirect calls don't need the pic register. Also,
28054 check if the PLT was explicitly avoided via -fno-plt or the "noplt" attribute,
28055 making it an indirect call. */
28056 rtx addr = XEXP (fnaddr, 0);
28057 if (flag_pic
28058 && GET_CODE (addr) == SYMBOL_REF
28059 && !SYMBOL_REF_LOCAL_P (addr))
28061 if (flag_plt
28062 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28063 || !lookup_attribute ("noplt",
28064 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28066 if (!TARGET_64BIT
28067 || (ix86_cmodel == CM_LARGE_PIC
28068 && DEFAULT_ABI != MS_ABI))
28070 use_reg (&use, gen_rtx_REG (Pmode,
28071 REAL_PIC_OFFSET_TABLE_REGNUM));
28072 if (ix86_use_pseudo_pic_reg ())
28073 emit_move_insn (gen_rtx_REG (Pmode,
28074 REAL_PIC_OFFSET_TABLE_REGNUM),
28075 pic_offset_table_rtx);
28078 else if (!TARGET_PECOFF && !TARGET_MACHO)
28080 if (TARGET_64BIT)
28082 fnaddr = gen_rtx_UNSPEC (Pmode,
28083 gen_rtvec (1, addr),
28084 UNSPEC_GOTPCREL);
28085 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28087 else
28089 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28090 UNSPEC_GOT);
28091 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28092 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28093 fnaddr);
28095 fnaddr = gen_const_mem (Pmode, fnaddr);
28096 /* Pmode may not be the same as word_mode for x32, which
28097 doesn't support an indirect branch via a 32-bit memory slot.
28098 Since the x32 GOT slot is 64-bit with zero upper 32 bits, an
28099 indirect branch via the x32 GOT slot is OK. */
28100 if (GET_MODE (fnaddr) != word_mode)
28101 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28102 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28107 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28108 parameters passed in vector registers. */
28109 if (TARGET_64BIT
28110 && (INTVAL (callarg2) > 0
28111 || (INTVAL (callarg2) == 0
28112 && (TARGET_SSE || !flag_skip_rax_setup))))
28114 rtx al = gen_rtx_REG (QImode, AX_REG);
28115 emit_move_insn (al, callarg2);
28116 use_reg (&use, al);
28119 if (ix86_cmodel == CM_LARGE_PIC
28120 && !TARGET_PECOFF
28121 && MEM_P (fnaddr)
28122 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28123 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28124 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28125 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28126 branch via x32 GOT slot is OK. */
28127 else if (!(TARGET_X32
28128 && MEM_P (fnaddr)
28129 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28130 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28131 && (sibcall
28132 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28133 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28135 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28136 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28139 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28141 if (retval)
28143 /* We should add bounds as a destination register in case
28144 a pointer with bounds may be returned. */
28145 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28147 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28148 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28149 if (GET_CODE (retval) == PARALLEL)
28151 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28152 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28153 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28154 retval = chkp_join_splitted_slot (retval, par);
28156 else
28158 retval = gen_rtx_PARALLEL (VOIDmode,
28159 gen_rtvec (3, retval, b0, b1));
28160 chkp_put_regs_to_expr_list (retval);
28164 call = gen_rtx_SET (retval, call);
28166 vec[vec_len++] = call;
28168 if (pop)
28170 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28171 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28172 vec[vec_len++] = pop;
28175 if (cfun->machine->no_caller_saved_registers
28176 && (!fndecl
28177 || (!TREE_THIS_VOLATILE (fndecl)
28178 && !lookup_attribute ("no_caller_saved_registers",
28179 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28181 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28182 bool is_64bit_ms_abi = (TARGET_64BIT
28183 && ix86_function_abi (fndecl) == MS_ABI);
28184 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28186 /* If there are no caller-saved registers, add all registers
28187 that are clobbered by the call which returns. */
28188 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28189 if (!fixed_regs[i]
28190 && (ix86_call_used_regs[i] == 1
28191 || (ix86_call_used_regs[i] & c_mask))
28192 && !STACK_REGNO_P (i)
28193 && !MMX_REGNO_P (i))
28194 clobber_reg (&use,
28195 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28197 else if (TARGET_64BIT_MS_ABI
28198 && (!callarg2 || INTVAL (callarg2) != -2))
28200 int const cregs_size
28201 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
28202 int i;
28204 for (i = 0; i < cregs_size; i++)
28206 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28207 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28209 clobber_reg (&use, gen_rtx_REG (mode, regno));
28213 if (vec_len > 1)
28214 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28215 call = emit_call_insn (call);
28216 if (use)
28217 CALL_INSN_FUNCTION_USAGE (call) = use;
28219 return call;
28222 /* Return true if the function being called was marked with attribute
28223 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28224 to handle the non-PIC case in the backend because there is no easy
28225 interface for the front-end to force non-PLT calls to use the GOT.
28226 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28227 to call the function marked "noplt" indirectly. */
28229 static bool
28230 ix86_nopic_noplt_attribute_p (rtx call_op)
28232 if (flag_pic || ix86_cmodel == CM_LARGE
28233 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28234 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28235 || SYMBOL_REF_LOCAL_P (call_op))
28236 return false;
28238 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28240 if (!flag_plt
28241 || (symbol_decl != NULL_TREE
28242 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28243 return true;
28245 return false;
28248 /* Output the assembly for a call instruction. */
28250 const char *
28251 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28253 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28254 bool seh_nop_p = false;
28255 const char *xasm;
28257 if (SIBLING_CALL_P (insn))
28259 if (direct_p)
28261 if (ix86_nopic_noplt_attribute_p (call_op))
28263 if (TARGET_64BIT)
28264 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28265 else
28266 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28268 else
28269 xasm = "%!jmp\t%P0";
28271 /* SEH epilogue detection requires the indirect branch case
28272 to include REX.W. */
28273 else if (TARGET_SEH)
28274 xasm = "%!rex.W jmp\t%A0";
28275 else
28276 xasm = "%!jmp\t%A0";
28278 output_asm_insn (xasm, &call_op);
28279 return "";
28282 /* SEH unwinding can require an extra nop to be emitted in several
28283 circumstances. Determine if we have one of those. */
28284 if (TARGET_SEH)
28286 rtx_insn *i;
28288 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28290 /* If we get to another real insn, we don't need the nop. */
28291 if (INSN_P (i))
28292 break;
28294 /* If we get to the epilogue note, prevent a catch region from
28295 being adjacent to the standard epilogue sequence. If non-
28296 call-exceptions, we'll have done this during epilogue emission. */
28297 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28298 && !flag_non_call_exceptions
28299 && !can_throw_internal (insn))
28301 seh_nop_p = true;
28302 break;
28306 /* If we didn't find a real insn following the call, prevent the
28307 unwinder from looking into the next function. */
28308 if (i == NULL)
28309 seh_nop_p = true;
28312 if (direct_p)
28314 if (ix86_nopic_noplt_attribute_p (call_op))
28316 if (TARGET_64BIT)
28317 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28318 else
28319 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28321 else
28322 xasm = "%!call\t%P0";
28324 else
28325 xasm = "%!call\t%A0";
28327 output_asm_insn (xasm, &call_op);
28329 if (seh_nop_p)
28330 return "nop";
28332 return "";
28335 /* Clear stack slot assignments remembered from previous functions.
28336 This is called from INIT_EXPANDERS once before RTL is emitted for each
28337 function. */
28339 static struct machine_function *
28340 ix86_init_machine_status (void)
28342 struct machine_function *f;
28344 f = ggc_cleared_alloc<machine_function> ();
28345 f->use_fast_prologue_epilogue_nregs = -1;
28346 f->call_abi = ix86_abi;
28348 return f;
28351 /* Return a MEM corresponding to a stack slot with mode MODE.
28352 Allocate a new slot if necessary.
28354 The RTL for a function can have several slots available: N is
28355 which slot to use. */
28358 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28360 struct stack_local_entry *s;
28362 gcc_assert (n < MAX_386_STACK_LOCALS);
28364 for (s = ix86_stack_locals; s; s = s->next)
28365 if (s->mode == mode && s->n == n)
28366 return validize_mem (copy_rtx (s->rtl));
28368 s = ggc_alloc<stack_local_entry> ();
28369 s->n = n;
28370 s->mode = mode;
28371 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28373 s->next = ix86_stack_locals;
28374 ix86_stack_locals = s;
28375 return validize_mem (copy_rtx (s->rtl));
28378 static void
28379 ix86_instantiate_decls (void)
28381 struct stack_local_entry *s;
28383 for (s = ix86_stack_locals; s; s = s->next)
28384 if (s->rtl != NULL_RTX)
28385 instantiate_decl_rtl (s->rtl);
28388 /* Return the number used for encoding REG, in the range 0..7. */
28390 static int
28391 reg_encoded_number (rtx reg)
28393 unsigned regno = REGNO (reg);
28394 switch (regno)
28396 case AX_REG:
28397 return 0;
28398 case CX_REG:
28399 return 1;
28400 case DX_REG:
28401 return 2;
28402 case BX_REG:
28403 return 3;
28404 case SP_REG:
28405 return 4;
28406 case BP_REG:
28407 return 5;
28408 case SI_REG:
28409 return 6;
28410 case DI_REG:
28411 return 7;
28412 default:
28413 break;
28415 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28416 return regno - FIRST_STACK_REG;
28417 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28418 return regno - FIRST_SSE_REG;
28419 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28420 return regno - FIRST_MMX_REG;
28421 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28422 return regno - FIRST_REX_SSE_REG;
28423 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28424 return regno - FIRST_REX_INT_REG;
28425 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28426 return regno - FIRST_MASK_REG;
28427 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28428 return regno - FIRST_BND_REG;
28429 return -1;
28432 /* Given an insn INSN with its NOPERANDS operands in OPERANDS, return the modr/m byte used
28433 in its encoding if it could be relevant for ROP mitigation, otherwise
28434 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28435 used for calculating it into them. */
28437 static int
28438 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28439 int *popno0 = 0, int *popno1 = 0)
28441 if (asm_noperands (PATTERN (insn)) >= 0)
28442 return -1;
28443 int has_modrm = get_attr_modrm (insn);
28444 if (!has_modrm)
28445 return -1;
28446 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28447 rtx op0, op1;
28448 switch (cls)
28450 case MODRM_CLASS_OP02:
28451 gcc_assert (noperands >= 3);
28452 if (popno0)
28454 *popno0 = 0;
28455 *popno1 = 2;
28457 op0 = operands[0];
28458 op1 = operands[2];
28459 break;
28460 case MODRM_CLASS_OP01:
28461 gcc_assert (noperands >= 2);
28462 if (popno0)
28464 *popno0 = 0;
28465 *popno1 = 1;
28467 op0 = operands[0];
28468 op1 = operands[1];
28469 break;
28470 default:
28471 return -1;
28473 if (REG_P (op0) && REG_P (op1))
28475 int enc0 = reg_encoded_number (op0);
28476 int enc1 = reg_encoded_number (op1);
28477 return 0xc0 + (enc1 << 3) + enc0;
28479 return -1;
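/* For example (with hypothetical operands): a MODRM_CLASS_OP01 insn with
   operands[0] == %ecx and operands[1] == %edx yields
   0xc0 + (2 << 3) + 1 == 0xd1, which is indeed the modr/m byte of
   "mov %edx, %ecx" (89 d1).  */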
28482 /* Check whether x86 address PARTS is a pc-relative address. */
28484 static bool
28485 rip_relative_addr_p (struct ix86_address *parts)
28487 rtx base, index, disp;
28489 base = parts->base;
28490 index = parts->index;
28491 disp = parts->disp;
28493 if (disp && !base && !index)
28495 if (TARGET_64BIT)
28497 rtx symbol = disp;
28499 if (GET_CODE (disp) == CONST)
28500 symbol = XEXP (disp, 0);
28501 if (GET_CODE (symbol) == PLUS
28502 && CONST_INT_P (XEXP (symbol, 1)))
28503 symbol = XEXP (symbol, 0);
28505 if (GET_CODE (symbol) == LABEL_REF
28506 || (GET_CODE (symbol) == SYMBOL_REF
28507 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28508 || (GET_CODE (symbol) == UNSPEC
28509 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28510 || XINT (symbol, 1) == UNSPEC_PCREL
28511 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28512 return true;
28515 return false;
28518 /* Calculate the length of the memory address in the instruction encoding.
28519 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28520 or other prefixes. We never generate addr32 prefix for LEA insn. */
28523 memory_address_length (rtx addr, bool lea)
28525 struct ix86_address parts;
28526 rtx base, index, disp;
28527 int len;
28528 int ok;
28530 if (GET_CODE (addr) == PRE_DEC
28531 || GET_CODE (addr) == POST_INC
28532 || GET_CODE (addr) == PRE_MODIFY
28533 || GET_CODE (addr) == POST_MODIFY)
28534 return 0;
28536 ok = ix86_decompose_address (addr, &parts);
28537 gcc_assert (ok);
28539 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28541 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
28542 if (TARGET_64BIT && !lea
28543 && (SImode_address_operand (addr, VOIDmode)
28544 || (parts.base && GET_MODE (parts.base) == SImode)
28545 || (parts.index && GET_MODE (parts.index) == SImode)))
28546 len++;
28548 base = parts.base;
28549 index = parts.index;
28550 disp = parts.disp;
28552 if (base && SUBREG_P (base))
28553 base = SUBREG_REG (base);
28554 if (index && SUBREG_P (index))
28555 index = SUBREG_REG (index);
28557 gcc_assert (base == NULL_RTX || REG_P (base));
28558 gcc_assert (index == NULL_RTX || REG_P (index));
28560 /* Rule of thumb:
28561 - esp as the base always wants an index,
28562 - ebp as the base always wants a displacement,
28563 - r12 as the base always wants an index,
28564 - r13 as the base always wants a displacement. */
28566 /* Register Indirect. */
28567 if (base && !index && !disp)
28569 /* esp (for its index) and ebp (for its displacement) need
28570 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28571 code. */
28572 if (base == arg_pointer_rtx
28573 || base == frame_pointer_rtx
28574 || REGNO (base) == SP_REG
28575 || REGNO (base) == BP_REG
28576 || REGNO (base) == R12_REG
28577 || REGNO (base) == R13_REG)
28578 len++;
28581 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28582 is not disp32, but disp32(%rip), so for disp32
28583 SIB byte is needed, unless print_operand_address
28584 optimizes it into disp32(%rip) or (%rip) is implied
28585 by UNSPEC. */
28586 else if (disp && !base && !index)
28588 len += 4;
28589 if (TARGET_64BIT && !rip_relative_addr_p (&parts))
28590 len++;
28592 else
28594 /* Find the length of the displacement constant. */
28595 if (disp)
28597 if (base && satisfies_constraint_K (disp))
28598 len += 1;
28599 else
28600 len += 4;
28602 /* ebp always wants a displacement. Similarly r13. */
28603 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28604 len++;
28606 /* An index requires the two-byte modrm form.... */
28607 if (index
28608 /* ...like esp (or r12), which always wants an index. */
28609 || base == arg_pointer_rtx
28610 || base == frame_pointer_rtx
28611 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28612 len++;
28615 return len;
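/* A few worked examples of the rules above (the modrm byte itself is never
   counted): "(%ebp)" needs an implicit zero disp8, so the length is 1;
   "4(%esp)" needs a SIB byte plus a disp8, so the length is 2; and in
   64-bit code a bare symbolic address that cannot be expressed
   %rip-relatively needs SIB + disp32, so the length is 5.  */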
28618 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28619 is set, expect that the insn has an 8-bit immediate alternative. */
28621 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28623 int len = 0;
28624 int i;
28625 extract_insn_cached (insn);
28626 for (i = recog_data.n_operands - 1; i >= 0; --i)
28627 if (CONSTANT_P (recog_data.operand[i]))
28629 enum attr_mode mode = get_attr_mode (insn);
28631 gcc_assert (!len);
28632 if (shortform && CONST_INT_P (recog_data.operand[i]))
28634 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28635 switch (mode)
28637 case MODE_QI:
28638 len = 1;
28639 continue;
28640 case MODE_HI:
28641 ival = trunc_int_for_mode (ival, HImode);
28642 break;
28643 case MODE_SI:
28644 ival = trunc_int_for_mode (ival, SImode);
28645 break;
28646 default:
28647 break;
28649 if (IN_RANGE (ival, -128, 127))
28651 len = 1;
28652 continue;
28655 switch (mode)
28657 case MODE_QI:
28658 len = 1;
28659 break;
28660 case MODE_HI:
28661 len = 2;
28662 break;
28663 case MODE_SI:
28664 len = 4;
28665 break;
28666 /* Immediates for DImode instructions are encoded
28667 as 32bit sign extended values. */
28668 case MODE_DI:
28669 len = 4;
28670 break;
28671 default:
28672 fatal_insn ("unknown insn mode", insn);
28675 return len;
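/* For example, "add $3, %eax" in an insn with an imm8 alternative
   (SHORTFORM set) gets immediate length 1, whereas "add $1000, %eax"
   needs the full 32-bit immediate and gets length 4.  */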
28678 /* Compute default value for "length_address" attribute. */
28680 ix86_attr_length_address_default (rtx_insn *insn)
28682 int i;
28684 if (get_attr_type (insn) == TYPE_LEA)
28686 rtx set = PATTERN (insn), addr;
28688 if (GET_CODE (set) == PARALLEL)
28689 set = XVECEXP (set, 0, 0);
28691 gcc_assert (GET_CODE (set) == SET);
28693 addr = SET_SRC (set);
28695 return memory_address_length (addr, true);
28698 extract_insn_cached (insn);
28699 for (i = recog_data.n_operands - 1; i >= 0; --i)
28701 rtx op = recog_data.operand[i];
28702 if (MEM_P (op))
28704 constrain_operands_cached (insn, reload_completed);
28705 if (which_alternative != -1)
28707 const char *constraints = recog_data.constraints[i];
28708 int alt = which_alternative;
28710 while (*constraints == '=' || *constraints == '+')
28711 constraints++;
28712 while (alt-- > 0)
28713 while (*constraints++ != ',')
28715 /* Skip ignored operands. */
28716 if (*constraints == 'X')
28717 continue;
28720 int len = memory_address_length (XEXP (op, 0), false);
28722 /* Account for segment prefix for non-default addr spaces. */
28723 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28724 len++;
28726 return len;
28729 return 0;
28732 /* Compute default value for "length_vex" attribute. It includes
28733 2 or 3 byte VEX prefix and 1 opcode byte. */
28736 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28737 bool has_vex_w)
28739 int i;
28741 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W bit
28742 requires the 3-byte VEX prefix. */
28743 if (!has_0f_opcode || has_vex_w)
28744 return 3 + 1;
28746 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
28747 if (!TARGET_64BIT)
28748 return 2 + 1;
28750 extract_insn_cached (insn);
28752 for (i = recog_data.n_operands - 1; i >= 0; --i)
28753 if (REG_P (recog_data.operand[i]))
28755 /* REX.W bit uses 3 byte VEX prefix. */
28756 if (GET_MODE (recog_data.operand[i]) == DImode
28757 && GENERAL_REG_P (recog_data.operand[i]))
28758 return 3 + 1;
28760 else
28762 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28763 if (MEM_P (recog_data.operand[i])
28764 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28765 return 3 + 1;
28768 return 2 + 1;
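/* For example, in 64-bit code an insn using only the 0f opcode map, no
   VEX.W, no DImode general register operand and no extended register in a
   memory operand is assumed to use the 2-byte VEX prefix, giving
   2 + 1 == 3; anything requiring VEX.W or the REX.X/REX.B bits is counted
   with the 3-byte prefix, giving 3 + 1 == 4.  */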
28771 /* Return the maximum number of instructions a cpu can issue. */
28773 static int
28774 ix86_issue_rate (void)
28776 switch (ix86_tune)
28778 case PROCESSOR_PENTIUM:
28779 case PROCESSOR_LAKEMONT:
28780 case PROCESSOR_BONNELL:
28781 case PROCESSOR_SILVERMONT:
28782 case PROCESSOR_KNL:
28783 case PROCESSOR_INTEL:
28784 case PROCESSOR_K6:
28785 case PROCESSOR_BTVER2:
28786 case PROCESSOR_PENTIUM4:
28787 case PROCESSOR_NOCONA:
28788 return 2;
28790 case PROCESSOR_PENTIUMPRO:
28791 case PROCESSOR_ATHLON:
28792 case PROCESSOR_K8:
28793 case PROCESSOR_AMDFAM10:
28794 case PROCESSOR_GENERIC:
28795 case PROCESSOR_BTVER1:
28796 return 3;
28798 case PROCESSOR_BDVER1:
28799 case PROCESSOR_BDVER2:
28800 case PROCESSOR_BDVER3:
28801 case PROCESSOR_BDVER4:
28802 case PROCESSOR_ZNVER1:
28803 case PROCESSOR_CORE2:
28804 case PROCESSOR_NEHALEM:
28805 case PROCESSOR_SANDYBRIDGE:
28806 case PROCESSOR_HASWELL:
28807 return 4;
28809 default:
28810 return 1;
28814 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags set
28815 by DEP_INSN and nothing else set by DEP_INSN. */
28817 static bool
28818 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
28820 rtx set, set2;
28822 /* Simplify the test for uninteresting insns. */
28823 if (insn_type != TYPE_SETCC
28824 && insn_type != TYPE_ICMOV
28825 && insn_type != TYPE_FCMOV
28826 && insn_type != TYPE_IBR)
28827 return false;
28829 if ((set = single_set (dep_insn)) != 0)
28831 set = SET_DEST (set);
28832 set2 = NULL_RTX;
28834 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
28835 && XVECLEN (PATTERN (dep_insn), 0) == 2
28836 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
28837 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
28839 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
28840 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
28842 else
28843 return false;
28845 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
28846 return false;
28848 /* This test is true if the dependent insn reads the flags but
28849 not any other potentially set register. */
28850 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
28851 return false;
28853 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
28854 return false;
28856 return true;
28859 /* Return true iff USE_INSN has a memory address with operands set by
28860 SET_INSN. */
28862 bool
28863 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
28865 int i;
28866 extract_insn_cached (use_insn);
28867 for (i = recog_data.n_operands - 1; i >= 0; --i)
28868 if (MEM_P (recog_data.operand[i]))
28870 rtx addr = XEXP (recog_data.operand[i], 0);
28871 return modified_in_p (addr, set_insn) != 0;
28873 return false;
28876 /* Helper function for exact_store_load_dependency.
28877 Return true if addr is found in insn. */
28878 static bool
28879 exact_dependency_1 (rtx addr, rtx insn)
28881 enum rtx_code code;
28882 const char *format_ptr;
28883 int i, j;
28885 code = GET_CODE (insn);
28886 switch (code)
28888 case MEM:
28889 if (rtx_equal_p (addr, insn))
28890 return true;
28891 break;
28892 case REG:
28893 CASE_CONST_ANY:
28894 case SYMBOL_REF:
28895 case CODE_LABEL:
28896 case PC:
28897 case CC0:
28898 case EXPR_LIST:
28899 return false;
28900 default:
28901 break;
28904 format_ptr = GET_RTX_FORMAT (code);
28905 for (i = 0; i < GET_RTX_LENGTH (code); i++)
28907 switch (*format_ptr++)
28909 case 'e':
28910 if (exact_dependency_1 (addr, XEXP (insn, i)))
28911 return true;
28912 break;
28913 case 'E':
28914 for (j = 0; j < XVECLEN (insn, i); j++)
28915 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
28916 return true;
28917 break;
28920 return false;
28923 /* Return true if there is an exact dependency between STORE and LOAD, i.e.
28924 the same memory address is used in both. */
28925 static bool
28926 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
28928 rtx set1, set2;
28930 set1 = single_set (store);
28931 if (!set1)
28932 return false;
28933 if (!MEM_P (SET_DEST (set1)))
28934 return false;
28935 set2 = single_set (load);
28936 if (!set2)
28937 return false;
28938 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
28939 return true;
28940 return false;
28943 static int
28944 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
28945 unsigned int)
28947 enum attr_type insn_type, dep_insn_type;
28948 enum attr_memory memory;
28949 rtx set, set2;
28950 int dep_insn_code_number;
28952 /* Anti and output dependencies have zero cost on all CPUs. */
28953 if (dep_type != 0)
28954 return 0;
28956 dep_insn_code_number = recog_memoized (dep_insn);
28958 /* If we can't recognize the insns, we can't really do anything. */
28959 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
28960 return cost;
28962 insn_type = get_attr_type (insn);
28963 dep_insn_type = get_attr_type (dep_insn);
28965 switch (ix86_tune)
28967 case PROCESSOR_PENTIUM:
28968 case PROCESSOR_LAKEMONT:
28969 /* Address Generation Interlock adds a cycle of latency. */
28970 if (insn_type == TYPE_LEA)
28972 rtx addr = PATTERN (insn);
28974 if (GET_CODE (addr) == PARALLEL)
28975 addr = XVECEXP (addr, 0, 0);
28977 gcc_assert (GET_CODE (addr) == SET);
28979 addr = SET_SRC (addr);
28980 if (modified_in_p (addr, dep_insn))
28981 cost += 1;
28983 else if (ix86_agi_dependent (dep_insn, insn))
28984 cost += 1;
28986 /* ??? Compares pair with jump/setcc. */
28987 if (ix86_flags_dependent (insn, dep_insn, insn_type))
28988 cost = 0;
28990 /* Floating point stores require value to be ready one cycle earlier. */
28991 if (insn_type == TYPE_FMOV
28992 && get_attr_memory (insn) == MEMORY_STORE
28993 && !ix86_agi_dependent (dep_insn, insn))
28994 cost += 1;
28995 break;
28997 case PROCESSOR_PENTIUMPRO:
28998 /* INT->FP conversion is expensive. */
28999 if (get_attr_fp_int_src (dep_insn))
29000 cost += 5;
29002 /* There is one cycle extra latency between an FP op and a store. */
29003 if (insn_type == TYPE_FMOV
29004 && (set = single_set (dep_insn)) != NULL_RTX
29005 && (set2 = single_set (insn)) != NULL_RTX
29006 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
29007 && MEM_P (SET_DEST (set2)))
29008 cost += 1;
29010 memory = get_attr_memory (insn);
29012 /* Show ability of reorder buffer to hide latency of load by executing
29013 in parallel with previous instruction in case
29014 previous instruction is not needed to compute the address. */
29015 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29016 && !ix86_agi_dependent (dep_insn, insn))
29018 /* Claim moves to take one cycle, as the core can issue one load
29019 at a time and the next load can start a cycle later. */
29020 if (dep_insn_type == TYPE_IMOV
29021 || dep_insn_type == TYPE_FMOV)
29022 cost = 1;
29023 else if (cost > 1)
29024 cost--;
29026 break;
29028 case PROCESSOR_K6:
29029 /* The esp dependency is resolved before
29030 the instruction is really finished. */
29031 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29032 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29033 return 1;
29035 /* INT->FP conversion is expensive. */
29036 if (get_attr_fp_int_src (dep_insn))
29037 cost += 5;
29039 memory = get_attr_memory (insn);
29041 /* Show ability of reorder buffer to hide latency of load by executing
29042 in parallel with previous instruction in case
29043 previous instruction is not needed to compute the address. */
29044 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29045 && !ix86_agi_dependent (dep_insn, insn))
29047 /* Claim moves to take one cycle, as the core can issue one load
29048 at a time and the next load can start a cycle later. */
29049 if (dep_insn_type == TYPE_IMOV
29050 || dep_insn_type == TYPE_FMOV)
29051 cost = 1;
29052 else if (cost > 2)
29053 cost -= 2;
29054 else
29055 cost = 1;
29057 break;
29059 case PROCESSOR_AMDFAM10:
29060 case PROCESSOR_BDVER1:
29061 case PROCESSOR_BDVER2:
29062 case PROCESSOR_BDVER3:
29063 case PROCESSOR_BDVER4:
29064 case PROCESSOR_ZNVER1:
29065 case PROCESSOR_BTVER1:
29066 case PROCESSOR_BTVER2:
29067 case PROCESSOR_GENERIC:
29068 /* The stack engine allows push and pop instructions to execute in parallel. */
29069 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29070 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29071 return 0;
29072 /* FALLTHRU */
29074 case PROCESSOR_ATHLON:
29075 case PROCESSOR_K8:
29076 memory = get_attr_memory (insn);
29078 /* Show ability of reorder buffer to hide latency of load by executing
29079 in parallel with previous instruction in case
29080 previous instruction is not needed to compute the address. */
29081 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29082 && !ix86_agi_dependent (dep_insn, insn))
29084 enum attr_unit unit = get_attr_unit (insn);
29085 int loadcost = 3;
29087 /* Because of the difference between the length of integer and
29088 floating unit pipeline preparation stages, the memory operands
29089 for floating point are cheaper.
29091 ??? For Athlon the difference is most probably 2. */
29092 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
29093 loadcost = 3;
29094 else
29095 loadcost = TARGET_ATHLON ? 2 : 0;
29097 if (cost >= loadcost)
29098 cost -= loadcost;
29099 else
29100 cost = 0;
29102 break;
29104 case PROCESSOR_CORE2:
29105 case PROCESSOR_NEHALEM:
29106 case PROCESSOR_SANDYBRIDGE:
29107 case PROCESSOR_HASWELL:
29108 /* The stack engine allows push and pop instructions to execute in parallel. */
29109 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29110 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29111 return 0;
29113 memory = get_attr_memory (insn);
29115 /* Show ability of reorder buffer to hide latency of load by executing
29116 in parallel with previous instruction in case
29117 previous instruction is not needed to compute the address. */
29118 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29119 && !ix86_agi_dependent (dep_insn, insn))
29121 if (cost >= 4)
29122 cost -= 4;
29123 else
29124 cost = 0;
29126 break;
29128 case PROCESSOR_SILVERMONT:
29129 case PROCESSOR_KNL:
29130 case PROCESSOR_INTEL:
29131 if (!reload_completed)
29132 return cost;
29134 /* Increase cost of integer loads. */
29135 memory = get_attr_memory (dep_insn);
29136 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29138 enum attr_unit unit = get_attr_unit (dep_insn);
29139 if (unit == UNIT_INTEGER && cost == 1)
29141 if (memory == MEMORY_LOAD)
29142 cost = 3;
29143 else
29145 /* Increase cost of ld/st for short int types only
29146 because of store forwarding issue. */
29147 rtx set = single_set (dep_insn);
29148 if (set && (GET_MODE (SET_DEST (set)) == QImode
29149 || GET_MODE (SET_DEST (set)) == HImode))
29151 /* Increase cost of store/load insn if exact
29152 dependence exists and it is load insn. */
29153 enum attr_memory insn_memory = get_attr_memory (insn);
29154 if (insn_memory == MEMORY_LOAD
29155 && exact_store_load_dependency (dep_insn, insn))
29156 cost = 3;
29162 default:
29163 break;
29166 return cost;
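/* As a concrete illustration of the load-latency adjustment above: on
   Haswell, when DEP_INSN feeds a load-class INSN only through its data
   (not through its address), a dependence cost of 6 is reduced to 2,
   modelling the reorder buffer hiding part of the load latency.  */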
29169 /* How many alternative schedules to try. This should be as wide as the
29170 scheduling freedom in the DFA, but no wider. Making this value too
29171 large results in extra work for the scheduler. */
29173 static int
29174 ia32_multipass_dfa_lookahead (void)
29176 switch (ix86_tune)
29178 case PROCESSOR_PENTIUM:
29179 case PROCESSOR_LAKEMONT:
29180 return 2;
29182 case PROCESSOR_PENTIUMPRO:
29183 case PROCESSOR_K6:
29184 return 1;
29186 case PROCESSOR_BDVER1:
29187 case PROCESSOR_BDVER2:
29188 case PROCESSOR_BDVER3:
29189 case PROCESSOR_BDVER4:
29190 /* We use lookahead value 4 for BD both before and after reload
29191 schedules. Plan is to have value 8 included for O3. */
29192 return 4;
29194 case PROCESSOR_CORE2:
29195 case PROCESSOR_NEHALEM:
29196 case PROCESSOR_SANDYBRIDGE:
29197 case PROCESSOR_HASWELL:
29198 case PROCESSOR_BONNELL:
29199 case PROCESSOR_SILVERMONT:
29200 case PROCESSOR_KNL:
29201 case PROCESSOR_INTEL:
29202 /* Generally, we want haifa-sched:max_issue() to look ahead as far
29203 as the number of instructions that can be executed in one cycle, i.e.,
29204 issue_rate. I wonder why tuning for many CPUs does not do this. */
29205 if (reload_completed)
29206 return ix86_issue_rate ();
29207 /* Don't use lookahead for pre-reload schedule to save compile time. */
29208 return 0;
29210 default:
29211 return 0;
29215 /* Return true if target platform supports macro-fusion. */
29217 static bool
29218 ix86_macro_fusion_p ()
29220 return TARGET_FUSE_CMP_AND_BRANCH;
29223 /* Check whether the current microarchitecture supports macro fusion
29224 for insn pair "CONDGEN + CONDJMP". Refer to
29225 "Intel Architectures Optimization Reference Manual". */
29227 static bool
29228 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
29230 rtx src, dest;
29231 enum rtx_code ccode;
29232 rtx compare_set = NULL_RTX, test_if, cond;
29233 rtx alu_set = NULL_RTX, addr = NULL_RTX;
29235 if (!any_condjump_p (condjmp))
29236 return false;
29238 if (get_attr_type (condgen) != TYPE_TEST
29239 && get_attr_type (condgen) != TYPE_ICMP
29240 && get_attr_type (condgen) != TYPE_INCDEC
29241 && get_attr_type (condgen) != TYPE_ALU)
29242 return false;
29244 compare_set = single_set (condgen);
29245 if (compare_set == NULL_RTX
29246 && !TARGET_FUSE_ALU_AND_BRANCH)
29247 return false;
29249 if (compare_set == NULL_RTX)
29251 int i;
29252 rtx pat = PATTERN (condgen);
29253 for (i = 0; i < XVECLEN (pat, 0); i++)
29254 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
29256 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
29257 if (GET_CODE (set_src) == COMPARE)
29258 compare_set = XVECEXP (pat, 0, i);
29259 else
29260 alu_set = XVECEXP (pat, 0, i);
29263 if (compare_set == NULL_RTX)
29264 return false;
29265 src = SET_SRC (compare_set);
29266 if (GET_CODE (src) != COMPARE)
29267 return false;
29269 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
29270 supported. */
29271 if ((MEM_P (XEXP (src, 0))
29272 && CONST_INT_P (XEXP (src, 1)))
29273 || (MEM_P (XEXP (src, 1))
29274 && CONST_INT_P (XEXP (src, 0))))
29275 return false;
29277 /* No fusion for RIP-relative address. */
29278 if (MEM_P (XEXP (src, 0)))
29279 addr = XEXP (XEXP (src, 0), 0);
29280 else if (MEM_P (XEXP (src, 1)))
29281 addr = XEXP (XEXP (src, 1), 0);
29283 if (addr) {
29284 ix86_address parts;
29285 int ok = ix86_decompose_address (addr, &parts);
29286 gcc_assert (ok);
29288 if (rip_relative_addr_p (&parts))
29289 return false;
29292 test_if = SET_SRC (pc_set (condjmp));
29293 cond = XEXP (test_if, 0);
29294 ccode = GET_CODE (cond);
29295 /* Check whether the conditional jump uses the Sign or Overflow flags. */
29296 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
29297 && (ccode == GE
29298 || ccode == GT
29299 || ccode == LE
29300 || ccode == LT))
29301 return false;
29303 /* Return true for TYPE_TEST and TYPE_ICMP. */
29304 if (get_attr_type (condgen) == TYPE_TEST
29305 || get_attr_type (condgen) == TYPE_ICMP)
29306 return true;
29308 /* The following handles the macro-fusion case for alu + jmp. */
29309 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
29310 return false;
29312 /* No fusion for alu op with memory destination operand. */
29313 dest = SET_DEST (alu_set);
29314 if (MEM_P (dest))
29315 return false;
29317 /* Macro-fusion for inc/dec + unsigned conditional jump is not
29318 supported. */
29319 if (get_attr_type (condgen) == TYPE_INCDEC
29320 && (ccode == GEU
29321 || ccode == GTU
29322 || ccode == LEU
29323 || ccode == LTU))
29324 return false;
29326 return true;
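/* For example, "cmp %eax, %ebx" followed by "jne" can fuse (an ICMP plus a
   conditional jump that does not test SF/OF), while "cmpl $1, (%rdi)" plus
   a jump cannot (MEM-IMM compare), and "dec %eax" followed by "ja" cannot
   either (inc/dec paired with an unsigned condition).  */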
29329 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
29330 execution. It is applied if
29331 (1) an IMUL instruction is at the top of the list;
29332 (2) there is exactly one producer of an independent IMUL instruction in the
29333 ready list.
29334 Return index of IMUL producer if it was found and -1 otherwise. */
29335 static int
29336 do_reorder_for_imul (rtx_insn **ready, int n_ready)
29338 rtx_insn *insn;
29339 rtx set, insn1, insn2;
29340 sd_iterator_def sd_it;
29341 dep_t dep;
29342 int index = -1;
29343 int i;
29345 if (!TARGET_BONNELL)
29346 return index;
29348 /* Check that IMUL instruction is on the top of ready list. */
29349 insn = ready[n_ready - 1];
29350 set = single_set (insn);
29351 if (!set)
29352 return index;
29353 if (!(GET_CODE (SET_SRC (set)) == MULT
29354 && GET_MODE (SET_SRC (set)) == SImode))
29355 return index;
29357 /* Search for producer of independent IMUL instruction. */
29358 for (i = n_ready - 2; i >= 0; i--)
29360 insn = ready[i];
29361 if (!NONDEBUG_INSN_P (insn))
29362 continue;
29363 /* Skip IMUL instruction. */
29364 insn2 = PATTERN (insn);
29365 if (GET_CODE (insn2) == PARALLEL)
29366 insn2 = XVECEXP (insn2, 0, 0);
29367 if (GET_CODE (insn2) == SET
29368 && GET_CODE (SET_SRC (insn2)) == MULT
29369 && GET_MODE (SET_SRC (insn2)) == SImode)
29370 continue;
29372 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
29374 rtx con;
29375 con = DEP_CON (dep);
29376 if (!NONDEBUG_INSN_P (con))
29377 continue;
29378 insn1 = PATTERN (con);
29379 if (GET_CODE (insn1) == PARALLEL)
29380 insn1 = XVECEXP (insn1, 0, 0);
29382 if (GET_CODE (insn1) == SET
29383 && GET_CODE (SET_SRC (insn1)) == MULT
29384 && GET_MODE (SET_SRC (insn1)) == SImode)
29386 sd_iterator_def sd_it1;
29387 dep_t dep1;
29388 /* Check if there is no other dependee for IMUL. */
29389 index = i;
29390 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
29392 rtx pro;
29393 pro = DEP_PRO (dep1);
29394 if (!NONDEBUG_INSN_P (pro))
29395 continue;
29396 if (pro != insn)
29397 index = -1;
29399 if (index >= 0)
29400 break;
29403 if (index >= 0)
29404 break;
29406 return index;
29409 /* Try to find the best candidate on the top of ready list if two insns
29410 have the same priority - candidate is best if its dependees were
29411 scheduled earlier. Applied for Silvermont only.
29412 Return true if top 2 insns must be interchanged. */
29413 static bool
29414 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
29416 rtx_insn *top = ready[n_ready - 1];
29417 rtx_insn *next = ready[n_ready - 2];
29418 rtx set;
29419 sd_iterator_def sd_it;
29420 dep_t dep;
29421 int clock1 = -1;
29422 int clock2 = -1;
29423 #define INSN_TICK(INSN) (HID (INSN)->tick)
29425 if (!TARGET_SILVERMONT && !TARGET_INTEL)
29426 return false;
29428 if (!NONDEBUG_INSN_P (top))
29429 return false;
29430 if (!NONJUMP_INSN_P (top))
29431 return false;
29432 if (!NONDEBUG_INSN_P (next))
29433 return false;
29434 if (!NONJUMP_INSN_P (next))
29435 return false;
29436 set = single_set (top);
29437 if (!set)
29438 return false;
29439 set = single_set (next);
29440 if (!set)
29441 return false;
29443 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
29445 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
29446 return false;
29447 /* Determine the winner more precisely. */
29448 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
29450 rtx pro;
29451 pro = DEP_PRO (dep);
29452 if (!NONDEBUG_INSN_P (pro))
29453 continue;
29454 if (INSN_TICK (pro) > clock1)
29455 clock1 = INSN_TICK (pro);
29457 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
29459 rtx pro;
29460 pro = DEP_PRO (dep);
29461 if (!NONDEBUG_INSN_P (pro))
29462 continue;
29463 if (INSN_TICK (pro) > clock2)
29464 clock2 = INSN_TICK (pro);
29467 if (clock1 == clock2)
29469 /* Determine winner - load must win. */
29470 enum attr_memory memory1, memory2;
29471 memory1 = get_attr_memory (top);
29472 memory2 = get_attr_memory (next);
29473 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
29474 return true;
29476 return (bool) (clock2 < clock1);
29478 return false;
29479 #undef INSN_TICK
29482 /* Perform possible reordering of the ready list for Atom/Silvermont only.
29483 Return issue rate. */
29484 static int
29485 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
29486 int *pn_ready, int clock_var)
29488 int issue_rate = -1;
29489 int n_ready = *pn_ready;
29490 int i;
29491 rtx_insn *insn;
29492 int index = -1;
29494 /* Set up issue rate. */
29495 issue_rate = ix86_issue_rate ();
29497 /* Do reordering for BONNELL/SILVERMONT only. */
29498 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
29499 return issue_rate;
29501 /* Nothing to do if ready list contains only 1 instruction. */
29502 if (n_ready <= 1)
29503 return issue_rate;
29505 /* Do reordering for the post-reload scheduler only. */
29506 if (!reload_completed)
29507 return issue_rate;
29509 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
29511 if (sched_verbose > 1)
29512 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
29513 INSN_UID (ready[index]));
29515 /* Put IMUL producer (ready[index]) at the top of ready list. */
29516 insn = ready[index];
29517 for (i = index; i < n_ready - 1; i++)
29518 ready[i] = ready[i + 1];
29519 ready[n_ready - 1] = insn;
29520 return issue_rate;
29523 /* Skip selective scheduling since HID is not populated in it. */
29524 if (clock_var != 0
29525 && !sel_sched_p ()
29526 && swap_top_of_ready_list (ready, n_ready))
29528 if (sched_verbose > 1)
29529 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
29530 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
29531 /* Swap 2 top elements of ready list. */
29532 insn = ready[n_ready - 1];
29533 ready[n_ready - 1] = ready[n_ready - 2];
29534 ready[n_ready - 2] = insn;
29536 return issue_rate;
29539 static bool
29540 ix86_class_likely_spilled_p (reg_class_t);
29542 /* Return true if the lhs of INSN is a HW function argument register; set
29543 IS_SPILLED to true if it is a likely spilled HW register. */
29544 static bool
29545 insn_is_function_arg (rtx insn, bool* is_spilled)
29547 rtx dst;
29549 if (!NONDEBUG_INSN_P (insn))
29550 return false;
29551 /* Call instructions are not movable, ignore them. */
29552 if (CALL_P (insn))
29553 return false;
29554 insn = PATTERN (insn);
29555 if (GET_CODE (insn) == PARALLEL)
29556 insn = XVECEXP (insn, 0, 0);
29557 if (GET_CODE (insn) != SET)
29558 return false;
29559 dst = SET_DEST (insn);
29560 if (REG_P (dst) && HARD_REGISTER_P (dst)
29561 && ix86_function_arg_regno_p (REGNO (dst)))
29563 /* Is it likely spilled HW register? */
29564 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29565 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29566 *is_spilled = true;
29567 return true;
29569 return false;
29572 /* Add output dependencies for a chain of adjacent function arguments, but only
29573 if there is a move to a likely spilled HW register. Return the first argument
29574 if at least one dependence was added, or NULL otherwise. */
29575 static rtx_insn *
29576 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29578 rtx_insn *insn;
29579 rtx_insn *last = call;
29580 rtx_insn *first_arg = NULL;
29581 bool is_spilled = false;
29583 head = PREV_INSN (head);
29585 /* Find the argument-passing instruction nearest to the call. */
29586 while (true)
29588 last = PREV_INSN (last);
29589 if (last == head)
29590 return NULL;
29591 if (!NONDEBUG_INSN_P (last))
29592 continue;
29593 if (insn_is_function_arg (last, &is_spilled))
29594 break;
29595 return NULL;
29598 first_arg = last;
29599 while (true)
29601 insn = PREV_INSN (last);
29602 if (!INSN_P (insn))
29603 break;
29604 if (insn == head)
29605 break;
29606 if (!NONDEBUG_INSN_P (insn))
29608 last = insn;
29609 continue;
29611 if (insn_is_function_arg (insn, &is_spilled))
29613 /* Add an output dependence between two function arguments if the chain
29614 of output arguments contains likely spilled HW registers. */
29615 if (is_spilled)
29616 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29617 first_arg = last = insn;
29619 else
29620 break;
29622 if (!is_spilled)
29623 return NULL;
29624 return first_arg;
29627 /* Add output or anti dependency from insn to first_arg to restrict its code
29628 motion. */
29629 static void
29630 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29632 rtx set;
29633 rtx tmp;
29635 /* Add anti dependencies for bounds stores. */
29636 if (INSN_P (insn)
29637 && GET_CODE (PATTERN (insn)) == PARALLEL
29638 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29639 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29641 add_dependence (first_arg, insn, REG_DEP_ANTI);
29642 return;
29645 set = single_set (insn);
29646 if (!set)
29647 return;
29648 tmp = SET_DEST (set);
29649 if (REG_P (tmp))
29651 /* Add output dependency to the first function argument. */
29652 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29653 return;
29655 /* Add anti dependency. */
29656 add_dependence (first_arg, insn, REG_DEP_ANTI);
29659 /* Avoid cross-block motion of a function argument by adding a dependency
29660 from the first non-jump instruction in BB. */
29661 static void
29662 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29664 rtx_insn *insn = BB_END (bb);
29666 while (insn)
29668 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29670 rtx set = single_set (insn);
29671 if (set)
29673 avoid_func_arg_motion (arg, insn);
29674 return;
29677 if (insn == BB_HEAD (bb))
29678 return;
29679 insn = PREV_INSN (insn);
29683 /* Hook for pre-reload schedule - avoid motion of function arguments
29684 passed in likely spilled HW registers. */
29685 static void
29686 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29688 rtx_insn *insn;
29689 rtx_insn *first_arg = NULL;
29690 if (reload_completed)
29691 return;
29692 while (head != tail && DEBUG_INSN_P (head))
29693 head = NEXT_INSN (head);
29694 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29695 if (INSN_P (insn) && CALL_P (insn))
29697 first_arg = add_parameter_dependencies (insn, head);
29698 if (first_arg)
29700 /* Add a dependee for the first argument to predecessors, but only if the
29701 region contains more than one block. */
29702 basic_block bb = BLOCK_FOR_INSN (insn);
29703 int rgn = CONTAINING_RGN (bb->index);
29704 int nr_blks = RGN_NR_BLOCKS (rgn);
29705 /* Skip trivial regions and region head blocks that can have
29706 predecessors outside of region. */
29707 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29709 edge e;
29710 edge_iterator ei;
29712 /* Regions are SCCs with the exception of selective
29713 scheduling with pipelining of outer blocks enabled.
29714 So also check that immediate predecessors of a non-head
29715 block are in the same region. */
29716 FOR_EACH_EDGE (e, ei, bb->preds)
29718 /* Avoid creating loop-carried dependencies by using the
29719 topological ordering in the region. */
29720 if (rgn == CONTAINING_RGN (e->src->index)
29721 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29722 add_dependee_for_func_arg (first_arg, e->src);
29725 insn = first_arg;
29726 if (insn == head)
29727 break;
29730 else if (first_arg)
29731 avoid_func_arg_motion (first_arg, insn);
29734 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29735 HW registers to the maximum, to schedule them as soon as possible. These are
29736 moves from function argument registers at the top of the function entry
29737 and moves from function return value registers after call. */
29738 static int
29739 ix86_adjust_priority (rtx_insn *insn, int priority)
29741 rtx set;
29743 if (reload_completed)
29744 return priority;
29746 if (!NONDEBUG_INSN_P (insn))
29747 return priority;
29749 set = single_set (insn);
29750 if (set)
29752 rtx tmp = SET_SRC (set);
29753 if (REG_P (tmp)
29754 && HARD_REGISTER_P (tmp)
29755 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29756 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29757 return current_sched_info->sched_max_insns_priority;
29760 return priority;
29763 /* Model decoder of Core 2/i7.
29764 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
29765 track the instruction fetch block boundaries and make sure that long
29766 (9+ bytes) instructions are assigned to D0. */
29768 /* Maximum length of an insn that can be handled by
29769 a secondary decoder unit. '8' for Core 2/i7. */
29770 static int core2i7_secondary_decoder_max_insn_size;
29772 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
29773 '16' for Core 2/i7. */
29774 static int core2i7_ifetch_block_size;
29776 /* Maximum number of instructions decoder can handle per cycle.
29777 '6' for Core 2/i7. */
29778 static int core2i7_ifetch_block_max_insns;
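/* Worked example of the model: within one 16-byte ifetch block, two 7-byte
   insns leave only 2 bytes, so a subsequent 3-byte insn is filtered out of
   the ready set until the next cycle; likewise an insn longer than 8 bytes
   is only accepted as the first insn of a cycle, since the secondary
   decoders cannot handle it.  */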
29780 typedef struct ix86_first_cycle_multipass_data_ *
29781 ix86_first_cycle_multipass_data_t;
29782 typedef const struct ix86_first_cycle_multipass_data_ *
29783 const_ix86_first_cycle_multipass_data_t;
29785 /* A variable to store target state across calls to max_issue within
29786 one cycle. */
29787 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
29788 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
29790 /* Initialize DATA. */
29791 static void
29792 core2i7_first_cycle_multipass_init (void *_data)
29794 ix86_first_cycle_multipass_data_t data
29795 = (ix86_first_cycle_multipass_data_t) _data;
29797 data->ifetch_block_len = 0;
29798 data->ifetch_block_n_insns = 0;
29799 data->ready_try_change = NULL;
29800 data->ready_try_change_size = 0;
29803 /* Advancing the cycle; reset ifetch block counts. */
29804 static void
29805 core2i7_dfa_post_advance_cycle (void)
29807 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
29809 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29811 data->ifetch_block_len = 0;
29812 data->ifetch_block_n_insns = 0;
29815 static int min_insn_size (rtx_insn *);
29817 /* Filter out insns from ready_try that the core will not be able to issue
29818 on current cycle due to decoder. */
29819 static void
29820 core2i7_first_cycle_multipass_filter_ready_try
29821 (const_ix86_first_cycle_multipass_data_t data,
29822 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
29824 while (n_ready--)
29826 rtx_insn *insn;
29827 int insn_size;
29829 if (ready_try[n_ready])
29830 continue;
29832 insn = get_ready_element (n_ready);
29833 insn_size = min_insn_size (insn);
29835 if (/* If this insn is too long for a secondary decoder ... */
29836 (!first_cycle_insn_p
29837 && insn_size > core2i7_secondary_decoder_max_insn_size)
29838 /* ... or it would not fit into the ifetch block ... */
29839 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
29840 /* ... or the decoder is full already ... */
29841 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
29842 /* ... mask the insn out. */
29844 ready_try[n_ready] = 1;
29846 if (data->ready_try_change)
29847 bitmap_set_bit (data->ready_try_change, n_ready);
29852 /* Prepare for a new round of multipass lookahead scheduling. */
29853 static void
29854 core2i7_first_cycle_multipass_begin (void *_data,
29855 signed char *ready_try, int n_ready,
29856 bool first_cycle_insn_p)
29858 ix86_first_cycle_multipass_data_t data
29859 = (ix86_first_cycle_multipass_data_t) _data;
29860 const_ix86_first_cycle_multipass_data_t prev_data
29861 = ix86_first_cycle_multipass_data;
29863 /* Restore the state from the end of the previous round. */
29864 data->ifetch_block_len = prev_data->ifetch_block_len;
29865 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
29867 /* Filter instructions that cannot be issued on current cycle due to
29868 decoder restrictions. */
29869 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29870 first_cycle_insn_p);
29873 /* INSN is being issued in current solution. Account for its impact on
29874 the decoder model. */
29875 static void
29876 core2i7_first_cycle_multipass_issue (void *_data,
29877 signed char *ready_try, int n_ready,
29878 rtx_insn *insn, const void *_prev_data)
29880 ix86_first_cycle_multipass_data_t data
29881 = (ix86_first_cycle_multipass_data_t) _data;
29882 const_ix86_first_cycle_multipass_data_t prev_data
29883 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
29885 int insn_size = min_insn_size (insn);
29887 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
29888 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
29889 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
29890 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29892 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
29893 if (!data->ready_try_change)
29895 data->ready_try_change = sbitmap_alloc (n_ready);
29896 data->ready_try_change_size = n_ready;
29898 else if (data->ready_try_change_size < n_ready)
29900 data->ready_try_change = sbitmap_resize (data->ready_try_change,
29901 n_ready, 0);
29902 data->ready_try_change_size = n_ready;
29904 bitmap_clear (data->ready_try_change);
29906 /* Filter out insns from ready_try that the core will not be able to issue
29907 on current cycle due to decoder. */
29908 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29909 false);
29912 /* Revert the effect on ready_try. */
29913 static void
29914 core2i7_first_cycle_multipass_backtrack (const void *_data,
29915 signed char *ready_try,
29916 int n_ready ATTRIBUTE_UNUSED)
29918 const_ix86_first_cycle_multipass_data_t data
29919 = (const_ix86_first_cycle_multipass_data_t) _data;
29920 unsigned int i = 0;
29921 sbitmap_iterator sbi;
29923 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
29924 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
29926 ready_try[i] = 0;
29930 /* Save the result of multipass lookahead scheduling for the next round. */
29931 static void
29932 core2i7_first_cycle_multipass_end (const void *_data)
29934 const_ix86_first_cycle_multipass_data_t data
29935 = (const_ix86_first_cycle_multipass_data_t) _data;
29936 ix86_first_cycle_multipass_data_t next_data
29937 = ix86_first_cycle_multipass_data;
29939 if (data != NULL)
29941 next_data->ifetch_block_len = data->ifetch_block_len;
29942 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
29946 /* Deallocate target data. */
29947 static void
29948 core2i7_first_cycle_multipass_fini (void *_data)
29950 ix86_first_cycle_multipass_data_t data
29951 = (ix86_first_cycle_multipass_data_t) _data;
29953 if (data->ready_try_change)
29955 sbitmap_free (data->ready_try_change);
29956 data->ready_try_change = NULL;
29957 data->ready_try_change_size = 0;
29961 /* Prepare for scheduling pass. */
29962 static void
29963 ix86_sched_init_global (FILE *, int, int)
29965 /* Install scheduling hooks for current CPU. Some of these hooks are used
29966 in time-critical parts of the scheduler, so we only set them up when
29967 they are actually used. */
29968 switch (ix86_tune)
29970 case PROCESSOR_CORE2:
29971 case PROCESSOR_NEHALEM:
29972 case PROCESSOR_SANDYBRIDGE:
29973 case PROCESSOR_HASWELL:
29974 /* Do not perform multipass scheduling for pre-reload schedule
29975 to save compile time. */
29976 if (reload_completed)
29978 targetm.sched.dfa_post_advance_cycle
29979 = core2i7_dfa_post_advance_cycle;
29980 targetm.sched.first_cycle_multipass_init
29981 = core2i7_first_cycle_multipass_init;
29982 targetm.sched.first_cycle_multipass_begin
29983 = core2i7_first_cycle_multipass_begin;
29984 targetm.sched.first_cycle_multipass_issue
29985 = core2i7_first_cycle_multipass_issue;
29986 targetm.sched.first_cycle_multipass_backtrack
29987 = core2i7_first_cycle_multipass_backtrack;
29988 targetm.sched.first_cycle_multipass_end
29989 = core2i7_first_cycle_multipass_end;
29990 targetm.sched.first_cycle_multipass_fini
29991 = core2i7_first_cycle_multipass_fini;
29993 /* Set decoder parameters. */
29994 core2i7_secondary_decoder_max_insn_size = 8;
29995 core2i7_ifetch_block_size = 16;
29996 core2i7_ifetch_block_max_insns = 6;
29997 break;
29999 /* Fall through. */
30000 default:
30001 targetm.sched.dfa_post_advance_cycle = NULL;
30002 targetm.sched.first_cycle_multipass_init = NULL;
30003 targetm.sched.first_cycle_multipass_begin = NULL;
30004 targetm.sched.first_cycle_multipass_issue = NULL;
30005 targetm.sched.first_cycle_multipass_backtrack = NULL;
30006 targetm.sched.first_cycle_multipass_end = NULL;
30007 targetm.sched.first_cycle_multipass_fini = NULL;
30008 break;
30013 /* Compute the alignment given to a constant that is being placed in memory.
30014 EXP is the constant and ALIGN is the alignment that the object would
30015 ordinarily have.
30016 The value of this function is used instead of that alignment to align
30017 the object. */
30020 ix86_constant_alignment (tree exp, int align)
30022 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
30023 || TREE_CODE (exp) == INTEGER_CST)
30025 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
30026 return 64;
30027 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
30028 return 128;
30030 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
30031 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
30032 return BITS_PER_WORD;
30034 return align;
30037 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30038 the data type, and ALIGN is the alignment that the object would
30039 ordinarily have. */
30041 static int
30042 iamcu_alignment (tree type, int align)
30044 enum machine_mode mode;
30046 if (align < 32 || TYPE_USER_ALIGN (type))
30047 return align;
30049 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30050 bytes. */
30051 mode = TYPE_MODE (strip_array_types (type));
30052 switch (GET_MODE_CLASS (mode))
30054 case MODE_INT:
30055 case MODE_COMPLEX_INT:
30056 case MODE_COMPLEX_FLOAT:
30057 case MODE_FLOAT:
30058 case MODE_DECIMAL_FLOAT:
30059 return 32;
30060 default:
30061 return align;
30065 /* Compute the alignment for a static variable.
30066 TYPE is the data type, and ALIGN is the alignment that
30067 the object would ordinarily have. The value of this function is used
30068 instead of that alignment to align the object. */
30071 ix86_data_alignment (tree type, int align, bool opt)
30073 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30074 for symbols from other compilation units or symbols that don't need
30075 to bind locally. In order to preserve some ABI compatibility with
30076 those compilers, ensure we don't decrease alignment from what we
30077 used to assume. */
30079 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30081 /* A data structure equal to or greater than the size of a cache line
30082 (64 bytes in the Pentium 4 and other recent Intel processors, including
30083 processors based on Intel Core microarchitecture) should be aligned
30084 so that its base address is a multiple of the cache line size. */
30086 int max_align
30087 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30089 if (max_align < BITS_PER_WORD)
30090 max_align = BITS_PER_WORD;
30092 switch (ix86_align_data_type)
30094 case ix86_align_data_type_abi: opt = false; break;
30095 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30096 case ix86_align_data_type_cacheline: break;
30099 if (TARGET_IAMCU)
30100 align = iamcu_alignment (type, align);
30102 if (opt
30103 && AGGREGATE_TYPE_P (type)
30104 && TYPE_SIZE (type)
30105 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30107 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
30108 && align < max_align_compat)
30109 align = max_align_compat;
30110 if (wi::geu_p (TYPE_SIZE (type), max_align)
30111 && align < max_align)
30112 align = max_align;
30115 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30116 to a 16-byte boundary. */
30117 if (TARGET_64BIT)
30119 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30120 && TYPE_SIZE (type)
30121 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30122 && wi::geu_p (TYPE_SIZE (type), 128)
30123 && align < 128)
30124 return 128;
30127 if (!opt)
30128 return align;
30130 if (TREE_CODE (type) == ARRAY_TYPE)
30132 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30133 return 64;
30134 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30135 return 128;
30137 else if (TREE_CODE (type) == COMPLEX_TYPE)
30140 if (TYPE_MODE (type) == DCmode && align < 64)
30141 return 64;
30142 if ((TYPE_MODE (type) == XCmode
30143 || TYPE_MODE (type) == TCmode) && align < 128)
30144 return 128;
30146 else if ((TREE_CODE (type) == RECORD_TYPE
30147 || TREE_CODE (type) == UNION_TYPE
30148 || TREE_CODE (type) == QUAL_UNION_TYPE)
30149 && TYPE_FIELDS (type))
30151 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30152 return 64;
30153 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30154 return 128;
30156 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30157 || TREE_CODE (type) == INTEGER_TYPE)
30159 if (TYPE_MODE (type) == DFmode && align < 64)
30160 return 64;
30161 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30162 return 128;
30165 return align;
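/* For example, when optimizing and assuming the usual 64-byte prefetch
   block, a 100-byte global struct whose natural alignment is 32 bits is
   raised to 256 bits by the compatibility rule above and then to 512 bits
   by the cache-line rule.  */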
30168 /* Compute the alignment for a local variable or a stack slot. EXP is
30169 the data type or decl itself, MODE is the widest mode available and
30170 ALIGN is the alignment that the object would ordinarily have. The
30171 value of this macro is used instead of that alignment to align the
30172 object. */
30174 unsigned int
30175 ix86_local_alignment (tree exp, machine_mode mode,
30176 unsigned int align)
30178 tree type, decl;
30180 if (exp && DECL_P (exp))
30182 type = TREE_TYPE (exp);
30183 decl = exp;
30185 else
30187 type = exp;
30188 decl = NULL;
30191 /* Don't do dynamic stack realignment for long long objects with
30192 -mpreferred-stack-boundary=2. */
30193 if (!TARGET_64BIT
30194 && align == 64
30195 && ix86_preferred_stack_boundary < 64
30196 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30197 && (!type || !TYPE_USER_ALIGN (type))
30198 && (!decl || !DECL_USER_ALIGN (decl)))
30199 align = 32;
30201 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30202 register in MODE. We will return the largest alignment of XF
30203 and DF. */
30204 if (!type)
30206 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30207 align = GET_MODE_ALIGNMENT (DFmode);
30208 return align;
30211 /* Don't increase alignment for Intel MCU psABI. */
30212 if (TARGET_IAMCU)
30213 return align;
30215 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30216 to a 16-byte boundary. The exact wording is:
30218 An array uses the same alignment as its elements, except that a local or
30219 global array variable of length at least 16 bytes or
30220 a C99 variable-length array variable always has alignment of at least 16 bytes.
30222 This was added to allow use of aligned SSE instructions on arrays. This
30223 rule is meant for static storage (where the compiler cannot do the analysis
30224 by itself). We follow it for automatic variables only when convenient.
30225 We fully control everything in the function being compiled, and functions from
30226 other units cannot rely on the alignment.
30228 Exclude the va_list type. It is the common case of a local array where
30229 we cannot benefit from the alignment.
30231 TODO: Probably one should optimize for size only when var is not escaping. */
30232 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30233 && TARGET_SSE)
30235 if (AGGREGATE_TYPE_P (type)
30236 && (va_list_type_node == NULL_TREE
30237 || (TYPE_MAIN_VARIANT (type)
30238 != TYPE_MAIN_VARIANT (va_list_type_node)))
30239 && TYPE_SIZE (type)
30240 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30241 && wi::geu_p (TYPE_SIZE (type), 16)
30242 && align < 128)
30243 return 128;
30245 if (TREE_CODE (type) == ARRAY_TYPE)
30247 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30248 return 64;
30249 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30250 return 128;
30252 else if (TREE_CODE (type) == COMPLEX_TYPE)
30254 if (TYPE_MODE (type) == DCmode && align < 64)
30255 return 64;
30256 if ((TYPE_MODE (type) == XCmode
30257 || TYPE_MODE (type) == TCmode) && align < 128)
30258 return 128;
30260 else if ((TREE_CODE (type) == RECORD_TYPE
30261 || TREE_CODE (type) == UNION_TYPE
30262 || TREE_CODE (type) == QUAL_UNION_TYPE)
30263 && TYPE_FIELDS (type))
30265 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30266 return 64;
30267 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30268 return 128;
30270 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30271 || TREE_CODE (type) == INTEGER_TYPE)
30274 if (TYPE_MODE (type) == DFmode && align < 64)
30275 return 64;
30276 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30277 return 128;
30279 return align;
30282 /* Compute the minimum required alignment for dynamic stack realignment
30283 purposes for a local variable, parameter or a stack slot. EXP is
30284 the data type or decl itself, MODE is its mode and ALIGN is the
30285 alignment that the object would ordinarily have. */
30287 unsigned int
30288 ix86_minimum_alignment (tree exp, machine_mode mode,
30289 unsigned int align)
30291 tree type, decl;
30293 if (exp && DECL_P (exp))
30295 type = TREE_TYPE (exp);
30296 decl = exp;
30298 else
30300 type = exp;
30301 decl = NULL;
30304 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30305 return align;
30307 /* Don't do dynamic stack realignment for long long objects with
30308 -mpreferred-stack-boundary=2. */
30309 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30310 && (!type || !TYPE_USER_ALIGN (type))
30311 && (!decl || !DECL_USER_ALIGN (decl)))
30313 gcc_checking_assert (!TARGET_STV);
30314 return 32;
30317 return align;
30320 /* Find a location for the static chain incoming to a nested function.
30321 This is a register, unless all free registers are used by arguments. */
30323 static rtx
30324 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30326 unsigned regno;
30328 /* While this function won't be called by the middle-end when a static
30329 chain isn't needed, it's also used throughout the backend so it's
30330 easiest to keep this check centralized. */
30331 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
30332 return NULL;
30334 if (TARGET_64BIT)
30336 /* We always use R10 in 64-bit mode. */
30337 regno = R10_REG;
30339 else
30341 const_tree fntype, fndecl;
30342 unsigned int ccvt;
30344 /* By default in 32-bit mode we use ECX to pass the static chain. */
30345 regno = CX_REG;
30347 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30349 fntype = TREE_TYPE (fndecl_or_type);
30350 fndecl = fndecl_or_type;
30352 else
30354 fntype = fndecl_or_type;
30355 fndecl = NULL;
30358 ccvt = ix86_get_callcvt (fntype);
30359 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30361 /* Fastcall functions use ecx/edx for arguments, which leaves
30362 us with EAX for the static chain.
30363 Thiscall functions use ecx for arguments, which also
30364 leaves us with EAX for the static chain. */
30365 regno = AX_REG;
30367 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30369 /* Thiscall functions use ecx for arguments, which leaves
30370 us with EAX and EDX for the static chain.
30371 We use EAX for ABI compatibility. */
30372 regno = AX_REG;
30374 else if (ix86_function_regparm (fntype, fndecl) == 3)
30376 /* For regparm 3, we have no free call-clobbered registers in
30377 which to store the static chain. In order to implement this,
30378 we have the trampoline push the static chain to the stack.
30379 However, we can't push a value below the return address when
30380 we call the nested function directly, so we have to use an
30381 alternate entry point. For this we use ESI, and have the
30382 alternate entry point push ESI, so that things appear the
30383 same once we're executing the nested function. */
30384 if (incoming_p)
30386 if (fndecl == current_function_decl)
30387 ix86_static_chain_on_stack = true;
30388 return gen_frame_mem (SImode,
30389 plus_constant (Pmode,
30390 arg_pointer_rtx, -8));
30392 regno = SI_REG;
30396 return gen_rtx_REG (Pmode, regno);
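/* Illustrative only (hypothetical user code, not GCC source): a GNU C nested
   function whose address is taken needs a trampoline, and the trampoline hands
   the static chain to the nested function in the location chosen by
   ix86_static_chain above (%r10 in 64-bit mode, %ecx by default in 32-bit
   mode, %eax for fastcall/thiscall callees, or a stack slot for regparm(3)).

   int
   outer (int n)
   {
     int x = n;
     int inner (int i) { return i + x; }   // reaches x via the static chain
     int (*fp) (int) = inner;              // taking the address forces a trampoline
     return fp (1);
   }
*/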
30399 /* Emit RTL insns to initialize the variable parts of a trampoline.
30400 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30401 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30402 to be passed to the target function. */
30404 static void
30405 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30407 rtx mem, fnaddr;
30408 int opcode;
30409 int offset = 0;
30411 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30413 if (TARGET_64BIT)
30415 int size;
30417 /* Load the function address into r11. Try to load the address using
30418 the shorter movl instead of movabs. We may want to support
30419 movq for kernel mode, but the kernel does not use trampolines at
30420 the moment. FNADDR is a 32-bit address and may not be in
30421 DImode when ptr_mode == SImode. Always use movl in this
30422 case. */
30423 if (ptr_mode == SImode
30424 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30426 fnaddr = copy_addr_to_reg (fnaddr);
30428 mem = adjust_address (m_tramp, HImode, offset);
30429 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30431 mem = adjust_address (m_tramp, SImode, offset + 2);
30432 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30433 offset += 6;
30435 else
30437 mem = adjust_address (m_tramp, HImode, offset);
30438 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30440 mem = adjust_address (m_tramp, DImode, offset + 2);
30441 emit_move_insn (mem, fnaddr);
30442 offset += 10;
30445 /* Load static chain using movabs to r10. Use the shorter movl
30446 instead of movabs when ptr_mode == SImode. */
30447 if (ptr_mode == SImode)
30449 opcode = 0xba41;
30450 size = 6;
30452 else
30454 opcode = 0xba49;
30455 size = 10;
30458 mem = adjust_address (m_tramp, HImode, offset);
30459 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30461 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30462 emit_move_insn (mem, chain_value);
30463 offset += size;
30465 /* Jump to r11; the last (unused) byte is a nop, only there to
30466 pad the write out to a single 32-bit store. */
30467 mem = adjust_address (m_tramp, SImode, offset);
30468 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30469 offset += 4;
30471 else
30473 rtx disp, chain;
30475 /* Depending on the static chain location, either load a register
30476 with a constant, or push the constant to the stack. All of the
30477 instructions are the same size. */
30478 chain = ix86_static_chain (fndecl, true);
30479 if (REG_P (chain))
30481 switch (REGNO (chain))
30483 case AX_REG:
30484 opcode = 0xb8; break;
30485 case CX_REG:
30486 opcode = 0xb9; break;
30487 default:
30488 gcc_unreachable ();
30491 else
30492 opcode = 0x68;
30494 mem = adjust_address (m_tramp, QImode, offset);
30495 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30497 mem = adjust_address (m_tramp, SImode, offset + 1);
30498 emit_move_insn (mem, chain_value);
30499 offset += 5;
30501 mem = adjust_address (m_tramp, QImode, offset);
30502 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30504 mem = adjust_address (m_tramp, SImode, offset + 1);
30506 /* Compute offset from the end of the jmp to the target function.
30507 In the case in which the trampoline stores the static chain on
30508 the stack, we need to skip the first insn which pushes the
30509 (call-saved) register static chain; this push is 1 byte. */
30510 offset += 5;
30511 disp = expand_binop (SImode, sub_optab, fnaddr,
30512 plus_constant (Pmode, XEXP (m_tramp, 0),
30513 offset - (MEM_P (chain) ? 1 : 0)),
30514 NULL_RTX, 1, OPTAB_DIRECT);
30515 emit_move_insn (mem, disp);
30518 gcc_assert (offset <= TRAMPOLINE_SIZE);
30520 #ifdef HAVE_ENABLE_EXECUTE_STACK
30521 #ifdef CHECK_EXECUTE_STACK_ENABLED
30522 if (CHECK_EXECUTE_STACK_ENABLED)
30523 #endif
30524 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30525 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
30526 #endif
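/* Sketch for illustration, not emitted verbatim by GCC: in 64-bit mode with a
   full 64-bit function address and ptr_mode == DImode, the code above writes
   24 trampoline bytes that decode as

     49 bb <fnaddr, 8 bytes>     movabs $fnaddr, %r11
     49 ba <chain, 8 bytes>      movabs $chain,  %r10
     49 ff e3                    jmp    *%r11
     90                          nop            ; pads the final 32-bit store

   The 0xbb41/0xba41 opcodes used above are the shorter REX-prefixed movl
   forms, emitted when the operands fit in 32 bits.  */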
30529 /* The following file contains several enumerations and data structures
30530 built from the definitions in i386-builtin-types.def. */
30532 #include "i386-builtin-types.inc"
30534 /* Table for the ix86 builtin non-function types. */
30535 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30537 /* Retrieve an element from the above table, building some of
30538 the types lazily. */
30540 static tree
30541 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30543 unsigned int index;
30544 tree type, itype;
30546 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30548 type = ix86_builtin_type_tab[(int) tcode];
30549 if (type != NULL)
30550 return type;
30552 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30553 if (tcode <= IX86_BT_LAST_VECT)
30555 machine_mode mode;
30557 index = tcode - IX86_BT_LAST_PRIM - 1;
30558 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30559 mode = ix86_builtin_type_vect_mode[index];
30561 type = build_vector_type_for_mode (itype, mode);
30563 else
30565 int quals;
30567 index = tcode - IX86_BT_LAST_VECT - 1;
30568 if (tcode <= IX86_BT_LAST_PTR)
30569 quals = TYPE_UNQUALIFIED;
30570 else
30571 quals = TYPE_QUAL_CONST;
30573 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30574 if (quals != TYPE_UNQUALIFIED)
30575 itype = build_qualified_type (itype, quals);
30577 type = build_pointer_type (itype);
30580 ix86_builtin_type_tab[(int) tcode] = type;
30581 return type;
30584 /* Table for the ix86 builtin function types. */
30585 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30587 /* Retrieve an element from the above table, building some of
30588 the types lazily. */
30590 static tree
30591 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30593 tree type;
30595 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30597 type = ix86_builtin_func_type_tab[(int) tcode];
30598 if (type != NULL)
30599 return type;
30601 if (tcode <= IX86_BT_LAST_FUNC)
30603 unsigned start = ix86_builtin_func_start[(int) tcode];
30604 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30605 tree rtype, atype, args = void_list_node;
30606 unsigned i;
30608 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30609 for (i = after - 1; i > start; --i)
30611 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30612 args = tree_cons (NULL, atype, args);
30615 type = build_function_type (rtype, args);
30617 else
30619 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30620 enum ix86_builtin_func_type icode;
30622 icode = ix86_builtin_func_alias_base[index];
30623 type = ix86_get_builtin_func_type (icode);
30626 ix86_builtin_func_type_tab[(int) tcode] = type;
30627 return type;
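/* Worked example with hypothetical table contents, for illustration only:
   if the slice of ix86_builtin_func_args selected by ix86_builtin_func_start
   were { INT, V4SF, V4SF }, the loop above would cons the arguments in
   reverse onto void_list_node and build_function_type would yield the tree
   for

     int f (__v4sf, __v4sf);

   i.e. the first table entry is the return type and the remaining entries
   are the parameter types in order.  */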
30631 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30632 bdesc_* arrays below should come first, then builtins for each bdesc_*
30633 array in ascending order, so that we can use direct array accesses. */
30634 enum ix86_builtins
30636 IX86_BUILTIN_MASKMOVQ,
30637 IX86_BUILTIN_LDMXCSR,
30638 IX86_BUILTIN_STMXCSR,
30639 IX86_BUILTIN_MASKMOVDQU,
30640 IX86_BUILTIN_PSLLDQ128,
30641 IX86_BUILTIN_CLFLUSH,
30642 IX86_BUILTIN_MONITOR,
30643 IX86_BUILTIN_MWAIT,
30644 IX86_BUILTIN_CLZERO,
30645 IX86_BUILTIN_VEC_INIT_V2SI,
30646 IX86_BUILTIN_VEC_INIT_V4HI,
30647 IX86_BUILTIN_VEC_INIT_V8QI,
30648 IX86_BUILTIN_VEC_EXT_V2DF,
30649 IX86_BUILTIN_VEC_EXT_V2DI,
30650 IX86_BUILTIN_VEC_EXT_V4SF,
30651 IX86_BUILTIN_VEC_EXT_V4SI,
30652 IX86_BUILTIN_VEC_EXT_V8HI,
30653 IX86_BUILTIN_VEC_EXT_V2SI,
30654 IX86_BUILTIN_VEC_EXT_V4HI,
30655 IX86_BUILTIN_VEC_EXT_V16QI,
30656 IX86_BUILTIN_VEC_SET_V2DI,
30657 IX86_BUILTIN_VEC_SET_V4SF,
30658 IX86_BUILTIN_VEC_SET_V4SI,
30659 IX86_BUILTIN_VEC_SET_V8HI,
30660 IX86_BUILTIN_VEC_SET_V4HI,
30661 IX86_BUILTIN_VEC_SET_V16QI,
30662 IX86_BUILTIN_GATHERSIV2DF,
30663 IX86_BUILTIN_GATHERSIV4DF,
30664 IX86_BUILTIN_GATHERDIV2DF,
30665 IX86_BUILTIN_GATHERDIV4DF,
30666 IX86_BUILTIN_GATHERSIV4SF,
30667 IX86_BUILTIN_GATHERSIV8SF,
30668 IX86_BUILTIN_GATHERDIV4SF,
30669 IX86_BUILTIN_GATHERDIV8SF,
30670 IX86_BUILTIN_GATHERSIV2DI,
30671 IX86_BUILTIN_GATHERSIV4DI,
30672 IX86_BUILTIN_GATHERDIV2DI,
30673 IX86_BUILTIN_GATHERDIV4DI,
30674 IX86_BUILTIN_GATHERSIV4SI,
30675 IX86_BUILTIN_GATHERSIV8SI,
30676 IX86_BUILTIN_GATHERDIV4SI,
30677 IX86_BUILTIN_GATHERDIV8SI,
30678 IX86_BUILTIN_VFMSUBSD3_MASK3,
30679 IX86_BUILTIN_VFMSUBSS3_MASK3,
30680 IX86_BUILTIN_GATHER3SIV8SF,
30681 IX86_BUILTIN_GATHER3SIV4SF,
30682 IX86_BUILTIN_GATHER3SIV4DF,
30683 IX86_BUILTIN_GATHER3SIV2DF,
30684 IX86_BUILTIN_GATHER3DIV8SF,
30685 IX86_BUILTIN_GATHER3DIV4SF,
30686 IX86_BUILTIN_GATHER3DIV4DF,
30687 IX86_BUILTIN_GATHER3DIV2DF,
30688 IX86_BUILTIN_GATHER3SIV8SI,
30689 IX86_BUILTIN_GATHER3SIV4SI,
30690 IX86_BUILTIN_GATHER3SIV4DI,
30691 IX86_BUILTIN_GATHER3SIV2DI,
30692 IX86_BUILTIN_GATHER3DIV8SI,
30693 IX86_BUILTIN_GATHER3DIV4SI,
30694 IX86_BUILTIN_GATHER3DIV4DI,
30695 IX86_BUILTIN_GATHER3DIV2DI,
30696 IX86_BUILTIN_SCATTERSIV8SF,
30697 IX86_BUILTIN_SCATTERSIV4SF,
30698 IX86_BUILTIN_SCATTERSIV4DF,
30699 IX86_BUILTIN_SCATTERSIV2DF,
30700 IX86_BUILTIN_SCATTERDIV8SF,
30701 IX86_BUILTIN_SCATTERDIV4SF,
30702 IX86_BUILTIN_SCATTERDIV4DF,
30703 IX86_BUILTIN_SCATTERDIV2DF,
30704 IX86_BUILTIN_SCATTERSIV8SI,
30705 IX86_BUILTIN_SCATTERSIV4SI,
30706 IX86_BUILTIN_SCATTERSIV4DI,
30707 IX86_BUILTIN_SCATTERSIV2DI,
30708 IX86_BUILTIN_SCATTERDIV8SI,
30709 IX86_BUILTIN_SCATTERDIV4SI,
30710 IX86_BUILTIN_SCATTERDIV4DI,
30711 IX86_BUILTIN_SCATTERDIV2DI,
30712 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30713 where all operands are 32-byte or 64-byte wide respectively. */
30714 IX86_BUILTIN_GATHERALTSIV4DF,
30715 IX86_BUILTIN_GATHERALTDIV8SF,
30716 IX86_BUILTIN_GATHERALTSIV4DI,
30717 IX86_BUILTIN_GATHERALTDIV8SI,
30718 IX86_BUILTIN_GATHER3ALTDIV16SF,
30719 IX86_BUILTIN_GATHER3ALTDIV16SI,
30720 IX86_BUILTIN_GATHER3ALTSIV4DF,
30721 IX86_BUILTIN_GATHER3ALTDIV8SF,
30722 IX86_BUILTIN_GATHER3ALTSIV4DI,
30723 IX86_BUILTIN_GATHER3ALTDIV8SI,
30724 IX86_BUILTIN_GATHER3ALTSIV8DF,
30725 IX86_BUILTIN_GATHER3ALTSIV8DI,
30726 IX86_BUILTIN_GATHER3DIV16SF,
30727 IX86_BUILTIN_GATHER3DIV16SI,
30728 IX86_BUILTIN_GATHER3DIV8DF,
30729 IX86_BUILTIN_GATHER3DIV8DI,
30730 IX86_BUILTIN_GATHER3SIV16SF,
30731 IX86_BUILTIN_GATHER3SIV16SI,
30732 IX86_BUILTIN_GATHER3SIV8DF,
30733 IX86_BUILTIN_GATHER3SIV8DI,
30734 IX86_BUILTIN_SCATTERALTSIV8DF,
30735 IX86_BUILTIN_SCATTERALTDIV16SF,
30736 IX86_BUILTIN_SCATTERALTSIV8DI,
30737 IX86_BUILTIN_SCATTERALTDIV16SI,
30738 IX86_BUILTIN_SCATTERDIV16SF,
30739 IX86_BUILTIN_SCATTERDIV16SI,
30740 IX86_BUILTIN_SCATTERDIV8DF,
30741 IX86_BUILTIN_SCATTERDIV8DI,
30742 IX86_BUILTIN_SCATTERSIV16SF,
30743 IX86_BUILTIN_SCATTERSIV16SI,
30744 IX86_BUILTIN_SCATTERSIV8DF,
30745 IX86_BUILTIN_SCATTERSIV8DI,
30746 IX86_BUILTIN_GATHERPFQPD,
30747 IX86_BUILTIN_GATHERPFDPS,
30748 IX86_BUILTIN_GATHERPFDPD,
30749 IX86_BUILTIN_GATHERPFQPS,
30750 IX86_BUILTIN_SCATTERPFDPD,
30751 IX86_BUILTIN_SCATTERPFDPS,
30752 IX86_BUILTIN_SCATTERPFQPD,
30753 IX86_BUILTIN_SCATTERPFQPS,
30754 IX86_BUILTIN_CLWB,
30755 IX86_BUILTIN_CLFLUSHOPT,
30756 IX86_BUILTIN_INFQ,
30757 IX86_BUILTIN_HUGE_VALQ,
30758 IX86_BUILTIN_NANQ,
30759 IX86_BUILTIN_NANSQ,
30760 IX86_BUILTIN_XABORT,
30761 IX86_BUILTIN_ADDCARRYX32,
30762 IX86_BUILTIN_ADDCARRYX64,
30763 IX86_BUILTIN_SBB32,
30764 IX86_BUILTIN_SBB64,
30765 IX86_BUILTIN_RDRAND16_STEP,
30766 IX86_BUILTIN_RDRAND32_STEP,
30767 IX86_BUILTIN_RDRAND64_STEP,
30768 IX86_BUILTIN_RDSEED16_STEP,
30769 IX86_BUILTIN_RDSEED32_STEP,
30770 IX86_BUILTIN_RDSEED64_STEP,
30771 IX86_BUILTIN_MONITORX,
30772 IX86_BUILTIN_MWAITX,
30773 IX86_BUILTIN_CFSTRING,
30774 IX86_BUILTIN_CPU_INIT,
30775 IX86_BUILTIN_CPU_IS,
30776 IX86_BUILTIN_CPU_SUPPORTS,
30777 IX86_BUILTIN_READ_FLAGS,
30778 IX86_BUILTIN_WRITE_FLAGS,
30780 /* All the remaining builtins are tracked in bdesc_* arrays in
30781 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30782 this point. */
30783 #define BDESC(mask, icode, name, code, comparison, flag) \
30784 code,
30785 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30786 code, \
30787 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30788 #define BDESC_END(kind, next_kind)
30790 #include "i386-builtin.def"
30792 #undef BDESC
30793 #undef BDESC_FIRST
30794 #undef BDESC_END
30796 IX86_BUILTIN_MAX,
30798 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30800 /* Now just the aliases for bdesc_* start/end. */
30801 #define BDESC(mask, icode, name, code, comparison, flag)
30802 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30803 #define BDESC_END(kind, next_kind) \
30804 IX86_BUILTIN__BDESC_##kind##_LAST \
30805 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30807 #include "i386-builtin.def"
30809 #undef BDESC
30810 #undef BDESC_FIRST
30811 #undef BDESC_END
30813 /* Just to make sure there is no comma after the last enumerator. */
30814 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30817 /* Table for the ix86 builtin decls. */
30818 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30820 /* Table of all of the builtin functions that are possible with different ISAs
30821 but are waiting to be built until a function is declared to use that
30822 ISA. */
30823 struct builtin_isa {
30824 const char *name; /* function name */
30825 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30826 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30827 bool const_p; /* true if the declaration is constant */
30828 bool leaf_p; /* true if the declaration has leaf attribute */
30829 bool nothrow_p; /* true if the declaration has nothrow attribute */
30830 bool set_and_not_built_p;
30833 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30835 /* Bits that can still enable any inclusion of a builtin. */
30836 static HOST_WIDE_INT deferred_isa_values = 0;
30838 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30839 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30840 function decl in the ix86_builtins array. Returns the function decl or
30841 NULL_TREE, if the builtin was not added.
30843 If the front end has a special hook for builtin functions, delay adding
30844 builtin functions that aren't in the current ISA until the ISA is changed
30845 with function specific optimization. Doing so can save about 300K for the
30846 default compiler. When the builtin is expanded, check at that time whether
30847 it is valid.
30849 If the front end doesn't have a special hook, record all builtins, even
30850 those whose instruction set isn't in the current ISA, in case the user uses
30851 function specific options for a different ISA, so that we don't get scope
30852 errors if a builtin is added in the middle of a function scope. */
30854 static inline tree
30855 def_builtin (HOST_WIDE_INT mask, const char *name,
30856 enum ix86_builtin_func_type tcode,
30857 enum ix86_builtins code)
30859 tree decl = NULL_TREE;
30861 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30863 ix86_builtins_isa[(int) code].isa = mask;
30865 /* OPTION_MASK_ISA_AVX512VL has special meaning. Unlike the generic case,
30866 where any set bit means that the built-in is enabled, this bit must be *and-ed*
30867 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
30868 means that *both* cpuid bits must be set for the built-in to be available.
30869 Handle this here. */
30870 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30871 mask &= ~OPTION_MASK_ISA_AVX512VL;
30873 mask &= ~OPTION_MASK_ISA_64BIT;
30874 if (mask == 0
30875 || (mask & ix86_isa_flags) != 0
30876 || (lang_hooks.builtin_function
30877 == lang_hooks.builtin_function_ext_scope))
30880 tree type = ix86_get_builtin_func_type (tcode);
30881 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30882 NULL, NULL_TREE);
30883 ix86_builtins[(int) code] = decl;
30884 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30886 else
30888 /* Only a MASK recorded with set_and_not_built_p == true can potentially
30889 enable a builtin later. */
30890 deferred_isa_values |= mask;
30891 ix86_builtins[(int) code] = NULL_TREE;
30892 ix86_builtins_isa[(int) code].tcode = tcode;
30893 ix86_builtins_isa[(int) code].name = name;
30894 ix86_builtins_isa[(int) code].leaf_p = false;
30895 ix86_builtins_isa[(int) code].nothrow_p = false;
30896 ix86_builtins_isa[(int) code].const_p = false;
30897 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30901 return decl;
30904 /* Like def_builtin, but also marks the function decl "const". */
30906 static inline tree
30907 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30908 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30910 tree decl = def_builtin (mask, name, tcode, code);
30911 if (decl)
30912 TREE_READONLY (decl) = 1;
30913 else
30914 ix86_builtins_isa[(int) code].const_p = true;
30916 return decl;
30919 /* Add any new builtin functions for a given ISA that may not have been
30920 declared. This saves a bit of space compared to adding all of the
30921 declarations to the tree, even if we didn't use them. */
30923 static void
30924 ix86_add_new_builtins (HOST_WIDE_INT isa)
30926 if ((isa & deferred_isa_values) == 0)
30927 return;
30929 /* Bits in ISA value can be removed from potential isa values. */
30930 deferred_isa_values &= ~isa;
30932 int i;
30933 tree saved_current_target_pragma = current_target_pragma;
30934 current_target_pragma = NULL_TREE;
30936 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30938 if ((ix86_builtins_isa[i].isa & isa) != 0
30939 && ix86_builtins_isa[i].set_and_not_built_p)
30941 tree decl, type;
30943 /* Don't define the builtin again. */
30944 ix86_builtins_isa[i].set_and_not_built_p = false;
30946 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30947 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30948 type, i, BUILT_IN_MD, NULL,
30949 NULL_TREE);
30951 ix86_builtins[i] = decl;
30952 if (ix86_builtins_isa[i].const_p)
30953 TREE_READONLY (decl) = 1;
30954 if (ix86_builtins_isa[i].leaf_p)
30955 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30956 NULL_TREE);
30957 if (ix86_builtins_isa[i].nothrow_p)
30958 TREE_NOTHROW (decl) = 1;
30962 current_target_pragma = saved_current_target_pragma;
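/* Illustrative only (hypothetical user code, not part of GCC): with a front
   end that defers builtin creation (see def_builtin), a default -msse2
   compile does not build the AVX-level builtins up front.  Compiling a
   function such as

     __attribute__((target ("avx2"))) void f (void) { ... }

   enables the extra ISA bits for that function, and this hook then builds
   any builtins recorded in ix86_builtins_isa that were still pending.  */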
30965 /* Bits for builtin_description.flag. */
30967 /* Set when we don't support the comparison natively, and should
30968 swap_comparison in order to support it. */
30969 #define BUILTIN_DESC_SWAP_OPERANDS 1
30971 struct builtin_description
30973 const HOST_WIDE_INT mask;
30974 const enum insn_code icode;
30975 const char *const name;
30976 const enum ix86_builtins code;
30977 const enum rtx_code comparison;
30978 const int flag;
30981 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30982 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30983 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30984 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30985 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30986 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30987 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30988 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30989 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30990 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30991 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30992 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30993 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30994 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30995 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30996 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30997 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30998 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30999 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31000 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31001 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31002 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31003 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31004 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31005 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31006 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31007 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31008 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31009 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31010 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31011 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31012 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31013 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31014 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31015 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31016 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31017 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31018 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31019 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31020 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31021 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31022 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31023 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31024 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31025 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31026 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31027 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31028 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31029 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31030 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31031 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31032 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31034 #define BDESC(mask, icode, name, code, comparison, flag) \
31035 { mask, icode, name, code, comparison, flag },
31036 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31037 static const struct builtin_description bdesc_##kind[] = \
31039 BDESC (mask, icode, name, code, comparison, flag)
31040 #define BDESC_END(kind, next_kind) \
31043 #include "i386-builtin.def"
31045 #undef BDESC
31046 #undef BDESC_FIRST
31047 #undef BDESC_END
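/* For illustration only (approximate, not a verbatim line from
   i386-builtin.def): after the macro expansion above, each bdesc_* table is
   an array of struct builtin_description whose entries look roughly like

     { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps",
       IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },

   i.e. ISA mask, insn code, builtin name, enum value, comparison code and
   the function type flag, in the field order declared above.  */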
31049 /* TM vector builtins. */
31051 /* Reuse the existing x86-specific `struct builtin_description' because
31052 we're lazy. Add casts to make the entries fit. */
31053 static const struct builtin_description bdesc_tm[] =
31055 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31056 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31057 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31058 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31059 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31060 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31061 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31063 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31064 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31065 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31066 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31067 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31068 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31069 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31071 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31072 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31073 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31074 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31075 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31076 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31077 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31079 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31080 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31081 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31084 /* Initialize the transactional memory vector load/store builtins. */
31086 static void
31087 ix86_init_tm_builtins (void)
31089 enum ix86_builtin_func_type ftype;
31090 const struct builtin_description *d;
31091 size_t i;
31092 tree decl;
31093 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31094 tree attrs_log, attrs_type_log;
31096 if (!flag_tm)
31097 return;
31099 /* If there are no builtins defined, we must be compiling in a
31100 language without trans-mem support. */
31101 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31102 return;
31104 /* Use whatever attributes a normal TM load has. */
31105 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31106 attrs_load = DECL_ATTRIBUTES (decl);
31107 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31108 /* Use whatever attributes a normal TM store has. */
31109 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31110 attrs_store = DECL_ATTRIBUTES (decl);
31111 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31112 /* Use whatever attributes a normal TM log has. */
31113 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31114 attrs_log = DECL_ATTRIBUTES (decl);
31115 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31117 for (i = 0, d = bdesc_tm;
31118 i < ARRAY_SIZE (bdesc_tm);
31119 i++, d++)
31121 if ((d->mask & ix86_isa_flags) != 0
31122 || (lang_hooks.builtin_function
31123 == lang_hooks.builtin_function_ext_scope))
31125 tree type, attrs, attrs_type;
31126 enum built_in_function code = (enum built_in_function) d->code;
31128 ftype = (enum ix86_builtin_func_type) d->flag;
31129 type = ix86_get_builtin_func_type (ftype);
31131 if (BUILTIN_TM_LOAD_P (code))
31133 attrs = attrs_load;
31134 attrs_type = attrs_type_load;
31136 else if (BUILTIN_TM_STORE_P (code))
31138 attrs = attrs_store;
31139 attrs_type = attrs_type_store;
31141 else
31143 attrs = attrs_log;
31144 attrs_type = attrs_type_log;
31146 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31147 /* The builtin without the prefix for
31148 calling it directly. */
31149 d->name + strlen ("__builtin_"),
31150 attrs);
31151 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31152 set the TYPE_ATTRIBUTES. */
31153 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31155 set_builtin_decl (code, decl, false);
31160 /* Macros for verification of enum ix86_builtins order. */
31161 #define BDESC_VERIFY(x, y, z) \
31162 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31163 #define BDESC_VERIFYS(x, y, z) \
31164 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31166 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31167 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31168 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31169 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31170 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31171 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31172 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31173 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31174 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31175 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31176 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31177 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31178 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31179 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31180 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31181 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31182 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31183 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31185 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31186 in the current target ISA, to allow the user to compile particular modules
31187 with target specific options that differ from the command line
31188 options. */
31189 static void
31190 ix86_init_mmx_sse_builtins (void)
31192 const struct builtin_description * d;
31193 enum ix86_builtin_func_type ftype;
31194 size_t i;
31196 /* Add all special builtins with variable number of operands. */
31197 for (i = 0, d = bdesc_special_args;
31198 i < ARRAY_SIZE (bdesc_special_args);
31199 i++, d++)
31201 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31202 if (d->name == 0)
31203 continue;
31205 ftype = (enum ix86_builtin_func_type) d->flag;
31206 def_builtin (d->mask, d->name, ftype, d->code);
31208 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31209 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31210 ARRAY_SIZE (bdesc_special_args) - 1);
31212 /* Add all builtins with variable number of operands. */
31213 for (i = 0, d = bdesc_args;
31214 i < ARRAY_SIZE (bdesc_args);
31215 i++, d++)
31217 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31218 if (d->name == 0)
31219 continue;
31221 ftype = (enum ix86_builtin_func_type) d->flag;
31222 def_builtin_const (d->mask, d->name, ftype, d->code);
31224 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31225 IX86_BUILTIN__BDESC_ARGS_FIRST,
31226 ARRAY_SIZE (bdesc_args) - 1);
31228 /* Add all builtins with rounding. */
31229 for (i = 0, d = bdesc_round_args;
31230 i < ARRAY_SIZE (bdesc_round_args);
31231 i++, d++)
31233 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31234 if (d->name == 0)
31235 continue;
31237 ftype = (enum ix86_builtin_func_type) d->flag;
31238 def_builtin_const (d->mask, d->name, ftype, d->code);
31240 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31241 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31242 ARRAY_SIZE (bdesc_round_args) - 1);
31244 /* pcmpestr[im] insns. */
31245 for (i = 0, d = bdesc_pcmpestr;
31246 i < ARRAY_SIZE (bdesc_pcmpestr);
31247 i++, d++)
31249 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31250 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31251 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31252 else
31253 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31254 def_builtin_const (d->mask, d->name, ftype, d->code);
31256 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31257 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31258 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31260 /* pcmpistr[im] insns. */
31261 for (i = 0, d = bdesc_pcmpistr;
31262 i < ARRAY_SIZE (bdesc_pcmpistr);
31263 i++, d++)
31265 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31266 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31267 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31268 else
31269 ftype = INT_FTYPE_V16QI_V16QI_INT;
31270 def_builtin_const (d->mask, d->name, ftype, d->code);
31272 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31273 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31274 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31276 /* comi/ucomi insns. */
31277 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31279 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31280 if (d->mask == OPTION_MASK_ISA_SSE2)
31281 ftype = INT_FTYPE_V2DF_V2DF;
31282 else
31283 ftype = INT_FTYPE_V4SF_V4SF;
31284 def_builtin_const (d->mask, d->name, ftype, d->code);
31286 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31287 IX86_BUILTIN__BDESC_COMI_FIRST,
31288 ARRAY_SIZE (bdesc_comi) - 1);
31290 /* SSE */
31291 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31292 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31293 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31294 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31296 /* SSE or 3DNow!A */
31297 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31298 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31299 IX86_BUILTIN_MASKMOVQ);
31301 /* SSE2 */
31302 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31303 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31305 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31306 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31307 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31308 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31310 /* SSE3. */
31311 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31312 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31313 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31314 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31316 /* AES */
31317 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31318 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31319 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31320 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31321 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31322 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31323 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31324 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31325 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31326 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31327 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31328 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31330 /* PCLMUL */
31331 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31332 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31334 /* RDRND */
31335 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31336 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31337 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31338 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31339 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31340 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31341 IX86_BUILTIN_RDRAND64_STEP);
31343 /* AVX2 */
31344 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31345 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31346 IX86_BUILTIN_GATHERSIV2DF);
31348 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31349 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31350 IX86_BUILTIN_GATHERSIV4DF);
31352 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31353 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31354 IX86_BUILTIN_GATHERDIV2DF);
31356 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31357 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31358 IX86_BUILTIN_GATHERDIV4DF);
31360 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31361 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31362 IX86_BUILTIN_GATHERSIV4SF);
31364 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31365 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31366 IX86_BUILTIN_GATHERSIV8SF);
31368 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31369 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31370 IX86_BUILTIN_GATHERDIV4SF);
31372 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31373 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31374 IX86_BUILTIN_GATHERDIV8SF);
31376 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31377 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31378 IX86_BUILTIN_GATHERSIV2DI);
31380 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31381 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31382 IX86_BUILTIN_GATHERSIV4DI);
31384 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31385 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31386 IX86_BUILTIN_GATHERDIV2DI);
31388 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31389 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31390 IX86_BUILTIN_GATHERDIV4DI);
31392 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31393 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31394 IX86_BUILTIN_GATHERSIV4SI);
31396 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31397 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31398 IX86_BUILTIN_GATHERSIV8SI);
31400 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31401 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31402 IX86_BUILTIN_GATHERDIV4SI);
31404 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31405 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31406 IX86_BUILTIN_GATHERDIV8SI);
31408 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31409 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31410 IX86_BUILTIN_GATHERALTSIV4DF);
31412 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31413 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31414 IX86_BUILTIN_GATHERALTDIV8SF);
31416 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31417 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31418 IX86_BUILTIN_GATHERALTSIV4DI);
31420 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31421 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31422 IX86_BUILTIN_GATHERALTDIV8SI);
31424 /* AVX512F */
31425 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31426 V16SF_FTYPE_V16SF_PCFLOAT_V16SI_HI_INT,
31427 IX86_BUILTIN_GATHER3SIV16SF);
31429 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31430 V8DF_FTYPE_V8DF_PCDOUBLE_V8SI_QI_INT,
31431 IX86_BUILTIN_GATHER3SIV8DF);
31433 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31434 V8SF_FTYPE_V8SF_PCFLOAT_V8DI_QI_INT,
31435 IX86_BUILTIN_GATHER3DIV16SF);
31437 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31438 V8DF_FTYPE_V8DF_PCDOUBLE_V8DI_QI_INT,
31439 IX86_BUILTIN_GATHER3DIV8DF);
31441 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31442 V16SI_FTYPE_V16SI_PCINT_V16SI_HI_INT,
31443 IX86_BUILTIN_GATHER3SIV16SI);
31445 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31446 V8DI_FTYPE_V8DI_PCINT64_V8SI_QI_INT,
31447 IX86_BUILTIN_GATHER3SIV8DI);
31449 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31450 V8SI_FTYPE_V8SI_PCINT_V8DI_QI_INT,
31451 IX86_BUILTIN_GATHER3DIV16SI);
31453 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31454 V8DI_FTYPE_V8DI_PCINT64_V8DI_QI_INT,
31455 IX86_BUILTIN_GATHER3DIV8DI);
31457 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31458 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31459 IX86_BUILTIN_GATHER3ALTSIV8DF);
31461 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31462 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31463 IX86_BUILTIN_GATHER3ALTDIV16SF);
31465 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31466 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31467 IX86_BUILTIN_GATHER3ALTSIV8DI);
31469 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31470 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31471 IX86_BUILTIN_GATHER3ALTDIV16SI);
31473 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31474 VOID_FTYPE_PFLOAT_HI_V16SI_V16SF_INT,
31475 IX86_BUILTIN_SCATTERSIV16SF);
31477 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31478 VOID_FTYPE_PDOUBLE_QI_V8SI_V8DF_INT,
31479 IX86_BUILTIN_SCATTERSIV8DF);
31481 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31482 VOID_FTYPE_PFLOAT_QI_V8DI_V8SF_INT,
31483 IX86_BUILTIN_SCATTERDIV16SF);
31485 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31486 VOID_FTYPE_PDOUBLE_QI_V8DI_V8DF_INT,
31487 IX86_BUILTIN_SCATTERDIV8DF);
31489 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31490 VOID_FTYPE_PINT_HI_V16SI_V16SI_INT,
31491 IX86_BUILTIN_SCATTERSIV16SI);
31493 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31494 VOID_FTYPE_PLONGLONG_QI_V8SI_V8DI_INT,
31495 IX86_BUILTIN_SCATTERSIV8DI);
31497 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31498 VOID_FTYPE_PINT_QI_V8DI_V8SI_INT,
31499 IX86_BUILTIN_SCATTERDIV16SI);
31501 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31502 VOID_FTYPE_PLONGLONG_QI_V8DI_V8DI_INT,
31503 IX86_BUILTIN_SCATTERDIV8DI);
31505 /* AVX512VL */
31506 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31507 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_QI_INT,
31508 IX86_BUILTIN_GATHER3SIV2DF);
31510 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31511 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_QI_INT,
31512 IX86_BUILTIN_GATHER3SIV4DF);
31514 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31515 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_QI_INT,
31516 IX86_BUILTIN_GATHER3DIV2DF);
31518 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31519 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_QI_INT,
31520 IX86_BUILTIN_GATHER3DIV4DF);
31522 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31523 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_QI_INT,
31524 IX86_BUILTIN_GATHER3SIV4SF);
31526 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31527 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_QI_INT,
31528 IX86_BUILTIN_GATHER3SIV8SF);
31530 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31531 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_QI_INT,
31532 IX86_BUILTIN_GATHER3DIV4SF);
31534 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31535 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_QI_INT,
31536 IX86_BUILTIN_GATHER3DIV8SF);
31538 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31539 V2DI_FTYPE_V2DI_PCINT64_V4SI_QI_INT,
31540 IX86_BUILTIN_GATHER3SIV2DI);
31542 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31543 V4DI_FTYPE_V4DI_PCINT64_V4SI_QI_INT,
31544 IX86_BUILTIN_GATHER3SIV4DI);
31546 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31547 V2DI_FTYPE_V2DI_PCINT64_V2DI_QI_INT,
31548 IX86_BUILTIN_GATHER3DIV2DI);
31550 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31551 V4DI_FTYPE_V4DI_PCINT64_V4DI_QI_INT,
31552 IX86_BUILTIN_GATHER3DIV4DI);
31554 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31555 V4SI_FTYPE_V4SI_PCINT_V4SI_QI_INT,
31556 IX86_BUILTIN_GATHER3SIV4SI);
31558 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31559 V8SI_FTYPE_V8SI_PCINT_V8SI_QI_INT,
31560 IX86_BUILTIN_GATHER3SIV8SI);
31562 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31563 V4SI_FTYPE_V4SI_PCINT_V2DI_QI_INT,
31564 IX86_BUILTIN_GATHER3DIV4SI);
31566 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31567 V4SI_FTYPE_V4SI_PCINT_V4DI_QI_INT,
31568 IX86_BUILTIN_GATHER3DIV8SI);
31570 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31571 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31572 IX86_BUILTIN_GATHER3ALTSIV4DF);
31574 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31575 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31576 IX86_BUILTIN_GATHER3ALTDIV8SF);
31578 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31579 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31580 IX86_BUILTIN_GATHER3ALTSIV4DI);
31582 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31583 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31584 IX86_BUILTIN_GATHER3ALTDIV8SI);
31586 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31587 VOID_FTYPE_PFLOAT_QI_V8SI_V8SF_INT,
31588 IX86_BUILTIN_SCATTERSIV8SF);
31590 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31591 VOID_FTYPE_PFLOAT_QI_V4SI_V4SF_INT,
31592 IX86_BUILTIN_SCATTERSIV4SF);
31594 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31595 VOID_FTYPE_PDOUBLE_QI_V4SI_V4DF_INT,
31596 IX86_BUILTIN_SCATTERSIV4DF);
31598 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31599 VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT,
31600 IX86_BUILTIN_SCATTERSIV2DF);
31602 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31603 VOID_FTYPE_PFLOAT_QI_V4DI_V4SF_INT,
31604 IX86_BUILTIN_SCATTERDIV8SF);
31606 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31607 VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT,
31608 IX86_BUILTIN_SCATTERDIV4SF);
31610 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31611 VOID_FTYPE_PDOUBLE_QI_V4DI_V4DF_INT,
31612 IX86_BUILTIN_SCATTERDIV4DF);
31614 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31615 VOID_FTYPE_PDOUBLE_QI_V2DI_V2DF_INT,
31616 IX86_BUILTIN_SCATTERDIV2DF);
31618 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31619 VOID_FTYPE_PINT_QI_V8SI_V8SI_INT,
31620 IX86_BUILTIN_SCATTERSIV8SI);
31622 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31623 VOID_FTYPE_PINT_QI_V4SI_V4SI_INT,
31624 IX86_BUILTIN_SCATTERSIV4SI);
31626 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31627 VOID_FTYPE_PLONGLONG_QI_V4SI_V4DI_INT,
31628 IX86_BUILTIN_SCATTERSIV4DI);
31630 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31631 VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT,
31632 IX86_BUILTIN_SCATTERSIV2DI);
31634 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31635 VOID_FTYPE_PINT_QI_V4DI_V4SI_INT,
31636 IX86_BUILTIN_SCATTERDIV8SI);
31638 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31639 VOID_FTYPE_PINT_QI_V2DI_V4SI_INT,
31640 IX86_BUILTIN_SCATTERDIV4SI);
31642 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31643 VOID_FTYPE_PLONGLONG_QI_V4DI_V4DI_INT,
31644 IX86_BUILTIN_SCATTERDIV4DI);
31646 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31647 VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
31648 IX86_BUILTIN_SCATTERDIV2DI);
31649 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31650 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31651 IX86_BUILTIN_SCATTERALTSIV8DF);
31653 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31654 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31655 IX86_BUILTIN_SCATTERALTDIV16SF);
31657 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31658 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31659 IX86_BUILTIN_SCATTERALTSIV8DI);
31661 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31662 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31663 IX86_BUILTIN_SCATTERALTDIV16SI);
31665 /* AVX512PF */
31666 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31667 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31668 IX86_BUILTIN_GATHERPFDPD);
31669 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31670 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31671 IX86_BUILTIN_GATHERPFDPS);
31672 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31673 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31674 IX86_BUILTIN_GATHERPFQPD);
31675 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31676 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31677 IX86_BUILTIN_GATHERPFQPS);
31678 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31679 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31680 IX86_BUILTIN_SCATTERPFDPD);
31681 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31682 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31683 IX86_BUILTIN_SCATTERPFDPS);
31684 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31685 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31686 IX86_BUILTIN_SCATTERPFQPD);
31687 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31688 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31689 IX86_BUILTIN_SCATTERPFQPS);
31691 /* SHA */
31692 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31693 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31694 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31695 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31696 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31697 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31698 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31699 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31700 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31701 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31702 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31703 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31704 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31705 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31707 /* RTM. */
31708 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31709 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31711 /* MMX access to the vec_init patterns. */
31712 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31713 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31715 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31716 V4HI_FTYPE_HI_HI_HI_HI,
31717 IX86_BUILTIN_VEC_INIT_V4HI);
31719 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31720 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31721 IX86_BUILTIN_VEC_INIT_V8QI);
31723 /* Access to the vec_extract patterns. */
31724 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31725 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31726 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31727 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31728 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31729 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31730 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31731 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31732 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31733 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31735 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31736 "__builtin_ia32_vec_ext_v4hi",
31737 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31739 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31740 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31742 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31743 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31745 /* Access to the vec_set patterns. */
31746 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31747 "__builtin_ia32_vec_set_v2di",
31748 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31750 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31751 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31753 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31754 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31756 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31757 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31759 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31760 "__builtin_ia32_vec_set_v4hi",
31761 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31763 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31764 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31766 /* RDSEED */
31767 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31768 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31769 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31770 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31771 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31772 "__builtin_ia32_rdseed_di_step",
31773 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31775 /* ADCX */
31776 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31777 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31778 def_builtin (OPTION_MASK_ISA_64BIT,
31779 "__builtin_ia32_addcarryx_u64",
31780 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31781 IX86_BUILTIN_ADDCARRYX64);
31783 /* SBB */
31784 def_builtin (0, "__builtin_ia32_sbb_u32",
31785 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31786 def_builtin (OPTION_MASK_ISA_64BIT,
31787 "__builtin_ia32_sbb_u64",
31788 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31789 IX86_BUILTIN_SBB64);
31791 /* Read/write FLAGS. */
31792 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u32",
31793 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31794 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31795 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31796 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u32",
31797 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31798 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31799 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31801 /* CLFLUSHOPT. */
31802 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31803 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31805 /* CLWB. */
31806 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31807 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31809 /* MONITORX and MWAITX. */
31810 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31811 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31812 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31813 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31815 /* CLZERO. */
31816 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31817 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31819 /* Add FMA4 multi-arg instructions */
31820 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31822 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31823 if (d->name == 0)
31824 continue;
31826 ftype = (enum ix86_builtin_func_type) d->flag;
31827 def_builtin_const (d->mask, d->name, ftype, d->code);
31829 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31830 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31831 ARRAY_SIZE (bdesc_multi_arg) - 1);
31834 static void
31835 ix86_init_mpx_builtins ()
31837 const struct builtin_description * d;
31838 enum ix86_builtin_func_type ftype;
31839 tree decl;
31840 size_t i;
31842 for (i = 0, d = bdesc_mpx;
31843 i < ARRAY_SIZE (bdesc_mpx);
31844 i++, d++)
31846 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
31847 if (d->name == 0)
31848 continue;
31850 ftype = (enum ix86_builtin_func_type) d->flag;
31851 decl = def_builtin (d->mask, d->name, ftype, d->code);
31853 /* Without the leaf and nothrow flags on MPX builtins,
31854 abnormal edges may follow their calls when setjmp
31855 is present in the function. Since we may have a lot
31856 of MPX builtin calls, this causes lots of useless
31857 edges and enormous PHI nodes. To avoid this we mark
31858 MPX builtins as leaf and nothrow. */
31859 if (decl)
31861 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31862 NULL_TREE);
31863 TREE_NOTHROW (decl) = 1;
31865 else
31867 ix86_builtins_isa[(int)d->code].leaf_p = true;
31868 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31871 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
31872 IX86_BUILTIN__BDESC_MPX_FIRST,
31873 ARRAY_SIZE (bdesc_mpx) - 1);
31875 for (i = 0, d = bdesc_mpx_const;
31876 i < ARRAY_SIZE (bdesc_mpx_const);
31877 i++, d++)
31879 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
31880 if (d->name == 0)
31881 continue;
31883 ftype = (enum ix86_builtin_func_type) d->flag;
31884 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
31886 if (decl)
31888 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31889 NULL_TREE);
31890 TREE_NOTHROW (decl) = 1;
31892 else
31894 ix86_builtins_isa[(int)d->code].leaf_p = true;
31895 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31898 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
31899 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31900 ARRAY_SIZE (bdesc_mpx_const) - 1);
31902 #undef BDESC_VERIFY
31903 #undef BDESC_VERIFYS
31905 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31906 to return a pointer to VERSION_DECL if the outcome of the expression
31907 formed by PREDICATE_CHAIN is true. This function will be called during
31908 version dispatch to decide which function version to execute. It returns
31909 the basic block at the end, to which more conditions can be added. */
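/* Roughly, for one version this appends the GIMPLE sequence

     cond_1 = predicate_1 (arg_1);
     ...
     and_var = MIN_EXPR <cond_N, and_var>;
     if (and_var > 0)
       {
         result = (void *) &VERSION_DECL;
         return result;
       }
     <control falls through to the returned block, where the test for
      the next version will be added>

   (a sketch; the actual statements are built below with the gimple_*
   API).  */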
31911 static basic_block
31912 add_condition_to_bb (tree function_decl, tree version_decl,
31913 tree predicate_chain, basic_block new_bb)
31915 gimple *return_stmt;
31916 tree convert_expr, result_var;
31917 gimple *convert_stmt;
31918 gimple *call_cond_stmt;
31919 gimple *if_else_stmt;
31921 basic_block bb1, bb2, bb3;
31922 edge e12, e23;
31924 tree cond_var, and_expr_var = NULL_TREE;
31925 gimple_seq gseq;
31927 tree predicate_decl, predicate_arg;
31929 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31931 gcc_assert (new_bb != NULL);
31932 gseq = bb_seq (new_bb);
31935 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31936 build_fold_addr_expr (version_decl));
31937 result_var = create_tmp_var (ptr_type_node);
31938 convert_stmt = gimple_build_assign (result_var, convert_expr);
31939 return_stmt = gimple_build_return (result_var);
31941 if (predicate_chain == NULL_TREE)
31943 gimple_seq_add_stmt (&gseq, convert_stmt);
31944 gimple_seq_add_stmt (&gseq, return_stmt);
31945 set_bb_seq (new_bb, gseq);
31946 gimple_set_bb (convert_stmt, new_bb);
31947 gimple_set_bb (return_stmt, new_bb);
31948 pop_cfun ();
31949 return new_bb;
31952 while (predicate_chain != NULL)
31954 cond_var = create_tmp_var (integer_type_node);
31955 predicate_decl = TREE_PURPOSE (predicate_chain);
31956 predicate_arg = TREE_VALUE (predicate_chain);
31957 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31958 gimple_call_set_lhs (call_cond_stmt, cond_var);
31960 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31961 gimple_set_bb (call_cond_stmt, new_bb);
31962 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31964 predicate_chain = TREE_CHAIN (predicate_chain);
31966 if (and_expr_var == NULL)
31967 and_expr_var = cond_var;
31968 else
31970 gimple *assign_stmt;
31971 /* Use MIN_EXPR to AND the conditions: the minimum is nonzero only
31972 if every condition is.  and_expr_var = min_expr <cond_var, and_expr_var> */
31973 assign_stmt = gimple_build_assign (and_expr_var,
31974 build2 (MIN_EXPR, integer_type_node,
31975 cond_var, and_expr_var));
31977 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31978 gimple_set_bb (assign_stmt, new_bb);
31979 gimple_seq_add_stmt (&gseq, assign_stmt);
31983 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31984 integer_zero_node,
31985 NULL_TREE, NULL_TREE);
31986 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31987 gimple_set_bb (if_else_stmt, new_bb);
31988 gimple_seq_add_stmt (&gseq, if_else_stmt);
31990 gimple_seq_add_stmt (&gseq, convert_stmt);
31991 gimple_seq_add_stmt (&gseq, return_stmt);
31992 set_bb_seq (new_bb, gseq);
31994 bb1 = new_bb;
31995 e12 = split_block (bb1, if_else_stmt);
31996 bb2 = e12->dest;
31997 e12->flags &= ~EDGE_FALLTHRU;
31998 e12->flags |= EDGE_TRUE_VALUE;
32000 e23 = split_block (bb2, return_stmt);
32002 gimple_set_bb (convert_stmt, bb2);
32003 gimple_set_bb (return_stmt, bb2);
32005 bb3 = e23->dest;
32006 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32008 remove_edge (e23);
32009 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32011 pop_cfun ();
32013 return bb3;
32016 /* This parses the arguments of the "target" attribute on DECL and
32017 determines the right builtin to use to match the platform specification.
32018 It returns the priority value for this version decl. If PREDICATE_LIST
32019 is not NULL, it stores the list of cpu features that need to be checked
32020 before dispatching this function. */
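/* For example (an illustrative sketch), a version declared with
   __attribute__ ((target ("arch=haswell,avx2"))) produces a predicate
   list that checks __builtin_cpu_is ("haswell") and
   __builtin_cpu_supports ("avx2"), and the returned priority is that
   of the strongest capability found, here P_PROC_AVX2.  */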
32022 static unsigned int
32023 get_builtin_code_for_version (tree decl, tree *predicate_list)
32025 tree attrs;
32026 struct cl_target_option cur_target;
32027 tree target_node;
32028 struct cl_target_option *new_target;
32029 const char *arg_str = NULL;
32030 const char *attrs_str = NULL;
32031 char *tok_str = NULL;
32032 char *token;
32034 /* Priority of i386 features, greater value is higher priority. This is
32035 used to decide the order in which function dispatch must happen. For
32036 instance, a version specialized for SSE4.2 should be checked for dispatch
32037 before a version for SSE3, as SSE4.2 implies SSE3. */
32038 enum feature_priority
32040 P_ZERO = 0,
32041 P_MMX,
32042 P_SSE,
32043 P_SSE2,
32044 P_SSE3,
32045 P_SSSE3,
32046 P_PROC_SSSE3,
32047 P_SSE4_A,
32048 P_PROC_SSE4_A,
32049 P_SSE4_1,
32050 P_SSE4_2,
32051 P_PROC_SSE4_2,
32052 P_POPCNT,
32053 P_AES,
32054 P_PCLMUL,
32055 P_AVX,
32056 P_PROC_AVX,
32057 P_BMI,
32058 P_PROC_BMI,
32059 P_FMA4,
32060 P_XOP,
32061 P_PROC_XOP,
32062 P_FMA,
32063 P_PROC_FMA,
32064 P_BMI2,
32065 P_AVX2,
32066 P_PROC_AVX2,
32067 P_AVX512F,
32068 P_PROC_AVX512F
32071 enum feature_priority priority = P_ZERO;
32073 /* These are the target attribute strings for which a dispatcher is
32074 available, from fold_builtin_cpu. */
32076 static struct _feature_list
32078 const char *const name;
32079 const enum feature_priority priority;
32081 const feature_list[] =
32083 {"mmx", P_MMX},
32084 {"sse", P_SSE},
32085 {"sse2", P_SSE2},
32086 {"sse3", P_SSE3},
32087 {"sse4a", P_SSE4_A},
32088 {"ssse3", P_SSSE3},
32089 {"sse4.1", P_SSE4_1},
32090 {"sse4.2", P_SSE4_2},
32091 {"popcnt", P_POPCNT},
32092 {"aes", P_AES},
32093 {"pclmul", P_PCLMUL},
32094 {"avx", P_AVX},
32095 {"bmi", P_BMI},
32096 {"fma4", P_FMA4},
32097 {"xop", P_XOP},
32098 {"fma", P_FMA},
32099 {"bmi2", P_BMI2},
32100 {"avx2", P_AVX2},
32101 {"avx512f", P_AVX512F}
32105 static unsigned int NUM_FEATURES
32106 = sizeof (feature_list) / sizeof (struct _feature_list);
32108 unsigned int i;
32110 tree predicate_chain = NULL_TREE;
32111 tree predicate_decl, predicate_arg;
32113 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32114 gcc_assert (attrs != NULL);
32116 attrs = TREE_VALUE (TREE_VALUE (attrs));
32118 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32119 attrs_str = TREE_STRING_POINTER (attrs);
32121 /* Return priority zero for default function. */
32122 if (strcmp (attrs_str, "default") == 0)
32123 return 0;
32125 /* Handle arch= if specified. For priority, set it to be 1 more than
32126 the best instruction set the processor can handle. For instance, if
32127 there is a version for atom and a version for ssse3 (the highest ISA
32128 priority for atom), the atom version must be checked for dispatch
32129 before the ssse3 version. */
32130 if (strstr (attrs_str, "arch=") != NULL)
32132 cl_target_option_save (&cur_target, &global_options);
32133 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32134 &global_options_set);
32136 gcc_assert (target_node);
32137 new_target = TREE_TARGET_OPTION (target_node);
32138 gcc_assert (new_target);
32140 if (new_target->arch_specified && new_target->arch > 0)
32142 switch (new_target->arch)
32144 case PROCESSOR_CORE2:
32145 arg_str = "core2";
32146 priority = P_PROC_SSSE3;
32147 break;
32148 case PROCESSOR_NEHALEM:
32149 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32150 arg_str = "westmere";
32151 else
32152 /* We translate "arch=corei7" and "arch=nehalem" to
32153 "corei7" so that it will be mapped to M_INTEL_COREI7
32154 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32155 arg_str = "corei7";
32156 priority = P_PROC_SSE4_2;
32157 break;
32158 case PROCESSOR_SANDYBRIDGE:
32159 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32160 arg_str = "ivybridge";
32161 else
32162 arg_str = "sandybridge";
32163 priority = P_PROC_AVX;
32164 break;
32165 case PROCESSOR_HASWELL:
32166 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32167 arg_str = "skylake-avx512";
32168 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32169 arg_str = "skylake";
32170 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32171 arg_str = "broadwell";
32172 else
32173 arg_str = "haswell";
32174 priority = P_PROC_AVX2;
32175 break;
32176 case PROCESSOR_BONNELL:
32177 arg_str = "bonnell";
32178 priority = P_PROC_SSSE3;
32179 break;
32180 case PROCESSOR_KNL:
32181 arg_str = "knl";
32182 priority = P_PROC_AVX512F;
32183 break;
32184 case PROCESSOR_SILVERMONT:
32185 arg_str = "silvermont";
32186 priority = P_PROC_SSE4_2;
32187 break;
32188 case PROCESSOR_AMDFAM10:
32189 arg_str = "amdfam10h";
32190 priority = P_PROC_SSE4_A;
32191 break;
32192 case PROCESSOR_BTVER1:
32193 arg_str = "btver1";
32194 priority = P_PROC_SSE4_A;
32195 break;
32196 case PROCESSOR_BTVER2:
32197 arg_str = "btver2";
32198 priority = P_PROC_BMI;
32199 break;
32200 case PROCESSOR_BDVER1:
32201 arg_str = "bdver1";
32202 priority = P_PROC_XOP;
32203 break;
32204 case PROCESSOR_BDVER2:
32205 arg_str = "bdver2";
32206 priority = P_PROC_FMA;
32207 break;
32208 case PROCESSOR_BDVER3:
32209 arg_str = "bdver3";
32210 priority = P_PROC_FMA;
32211 break;
32212 case PROCESSOR_BDVER4:
32213 arg_str = "bdver4";
32214 priority = P_PROC_AVX2;
32215 break;
32216 case PROCESSOR_ZNVER1:
32217 arg_str = "znver1";
32218 priority = P_PROC_AVX2;
32219 break;
32223 cl_target_option_restore (&global_options, &cur_target);
32225 if (predicate_list && arg_str == NULL)
32227 error_at (DECL_SOURCE_LOCATION (decl),
32228 "No dispatcher found for the versioning attributes");
32229 return 0;
32232 if (predicate_list)
32234 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32235 /* For a C string literal the length includes the trailing NULL. */
32236 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32237 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32238 predicate_chain);
32242 /* Process feature name. */
32243 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32244 strcpy (tok_str, attrs_str);
32245 token = strtok (tok_str, ",");
32246 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32248 while (token != NULL)
32250 /* Do not process "arch=" */
32251 if (strncmp (token, "arch=", 5) == 0)
32253 token = strtok (NULL, ",");
32254 continue;
32256 for (i = 0; i < NUM_FEATURES; ++i)
32258 if (strcmp (token, feature_list[i].name) == 0)
32260 if (predicate_list)
32262 predicate_arg = build_string_literal (
32263 strlen (feature_list[i].name) + 1,
32264 feature_list[i].name);
32265 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32266 predicate_chain);
32268 /* Find the maximum priority feature. */
32269 if (feature_list[i].priority > priority)
32270 priority = feature_list[i].priority;
32272 break;
32275 if (predicate_list && i == NUM_FEATURES)
32277 error_at (DECL_SOURCE_LOCATION (decl),
32278 "No dispatcher found for %s", token);
32279 return 0;
32281 token = strtok (NULL, ",");
32283 free (tok_str);
32285 if (predicate_list && predicate_chain == NULL_TREE)
32287 error_at (DECL_SOURCE_LOCATION (decl),
32288 "No dispatcher found for the versioning attributes : %s",
32289 attrs_str);
32290 return 0;
32292 else if (predicate_list)
32294 predicate_chain = nreverse (predicate_chain);
32295 *predicate_list = predicate_chain;
32298 return priority;
32301 /* This compares the priority of target features in function DECL1
32302 and DECL2. It returns positive value if DECL1 is higher priority,
32303 negative value if DECL2 is higher priority and 0 if they are the
32304 same. */
32306 static int
32307 ix86_compare_version_priority (tree decl1, tree decl2)
32309 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32310 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32312 return (int)priority1 - (int)priority2;
32315 /* V1 and V2 point to function versions with different priorities
32316 based on the target ISA. This function compares their priorities. */
32318 static int
32319 feature_compare (const void *v1, const void *v2)
32321 typedef struct _function_version_info
32323 tree version_decl;
32324 tree predicate_chain;
32325 unsigned int dispatch_priority;
32326 } function_version_info;
32328 const function_version_info c1 = *(const function_version_info *)v1;
32329 const function_version_info c2 = *(const function_version_info *)v2;
32330 return (c2.dispatch_priority - c1.dispatch_priority);
32333 /* This function generates the dispatch function for
32334 multi-versioned functions. DISPATCH_DECL is the function which will
32335 contain the dispatch logic. FNDECLS holds the function choices for
32336 dispatch and is passed as a vector. EMPTY_BB is the basic block pointer
32337 in DISPATCH_DECL in which the dispatch code is generated. */
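/* Schematically, the resolver body that the helpers below build is

     __builtin_cpu_init ();
     if (<predicates of the highest-priority version hold>)
       return <that version>;
     ...
     return <the default version>;

   (an informal sketch; the actual statements are GIMPLE).  */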
32339 static int
32340 dispatch_function_versions (tree dispatch_decl,
32341 void *fndecls_p,
32342 basic_block *empty_bb)
32344 tree default_decl;
32345 gimple *ifunc_cpu_init_stmt;
32346 gimple_seq gseq;
32347 int ix;
32348 tree ele;
32349 vec<tree> *fndecls;
32350 unsigned int num_versions = 0;
32351 unsigned int actual_versions = 0;
32352 unsigned int i;
32354 struct _function_version_info
32356 tree version_decl;
32357 tree predicate_chain;
32358 unsigned int dispatch_priority;
32359 }*function_version_info;
32361 gcc_assert (dispatch_decl != NULL
32362 && fndecls_p != NULL
32363 && empty_bb != NULL);
32365 /* fndecls_p is actually a vector. */
32366 fndecls = static_cast<vec<tree> *> (fndecls_p);
32368 /* There must be at least one version other than the default. */
32369 num_versions = fndecls->length ();
32370 gcc_assert (num_versions >= 2);
32372 function_version_info = (struct _function_version_info *)
32373 XNEWVEC (struct _function_version_info, (num_versions - 1));
32375 /* The first version in the vector is the default decl. */
32376 default_decl = (*fndecls)[0];
32378 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32380 gseq = bb_seq (*empty_bb);
32381 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32382 constructors, so explicitly call __builtin_cpu_init here. */
32383 ifunc_cpu_init_stmt = gimple_build_call_vec (
32384 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32385 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32386 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32387 set_bb_seq (*empty_bb, gseq);
32389 pop_cfun ();
32392 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32394 tree version_decl = ele;
32395 tree predicate_chain = NULL_TREE;
32396 unsigned int priority;
32397 /* Get attribute string, parse it and find the right predicate decl.
32398 The predicate function could be a lengthy combination of many
32399 features, like arch-type and various isa-variants. */
32400 priority = get_builtin_code_for_version (version_decl,
32401 &predicate_chain);
32403 if (predicate_chain == NULL_TREE)
32404 continue;
32406 function_version_info [actual_versions].version_decl = version_decl;
32407 function_version_info [actual_versions].predicate_chain
32408 = predicate_chain;
32409 function_version_info [actual_versions].dispatch_priority = priority;
32410 actual_versions++;
32413 /* Sort the versions according to descending order of dispatch priority. The
32414 priority is based on the ISA. This is not a perfect solution. There
32415 could still be ambiguity. If more than one function version is suitable
32416 to execute, which one should be dispatched? In future, allow the user
32417 to specify a dispatch priority next to the version. */
32418 qsort (function_version_info, actual_versions,
32419 sizeof (struct _function_version_info), feature_compare);
32421 for (i = 0; i < actual_versions; ++i)
32422 *empty_bb = add_condition_to_bb (dispatch_decl,
32423 function_version_info[i].version_decl,
32424 function_version_info[i].predicate_chain,
32425 *empty_bb);
32427 /* Dispatch the default version at the end. */
32428 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32429 NULL, *empty_bb);
32431 free (function_version_info);
32432 return 0;
32435 /* Comparator function to be used in qsort routine to sort attribute
32436 specification strings to "target". */
32438 static int
32439 attr_strcmp (const void *v1, const void *v2)
32441 const char *c1 = *(char *const*)v1;
32442 const char *c2 = *(char *const*)v2;
32443 return strcmp (c1, c2);
32446 /* ARGLIST is the argument to target attribute. This function tokenizes
32447 the comma separated arguments, sorts them and returns a string which
32448 is a unique identifier for the comma separated arguments. It also
32449 replaces non-identifier characters "=,-" with "_". */
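/* For example, the attribute arguments "avx2,arch=haswell" are
   normalized and sorted into the string "arch_haswell_avx2".  */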
32451 static char *
32452 sorted_attr_string (tree arglist)
32454 tree arg;
32455 size_t str_len_sum = 0;
32456 char **args = NULL;
32457 char *attr_str, *ret_str;
32458 char *attr = NULL;
32459 unsigned int argnum = 1;
32460 unsigned int i;
32462 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32464 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32465 size_t len = strlen (str);
32466 str_len_sum += len + 1;
32467 if (arg != arglist)
32468 argnum++;
32469 for (i = 0; i < strlen (str); i++)
32470 if (str[i] == ',')
32471 argnum++;
32474 attr_str = XNEWVEC (char, str_len_sum);
32475 str_len_sum = 0;
32476 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32478 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32479 size_t len = strlen (str);
32480 memcpy (attr_str + str_len_sum, str, len);
32481 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
32482 str_len_sum += len + 1;
32485 /* Replace "=,-" with "_". */
32486 for (i = 0; i < strlen (attr_str); i++)
32487 if (attr_str[i] == '=' || attr_str[i]== '-')
32488 attr_str[i] = '_';
32490 if (argnum == 1)
32491 return attr_str;
32493 args = XNEWVEC (char *, argnum);
32495 i = 0;
32496 attr = strtok (attr_str, ",");
32497 while (attr != NULL)
32499 args[i] = attr;
32500 i++;
32501 attr = strtok (NULL, ",");
32504 qsort (args, argnum, sizeof (char *), attr_strcmp);
32506 ret_str = XNEWVEC (char, str_len_sum);
32507 str_len_sum = 0;
32508 for (i = 0; i < argnum; i++)
32510 size_t len = strlen (args[i]);
32511 memcpy (ret_str + str_len_sum, args[i], len);
32512 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
32513 str_len_sum += len + 1;
32516 XDELETEVEC (args);
32517 XDELETEVEC (attr_str);
32518 return ret_str;
32521 /* This function changes the assembler name for functions that are
32522 versions. If DECL is a function version and has a "target"
32523 attribute, it appends the attribute string to its assembler name. */
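/* For example, a version of a C function foo tagged with
   target ("avx2") gets the assembler name "foo.avx2", while the
   "default" version keeps its original name.  */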
32525 static tree
32526 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32528 tree version_attr;
32529 const char *orig_name, *version_string;
32530 char *attr_str, *assembler_name;
32532 if (DECL_DECLARED_INLINE_P (decl)
32533 && lookup_attribute ("gnu_inline",
32534 DECL_ATTRIBUTES (decl)))
32535 error_at (DECL_SOURCE_LOCATION (decl),
32536 "Function versions cannot be marked as gnu_inline,"
32537 " bodies have to be generated");
32539 if (DECL_VIRTUAL_P (decl)
32540 || DECL_VINDEX (decl))
32541 sorry ("Virtual function multiversioning not supported");
32543 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32545 /* target attribute string cannot be NULL. */
32546 gcc_assert (version_attr != NULL_TREE);
32548 orig_name = IDENTIFIER_POINTER (id);
32549 version_string
32550 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32552 if (strcmp (version_string, "default") == 0)
32553 return id;
32555 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32556 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32558 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32560 /* Allow assembler name to be modified if already set. */
32561 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32562 SET_DECL_RTL (decl, NULL);
32564 tree ret = get_identifier (assembler_name);
32565 XDELETEVEC (attr_str);
32566 XDELETEVEC (assembler_name);
32567 return ret;
32570 /* This function returns true if FN1 and FN2 are versions of the same function,
32571 that is, the target strings of the function decls are different. This assumes
32572 that FN1 and FN2 have the same signature. */
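/* For example (a C++ usage sketch), these declarations are treated as
   versions of one another:

     int foo () __attribute__ ((target ("default")));
     int foo () __attribute__ ((target ("avx2")));  */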
32574 static bool
32575 ix86_function_versions (tree fn1, tree fn2)
32577 tree attr1, attr2;
32578 char *target1, *target2;
32579 bool result;
32581 if (TREE_CODE (fn1) != FUNCTION_DECL
32582 || TREE_CODE (fn2) != FUNCTION_DECL)
32583 return false;
32585 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
32586 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
32588 /* At least one function decl should have the target attribute specified. */
32589 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
32590 return false;
32592 /* Diagnose missing target attribute if one of the decls is already
32593 multi-versioned. */
32594 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
32596 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
32598 if (attr2 != NULL_TREE)
32600 std::swap (fn1, fn2);
32601 attr1 = attr2;
32603 error_at (DECL_SOURCE_LOCATION (fn2),
32604 "missing %<target%> attribute for multi-versioned %D",
32605 fn2);
32606 inform (DECL_SOURCE_LOCATION (fn1),
32607 "previous declaration of %D", fn1);
32608 /* Prevent diagnosing of the same error multiple times. */
32609 DECL_ATTRIBUTES (fn2)
32610 = tree_cons (get_identifier ("target"),
32611 copy_node (TREE_VALUE (attr1)),
32612 DECL_ATTRIBUTES (fn2));
32614 return false;
32617 target1 = sorted_attr_string (TREE_VALUE (attr1));
32618 target2 = sorted_attr_string (TREE_VALUE (attr2));
32620 /* The sorted target strings must be different for fn1 and fn2
32621 to be versions. */
32622 if (strcmp (target1, target2) == 0)
32623 result = false;
32624 else
32625 result = true;
32627 XDELETEVEC (target1);
32628 XDELETEVEC (target2);
32630 return result;
32633 static tree
32634 ix86_mangle_decl_assembler_name (tree decl, tree id)
32636 /* For function version, add the target suffix to the assembler name. */
32637 if (TREE_CODE (decl) == FUNCTION_DECL
32638 && DECL_FUNCTION_VERSIONED (decl))
32639 id = ix86_mangle_function_version_assembler_name (decl, id);
32640 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32641 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32642 #endif
32644 return id;
32647 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
32648 is true, append the full path name of the source file. */
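/* For example, make_name (foo_decl, "resolver", false) returns
   "foo.resolver"; with MAKE_UNIQUE true a file-specific component is
   inserted as well, giving roughly "foo.<unique>.resolver".  */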
32650 static char *
32651 make_name (tree decl, const char *suffix, bool make_unique)
32653 char *global_var_name;
32654 int name_len;
32655 const char *name;
32656 const char *unique_name = NULL;
32658 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
32660 /* Get a unique name that can be used globally without any chances
32661 of collision at link time. */
32662 if (make_unique)
32663 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
32665 name_len = strlen (name) + strlen (suffix) + 2;
32667 if (make_unique)
32668 name_len += strlen (unique_name) + 1;
32669 global_var_name = XNEWVEC (char, name_len);
32671 /* Use '.' to concatenate names as it is demangler friendly. */
32672 if (make_unique)
32673 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
32674 suffix);
32675 else
32676 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
32678 return global_var_name;
32681 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32683 /* Make a dispatcher declaration for the multi-versioned function DECL.
32684 Calls to DECL will be replaced with calls to the dispatcher
32685 by the front-end. Return the decl created. */
32687 static tree
32688 make_dispatcher_decl (const tree decl)
32690 tree func_decl;
32691 char *func_name;
32692 tree fn_type, func_type;
32693 bool is_uniq = false;
32695 if (TREE_PUBLIC (decl) == 0)
32696 is_uniq = true;
32698 func_name = make_name (decl, "ifunc", is_uniq);
32700 fn_type = TREE_TYPE (decl);
32701 func_type = build_function_type (TREE_TYPE (fn_type),
32702 TYPE_ARG_TYPES (fn_type));
32704 func_decl = build_fn_decl (func_name, func_type);
32705 XDELETEVEC (func_name);
32706 TREE_USED (func_decl) = 1;
32707 DECL_CONTEXT (func_decl) = NULL_TREE;
32708 DECL_INITIAL (func_decl) = error_mark_node;
32709 DECL_ARTIFICIAL (func_decl) = 1;
32710 /* Mark this func as external, the resolver will flip it again if
32711 it gets generated. */
32712 DECL_EXTERNAL (func_decl) = 1;
32713 /* This decl will become an IFUNC; IFUNCs have to be externally visible. */
32714 TREE_PUBLIC (func_decl) = 1;
32716 return func_decl;
32719 #endif
32721 /* Returns true if DECL is multi-versioned and is the default function,
32722 that is, it is not tagged with a target-specific optimization. */
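/* In other words, it is the version declared with
   __attribute__ ((target ("default"))).  */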
32724 static bool
32725 is_function_default_version (const tree decl)
32727 if (TREE_CODE (decl) != FUNCTION_DECL
32728 || !DECL_FUNCTION_VERSIONED (decl))
32729 return false;
32730 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32731 gcc_assert (attr);
32732 attr = TREE_VALUE (TREE_VALUE (attr));
32733 return (TREE_CODE (attr) == STRING_CST
32734 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
32737 /* Make a dispatcher declaration for the multi-versioned function DECL.
32738 Calls to DECL will be replaced with calls to the dispatcher
32739 by the front-end. Returns the decl of the dispatcher function. */
32741 static tree
32742 ix86_get_function_versions_dispatcher (void *decl)
32744 tree fn = (tree) decl;
32745 struct cgraph_node *node = NULL;
32746 struct cgraph_node *default_node = NULL;
32747 struct cgraph_function_version_info *node_v = NULL;
32748 struct cgraph_function_version_info *first_v = NULL;
32750 tree dispatch_decl = NULL;
32752 struct cgraph_function_version_info *default_version_info = NULL;
32754 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32756 node = cgraph_node::get (fn);
32757 gcc_assert (node != NULL);
32759 node_v = node->function_version ();
32760 gcc_assert (node_v != NULL);
32762 if (node_v->dispatcher_resolver != NULL)
32763 return node_v->dispatcher_resolver;
32765 /* Find the default version and make it the first node. */
32766 first_v = node_v;
32767 /* Go to the beginning of the chain. */
32768 while (first_v->prev != NULL)
32769 first_v = first_v->prev;
32770 default_version_info = first_v;
32771 while (default_version_info != NULL)
32773 if (is_function_default_version
32774 (default_version_info->this_node->decl))
32775 break;
32776 default_version_info = default_version_info->next;
32779 /* If there is no default node, just return NULL. */
32780 if (default_version_info == NULL)
32781 return NULL;
32783 /* Make default info the first node. */
32784 if (first_v != default_version_info)
32786 default_version_info->prev->next = default_version_info->next;
32787 if (default_version_info->next)
32788 default_version_info->next->prev = default_version_info->prev;
32789 first_v->prev = default_version_info;
32790 default_version_info->next = first_v;
32791 default_version_info->prev = NULL;
32794 default_node = default_version_info->this_node;
32796 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32797 if (targetm.has_ifunc_p ())
32799 struct cgraph_function_version_info *it_v = NULL;
32800 struct cgraph_node *dispatcher_node = NULL;
32801 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32803 /* Right now, the dispatching is done via ifunc. */
32804 dispatch_decl = make_dispatcher_decl (default_node->decl);
32806 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32807 gcc_assert (dispatcher_node != NULL);
32808 dispatcher_node->dispatcher_function = 1;
32809 dispatcher_version_info
32810 = dispatcher_node->insert_new_function_version ();
32811 dispatcher_version_info->next = default_version_info;
32812 dispatcher_node->definition = 1;
32814 /* Set the dispatcher for all the versions. */
32815 it_v = default_version_info;
32816 while (it_v != NULL)
32818 it_v->dispatcher_resolver = dispatch_decl;
32819 it_v = it_v->next;
32822 else
32823 #endif
32825 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32826 "multiversioning needs ifunc which is not supported "
32827 "on this target");
32830 return dispatch_decl;
32833 /* Make the resolver function decl to dispatch the versions of
32834 a multi-versioned function, DEFAULT_DECL. Create an
32835 empty basic block in the resolver and store the pointer in
32836 EMPTY_BB. Return the decl of the resolver function. */
32838 static tree
32839 make_resolver_func (const tree default_decl,
32840 const tree dispatch_decl,
32841 basic_block *empty_bb)
32843 char *resolver_name;
32844 tree decl, type, decl_name, t;
32845 bool is_uniq = false;
32847 /* IFUNCs have to be globally visible. So, if the default_decl is
32848 not, then the name of the IFUNC should be made unique. */
32849 if (TREE_PUBLIC (default_decl) == 0)
32850 is_uniq = true;
32852 /* Append the filename to the resolver function if the versions are
32853 not externally visible. This is because the resolver function has
32854 to be externally visible for the loader to find it. So, appending
32855 the filename will prevent conflicts with a resolver function from
32856 another module which is based on the same version name. */
32857 resolver_name = make_name (default_decl, "resolver", is_uniq);
32859 /* The resolver function should return a (void *). */
32860 type = build_function_type_list (ptr_type_node, NULL_TREE);
32862 decl = build_fn_decl (resolver_name, type);
32863 decl_name = get_identifier (resolver_name);
32864 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32866 DECL_NAME (decl) = decl_name;
32867 TREE_USED (decl) = 1;
32868 DECL_ARTIFICIAL (decl) = 1;
32869 DECL_IGNORED_P (decl) = 0;
32870 /* IFUNC resolvers have to be externally visible. */
32871 TREE_PUBLIC (decl) = 1;
32872 DECL_UNINLINABLE (decl) = 1;
32874 /* Resolver is not external, body is generated. */
32875 DECL_EXTERNAL (decl) = 0;
32876 DECL_EXTERNAL (dispatch_decl) = 0;
32878 DECL_CONTEXT (decl) = NULL_TREE;
32879 DECL_INITIAL (decl) = make_node (BLOCK);
32880 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32882 if (DECL_COMDAT_GROUP (default_decl)
32883 || TREE_PUBLIC (default_decl))
32885 /* In this case, each translation unit with a call to this
32886 versioned function will put out a resolver. Ensure it
32887 is comdat to keep just one copy. */
32888 DECL_COMDAT (decl) = 1;
32889 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32891 /* Build result decl and add to function_decl. */
32892 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32893 DECL_ARTIFICIAL (t) = 1;
32894 DECL_IGNORED_P (t) = 1;
32895 DECL_RESULT (decl) = t;
32897 gimplify_function_tree (decl);
32898 push_cfun (DECL_STRUCT_FUNCTION (decl));
32899 *empty_bb = init_lowered_empty_function (decl, false, 0);
32901 cgraph_node::add_new_function (decl, true);
32902 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32904 pop_cfun ();
32906 gcc_assert (dispatch_decl != NULL);
32907 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
32908 DECL_ATTRIBUTES (dispatch_decl)
32909 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
32911 /* Create the alias for dispatch to resolver here. */
32912 /*cgraph_create_function_alias (dispatch_decl, decl);*/
32913 cgraph_node::create_same_body_alias (dispatch_decl, decl);
32914 XDELETEVEC (resolver_name);
32915 return decl;
32918 /* Generate the dispatching code body to dispatch multi-versioned function
32919 DECL. The target hook is called to process the "target" attributes and
32920 provide the code to dispatch the right function at run-time. NODE points
32921 to the dispatcher decl whose body will be created. */
32923 static tree
32924 ix86_generate_version_dispatcher_body (void *node_p)
32926 tree resolver_decl;
32927 basic_block empty_bb;
32928 tree default_ver_decl;
32929 struct cgraph_node *versn;
32930 struct cgraph_node *node;
32932 struct cgraph_function_version_info *node_version_info = NULL;
32933 struct cgraph_function_version_info *versn_info = NULL;
32935 node = (cgraph_node *)node_p;
32937 node_version_info = node->function_version ();
32938 gcc_assert (node->dispatcher_function
32939 && node_version_info != NULL);
32941 if (node_version_info->dispatcher_resolver)
32942 return node_version_info->dispatcher_resolver;
32944 /* The first version in the chain corresponds to the default version. */
32945 default_ver_decl = node_version_info->next->this_node->decl;
32947 /* node is going to be an alias, so remove the finalized bit. */
32948 node->definition = false;
32950 resolver_decl = make_resolver_func (default_ver_decl,
32951 node->decl, &empty_bb);
32953 node_version_info->dispatcher_resolver = resolver_decl;
32955 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32957 auto_vec<tree, 2> fn_ver_vec;
32959 for (versn_info = node_version_info->next; versn_info;
32960 versn_info = versn_info->next)
32962 versn = versn_info->this_node;
32963 /* Check for virtual functions here again, as by this time it should
32964 have been determined if this function needs a vtable index or
32965 not. This happens for methods in derived classes that override
32966 virtual methods in base classes but are not explicitly marked as
32967 virtual. */
32968 if (DECL_VINDEX (versn->decl))
32969 sorry ("Virtual function multiversioning not supported");
32971 fn_ver_vec.safe_push (versn->decl);
32974 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32975 cgraph_edge::rebuild_edges ();
32976 pop_cfun ();
32977 return resolver_decl;
32979 /* This builds the processor_model struct type defined in
32980 libgcc/config/i386/cpuinfo.c */
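/* The resulting type corresponds to the libgcc-side declaration, which
   is roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */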
32982 static tree
32983 build_processor_model_struct (void)
32985 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32986 "__cpu_features"};
32987 tree field = NULL_TREE, field_chain = NULL_TREE;
32988 int i;
32989 tree type = make_node (RECORD_TYPE);
32991 /* The first 3 fields are unsigned int. */
32992 for (i = 0; i < 3; ++i)
32994 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32995 get_identifier (field_name[i]), unsigned_type_node);
32996 if (field_chain != NULL_TREE)
32997 DECL_CHAIN (field) = field_chain;
32998 field_chain = field;
33001 /* The last field is an array of unsigned integers of size one. */
33002 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33003 get_identifier (field_name[3]),
33004 build_array_type (unsigned_type_node,
33005 build_index_type (size_one_node)));
33006 if (field_chain != NULL_TREE)
33007 DECL_CHAIN (field) = field_chain;
33008 field_chain = field;
33010 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33011 return type;
33014 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
33016 static tree
33017 make_var_decl (tree type, const char *name)
33019 tree new_decl;
33021 new_decl = build_decl (UNKNOWN_LOCATION,
33022 VAR_DECL,
33023 get_identifier (name),
33024 type);
33026 DECL_EXTERNAL (new_decl) = 1;
33027 TREE_STATIC (new_decl) = 1;
33028 TREE_PUBLIC (new_decl) = 1;
33029 DECL_INITIAL (new_decl) = 0;
33030 DECL_ARTIFICIAL (new_decl) = 0;
33031 DECL_PRESERVE_P (new_decl) = 1;
33033 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33034 assemble_variable (new_decl, 0, 0, 0);
33036 return new_decl;
33039 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is
33040 folded into a check against the data defined in libgcc/config/i386/cpuinfo.c. */
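/* For example, __builtin_cpu_is ("intel") is folded into roughly
     (int) (__cpu_model.__cpu_vendor == M_INTEL)
   and __builtin_cpu_supports ("avx2") into roughly
     (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2)).  */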
33042 static tree
33043 fold_builtin_cpu (tree fndecl, tree *args)
33045 unsigned int i;
33046 enum ix86_builtins fn_code = (enum ix86_builtins)
33047 DECL_FUNCTION_CODE (fndecl);
33048 tree param_string_cst = NULL;
33050 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33051 enum processor_features
33053 F_CMOV = 0,
33054 F_MMX,
33055 F_POPCNT,
33056 F_SSE,
33057 F_SSE2,
33058 F_SSE3,
33059 F_SSSE3,
33060 F_SSE4_1,
33061 F_SSE4_2,
33062 F_AVX,
33063 F_AVX2,
33064 F_SSE4_A,
33065 F_FMA4,
33066 F_XOP,
33067 F_FMA,
33068 F_AVX512F,
33069 F_BMI,
33070 F_BMI2,
33071 F_AES,
33072 F_PCLMUL,
33073 F_AVX512VL,
33074 F_AVX512BW,
33075 F_AVX512DQ,
33076 F_AVX512CD,
33077 F_AVX512ER,
33078 F_AVX512PF,
33079 F_AVX512VBMI,
33080 F_AVX512IFMA,
33081 F_MAX
33084 /* These are the values for vendor types, cpu types and subtypes
33085 in cpuinfo.c. Cpu types and subtypes have the corresponding
33086 start value subtracted from them before use. */
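/* For example, when folding __builtin_cpu_is ("corei7"), the value
   compared against __cpu_model.__cpu_type is
   M_INTEL_COREI7 - M_CPU_TYPE_START.  */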
33087 enum processor_model
33089 M_INTEL = 1,
33090 M_AMD,
33091 M_CPU_TYPE_START,
33092 M_INTEL_BONNELL,
33093 M_INTEL_CORE2,
33094 M_INTEL_COREI7,
33095 M_AMDFAM10H,
33096 M_AMDFAM15H,
33097 M_INTEL_SILVERMONT,
33098 M_INTEL_KNL,
33099 M_AMD_BTVER1,
33100 M_AMD_BTVER2,
33101 M_CPU_SUBTYPE_START,
33102 M_INTEL_COREI7_NEHALEM,
33103 M_INTEL_COREI7_WESTMERE,
33104 M_INTEL_COREI7_SANDYBRIDGE,
33105 M_AMDFAM10H_BARCELONA,
33106 M_AMDFAM10H_SHANGHAI,
33107 M_AMDFAM10H_ISTANBUL,
33108 M_AMDFAM15H_BDVER1,
33109 M_AMDFAM15H_BDVER2,
33110 M_AMDFAM15H_BDVER3,
33111 M_AMDFAM15H_BDVER4,
33112 M_AMDFAM17H_ZNVER1,
33113 M_INTEL_COREI7_IVYBRIDGE,
33114 M_INTEL_COREI7_HASWELL,
33115 M_INTEL_COREI7_BROADWELL,
33116 M_INTEL_COREI7_SKYLAKE,
33117 M_INTEL_COREI7_SKYLAKE_AVX512
33120 static struct _arch_names_table
33122 const char *const name;
33123 const enum processor_model model;
33125 const arch_names_table[] =
33127 {"amd", M_AMD},
33128 {"intel", M_INTEL},
33129 {"atom", M_INTEL_BONNELL},
33130 {"slm", M_INTEL_SILVERMONT},
33131 {"core2", M_INTEL_CORE2},
33132 {"corei7", M_INTEL_COREI7},
33133 {"nehalem", M_INTEL_COREI7_NEHALEM},
33134 {"westmere", M_INTEL_COREI7_WESTMERE},
33135 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33136 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33137 {"haswell", M_INTEL_COREI7_HASWELL},
33138 {"broadwell", M_INTEL_COREI7_BROADWELL},
33139 {"skylake", M_INTEL_COREI7_SKYLAKE},
33140 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33141 {"bonnell", M_INTEL_BONNELL},
33142 {"silvermont", M_INTEL_SILVERMONT},
33143 {"knl", M_INTEL_KNL},
33144 {"amdfam10h", M_AMDFAM10H},
33145 {"barcelona", M_AMDFAM10H_BARCELONA},
33146 {"shanghai", M_AMDFAM10H_SHANGHAI},
33147 {"istanbul", M_AMDFAM10H_ISTANBUL},
33148 {"btver1", M_AMD_BTVER1},
33149 {"amdfam15h", M_AMDFAM15H},
33150 {"bdver1", M_AMDFAM15H_BDVER1},
33151 {"bdver2", M_AMDFAM15H_BDVER2},
33152 {"bdver3", M_AMDFAM15H_BDVER3},
33153 {"bdver4", M_AMDFAM15H_BDVER4},
33154 {"btver2", M_AMD_BTVER2},
33155 {"znver1", M_AMDFAM17H_ZNVER1},
33158 static struct _isa_names_table
33160 const char *const name;
33161 const enum processor_features feature;
33163 const isa_names_table[] =
33165 {"cmov", F_CMOV},
33166 {"mmx", F_MMX},
33167 {"popcnt", F_POPCNT},
33168 {"sse", F_SSE},
33169 {"sse2", F_SSE2},
33170 {"sse3", F_SSE3},
33171 {"ssse3", F_SSSE3},
33172 {"sse4a", F_SSE4_A},
33173 {"sse4.1", F_SSE4_1},
33174 {"sse4.2", F_SSE4_2},
33175 {"avx", F_AVX},
33176 {"fma4", F_FMA4},
33177 {"xop", F_XOP},
33178 {"fma", F_FMA},
33179 {"avx2", F_AVX2},
33180 {"avx512f", F_AVX512F},
33181 {"bmi", F_BMI},
33182 {"bmi2", F_BMI2},
33183 {"aes", F_AES},
33184 {"pclmul", F_PCLMUL},
33185 {"avx512vl",F_AVX512VL},
33186 {"avx512bw",F_AVX512BW},
33187 {"avx512dq",F_AVX512DQ},
33188 {"avx512cd",F_AVX512CD},
33189 {"avx512er",F_AVX512ER},
33190 {"avx512pf",F_AVX512PF},
33191 {"avx512vbmi",F_AVX512VBMI},
33192 {"avx512ifma",F_AVX512IFMA},
33195 tree __processor_model_type = build_processor_model_struct ();
33196 tree __cpu_model_var = make_var_decl (__processor_model_type,
33197 "__cpu_model");
33200 varpool_node::add (__cpu_model_var);
33202 gcc_assert ((args != NULL) && (*args != NULL));
33204 param_string_cst = *args;
33205 while (param_string_cst
33206 && TREE_CODE (param_string_cst) != STRING_CST)
33208 /* *args must be an expr that can contain other EXPRs leading to a
33209 STRING_CST. */
33210 if (!EXPR_P (param_string_cst))
33212 error ("Parameter to builtin must be a string constant or literal");
33213 return integer_zero_node;
33215 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33218 gcc_assert (param_string_cst);
33220 if (fn_code == IX86_BUILTIN_CPU_IS)
33222 tree ref;
33223 tree field;
33224 tree final;
33226 unsigned int field_val = 0;
33227 unsigned int NUM_ARCH_NAMES
33228 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33230 for (i = 0; i < NUM_ARCH_NAMES; i++)
33231 if (strcmp (arch_names_table[i].name,
33232 TREE_STRING_POINTER (param_string_cst)) == 0)
33233 break;
33235 if (i == NUM_ARCH_NAMES)
33237 error ("Parameter to builtin not valid: %s",
33238 TREE_STRING_POINTER (param_string_cst));
33239 return integer_zero_node;
33242 field = TYPE_FIELDS (__processor_model_type);
33243 field_val = arch_names_table[i].model;
33245 /* CPU types are stored in the next field. */
33246 if (field_val > M_CPU_TYPE_START
33247 && field_val < M_CPU_SUBTYPE_START)
33249 field = DECL_CHAIN (field);
33250 field_val -= M_CPU_TYPE_START;
33253 /* CPU subtypes are stored two fields further on, in __cpu_subtype. */
33254 if (field_val > M_CPU_SUBTYPE_START)
33256 field = DECL_CHAIN (DECL_CHAIN (field));
33257 field_val -= M_CPU_SUBTYPE_START;
33260 /* Get the appropriate field in __cpu_model. */
33261 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33262 field, NULL_TREE);
33264 /* Check the value. */
33265 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33266 build_int_cstu (unsigned_type_node, field_val));
33267 return build1 (CONVERT_EXPR, integer_type_node, final);
33269 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33271 tree ref;
33272 tree array_elt;
33273 tree field;
33274 tree final;
33276 unsigned int field_val = 0;
33277 unsigned int NUM_ISA_NAMES
33278 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33280 for (i = 0; i < NUM_ISA_NAMES; i++)
33281 if (strcmp (isa_names_table[i].name,
33282 TREE_STRING_POINTER (param_string_cst)) == 0)
33283 break;
33285 if (i == NUM_ISA_NAMES)
33287 error ("Parameter to builtin not valid: %s",
33288 TREE_STRING_POINTER (param_string_cst));
33289 return integer_zero_node;
33292 field = TYPE_FIELDS (__processor_model_type);
33293 /* Get the last field, which is __cpu_features. */
33294 while (DECL_CHAIN (field))
33295 field = DECL_CHAIN (field);
33297 /* Get the appropriate field: __cpu_model.__cpu_features */
33298 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33299 field, NULL_TREE);
33301 /* Access the 0th element of __cpu_features array. */
33302 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33303 integer_zero_node, NULL_TREE, NULL_TREE);
33305 field_val = (1 << isa_names_table[i].feature);
33306 /* Return __cpu_model.__cpu_features[0] & field_val */
33307 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33308 build_int_cstu (unsigned_type_node, field_val));
33309 return build1 (CONVERT_EXPR, integer_type_node, final);
33311 gcc_unreachable ();
33314 static tree
33315 ix86_fold_builtin (tree fndecl, int n_args,
33316 tree *args, bool ignore ATTRIBUTE_UNUSED)
33318 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33320 enum ix86_builtins fn_code = (enum ix86_builtins)
33321 DECL_FUNCTION_CODE (fndecl);
33322 switch (fn_code)
33324 case IX86_BUILTIN_CPU_IS:
33325 case IX86_BUILTIN_CPU_SUPPORTS:
33326 gcc_assert (n_args == 1);
33327 return fold_builtin_cpu (fndecl, args);
33329 case IX86_BUILTIN_NANQ:
33330 case IX86_BUILTIN_NANSQ:
33332 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33333 const char *str = c_getstr (*args);
33334 int quiet = fn_code == IX86_BUILTIN_NANQ;
33335 REAL_VALUE_TYPE real;
33337 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33338 return build_real (type, real);
33339 return NULL_TREE;
33342 default:
33343 break;
33347 #ifdef SUBTARGET_FOLD_BUILTIN
33348 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33349 #endif
33351 return NULL_TREE;
33354 /* Make builtins to detect cpu type and features supported. NAME is
33355 the builtin name, CODE is the builtin code, and FTYPE is the function
33356 type of the builtin. */
33358 static void
33359 make_cpu_type_builtin (const char* name, int code,
33360 enum ix86_builtin_func_type ftype, bool is_const)
33362 tree decl;
33363 tree type;
33365 type = ix86_get_builtin_func_type (ftype);
33366 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33367 NULL, NULL_TREE);
33368 gcc_assert (decl != NULL_TREE);
33369 ix86_builtins[(int) code] = decl;
33370 TREE_READONLY (decl) = is_const;
33373 /* Make builtins to get CPU type and features supported. The created
33374 builtins are:
33376 __builtin_cpu_init (), to detect cpu type and features,
33377 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33378 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>. */
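/* For example, user code can pick a code path at run time with
   something like

     if (__builtin_cpu_supports ("avx2"))
       do_avx2 ();
     else if (__builtin_cpu_is ("bonnell"))
       do_atom ();
     else
       do_generic ();

   (an illustrative sketch; do_avx2, do_atom and do_generic are
   hypothetical user functions).  */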
33381 static void
33382 ix86_init_platform_type_builtins (void)
33384 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33385 INT_FTYPE_VOID, false);
33386 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33387 INT_FTYPE_PCCHAR, true);
33388 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33389 INT_FTYPE_PCCHAR, true);
33392 /* Internal method for ix86_init_builtins: register the ms_abi and sysv_abi va_list builtins for 64-bit targets. */
33394 static void
33395 ix86_init_builtins_va_builtins_abi (void)
33397 tree ms_va_ref, sysv_va_ref;
33398 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33399 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33400 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33401 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33403 if (!TARGET_64BIT)
33404 return;
33405 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33406 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33407 ms_va_ref = build_reference_type (ms_va_list_type_node);
33408 sysv_va_ref =
33409 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33411 fnvoid_va_end_ms =
33412 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33413 fnvoid_va_start_ms =
33414 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33415 fnvoid_va_end_sysv =
33416 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33417 fnvoid_va_start_sysv =
33418 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33419 NULL_TREE);
33420 fnvoid_va_copy_ms =
33421 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33422 NULL_TREE);
33423 fnvoid_va_copy_sysv =
33424 build_function_type_list (void_type_node, sysv_va_ref,
33425 sysv_va_ref, NULL_TREE);
33427 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33428 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33429 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33430 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33431 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33432 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33433 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33434 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33435 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33436 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33437 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33438 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33441 static void
33442 ix86_init_builtin_types (void)
33444 tree float80_type_node, const_string_type_node;
33446 /* The __float80 type. */
33447 float80_type_node = long_double_type_node;
33448 if (TYPE_MODE (float80_type_node) != XFmode)
33450 if (float64x_type_node != NULL_TREE
33451 && TYPE_MODE (float64x_type_node) == XFmode)
33452 float80_type_node = float64x_type_node;
33453 else
33455 /* The __float80 type. */
33456 float80_type_node = make_node (REAL_TYPE);
33458 TYPE_PRECISION (float80_type_node) = 80;
33459 layout_type (float80_type_node);
33462 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33464 /* The __float128 type. The node has already been created as
33465 _Float128, so we only need to register the __float128 name for
33466 it. */
33467 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33469 const_string_type_node
33470 = build_pointer_type (build_qualified_type
33471 (char_type_node, TYPE_QUAL_CONST));
33473 /* This macro is built by i386-builtin-types.awk. */
33474 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33477 static void
33478 ix86_init_builtins (void)
33480 tree ftype, decl;
33482 ix86_init_builtin_types ();
33484 /* Builtins to get CPU type and features. */
33485 ix86_init_platform_type_builtins ();
33487 /* TFmode support builtins. */
33488 def_builtin_const (0, "__builtin_infq",
33489 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33490 def_builtin_const (0, "__builtin_huge_valq",
33491 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33493 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33494 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33495 BUILT_IN_MD, "nanq", NULL_TREE);
33496 TREE_READONLY (decl) = 1;
33497 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33499 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33500 BUILT_IN_MD, "nansq", NULL_TREE);
33501 TREE_READONLY (decl) = 1;
33502 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33504 /* We will expand them to a normal call if SSE isn't available, since
33505 they are used by libgcc. */
33506 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33507 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33508 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33509 TREE_READONLY (decl) = 1;
33510 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33512 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33513 decl = add_builtin_function ("__builtin_copysignq", ftype,
33514 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33515 "__copysigntf3", NULL_TREE);
33516 TREE_READONLY (decl) = 1;
33517 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33519 ix86_init_tm_builtins ();
33520 ix86_init_mmx_sse_builtins ();
33521 ix86_init_mpx_builtins ();
33523 if (TARGET_LP64)
33524 ix86_init_builtins_va_builtins_abi ();
33526 #ifdef SUBTARGET_INIT_BUILTINS
33527 SUBTARGET_INIT_BUILTINS;
33528 #endif
33531 /* Return the ix86 builtin for CODE. */
33533 static tree
33534 ix86_builtin_decl (unsigned code, bool)
33536 if (code >= IX86_BUILTIN_MAX)
33537 return error_mark_node;
33539 return ix86_builtins[code];
33542 /* Errors in the source file can cause expand_expr to return const0_rtx
33543 where we expect a vector. To avoid crashing, use one of the vector
33544 clear instructions. */
33545 static rtx
33546 safe_vector_operand (rtx x, machine_mode mode)
33548 if (x == const0_rtx)
33549 x = CONST0_RTX (mode);
33550 return x;
33553 /* Fix up modeless constants to fit the required mode. */
33554 static rtx
33555 fixup_modeless_constant (rtx x, machine_mode mode)
33557 if (GET_MODE (x) == VOIDmode)
33558 x = convert_to_mode (mode, x, 1);
33559 return x;
33562 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33564 static rtx
33565 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33567 rtx pat;
33568 tree arg0 = CALL_EXPR_ARG (exp, 0);
33569 tree arg1 = CALL_EXPR_ARG (exp, 1);
33570 rtx op0 = expand_normal (arg0);
33571 rtx op1 = expand_normal (arg1);
33572 machine_mode tmode = insn_data[icode].operand[0].mode;
33573 machine_mode mode0 = insn_data[icode].operand[1].mode;
33574 machine_mode mode1 = insn_data[icode].operand[2].mode;
33576 if (VECTOR_MODE_P (mode0))
33577 op0 = safe_vector_operand (op0, mode0);
33578 if (VECTOR_MODE_P (mode1))
33579 op1 = safe_vector_operand (op1, mode1);
33581 if (optimize || !target
33582 || GET_MODE (target) != tmode
33583 || !insn_data[icode].operand[0].predicate (target, tmode))
33584 target = gen_reg_rtx (tmode);
33586 if (GET_MODE (op1) == SImode && mode1 == TImode)
33588 rtx x = gen_reg_rtx (V4SImode);
33589 emit_insn (gen_sse2_loadd (x, op1));
33590 op1 = gen_lowpart (TImode, x);
33593 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33594 op0 = copy_to_mode_reg (mode0, op0);
33595 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33596 op1 = copy_to_mode_reg (mode1, op1);
33598 pat = GEN_FCN (icode) (target, op0, op1);
33599 if (! pat)
33600 return 0;
33602 emit_insn (pat);
33604 return target;
33607 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33609 static rtx
33610 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33611 enum ix86_builtin_func_type m_type,
33612 enum rtx_code sub_code)
33614 rtx pat;
33615 int i;
33616 int nargs;
33617 bool comparison_p = false;
33618 bool tf_p = false;
33619 bool last_arg_constant = false;
33620 int num_memory = 0;
33621 struct {
33622 rtx op;
33623 machine_mode mode;
33624 } args[4];
33626 machine_mode tmode = insn_data[icode].operand[0].mode;
33628 switch (m_type)
33630 case MULTI_ARG_4_DF2_DI_I:
33631 case MULTI_ARG_4_DF2_DI_I1:
33632 case MULTI_ARG_4_SF2_SI_I:
33633 case MULTI_ARG_4_SF2_SI_I1:
33634 nargs = 4;
33635 last_arg_constant = true;
33636 break;
33638 case MULTI_ARG_3_SF:
33639 case MULTI_ARG_3_DF:
33640 case MULTI_ARG_3_SF2:
33641 case MULTI_ARG_3_DF2:
33642 case MULTI_ARG_3_DI:
33643 case MULTI_ARG_3_SI:
33644 case MULTI_ARG_3_SI_DI:
33645 case MULTI_ARG_3_HI:
33646 case MULTI_ARG_3_HI_SI:
33647 case MULTI_ARG_3_QI:
33648 case MULTI_ARG_3_DI2:
33649 case MULTI_ARG_3_SI2:
33650 case MULTI_ARG_3_HI2:
33651 case MULTI_ARG_3_QI2:
33652 nargs = 3;
33653 break;
33655 case MULTI_ARG_2_SF:
33656 case MULTI_ARG_2_DF:
33657 case MULTI_ARG_2_DI:
33658 case MULTI_ARG_2_SI:
33659 case MULTI_ARG_2_HI:
33660 case MULTI_ARG_2_QI:
33661 nargs = 2;
33662 break;
33664 case MULTI_ARG_2_DI_IMM:
33665 case MULTI_ARG_2_SI_IMM:
33666 case MULTI_ARG_2_HI_IMM:
33667 case MULTI_ARG_2_QI_IMM:
33668 nargs = 2;
33669 last_arg_constant = true;
33670 break;
33672 case MULTI_ARG_1_SF:
33673 case MULTI_ARG_1_DF:
33674 case MULTI_ARG_1_SF2:
33675 case MULTI_ARG_1_DF2:
33676 case MULTI_ARG_1_DI:
33677 case MULTI_ARG_1_SI:
33678 case MULTI_ARG_1_HI:
33679 case MULTI_ARG_1_QI:
33680 case MULTI_ARG_1_SI_DI:
33681 case MULTI_ARG_1_HI_DI:
33682 case MULTI_ARG_1_HI_SI:
33683 case MULTI_ARG_1_QI_DI:
33684 case MULTI_ARG_1_QI_SI:
33685 case MULTI_ARG_1_QI_HI:
33686 nargs = 1;
33687 break;
33689 case MULTI_ARG_2_DI_CMP:
33690 case MULTI_ARG_2_SI_CMP:
33691 case MULTI_ARG_2_HI_CMP:
33692 case MULTI_ARG_2_QI_CMP:
33693 nargs = 2;
33694 comparison_p = true;
33695 break;
33697 case MULTI_ARG_2_SF_TF:
33698 case MULTI_ARG_2_DF_TF:
33699 case MULTI_ARG_2_DI_TF:
33700 case MULTI_ARG_2_SI_TF:
33701 case MULTI_ARG_2_HI_TF:
33702 case MULTI_ARG_2_QI_TF:
33703 nargs = 2;
33704 tf_p = true;
33705 break;
33707 default:
33708 gcc_unreachable ();
33711 if (optimize || !target
33712 || GET_MODE (target) != tmode
33713 || !insn_data[icode].operand[0].predicate (target, tmode))
33714 target = gen_reg_rtx (tmode);
33716 gcc_assert (nargs <= 4);
33718 for (i = 0; i < nargs; i++)
33720 tree arg = CALL_EXPR_ARG (exp, i);
33721 rtx op = expand_normal (arg);
33722 int adjust = (comparison_p) ? 1 : 0;
33723 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33725 if (last_arg_constant && i == nargs - 1)
33727 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33729 enum insn_code new_icode = icode;
33730 switch (icode)
33732 case CODE_FOR_xop_vpermil2v2df3:
33733 case CODE_FOR_xop_vpermil2v4sf3:
33734 case CODE_FOR_xop_vpermil2v4df3:
33735 case CODE_FOR_xop_vpermil2v8sf3:
33736 error ("the last argument must be a 2-bit immediate");
33737 return gen_reg_rtx (tmode);
33738 case CODE_FOR_xop_rotlv2di3:
33739 new_icode = CODE_FOR_rotlv2di3;
33740 goto xop_rotl;
33741 case CODE_FOR_xop_rotlv4si3:
33742 new_icode = CODE_FOR_rotlv4si3;
33743 goto xop_rotl;
33744 case CODE_FOR_xop_rotlv8hi3:
33745 new_icode = CODE_FOR_rotlv8hi3;
33746 goto xop_rotl;
33747 case CODE_FOR_xop_rotlv16qi3:
33748 new_icode = CODE_FOR_rotlv16qi3;
33749 xop_rotl:
33750 if (CONST_INT_P (op))
33752 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33753 op = GEN_INT (INTVAL (op) & mask);
33754 gcc_checking_assert
33755 (insn_data[icode].operand[i + 1].predicate (op, mode));
33757 else
33759 gcc_checking_assert
33760 (nargs == 2
33761 && insn_data[new_icode].operand[0].mode == tmode
33762 && insn_data[new_icode].operand[1].mode == tmode
33763 && insn_data[new_icode].operand[2].mode == mode
33764 && insn_data[new_icode].operand[0].predicate
33765 == insn_data[icode].operand[0].predicate
33766 && insn_data[new_icode].operand[1].predicate
33767 == insn_data[icode].operand[1].predicate);
33768 icode = new_icode;
33769 goto non_constant;
33771 break;
33772 default:
33773 gcc_unreachable ();
33777 else
33779 non_constant:
33780 if (VECTOR_MODE_P (mode))
33781 op = safe_vector_operand (op, mode);
33783 /* If we aren't optimizing, only allow one memory operand to be
33784 generated. */
33785 if (memory_operand (op, mode))
33786 num_memory++;
33788 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
33790 if (optimize
33791 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
33792 || num_memory > 1)
33793 op = force_reg (mode, op);
33796 args[i].op = op;
33797 args[i].mode = mode;
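/* Emit the pattern.  In the two-operand case, tf_p builtins pass
   sub_code as an extra immediate operand, and comparison builtins pass
   a (sub_code op0 op1) rtx in addition to the operands themselves.  */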
33800 switch (nargs)
33802 case 1:
33803 pat = GEN_FCN (icode) (target, args[0].op);
33804 break;
33806 case 2:
33807 if (tf_p)
33808 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
33809 GEN_INT ((int)sub_code));
33810 else if (! comparison_p)
33811 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
33812 else
33814 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
33815 args[0].op,
33816 args[1].op);
33818 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
33820 break;
33822 case 3:
33823 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
33824 break;
33826 case 4:
33827 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
33828 break;
33830 default:
33831 gcc_unreachable ();
33834 if (! pat)
33835 return 0;
33837 emit_insn (pat);
33838 return target;
33841 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
33842 insns with vec_merge. */
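/* Illustrative note: the vec_merge patterns (used e.g. by the rcpss and
   sqrtss builtins) take a second vector operand supplying the elements
   that the scalar operation leaves untouched.  The builtin has a single
   argument, so the same register is passed for both operands and only
   element 0 is recomputed.  */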
33844 static rtx
33845 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
33846 rtx target)
33848 rtx pat;
33849 tree arg0 = CALL_EXPR_ARG (exp, 0);
33850 rtx op1, op0 = expand_normal (arg0);
33851 machine_mode tmode = insn_data[icode].operand[0].mode;
33852 machine_mode mode0 = insn_data[icode].operand[1].mode;
33854 if (optimize || !target
33855 || GET_MODE (target) != tmode
33856 || !insn_data[icode].operand[0].predicate (target, tmode))
33857 target = gen_reg_rtx (tmode);
33859 if (VECTOR_MODE_P (mode0))
33860 op0 = safe_vector_operand (op0, mode0);
33862 if ((optimize && !register_operand (op0, mode0))
33863 || !insn_data[icode].operand[1].predicate (op0, mode0))
33864 op0 = copy_to_mode_reg (mode0, op0);
33866 op1 = op0;
33867 if (!insn_data[icode].operand[2].predicate (op1, mode0))
33868 op1 = copy_to_mode_reg (mode0, op1);
33870 pat = GEN_FCN (icode) (target, op0, op1);
33871 if (! pat)
33872 return 0;
33873 emit_insn (pat);
33874 return target;
33877 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
33879 static rtx
33880 ix86_expand_sse_compare (const struct builtin_description *d,
33881 tree exp, rtx target, bool swap)
33883 rtx pat;
33884 tree arg0 = CALL_EXPR_ARG (exp, 0);
33885 tree arg1 = CALL_EXPR_ARG (exp, 1);
33886 rtx op0 = expand_normal (arg0);
33887 rtx op1 = expand_normal (arg1);
33888 rtx op2;
33889 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33890 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33891 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33892 enum rtx_code comparison = d->comparison;
33894 if (VECTOR_MODE_P (mode0))
33895 op0 = safe_vector_operand (op0, mode0);
33896 if (VECTOR_MODE_P (mode1))
33897 op1 = safe_vector_operand (op1, mode1);
33899 /* Swap operands if we have a comparison that isn't available in
33900 hardware. */
33901 if (swap)
33902 std::swap (op0, op1);
33904 if (optimize || !target
33905 || GET_MODE (target) != tmode
33906 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33907 target = gen_reg_rtx (tmode);
33909 if ((optimize && !register_operand (op0, mode0))
33910 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33911 op0 = copy_to_mode_reg (mode0, op0);
33912 if ((optimize && !register_operand (op1, mode1))
33913 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33914 op1 = copy_to_mode_reg (mode1, op1);
33916 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33917 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33918 if (! pat)
33919 return 0;
33920 emit_insn (pat);
33921 return target;
33924 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33926 static rtx
33927 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33928 rtx target)
33930 rtx pat;
33931 tree arg0 = CALL_EXPR_ARG (exp, 0);
33932 tree arg1 = CALL_EXPR_ARG (exp, 1);
33933 rtx op0 = expand_normal (arg0);
33934 rtx op1 = expand_normal (arg1);
33935 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33936 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33937 enum rtx_code comparison = d->comparison;
33939 if (VECTOR_MODE_P (mode0))
33940 op0 = safe_vector_operand (op0, mode0);
33941 if (VECTOR_MODE_P (mode1))
33942 op1 = safe_vector_operand (op1, mode1);
33944 /* Swap operands if we have a comparison that isn't available in
33945 hardware. */
33946 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33947 std::swap (op0, op1);
33949 target = gen_reg_rtx (SImode);
33950 emit_move_insn (target, const0_rtx);
33951 target = gen_rtx_SUBREG (QImode, target, 0);
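/* The SImode pseudo was zeroed above and only its low byte is written
   via STRICT_LOW_PART below, so the int result of the builtin is
   already zero-extended.  */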
33953 if ((optimize && !register_operand (op0, mode0))
33954 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33955 op0 = copy_to_mode_reg (mode0, op0);
33956 if ((optimize && !register_operand (op1, mode1))
33957 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33958 op1 = copy_to_mode_reg (mode1, op1);
33960 pat = GEN_FCN (d->icode) (op0, op1);
33961 if (! pat)
33962 return 0;
33963 emit_insn (pat);
33964 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33965 gen_rtx_fmt_ee (comparison, QImode,
33966 SET_DEST (pat),
33967 const0_rtx)));
33969 return SUBREG_REG (target);
33972 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33974 static rtx
33975 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33976 rtx target)
33978 rtx pat;
33979 tree arg0 = CALL_EXPR_ARG (exp, 0);
33980 rtx op1, op0 = expand_normal (arg0);
33981 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33982 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33984 if (optimize || target == 0
33985 || GET_MODE (target) != tmode
33986 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33987 target = gen_reg_rtx (tmode);
33989 if (VECTOR_MODE_P (mode0))
33990 op0 = safe_vector_operand (op0, mode0);
33992 if ((optimize && !register_operand (op0, mode0))
33993 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33994 op0 = copy_to_mode_reg (mode0, op0);
33996 op1 = GEN_INT (d->comparison);
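/* For the round builtins the builtin_description's comparison field is
   reused to carry the rounding-mode immediate rather than an rtx
   comparison code.  */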
33998 pat = GEN_FCN (d->icode) (target, op0, op1);
33999 if (! pat)
34000 return 0;
34001 emit_insn (pat);
34002 return target;
34005 static rtx
34006 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34007 tree exp, rtx target)
34009 rtx pat;
34010 tree arg0 = CALL_EXPR_ARG (exp, 0);
34011 tree arg1 = CALL_EXPR_ARG (exp, 1);
34012 rtx op0 = expand_normal (arg0);
34013 rtx op1 = expand_normal (arg1);
34014 rtx op2;
34015 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34016 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34017 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34019 if (optimize || target == 0
34020 || GET_MODE (target) != tmode
34021 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34022 target = gen_reg_rtx (tmode);
34024 op0 = safe_vector_operand (op0, mode0);
34025 op1 = safe_vector_operand (op1, mode1);
34027 if ((optimize && !register_operand (op0, mode0))
34028 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34029 op0 = copy_to_mode_reg (mode0, op0);
34030 if ((optimize && !register_operand (op1, mode1))
34031 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34032 op1 = copy_to_mode_reg (mode1, op1);
34034 op2 = GEN_INT (d->comparison);
34036 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34037 if (! pat)
34038 return 0;
34039 emit_insn (pat);
34040 return target;
34043 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34045 static rtx
34046 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34047 rtx target)
34049 rtx pat;
34050 tree arg0 = CALL_EXPR_ARG (exp, 0);
34051 tree arg1 = CALL_EXPR_ARG (exp, 1);
34052 rtx op0 = expand_normal (arg0);
34053 rtx op1 = expand_normal (arg1);
34054 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34055 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34056 enum rtx_code comparison = d->comparison;
34058 if (VECTOR_MODE_P (mode0))
34059 op0 = safe_vector_operand (op0, mode0);
34060 if (VECTOR_MODE_P (mode1))
34061 op1 = safe_vector_operand (op1, mode1);
34063 target = gen_reg_rtx (SImode);
34064 emit_move_insn (target, const0_rtx);
34065 target = gen_rtx_SUBREG (QImode, target, 0);
34067 if ((optimize && !register_operand (op0, mode0))
34068 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34069 op0 = copy_to_mode_reg (mode0, op0);
34070 if ((optimize && !register_operand (op1, mode1))
34071 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34072 op1 = copy_to_mode_reg (mode1, op1);
34074 pat = GEN_FCN (d->icode) (op0, op1);
34075 if (! pat)
34076 return 0;
34077 emit_insn (pat);
34078 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34079 gen_rtx_fmt_ee (comparison, QImode,
34080 SET_DEST (pat),
34081 const0_rtx)));
34083 return SUBREG_REG (target);
34086 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34088 static rtx
34089 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34090 tree exp, rtx target)
34092 rtx pat;
34093 tree arg0 = CALL_EXPR_ARG (exp, 0);
34094 tree arg1 = CALL_EXPR_ARG (exp, 1);
34095 tree arg2 = CALL_EXPR_ARG (exp, 2);
34096 tree arg3 = CALL_EXPR_ARG (exp, 3);
34097 tree arg4 = CALL_EXPR_ARG (exp, 4);
34098 rtx scratch0, scratch1;
34099 rtx op0 = expand_normal (arg0);
34100 rtx op1 = expand_normal (arg1);
34101 rtx op2 = expand_normal (arg2);
34102 rtx op3 = expand_normal (arg3);
34103 rtx op4 = expand_normal (arg4);
34104 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34106 tmode0 = insn_data[d->icode].operand[0].mode;
34107 tmode1 = insn_data[d->icode].operand[1].mode;
34108 modev2 = insn_data[d->icode].operand[2].mode;
34109 modei3 = insn_data[d->icode].operand[3].mode;
34110 modev4 = insn_data[d->icode].operand[4].mode;
34111 modei5 = insn_data[d->icode].operand[5].mode;
34112 modeimm = insn_data[d->icode].operand[6].mode;
34114 if (VECTOR_MODE_P (modev2))
34115 op0 = safe_vector_operand (op0, modev2);
34116 if (VECTOR_MODE_P (modev4))
34117 op2 = safe_vector_operand (op2, modev4);
34119 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34120 op0 = copy_to_mode_reg (modev2, op0);
34121 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34122 op1 = copy_to_mode_reg (modei3, op1);
34123 if ((optimize && !register_operand (op2, modev4))
34124 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34125 op2 = copy_to_mode_reg (modev4, op2);
34126 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34127 op3 = copy_to_mode_reg (modei5, op3);
34129 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34131 error ("the fifth argument must be an 8-bit immediate");
34132 return const0_rtx;
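/* The pcmpestr patterns produce two results: an index (tmode0) and a
   mask (tmode1).  PCMPESTRI returns the index, PCMPESTRM returns the
   mask, and every other variant only tests a flags bit, with d->flag
   giving the flags mode examined after the insn is emitted.  */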
34135 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34137 if (optimize || !target
34138 || GET_MODE (target) != tmode0
34139 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34140 target = gen_reg_rtx (tmode0);
34142 scratch1 = gen_reg_rtx (tmode1);
34144 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34146 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34148 if (optimize || !target
34149 || GET_MODE (target) != tmode1
34150 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34151 target = gen_reg_rtx (tmode1);
34153 scratch0 = gen_reg_rtx (tmode0);
34155 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34157 else
34159 gcc_assert (d->flag);
34161 scratch0 = gen_reg_rtx (tmode0);
34162 scratch1 = gen_reg_rtx (tmode1);
34164 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34167 if (! pat)
34168 return 0;
34170 emit_insn (pat);
34172 if (d->flag)
34174 target = gen_reg_rtx (SImode);
34175 emit_move_insn (target, const0_rtx);
34176 target = gen_rtx_SUBREG (QImode, target, 0);
34178 emit_insn
34179 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34180 gen_rtx_fmt_ee (EQ, QImode,
34181 gen_rtx_REG ((machine_mode) d->flag,
34182 FLAGS_REG),
34183 const0_rtx)));
34184 return SUBREG_REG (target);
34186 else
34187 return target;
34191 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34193 static rtx
34194 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34195 tree exp, rtx target)
34197 rtx pat;
34198 tree arg0 = CALL_EXPR_ARG (exp, 0);
34199 tree arg1 = CALL_EXPR_ARG (exp, 1);
34200 tree arg2 = CALL_EXPR_ARG (exp, 2);
34201 rtx scratch0, scratch1;
34202 rtx op0 = expand_normal (arg0);
34203 rtx op1 = expand_normal (arg1);
34204 rtx op2 = expand_normal (arg2);
34205 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34207 tmode0 = insn_data[d->icode].operand[0].mode;
34208 tmode1 = insn_data[d->icode].operand[1].mode;
34209 modev2 = insn_data[d->icode].operand[2].mode;
34210 modev3 = insn_data[d->icode].operand[3].mode;
34211 modeimm = insn_data[d->icode].operand[4].mode;
34213 if (VECTOR_MODE_P (modev2))
34214 op0 = safe_vector_operand (op0, modev2);
34215 if (VECTOR_MODE_P (modev3))
34216 op1 = safe_vector_operand (op1, modev3);
34218 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34219 op0 = copy_to_mode_reg (modev2, op0);
34220 if ((optimize && !register_operand (op1, modev3))
34221 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34222 op1 = copy_to_mode_reg (modev3, op1);
34224 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34226 error ("the third argument must be an 8-bit immediate");
34227 return const0_rtx;
34230 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34232 if (optimize || !target
34233 || GET_MODE (target) != tmode0
34234 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34235 target = gen_reg_rtx (tmode0);
34237 scratch1 = gen_reg_rtx (tmode1);
34239 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34241 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34243 if (optimize || !target
34244 || GET_MODE (target) != tmode1
34245 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34246 target = gen_reg_rtx (tmode1);
34248 scratch0 = gen_reg_rtx (tmode0);
34250 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34252 else
34254 gcc_assert (d->flag);
34256 scratch0 = gen_reg_rtx (tmode0);
34257 scratch1 = gen_reg_rtx (tmode1);
34259 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34262 if (! pat)
34263 return 0;
34265 emit_insn (pat);
34267 if (d->flag)
34269 target = gen_reg_rtx (SImode);
34270 emit_move_insn (target, const0_rtx);
34271 target = gen_rtx_SUBREG (QImode, target, 0);
34273 emit_insn
34274 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34275 gen_rtx_fmt_ee (EQ, QImode,
34276 gen_rtx_REG ((machine_mode) d->flag,
34277 FLAGS_REG),
34278 const0_rtx)));
34279 return SUBREG_REG (target);
34281 else
34282 return target;
34285 /* Subroutine of ix86_expand_builtin to take care of insns with
34286 a variable number of operands. */
34288 static rtx
34289 ix86_expand_args_builtin (const struct builtin_description *d,
34290 tree exp, rtx target)
34292 rtx pat, real_target;
34293 unsigned int i, nargs;
34294 unsigned int nargs_constant = 0;
34295 unsigned int mask_pos = 0;
34296 int num_memory = 0;
34297 struct
34299 rtx op;
34300 machine_mode mode;
34301 } args[6];
34302 bool last_arg_count = false;
34303 enum insn_code icode = d->icode;
34304 const struct insn_data_d *insn_p = &insn_data[icode];
34305 machine_mode tmode = insn_p->operand[0].mode;
34306 machine_mode rmode = VOIDmode;
34307 bool swap = false;
34308 enum rtx_code comparison = d->comparison;
34310 switch ((enum ix86_builtin_func_type) d->flag)
34312 case V2DF_FTYPE_V2DF_ROUND:
34313 case V4DF_FTYPE_V4DF_ROUND:
34314 case V8DF_FTYPE_V8DF_ROUND:
34315 case V4SF_FTYPE_V4SF_ROUND:
34316 case V8SF_FTYPE_V8SF_ROUND:
34317 case V16SF_FTYPE_V16SF_ROUND:
34318 case V4SI_FTYPE_V4SF_ROUND:
34319 case V8SI_FTYPE_V8SF_ROUND:
34320 case V16SI_FTYPE_V16SF_ROUND:
34321 return ix86_expand_sse_round (d, exp, target);
34322 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34323 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34324 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34325 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34326 case INT_FTYPE_V8SF_V8SF_PTEST:
34327 case INT_FTYPE_V4DI_V4DI_PTEST:
34328 case INT_FTYPE_V4DF_V4DF_PTEST:
34329 case INT_FTYPE_V4SF_V4SF_PTEST:
34330 case INT_FTYPE_V2DI_V2DI_PTEST:
34331 case INT_FTYPE_V2DF_V2DF_PTEST:
34332 return ix86_expand_sse_ptest (d, exp, target);
34333 case FLOAT128_FTYPE_FLOAT128:
34334 case FLOAT_FTYPE_FLOAT:
34335 case INT_FTYPE_INT:
34336 case UINT64_FTYPE_INT:
34337 case UINT16_FTYPE_UINT16:
34338 case INT64_FTYPE_INT64:
34339 case INT64_FTYPE_V4SF:
34340 case INT64_FTYPE_V2DF:
34341 case INT_FTYPE_V16QI:
34342 case INT_FTYPE_V8QI:
34343 case INT_FTYPE_V8SF:
34344 case INT_FTYPE_V4DF:
34345 case INT_FTYPE_V4SF:
34346 case INT_FTYPE_V2DF:
34347 case INT_FTYPE_V32QI:
34348 case V16QI_FTYPE_V16QI:
34349 case V8SI_FTYPE_V8SF:
34350 case V8SI_FTYPE_V4SI:
34351 case V8HI_FTYPE_V8HI:
34352 case V8HI_FTYPE_V16QI:
34353 case V8QI_FTYPE_V8QI:
34354 case V8SF_FTYPE_V8SF:
34355 case V8SF_FTYPE_V8SI:
34356 case V8SF_FTYPE_V4SF:
34357 case V8SF_FTYPE_V8HI:
34358 case V4SI_FTYPE_V4SI:
34359 case V4SI_FTYPE_V16QI:
34360 case V4SI_FTYPE_V4SF:
34361 case V4SI_FTYPE_V8SI:
34362 case V4SI_FTYPE_V8HI:
34363 case V4SI_FTYPE_V4DF:
34364 case V4SI_FTYPE_V2DF:
34365 case V4HI_FTYPE_V4HI:
34366 case V4DF_FTYPE_V4DF:
34367 case V4DF_FTYPE_V4SI:
34368 case V4DF_FTYPE_V4SF:
34369 case V4DF_FTYPE_V2DF:
34370 case V4SF_FTYPE_V4SF:
34371 case V4SF_FTYPE_V4SI:
34372 case V4SF_FTYPE_V8SF:
34373 case V4SF_FTYPE_V4DF:
34374 case V4SF_FTYPE_V8HI:
34375 case V4SF_FTYPE_V2DF:
34376 case V2DI_FTYPE_V2DI:
34377 case V2DI_FTYPE_V16QI:
34378 case V2DI_FTYPE_V8HI:
34379 case V2DI_FTYPE_V4SI:
34380 case V2DF_FTYPE_V2DF:
34381 case V2DF_FTYPE_V4SI:
34382 case V2DF_FTYPE_V4DF:
34383 case V2DF_FTYPE_V4SF:
34384 case V2DF_FTYPE_V2SI:
34385 case V2SI_FTYPE_V2SI:
34386 case V2SI_FTYPE_V4SF:
34387 case V2SI_FTYPE_V2SF:
34388 case V2SI_FTYPE_V2DF:
34389 case V2SF_FTYPE_V2SF:
34390 case V2SF_FTYPE_V2SI:
34391 case V32QI_FTYPE_V32QI:
34392 case V32QI_FTYPE_V16QI:
34393 case V16HI_FTYPE_V16HI:
34394 case V16HI_FTYPE_V8HI:
34395 case V8SI_FTYPE_V8SI:
34396 case V16HI_FTYPE_V16QI:
34397 case V8SI_FTYPE_V16QI:
34398 case V4DI_FTYPE_V16QI:
34399 case V8SI_FTYPE_V8HI:
34400 case V4DI_FTYPE_V8HI:
34401 case V4DI_FTYPE_V4SI:
34402 case V4DI_FTYPE_V2DI:
34403 case UHI_FTYPE_UHI:
34404 case UHI_FTYPE_V16QI:
34405 case USI_FTYPE_V32QI:
34406 case UDI_FTYPE_V64QI:
34407 case V16QI_FTYPE_UHI:
34408 case V32QI_FTYPE_USI:
34409 case V64QI_FTYPE_UDI:
34410 case V8HI_FTYPE_UQI:
34411 case V16HI_FTYPE_UHI:
34412 case V32HI_FTYPE_USI:
34413 case V4SI_FTYPE_UQI:
34414 case V8SI_FTYPE_UQI:
34415 case V4SI_FTYPE_UHI:
34416 case V8SI_FTYPE_UHI:
34417 case UQI_FTYPE_V8HI:
34418 case UHI_FTYPE_V16HI:
34419 case USI_FTYPE_V32HI:
34420 case UQI_FTYPE_V4SI:
34421 case UQI_FTYPE_V8SI:
34422 case UHI_FTYPE_V16SI:
34423 case UQI_FTYPE_V2DI:
34424 case UQI_FTYPE_V4DI:
34425 case UQI_FTYPE_V8DI:
34426 case V16SI_FTYPE_UHI:
34427 case V2DI_FTYPE_UQI:
34428 case V4DI_FTYPE_UQI:
34429 case V16SI_FTYPE_INT:
34430 case V16SF_FTYPE_V8SF:
34431 case V16SI_FTYPE_V8SI:
34432 case V16SF_FTYPE_V4SF:
34433 case V16SI_FTYPE_V4SI:
34434 case V16SI_FTYPE_V16SF:
34435 case V16SF_FTYPE_V16SF:
34436 case V8DI_FTYPE_UQI:
34437 case V8DF_FTYPE_V4DF:
34438 case V8DF_FTYPE_V2DF:
34439 case V8DF_FTYPE_V8DF:
34440 nargs = 1;
34441 break;
34442 case V4SF_FTYPE_V4SF_VEC_MERGE:
34443 case V2DF_FTYPE_V2DF_VEC_MERGE:
34444 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34445 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34446 case V16QI_FTYPE_V16QI_V16QI:
34447 case V16QI_FTYPE_V8HI_V8HI:
34448 case V16SF_FTYPE_V16SF_V16SF:
34449 case V8QI_FTYPE_V8QI_V8QI:
34450 case V8QI_FTYPE_V4HI_V4HI:
34451 case V8HI_FTYPE_V8HI_V8HI:
34452 case V8HI_FTYPE_V16QI_V16QI:
34453 case V8HI_FTYPE_V4SI_V4SI:
34454 case V8SF_FTYPE_V8SF_V8SF:
34455 case V8SF_FTYPE_V8SF_V8SI:
34456 case V8DF_FTYPE_V8DF_V8DF:
34457 case V4SI_FTYPE_V4SI_V4SI:
34458 case V4SI_FTYPE_V8HI_V8HI:
34459 case V4SI_FTYPE_V2DF_V2DF:
34460 case V4HI_FTYPE_V4HI_V4HI:
34461 case V4HI_FTYPE_V8QI_V8QI:
34462 case V4HI_FTYPE_V2SI_V2SI:
34463 case V4DF_FTYPE_V4DF_V4DF:
34464 case V4DF_FTYPE_V4DF_V4DI:
34465 case V4SF_FTYPE_V4SF_V4SF:
34466 case V4SF_FTYPE_V4SF_V4SI:
34467 case V4SF_FTYPE_V4SF_V2SI:
34468 case V4SF_FTYPE_V4SF_V2DF:
34469 case V4SF_FTYPE_V4SF_UINT:
34470 case V4SF_FTYPE_V4SF_DI:
34471 case V4SF_FTYPE_V4SF_SI:
34472 case V2DI_FTYPE_V2DI_V2DI:
34473 case V2DI_FTYPE_V16QI_V16QI:
34474 case V2DI_FTYPE_V4SI_V4SI:
34475 case V2DI_FTYPE_V2DI_V16QI:
34476 case V2SI_FTYPE_V2SI_V2SI:
34477 case V2SI_FTYPE_V4HI_V4HI:
34478 case V2SI_FTYPE_V2SF_V2SF:
34479 case V2DF_FTYPE_V2DF_V2DF:
34480 case V2DF_FTYPE_V2DF_V4SF:
34481 case V2DF_FTYPE_V2DF_V2DI:
34482 case V2DF_FTYPE_V2DF_DI:
34483 case V2DF_FTYPE_V2DF_SI:
34484 case V2DF_FTYPE_V2DF_UINT:
34485 case V2SF_FTYPE_V2SF_V2SF:
34486 case V1DI_FTYPE_V1DI_V1DI:
34487 case V1DI_FTYPE_V8QI_V8QI:
34488 case V1DI_FTYPE_V2SI_V2SI:
34489 case V32QI_FTYPE_V16HI_V16HI:
34490 case V16HI_FTYPE_V8SI_V8SI:
34491 case V32QI_FTYPE_V32QI_V32QI:
34492 case V16HI_FTYPE_V32QI_V32QI:
34493 case V16HI_FTYPE_V16HI_V16HI:
34494 case V8SI_FTYPE_V4DF_V4DF:
34495 case V8SI_FTYPE_V8SI_V8SI:
34496 case V8SI_FTYPE_V16HI_V16HI:
34497 case V4DI_FTYPE_V4DI_V4DI:
34498 case V4DI_FTYPE_V8SI_V8SI:
34499 case V8DI_FTYPE_V64QI_V64QI:
34500 if (comparison == UNKNOWN)
34501 return ix86_expand_binop_builtin (icode, exp, target);
34502 nargs = 2;
34503 break;
34504 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34505 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34506 gcc_assert (comparison != UNKNOWN);
34507 nargs = 2;
34508 swap = true;
34509 break;
34510 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34511 case V16HI_FTYPE_V16HI_SI_COUNT:
34512 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34513 case V8SI_FTYPE_V8SI_SI_COUNT:
34514 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34515 case V4DI_FTYPE_V4DI_INT_COUNT:
34516 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34517 case V8HI_FTYPE_V8HI_SI_COUNT:
34518 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34519 case V4SI_FTYPE_V4SI_SI_COUNT:
34520 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34521 case V4HI_FTYPE_V4HI_SI_COUNT:
34522 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34523 case V2DI_FTYPE_V2DI_SI_COUNT:
34524 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34525 case V2SI_FTYPE_V2SI_SI_COUNT:
34526 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34527 case V1DI_FTYPE_V1DI_SI_COUNT:
34528 nargs = 2;
34529 last_arg_count = true;
34530 break;
34531 case UINT64_FTYPE_UINT64_UINT64:
34532 case UINT_FTYPE_UINT_UINT:
34533 case UINT_FTYPE_UINT_USHORT:
34534 case UINT_FTYPE_UINT_UCHAR:
34535 case UINT16_FTYPE_UINT16_INT:
34536 case UINT8_FTYPE_UINT8_INT:
34537 case UHI_FTYPE_UHI_UHI:
34538 case USI_FTYPE_USI_USI:
34539 case UDI_FTYPE_UDI_UDI:
34540 case V16SI_FTYPE_V8DF_V8DF:
34541 nargs = 2;
34542 break;
34543 case V2DI_FTYPE_V2DI_INT_CONVERT:
34544 nargs = 2;
34545 rmode = V1TImode;
34546 nargs_constant = 1;
34547 break;
34548 case V4DI_FTYPE_V4DI_INT_CONVERT:
34549 nargs = 2;
34550 rmode = V2TImode;
34551 nargs_constant = 1;
34552 break;
34553 case V8DI_FTYPE_V8DI_INT_CONVERT:
34554 nargs = 2;
34555 rmode = V4TImode;
34556 nargs_constant = 1;
34557 break;
34558 case V8HI_FTYPE_V8HI_INT:
34559 case V8HI_FTYPE_V8SF_INT:
34560 case V16HI_FTYPE_V16SF_INT:
34561 case V8HI_FTYPE_V4SF_INT:
34562 case V8SF_FTYPE_V8SF_INT:
34563 case V4SF_FTYPE_V16SF_INT:
34564 case V16SF_FTYPE_V16SF_INT:
34565 case V4SI_FTYPE_V4SI_INT:
34566 case V4SI_FTYPE_V8SI_INT:
34567 case V4HI_FTYPE_V4HI_INT:
34568 case V4DF_FTYPE_V4DF_INT:
34569 case V4DF_FTYPE_V8DF_INT:
34570 case V4SF_FTYPE_V4SF_INT:
34571 case V4SF_FTYPE_V8SF_INT:
34572 case V2DI_FTYPE_V2DI_INT:
34573 case V2DF_FTYPE_V2DF_INT:
34574 case V2DF_FTYPE_V4DF_INT:
34575 case V16HI_FTYPE_V16HI_INT:
34576 case V8SI_FTYPE_V8SI_INT:
34577 case V16SI_FTYPE_V16SI_INT:
34578 case V4SI_FTYPE_V16SI_INT:
34579 case V4DI_FTYPE_V4DI_INT:
34580 case V2DI_FTYPE_V4DI_INT:
34581 case V4DI_FTYPE_V8DI_INT:
34582 case QI_FTYPE_V4SF_INT:
34583 case QI_FTYPE_V2DF_INT:
34584 nargs = 2;
34585 nargs_constant = 1;
34586 break;
34587 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34588 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34589 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34590 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34591 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34592 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34593 case UHI_FTYPE_V16SI_V16SI_UHI:
34594 case UQI_FTYPE_V8DI_V8DI_UQI:
34595 case V16HI_FTYPE_V16SI_V16HI_UHI:
34596 case V16QI_FTYPE_V16SI_V16QI_UHI:
34597 case V16QI_FTYPE_V8DI_V16QI_UQI:
34598 case V16SF_FTYPE_V16SF_V16SF_UHI:
34599 case V16SF_FTYPE_V4SF_V16SF_UHI:
34600 case V16SI_FTYPE_SI_V16SI_UHI:
34601 case V16SI_FTYPE_V16HI_V16SI_UHI:
34602 case V16SI_FTYPE_V16QI_V16SI_UHI:
34603 case V8SF_FTYPE_V4SF_V8SF_UQI:
34604 case V4DF_FTYPE_V2DF_V4DF_UQI:
34605 case V8SI_FTYPE_V4SI_V8SI_UQI:
34606 case V8SI_FTYPE_SI_V8SI_UQI:
34607 case V4SI_FTYPE_V4SI_V4SI_UQI:
34608 case V4SI_FTYPE_SI_V4SI_UQI:
34609 case V4DI_FTYPE_V2DI_V4DI_UQI:
34610 case V4DI_FTYPE_DI_V4DI_UQI:
34611 case V2DI_FTYPE_V2DI_V2DI_UQI:
34612 case V2DI_FTYPE_DI_V2DI_UQI:
34613 case V64QI_FTYPE_V64QI_V64QI_UDI:
34614 case V64QI_FTYPE_V16QI_V64QI_UDI:
34615 case V64QI_FTYPE_QI_V64QI_UDI:
34616 case V32QI_FTYPE_V32QI_V32QI_USI:
34617 case V32QI_FTYPE_V16QI_V32QI_USI:
34618 case V32QI_FTYPE_QI_V32QI_USI:
34619 case V16QI_FTYPE_V16QI_V16QI_UHI:
34620 case V16QI_FTYPE_QI_V16QI_UHI:
34621 case V32HI_FTYPE_V8HI_V32HI_USI:
34622 case V32HI_FTYPE_HI_V32HI_USI:
34623 case V16HI_FTYPE_V8HI_V16HI_UHI:
34624 case V16HI_FTYPE_HI_V16HI_UHI:
34625 case V8HI_FTYPE_V8HI_V8HI_UQI:
34626 case V8HI_FTYPE_HI_V8HI_UQI:
34627 case V8SF_FTYPE_V8HI_V8SF_UQI:
34628 case V4SF_FTYPE_V8HI_V4SF_UQI:
34629 case V8SI_FTYPE_V8SF_V8SI_UQI:
34630 case V4SI_FTYPE_V4SF_V4SI_UQI:
34631 case V4DI_FTYPE_V4SF_V4DI_UQI:
34632 case V2DI_FTYPE_V4SF_V2DI_UQI:
34633 case V4SF_FTYPE_V4DI_V4SF_UQI:
34634 case V4SF_FTYPE_V2DI_V4SF_UQI:
34635 case V4DF_FTYPE_V4DI_V4DF_UQI:
34636 case V2DF_FTYPE_V2DI_V2DF_UQI:
34637 case V16QI_FTYPE_V8HI_V16QI_UQI:
34638 case V16QI_FTYPE_V16HI_V16QI_UHI:
34639 case V16QI_FTYPE_V4SI_V16QI_UQI:
34640 case V16QI_FTYPE_V8SI_V16QI_UQI:
34641 case V8HI_FTYPE_V4SI_V8HI_UQI:
34642 case V8HI_FTYPE_V8SI_V8HI_UQI:
34643 case V16QI_FTYPE_V2DI_V16QI_UQI:
34644 case V16QI_FTYPE_V4DI_V16QI_UQI:
34645 case V8HI_FTYPE_V2DI_V8HI_UQI:
34646 case V8HI_FTYPE_V4DI_V8HI_UQI:
34647 case V4SI_FTYPE_V2DI_V4SI_UQI:
34648 case V4SI_FTYPE_V4DI_V4SI_UQI:
34649 case V32QI_FTYPE_V32HI_V32QI_USI:
34650 case UHI_FTYPE_V16QI_V16QI_UHI:
34651 case USI_FTYPE_V32QI_V32QI_USI:
34652 case UDI_FTYPE_V64QI_V64QI_UDI:
34653 case UQI_FTYPE_V8HI_V8HI_UQI:
34654 case UHI_FTYPE_V16HI_V16HI_UHI:
34655 case USI_FTYPE_V32HI_V32HI_USI:
34656 case UQI_FTYPE_V4SI_V4SI_UQI:
34657 case UQI_FTYPE_V8SI_V8SI_UQI:
34658 case UQI_FTYPE_V2DI_V2DI_UQI:
34659 case UQI_FTYPE_V4DI_V4DI_UQI:
34660 case V4SF_FTYPE_V2DF_V4SF_UQI:
34661 case V4SF_FTYPE_V4DF_V4SF_UQI:
34662 case V16SI_FTYPE_V16SI_V16SI_UHI:
34663 case V16SI_FTYPE_V4SI_V16SI_UHI:
34664 case V2DI_FTYPE_V4SI_V2DI_UQI:
34665 case V2DI_FTYPE_V8HI_V2DI_UQI:
34666 case V2DI_FTYPE_V16QI_V2DI_UQI:
34667 case V4DI_FTYPE_V4DI_V4DI_UQI:
34668 case V4DI_FTYPE_V4SI_V4DI_UQI:
34669 case V4DI_FTYPE_V8HI_V4DI_UQI:
34670 case V4DI_FTYPE_V16QI_V4DI_UQI:
34671 case V4DI_FTYPE_V4DF_V4DI_UQI:
34672 case V2DI_FTYPE_V2DF_V2DI_UQI:
34673 case V4SI_FTYPE_V4DF_V4SI_UQI:
34674 case V4SI_FTYPE_V2DF_V4SI_UQI:
34675 case V4SI_FTYPE_V8HI_V4SI_UQI:
34676 case V4SI_FTYPE_V16QI_V4SI_UQI:
34677 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34678 case V8DF_FTYPE_V2DF_V8DF_UQI:
34679 case V8DF_FTYPE_V4DF_V8DF_UQI:
34680 case V8DF_FTYPE_V8DF_V8DF_UQI:
34681 case V8SF_FTYPE_V8SF_V8SF_UQI:
34682 case V8SF_FTYPE_V8SI_V8SF_UQI:
34683 case V4DF_FTYPE_V4DF_V4DF_UQI:
34684 case V4SF_FTYPE_V4SF_V4SF_UQI:
34685 case V2DF_FTYPE_V2DF_V2DF_UQI:
34686 case V2DF_FTYPE_V4SF_V2DF_UQI:
34687 case V2DF_FTYPE_V4SI_V2DF_UQI:
34688 case V4SF_FTYPE_V4SI_V4SF_UQI:
34689 case V4DF_FTYPE_V4SF_V4DF_UQI:
34690 case V4DF_FTYPE_V4SI_V4DF_UQI:
34691 case V8SI_FTYPE_V8SI_V8SI_UQI:
34692 case V8SI_FTYPE_V8HI_V8SI_UQI:
34693 case V8SI_FTYPE_V16QI_V8SI_UQI:
34694 case V8DF_FTYPE_V8SI_V8DF_UQI:
34695 case V8DI_FTYPE_DI_V8DI_UQI:
34696 case V16SF_FTYPE_V8SF_V16SF_UHI:
34697 case V16SI_FTYPE_V8SI_V16SI_UHI:
34698 case V16HI_FTYPE_V16HI_V16HI_UHI:
34699 case V8HI_FTYPE_V16QI_V8HI_UQI:
34700 case V16HI_FTYPE_V16QI_V16HI_UHI:
34701 case V32HI_FTYPE_V32HI_V32HI_USI:
34702 case V32HI_FTYPE_V32QI_V32HI_USI:
34703 case V8DI_FTYPE_V16QI_V8DI_UQI:
34704 case V8DI_FTYPE_V2DI_V8DI_UQI:
34705 case V8DI_FTYPE_V4DI_V8DI_UQI:
34706 case V8DI_FTYPE_V8DI_V8DI_UQI:
34707 case V8DI_FTYPE_V8HI_V8DI_UQI:
34708 case V8DI_FTYPE_V8SI_V8DI_UQI:
34709 case V8HI_FTYPE_V8DI_V8HI_UQI:
34710 case V8SI_FTYPE_V8DI_V8SI_UQI:
34711 case V4SI_FTYPE_V4SI_V4SI_V4SI:
34712 nargs = 3;
34713 break;
34714 case V32QI_FTYPE_V32QI_V32QI_INT:
34715 case V16HI_FTYPE_V16HI_V16HI_INT:
34716 case V16QI_FTYPE_V16QI_V16QI_INT:
34717 case V4DI_FTYPE_V4DI_V4DI_INT:
34718 case V8HI_FTYPE_V8HI_V8HI_INT:
34719 case V8SI_FTYPE_V8SI_V8SI_INT:
34720 case V8SI_FTYPE_V8SI_V4SI_INT:
34721 case V8SF_FTYPE_V8SF_V8SF_INT:
34722 case V8SF_FTYPE_V8SF_V4SF_INT:
34723 case V4SI_FTYPE_V4SI_V4SI_INT:
34724 case V4DF_FTYPE_V4DF_V4DF_INT:
34725 case V16SF_FTYPE_V16SF_V16SF_INT:
34726 case V16SF_FTYPE_V16SF_V4SF_INT:
34727 case V16SI_FTYPE_V16SI_V4SI_INT:
34728 case V4DF_FTYPE_V4DF_V2DF_INT:
34729 case V4SF_FTYPE_V4SF_V4SF_INT:
34730 case V2DI_FTYPE_V2DI_V2DI_INT:
34731 case V4DI_FTYPE_V4DI_V2DI_INT:
34732 case V2DF_FTYPE_V2DF_V2DF_INT:
34733 case UQI_FTYPE_V8DI_V8UDI_INT:
34734 case UQI_FTYPE_V8DF_V8DF_INT:
34735 case UQI_FTYPE_V2DF_V2DF_INT:
34736 case UQI_FTYPE_V4SF_V4SF_INT:
34737 case UHI_FTYPE_V16SI_V16SI_INT:
34738 case UHI_FTYPE_V16SF_V16SF_INT:
34739 nargs = 3;
34740 nargs_constant = 1;
34741 break;
34742 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
34743 nargs = 3;
34744 rmode = V4DImode;
34745 nargs_constant = 1;
34746 break;
34747 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
34748 nargs = 3;
34749 rmode = V2DImode;
34750 nargs_constant = 1;
34751 break;
34752 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
34753 nargs = 3;
34754 rmode = DImode;
34755 nargs_constant = 1;
34756 break;
34757 case V2DI_FTYPE_V2DI_UINT_UINT:
34758 nargs = 3;
34759 nargs_constant = 2;
34760 break;
34761 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
34762 nargs = 3;
34763 rmode = V8DImode;
34764 nargs_constant = 1;
34765 break;
34766 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
34767 nargs = 5;
34768 rmode = V8DImode;
34769 mask_pos = 2;
34770 nargs_constant = 1;
34771 break;
34772 case QI_FTYPE_V8DF_INT_UQI:
34773 case QI_FTYPE_V4DF_INT_UQI:
34774 case QI_FTYPE_V2DF_INT_UQI:
34775 case HI_FTYPE_V16SF_INT_UHI:
34776 case QI_FTYPE_V8SF_INT_UQI:
34777 case QI_FTYPE_V4SF_INT_UQI:
34778 nargs = 3;
34779 mask_pos = 1;
34780 nargs_constant = 1;
34781 break;
34782 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
34783 nargs = 5;
34784 rmode = V4DImode;
34785 mask_pos = 2;
34786 nargs_constant = 1;
34787 break;
34788 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
34789 nargs = 5;
34790 rmode = V2DImode;
34791 mask_pos = 2;
34792 nargs_constant = 1;
34793 break;
34794 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
34795 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
34796 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
34797 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
34798 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
34799 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
34800 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
34801 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
34802 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
34803 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
34804 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
34805 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
34806 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
34807 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
34808 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
34809 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
34810 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
34811 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
34812 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
34813 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
34814 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
34815 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
34816 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
34817 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
34818 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
34819 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
34820 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
34821 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
34822 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
34823 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
34824 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
34825 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
34826 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
34827 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
34828 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
34829 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
34830 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
34831 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
34832 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
34833 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
34834 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
34835 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
34836 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
34837 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34838 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34839 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34840 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34841 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34842 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34843 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34844 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34845 nargs = 4;
34846 break;
34847 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34848 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34849 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34850 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34851 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34852 nargs = 4;
34853 nargs_constant = 1;
34854 break;
34855 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34856 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34857 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34858 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34859 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34860 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34861 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34862 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34863 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34864 case USI_FTYPE_V32QI_V32QI_INT_USI:
34865 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34866 case USI_FTYPE_V32HI_V32HI_INT_USI:
34867 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34868 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34869 nargs = 4;
34870 mask_pos = 1;
34871 nargs_constant = 1;
34872 break;
34873 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34874 nargs = 4;
34875 nargs_constant = 2;
34876 break;
34877 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34878 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34879 nargs = 4;
34880 break;
34881 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34882 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34883 mask_pos = 1;
34884 nargs = 4;
34885 nargs_constant = 1;
34886 break;
34887 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34888 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34889 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34890 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34891 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34892 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34893 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34894 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34895 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34896 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34897 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34898 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34899 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34900 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34901 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34902 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34903 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34904 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34905 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34906 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34907 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34908 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34909 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34910 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34911 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34912 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34913 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34914 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34915 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34916 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34917 nargs = 4;
34918 mask_pos = 2;
34919 nargs_constant = 1;
34920 break;
34921 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34922 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34923 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34924 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34925 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34926 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34927 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34928 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34929 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34930 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34931 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34932 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34933 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34934 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34935 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34936 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34937 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34938 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34939 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34940 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34941 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34942 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34943 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34944 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34945 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34946 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34947 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34948 nargs = 5;
34949 mask_pos = 2;
34950 nargs_constant = 1;
34951 break;
34952 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34953 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34954 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34955 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34956 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34957 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34958 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34959 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34960 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34961 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34962 nargs = 5;
34963 mask_pos = 1;
34964 nargs_constant = 1;
34965 break;
34967 default:
34968 gcc_unreachable ();
34971 gcc_assert (nargs <= ARRAY_SIZE (args));
34973 if (comparison != UNKNOWN)
34975 gcc_assert (nargs == 2);
34976 return ix86_expand_sse_compare (d, exp, target, swap);
34979 if (rmode == VOIDmode || rmode == tmode)
34981 if (optimize
34982 || target == 0
34983 || GET_MODE (target) != tmode
34984 || !insn_p->operand[0].predicate (target, tmode))
34985 target = gen_reg_rtx (tmode);
34986 real_target = target;
34988 else
34990 real_target = gen_reg_rtx (tmode);
34991 target = lowpart_subreg (rmode, real_target, tmode);
34994 for (i = 0; i < nargs; i++)
34996 tree arg = CALL_EXPR_ARG (exp, i);
34997 rtx op = expand_normal (arg);
34998 machine_mode mode = insn_p->operand[i + 1].mode;
34999 bool match = insn_p->operand[i + 1].predicate (op, mode);
35001 if (last_arg_count && (i + 1) == nargs)
35003 /* SIMD shift insns take either an 8-bit immediate or a
35004 register as the count.  But the builtin functions take an int
35005 as the count.  If the count doesn't match, we put it in a register.  */
35006 if (!match)
35008 op = lowpart_subreg (SImode, op, GET_MODE (op));
35009 if (!insn_p->operand[i + 1].predicate (op, mode))
35010 op = copy_to_reg (op);
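/* Immediate operands are located by counting from the end of the
   argument list, skipping a trailing mask operand when mask_pos is
   nonzero.  For instance, with nargs == 4, mask_pos == 1 and
   nargs_constant == 1 (e.g. UQI_FTYPE_V4DI_V4DI_INT_UQI) the immediate
   is argument 2, since 4 - 2 - 1 == 1.  */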
35013 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35014 (!mask_pos && (nargs - i) <= nargs_constant))
35016 if (!match)
35017 switch (icode)
35019 case CODE_FOR_avx_vinsertf128v4di:
35020 case CODE_FOR_avx_vextractf128v4di:
35021 error ("the last argument must be an 1-bit immediate");
35022 return const0_rtx;
35024 case CODE_FOR_avx512f_cmpv8di3_mask:
35025 case CODE_FOR_avx512f_cmpv16si3_mask:
35026 case CODE_FOR_avx512f_ucmpv8di3_mask:
35027 case CODE_FOR_avx512f_ucmpv16si3_mask:
35028 case CODE_FOR_avx512vl_cmpv4di3_mask:
35029 case CODE_FOR_avx512vl_cmpv8si3_mask:
35030 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35031 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35032 case CODE_FOR_avx512vl_cmpv2di3_mask:
35033 case CODE_FOR_avx512vl_cmpv4si3_mask:
35034 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35035 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35036 error ("the last argument must be a 3-bit immediate");
35037 return const0_rtx;
35039 case CODE_FOR_sse4_1_roundsd:
35040 case CODE_FOR_sse4_1_roundss:
35042 case CODE_FOR_sse4_1_roundpd:
35043 case CODE_FOR_sse4_1_roundps:
35044 case CODE_FOR_avx_roundpd256:
35045 case CODE_FOR_avx_roundps256:
35047 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35048 case CODE_FOR_sse4_1_roundps_sfix:
35049 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35050 case CODE_FOR_avx_roundps_sfix256:
35052 case CODE_FOR_sse4_1_blendps:
35053 case CODE_FOR_avx_blendpd256:
35054 case CODE_FOR_avx_vpermilv4df:
35055 case CODE_FOR_avx_vpermilv4df_mask:
35056 case CODE_FOR_avx512f_getmantv8df_mask:
35057 case CODE_FOR_avx512f_getmantv16sf_mask:
35058 case CODE_FOR_avx512vl_getmantv8sf_mask:
35059 case CODE_FOR_avx512vl_getmantv4df_mask:
35060 case CODE_FOR_avx512vl_getmantv4sf_mask:
35061 case CODE_FOR_avx512vl_getmantv2df_mask:
35062 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35063 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35064 case CODE_FOR_avx512dq_rangepv4df_mask:
35065 case CODE_FOR_avx512dq_rangepv8sf_mask:
35066 case CODE_FOR_avx512dq_rangepv2df_mask:
35067 case CODE_FOR_avx512dq_rangepv4sf_mask:
35068 case CODE_FOR_avx_shufpd256_mask:
35069 error ("the last argument must be a 4-bit immediate");
35070 return const0_rtx;
35072 case CODE_FOR_sha1rnds4:
35073 case CODE_FOR_sse4_1_blendpd:
35074 case CODE_FOR_avx_vpermilv2df:
35075 case CODE_FOR_avx_vpermilv2df_mask:
35076 case CODE_FOR_xop_vpermil2v2df3:
35077 case CODE_FOR_xop_vpermil2v4sf3:
35078 case CODE_FOR_xop_vpermil2v4df3:
35079 case CODE_FOR_xop_vpermil2v8sf3:
35080 case CODE_FOR_avx512f_vinsertf32x4_mask:
35081 case CODE_FOR_avx512f_vinserti32x4_mask:
35082 case CODE_FOR_avx512f_vextractf32x4_mask:
35083 case CODE_FOR_avx512f_vextracti32x4_mask:
35084 case CODE_FOR_sse2_shufpd:
35085 case CODE_FOR_sse2_shufpd_mask:
35086 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35087 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35088 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35089 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35090 error ("the last argument must be a 2-bit immediate");
35091 return const0_rtx;
35093 case CODE_FOR_avx_vextractf128v4df:
35094 case CODE_FOR_avx_vextractf128v8sf:
35095 case CODE_FOR_avx_vextractf128v8si:
35096 case CODE_FOR_avx_vinsertf128v4df:
35097 case CODE_FOR_avx_vinsertf128v8sf:
35098 case CODE_FOR_avx_vinsertf128v8si:
35099 case CODE_FOR_avx512f_vinsertf64x4_mask:
35100 case CODE_FOR_avx512f_vinserti64x4_mask:
35101 case CODE_FOR_avx512f_vextractf64x4_mask:
35102 case CODE_FOR_avx512f_vextracti64x4_mask:
35103 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35104 case CODE_FOR_avx512dq_vinserti32x8_mask:
35105 case CODE_FOR_avx512vl_vinsertv4df:
35106 case CODE_FOR_avx512vl_vinsertv4di:
35107 case CODE_FOR_avx512vl_vinsertv8sf:
35108 case CODE_FOR_avx512vl_vinsertv8si:
35109 error ("the last argument must be a 1-bit immediate");
35110 return const0_rtx;
35112 case CODE_FOR_avx_vmcmpv2df3:
35113 case CODE_FOR_avx_vmcmpv4sf3:
35114 case CODE_FOR_avx_cmpv2df3:
35115 case CODE_FOR_avx_cmpv4sf3:
35116 case CODE_FOR_avx_cmpv4df3:
35117 case CODE_FOR_avx_cmpv8sf3:
35118 case CODE_FOR_avx512f_cmpv8df3_mask:
35119 case CODE_FOR_avx512f_cmpv16sf3_mask:
35120 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35121 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35122 error ("the last argument must be a 5-bit immediate");
35123 return const0_rtx;
35125 default:
35126 switch (nargs_constant)
35128 case 2:
35129 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35130 (!mask_pos && (nargs - i) == nargs_constant))
35132 error ("the next to last argument must be an 8-bit immediate");
35133 break;
35135 /* FALLTHRU */
35136 case 1:
35137 error ("the last argument must be an 8-bit immediate");
35138 break;
35139 default:
35140 gcc_unreachable ();
35142 return const0_rtx;
35145 else
35147 if (VECTOR_MODE_P (mode))
35148 op = safe_vector_operand (op, mode);
35150 /* If we aren't optimizing, only allow one memory operand to
35151 be generated. */
35152 if (memory_operand (op, mode))
35153 num_memory++;
35155 op = fixup_modeless_constant (op, mode);
35157 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35159 if (optimize || !match || num_memory > 1)
35160 op = copy_to_mode_reg (mode, op);
35162 else
35164 op = copy_to_reg (op);
35165 op = lowpart_subreg (mode, op, GET_MODE (op));
35169 args[i].op = op;
35170 args[i].mode = mode;
35173 switch (nargs)
35175 case 1:
35176 pat = GEN_FCN (icode) (real_target, args[0].op);
35177 break;
35178 case 2:
35179 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35180 break;
35181 case 3:
35182 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35183 args[2].op);
35184 break;
35185 case 4:
35186 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35187 args[2].op, args[3].op);
35188 break;
35189 case 5:
35190 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35191 args[2].op, args[3].op, args[4].op);
35192 break;
35193 case 6:
35194 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35195 args[2].op, args[3].op, args[4].op,
35196 args[5].op);
35197 break;
35198 default:
35199 gcc_unreachable ();
35202 if (! pat)
35203 return 0;
35205 emit_insn (pat);
35206 return target;
35209 /* Transform a pattern of the following layout:
35210 (parallel [
35211 set (A B)
35212 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)])
35214 into:
35215 (set (A B))
35217 or
35218 (parallel [ A B
35219 ...
35220 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35221 ...
35222 ])
35223 into:
35224 (parallel [ A B ... ]) */
35226 static rtx
35227 ix86_erase_embedded_rounding (rtx pat)
35229 if (GET_CODE (pat) == INSN)
35230 pat = PATTERN (pat);
35232 gcc_assert (GET_CODE (pat) == PARALLEL);
35234 if (XVECLEN (pat, 0) == 2)
35236 rtx p0 = XVECEXP (pat, 0, 0);
35237 rtx p1 = XVECEXP (pat, 0, 1);
35239 gcc_assert (GET_CODE (p0) == SET
35240 && GET_CODE (p1) == UNSPEC
35241 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
35243 return p0;
35245 else
35247 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
35248 int i = 0;
35249 int j = 0;
35251 for (; i < XVECLEN (pat, 0); ++i)
35253 rtx elem = XVECEXP (pat, 0, i);
35254 if (GET_CODE (elem) != UNSPEC
35255 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
35256 res [j++] = elem;
35259 /* No more than 1 occurrence was removed. */
35260 gcc_assert (j >= XVECLEN (pat, 0) - 1);
35262 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
35266 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35267 with rounding. */
35268 static rtx
35269 ix86_expand_sse_comi_round (const struct builtin_description *d,
35270 tree exp, rtx target)
35272 rtx pat, set_dst;
35273 tree arg0 = CALL_EXPR_ARG (exp, 0);
35274 tree arg1 = CALL_EXPR_ARG (exp, 1);
35275 tree arg2 = CALL_EXPR_ARG (exp, 2);
35276 tree arg3 = CALL_EXPR_ARG (exp, 3);
35277 rtx op0 = expand_normal (arg0);
35278 rtx op1 = expand_normal (arg1);
35279 rtx op2 = expand_normal (arg2);
35280 rtx op3 = expand_normal (arg3);
35281 enum insn_code icode = d->icode;
35282 const struct insn_data_d *insn_p = &insn_data[icode];
35283 machine_mode mode0 = insn_p->operand[0].mode;
35284 machine_mode mode1 = insn_p->operand[1].mode;
35285 enum rtx_code comparison = UNEQ;
35286 bool need_ucomi = false;
35288 /* See avxintrin.h for values. */
35289 enum rtx_code comi_comparisons[32] =
35291 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35292 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35293 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35295 bool need_ucomi_values[32] =
35297 true, false, false, true, true, false, false, true,
35298 true, false, false, true, true, false, false, true,
35299 false, true, true, false, false, true, true, false,
35300 false, true, true, false, false, true, true, false
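/* For illustration (encodings from avxintrin.h): _CMP_EQ_OQ (0x00)
   maps to UNEQ using the quiet ucomi pattern, while _CMP_GT_OS (0x0e),
   as used by intrinsics such as _mm_comi_round_ss, maps to GT using
   the signalling comi pattern.  */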
35303 if (!CONST_INT_P (op2))
35305 error ("the third argument must be comparison constant");
35306 return const0_rtx;
35308 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35310 error ("incorrect comparison mode");
35311 return const0_rtx;
35314 if (!insn_p->operand[2].predicate (op3, SImode))
35316 error ("incorrect rounding operand");
35317 return const0_rtx;
35320 comparison = comi_comparisons[INTVAL (op2)];
35321 need_ucomi = need_ucomi_values[INTVAL (op2)];
35323 if (VECTOR_MODE_P (mode0))
35324 op0 = safe_vector_operand (op0, mode0);
35325 if (VECTOR_MODE_P (mode1))
35326 op1 = safe_vector_operand (op1, mode1);
35328 target = gen_reg_rtx (SImode);
35329 emit_move_insn (target, const0_rtx);
35330 target = gen_rtx_SUBREG (QImode, target, 0);
35332 if ((optimize && !register_operand (op0, mode0))
35333 || !insn_p->operand[0].predicate (op0, mode0))
35334 op0 = copy_to_mode_reg (mode0, op0);
35335 if ((optimize && !register_operand (op1, mode1))
35336 || !insn_p->operand[1].predicate (op1, mode1))
35337 op1 = copy_to_mode_reg (mode1, op1);
35339 if (need_ucomi)
35340 icode = icode == CODE_FOR_sse_comi_round
35341 ? CODE_FOR_sse_ucomi_round
35342 : CODE_FOR_sse2_ucomi_round;
35344 pat = GEN_FCN (icode) (op0, op1, op3);
35345 if (! pat)
35346 return 0;
35348 /* The rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35349 if (INTVAL (op3) == NO_ROUND)
35351 pat = ix86_erase_embedded_rounding (pat);
35352 if (! pat)
35353 return 0;
35355 set_dst = SET_DEST (pat);
35357 else
35359 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
35360 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
35363 emit_insn (pat);
35364 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35365 gen_rtx_fmt_ee (comparison, QImode,
35366 set_dst,
35367 const0_rtx)));
35369 return SUBREG_REG (target);
35372 static rtx
35373 ix86_expand_round_builtin (const struct builtin_description *d,
35374 tree exp, rtx target)
35376 rtx pat;
35377 unsigned int i, nargs;
35378 struct
35380 rtx op;
35381 machine_mode mode;
35382 } args[6];
35383 enum insn_code icode = d->icode;
35384 const struct insn_data_d *insn_p = &insn_data[icode];
35385 machine_mode tmode = insn_p->operand[0].mode;
35386 unsigned int nargs_constant = 0;
35387 unsigned int redundant_embed_rnd = 0;
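/* In this expander the last argument is always the rounding/SAE
   immediate; when nargs_constant is nonzero it gives the position,
   counted from the end of the argument list, of the additional
   immediate operand (the i == nargs - nargs_constant check below).  */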
35389 switch ((enum ix86_builtin_func_type) d->flag)
35391 case UINT64_FTYPE_V2DF_INT:
35392 case UINT64_FTYPE_V4SF_INT:
35393 case UINT_FTYPE_V2DF_INT:
35394 case UINT_FTYPE_V4SF_INT:
35395 case INT64_FTYPE_V2DF_INT:
35396 case INT64_FTYPE_V4SF_INT:
35397 case INT_FTYPE_V2DF_INT:
35398 case INT_FTYPE_V4SF_INT:
35399 nargs = 2;
35400 break;
35401 case V4SF_FTYPE_V4SF_UINT_INT:
35402 case V4SF_FTYPE_V4SF_UINT64_INT:
35403 case V2DF_FTYPE_V2DF_UINT64_INT:
35404 case V4SF_FTYPE_V4SF_INT_INT:
35405 case V4SF_FTYPE_V4SF_INT64_INT:
35406 case V2DF_FTYPE_V2DF_INT64_INT:
35407 case V4SF_FTYPE_V4SF_V4SF_INT:
35408 case V2DF_FTYPE_V2DF_V2DF_INT:
35409 case V4SF_FTYPE_V4SF_V2DF_INT:
35410 case V2DF_FTYPE_V2DF_V4SF_INT:
35411 nargs = 3;
35412 break;
35413 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35414 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35415 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35416 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35417 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35418 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35419 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35420 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35421 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35422 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35423 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35424 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35425 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35426 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35427 nargs = 4;
35428 break;
35429 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35430 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35431 nargs_constant = 2;
35432 nargs = 4;
35433 break;
35434 case INT_FTYPE_V4SF_V4SF_INT_INT:
35435 case INT_FTYPE_V2DF_V2DF_INT_INT:
35436 return ix86_expand_sse_comi_round (d, exp, target);
35437 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35438 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35439 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35440 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35441 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35442 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35443 nargs = 5;
35444 break;
35445 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35446 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35447 nargs_constant = 4;
35448 nargs = 5;
35449 break;
35450 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35451 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35452 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35453 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35454 nargs_constant = 3;
35455 nargs = 5;
35456 break;
35457 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35458 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35459 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35460 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35461 nargs = 6;
35462 nargs_constant = 4;
35463 break;
35464 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35465 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35466 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35467 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35468 nargs = 6;
35469 nargs_constant = 3;
35470 break;
35471 default:
35472 gcc_unreachable ();
35474 gcc_assert (nargs <= ARRAY_SIZE (args));
35476 if (optimize
35477 || target == 0
35478 || GET_MODE (target) != tmode
35479 || !insn_p->operand[0].predicate (target, tmode))
35480 target = gen_reg_rtx (tmode);
35482 for (i = 0; i < nargs; i++)
35484 tree arg = CALL_EXPR_ARG (exp, i);
35485 rtx op = expand_normal (arg);
35486 machine_mode mode = insn_p->operand[i + 1].mode;
35487 bool match = insn_p->operand[i + 1].predicate (op, mode);
35489 if (i == nargs - nargs_constant)
35491 if (!match)
35493 switch (icode)
35495 case CODE_FOR_avx512f_getmantv8df_mask_round:
35496 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35497 case CODE_FOR_avx512f_vgetmantv2df_round:
35498 case CODE_FOR_avx512f_vgetmantv4sf_round:
35499 error ("the immediate argument must be a 4-bit immediate");
35500 return const0_rtx;
35501 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35502 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35503 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35504 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35505 error ("the immediate argument must be a 5-bit immediate");
35506 return const0_rtx;
35507 default:
35508 error ("the immediate argument must be an 8-bit immediate");
35509 return const0_rtx;
35513 else if (i == nargs-1)
35515 if (!insn_p->operand[nargs].predicate (op, SImode))
35517 error ("incorrect rounding operand");
35518 return const0_rtx;
35521 /* If there is no rounding, use the normal version of the pattern. */
35522 if (INTVAL (op) == NO_ROUND)
35523 redundant_embed_rnd = 1;
35525 else
35527 if (VECTOR_MODE_P (mode))
35528 op = safe_vector_operand (op, mode);
35530 op = fixup_modeless_constant (op, mode);
35532 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35534 if (optimize || !match)
35535 op = copy_to_mode_reg (mode, op);
35537 else
35539 op = copy_to_reg (op);
35540 op = lowpart_subreg (mode, op, GET_MODE (op));
35544 args[i].op = op;
35545 args[i].mode = mode;
35548 switch (nargs)
35550 case 1:
35551 pat = GEN_FCN (icode) (target, args[0].op);
35552 break;
35553 case 2:
35554 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35555 break;
35556 case 3:
35557 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35558 args[2].op);
35559 break;
35560 case 4:
35561 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35562 args[2].op, args[3].op);
35563 break;
35564 case 5:
35565 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35566 args[2].op, args[3].op, args[4].op);
35567 break;
35568 case 6:
35569 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35570 args[2].op, args[3].op, args[4].op,
35571 args[5].op);
35572 break;
35573 default:
35574 gcc_unreachable ();
35577 if (!pat)
35578 return 0;
35580 if (redundant_embed_rnd)
35581 pat = ix86_erase_embedded_rounding (pat);
35583 emit_insn (pat);
35584 return target;
35587 /* Subroutine of ix86_expand_builtin to take care of special insns
35588 with variable number of operands. */
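/* Roughly: each function type below selects NARGS, whether the builtin is a
   load or a store (KLASS), which operand is the memory reference (MEMORY),
   and whether the underlying instruction requires strictly aligned memory
   (ALIGNED_MEM).  */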
35590 static rtx
35591 ix86_expand_special_args_builtin (const struct builtin_description *d,
35592 tree exp, rtx target)
35594 tree arg;
35595 rtx pat, op;
35596 unsigned int i, nargs, arg_adjust, memory;
35597 bool aligned_mem = false;
35598 struct
35600 rtx op;
35601 machine_mode mode;
35602 } args[3];
35603 enum insn_code icode = d->icode;
35604 bool last_arg_constant = false;
35605 const struct insn_data_d *insn_p = &insn_data[icode];
35606 machine_mode tmode = insn_p->operand[0].mode;
35607 enum { load, store } klass;
35609 switch ((enum ix86_builtin_func_type) d->flag)
35611 case VOID_FTYPE_VOID:
35612 emit_insn (GEN_FCN (icode) (target));
35613 return 0;
35614 case VOID_FTYPE_UINT64:
35615 case VOID_FTYPE_UNSIGNED:
35616 nargs = 0;
35617 klass = store;
35618 memory = 0;
35619 break;
35621 case INT_FTYPE_VOID:
35622 case USHORT_FTYPE_VOID:
35623 case UINT64_FTYPE_VOID:
35624 case UNSIGNED_FTYPE_VOID:
35625 nargs = 0;
35626 klass = load;
35627 memory = 0;
35628 break;
35629 case UINT64_FTYPE_PUNSIGNED:
35630 case V2DI_FTYPE_PV2DI:
35631 case V4DI_FTYPE_PV4DI:
35632 case V32QI_FTYPE_PCCHAR:
35633 case V16QI_FTYPE_PCCHAR:
35634 case V8SF_FTYPE_PCV4SF:
35635 case V8SF_FTYPE_PCFLOAT:
35636 case V4SF_FTYPE_PCFLOAT:
35637 case V4DF_FTYPE_PCV2DF:
35638 case V4DF_FTYPE_PCDOUBLE:
35639 case V2DF_FTYPE_PCDOUBLE:
35640 case VOID_FTYPE_PVOID:
35641 case V8DI_FTYPE_PV8DI:
35642 nargs = 1;
35643 klass = load;
35644 memory = 0;
35645 switch (icode)
35647 case CODE_FOR_sse4_1_movntdqa:
35648 case CODE_FOR_avx2_movntdqa:
35649 case CODE_FOR_avx512f_movntdqa:
35650 aligned_mem = true;
35651 break;
35652 default:
35653 break;
35655 break;
35656 case VOID_FTYPE_PV2SF_V4SF:
35657 case VOID_FTYPE_PV8DI_V8DI:
35658 case VOID_FTYPE_PV4DI_V4DI:
35659 case VOID_FTYPE_PV2DI_V2DI:
35660 case VOID_FTYPE_PCHAR_V32QI:
35661 case VOID_FTYPE_PCHAR_V16QI:
35662 case VOID_FTYPE_PFLOAT_V16SF:
35663 case VOID_FTYPE_PFLOAT_V8SF:
35664 case VOID_FTYPE_PFLOAT_V4SF:
35665 case VOID_FTYPE_PDOUBLE_V8DF:
35666 case VOID_FTYPE_PDOUBLE_V4DF:
35667 case VOID_FTYPE_PDOUBLE_V2DF:
35668 case VOID_FTYPE_PLONGLONG_LONGLONG:
35669 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35670 case VOID_FTYPE_PINT_INT:
35671 nargs = 1;
35672 klass = store;
35673 /* Reserve memory operand for target. */
35674 memory = ARRAY_SIZE (args);
35675 switch (icode)
35677 /* These builtins and instructions require the memory
35678 to be properly aligned. */
35679 case CODE_FOR_avx_movntv4di:
35680 case CODE_FOR_sse2_movntv2di:
35681 case CODE_FOR_avx_movntv8sf:
35682 case CODE_FOR_sse_movntv4sf:
35683 case CODE_FOR_sse4a_vmmovntv4sf:
35684 case CODE_FOR_avx_movntv4df:
35685 case CODE_FOR_sse2_movntv2df:
35686 case CODE_FOR_sse4a_vmmovntv2df:
35687 case CODE_FOR_sse2_movntidi:
35688 case CODE_FOR_sse_movntq:
35689 case CODE_FOR_sse2_movntisi:
35690 case CODE_FOR_avx512f_movntv16sf:
35691 case CODE_FOR_avx512f_movntv8df:
35692 case CODE_FOR_avx512f_movntv8di:
35693 aligned_mem = true;
35694 break;
35695 default:
35696 break;
35698 break;
35699 case V4SF_FTYPE_V4SF_PCV2SF:
35700 case V2DF_FTYPE_V2DF_PCDOUBLE:
35701 nargs = 2;
35702 klass = load;
35703 memory = 1;
35704 break;
35705 case V8SF_FTYPE_PCV8SF_V8SI:
35706 case V4DF_FTYPE_PCV4DF_V4DI:
35707 case V4SF_FTYPE_PCV4SF_V4SI:
35708 case V2DF_FTYPE_PCV2DF_V2DI:
35709 case V8SI_FTYPE_PCV8SI_V8SI:
35710 case V4DI_FTYPE_PCV4DI_V4DI:
35711 case V4SI_FTYPE_PCV4SI_V4SI:
35712 case V2DI_FTYPE_PCV2DI_V2DI:
35713 nargs = 2;
35714 klass = load;
35715 memory = 0;
35716 break;
35717 case VOID_FTYPE_PV8DF_V8DF_UQI:
35718 case VOID_FTYPE_PV4DF_V4DF_UQI:
35719 case VOID_FTYPE_PV2DF_V2DF_UQI:
35720 case VOID_FTYPE_PV16SF_V16SF_UHI:
35721 case VOID_FTYPE_PV8SF_V8SF_UQI:
35722 case VOID_FTYPE_PV4SF_V4SF_UQI:
35723 case VOID_FTYPE_PV8DI_V8DI_UQI:
35724 case VOID_FTYPE_PV4DI_V4DI_UQI:
35725 case VOID_FTYPE_PV2DI_V2DI_UQI:
35726 case VOID_FTYPE_PV16SI_V16SI_UHI:
35727 case VOID_FTYPE_PV8SI_V8SI_UQI:
35728 case VOID_FTYPE_PV4SI_V4SI_UQI:
35729 switch (icode)
35731 /* These builtins and instructions require the memory
35732 to be properly aligned. */
35733 case CODE_FOR_avx512f_storev16sf_mask:
35734 case CODE_FOR_avx512f_storev16si_mask:
35735 case CODE_FOR_avx512f_storev8df_mask:
35736 case CODE_FOR_avx512f_storev8di_mask:
35737 case CODE_FOR_avx512vl_storev8sf_mask:
35738 case CODE_FOR_avx512vl_storev8si_mask:
35739 case CODE_FOR_avx512vl_storev4df_mask:
35740 case CODE_FOR_avx512vl_storev4di_mask:
35741 case CODE_FOR_avx512vl_storev4sf_mask:
35742 case CODE_FOR_avx512vl_storev4si_mask:
35743 case CODE_FOR_avx512vl_storev2df_mask:
35744 case CODE_FOR_avx512vl_storev2di_mask:
35745 aligned_mem = true;
35746 break;
35747 default:
35748 break;
35750 /* FALLTHRU */
35751 case VOID_FTYPE_PV8SF_V8SI_V8SF:
35752 case VOID_FTYPE_PV4DF_V4DI_V4DF:
35753 case VOID_FTYPE_PV4SF_V4SI_V4SF:
35754 case VOID_FTYPE_PV2DF_V2DI_V2DF:
35755 case VOID_FTYPE_PV8SI_V8SI_V8SI:
35756 case VOID_FTYPE_PV4DI_V4DI_V4DI:
35757 case VOID_FTYPE_PV4SI_V4SI_V4SI:
35758 case VOID_FTYPE_PV2DI_V2DI_V2DI:
35759 case VOID_FTYPE_PV8SI_V8DI_UQI:
35760 case VOID_FTYPE_PV8HI_V8DI_UQI:
35761 case VOID_FTYPE_PV16HI_V16SI_UHI:
35762 case VOID_FTYPE_PV16QI_V8DI_UQI:
35763 case VOID_FTYPE_PV16QI_V16SI_UHI:
35764 case VOID_FTYPE_PV4SI_V4DI_UQI:
35765 case VOID_FTYPE_PV4SI_V2DI_UQI:
35766 case VOID_FTYPE_PV8HI_V4DI_UQI:
35767 case VOID_FTYPE_PV8HI_V2DI_UQI:
35768 case VOID_FTYPE_PV8HI_V8SI_UQI:
35769 case VOID_FTYPE_PV8HI_V4SI_UQI:
35770 case VOID_FTYPE_PV16QI_V4DI_UQI:
35771 case VOID_FTYPE_PV16QI_V2DI_UQI:
35772 case VOID_FTYPE_PV16QI_V8SI_UQI:
35773 case VOID_FTYPE_PV16QI_V4SI_UQI:
35774 case VOID_FTYPE_PCHAR_V64QI_UDI:
35775 case VOID_FTYPE_PCHAR_V32QI_USI:
35776 case VOID_FTYPE_PCHAR_V16QI_UHI:
35777 case VOID_FTYPE_PSHORT_V32HI_USI:
35778 case VOID_FTYPE_PSHORT_V16HI_UHI:
35779 case VOID_FTYPE_PSHORT_V8HI_UQI:
35780 case VOID_FTYPE_PINT_V16SI_UHI:
35781 case VOID_FTYPE_PINT_V8SI_UQI:
35782 case VOID_FTYPE_PINT_V4SI_UQI:
35783 case VOID_FTYPE_PINT64_V8DI_UQI:
35784 case VOID_FTYPE_PINT64_V4DI_UQI:
35785 case VOID_FTYPE_PINT64_V2DI_UQI:
35786 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
35787 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
35788 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
35789 case VOID_FTYPE_PFLOAT_V16SF_UHI:
35790 case VOID_FTYPE_PFLOAT_V8SF_UQI:
35791 case VOID_FTYPE_PFLOAT_V4SF_UQI:
35792 nargs = 2;
35793 klass = store;
35794 /* Reserve memory operand for target. */
35795 memory = ARRAY_SIZE (args);
35796 break;
35797 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
35798 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
35799 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
35800 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
35801 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
35802 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
35803 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
35804 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
35805 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
35806 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
35807 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
35808 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
35809 switch (icode)
35811 /* These builtins and instructions require the memory
35812 to be properly aligned. */
35813 case CODE_FOR_avx512f_loadv16sf_mask:
35814 case CODE_FOR_avx512f_loadv16si_mask:
35815 case CODE_FOR_avx512f_loadv8df_mask:
35816 case CODE_FOR_avx512f_loadv8di_mask:
35817 case CODE_FOR_avx512vl_loadv8sf_mask:
35818 case CODE_FOR_avx512vl_loadv8si_mask:
35819 case CODE_FOR_avx512vl_loadv4df_mask:
35820 case CODE_FOR_avx512vl_loadv4di_mask:
35821 case CODE_FOR_avx512vl_loadv4sf_mask:
35822 case CODE_FOR_avx512vl_loadv4si_mask:
35823 case CODE_FOR_avx512vl_loadv2df_mask:
35824 case CODE_FOR_avx512vl_loadv2di_mask:
35825 case CODE_FOR_avx512bw_loadv64qi_mask:
35826 case CODE_FOR_avx512vl_loadv32qi_mask:
35827 case CODE_FOR_avx512vl_loadv16qi_mask:
35828 case CODE_FOR_avx512bw_loadv32hi_mask:
35829 case CODE_FOR_avx512vl_loadv16hi_mask:
35830 case CODE_FOR_avx512vl_loadv8hi_mask:
35831 aligned_mem = true;
35832 break;
35833 default:
35834 break;
35836 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35837 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35838 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35839 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35840 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35841 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35842 case V16SI_FTYPE_PCINT_V16SI_UHI:
35843 case V8SI_FTYPE_PCINT_V8SI_UQI:
35844 case V4SI_FTYPE_PCINT_V4SI_UQI:
35845 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35846 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35847 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35848 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35849 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35850 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35851 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35852 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35853 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35854 nargs = 3;
35855 klass = load;
35856 memory = 0;
35857 break;
35858 case VOID_FTYPE_UINT_UINT_UINT:
35859 case VOID_FTYPE_UINT64_UINT_UINT:
35860 case UCHAR_FTYPE_UINT_UINT_UINT:
35861 case UCHAR_FTYPE_UINT64_UINT_UINT:
35862 nargs = 3;
35863 klass = load;
35864 memory = ARRAY_SIZE (args);
35865 last_arg_constant = true;
35866 break;
35867 default:
35868 gcc_unreachable ();
35871 gcc_assert (nargs <= ARRAY_SIZE (args));
35873 if (klass == store)
35875 arg = CALL_EXPR_ARG (exp, 0);
35876 op = expand_normal (arg);
35877 gcc_assert (target == 0);
35878 if (memory)
35880 op = ix86_zero_extend_to_Pmode (op);
35881 target = gen_rtx_MEM (tmode, op);
35882 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35883 on it. Try to improve it using get_pointer_alignment,
35884 and if the special builtin is one that requires strict
35885 mode alignment, also from its GET_MODE_ALIGNMENT.
35886 Failure to do so could lead to ix86_legitimate_combined_insn
35887 rejecting all changes to such insns. */
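/* For instance, in the CODE_FOR_avx512f_movntv16sf case above tmode is
   V16SFmode, so ALIGNED_MEM raises the alignment to
   GET_MODE_ALIGNMENT (V16SFmode) when get_pointer_alignment cannot prove
   that much on its own.  */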
35888 unsigned int align = get_pointer_alignment (arg);
35889 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35890 align = GET_MODE_ALIGNMENT (tmode);
35891 if (MEM_ALIGN (target) < align)
35892 set_mem_align (target, align);
35894 else
35895 target = force_reg (tmode, op);
35896 arg_adjust = 1;
35898 else
35900 arg_adjust = 0;
35901 if (optimize
35902 || target == 0
35903 || !register_operand (target, tmode)
35904 || GET_MODE (target) != tmode)
35905 target = gen_reg_rtx (tmode);
35908 for (i = 0; i < nargs; i++)
35910 machine_mode mode = insn_p->operand[i + 1].mode;
35911 bool match;
35913 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35914 op = expand_normal (arg);
35915 match = insn_p->operand[i + 1].predicate (op, mode);
35917 if (last_arg_constant && (i + 1) == nargs)
35919 if (!match)
35921 if (icode == CODE_FOR_lwp_lwpvalsi3
35922 || icode == CODE_FOR_lwp_lwpinssi3
35923 || icode == CODE_FOR_lwp_lwpvaldi3
35924 || icode == CODE_FOR_lwp_lwpinsdi3)
35925 error ("the last argument must be a 32-bit immediate");
35926 else
35927 error ("the last argument must be an 8-bit immediate");
35928 return const0_rtx;
35931 else
35933 if (i == memory)
35935 /* This must be the memory operand. */
35936 op = ix86_zero_extend_to_Pmode (op);
35937 op = gen_rtx_MEM (mode, op);
35938 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35939 on it. Try to improve it using get_pointer_alignment,
35940 and if the special builtin is one that requires strict
35941 mode alignment, also from its GET_MODE_ALIGNMENT.
35942 Failure to do so could lead to ix86_legitimate_combined_insn
35943 rejecting all changes to such insns. */
35944 unsigned int align = get_pointer_alignment (arg);
35945 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35946 align = GET_MODE_ALIGNMENT (mode);
35947 if (MEM_ALIGN (op) < align)
35948 set_mem_align (op, align);
35950 else
35952 /* This must be a register. */
35953 if (VECTOR_MODE_P (mode))
35954 op = safe_vector_operand (op, mode);
35956 op = fixup_modeless_constant (op, mode);
35958 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35959 op = copy_to_mode_reg (mode, op);
35960 else
35962 op = copy_to_reg (op);
35963 op = lowpart_subreg (mode, op, GET_MODE (op));
35968 args[i].op = op;
35969 args[i].mode = mode;
35972 switch (nargs)
35974 case 0:
35975 pat = GEN_FCN (icode) (target);
35976 break;
35977 case 1:
35978 pat = GEN_FCN (icode) (target, args[0].op);
35979 break;
35980 case 2:
35981 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35982 break;
35983 case 3:
35984 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35985 break;
35986 default:
35987 gcc_unreachable ();
35990 if (! pat)
35991 return 0;
35992 emit_insn (pat);
35993 return klass == store ? 0 : target;
35996 /* Return the integer constant in ARG. Constrain it to be in the range
35997 of the subparts of VEC_TYPE; issue an error if not. */
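/* For example, for a V4SF vector TYPE_VECTOR_SUBPARTS is 4, so MAX is 3 and
   any selector outside 0..3 is diagnosed and treated as 0.  */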
35999 static int
36000 get_element_number (tree vec_type, tree arg)
36002 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36004 if (!tree_fits_uhwi_p (arg)
36005 || (elt = tree_to_uhwi (arg), elt > max))
36007 error ("selector must be an integer constant in the range 0..%wi", max);
36008 return 0;
36011 return elt;
36014 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36015 ix86_expand_vector_init. We DO have language-level syntax for this, in
36016 the form of (type){ init-list }. Except that since we can't place emms
36017 instructions from inside the compiler, we can't allow the use of MMX
36018 registers unless the user explicitly asks for it. So we do *not* define
36019 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36020 we have builtins invoked by mmintrin.h that give us license to emit
36021 these sorts of instructions. */
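/* For example, the _mm_set_pi32/_mm_set_pi16/_mm_set_pi8 style helpers in
   mmintrin.h are typically implemented on top of these vec_init builtins
   rather than on vector initializer syntax.  */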
36023 static rtx
36024 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36026 machine_mode tmode = TYPE_MODE (type);
36027 machine_mode inner_mode = GET_MODE_INNER (tmode);
36028 int i, n_elt = GET_MODE_NUNITS (tmode);
36029 rtvec v = rtvec_alloc (n_elt);
36031 gcc_assert (VECTOR_MODE_P (tmode));
36032 gcc_assert (call_expr_nargs (exp) == n_elt);
36034 for (i = 0; i < n_elt; ++i)
36036 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36037 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36040 if (!target || !register_operand (target, tmode))
36041 target = gen_reg_rtx (tmode);
36043 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36044 return target;
36047 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36048 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36049 had a language-level syntax for referencing vector elements. */
36051 static rtx
36052 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36054 machine_mode tmode, mode0;
36055 tree arg0, arg1;
36056 int elt;
36057 rtx op0;
36059 arg0 = CALL_EXPR_ARG (exp, 0);
36060 arg1 = CALL_EXPR_ARG (exp, 1);
36062 op0 = expand_normal (arg0);
36063 elt = get_element_number (TREE_TYPE (arg0), arg1);
36065 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36066 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36067 gcc_assert (VECTOR_MODE_P (mode0));
36069 op0 = force_reg (mode0, op0);
36071 if (optimize || !target || !register_operand (target, tmode))
36072 target = gen_reg_rtx (tmode);
36074 ix86_expand_vector_extract (true, target, op0, elt);
36076 return target;
36079 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36080 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36081 a language-level syntax for referencing vector elements. */
36083 static rtx
36084 ix86_expand_vec_set_builtin (tree exp)
36086 machine_mode tmode, mode1;
36087 tree arg0, arg1, arg2;
36088 int elt;
36089 rtx op0, op1, target;
36091 arg0 = CALL_EXPR_ARG (exp, 0);
36092 arg1 = CALL_EXPR_ARG (exp, 1);
36093 arg2 = CALL_EXPR_ARG (exp, 2);
36095 tmode = TYPE_MODE (TREE_TYPE (arg0));
36096 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36097 gcc_assert (VECTOR_MODE_P (tmode));
36099 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36100 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36101 elt = get_element_number (TREE_TYPE (arg0), arg2);
36103 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36104 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36106 op0 = force_reg (tmode, op0);
36107 op1 = force_reg (mode1, op1);
36109 /* OP0 is the source operand of these builtin functions and shouldn't be
36110 modified. Create a copy, use it, and return the copy as the target. */
36111 target = gen_reg_rtx (tmode);
36112 emit_move_insn (target, op0);
36113 ix86_expand_vector_set (true, target, op1, elt);
36115 return target;
36118 /* Emit conditional move of SRC to DST with condition
36119 OP1 CODE OP2. */
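/* On targets without cmov, the fallback below compares and branches around
   a plain move using the reversed condition, so DST is overwritten only
   when OP1 CODE OP2 holds.  */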
36120 static void
36121 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36123 rtx t;
36125 if (TARGET_CMOVE)
36127 t = ix86_expand_compare (code, op1, op2);
36128 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36129 src, dst)));
36131 else
36133 rtx_code_label *nomove = gen_label_rtx ();
36134 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36135 const0_rtx, GET_MODE (op1), 1, nomove);
36136 emit_move_insn (dst, src);
36137 emit_label (nomove);
36141 /* Choose the unsigned maximum of DST and SRC and put it in DST. */
36142 static void
36143 ix86_emit_move_max (rtx dst, rtx src)
36145 ix86_emit_cmove (dst, src, LTU, dst, src);
36148 /* Expand an expression EXP that calls a built-in function,
36149 with result going to TARGET if that's convenient
36150 (and in mode MODE if that's convenient).
36151 SUBTARGET may be used as the target for computing one of EXP's operands.
36152 IGNORE is nonzero if the value is to be ignored. */
36154 static rtx
36155 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36156 machine_mode mode, int ignore)
36158 size_t i;
36159 enum insn_code icode;
36160 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36161 tree arg0, arg1, arg2, arg3, arg4;
36162 rtx op0, op1, op2, op3, op4, pat, insn;
36163 machine_mode mode0, mode1, mode2, mode3, mode4;
36164 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36166 /* For CPU builtins that can be folded, fold first and expand the fold. */
36167 switch (fcode)
36169 case IX86_BUILTIN_CPU_INIT:
36171 /* Make it call __cpu_indicator_init in libgcc. */
36172 tree call_expr, fndecl, type;
36173 type = build_function_type_list (integer_type_node, NULL_TREE);
36174 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36175 call_expr = build_call_expr (fndecl, 0);
36176 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36178 case IX86_BUILTIN_CPU_IS:
36179 case IX86_BUILTIN_CPU_SUPPORTS:
36181 tree arg0 = CALL_EXPR_ARG (exp, 0);
36182 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36183 gcc_assert (fold_expr != NULL_TREE);
36184 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36188 /* Determine whether the builtin function is available under the current ISA.
36189 Originally the builtin was not created if it wasn't applicable to the
36190 current ISA based on the command line switches. With function specific
36191 options, we need to check in the context of the function making the call
36192 whether it is supported. */
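/* For example, a function compiled with __attribute__((target ("avx2")))
   may use AVX2 builtins even if -mavx2 was not given globally; conversely,
   if the required ISA bits are not enabled for the current function we
   diagnose it and fall back to an ordinary call below.  */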
36193 if (ix86_builtins_isa[fcode].isa
36194 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
36196 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, 0,
36197 NULL, NULL, (enum fpmath_unit) 0,
36198 false);
36199 if (!opts)
36200 error ("%qE needs unknown isa option", fndecl);
36201 else
36203 gcc_assert (opts != NULL);
36204 error ("%qE needs isa option %s", fndecl, opts);
36205 free (opts);
36207 return expand_call (exp, target, ignore);
36210 switch (fcode)
36212 case IX86_BUILTIN_BNDMK:
36213 if (!target
36214 || GET_MODE (target) != BNDmode
36215 || !register_operand (target, BNDmode))
36216 target = gen_reg_rtx (BNDmode);
36218 arg0 = CALL_EXPR_ARG (exp, 0);
36219 arg1 = CALL_EXPR_ARG (exp, 1);
36221 op0 = expand_normal (arg0);
36222 op1 = expand_normal (arg1);
36224 if (!register_operand (op0, Pmode))
36225 op0 = ix86_zero_extend_to_Pmode (op0);
36226 if (!register_operand (op1, Pmode))
36227 op1 = ix86_zero_extend_to_Pmode (op1);
36229 /* Builtin arg1 is the size of the block, but the instruction's op1 should
36230 be (size - 1). */
36231 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36232 NULL_RTX, 1, OPTAB_DIRECT);
36234 emit_insn (BNDmode == BND64mode
36235 ? gen_bnd64_mk (target, op0, op1)
36236 : gen_bnd32_mk (target, op0, op1));
36237 return target;
36239 case IX86_BUILTIN_BNDSTX:
36240 arg0 = CALL_EXPR_ARG (exp, 0);
36241 arg1 = CALL_EXPR_ARG (exp, 1);
36242 arg2 = CALL_EXPR_ARG (exp, 2);
36244 op0 = expand_normal (arg0);
36245 op1 = expand_normal (arg1);
36246 op2 = expand_normal (arg2);
36248 if (!register_operand (op0, Pmode))
36249 op0 = ix86_zero_extend_to_Pmode (op0);
36250 if (!register_operand (op1, BNDmode))
36251 op1 = copy_to_mode_reg (BNDmode, op1);
36252 if (!register_operand (op2, Pmode))
36253 op2 = ix86_zero_extend_to_Pmode (op2);
36255 emit_insn (BNDmode == BND64mode
36256 ? gen_bnd64_stx (op2, op0, op1)
36257 : gen_bnd32_stx (op2, op0, op1));
36258 return 0;
36260 case IX86_BUILTIN_BNDLDX:
36261 if (!target
36262 || GET_MODE (target) != BNDmode
36263 || !register_operand (target, BNDmode))
36264 target = gen_reg_rtx (BNDmode);
36266 arg0 = CALL_EXPR_ARG (exp, 0);
36267 arg1 = CALL_EXPR_ARG (exp, 1);
36269 op0 = expand_normal (arg0);
36270 op1 = expand_normal (arg1);
36272 if (!register_operand (op0, Pmode))
36273 op0 = ix86_zero_extend_to_Pmode (op0);
36274 if (!register_operand (op1, Pmode))
36275 op1 = ix86_zero_extend_to_Pmode (op1);
36277 emit_insn (BNDmode == BND64mode
36278 ? gen_bnd64_ldx (target, op0, op1)
36279 : gen_bnd32_ldx (target, op0, op1));
36280 return target;
36282 case IX86_BUILTIN_BNDCL:
36283 arg0 = CALL_EXPR_ARG (exp, 0);
36284 arg1 = CALL_EXPR_ARG (exp, 1);
36286 op0 = expand_normal (arg0);
36287 op1 = expand_normal (arg1);
36289 if (!register_operand (op0, Pmode))
36290 op0 = ix86_zero_extend_to_Pmode (op0);
36291 if (!register_operand (op1, BNDmode))
36292 op1 = copy_to_mode_reg (BNDmode, op1);
36294 emit_insn (BNDmode == BND64mode
36295 ? gen_bnd64_cl (op1, op0)
36296 : gen_bnd32_cl (op1, op0));
36297 return 0;
36299 case IX86_BUILTIN_BNDCU:
36300 arg0 = CALL_EXPR_ARG (exp, 0);
36301 arg1 = CALL_EXPR_ARG (exp, 1);
36303 op0 = expand_normal (arg0);
36304 op1 = expand_normal (arg1);
36306 if (!register_operand (op0, Pmode))
36307 op0 = ix86_zero_extend_to_Pmode (op0);
36308 if (!register_operand (op1, BNDmode))
36309 op1 = copy_to_mode_reg (BNDmode, op1);
36311 emit_insn (BNDmode == BND64mode
36312 ? gen_bnd64_cu (op1, op0)
36313 : gen_bnd32_cu (op1, op0));
36314 return 0;
36316 case IX86_BUILTIN_BNDRET:
36317 arg0 = CALL_EXPR_ARG (exp, 0);
36318 gcc_assert (TREE_CODE (arg0) == SSA_NAME);
36319 target = chkp_get_rtl_bounds (arg0);
36321 /* If no bounds were specified for the returned value,
36322 then use INIT bounds. This usually happens when
36323 some built-in function is expanded. */
36324 if (!target)
36326 rtx t1 = gen_reg_rtx (Pmode);
36327 rtx t2 = gen_reg_rtx (Pmode);
36328 target = gen_reg_rtx (BNDmode);
36329 emit_move_insn (t1, const0_rtx);
36330 emit_move_insn (t2, constm1_rtx);
36331 emit_insn (BNDmode == BND64mode
36332 ? gen_bnd64_mk (target, t1, t2)
36333 : gen_bnd32_mk (target, t1, t2));
36336 gcc_assert (target && REG_P (target));
36337 return target;
36339 case IX86_BUILTIN_BNDNARROW:
36341 rtx m1, m1h1, m1h2, lb, ub, t1;
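/* Narrowing intersects the incoming bounds with [lb, lb + size - 1]: both
   the lower bound and the one's-complement upper bound are updated with an
   unsigned max (ix86_emit_move_max) while the bound halves are accessed
   through a memory slot.  */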
36343 /* Return value and lb. */
36344 arg0 = CALL_EXPR_ARG (exp, 0);
36345 /* Bounds. */
36346 arg1 = CALL_EXPR_ARG (exp, 1);
36347 /* Size. */
36348 arg2 = CALL_EXPR_ARG (exp, 2);
36350 lb = expand_normal (arg0);
36351 op1 = expand_normal (arg1);
36352 op2 = expand_normal (arg2);
36354 /* The size was passed, but we need to use (size - 1), as for bndmk. */
36355 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36356 NULL_RTX, 1, OPTAB_DIRECT);
36358 /* Add LB to the size and invert to get UB. */
36359 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36360 op2, 1, OPTAB_DIRECT);
36361 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36363 if (!register_operand (lb, Pmode))
36364 lb = ix86_zero_extend_to_Pmode (lb);
36365 if (!register_operand (ub, Pmode))
36366 ub = ix86_zero_extend_to_Pmode (ub);
36368 /* We need to move bounds to memory before any computations. */
36369 if (MEM_P (op1))
36370 m1 = op1;
36371 else
36373 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36374 emit_move_insn (m1, op1);
36377 /* Generate mem expression to be used for access to LB and UB. */
36378 m1h1 = adjust_address (m1, Pmode, 0);
36379 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36381 t1 = gen_reg_rtx (Pmode);
36383 /* Compute LB. */
36384 emit_move_insn (t1, m1h1);
36385 ix86_emit_move_max (t1, lb);
36386 emit_move_insn (m1h1, t1);
36388 /* Compute UB. UB is stored in 1's complement form. Therefore
36389 we also use max here. */
36390 emit_move_insn (t1, m1h2);
36391 ix86_emit_move_max (t1, ub);
36392 emit_move_insn (m1h2, t1);
36394 op2 = gen_reg_rtx (BNDmode);
36395 emit_move_insn (op2, m1);
36397 return chkp_join_splitted_slot (lb, op2);
36400 case IX86_BUILTIN_BNDINT:
36402 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36404 if (!target
36405 || GET_MODE (target) != BNDmode
36406 || !register_operand (target, BNDmode))
36407 target = gen_reg_rtx (BNDmode);
36409 arg0 = CALL_EXPR_ARG (exp, 0);
36410 arg1 = CALL_EXPR_ARG (exp, 1);
36412 op0 = expand_normal (arg0);
36413 op1 = expand_normal (arg1);
36415 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36416 rh1 = adjust_address (res, Pmode, 0);
36417 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36419 /* Put the first bounds into temporaries. */
36420 lb1 = gen_reg_rtx (Pmode);
36421 ub1 = gen_reg_rtx (Pmode);
36422 if (MEM_P (op0))
36424 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36425 emit_move_insn (ub1, adjust_address (op0, Pmode,
36426 GET_MODE_SIZE (Pmode)));
36428 else
36430 emit_move_insn (res, op0);
36431 emit_move_insn (lb1, rh1);
36432 emit_move_insn (ub1, rh2);
36435 /* Put the second bounds into temporaries. */
36436 lb2 = gen_reg_rtx (Pmode);
36437 ub2 = gen_reg_rtx (Pmode);
36438 if (MEM_P (op1))
36440 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36441 emit_move_insn (ub2, adjust_address (op1, Pmode,
36442 GET_MODE_SIZE (Pmode)));
36444 else
36446 emit_move_insn (res, op1);
36447 emit_move_insn (lb2, rh1);
36448 emit_move_insn (ub2, rh2);
36451 /* Compute LB. */
36452 ix86_emit_move_max (lb1, lb2);
36453 emit_move_insn (rh1, lb1);
36455 /* Compute UB. UB is stored in 1's complement form. Therefore
36456 we also use max here. */
36457 ix86_emit_move_max (ub1, ub2);
36458 emit_move_insn (rh2, ub1);
36460 emit_move_insn (target, res);
36462 return target;
36465 case IX86_BUILTIN_SIZEOF:
36467 tree name;
36468 rtx symbol;
36470 if (!target
36471 || GET_MODE (target) != Pmode
36472 || !register_operand (target, Pmode))
36473 target = gen_reg_rtx (Pmode);
36475 arg0 = CALL_EXPR_ARG (exp, 0);
36476 gcc_assert (TREE_CODE (arg0) == VAR_DECL);
36478 name = DECL_ASSEMBLER_NAME (arg0);
36479 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36481 emit_insn (Pmode == SImode
36482 ? gen_move_size_reloc_si (target, symbol)
36483 : gen_move_size_reloc_di (target, symbol));
36485 return target;
36488 case IX86_BUILTIN_BNDLOWER:
36490 rtx mem, hmem;
36492 if (!target
36493 || GET_MODE (target) != Pmode
36494 || !register_operand (target, Pmode))
36495 target = gen_reg_rtx (Pmode);
36497 arg0 = CALL_EXPR_ARG (exp, 0);
36498 op0 = expand_normal (arg0);
36500 /* We need to move bounds to memory first. */
36501 if (MEM_P (op0))
36502 mem = op0;
36503 else
36505 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36506 emit_move_insn (mem, op0);
36509 /* Generate mem expression to access LB and load it. */
36510 hmem = adjust_address (mem, Pmode, 0);
36511 emit_move_insn (target, hmem);
36513 return target;
36516 case IX86_BUILTIN_BNDUPPER:
36518 rtx mem, hmem, res;
36520 if (!target
36521 || GET_MODE (target) != Pmode
36522 || !register_operand (target, Pmode))
36523 target = gen_reg_rtx (Pmode);
36525 arg0 = CALL_EXPR_ARG (exp, 0);
36526 op0 = expand_normal (arg0);
36528 /* We need to move bounds to memory first. */
36529 if (MEM_P (op0))
36530 mem = op0;
36531 else
36533 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36534 emit_move_insn (mem, op0);
36537 /* Generate mem expression to access UB. */
36538 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36540 /* We need to invert all bits of UB. */
36541 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36543 if (res != target)
36544 emit_move_insn (target, res);
36546 return target;
36549 case IX86_BUILTIN_MASKMOVQ:
36550 case IX86_BUILTIN_MASKMOVDQU:
36551 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36552 ? CODE_FOR_mmx_maskmovq
36553 : CODE_FOR_sse2_maskmovdqu);
36554 /* Note the arg order is different from the operand order. */
36555 arg1 = CALL_EXPR_ARG (exp, 0);
36556 arg2 = CALL_EXPR_ARG (exp, 1);
36557 arg0 = CALL_EXPR_ARG (exp, 2);
36558 op0 = expand_normal (arg0);
36559 op1 = expand_normal (arg1);
36560 op2 = expand_normal (arg2);
36561 mode0 = insn_data[icode].operand[0].mode;
36562 mode1 = insn_data[icode].operand[1].mode;
36563 mode2 = insn_data[icode].operand[2].mode;
36565 op0 = ix86_zero_extend_to_Pmode (op0);
36566 op0 = gen_rtx_MEM (mode1, op0);
36568 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36569 op0 = copy_to_mode_reg (mode0, op0);
36570 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36571 op1 = copy_to_mode_reg (mode1, op1);
36572 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36573 op2 = copy_to_mode_reg (mode2, op2);
36574 pat = GEN_FCN (icode) (op0, op1, op2);
36575 if (! pat)
36576 return 0;
36577 emit_insn (pat);
36578 return 0;
36580 case IX86_BUILTIN_LDMXCSR:
36581 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
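/* ldmxcsr takes only a 32-bit memory operand, so the value is spilled to a
   stack temporary first; the stmxcsr case below works the other way
   round.  */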
36582 target = assign_386_stack_local (SImode, SLOT_TEMP);
36583 emit_move_insn (target, op0);
36584 emit_insn (gen_sse_ldmxcsr (target));
36585 return 0;
36587 case IX86_BUILTIN_STMXCSR:
36588 target = assign_386_stack_local (SImode, SLOT_TEMP);
36589 emit_insn (gen_sse_stmxcsr (target));
36590 return copy_to_mode_reg (SImode, target);
36592 case IX86_BUILTIN_CLFLUSH:
36593 arg0 = CALL_EXPR_ARG (exp, 0);
36594 op0 = expand_normal (arg0);
36595 icode = CODE_FOR_sse2_clflush;
36596 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36597 op0 = ix86_zero_extend_to_Pmode (op0);
36599 emit_insn (gen_sse2_clflush (op0));
36600 return 0;
36602 case IX86_BUILTIN_CLWB:
36603 arg0 = CALL_EXPR_ARG (exp, 0);
36604 op0 = expand_normal (arg0);
36605 icode = CODE_FOR_clwb;
36606 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36607 op0 = ix86_zero_extend_to_Pmode (op0);
36609 emit_insn (gen_clwb (op0));
36610 return 0;
36612 case IX86_BUILTIN_CLFLUSHOPT:
36613 arg0 = CALL_EXPR_ARG (exp, 0);
36614 op0 = expand_normal (arg0);
36615 icode = CODE_FOR_clflushopt;
36616 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36617 op0 = ix86_zero_extend_to_Pmode (op0);
36619 emit_insn (gen_clflushopt (op0));
36620 return 0;
36622 case IX86_BUILTIN_MONITOR:
36623 case IX86_BUILTIN_MONITORX:
36624 arg0 = CALL_EXPR_ARG (exp, 0);
36625 arg1 = CALL_EXPR_ARG (exp, 1);
36626 arg2 = CALL_EXPR_ARG (exp, 2);
36627 op0 = expand_normal (arg0);
36628 op1 = expand_normal (arg1);
36629 op2 = expand_normal (arg2);
36630 if (!REG_P (op0))
36631 op0 = ix86_zero_extend_to_Pmode (op0);
36632 if (!REG_P (op1))
36633 op1 = copy_to_mode_reg (SImode, op1);
36634 if (!REG_P (op2))
36635 op2 = copy_to_mode_reg (SImode, op2);
36637 emit_insn (fcode == IX86_BUILTIN_MONITOR
36638 ? ix86_gen_monitor (op0, op1, op2)
36639 : ix86_gen_monitorx (op0, op1, op2));
36640 return 0;
36642 case IX86_BUILTIN_MWAIT:
36643 arg0 = CALL_EXPR_ARG (exp, 0);
36644 arg1 = CALL_EXPR_ARG (exp, 1);
36645 op0 = expand_normal (arg0);
36646 op1 = expand_normal (arg1);
36647 if (!REG_P (op0))
36648 op0 = copy_to_mode_reg (SImode, op0);
36649 if (!REG_P (op1))
36650 op1 = copy_to_mode_reg (SImode, op1);
36651 emit_insn (gen_sse3_mwait (op0, op1));
36652 return 0;
36654 case IX86_BUILTIN_MWAITX:
36655 arg0 = CALL_EXPR_ARG (exp, 0);
36656 arg1 = CALL_EXPR_ARG (exp, 1);
36657 arg2 = CALL_EXPR_ARG (exp, 2);
36658 op0 = expand_normal (arg0);
36659 op1 = expand_normal (arg1);
36660 op2 = expand_normal (arg2);
36661 if (!REG_P (op0))
36662 op0 = copy_to_mode_reg (SImode, op0);
36663 if (!REG_P (op1))
36664 op1 = copy_to_mode_reg (SImode, op1);
36665 if (!REG_P (op2))
36666 op2 = copy_to_mode_reg (SImode, op2);
36667 emit_insn (gen_mwaitx (op0, op1, op2));
36668 return 0;
36670 case IX86_BUILTIN_CLZERO:
36671 arg0 = CALL_EXPR_ARG (exp, 0);
36672 op0 = expand_normal (arg0);
36673 if (!REG_P (op0))
36674 op0 = ix86_zero_extend_to_Pmode (op0);
36675 emit_insn (ix86_gen_clzero (op0));
36676 return 0;
36678 case IX86_BUILTIN_VEC_INIT_V2SI:
36679 case IX86_BUILTIN_VEC_INIT_V4HI:
36680 case IX86_BUILTIN_VEC_INIT_V8QI:
36681 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
36683 case IX86_BUILTIN_VEC_EXT_V2DF:
36684 case IX86_BUILTIN_VEC_EXT_V2DI:
36685 case IX86_BUILTIN_VEC_EXT_V4SF:
36686 case IX86_BUILTIN_VEC_EXT_V4SI:
36687 case IX86_BUILTIN_VEC_EXT_V8HI:
36688 case IX86_BUILTIN_VEC_EXT_V2SI:
36689 case IX86_BUILTIN_VEC_EXT_V4HI:
36690 case IX86_BUILTIN_VEC_EXT_V16QI:
36691 return ix86_expand_vec_ext_builtin (exp, target);
36693 case IX86_BUILTIN_VEC_SET_V2DI:
36694 case IX86_BUILTIN_VEC_SET_V4SF:
36695 case IX86_BUILTIN_VEC_SET_V4SI:
36696 case IX86_BUILTIN_VEC_SET_V8HI:
36697 case IX86_BUILTIN_VEC_SET_V4HI:
36698 case IX86_BUILTIN_VEC_SET_V16QI:
36699 return ix86_expand_vec_set_builtin (exp);
36701 case IX86_BUILTIN_INFQ:
36702 case IX86_BUILTIN_HUGE_VALQ:
36704 REAL_VALUE_TYPE inf;
36705 rtx tmp;
36707 real_inf (&inf);
36708 tmp = const_double_from_real_value (inf, mode);
36710 tmp = validize_mem (force_const_mem (mode, tmp));
36712 if (target == 0)
36713 target = gen_reg_rtx (mode);
36715 emit_move_insn (target, tmp);
36716 return target;
36719 case IX86_BUILTIN_NANQ:
36720 case IX86_BUILTIN_NANSQ:
36721 return expand_call (exp, target, ignore);
36723 case IX86_BUILTIN_RDPMC:
36724 case IX86_BUILTIN_RDTSC:
36725 case IX86_BUILTIN_RDTSCP:
36727 op0 = gen_reg_rtx (DImode);
36728 op1 = gen_reg_rtx (DImode);
36730 if (fcode == IX86_BUILTIN_RDPMC)
36732 arg0 = CALL_EXPR_ARG (exp, 0);
36733 op2 = expand_normal (arg0);
36734 if (!register_operand (op2, SImode))
36735 op2 = copy_to_mode_reg (SImode, op2);
36737 insn = (TARGET_64BIT
36738 ? gen_rdpmc_rex64 (op0, op1, op2)
36739 : gen_rdpmc (op0, op2));
36740 emit_insn (insn);
36742 else if (fcode == IX86_BUILTIN_RDTSC)
36744 insn = (TARGET_64BIT
36745 ? gen_rdtsc_rex64 (op0, op1)
36746 : gen_rdtsc (op0));
36747 emit_insn (insn);
36749 else
36751 op2 = gen_reg_rtx (SImode);
36753 insn = (TARGET_64BIT
36754 ? gen_rdtscp_rex64 (op0, op1, op2)
36755 : gen_rdtscp (op0, op2));
36756 emit_insn (insn);
36758 arg0 = CALL_EXPR_ARG (exp, 0);
36759 op4 = expand_normal (arg0);
36760 if (!address_operand (op4, VOIDmode))
36762 op4 = convert_memory_address (Pmode, op4);
36763 op4 = copy_addr_to_reg (op4);
36765 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
36768 if (target == 0)
36770 /* mode is VOIDmode if __builtin_rd* has been called
36771 without a lhs. */
36772 if (mode == VOIDmode)
36773 return target;
36774 target = gen_reg_rtx (mode);
36777 if (TARGET_64BIT)
36779 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
36780 op1, 1, OPTAB_DIRECT);
36781 op0 = expand_simple_binop (DImode, IOR, op0, op1,
36782 op0, 1, OPTAB_DIRECT);
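/* The rex64 patterns return the 64-bit result as two zero-extended 32-bit
   halves (low in op0, high in op1); the shift and IOR just emitted glue
   them back together.  */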
36785 emit_move_insn (target, op0);
36786 return target;
36788 case IX86_BUILTIN_FXSAVE:
36789 case IX86_BUILTIN_FXRSTOR:
36790 case IX86_BUILTIN_FXSAVE64:
36791 case IX86_BUILTIN_FXRSTOR64:
36792 case IX86_BUILTIN_FNSTENV:
36793 case IX86_BUILTIN_FLDENV:
36794 mode0 = BLKmode;
36795 switch (fcode)
36797 case IX86_BUILTIN_FXSAVE:
36798 icode = CODE_FOR_fxsave;
36799 break;
36800 case IX86_BUILTIN_FXRSTOR:
36801 icode = CODE_FOR_fxrstor;
36802 break;
36803 case IX86_BUILTIN_FXSAVE64:
36804 icode = CODE_FOR_fxsave64;
36805 break;
36806 case IX86_BUILTIN_FXRSTOR64:
36807 icode = CODE_FOR_fxrstor64;
36808 break;
36809 case IX86_BUILTIN_FNSTENV:
36810 icode = CODE_FOR_fnstenv;
36811 break;
36812 case IX86_BUILTIN_FLDENV:
36813 icode = CODE_FOR_fldenv;
36814 break;
36815 default:
36816 gcc_unreachable ();
36819 arg0 = CALL_EXPR_ARG (exp, 0);
36820 op0 = expand_normal (arg0);
36822 if (!address_operand (op0, VOIDmode))
36824 op0 = convert_memory_address (Pmode, op0);
36825 op0 = copy_addr_to_reg (op0);
36827 op0 = gen_rtx_MEM (mode0, op0);
36829 pat = GEN_FCN (icode) (op0);
36830 if (pat)
36831 emit_insn (pat);
36832 return 0;
36834 case IX86_BUILTIN_XSAVE:
36835 case IX86_BUILTIN_XRSTOR:
36836 case IX86_BUILTIN_XSAVE64:
36837 case IX86_BUILTIN_XRSTOR64:
36838 case IX86_BUILTIN_XSAVEOPT:
36839 case IX86_BUILTIN_XSAVEOPT64:
36840 case IX86_BUILTIN_XSAVES:
36841 case IX86_BUILTIN_XRSTORS:
36842 case IX86_BUILTIN_XSAVES64:
36843 case IX86_BUILTIN_XRSTORS64:
36844 case IX86_BUILTIN_XSAVEC:
36845 case IX86_BUILTIN_XSAVEC64:
36846 arg0 = CALL_EXPR_ARG (exp, 0);
36847 arg1 = CALL_EXPR_ARG (exp, 1);
36848 op0 = expand_normal (arg0);
36849 op1 = expand_normal (arg1);
36851 if (!address_operand (op0, VOIDmode))
36853 op0 = convert_memory_address (Pmode, op0);
36854 op0 = copy_addr_to_reg (op0);
36856 op0 = gen_rtx_MEM (BLKmode, op0);
36858 op1 = force_reg (DImode, op1);
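/* The xsave/xrstor family takes the feature mask in EDX:EAX.  On 64-bit
   targets the DImode mask is split into its low and high SImode halves for
   the *_rex64 patterns below; the 32-bit patterns take the DImode mask
   directly.  */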
36860 if (TARGET_64BIT)
36862 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36863 NULL, 1, OPTAB_DIRECT);
36864 switch (fcode)
36866 case IX86_BUILTIN_XSAVE:
36867 icode = CODE_FOR_xsave_rex64;
36868 break;
36869 case IX86_BUILTIN_XRSTOR:
36870 icode = CODE_FOR_xrstor_rex64;
36871 break;
36872 case IX86_BUILTIN_XSAVE64:
36873 icode = CODE_FOR_xsave64;
36874 break;
36875 case IX86_BUILTIN_XRSTOR64:
36876 icode = CODE_FOR_xrstor64;
36877 break;
36878 case IX86_BUILTIN_XSAVEOPT:
36879 icode = CODE_FOR_xsaveopt_rex64;
36880 break;
36881 case IX86_BUILTIN_XSAVEOPT64:
36882 icode = CODE_FOR_xsaveopt64;
36883 break;
36884 case IX86_BUILTIN_XSAVES:
36885 icode = CODE_FOR_xsaves_rex64;
36886 break;
36887 case IX86_BUILTIN_XRSTORS:
36888 icode = CODE_FOR_xrstors_rex64;
36889 break;
36890 case IX86_BUILTIN_XSAVES64:
36891 icode = CODE_FOR_xsaves64;
36892 break;
36893 case IX86_BUILTIN_XRSTORS64:
36894 icode = CODE_FOR_xrstors64;
36895 break;
36896 case IX86_BUILTIN_XSAVEC:
36897 icode = CODE_FOR_xsavec_rex64;
36898 break;
36899 case IX86_BUILTIN_XSAVEC64:
36900 icode = CODE_FOR_xsavec64;
36901 break;
36902 default:
36903 gcc_unreachable ();
36906 op2 = gen_lowpart (SImode, op2);
36907 op1 = gen_lowpart (SImode, op1);
36908 pat = GEN_FCN (icode) (op0, op1, op2);
36910 else
36912 switch (fcode)
36914 case IX86_BUILTIN_XSAVE:
36915 icode = CODE_FOR_xsave;
36916 break;
36917 case IX86_BUILTIN_XRSTOR:
36918 icode = CODE_FOR_xrstor;
36919 break;
36920 case IX86_BUILTIN_XSAVEOPT:
36921 icode = CODE_FOR_xsaveopt;
36922 break;
36923 case IX86_BUILTIN_XSAVES:
36924 icode = CODE_FOR_xsaves;
36925 break;
36926 case IX86_BUILTIN_XRSTORS:
36927 icode = CODE_FOR_xrstors;
36928 break;
36929 case IX86_BUILTIN_XSAVEC:
36930 icode = CODE_FOR_xsavec;
36931 break;
36932 default:
36933 gcc_unreachable ();
36935 pat = GEN_FCN (icode) (op0, op1);
36938 if (pat)
36939 emit_insn (pat);
36940 return 0;
36942 case IX86_BUILTIN_LLWPCB:
36943 arg0 = CALL_EXPR_ARG (exp, 0);
36944 op0 = expand_normal (arg0);
36945 icode = CODE_FOR_lwp_llwpcb;
36946 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36947 op0 = ix86_zero_extend_to_Pmode (op0);
36948 emit_insn (gen_lwp_llwpcb (op0));
36949 return 0;
36951 case IX86_BUILTIN_SLWPCB:
36952 icode = CODE_FOR_lwp_slwpcb;
36953 if (!target
36954 || !insn_data[icode].operand[0].predicate (target, Pmode))
36955 target = gen_reg_rtx (Pmode);
36956 emit_insn (gen_lwp_slwpcb (target));
36957 return target;
36959 case IX86_BUILTIN_BEXTRI32:
36960 case IX86_BUILTIN_BEXTRI64:
36961 arg0 = CALL_EXPR_ARG (exp, 0);
36962 arg1 = CALL_EXPR_ARG (exp, 1);
36963 op0 = expand_normal (arg0);
36964 op1 = expand_normal (arg1);
36965 icode = (fcode == IX86_BUILTIN_BEXTRI32
36966 ? CODE_FOR_tbm_bextri_si
36967 : CODE_FOR_tbm_bextri_di);
36968 if (!CONST_INT_P (op1))
36970 error ("last argument must be an immediate");
36971 return const0_rtx;
36973 else
36975 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36976 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36977 op1 = GEN_INT (length);
36978 op2 = GEN_INT (lsb_index);
36979 pat = GEN_FCN (icode) (target, op0, op1, op2);
36980 if (pat)
36981 emit_insn (pat);
36982 return target;
36985 case IX86_BUILTIN_RDRAND16_STEP:
36986 icode = CODE_FOR_rdrandhi_1;
36987 mode0 = HImode;
36988 goto rdrand_step;
36990 case IX86_BUILTIN_RDRAND32_STEP:
36991 icode = CODE_FOR_rdrandsi_1;
36992 mode0 = SImode;
36993 goto rdrand_step;
36995 case IX86_BUILTIN_RDRAND64_STEP:
36996 icode = CODE_FOR_rdranddi_1;
36997 mode0 = DImode;
36999 rdrand_step:
37000 op0 = gen_reg_rtx (mode0);
37001 emit_insn (GEN_FCN (icode) (op0));
37003 arg0 = CALL_EXPR_ARG (exp, 0);
37004 op1 = expand_normal (arg0);
37005 if (!address_operand (op1, VOIDmode))
37007 op1 = convert_memory_address (Pmode, op1);
37008 op1 = copy_addr_to_reg (op1);
37010 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37012 op1 = gen_reg_rtx (SImode);
37013 emit_move_insn (op1, CONST1_RTX (SImode));
37015 /* Emit SImode conditional move. */
37016 if (mode0 == HImode)
37018 op2 = gen_reg_rtx (SImode);
37019 emit_insn (gen_zero_extendhisi2 (op2, op0));
37021 else if (mode0 == SImode)
37022 op2 = op0;
37023 else
37024 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37026 if (target == 0
37027 || !register_operand (target, SImode))
37028 target = gen_reg_rtx (SImode);
37030 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37031 const0_rtx);
37032 emit_insn (gen_rtx_SET (target,
37033 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37034 return target;
37036 case IX86_BUILTIN_RDSEED16_STEP:
37037 icode = CODE_FOR_rdseedhi_1;
37038 mode0 = HImode;
37039 goto rdseed_step;
37041 case IX86_BUILTIN_RDSEED32_STEP:
37042 icode = CODE_FOR_rdseedsi_1;
37043 mode0 = SImode;
37044 goto rdseed_step;
37046 case IX86_BUILTIN_RDSEED64_STEP:
37047 icode = CODE_FOR_rdseeddi_1;
37048 mode0 = DImode;
37050 rdseed_step:
37051 op0 = gen_reg_rtx (mode0);
37052 emit_insn (GEN_FCN (icode) (op0));
37054 arg0 = CALL_EXPR_ARG (exp, 0);
37055 op1 = expand_normal (arg0);
37056 if (!address_operand (op1, VOIDmode))
37058 op1 = convert_memory_address (Pmode, op1);
37059 op1 = copy_addr_to_reg (op1);
37061 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37063 op2 = gen_reg_rtx (QImode);
37065 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37066 const0_rtx);
37067 emit_insn (gen_rtx_SET (op2, pat));
37069 if (target == 0
37070 || !register_operand (target, SImode))
37071 target = gen_reg_rtx (SImode);
37073 emit_insn (gen_zero_extendqisi2 (target, op2));
37074 return target;
37076 case IX86_BUILTIN_SBB32:
37077 icode = CODE_FOR_subborrowsi;
37078 mode0 = SImode;
37079 goto handlecarry;
37081 case IX86_BUILTIN_SBB64:
37082 icode = CODE_FOR_subborrowdi;
37083 mode0 = DImode;
37084 goto handlecarry;
37086 case IX86_BUILTIN_ADDCARRYX32:
37087 icode = CODE_FOR_addcarrysi;
37088 mode0 = SImode;
37089 goto handlecarry;
37091 case IX86_BUILTIN_ADDCARRYX64:
37092 icode = CODE_FOR_addcarrydi;
37093 mode0 = DImode;
37095 handlecarry:
37096 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37097 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37098 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37099 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37101 op1 = expand_normal (arg0);
37102 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37104 op2 = expand_normal (arg1);
37105 if (!register_operand (op2, mode0))
37106 op2 = copy_to_mode_reg (mode0, op2);
37108 op3 = expand_normal (arg2);
37109 if (!register_operand (op3, mode0))
37110 op3 = copy_to_mode_reg (mode0, op3);
37112 op4 = expand_normal (arg3);
37113 if (!address_operand (op4, VOIDmode))
37115 op4 = convert_memory_address (Pmode, op4);
37116 op4 = copy_addr_to_reg (op4);
37119 /* Generate CF from input operand. */
37120 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
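/* Adding -1 to the QImode carry-in sets CF exactly when that byte was
   nonzero, which is how the incoming carry is materialized for the adc/sbb
   pattern emitted next.  */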
37122 /* Generate instruction that consumes CF. */
37123 op0 = gen_reg_rtx (mode0);
37125 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37126 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
37127 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
37129 /* Return current CF value. */
37130 if (target == 0)
37131 target = gen_reg_rtx (QImode);
37133 PUT_MODE (pat, QImode);
37134 emit_insn (gen_rtx_SET (target, pat));
37136 /* Store the result. */
37137 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37139 return target;
37141 case IX86_BUILTIN_READ_FLAGS:
37142 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37144 if (optimize
37145 || target == NULL_RTX
37146 || !nonimmediate_operand (target, word_mode)
37147 || GET_MODE (target) != word_mode)
37148 target = gen_reg_rtx (word_mode);
37150 emit_insn (gen_pop (target));
37151 return target;
37153 case IX86_BUILTIN_WRITE_FLAGS:
37155 arg0 = CALL_EXPR_ARG (exp, 0);
37156 op0 = expand_normal (arg0);
37157 if (!general_no_elim_operand (op0, word_mode))
37158 op0 = copy_to_mode_reg (word_mode, op0);
37160 emit_insn (gen_push (op0));
37161 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37162 return 0;
37164 case IX86_BUILTIN_KORTESTC16:
37165 icode = CODE_FOR_kortestchi;
37166 mode0 = HImode;
37167 mode1 = CCCmode;
37168 goto kortest;
37170 case IX86_BUILTIN_KORTESTZ16:
37171 icode = CODE_FOR_kortestzhi;
37172 mode0 = HImode;
37173 mode1 = CCZmode;
37175 kortest:
37176 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37177 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37178 op0 = expand_normal (arg0);
37179 op1 = expand_normal (arg1);
37181 op0 = copy_to_reg (op0);
37182 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37183 op1 = copy_to_reg (op1);
37184 op1 = lowpart_subreg (mode0, op1, GET_MODE (op1));
37186 target = gen_reg_rtx (QImode);
37187 emit_insn (gen_rtx_SET (target, const0_rtx));
37189 /* Emit kortest. */
37190 emit_insn (GEN_FCN (icode) (op0, op1));
37191 /* And use setcc to return result from flags. */
37192 ix86_expand_setcc (target, EQ,
37193 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
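/* kortest sets ZF when the OR of the two masks is zero and CF when it is
   all ones; CCZmode vs. CCCmode above selects which of the two flags the
   setcc reads.  */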
37194 return target;
37196 case IX86_BUILTIN_GATHERSIV2DF:
37197 icode = CODE_FOR_avx2_gathersiv2df;
37198 goto gather_gen;
37199 case IX86_BUILTIN_GATHERSIV4DF:
37200 icode = CODE_FOR_avx2_gathersiv4df;
37201 goto gather_gen;
37202 case IX86_BUILTIN_GATHERDIV2DF:
37203 icode = CODE_FOR_avx2_gatherdiv2df;
37204 goto gather_gen;
37205 case IX86_BUILTIN_GATHERDIV4DF:
37206 icode = CODE_FOR_avx2_gatherdiv4df;
37207 goto gather_gen;
37208 case IX86_BUILTIN_GATHERSIV4SF:
37209 icode = CODE_FOR_avx2_gathersiv4sf;
37210 goto gather_gen;
37211 case IX86_BUILTIN_GATHERSIV8SF:
37212 icode = CODE_FOR_avx2_gathersiv8sf;
37213 goto gather_gen;
37214 case IX86_BUILTIN_GATHERDIV4SF:
37215 icode = CODE_FOR_avx2_gatherdiv4sf;
37216 goto gather_gen;
37217 case IX86_BUILTIN_GATHERDIV8SF:
37218 icode = CODE_FOR_avx2_gatherdiv8sf;
37219 goto gather_gen;
37220 case IX86_BUILTIN_GATHERSIV2DI:
37221 icode = CODE_FOR_avx2_gathersiv2di;
37222 goto gather_gen;
37223 case IX86_BUILTIN_GATHERSIV4DI:
37224 icode = CODE_FOR_avx2_gathersiv4di;
37225 goto gather_gen;
37226 case IX86_BUILTIN_GATHERDIV2DI:
37227 icode = CODE_FOR_avx2_gatherdiv2di;
37228 goto gather_gen;
37229 case IX86_BUILTIN_GATHERDIV4DI:
37230 icode = CODE_FOR_avx2_gatherdiv4di;
37231 goto gather_gen;
37232 case IX86_BUILTIN_GATHERSIV4SI:
37233 icode = CODE_FOR_avx2_gathersiv4si;
37234 goto gather_gen;
37235 case IX86_BUILTIN_GATHERSIV8SI:
37236 icode = CODE_FOR_avx2_gathersiv8si;
37237 goto gather_gen;
37238 case IX86_BUILTIN_GATHERDIV4SI:
37239 icode = CODE_FOR_avx2_gatherdiv4si;
37240 goto gather_gen;
37241 case IX86_BUILTIN_GATHERDIV8SI:
37242 icode = CODE_FOR_avx2_gatherdiv8si;
37243 goto gather_gen;
37244 case IX86_BUILTIN_GATHERALTSIV4DF:
37245 icode = CODE_FOR_avx2_gathersiv4df;
37246 goto gather_gen;
37247 case IX86_BUILTIN_GATHERALTDIV8SF:
37248 icode = CODE_FOR_avx2_gatherdiv8sf;
37249 goto gather_gen;
37250 case IX86_BUILTIN_GATHERALTSIV4DI:
37251 icode = CODE_FOR_avx2_gathersiv4di;
37252 goto gather_gen;
37253 case IX86_BUILTIN_GATHERALTDIV8SI:
37254 icode = CODE_FOR_avx2_gatherdiv8si;
37255 goto gather_gen;
37256 case IX86_BUILTIN_GATHER3SIV16SF:
37257 icode = CODE_FOR_avx512f_gathersiv16sf;
37258 goto gather_gen;
37259 case IX86_BUILTIN_GATHER3SIV8DF:
37260 icode = CODE_FOR_avx512f_gathersiv8df;
37261 goto gather_gen;
37262 case IX86_BUILTIN_GATHER3DIV16SF:
37263 icode = CODE_FOR_avx512f_gatherdiv16sf;
37264 goto gather_gen;
37265 case IX86_BUILTIN_GATHER3DIV8DF:
37266 icode = CODE_FOR_avx512f_gatherdiv8df;
37267 goto gather_gen;
37268 case IX86_BUILTIN_GATHER3SIV16SI:
37269 icode = CODE_FOR_avx512f_gathersiv16si;
37270 goto gather_gen;
37271 case IX86_BUILTIN_GATHER3SIV8DI:
37272 icode = CODE_FOR_avx512f_gathersiv8di;
37273 goto gather_gen;
37274 case IX86_BUILTIN_GATHER3DIV16SI:
37275 icode = CODE_FOR_avx512f_gatherdiv16si;
37276 goto gather_gen;
37277 case IX86_BUILTIN_GATHER3DIV8DI:
37278 icode = CODE_FOR_avx512f_gatherdiv8di;
37279 goto gather_gen;
37280 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37281 icode = CODE_FOR_avx512f_gathersiv8df;
37282 goto gather_gen;
37283 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37284 icode = CODE_FOR_avx512f_gatherdiv16sf;
37285 goto gather_gen;
37286 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37287 icode = CODE_FOR_avx512f_gathersiv8di;
37288 goto gather_gen;
37289 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37290 icode = CODE_FOR_avx512f_gatherdiv16si;
37291 goto gather_gen;
37292 case IX86_BUILTIN_GATHER3SIV2DF:
37293 icode = CODE_FOR_avx512vl_gathersiv2df;
37294 goto gather_gen;
37295 case IX86_BUILTIN_GATHER3SIV4DF:
37296 icode = CODE_FOR_avx512vl_gathersiv4df;
37297 goto gather_gen;
37298 case IX86_BUILTIN_GATHER3DIV2DF:
37299 icode = CODE_FOR_avx512vl_gatherdiv2df;
37300 goto gather_gen;
37301 case IX86_BUILTIN_GATHER3DIV4DF:
37302 icode = CODE_FOR_avx512vl_gatherdiv4df;
37303 goto gather_gen;
37304 case IX86_BUILTIN_GATHER3SIV4SF:
37305 icode = CODE_FOR_avx512vl_gathersiv4sf;
37306 goto gather_gen;
37307 case IX86_BUILTIN_GATHER3SIV8SF:
37308 icode = CODE_FOR_avx512vl_gathersiv8sf;
37309 goto gather_gen;
37310 case IX86_BUILTIN_GATHER3DIV4SF:
37311 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37312 goto gather_gen;
37313 case IX86_BUILTIN_GATHER3DIV8SF:
37314 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37315 goto gather_gen;
37316 case IX86_BUILTIN_GATHER3SIV2DI:
37317 icode = CODE_FOR_avx512vl_gathersiv2di;
37318 goto gather_gen;
37319 case IX86_BUILTIN_GATHER3SIV4DI:
37320 icode = CODE_FOR_avx512vl_gathersiv4di;
37321 goto gather_gen;
37322 case IX86_BUILTIN_GATHER3DIV2DI:
37323 icode = CODE_FOR_avx512vl_gatherdiv2di;
37324 goto gather_gen;
37325 case IX86_BUILTIN_GATHER3DIV4DI:
37326 icode = CODE_FOR_avx512vl_gatherdiv4di;
37327 goto gather_gen;
37328 case IX86_BUILTIN_GATHER3SIV4SI:
37329 icode = CODE_FOR_avx512vl_gathersiv4si;
37330 goto gather_gen;
37331 case IX86_BUILTIN_GATHER3SIV8SI:
37332 icode = CODE_FOR_avx512vl_gathersiv8si;
37333 goto gather_gen;
37334 case IX86_BUILTIN_GATHER3DIV4SI:
37335 icode = CODE_FOR_avx512vl_gatherdiv4si;
37336 goto gather_gen;
37337 case IX86_BUILTIN_GATHER3DIV8SI:
37338 icode = CODE_FOR_avx512vl_gatherdiv8si;
37339 goto gather_gen;
37340 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37341 icode = CODE_FOR_avx512vl_gathersiv4df;
37342 goto gather_gen;
37343 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37344 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37345 goto gather_gen;
37346 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37347 icode = CODE_FOR_avx512vl_gathersiv4di;
37348 goto gather_gen;
37349 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37350 icode = CODE_FOR_avx512vl_gatherdiv8si;
37351 goto gather_gen;
37352 case IX86_BUILTIN_SCATTERSIV16SF:
37353 icode = CODE_FOR_avx512f_scattersiv16sf;
37354 goto scatter_gen;
37355 case IX86_BUILTIN_SCATTERSIV8DF:
37356 icode = CODE_FOR_avx512f_scattersiv8df;
37357 goto scatter_gen;
37358 case IX86_BUILTIN_SCATTERDIV16SF:
37359 icode = CODE_FOR_avx512f_scatterdiv16sf;
37360 goto scatter_gen;
37361 case IX86_BUILTIN_SCATTERDIV8DF:
37362 icode = CODE_FOR_avx512f_scatterdiv8df;
37363 goto scatter_gen;
37364 case IX86_BUILTIN_SCATTERSIV16SI:
37365 icode = CODE_FOR_avx512f_scattersiv16si;
37366 goto scatter_gen;
37367 case IX86_BUILTIN_SCATTERSIV8DI:
37368 icode = CODE_FOR_avx512f_scattersiv8di;
37369 goto scatter_gen;
37370 case IX86_BUILTIN_SCATTERDIV16SI:
37371 icode = CODE_FOR_avx512f_scatterdiv16si;
37372 goto scatter_gen;
37373 case IX86_BUILTIN_SCATTERDIV8DI:
37374 icode = CODE_FOR_avx512f_scatterdiv8di;
37375 goto scatter_gen;
37376 case IX86_BUILTIN_SCATTERSIV8SF:
37377 icode = CODE_FOR_avx512vl_scattersiv8sf;
37378 goto scatter_gen;
37379 case IX86_BUILTIN_SCATTERSIV4SF:
37380 icode = CODE_FOR_avx512vl_scattersiv4sf;
37381 goto scatter_gen;
37382 case IX86_BUILTIN_SCATTERSIV4DF:
37383 icode = CODE_FOR_avx512vl_scattersiv4df;
37384 goto scatter_gen;
37385 case IX86_BUILTIN_SCATTERSIV2DF:
37386 icode = CODE_FOR_avx512vl_scattersiv2df;
37387 goto scatter_gen;
37388 case IX86_BUILTIN_SCATTERDIV8SF:
37389 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37390 goto scatter_gen;
37391 case IX86_BUILTIN_SCATTERDIV4SF:
37392 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37393 goto scatter_gen;
37394 case IX86_BUILTIN_SCATTERDIV4DF:
37395 icode = CODE_FOR_avx512vl_scatterdiv4df;
37396 goto scatter_gen;
37397 case IX86_BUILTIN_SCATTERDIV2DF:
37398 icode = CODE_FOR_avx512vl_scatterdiv2df;
37399 goto scatter_gen;
37400 case IX86_BUILTIN_SCATTERSIV8SI:
37401 icode = CODE_FOR_avx512vl_scattersiv8si;
37402 goto scatter_gen;
37403 case IX86_BUILTIN_SCATTERSIV4SI:
37404 icode = CODE_FOR_avx512vl_scattersiv4si;
37405 goto scatter_gen;
37406 case IX86_BUILTIN_SCATTERSIV4DI:
37407 icode = CODE_FOR_avx512vl_scattersiv4di;
37408 goto scatter_gen;
37409 case IX86_BUILTIN_SCATTERSIV2DI:
37410 icode = CODE_FOR_avx512vl_scattersiv2di;
37411 goto scatter_gen;
37412 case IX86_BUILTIN_SCATTERDIV8SI:
37413 icode = CODE_FOR_avx512vl_scatterdiv8si;
37414 goto scatter_gen;
37415 case IX86_BUILTIN_SCATTERDIV4SI:
37416 icode = CODE_FOR_avx512vl_scatterdiv4si;
37417 goto scatter_gen;
37418 case IX86_BUILTIN_SCATTERDIV4DI:
37419 icode = CODE_FOR_avx512vl_scatterdiv4di;
37420 goto scatter_gen;
37421 case IX86_BUILTIN_SCATTERDIV2DI:
37422 icode = CODE_FOR_avx512vl_scatterdiv2di;
37423 goto scatter_gen;
37424 case IX86_BUILTIN_GATHERPFDPD:
37425 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37426 goto vec_prefetch_gen;
37427 case IX86_BUILTIN_SCATTERALTSIV8DF:
37428 icode = CODE_FOR_avx512f_scattersiv8df;
37429 goto scatter_gen;
37430 case IX86_BUILTIN_SCATTERALTDIV16SF:
37431 icode = CODE_FOR_avx512f_scatterdiv16sf;
37432 goto scatter_gen;
37433 case IX86_BUILTIN_SCATTERALTSIV8DI:
37434 icode = CODE_FOR_avx512f_scattersiv8di;
37435 goto scatter_gen;
37436 case IX86_BUILTIN_SCATTERALTDIV16SI:
37437 icode = CODE_FOR_avx512f_scatterdiv16si;
37438 goto scatter_gen;
37439 case IX86_BUILTIN_GATHERPFDPS:
37440 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37441 goto vec_prefetch_gen;
37442 case IX86_BUILTIN_GATHERPFQPD:
37443 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37444 goto vec_prefetch_gen;
37445 case IX86_BUILTIN_GATHERPFQPS:
37446 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37447 goto vec_prefetch_gen;
37448 case IX86_BUILTIN_SCATTERPFDPD:
37449 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37450 goto vec_prefetch_gen;
37451 case IX86_BUILTIN_SCATTERPFDPS:
37452 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37453 goto vec_prefetch_gen;
37454 case IX86_BUILTIN_SCATTERPFQPD:
37455 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37456 goto vec_prefetch_gen;
37457 case IX86_BUILTIN_SCATTERPFQPS:
37458 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37459 goto vec_prefetch_gen;
37461 gather_gen:
37462 rtx half;
37463 rtx (*gen) (rtx, rtx);
37465 arg0 = CALL_EXPR_ARG (exp, 0);
37466 arg1 = CALL_EXPR_ARG (exp, 1);
37467 arg2 = CALL_EXPR_ARG (exp, 2);
37468 arg3 = CALL_EXPR_ARG (exp, 3);
37469 arg4 = CALL_EXPR_ARG (exp, 4);
37470 op0 = expand_normal (arg0);
37471 op1 = expand_normal (arg1);
37472 op2 = expand_normal (arg2);
37473 op3 = expand_normal (arg3);
37474 op4 = expand_normal (arg4);
37475 /* Note the arg order is different from the operand order. */
37476 mode0 = insn_data[icode].operand[1].mode;
37477 mode2 = insn_data[icode].operand[3].mode;
37478 mode3 = insn_data[icode].operand[4].mode;
37479 mode4 = insn_data[icode].operand[5].mode;
37481 if (target == NULL_RTX
37482 || GET_MODE (target) != insn_data[icode].operand[0].mode
37483 || !insn_data[icode].operand[0].predicate (target,
37484 GET_MODE (target)))
37485 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37486 else
37487 subtarget = target;
37489 switch (fcode)
37491 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37492 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37493 half = gen_reg_rtx (V8SImode);
37494 if (!nonimmediate_operand (op2, V16SImode))
37495 op2 = copy_to_mode_reg (V16SImode, op2);
37496 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37497 op2 = half;
37498 break;
37499 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37500 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37501 case IX86_BUILTIN_GATHERALTSIV4DF:
37502 case IX86_BUILTIN_GATHERALTSIV4DI:
37503 half = gen_reg_rtx (V4SImode);
37504 if (!nonimmediate_operand (op2, V8SImode))
37505 op2 = copy_to_mode_reg (V8SImode, op2);
37506 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37507 op2 = half;
37508 break;
37509 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37510 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37511 half = gen_reg_rtx (mode0);
37512 if (mode0 == V8SFmode)
37513 gen = gen_vec_extract_lo_v16sf;
37514 else
37515 gen = gen_vec_extract_lo_v16si;
37516 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37517 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37518 emit_insn (gen (half, op0));
37519 op0 = half;
37520 if (GET_MODE (op3) != VOIDmode)
37522 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37523 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37524 emit_insn (gen (half, op3));
37525 op3 = half;
37527 break;
37528 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37529 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37530 case IX86_BUILTIN_GATHERALTDIV8SF:
37531 case IX86_BUILTIN_GATHERALTDIV8SI:
37532 half = gen_reg_rtx (mode0);
37533 if (mode0 == V4SFmode)
37534 gen = gen_vec_extract_lo_v8sf;
37535 else
37536 gen = gen_vec_extract_lo_v8si;
37537 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37538 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37539 emit_insn (gen (half, op0));
37540 op0 = half;
37541 if (GET_MODE (op3) != VOIDmode)
37543 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37544 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37545 emit_insn (gen (half, op3));
37546 op3 = half;
37548 break;
37549 default:
37550 break;
37553 /* Force the memory operand to use only a base register here; we
37554 don't want to do this for the memory operands of other builtin
37555 functions. */
37556 op1 = ix86_zero_extend_to_Pmode (op1);
37558 if (!insn_data[icode].operand[1].predicate (op0, mode0))
37559 op0 = copy_to_mode_reg (mode0, op0);
37560 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
37561 op1 = copy_to_mode_reg (Pmode, op1);
37562 if (!insn_data[icode].operand[3].predicate (op2, mode2))
37563 op2 = copy_to_mode_reg (mode2, op2);
37565 op3 = fixup_modeless_constant (op3, mode3);
37567 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
37569 if (!insn_data[icode].operand[4].predicate (op3, mode3))
37570 op3 = copy_to_mode_reg (mode3, op3);
37572 else
37574 op3 = copy_to_reg (op3);
37575 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
37577 if (!insn_data[icode].operand[5].predicate (op4, mode4))
37579 error ("the last argument must be scale 1, 2, 4, 8");
37580 return const0_rtx;
37583 /* Optimize. If mask is known to have all high bits set,
37584 replace op0 with pc_rtx to signal that the instruction
37585 overwrites the whole destination and doesn't use its
37586 previous contents. */
37587 if (optimize)
37589 if (TREE_CODE (arg3) == INTEGER_CST)
37591 if (integer_all_onesp (arg3))
37592 op0 = pc_rtx;
37594 else if (TREE_CODE (arg3) == VECTOR_CST)
37596 unsigned int negative = 0;
37597 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
37599 tree cst = VECTOR_CST_ELT (arg3, i);
37600 if (TREE_CODE (cst) == INTEGER_CST
37601 && tree_int_cst_sign_bit (cst))
37602 negative++;
37603 else if (TREE_CODE (cst) == REAL_CST
37604 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
37605 negative++;
37607 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
37608 op0 = pc_rtx;
37610 else if (TREE_CODE (arg3) == SSA_NAME
37611 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
37613 /* Recognize also when mask is like:
37614 __v2df src = _mm_setzero_pd ();
37615 __v2df mask = _mm_cmpeq_pd (src, src);
37617 __v8sf src = _mm256_setzero_ps ();
37618 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
37619 as that is a cheaper way to load all ones into
37620 a register than having to load a constant from
37621 memory. */
37622 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
37623 if (is_gimple_call (def_stmt))
37625 tree fndecl = gimple_call_fndecl (def_stmt);
37626 if (fndecl
37627 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
37628 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
37630 case IX86_BUILTIN_CMPPD:
37631 case IX86_BUILTIN_CMPPS:
37632 case IX86_BUILTIN_CMPPD256:
37633 case IX86_BUILTIN_CMPPS256:
37634 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
37635 break;
37636 /* FALLTHRU */
37637 case IX86_BUILTIN_CMPEQPD:
37638 case IX86_BUILTIN_CMPEQPS:
37639 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
37640 && initializer_zerop (gimple_call_arg (def_stmt,
37641 1)))
37642 op0 = pc_rtx;
37643 break;
37644 default:
37645 break;
37651 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37652 if (! pat)
37653 return const0_rtx;
37654 emit_insn (pat);
37656 switch (fcode)
37658 case IX86_BUILTIN_GATHER3DIV16SF:
37659 if (target == NULL_RTX)
37660 target = gen_reg_rtx (V8SFmode);
37661 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37662 break;
37663 case IX86_BUILTIN_GATHER3DIV16SI:
37664 if (target == NULL_RTX)
37665 target = gen_reg_rtx (V8SImode);
37666 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37667 break;
37668 case IX86_BUILTIN_GATHER3DIV8SF:
37669 case IX86_BUILTIN_GATHERDIV8SF:
37670 if (target == NULL_RTX)
37671 target = gen_reg_rtx (V4SFmode);
37672 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37673 break;
37674 case IX86_BUILTIN_GATHER3DIV8SI:
37675 case IX86_BUILTIN_GATHERDIV8SI:
37676 if (target == NULL_RTX)
37677 target = gen_reg_rtx (V4SImode);
37678 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37679 break;
37680 default:
37681 target = subtarget;
37682 break;
37684 return target;
37686 scatter_gen:
37687 arg0 = CALL_EXPR_ARG (exp, 0);
37688 arg1 = CALL_EXPR_ARG (exp, 1);
37689 arg2 = CALL_EXPR_ARG (exp, 2);
37690 arg3 = CALL_EXPR_ARG (exp, 3);
37691 arg4 = CALL_EXPR_ARG (exp, 4);
37692 op0 = expand_normal (arg0);
37693 op1 = expand_normal (arg1);
37694 op2 = expand_normal (arg2);
37695 op3 = expand_normal (arg3);
37696 op4 = expand_normal (arg4);
37697 mode1 = insn_data[icode].operand[1].mode;
37698 mode2 = insn_data[icode].operand[2].mode;
37699 mode3 = insn_data[icode].operand[3].mode;
37700 mode4 = insn_data[icode].operand[4].mode;
37702 /* A scatter instruction stores operand op3 to memory with
37703 indices from op2 and scale from op4 under writemask op1.
37704 If index operand op2 has more elements than source operand
37705 op3, only its low half needs to be used, and vice versa. */
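/* For example, IX86_BUILTIN_SCATTERALTSIV8DF stores eight DFmode elements
   but takes a V16SI index, so only the low V8SI half of the index is used,
   as handled in the switch below. */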
37706 switch (fcode)
37708 case IX86_BUILTIN_SCATTERALTSIV8DF:
37709 case IX86_BUILTIN_SCATTERALTSIV8DI:
37710 half = gen_reg_rtx (V8SImode);
37711 if (!nonimmediate_operand (op2, V16SImode))
37712 op2 = copy_to_mode_reg (V16SImode, op2);
37713 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37714 op2 = half;
37715 break;
37716 case IX86_BUILTIN_SCATTERALTDIV16SF:
37717 case IX86_BUILTIN_SCATTERALTDIV16SI:
37718 half = gen_reg_rtx (mode3);
37719 if (mode3 == V8SFmode)
37720 gen = gen_vec_extract_lo_v16sf;
37721 else
37722 gen = gen_vec_extract_lo_v16si;
37723 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37724 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37725 emit_insn (gen (half, op3));
37726 op3 = half;
37727 break;
37728 default:
37729 break;
37732 /* Force the memory operand to use only a base register here; we
37733 don't want to do this for the memory operands of other builtin
37734 functions. */
37735 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37737 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37738 op0 = copy_to_mode_reg (Pmode, op0);
37740 op1 = fixup_modeless_constant (op1, mode1);
37742 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37744 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37745 op1 = copy_to_mode_reg (mode1, op1);
37747 else
37749 op1 = copy_to_reg (op1);
37750 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37753 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37754 op2 = copy_to_mode_reg (mode2, op2);
37756 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37757 op3 = copy_to_mode_reg (mode3, op3);
37759 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37761 error ("the last argument must be scale 1, 2, 4, 8");
37762 return const0_rtx;
37765 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37766 if (! pat)
37767 return const0_rtx;
37769 emit_insn (pat);
37770 return 0;
37772 vec_prefetch_gen:
37773 arg0 = CALL_EXPR_ARG (exp, 0);
37774 arg1 = CALL_EXPR_ARG (exp, 1);
37775 arg2 = CALL_EXPR_ARG (exp, 2);
37776 arg3 = CALL_EXPR_ARG (exp, 3);
37777 arg4 = CALL_EXPR_ARG (exp, 4);
37778 op0 = expand_normal (arg0);
37779 op1 = expand_normal (arg1);
37780 op2 = expand_normal (arg2);
37781 op3 = expand_normal (arg3);
37782 op4 = expand_normal (arg4);
37783 mode0 = insn_data[icode].operand[0].mode;
37784 mode1 = insn_data[icode].operand[1].mode;
37785 mode3 = insn_data[icode].operand[3].mode;
37786 mode4 = insn_data[icode].operand[4].mode;
37788 op0 = fixup_modeless_constant (op0, mode0);
37790 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37792 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37793 op0 = copy_to_mode_reg (mode0, op0);
37795 else
37797 op0 = copy_to_reg (op0);
37798 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37801 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37802 op1 = copy_to_mode_reg (mode1, op1);
37804 /* Force the memory operand to use only a base register here; we
37805 don't want to do this for the memory operands of other builtin
37806 functions. */
37807 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37809 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37810 op2 = copy_to_mode_reg (Pmode, op2);
37812 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37814 error ("the fourth argument must be scale 1, 2, 4, 8");
37815 return const0_rtx;
37818 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37820 error ("incorrect hint operand");
37821 return const0_rtx;
37824 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37825 if (! pat)
37826 return const0_rtx;
37828 emit_insn (pat);
37830 return 0;
37832 case IX86_BUILTIN_XABORT:
37833 icode = CODE_FOR_xabort;
37834 arg0 = CALL_EXPR_ARG (exp, 0);
37835 op0 = expand_normal (arg0);
37836 mode0 = insn_data[icode].operand[0].mode;
37837 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37839 error ("the xabort's argument must be an 8-bit immediate");
37840 return const0_rtx;
37842 emit_insn (gen_xabort (op0));
37843 return 0;
37845 default:
37846 break;
37849 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37850 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37852 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37853 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37854 target);
37857 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37858 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37860 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37861 switch (fcode)
37863 case IX86_BUILTIN_FABSQ:
37864 case IX86_BUILTIN_COPYSIGNQ:
37865 if (!TARGET_SSE)
37866 /* Emit a normal call if SSE isn't available. */
37867 return expand_call (exp, target, ignore);
37868 /* FALLTHRU */
37869 default:
37870 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37874 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37875 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37877 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37878 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37881 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37882 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37884 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37885 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37888 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37889 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37891 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37892 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37895 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37896 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37898 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37899 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37902 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37903 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37905 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37906 const struct builtin_description *d = bdesc_multi_arg + i;
37907 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37908 (enum ix86_builtin_func_type)
37909 d->flag, d->comparison);
37912 gcc_unreachable ();
37915 /* This returns the target-specific builtin with code CODE if
37916 current_function_decl has visibility on this builtin, which is checked
37917 using isa flags. Returns NULL_TREE otherwise. */
37919 static tree ix86_get_builtin (enum ix86_builtins code)
37921 struct cl_target_option *opts;
37922 tree target_tree = NULL_TREE;
37924 /* Determine the isa flags of current_function_decl. */
37926 if (current_function_decl)
37927 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37929 if (target_tree == NULL)
37930 target_tree = target_option_default_node;
37932 opts = TREE_TARGET_OPTION (target_tree);
37934 if (ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37935 return ix86_builtin_decl (code, true);
37936 else
37937 return NULL_TREE;
37940 /* Return the function decl of the target-specific builtin
37941 corresponding to the MPX builtin passed in FCODE. */
37942 static tree
37943 ix86_builtin_mpx_function (unsigned fcode)
37945 switch (fcode)
37947 case BUILT_IN_CHKP_BNDMK:
37948 return ix86_builtins[IX86_BUILTIN_BNDMK];
37950 case BUILT_IN_CHKP_BNDSTX:
37951 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37953 case BUILT_IN_CHKP_BNDLDX:
37954 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37956 case BUILT_IN_CHKP_BNDCL:
37957 return ix86_builtins[IX86_BUILTIN_BNDCL];
37959 case BUILT_IN_CHKP_BNDCU:
37960 return ix86_builtins[IX86_BUILTIN_BNDCU];
37962 case BUILT_IN_CHKP_BNDRET:
37963 return ix86_builtins[IX86_BUILTIN_BNDRET];
37965 case BUILT_IN_CHKP_INTERSECT:
37966 return ix86_builtins[IX86_BUILTIN_BNDINT];
37968 case BUILT_IN_CHKP_NARROW:
37969 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37971 case BUILT_IN_CHKP_SIZEOF:
37972 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37974 case BUILT_IN_CHKP_EXTRACT_LOWER:
37975 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37977 case BUILT_IN_CHKP_EXTRACT_UPPER:
37978 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37980 default:
37981 return NULL_TREE;
37984 gcc_unreachable ();
37987 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37989 Return an address to be used to load/store bounds for a pointer
37990 passed in SLOT.
37992 SLOT_NO is an integer constant holding the number of a target
37993 dependent special slot to be used in case SLOT is not a memory.
37995 SPECIAL_BASE is a pointer to be used as the base of a fake address
37996 to access special slots in the Bounds Table. SPECIAL_BASE[-1],
37997 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37999 static rtx
38000 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38002 rtx addr = NULL;
38004 /* A NULL slot means we pass bounds for a pointer not passed to the
38005 function at all. A register slot means we pass the pointer in a
38006 register. In both cases bounds are passed via the Bounds
38007 Table. Since we do not have the actual pointer stored in memory,
38008 we have to use fake addresses to access the Bounds Table. We
38009 start with (special_base - sizeof (void*)) and decrease this
38010 address by the pointer size to get addresses for other slots. */
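/* For example, with 64-bit pointers slot 0 is addressed at
   special_base - 8 and slot 1 at special_base - 16. */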
38011 if (!slot || REG_P (slot))
38013 gcc_assert (CONST_INT_P (slot_no));
38014 addr = plus_constant (Pmode, special_base,
38015 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38017 /* If pointer is passed in a memory then its address is used to
38018 access Bounds Table. */
38019 else if (MEM_P (slot))
38021 addr = XEXP (slot, 0);
38022 if (!register_operand (addr, Pmode))
38023 addr = copy_addr_to_reg (addr);
38025 else
38026 gcc_unreachable ();
38028 return addr;
38031 /* The expand pass uses this hook to load bounds for function parameter
38032 PTR passed in SLOT in case its bounds are not passed in a register.
38034 If SLOT is a memory, then bounds are loaded as for a regular pointer
38035 loaded from memory. PTR may be NULL in case SLOT is a memory.
38036 In that case the value of PTR (if required) may be loaded from SLOT.
38038 If SLOT is NULL or a register then SLOT_NO is an integer constant
38039 holding the number of the target-dependent special slot which should
38040 be used to obtain bounds.
38042 Return the loaded bounds. */
38044 static rtx
38045 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38047 rtx reg = gen_reg_rtx (BNDmode);
38048 rtx addr;
38050 /* Get address to be used to access Bounds Table. Special slots start
38051 at the location of return address of the current function. */
38052 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38054 /* Load pointer value from a memory if we don't have it. */
38055 if (!ptr)
38057 gcc_assert (MEM_P (slot));
38058 ptr = copy_addr_to_reg (slot);
38061 if (!register_operand (ptr, Pmode))
38062 ptr = ix86_zero_extend_to_Pmode (ptr);
38064 emit_insn (BNDmode == BND64mode
38065 ? gen_bnd64_ldx (reg, addr, ptr)
38066 : gen_bnd32_ldx (reg, addr, ptr));
38068 return reg;
38071 /* The expand pass uses this hook to store BOUNDS for call argument PTR
38072 passed in SLOT in case BOUNDS are not passed in a register.
38074 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
38075 stored in memory. PTR may be NULL in case SLOT is a memory.
38076 In that case the value of PTR (if required) may be loaded from SLOT.
38078 If SLOT is NULL or a register then SLOT_NO is an integer constant
38079 holding the number of the target-dependent special slot which should
38080 be used to store BOUNDS. */
38082 static void
38083 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38085 rtx addr;
38087 /* Get address to be used to access Bounds Table. Special slots start
38088 at the location of return address of a called function. */
38089 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38091 /* Load pointer value from a memory if we don't have it. */
38092 if (!ptr)
38094 gcc_assert (MEM_P (slot));
38095 ptr = copy_addr_to_reg (slot);
38098 if (!register_operand (ptr, Pmode))
38099 ptr = ix86_zero_extend_to_Pmode (ptr);
38101 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38102 if (!register_operand (bounds, BNDmode))
38103 bounds = copy_to_mode_reg (BNDmode, bounds);
38105 emit_insn (BNDmode == BND64mode
38106 ? gen_bnd64_stx (addr, ptr, bounds)
38107 : gen_bnd32_stx (addr, ptr, bounds));
38110 /* Load and return bounds returned by function in SLOT. */
38112 static rtx
38113 ix86_load_returned_bounds (rtx slot)
38115 rtx res;
38117 gcc_assert (REG_P (slot));
38118 res = gen_reg_rtx (BNDmode);
38119 emit_move_insn (res, slot);
38121 return res;
38124 /* Store BOUNDS returned by function into SLOT. */
38126 static void
38127 ix86_store_returned_bounds (rtx slot, rtx bounds)
38129 gcc_assert (REG_P (slot));
38130 emit_move_insn (slot, bounds);
38133 /* Returns a function decl for a vectorized version of the combined function
38134 with combined_fn code FN, result vector type TYPE_OUT and argument vector
38135 type TYPE_IN, or NULL_TREE if it is not available. */
38137 static tree
38138 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38139 tree type_in)
38141 machine_mode in_mode, out_mode;
38142 int in_n, out_n;
38144 if (TREE_CODE (type_out) != VECTOR_TYPE
38145 || TREE_CODE (type_in) != VECTOR_TYPE)
38146 return NULL_TREE;
38148 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38149 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38150 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38151 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38153 switch (fn)
38155 CASE_CFN_EXP2:
38156 if (out_mode == SFmode && in_mode == SFmode)
38158 if (out_n == 16 && in_n == 16)
38159 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38161 break;
38163 CASE_CFN_IFLOOR:
38164 CASE_CFN_LFLOOR:
38165 CASE_CFN_LLFLOOR:
38166 /* The round insn does not trap on denormals. */
38167 if (flag_trapping_math || !TARGET_ROUND)
38168 break;
38170 if (out_mode == SImode && in_mode == DFmode)
38172 if (out_n == 4 && in_n == 2)
38173 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38174 else if (out_n == 8 && in_n == 4)
38175 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38176 else if (out_n == 16 && in_n == 8)
38177 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38179 if (out_mode == SImode && in_mode == SFmode)
38181 if (out_n == 4 && in_n == 4)
38182 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38183 else if (out_n == 8 && in_n == 8)
38184 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38185 else if (out_n == 16 && in_n == 16)
38186 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38188 break;
38190 CASE_CFN_ICEIL:
38191 CASE_CFN_LCEIL:
38192 CASE_CFN_LLCEIL:
38193 /* The round insn does not trap on denormals. */
38194 if (flag_trapping_math || !TARGET_ROUND)
38195 break;
38197 if (out_mode == SImode && in_mode == DFmode)
38199 if (out_n == 4 && in_n == 2)
38200 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38201 else if (out_n == 8 && in_n == 4)
38202 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38203 else if (out_n == 16 && in_n == 8)
38204 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38206 if (out_mode == SImode && in_mode == SFmode)
38208 if (out_n == 4 && in_n == 4)
38209 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38210 else if (out_n == 8 && in_n == 8)
38211 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38212 else if (out_n == 16 && in_n == 16)
38213 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38215 break;
38217 CASE_CFN_IRINT:
38218 CASE_CFN_LRINT:
38219 CASE_CFN_LLRINT:
38220 if (out_mode == SImode && in_mode == DFmode)
38222 if (out_n == 4 && in_n == 2)
38223 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38224 else if (out_n == 8 && in_n == 4)
38225 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38226 else if (out_n == 16 && in_n == 8)
38227 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38229 if (out_mode == SImode && in_mode == SFmode)
38231 if (out_n == 4 && in_n == 4)
38232 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38233 else if (out_n == 8 && in_n == 8)
38234 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38235 else if (out_n == 16 && in_n == 16)
38236 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38238 break;
38240 CASE_CFN_IROUND:
38241 CASE_CFN_LROUND:
38242 CASE_CFN_LLROUND:
38243 /* The round insn does not trap on denormals. */
38244 if (flag_trapping_math || !TARGET_ROUND)
38245 break;
38247 if (out_mode == SImode && in_mode == DFmode)
38249 if (out_n == 4 && in_n == 2)
38250 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38251 else if (out_n == 8 && in_n == 4)
38252 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38253 else if (out_n == 16 && in_n == 8)
38254 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38256 if (out_mode == SImode && in_mode == SFmode)
38258 if (out_n == 4 && in_n == 4)
38259 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38260 else if (out_n == 8 && in_n == 8)
38261 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38262 else if (out_n == 16 && in_n == 16)
38263 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38265 break;
38267 CASE_CFN_FLOOR:
38268 /* The round insn does not trap on denormals. */
38269 if (flag_trapping_math || !TARGET_ROUND)
38270 break;
38272 if (out_mode == DFmode && in_mode == DFmode)
38274 if (out_n == 2 && in_n == 2)
38275 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38276 else if (out_n == 4 && in_n == 4)
38277 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38278 else if (out_n == 8 && in_n == 8)
38279 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38281 if (out_mode == SFmode && in_mode == SFmode)
38283 if (out_n == 4 && in_n == 4)
38284 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38285 else if (out_n == 8 && in_n == 8)
38286 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38287 else if (out_n == 16 && in_n == 16)
38288 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38290 break;
38292 CASE_CFN_CEIL:
38293 /* The round insn does not trap on denormals. */
38294 if (flag_trapping_math || !TARGET_ROUND)
38295 break;
38297 if (out_mode == DFmode && in_mode == DFmode)
38299 if (out_n == 2 && in_n == 2)
38300 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38301 else if (out_n == 4 && in_n == 4)
38302 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38303 else if (out_n == 8 && in_n == 8)
38304 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38306 if (out_mode == SFmode && in_mode == SFmode)
38308 if (out_n == 4 && in_n == 4)
38309 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38310 else if (out_n == 8 && in_n == 8)
38311 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38312 else if (out_n == 16 && in_n == 16)
38313 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38315 break;
38317 CASE_CFN_TRUNC:
38318 /* The round insn does not trap on denormals. */
38319 if (flag_trapping_math || !TARGET_ROUND)
38320 break;
38322 if (out_mode == DFmode && in_mode == DFmode)
38324 if (out_n == 2 && in_n == 2)
38325 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
38326 else if (out_n == 4 && in_n == 4)
38327 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
38328 else if (out_n == 8 && in_n == 8)
38329 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
38331 if (out_mode == SFmode && in_mode == SFmode)
38333 if (out_n == 4 && in_n == 4)
38334 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
38335 else if (out_n == 8 && in_n == 8)
38336 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
38337 else if (out_n == 16 && in_n == 16)
38338 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38340 break;
38342 CASE_CFN_RINT:
38343 /* The round insn does not trap on denormals. */
38344 if (flag_trapping_math || !TARGET_ROUND)
38345 break;
38347 if (out_mode == DFmode && in_mode == DFmode)
38349 if (out_n == 2 && in_n == 2)
38350 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38351 else if (out_n == 4 && in_n == 4)
38352 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38354 if (out_mode == SFmode && in_mode == SFmode)
38356 if (out_n == 4 && in_n == 4)
38357 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38358 else if (out_n == 8 && in_n == 8)
38359 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38361 break;
38363 CASE_CFN_FMA:
38364 if (out_mode == DFmode && in_mode == DFmode)
38366 if (out_n == 2 && in_n == 2)
38367 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38368 if (out_n == 4 && in_n == 4)
38369 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38371 if (out_mode == SFmode && in_mode == SFmode)
38373 if (out_n == 4 && in_n == 4)
38374 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38375 if (out_n == 8 && in_n == 8)
38376 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38378 break;
38380 default:
38381 break;
38384 /* Dispatch to a handler for a vectorization library. */
38385 if (ix86_veclib_handler)
38386 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38388 return NULL_TREE;
38391 /* Handler for an SVML-style interface to
38392 a library with vectorized intrinsics. */
38394 static tree
38395 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38397 char name[20];
38398 tree fntype, new_fndecl, args;
38399 unsigned arity;
38400 const char *bname;
38401 machine_mode el_mode, in_mode;
38402 int n, in_n;
38404 /* The SVML is suitable for unsafe math only. */
38405 if (!flag_unsafe_math_optimizations)
38406 return NULL_TREE;
38408 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38409 n = TYPE_VECTOR_SUBPARTS (type_out);
38410 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38411 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38412 if (el_mode != in_mode
38413 || n != in_n)
38414 return NULL_TREE;
38416 switch (fn)
38418 CASE_CFN_EXP:
38419 CASE_CFN_LOG:
38420 CASE_CFN_LOG10:
38421 CASE_CFN_POW:
38422 CASE_CFN_TANH:
38423 CASE_CFN_TAN:
38424 CASE_CFN_ATAN:
38425 CASE_CFN_ATAN2:
38426 CASE_CFN_ATANH:
38427 CASE_CFN_CBRT:
38428 CASE_CFN_SINH:
38429 CASE_CFN_SIN:
38430 CASE_CFN_ASINH:
38431 CASE_CFN_ASIN:
38432 CASE_CFN_COSH:
38433 CASE_CFN_COS:
38434 CASE_CFN_ACOSH:
38435 CASE_CFN_ACOS:
38436 if ((el_mode != DFmode || n != 2)
38437 && (el_mode != SFmode || n != 4))
38438 return NULL_TREE;
38439 break;
38441 default:
38442 return NULL_TREE;
38445 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38446 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38448 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38449 strcpy (name, "vmlsLn4");
38450 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38451 strcpy (name, "vmldLn2");
38452 else if (n == 4)
38454 sprintf (name, "vmls%s", bname+10);
38455 name[strlen (name)-1] = '4';
38457 else
38458 sprintf (name, "vmld%s2", bname+10);
38460 /* Convert to uppercase. */
38461 name[4] &= ~0x20;
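/* For example, a 4-wide SFmode sinf becomes "vmlsSin4" and a 2-wide
   DFmode sin becomes "vmldSin2". */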
38463 arity = 0;
38464 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38465 arity++;
38467 if (arity == 1)
38468 fntype = build_function_type_list (type_out, type_in, NULL);
38469 else
38470 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38472 /* Build a function declaration for the vectorized function. */
38473 new_fndecl = build_decl (BUILTINS_LOCATION,
38474 FUNCTION_DECL, get_identifier (name), fntype);
38475 TREE_PUBLIC (new_fndecl) = 1;
38476 DECL_EXTERNAL (new_fndecl) = 1;
38477 DECL_IS_NOVOPS (new_fndecl) = 1;
38478 TREE_READONLY (new_fndecl) = 1;
38480 return new_fndecl;
38483 /* Handler for an ACML-style interface to
38484 a library with vectorized intrinsics. */
38486 static tree
38487 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38489 char name[20] = "__vr.._";
38490 tree fntype, new_fndecl, args;
38491 unsigned arity;
38492 const char *bname;
38493 machine_mode el_mode, in_mode;
38494 int n, in_n;
38496 /* The ACML is 64-bit only and suitable for unsafe math only, as
38497 it does not correctly support parts of IEEE with the required
38498 precision such as denormals. */
38499 if (!TARGET_64BIT
38500 || !flag_unsafe_math_optimizations)
38501 return NULL_TREE;
38503 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38504 n = TYPE_VECTOR_SUBPARTS (type_out);
38505 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38506 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38507 if (el_mode != in_mode
38508 || n != in_n)
38509 return NULL_TREE;
38511 switch (fn)
38513 CASE_CFN_SIN:
38514 CASE_CFN_COS:
38515 CASE_CFN_EXP:
38516 CASE_CFN_LOG:
38517 CASE_CFN_LOG2:
38518 CASE_CFN_LOG10:
38519 if (el_mode == DFmode && n == 2)
38521 name[4] = 'd';
38522 name[5] = '2';
38524 else if (el_mode == SFmode && n == 4)
38526 name[4] = 's';
38527 name[5] = '4';
38529 else
38530 return NULL_TREE;
38531 break;
38533 default:
38534 return NULL_TREE;
38537 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38538 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38539 sprintf (name + 7, "%s", bname+10);
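/* For example, a 2-wide DFmode sin becomes "__vrd2_sin" and a 4-wide
   SFmode sinf becomes "__vrs4_sinf". */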
38541 arity = 0;
38542 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38543 arity++;
38545 if (arity == 1)
38546 fntype = build_function_type_list (type_out, type_in, NULL);
38547 else
38548 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38550 /* Build a function declaration for the vectorized function. */
38551 new_fndecl = build_decl (BUILTINS_LOCATION,
38552 FUNCTION_DECL, get_identifier (name), fntype);
38553 TREE_PUBLIC (new_fndecl) = 1;
38554 DECL_EXTERNAL (new_fndecl) = 1;
38555 DECL_IS_NOVOPS (new_fndecl) = 1;
38556 TREE_READONLY (new_fndecl) = 1;
38558 return new_fndecl;
38561 /* Returns a decl of a function that implements a gather load with
38562 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
38563 Return NULL_TREE if it is not available. */
38565 static tree
38566 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38567 const_tree index_type, int scale)
38569 bool si;
38570 enum ix86_builtins code;
38572 if (! TARGET_AVX2)
38573 return NULL_TREE;
38575 if ((TREE_CODE (index_type) != INTEGER_TYPE
38576 && !POINTER_TYPE_P (index_type))
38577 || (TYPE_MODE (index_type) != SImode
38578 && TYPE_MODE (index_type) != DImode))
38579 return NULL_TREE;
38581 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38582 return NULL_TREE;
38584 /* v*gather* insn sign extends index to pointer mode. */
38585 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38586 && TYPE_UNSIGNED (index_type))
38587 return NULL_TREE;
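/* Scale can be 1, 2, 4 or 8. */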
38589 if (scale <= 0
38590 || scale > 8
38591 || (scale & (scale - 1)) != 0)
38592 return NULL_TREE;
38594 si = TYPE_MODE (index_type) == SImode;
38595 switch (TYPE_MODE (mem_vectype))
38597 case V2DFmode:
38598 if (TARGET_AVX512VL)
38599 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38600 else
38601 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38602 break;
38603 case V4DFmode:
38604 if (TARGET_AVX512VL)
38605 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38606 else
38607 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38608 break;
38609 case V2DImode:
38610 if (TARGET_AVX512VL)
38611 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38612 else
38613 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38614 break;
38615 case V4DImode:
38616 if (TARGET_AVX512VL)
38617 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38618 else
38619 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38620 break;
38621 case V4SFmode:
38622 if (TARGET_AVX512VL)
38623 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38624 else
38625 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38626 break;
38627 case V8SFmode:
38628 if (TARGET_AVX512VL)
38629 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38630 else
38631 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38632 break;
38633 case V4SImode:
38634 if (TARGET_AVX512VL)
38635 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38636 else
38637 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38638 break;
38639 case V8SImode:
38640 if (TARGET_AVX512VL)
38641 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38642 else
38643 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38644 break;
38645 case V8DFmode:
38646 if (TARGET_AVX512F)
38647 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38648 else
38649 return NULL_TREE;
38650 break;
38651 case V8DImode:
38652 if (TARGET_AVX512F)
38653 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38654 else
38655 return NULL_TREE;
38656 break;
38657 case V16SFmode:
38658 if (TARGET_AVX512F)
38659 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38660 else
38661 return NULL_TREE;
38662 break;
38663 case V16SImode:
38664 if (TARGET_AVX512F)
38665 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38666 else
38667 return NULL_TREE;
38668 break;
38669 default:
38670 return NULL_TREE;
38673 return ix86_get_builtin (code);
38676 /* Returns a decl of a function that implements a scatter store with
38677 register type VECTYPE, index type INDEX_TYPE and scale SCALE.
38678 Return NULL_TREE if it is not available. */
38680 static tree
38681 ix86_vectorize_builtin_scatter (const_tree vectype,
38682 const_tree index_type, int scale)
38684 bool si;
38685 enum ix86_builtins code;
38687 if (!TARGET_AVX512F)
38688 return NULL_TREE;
38690 if ((TREE_CODE (index_type) != INTEGER_TYPE
38691 && !POINTER_TYPE_P (index_type))
38692 || (TYPE_MODE (index_type) != SImode
38693 && TYPE_MODE (index_type) != DImode))
38694 return NULL_TREE;
38696 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38697 return NULL_TREE;
38699 /* v*scatter* insn sign extends index to pointer mode. */
38700 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38701 && TYPE_UNSIGNED (index_type))
38702 return NULL_TREE;
38704 /* Scale can be 1, 2, 4 or 8. */
38705 if (scale <= 0
38706 || scale > 8
38707 || (scale & (scale - 1)) != 0)
38708 return NULL_TREE;
38710 si = TYPE_MODE (index_type) == SImode;
38711 switch (TYPE_MODE (vectype))
38713 case V8DFmode:
38714 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38715 break;
38716 case V8DImode:
38717 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38718 break;
38719 case V16SFmode:
38720 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38721 break;
38722 case V16SImode:
38723 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38724 break;
38725 default:
38726 return NULL_TREE;
38729 return ix86_builtins[code];
38732 /* Return true if it is safe to use the rsqrt optabs to optimize
38733 1.0/sqrt. */
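/* These conditions all hold, for instance, under -ffast-math when SSE
   math is enabled (e.g. with -mfpmath=sse). */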
38735 static bool
38736 use_rsqrt_p ()
38738 return (TARGET_SSE_MATH
38739 && flag_finite_math_only
38740 && !flag_trapping_math
38741 && flag_unsafe_math_optimizations);
38744 /* Returns a decl of a target-specific builtin that implements the
38745 reciprocal of the function, or NULL_TREE if not available. */
38747 static tree
38748 ix86_builtin_reciprocal (tree fndecl)
38750 switch (DECL_FUNCTION_CODE (fndecl))
38752 /* Vectorized version of sqrt to rsqrt conversion. */
38753 case IX86_BUILTIN_SQRTPS_NR:
38754 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38756 case IX86_BUILTIN_SQRTPS_NR256:
38757 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38759 default:
38760 return NULL_TREE;
38764 /* Helper for avx_vpermilps256_operand et al. This is also used by
38765 the expansion functions to turn the parallel back into a mask.
38766 The return value is 0 for no match and the imm8+1 for a match. */
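/* For example, a V4SF parallel selecting elements {1,0,3,2} encodes as
   imm8 0xb1, so the function returns 0xb2. */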
38769 avx_vpermilp_parallel (rtx par, machine_mode mode)
38771 unsigned i, nelt = GET_MODE_NUNITS (mode);
38772 unsigned mask = 0;
38773 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38775 if (XVECLEN (par, 0) != (int) nelt)
38776 return 0;
38778 /* Validate that all of the elements are constants, and not totally
38779 out of range. Copy the data into an integral array to make the
38780 subsequent checks easier. */
38781 for (i = 0; i < nelt; ++i)
38783 rtx er = XVECEXP (par, 0, i);
38784 unsigned HOST_WIDE_INT ei;
38786 if (!CONST_INT_P (er))
38787 return 0;
38788 ei = INTVAL (er);
38789 if (ei >= nelt)
38790 return 0;
38791 ipar[i] = ei;
38794 switch (mode)
38796 case V8DFmode:
38797 /* In the 512-bit DFmode case, we can only move elements within
38798 a 128-bit lane. First fill the second part of the mask,
38799 then fallthru. */
38800 for (i = 4; i < 6; ++i)
38802 if (ipar[i] < 4 || ipar[i] >= 6)
38803 return 0;
38804 mask |= (ipar[i] - 4) << i;
38806 for (i = 6; i < 8; ++i)
38808 if (ipar[i] < 6)
38809 return 0;
38810 mask |= (ipar[i] - 6) << i;
38812 /* FALLTHRU */
38814 case V4DFmode:
38815 /* In the 256-bit DFmode case, we can only move elements within
38816 a 128-bit lane. */
38817 for (i = 0; i < 2; ++i)
38819 if (ipar[i] >= 2)
38820 return 0;
38821 mask |= ipar[i] << i;
38823 for (i = 2; i < 4; ++i)
38825 if (ipar[i] < 2)
38826 return 0;
38827 mask |= (ipar[i] - 2) << i;
38829 break;
38831 case V16SFmode:
38832 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38833 must mirror the permutation in the lower 256 bits. */
38834 for (i = 0; i < 8; ++i)
38835 if (ipar[i] + 8 != ipar[i + 8])
38836 return 0;
38837 /* FALLTHRU */
38839 case V8SFmode:
38840 /* In the 256-bit SFmode case, we have full freedom of
38841 movement within the low 128-bit lane, but the high 128-bit
38842 lane must mirror the exact same pattern. */
38843 for (i = 0; i < 4; ++i)
38844 if (ipar[i] + 4 != ipar[i + 4])
38845 return 0;
38846 nelt = 4;
38847 /* FALLTHRU */
38849 case V2DFmode:
38850 case V4SFmode:
38851 /* In the 128-bit case, we've full freedom in the placement of
38852 the elements from the source operand. */
38853 for (i = 0; i < nelt; ++i)
38854 mask |= ipar[i] << (i * (nelt / 2));
38855 break;
38857 default:
38858 gcc_unreachable ();
38861 /* Make sure success has a non-zero value by adding one. */
38862 return mask + 1;
38865 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38866 the expansion functions to turn the parallel back into a mask.
38867 The return value is 0 for no match and the imm8+1 for a match. */
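/* For example, a V4DF parallel {0,1,4,5} (the low lane of each source
   operand) encodes as imm8 0x20, so the function returns 0x21. */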
38870 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38872 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38873 unsigned mask = 0;
38874 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38876 if (XVECLEN (par, 0) != (int) nelt)
38877 return 0;
38879 /* Validate that all of the elements are constants, and not totally
38880 out of range. Copy the data into an integral array to make the
38881 subsequent checks easier. */
38882 for (i = 0; i < nelt; ++i)
38884 rtx er = XVECEXP (par, 0, i);
38885 unsigned HOST_WIDE_INT ei;
38887 if (!CONST_INT_P (er))
38888 return 0;
38889 ei = INTVAL (er);
38890 if (ei >= 2 * nelt)
38891 return 0;
38892 ipar[i] = ei;
38895 /* Validate that each half of the permute selects a contiguous run of elements. */
38896 for (i = 0; i < nelt2 - 1; ++i)
38897 if (ipar[i] + 1 != ipar[i + 1])
38898 return 0;
38899 for (i = nelt2; i < nelt - 1; ++i)
38900 if (ipar[i] + 1 != ipar[i + 1])
38901 return 0;
38903 /* Reconstruct the mask. */
38904 for (i = 0; i < 2; ++i)
38906 unsigned e = ipar[i * nelt2];
38907 if (e % nelt2)
38908 return 0;
38909 e /= nelt2;
38910 mask |= e << (i * 4);
38913 /* Make sure success has a non-zero value by adding one. */
38914 return mask + 1;
38917 /* Return a register priority for hard reg REGNO. */
38918 static int
38919 ix86_register_priority (int hard_regno)
38921 /* ebp and r13 as the base always want a displacement, r12 as the
38922 base always wants an index. So discourage their usage in an
38923 address. */
38924 if (hard_regno == R12_REG || hard_regno == R13_REG)
38925 return 0;
38926 if (hard_regno == BP_REG)
38927 return 1;
38928 /* New x86-64 int registers result in bigger code size. Discourage
38929 them. */
38930 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38931 return 2;
38932 /* New x86-64 SSE registers result in bigger code size. Discourage
38933 them. */
38934 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38935 return 2;
38936 /* Usage of AX register results in smaller code. Prefer it. */
38937 if (hard_regno == AX_REG)
38938 return 4;
38939 return 3;
38942 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38944 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38945 QImode must go into class Q_REGS.
38946 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38947 movdf to do mem-to-mem moves through integer regs. */
38949 static reg_class_t
38950 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38952 machine_mode mode = GET_MODE (x);
38954 /* We're only allowed to return a subclass of CLASS. Many of the
38955 following checks fail for NO_REGS, so eliminate that early. */
38956 if (regclass == NO_REGS)
38957 return NO_REGS;
38959 /* All classes can load zeros. */
38960 if (x == CONST0_RTX (mode))
38961 return regclass;
38963 /* Force constants into memory if we are loading a (nonzero) constant into
38964 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38965 instructions to load from a constant. */
38966 if (CONSTANT_P (x)
38967 && (MAYBE_MMX_CLASS_P (regclass)
38968 || MAYBE_SSE_CLASS_P (regclass)
38969 || MAYBE_MASK_CLASS_P (regclass)))
38970 return NO_REGS;
38972 /* Floating-point constants need more complex checks. */
38973 if (CONST_DOUBLE_P (x))
38975 /* General regs can load everything. */
38976 if (INTEGER_CLASS_P (regclass))
38977 return regclass;
38979 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38980 zero above. We only want to wind up preferring 80387 registers if
38981 we plan on doing computation with them. */
38982 if (IS_STACK_MODE (mode)
38983 && standard_80387_constant_p (x) > 0)
38985 /* Limit class to FP regs. */
38986 if (FLOAT_CLASS_P (regclass))
38987 return FLOAT_REGS;
38988 else if (regclass == FP_TOP_SSE_REGS)
38989 return FP_TOP_REG;
38990 else if (regclass == FP_SECOND_SSE_REGS)
38991 return FP_SECOND_REG;
38994 return NO_REGS;
38997 /* Prefer SSE regs only, if we can use them for math. */
38998 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38999 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39001 /* Generally when we see PLUS here, it's the function invariant
39002 (plus soft-fp const_int), which can only be computed into general
39003 regs. */
39004 if (GET_CODE (x) == PLUS)
39005 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39007 /* QImode constants are easy to load, but non-constant QImode data
39008 must go into Q_REGS. */
39009 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39011 if (Q_CLASS_P (regclass))
39012 return regclass;
39013 else if (reg_class_subset_p (Q_REGS, regclass))
39014 return Q_REGS;
39015 else
39016 return NO_REGS;
39019 return regclass;
39022 /* Discourage putting floating-point values in SSE registers unless
39023 SSE math is being used, and likewise for the 387 registers. */
39024 static reg_class_t
39025 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39027 machine_mode mode = GET_MODE (x);
39029 /* Restrict the output reload class to the register bank that we are doing
39030 math on. If we would like not to return a subset of CLASS, reject this
39031 alternative: if reload cannot do this, it will still use its choice. */
39032 mode = GET_MODE (x);
39033 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39034 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39036 if (IS_STACK_MODE (mode))
39038 if (regclass == FP_TOP_SSE_REGS)
39039 return FP_TOP_REG;
39040 else if (regclass == FP_SECOND_SSE_REGS)
39041 return FP_SECOND_REG;
39042 else
39043 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39046 return regclass;
39049 static reg_class_t
39050 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39051 machine_mode mode, secondary_reload_info *sri)
39053 /* Double-word spills from general registers to non-offsettable memory
39054 references (zero-extended addresses) require special handling. */
39055 if (TARGET_64BIT
39056 && MEM_P (x)
39057 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39058 && INTEGER_CLASS_P (rclass)
39059 && !offsettable_memref_p (x))
39061 sri->icode = (in_p
39062 ? CODE_FOR_reload_noff_load
39063 : CODE_FOR_reload_noff_store);
39064 /* Add the cost of moving address to a temporary. */
39065 sri->extra_cost = 1;
39067 return NO_REGS;
39070 /* QImode spills from non-QI registers require an
39071 intermediate register on 32-bit targets. */
39072 if (mode == QImode
39073 && (MAYBE_MASK_CLASS_P (rclass)
39074 || (!TARGET_64BIT && !in_p
39075 && INTEGER_CLASS_P (rclass)
39076 && MAYBE_NON_Q_CLASS_P (rclass))))
39078 int regno;
39080 if (REG_P (x))
39081 regno = REGNO (x);
39082 else
39083 regno = -1;
39085 if (regno >= FIRST_PSEUDO_REGISTER || SUBREG_P (x))
39086 regno = true_regnum (x);
39088 /* Return Q_REGS if the operand is in memory. */
39089 if (regno == -1)
39090 return Q_REGS;
39093 /* This condition handles the corner case where an expression involving
39094 pointers gets vectorized. We're trying to use the address of a
39095 stack slot as a vector initializer.
39097 (set (reg:V2DI 74 [ vect_cst_.2 ])
39098 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39100 Eventually frame gets turned into sp+offset like this:
39102 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39103 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39104 (const_int 392 [0x188]))))
39106 That later gets turned into:
39108 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39109 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39110 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39112 We'll have the following reload recorded:
39114 Reload 0: reload_in (DI) =
39115 (plus:DI (reg/f:DI 7 sp)
39116 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39117 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39118 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39119 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39120 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39121 reload_reg_rtx: (reg:V2DI 22 xmm1)
39123 That isn't going to work, since SSE instructions can't handle scalar
39124 additions. Returning GENERAL_REGS forces the addition into an integer
39125 register, and reload can handle subsequent reloads without problems. */
39127 if (in_p && GET_CODE (x) == PLUS
39128 && SSE_CLASS_P (rclass)
39129 && SCALAR_INT_MODE_P (mode))
39130 return GENERAL_REGS;
39132 return NO_REGS;
39135 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39137 static bool
39138 ix86_class_likely_spilled_p (reg_class_t rclass)
39140 switch (rclass)
39142 case AREG:
39143 case DREG:
39144 case CREG:
39145 case BREG:
39146 case AD_REGS:
39147 case SIREG:
39148 case DIREG:
39149 case SSE_FIRST_REG:
39150 case FP_TOP_REG:
39151 case FP_SECOND_REG:
39152 case BND_REGS:
39153 return true;
39155 default:
39156 break;
39159 return false;
39162 /* If we are copying between general and FP registers, we need a memory
39163 location. The same is true for SSE and MMX registers.
39165 To optimize register_move_cost performance, allow inline variant.
39167 The macro can't work reliably when one of the CLASSES is a class containing
39168 registers from multiple units (SSE, MMX, integer). We avoid this by never
39169 combining those units in a single alternative in the machine description.
39170 Ensure that this constraint holds to avoid unexpected surprises.
39172 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
39173 enforce these sanity checks. */
39175 static inline bool
39176 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39177 machine_mode mode, int strict)
39179 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39180 return false;
39181 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39182 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39183 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39184 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39185 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39186 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
39188 gcc_assert (!strict || lra_in_progress);
39189 return true;
39192 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39193 return true;
39195 /* Between mask and general, we have moves no larger than word size. */
39196 if ((MAYBE_MASK_CLASS_P (class1) != MAYBE_MASK_CLASS_P (class2))
39197 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39198 return true;
39200 /* ??? This is a lie. We do have moves between mmx/general, and for
39201 mmx/sse2. But by saying we need secondary memory we discourage the
39202 register allocator from using the mmx registers unless needed. */
39203 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39204 return true;
39206 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39208 /* SSE1 doesn't have any direct moves from other classes. */
39209 if (!TARGET_SSE2)
39210 return true;
39212 /* If the target says that inter-unit moves are more expensive
39213 than moving through memory, then don't generate them. */
39214 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39215 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39216 return true;
39218 /* Between SSE and general, we have moves no larger than word size. */
39219 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39220 return true;
39223 return false;
39226 bool
39227 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39228 machine_mode mode, int strict)
39230 return inline_secondary_memory_needed (class1, class2, mode, strict);
39233 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39235 On the 80386, this is the size of MODE in words,
39236 except in the FP regs, where a single reg is always enough. */
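/* For example, DImode needs two GENERAL_REGS on a 32-bit target, while
   XFmode fits in a single FLOAT_REG. */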
39238 static unsigned char
39239 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39241 if (MAYBE_INTEGER_CLASS_P (rclass))
39243 if (mode == XFmode)
39244 return (TARGET_64BIT ? 2 : 3);
39245 else if (mode == XCmode)
39246 return (TARGET_64BIT ? 4 : 6);
39247 else
39248 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39250 else
39252 if (COMPLEX_MODE_P (mode))
39253 return 2;
39254 else
39255 return 1;
39259 /* Return true if the registers in CLASS cannot represent the change from
39260 modes FROM to TO. */
39262 bool
39263 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
39264 enum reg_class regclass)
39266 if (from == to)
39267 return false;
39269 /* x87 registers can't do subreg at all, as all values are reformatted
39270 to extended precision. */
39271 if (MAYBE_FLOAT_CLASS_P (regclass))
39272 return true;
39274 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39276 /* Vector registers do not support QI or HImode loads. If we don't
39277 disallow a change to these modes, reload will assume it's ok to
39278 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39279 the vec_dupv4hi pattern. */
39280 if (GET_MODE_SIZE (from) < 4)
39281 return true;
39284 return false;
39287 /* Return the cost of moving data of mode M between a
39288 register and memory. A value of 2 is the default; this cost is
39289 relative to those in `REGISTER_MOVE_COST'.
39291 This function is used extensively by register_move_cost, which is used to
39292 build tables at startup, so keep it inline in that case.
39293 When IN is 2, return the maximum of the in and out move costs.
39295 If moving between registers and memory is more expensive than
39296 between two registers, you should define this macro to express the
39297 relative cost.
39299 Also model the increased cost of moving QImode registers in non
39300 Q_REGS classes.
39302 static inline int
39303 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39304 int in)
39306 int cost;
39307 if (FLOAT_CLASS_P (regclass))
39309 int index;
39310 switch (mode)
39312 case SFmode:
39313 index = 0;
39314 break;
39315 case DFmode:
39316 index = 1;
39317 break;
39318 case XFmode:
39319 index = 2;
39320 break;
39321 default:
39322 return 100;
39324 if (in == 2)
39325 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39326 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39328 if (SSE_CLASS_P (regclass))
39330 int index;
39331 switch (GET_MODE_SIZE (mode))
39333 case 4:
39334 index = 0;
39335 break;
39336 case 8:
39337 index = 1;
39338 break;
39339 case 16:
39340 index = 2;
39341 break;
39342 default:
39343 return 100;
39345 if (in == 2)
39346 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39347 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39349 if (MMX_CLASS_P (regclass))
39351 int index;
39352 switch (GET_MODE_SIZE (mode))
39354 case 4:
39355 index = 0;
39356 break;
39357 case 8:
39358 index = 1;
39359 break;
39360 default:
39361 return 100;
39363 if (in == 2)
39364 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39365 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39367 switch (GET_MODE_SIZE (mode))
39369 case 1:
39370 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39372 if (!in)
39373 return ix86_cost->int_store[0];
39374 if (TARGET_PARTIAL_REG_DEPENDENCY
39375 && optimize_function_for_speed_p (cfun))
39376 cost = ix86_cost->movzbl_load;
39377 else
39378 cost = ix86_cost->int_load[0];
39379 if (in == 2)
39380 return MAX (cost, ix86_cost->int_store[0]);
39381 return cost;
39383 else
39385 if (in == 2)
39386 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39387 if (in)
39388 return ix86_cost->movzbl_load;
39389 else
39390 return ix86_cost->int_store[0] + 4;
39392 break;
39393 case 2:
39394 if (in == 2)
39395 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39396 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39397 default:
39398 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39399 if (mode == TFmode)
39400 mode = XFmode;
39401 if (in == 2)
39402 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39403 else if (in)
39404 cost = ix86_cost->int_load[2];
39405 else
39406 cost = ix86_cost->int_store[2];
39407 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39411 static int
39412 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39413 bool in)
39415 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39419 /* Return the cost of moving data from a register in class CLASS1 to
39420 one in class CLASS2.
39422 It is not required that the cost always equal 2 when FROM is the same as TO;
39423 on some machines it is expensive to move between registers if they are not
39424 general registers. */
39426 static int
39427 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39428 reg_class_t class2_i)
39430 enum reg_class class1 = (enum reg_class) class1_i;
39431 enum reg_class class2 = (enum reg_class) class2_i;
39433 /* In case we require secondary memory, compute cost of the store followed
39434 by load. In order to avoid bad register allocation choices, we need
39435 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39437 if (inline_secondary_memory_needed (class1, class2, mode, 0))
39439 int cost = 1;
39441 cost += inline_memory_move_cost (mode, class1, 2);
39442 cost += inline_memory_move_cost (mode, class2, 2);
39444 /* When copying from a general purpose register we may emit multiple
39445 stores followed by a single load, causing a memory size mismatch stall.
39446 Count this as an arbitrarily high cost of 20. */
39447 if (targetm.class_max_nregs (class1, mode)
39448 > targetm.class_max_nregs (class2, mode))
39449 cost += 20;
39451 /* In the case of FP/MMX moves, the registers actually overlap, and we
39452 have to switch modes in order to treat them differently. */
39453 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39454 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39455 cost += 20;
39457 return cost;
39460 /* Moves between SSE/MMX and integer unit are expensive. */
39461 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39462 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39464 /* ??? By keeping the returned value relatively high, we limit the number
39465 of moves between integer and MMX/SSE registers for all targets.
39466 Additionally, the high value prevents a problem with ix86_modes_tieable_p (),
39467 where integer modes in MMX/SSE registers are not tieable
39468 because of missing QImode and HImode moves to, from or between
39469 MMX/SSE registers. */
39470 return MAX (8, ix86_cost->mmxsse_to_integer);
39472 if (MAYBE_FLOAT_CLASS_P (class1))
39473 return ix86_cost->fp_move;
39474 if (MAYBE_SSE_CLASS_P (class1))
39475 return ix86_cost->sse_move;
39476 if (MAYBE_MMX_CLASS_P (class1))
39477 return ix86_cost->mmx_move;
39478 return 2;
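/* A rough worked example of the secondary-memory path above: an SFmode move
   between SSE_REGS and FLOAT_REGS (x87) has to go through memory, so its
   cost is approximately

     1 + inline_memory_move_cost (SFmode, SSE_REGS, 2)
       + inline_memory_move_cost (SFmode, FLOAT_REGS, 2)

   i.e. a store followed by a load, with the extra penalties of 20 added
   only when the register counts differ or FP and MMX registers overlap.  */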
39481 /* Return TRUE if hard register REGNO can hold a value of machine-mode
39482 MODE. */
39484 bool
39485 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
39487 /* The flags register, and only the flags register, can hold CCmode values. */
39488 if (CC_REGNO_P (regno))
39489 return GET_MODE_CLASS (mode) == MODE_CC;
39490 if (GET_MODE_CLASS (mode) == MODE_CC
39491 || GET_MODE_CLASS (mode) == MODE_RANDOM
39492 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39493 return false;
39494 if (STACK_REGNO_P (regno))
39495 return VALID_FP_MODE_P (mode);
39496 if (MASK_REGNO_P (regno))
39497 return (VALID_MASK_REG_MODE (mode)
39498 || (TARGET_AVX512BW
39499 && VALID_MASK_AVX512BW_MODE (mode)));
39500 if (BND_REGNO_P (regno))
39501 return VALID_BND_REG_MODE (mode);
39502 if (SSE_REGNO_P (regno))
39504 /* We implement the move patterns for all vector modes into and
39505 out of SSE registers, even when no operation instructions
39506 are available. */
39508 /* For AVX-512 we allow, regardless of regno:
39509 - XI mode
39510 - any 512-bit wide vector mode
39511 - any scalar mode. */
39512 if (TARGET_AVX512F
39513 && (mode == XImode
39514 || VALID_AVX512F_REG_MODE (mode)
39515 || VALID_AVX512F_SCALAR_MODE (mode)))
39516 return true;
39518 /* TODO check for QI/HI scalars. */
39519 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
39520 if (TARGET_AVX512VL
39521 && (mode == OImode
39522 || mode == TImode
39523 || VALID_AVX256_REG_MODE (mode)
39524 || VALID_AVX512VL_128_REG_MODE (mode)))
39525 return true;
39527 /* xmm16-xmm31 are only available for AVX-512. */
39528 if (EXT_REX_SSE_REGNO_P (regno))
39529 return false;
39531 /* OImode and AVX modes are available only when AVX is enabled. */
39532 return ((TARGET_AVX
39533 && VALID_AVX256_REG_OR_OI_MODE (mode))
39534 || VALID_SSE_REG_MODE (mode)
39535 || VALID_SSE2_REG_MODE (mode)
39536 || VALID_MMX_REG_MODE (mode)
39537 || VALID_MMX_REG_MODE_3DNOW (mode));
39539 if (MMX_REGNO_P (regno))
39541 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39542 so if the register is available at all, then we can move data of
39543 the given mode into or out of it. */
39544 return (VALID_MMX_REG_MODE (mode)
39545 || VALID_MMX_REG_MODE_3DNOW (mode));
39548 if (mode == QImode)
39550 /* Take care with QImode values - they can live in non-QI regs,
39551 but then they do cause partial register stalls. */
39552 if (ANY_QI_REGNO_P (regno))
39553 return true;
39554 if (!TARGET_PARTIAL_REG_STALL)
39555 return true;
39556 /* LRA checks if the hard register is OK for the given mode.
39557 QImode values can live in non-QI regs, so we allow all
39558 registers here. */
39559 if (lra_in_progress)
39560 return true;
39561 return !can_create_pseudo_p ();
39563 /* We handle both integers and floats in the general purpose registers. */
39564 else if (VALID_INT_MODE_P (mode))
39565 return true;
39566 else if (VALID_FP_MODE_P (mode))
39567 return true;
39568 else if (VALID_DFP_MODE_P (mode))
39569 return true;
39570 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39571 on to use that value in smaller contexts, this can easily force a
39572 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39573 supporting DImode, allow it. */
39574 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39575 return true;
39577 return false;
39580 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39581 tieable integer mode. */
39583 static bool
39584 ix86_tieable_integer_mode_p (machine_mode mode)
39586 switch (mode)
39588 case HImode:
39589 case SImode:
39590 return true;
39592 case QImode:
39593 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39595 case DImode:
39596 return TARGET_64BIT;
39598 default:
39599 return false;
39603 /* Return true if MODE1 is accessible in a register that can hold MODE2
39604 without copying. That is, all register classes that can hold MODE2
39605 can also hold MODE1. */
39607 bool
39608 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39610 if (mode1 == mode2)
39611 return true;
39613 if (ix86_tieable_integer_mode_p (mode1)
39614 && ix86_tieable_integer_mode_p (mode2))
39615 return true;
39617 /* MODE2 being XFmode implies fp stack or general regs, which means we
39618 can tie any smaller floating point modes to it. Note that we do not
39619 tie this with TFmode. */
39620 if (mode2 == XFmode)
39621 return mode1 == SFmode || mode1 == DFmode;
39623 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39624 that we can tie it with SFmode. */
39625 if (mode2 == DFmode)
39626 return mode1 == SFmode;
39628 /* If MODE2 is only appropriate for an SSE register, then tie with
39629 any other mode acceptable to SSE registers. */
39630 if (GET_MODE_SIZE (mode2) == 32
39631 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39632 return (GET_MODE_SIZE (mode1) == 32
39633 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39634 if (GET_MODE_SIZE (mode2) == 16
39635 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39636 return (GET_MODE_SIZE (mode1) == 16
39637 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39639 /* If MODE2 is appropriate for an MMX register, then tie
39640 with any other mode acceptable to MMX registers. */
39641 if (GET_MODE_SIZE (mode2) == 8
39642 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39643 return (GET_MODE_SIZE (mode1) == 8
39644 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39646 return false;
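/* A few illustrative pairs for the rules above (assuming SSE2 is enabled):
   SFmode ties with both DFmode and XFmode; V4SFmode ties with V2DImode,
   since both are 16-byte modes valid in SSE registers; DImode ties with
   SImode only when ix86_tieable_integer_mode_p accepts DImode, i.e. in
   64-bit mode.  */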
39649 /* Return the cost of moving between two registers of mode MODE. */
39651 static int
39652 ix86_set_reg_reg_cost (machine_mode mode)
39654 unsigned int units = UNITS_PER_WORD;
39656 switch (GET_MODE_CLASS (mode))
39658 default:
39659 break;
39661 case MODE_CC:
39662 units = GET_MODE_SIZE (CCmode);
39663 break;
39665 case MODE_FLOAT:
39666 if ((TARGET_SSE && mode == TFmode)
39667 || (TARGET_80387 && mode == XFmode)
39668 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39669 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39670 units = GET_MODE_SIZE (mode);
39671 break;
39673 case MODE_COMPLEX_FLOAT:
39674 if ((TARGET_SSE && mode == TCmode)
39675 || (TARGET_80387 && mode == XCmode)
39676 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39677 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39678 units = GET_MODE_SIZE (mode);
39679 break;
39681 case MODE_VECTOR_INT:
39682 case MODE_VECTOR_FLOAT:
39683 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39684 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39685 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39686 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39687 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39688 units = GET_MODE_SIZE (mode);
39691 /* Return the cost of moving between two registers of mode MODE,
39692 assuming that the move will be in pieces of at most UNITS bytes. */
39693 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
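/* Worked examples of the formula above, assuming UNITS_PER_WORD == 4
   (-m32): an SImode register copy costs COSTS_N_INSNS (1) and a DImode
   copy COSTS_N_INSNS (2), while a V4SFmode copy with SSE enabled still
   costs COSTS_N_INSNS (1) because UNITS is widened to the full 16-byte
   mode size.  */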
39696 /* Compute a (partial) cost for rtx X. Return true if the complete
39697 cost has been computed, and false if subexpressions should be
39698 scanned. In either case, *TOTAL contains the cost result. */
39700 static bool
39701 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39702 int *total, bool speed)
39704 rtx mask;
39705 enum rtx_code code = GET_CODE (x);
39706 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39707 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39709 switch (code)
39711 case SET:
39712 if (register_operand (SET_DEST (x), VOIDmode)
39713 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39715 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39716 return true;
39718 return false;
39720 case CONST_INT:
39721 case CONST:
39722 case LABEL_REF:
39723 case SYMBOL_REF:
39724 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39725 *total = 3;
39726 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39727 *total = 2;
39728 else if (flag_pic && SYMBOLIC_CONST (x)
39729 && !(TARGET_64BIT
39730 && (GET_CODE (x) == LABEL_REF
39731 || (GET_CODE (x) == SYMBOL_REF
39732 && SYMBOL_REF_LOCAL_P (x))))
39733 /* Use 0 cost for CONST to improve its propagation. */
39734 && (TARGET_64BIT || GET_CODE (x) != CONST))
39735 *total = 1;
39736 else
39737 *total = 0;
39738 return true;
39740 case CONST_DOUBLE:
39741 if (IS_STACK_MODE (mode))
39742 switch (standard_80387_constant_p (x))
39744 case -1:
39745 case 0:
39746 break;
39747 case 1: /* 0.0 */
39748 *total = 1;
39749 return true;
39750 default: /* Other constants */
39751 *total = 2;
39752 return true;
39754 /* FALLTHRU */
39756 case CONST_VECTOR:
39757 switch (standard_sse_constant_p (x, mode))
39759 case 0:
39760 break;
39761 case 1: /* 0: xor eliminates false dependency */
39762 *total = 0;
39763 return true;
39764 default: /* -1: cmp contains false dependency */
39765 *total = 1;
39766 return true;
39768 /* FALLTHRU */
39770 case CONST_WIDE_INT:
39771 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39772 it'll probably end up. Add a penalty for size. */
39773 *total = (COSTS_N_INSNS (1)
39774 + (!TARGET_64BIT && flag_pic)
39775 + (GET_MODE_SIZE (mode) <= 4
39776 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39777 return true;
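/* Worked example of the CONST_WIDE_INT penalty above: a 16-byte constant
   compiled without PIC costs COSTS_N_INSNS (1) + 2, i.e. 6 with the usual
   COSTS_N_INSNS (N) == (N) * 4 definition.  */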
39779 case ZERO_EXTEND:
39780 /* The zero extension is often completely free on x86_64, so make
39781 it as cheap as possible. */
39782 if (TARGET_64BIT && mode == DImode
39783 && GET_MODE (XEXP (x, 0)) == SImode)
39784 *total = 1;
39785 else if (TARGET_ZERO_EXTEND_WITH_AND)
39786 *total = cost->add;
39787 else
39788 *total = cost->movzx;
39789 return false;
39791 case SIGN_EXTEND:
39792 *total = cost->movsx;
39793 return false;
39795 case ASHIFT:
39796 if (SCALAR_INT_MODE_P (mode)
39797 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39798 && CONST_INT_P (XEXP (x, 1)))
39800 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39801 if (value == 1)
39803 *total = cost->add;
39804 return false;
39806 if ((value == 2 || value == 3)
39807 && cost->lea <= cost->shift_const)
39809 *total = cost->lea;
39810 return false;
39813 /* FALLTHRU */
39815 case ROTATE:
39816 case ASHIFTRT:
39817 case LSHIFTRT:
39818 case ROTATERT:
39819 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39821 /* ??? Should be SSE vector operation cost. */
39822 /* At least for published AMD latencies, this really is the same
39823 as the latency for a simple fpu operation like fabs. */
39824 /* V*QImode is emulated with 1-11 insns. */
39825 if (mode == V16QImode || mode == V32QImode)
39827 int count = 11;
39828 if (TARGET_XOP && mode == V16QImode)
39830 /* For XOP we use vpshab, which requires a broadcast of the
39831 value to the variable shift insn. For constants this
39832 means a V16QImode const in mem; even when we can perform the
39833 shift with one insn, set the cost so as to prefer paddb. */
39834 if (CONSTANT_P (XEXP (x, 1)))
39836 *total = (cost->fabs
39837 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
39838 + (speed ? 2 : COSTS_N_BYTES (16)));
39839 return true;
39841 count = 3;
39843 else if (TARGET_SSSE3)
39844 count = 7;
39845 *total = cost->fabs * count;
39847 else
39848 *total = cost->fabs;
39850 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39852 if (CONST_INT_P (XEXP (x, 1)))
39854 if (INTVAL (XEXP (x, 1)) > 32)
39855 *total = cost->shift_const + COSTS_N_INSNS (2);
39856 else
39857 *total = cost->shift_const * 2;
39859 else
39861 if (GET_CODE (XEXP (x, 1)) == AND)
39862 *total = cost->shift_var * 2;
39863 else
39864 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
39867 else
39869 if (CONST_INT_P (XEXP (x, 1)))
39870 *total = cost->shift_const;
39871 else if (SUBREG_P (XEXP (x, 1))
39872 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
39874 /* Return the cost after shift-and truncation. */
39875 *total = cost->shift_var;
39876 return true;
39878 else
39879 *total = cost->shift_var;
39881 return false;
39883 case FMA:
39885 rtx sub;
39887 gcc_assert (FLOAT_MODE_P (mode));
39888 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39890 /* ??? SSE scalar/vector cost should be used here. */
39891 /* ??? Bald assumption that fma has the same cost as fmul. */
39892 *total = cost->fmul;
39893 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39895 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39896 sub = XEXP (x, 0);
39897 if (GET_CODE (sub) == NEG)
39898 sub = XEXP (sub, 0);
39899 *total += rtx_cost (sub, mode, FMA, 0, speed);
39901 sub = XEXP (x, 2);
39902 if (GET_CODE (sub) == NEG)
39903 sub = XEXP (sub, 0);
39904 *total += rtx_cost (sub, mode, FMA, 2, speed);
39905 return true;
39908 case MULT:
39909 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39911 /* ??? SSE scalar cost should be used here. */
39912 *total = cost->fmul;
39913 return false;
39915 else if (X87_FLOAT_MODE_P (mode))
39917 *total = cost->fmul;
39918 return false;
39920 else if (FLOAT_MODE_P (mode))
39922 /* ??? SSE vector cost should be used here. */
39923 *total = cost->fmul;
39924 return false;
39926 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39928 /* V*QImode is emulated with 7-13 insns. */
39929 if (mode == V16QImode || mode == V32QImode)
39931 int extra = 11;
39932 if (TARGET_XOP && mode == V16QImode)
39933 extra = 5;
39934 else if (TARGET_SSSE3)
39935 extra = 6;
39936 *total = cost->fmul * 2 + cost->fabs * extra;
39938 /* V*DImode is emulated with 5-8 insns. */
39939 else if (mode == V2DImode || mode == V4DImode)
39941 if (TARGET_XOP && mode == V2DImode)
39942 *total = cost->fmul * 2 + cost->fabs * 3;
39943 else
39944 *total = cost->fmul * 3 + cost->fabs * 5;
39946 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39947 insns, including two PMULUDQ. */
39948 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39949 *total = cost->fmul * 2 + cost->fabs * 5;
39950 else
39951 *total = cost->fmul;
39952 return false;
39954 else
39956 rtx op0 = XEXP (x, 0);
39957 rtx op1 = XEXP (x, 1);
39958 int nbits;
39959 if (CONST_INT_P (XEXP (x, 1)))
39961 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
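/* The loop below relies on the value &= value - 1 trick: each iteration
   clears the lowest set bit, so NBITS ends up as the number of set bits
   in the constant multiplier.  */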
39962 for (nbits = 0; value != 0; value &= value - 1)
39963 nbits++;
39965 else
39966 /* This is arbitrary. */
39967 nbits = 7;
39969 /* Compute costs correctly for widening multiplication. */
39970 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39971 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39972 == GET_MODE_SIZE (mode))
39974 int is_mulwiden = 0;
39975 machine_mode inner_mode = GET_MODE (op0);
39977 if (GET_CODE (op0) == GET_CODE (op1))
39978 is_mulwiden = 1, op1 = XEXP (op1, 0);
39979 else if (CONST_INT_P (op1))
39981 if (GET_CODE (op0) == SIGN_EXTEND)
39982 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39983 == INTVAL (op1);
39984 else
39985 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39988 if (is_mulwiden)
39989 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39992 *total = (cost->mult_init[MODE_INDEX (mode)]
39993 + nbits * cost->mult_bit
39994 + rtx_cost (op0, mode, outer_code, opno, speed)
39995 + rtx_cost (op1, mode, outer_code, opno, speed));
39997 return true;
40000 case DIV:
40001 case UDIV:
40002 case MOD:
40003 case UMOD:
40004 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40005 /* ??? SSE cost should be used here. */
40006 *total = cost->fdiv;
40007 else if (X87_FLOAT_MODE_P (mode))
40008 *total = cost->fdiv;
40009 else if (FLOAT_MODE_P (mode))
40010 /* ??? SSE vector cost should be used here. */
40011 *total = cost->fdiv;
40012 else
40013 *total = cost->divide[MODE_INDEX (mode)];
40014 return false;
40016 case PLUS:
40017 if (GET_MODE_CLASS (mode) == MODE_INT
40018 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40020 if (GET_CODE (XEXP (x, 0)) == PLUS
40021 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40022 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40023 && CONSTANT_P (XEXP (x, 1)))
40025 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40026 if (val == 2 || val == 4 || val == 8)
40028 *total = cost->lea;
40029 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40030 outer_code, opno, speed);
40031 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40032 outer_code, opno, speed);
40033 *total += rtx_cost (XEXP (x, 1), mode,
40034 outer_code, opno, speed);
40035 return true;
40038 else if (GET_CODE (XEXP (x, 0)) == MULT
40039 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40041 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40042 if (val == 2 || val == 4 || val == 8)
40044 *total = cost->lea;
40045 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40046 outer_code, opno, speed);
40047 *total += rtx_cost (XEXP (x, 1), mode,
40048 outer_code, opno, speed);
40049 return true;
40052 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40054 *total = cost->lea;
40055 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40056 outer_code, opno, speed);
40057 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40058 outer_code, opno, speed);
40059 *total += rtx_cost (XEXP (x, 1), mode,
40060 outer_code, opno, speed);
40061 return true;
40064 /* FALLTHRU */
40066 case MINUS:
40067 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40069 /* ??? SSE cost should be used here. */
40070 *total = cost->fadd;
40071 return false;
40073 else if (X87_FLOAT_MODE_P (mode))
40075 *total = cost->fadd;
40076 return false;
40078 else if (FLOAT_MODE_P (mode))
40080 /* ??? SSE vector cost should be used here. */
40081 *total = cost->fadd;
40082 return false;
40084 /* FALLTHRU */
40086 case AND:
40087 case IOR:
40088 case XOR:
40089 if (GET_MODE_CLASS (mode) == MODE_INT
40090 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40092 *total = (cost->add * 2
40093 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40094 << (GET_MODE (XEXP (x, 0)) != DImode))
40095 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40096 << (GET_MODE (XEXP (x, 1)) != DImode)));
40097 return true;
40099 /* FALLTHRU */
40101 case NEG:
40102 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40104 /* ??? SSE cost should be used here. */
40105 *total = cost->fchs;
40106 return false;
40108 else if (X87_FLOAT_MODE_P (mode))
40110 *total = cost->fchs;
40111 return false;
40113 else if (FLOAT_MODE_P (mode))
40115 /* ??? SSE vector cost should be used here. */
40116 *total = cost->fchs;
40117 return false;
40119 /* FALLTHRU */
40121 case NOT:
40122 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40124 /* ??? Should be SSE vector operation cost. */
40125 /* At least for published AMD latencies, this really is the same
40126 as the latency for a simple fpu operation like fabs. */
40127 *total = cost->fabs;
40129 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40130 *total = cost->add * 2;
40131 else
40132 *total = cost->add;
40133 return false;
40135 case COMPARE:
40136 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40137 && XEXP (XEXP (x, 0), 1) == const1_rtx
40138 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40139 && XEXP (x, 1) == const0_rtx)
40141 /* This kind of construct is implemented using test[bwl].
40142 Treat it as if we had an AND. */
40143 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40144 *total = (cost->add
40145 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40146 opno, speed)
40147 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40148 return true;
40151 /* The embedded comparison operand is completely free. */
40152 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40153 && XEXP (x, 1) == const0_rtx)
40154 *total = 0;
40156 return false;
40158 case FLOAT_EXTEND:
40159 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40160 *total = 0;
40161 return false;
40163 case ABS:
40164 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40165 /* ??? SSE cost should be used here. */
40166 *total = cost->fabs;
40167 else if (X87_FLOAT_MODE_P (mode))
40168 *total = cost->fabs;
40169 else if (FLOAT_MODE_P (mode))
40170 /* ??? SSE vector cost should be used here. */
40171 *total = cost->fabs;
40172 return false;
40174 case SQRT:
40175 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40176 /* ??? SSE cost should be used here. */
40177 *total = cost->fsqrt;
40178 else if (X87_FLOAT_MODE_P (mode))
40179 *total = cost->fsqrt;
40180 else if (FLOAT_MODE_P (mode))
40181 /* ??? SSE vector cost should be used here. */
40182 *total = cost->fsqrt;
40183 return false;
40185 case UNSPEC:
40186 if (XINT (x, 1) == UNSPEC_TP)
40187 *total = 0;
40188 return false;
40190 case VEC_SELECT:
40191 case VEC_CONCAT:
40192 case VEC_DUPLICATE:
40193 /* ??? Assume all of these vector manipulation patterns are
40194 recognizable, in which case they all pretty much have the
40195 same cost. */
40196 *total = cost->fabs;
40197 return true;
40198 case VEC_MERGE:
40199 mask = XEXP (x, 2);
40200 /* This is a masked instruction; assume the same cost
40201 as the non-masked variant. */
40202 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40203 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40204 else
40205 *total = cost->fabs;
40206 return true;
40208 default:
40209 return false;
40213 #if TARGET_MACHO
40215 static int current_machopic_label_num;
40217 /* Given a symbol name and its associated stub, write out the
40218 definition of the stub. */
40220 void
40221 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40223 unsigned int length;
40224 char *binder_name, *symbol_name, lazy_ptr_name[32];
40225 int label = ++current_machopic_label_num;
40227 /* For 64-bit we shouldn't get here. */
40228 gcc_assert (!TARGET_64BIT);
40230 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40231 symb = targetm.strip_name_encoding (symb);
40233 length = strlen (stub);
40234 binder_name = XALLOCAVEC (char, length + 32);
40235 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40237 length = strlen (symb);
40238 symbol_name = XALLOCAVEC (char, length + 32);
40239 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40241 sprintf (lazy_ptr_name, "L%d$lz", label);
40243 if (MACHOPIC_ATT_STUB)
40244 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40245 else if (MACHOPIC_PURE)
40246 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40247 else
40248 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40250 fprintf (file, "%s:\n", stub);
40251 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40253 if (MACHOPIC_ATT_STUB)
40255 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40257 else if (MACHOPIC_PURE)
40259 /* PIC stub. */
40260 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40261 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40262 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40263 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40264 label, lazy_ptr_name, label);
40265 fprintf (file, "\tjmp\t*%%ecx\n");
40267 else
40268 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40270 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40271 it needs no stub-binding-helper. */
40272 if (MACHOPIC_ATT_STUB)
40273 return;
40275 fprintf (file, "%s:\n", binder_name);
40277 if (MACHOPIC_PURE)
40279 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40280 fprintf (file, "\tpushl\t%%ecx\n");
40282 else
40283 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40285 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40287 /* N.B. Keep the correspondence of these
40288 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40289 old-pic/new-pic/non-pic stubs; altering this will break
40290 compatibility with existing dylibs. */
40291 if (MACHOPIC_PURE)
40293 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40294 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40296 else
40297 /* 16-byte -mdynamic-no-pic stub. */
40298 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40300 fprintf (file, "%s:\n", lazy_ptr_name);
40301 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40302 fprintf (file, ASM_LONG "%s\n", binder_name);
40304 #endif /* TARGET_MACHO */
40306 /* Order the registers for register allocator. */
40308 void
40309 x86_order_regs_for_local_alloc (void)
40311 int pos = 0;
40312 int i;
40314 /* First allocate the local general purpose registers. */
40315 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40316 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40317 reg_alloc_order [pos++] = i;
40319 /* Global general purpose registers. */
40320 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40321 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40322 reg_alloc_order [pos++] = i;
40324 /* x87 registers come first in case we are doing FP math
40325 using them. */
40326 if (!TARGET_SSE_MATH)
40327 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40328 reg_alloc_order [pos++] = i;
40330 /* SSE registers. */
40331 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40332 reg_alloc_order [pos++] = i;
40333 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40334 reg_alloc_order [pos++] = i;
40336 /* Extended REX SSE registers. */
40337 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40338 reg_alloc_order [pos++] = i;
40340 /* Mask register. */
40341 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40342 reg_alloc_order [pos++] = i;
40344 /* MPX bound registers. */
40345 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40346 reg_alloc_order [pos++] = i;
40348 /* x87 registers. */
40349 if (TARGET_SSE_MATH)
40350 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40351 reg_alloc_order [pos++] = i;
40353 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40354 reg_alloc_order [pos++] = i;
40356 /* Initialize the rest of the array, as we do not allocate some registers
40357 at all. */
40358 while (pos < FIRST_PSEUDO_REGISTER)
40359 reg_alloc_order [pos++] = 0;
40362 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40363 in struct attribute_spec handler. */
40364 static tree
40365 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
40366 tree args,
40367 int,
40368 bool *no_add_attrs)
40370 if (TREE_CODE (*node) != FUNCTION_TYPE
40371 && TREE_CODE (*node) != METHOD_TYPE
40372 && TREE_CODE (*node) != FIELD_DECL
40373 && TREE_CODE (*node) != TYPE_DECL)
40375 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40376 name);
40377 *no_add_attrs = true;
40378 return NULL_TREE;
40380 if (TARGET_64BIT)
40382 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40383 name);
40384 *no_add_attrs = true;
40385 return NULL_TREE;
40387 if (is_attribute_p ("callee_pop_aggregate_return", name))
40389 tree cst;
40391 cst = TREE_VALUE (args);
40392 if (TREE_CODE (cst) != INTEGER_CST)
40394 warning (OPT_Wattributes,
40395 "%qE attribute requires an integer constant argument",
40396 name);
40397 *no_add_attrs = true;
40399 else if (compare_tree_int (cst, 0) != 0
40400 && compare_tree_int (cst, 1) != 0)
40402 warning (OPT_Wattributes,
40403 "argument to %qE attribute is neither zero, nor one",
40404 name);
40405 *no_add_attrs = true;
40408 return NULL_TREE;
40411 return NULL_TREE;
40414 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
40415 struct attribute_spec.handler. */
40416 static tree
40417 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40418 bool *no_add_attrs)
40420 if (TREE_CODE (*node) != FUNCTION_TYPE
40421 && TREE_CODE (*node) != METHOD_TYPE
40422 && TREE_CODE (*node) != FIELD_DECL
40423 && TREE_CODE (*node) != TYPE_DECL)
40425 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40426 name);
40427 *no_add_attrs = true;
40428 return NULL_TREE;
40431 /* Can combine regparm with all attributes but fastcall. */
40432 if (is_attribute_p ("ms_abi", name))
40434 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40436 error ("ms_abi and sysv_abi attributes are not compatible");
40439 return NULL_TREE;
40441 else if (is_attribute_p ("sysv_abi", name))
40443 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40445 error ("ms_abi and sysv_abi attributes are not compatible");
40448 return NULL_TREE;
40451 return NULL_TREE;
40454 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40455 struct attribute_spec.handler. */
40456 static tree
40457 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40458 bool *no_add_attrs)
40460 tree *type = NULL;
40461 if (DECL_P (*node))
40463 if (TREE_CODE (*node) == TYPE_DECL)
40464 type = &TREE_TYPE (*node);
40466 else
40467 type = node;
40469 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40471 warning (OPT_Wattributes, "%qE attribute ignored",
40472 name);
40473 *no_add_attrs = true;
40476 else if ((is_attribute_p ("ms_struct", name)
40477 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40478 || ((is_attribute_p ("gcc_struct", name)
40479 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40481 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40482 name);
40483 *no_add_attrs = true;
40486 return NULL_TREE;
40489 static tree
40490 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40491 bool *no_add_attrs)
40493 if (TREE_CODE (*node) != FUNCTION_DECL)
40495 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40496 name);
40497 *no_add_attrs = true;
40499 return NULL_TREE;
40502 static tree
40503 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40504 int, bool *)
40506 return NULL_TREE;
40509 static tree
40510 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40512 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
40513 but the function type contains args and return type data. */
40514 tree func_type = *node;
40515 tree return_type = TREE_TYPE (func_type);
40517 int nargs = 0;
40518 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40519 while (current_arg_type
40520 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40522 if (nargs == 0)
40524 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40525 error ("interrupt service routine should have a pointer "
40526 "as the first argument");
40528 else if (nargs == 1)
40530 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40531 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40532 error ("interrupt service routine should have unsigned %s"
40533 "int as the second argument",
40534 TARGET_64BIT
40535 ? (TARGET_X32 ? "long long " : "long ")
40536 : "");
40538 nargs++;
40539 current_arg_type = TREE_CHAIN (current_arg_type);
40541 if (!nargs || nargs > 2)
40542 error ("interrupt service routine can only have a pointer argument "
40543 "and an optional integer argument");
40544 if (! VOID_TYPE_P (return_type))
40545 error ("interrupt service routine can't have non-void return value");
40547 return NULL_TREE;
40550 static bool
40551 ix86_ms_bitfield_layout_p (const_tree record_type)
40553 return ((TARGET_MS_BITFIELD_LAYOUT
40554 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40555 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40558 /* Returns an expression indicating where the this parameter is
40559 located on entry to the FUNCTION. */
40561 static rtx
40562 x86_this_parameter (tree function)
40564 tree type = TREE_TYPE (function);
40565 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40566 int nregs;
40568 if (TARGET_64BIT)
40570 const int *parm_regs;
40572 if (ix86_function_type_abi (type) == MS_ABI)
40573 parm_regs = x86_64_ms_abi_int_parameter_registers;
40574 else
40575 parm_regs = x86_64_int_parameter_registers;
40576 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40579 nregs = ix86_function_regparm (type, function);
40581 if (nregs > 0 && !stdarg_p (type))
40583 int regno;
40584 unsigned int ccvt = ix86_get_callcvt (type);
40586 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40587 regno = aggr ? DX_REG : CX_REG;
40588 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40590 regno = CX_REG;
40591 if (aggr)
40592 return gen_rtx_MEM (SImode,
40593 plus_constant (Pmode, stack_pointer_rtx, 4));
40595 else
40597 regno = AX_REG;
40598 if (aggr)
40600 regno = DX_REG;
40601 if (nregs == 1)
40602 return gen_rtx_MEM (SImode,
40603 plus_constant (Pmode,
40604 stack_pointer_rtx, 4));
40607 return gen_rtx_REG (SImode, regno);
40610 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40611 aggr ? 8 : 4));
40614 /* Determine whether x86_output_mi_thunk can succeed. */
40616 static bool
40617 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40618 const_tree function)
40620 /* 64-bit can handle anything. */
40621 if (TARGET_64BIT)
40622 return true;
40624 /* For 32-bit, everything's fine if we have one free register. */
40625 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40626 return true;
40628 /* Need a free register for vcall_offset. */
40629 if (vcall_offset)
40630 return false;
40632 /* Need a free register for GOT references. */
40633 if (flag_pic && !targetm.binds_local_p (function))
40634 return false;
40636 /* Otherwise ok. */
40637 return true;
40640 /* Output the assembler code for a thunk function. THUNK_DECL is the
40641 declaration for the thunk function itself, FUNCTION is the decl for
40642 the target function. DELTA is an immediate constant offset to be
40643 added to THIS. If VCALL_OFFSET is nonzero, the word at
40644 *(*this + vcall_offset) should be added to THIS. */
40646 static void
40647 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40648 HOST_WIDE_INT vcall_offset, tree function)
40650 rtx this_param = x86_this_parameter (function);
40651 rtx this_reg, tmp, fnaddr;
40652 unsigned int tmp_regno;
40653 rtx_insn *insn;
40655 if (TARGET_64BIT)
40656 tmp_regno = R10_REG;
40657 else
40659 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40660 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40661 tmp_regno = AX_REG;
40662 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40663 tmp_regno = DX_REG;
40664 else
40665 tmp_regno = CX_REG;
40668 emit_note (NOTE_INSN_PROLOGUE_END);
40670 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40671 pull it in now and let DELTA benefit. */
40672 if (REG_P (this_param))
40673 this_reg = this_param;
40674 else if (vcall_offset)
40676 /* Put the this parameter into %eax. */
40677 this_reg = gen_rtx_REG (Pmode, AX_REG);
40678 emit_move_insn (this_reg, this_param);
40680 else
40681 this_reg = NULL_RTX;
40683 /* Adjust the this parameter by a fixed constant. */
40684 if (delta)
40686 rtx delta_rtx = GEN_INT (delta);
40687 rtx delta_dst = this_reg ? this_reg : this_param;
40689 if (TARGET_64BIT)
40691 if (!x86_64_general_operand (delta_rtx, Pmode))
40693 tmp = gen_rtx_REG (Pmode, tmp_regno);
40694 emit_move_insn (tmp, delta_rtx);
40695 delta_rtx = tmp;
40699 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40702 /* Adjust the this parameter by a value stored in the vtable. */
40703 if (vcall_offset)
40705 rtx vcall_addr, vcall_mem, this_mem;
40707 tmp = gen_rtx_REG (Pmode, tmp_regno);
40709 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40710 if (Pmode != ptr_mode)
40711 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40712 emit_move_insn (tmp, this_mem);
40714 /* Adjust the this parameter. */
40715 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40716 if (TARGET_64BIT
40717 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40719 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40720 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40721 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40724 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40725 if (Pmode != ptr_mode)
40726 emit_insn (gen_addsi_1_zext (this_reg,
40727 gen_rtx_REG (ptr_mode,
40728 REGNO (this_reg)),
40729 vcall_mem));
40730 else
40731 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40734 /* If necessary, drop THIS back to its stack slot. */
40735 if (this_reg && this_reg != this_param)
40736 emit_move_insn (this_param, this_reg);
40738 fnaddr = XEXP (DECL_RTL (function), 0);
40739 if (TARGET_64BIT)
40741 if (!flag_pic || targetm.binds_local_p (function)
40742 || TARGET_PECOFF)
40744 else
40746 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40747 tmp = gen_rtx_CONST (Pmode, tmp);
40748 fnaddr = gen_const_mem (Pmode, tmp);
40751 else
40753 if (!flag_pic || targetm.binds_local_p (function))
40755 #if TARGET_MACHO
40756 else if (TARGET_MACHO)
40758 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40759 fnaddr = XEXP (fnaddr, 0);
40761 #endif /* TARGET_MACHO */
40762 else
40764 tmp = gen_rtx_REG (Pmode, CX_REG);
40765 output_set_got (tmp, NULL_RTX);
40767 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40768 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40769 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40770 fnaddr = gen_const_mem (Pmode, fnaddr);
40774 /* Our sibling call patterns do not allow memories, because we have no
40775 predicate that can distinguish between frame and non-frame memory.
40776 For our purposes here, we can get away with (ab)using a jump pattern,
40777 because we're going to do no optimization. */
40778 if (MEM_P (fnaddr))
40780 if (sibcall_insn_operand (fnaddr, word_mode))
40782 fnaddr = XEXP (DECL_RTL (function), 0);
40783 tmp = gen_rtx_MEM (QImode, fnaddr);
40784 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40785 tmp = emit_call_insn (tmp);
40786 SIBLING_CALL_P (tmp) = 1;
40788 else
40789 emit_jump_insn (gen_indirect_jump (fnaddr));
40791 else
40793 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40795 // CM_LARGE_PIC always uses pseudo PIC register which is
40796 // uninitialized. Since FUNCTION is local and calling it
40797 // doesn't go through PLT, we use scratch register %r11 as
40798 // PIC register and initialize it here.
40799 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40800 ix86_init_large_pic_reg (tmp_regno);
40801 fnaddr = legitimize_pic_address (fnaddr,
40802 gen_rtx_REG (Pmode, tmp_regno));
40805 if (!sibcall_insn_operand (fnaddr, word_mode))
40807 tmp = gen_rtx_REG (word_mode, tmp_regno);
40808 if (GET_MODE (fnaddr) != word_mode)
40809 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40810 emit_move_insn (tmp, fnaddr);
40811 fnaddr = tmp;
40814 tmp = gen_rtx_MEM (QImode, fnaddr);
40815 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40816 tmp = emit_call_insn (tmp);
40817 SIBLING_CALL_P (tmp) = 1;
40819 emit_barrier ();
40821 /* Emit just enough of rest_of_compilation to get the insns emitted.
40822 Note that use_thunk calls assemble_start_function et al. */
40823 insn = get_insns ();
40824 shorten_branches (insn);
40825 final_start_function (insn, file, 1);
40826 final (insn, file, 1);
40827 final_end_function ();
40830 static void
40831 x86_file_start (void)
40833 default_file_start ();
40834 if (TARGET_16BIT)
40835 fputs ("\t.code16gcc\n", asm_out_file);
40836 #if TARGET_MACHO
40837 darwin_file_start ();
40838 #endif
40839 if (X86_FILE_START_VERSION_DIRECTIVE)
40840 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40841 if (X86_FILE_START_FLTUSED)
40842 fputs ("\t.global\t__fltused\n", asm_out_file);
40843 if (ix86_asm_dialect == ASM_INTEL)
40844 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
40848 x86_field_alignment (tree field, int computed)
40850 machine_mode mode;
40851 tree type = TREE_TYPE (field);
40853 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40854 return computed;
40855 if (TARGET_IAMCU)
40856 return iamcu_alignment (type, computed);
40857 mode = TYPE_MODE (strip_array_types (type));
40858 if (mode == DFmode || mode == DCmode
40859 || GET_MODE_CLASS (mode) == MODE_INT
40860 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40861 return MIN (32, computed);
40862 return computed;
40865 /* Print call to TARGET to FILE. */
40867 static void
40868 x86_print_call_or_nop (FILE *file, const char *target)
40870 if (flag_nop_mcount)
40871 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
40872 else
40873 fprintf (file, "1:\tcall\t%s\n", target);
40876 /* Output assembler code to FILE to increment profiler label # LABELNO
40877 for profiling a function entry. */
40878 void
40879 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40881 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40882 : MCOUNT_NAME);
40883 if (TARGET_64BIT)
40885 #ifndef NO_PROFILE_COUNTERS
40886 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40887 #endif
40889 if (!TARGET_PECOFF && flag_pic)
40890 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40891 else
40892 x86_print_call_or_nop (file, mcount_name);
40894 else if (flag_pic)
40896 #ifndef NO_PROFILE_COUNTERS
40897 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40898 LPREFIX, labelno);
40899 #endif
40900 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40902 else
40904 #ifndef NO_PROFILE_COUNTERS
40905 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40906 LPREFIX, labelno);
40907 #endif
40908 x86_print_call_or_nop (file, mcount_name);
40911 if (flag_record_mcount)
40913 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40914 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40915 fprintf (file, "\t.previous\n");
40919 /* We don't have exact information about insn sizes, but we may quite
40920 safely assume that we know about all 1-byte insns and memory
40921 address sizes. This is enough to eliminate unnecessary padding in
40922 99% of cases. */
40924 static int
40925 min_insn_size (rtx_insn *insn)
40927 int l = 0, len;
40929 if (!INSN_P (insn) || !active_insn_p (insn))
40930 return 0;
40932 /* Discard alignments we've emitted, and jump instructions. */
40933 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40934 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40935 return 0;
40937 /* Important case - calls are always 5 bytes.
40938 It is common to have many calls in a row. */
40939 if (CALL_P (insn)
40940 && symbolic_reference_mentioned_p (PATTERN (insn))
40941 && !SIBLING_CALL_P (insn))
40942 return 5;
40943 len = get_attr_length (insn);
40944 if (len <= 1)
40945 return 1;
40947 /* For normal instructions we rely on get_attr_length being exact,
40948 with a few exceptions. */
40949 if (!JUMP_P (insn))
40951 enum attr_type type = get_attr_type (insn);
40953 switch (type)
40955 case TYPE_MULTI:
40956 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40957 || asm_noperands (PATTERN (insn)) >= 0)
40958 return 0;
40959 break;
40960 case TYPE_OTHER:
40961 case TYPE_FCMP:
40962 break;
40963 default:
40964 /* Otherwise trust get_attr_length. */
40965 return len;
40968 l = get_attr_length_address (insn);
40969 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40970 l = 4;
40972 if (l)
40973 return 1+l;
40974 else
40975 return 2;
40978 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40980 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
40981 16-byte window. */
40983 static void
40984 ix86_avoid_jump_mispredicts (void)
40986 rtx_insn *insn, *start = get_insns ();
40987 int nbytes = 0, njumps = 0;
40988 bool isjump = false;
40990 /* Look for all minimal intervals of instructions containing 4 jumps.
40991 The intervals are bounded by START and INSN. NBYTES is the total
40992 size of the instructions in the interval, including INSN and not including
40993 START. When NBYTES is smaller than 16 bytes, it is possible
40994 that the ends of START and INSN fall into the same 16-byte page.
40996 The smallest offset in the page at which INSN can start is the case where
40997 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
40998 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
41000 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
41001 have to; control transfer to its label(s) can be performed through other
41002 means, and we also estimate the minimum length of all asm stmts as 0. */
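/* A rough worked instance of the padding rule above: when the pass finds a
   window of fewer than 16 bytes that would otherwise contain a fourth jump,
   with NBYTES == 12 and min_insn_size (INSN) == 2, it emits a pad of
   15 - 12 + 2 = 5 bytes before INSN.  */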
41003 for (insn = start; insn; insn = NEXT_INSN (insn))
41005 int min_size;
41007 if (LABEL_P (insn))
41009 int align = label_to_alignment (insn);
41010 int max_skip = label_to_max_skip (insn);
41012 if (max_skip > 15)
41013 max_skip = 15;
41014 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41015 already in the current 16 byte page, because otherwise
41016 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41017 bytes to reach 16 byte boundary. */
41018 if (align <= 0
41019 || (align <= 3 && max_skip != (1 << align) - 1))
41020 max_skip = 0;
41021 if (dump_file)
41022 fprintf (dump_file, "Label %i with max_skip %i\n",
41023 INSN_UID (insn), max_skip);
41024 if (max_skip)
41026 while (nbytes + max_skip >= 16)
41028 start = NEXT_INSN (start);
41029 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41030 || CALL_P (start))
41031 njumps--, isjump = true;
41032 else
41033 isjump = false;
41034 nbytes -= min_insn_size (start);
41037 continue;
41040 min_size = min_insn_size (insn);
41041 nbytes += min_size;
41042 if (dump_file)
41043 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41044 INSN_UID (insn), min_size);
41045 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41046 || CALL_P (insn))
41047 njumps++;
41048 else
41049 continue;
41051 while (njumps > 3)
41053 start = NEXT_INSN (start);
41054 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41055 || CALL_P (start))
41056 njumps--, isjump = true;
41057 else
41058 isjump = false;
41059 nbytes -= min_insn_size (start);
41061 gcc_assert (njumps >= 0);
41062 if (dump_file)
41063 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41064 INSN_UID (start), INSN_UID (insn), nbytes);
41066 if (njumps == 3 && isjump && nbytes < 16)
41068 int padsize = 15 - nbytes + min_insn_size (insn);
41070 if (dump_file)
41071 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41072 INSN_UID (insn), padsize);
41073 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41077 #endif
41079 /* AMD Athlon works faster
41080 when RET is not the destination of a conditional jump or directly preceded
41081 by another jump instruction. We avoid the penalty by inserting a NOP just
41082 before the RET instruction in such cases. */
41083 static void
41084 ix86_pad_returns (void)
41086 edge e;
41087 edge_iterator ei;
41089 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41091 basic_block bb = e->src;
41092 rtx_insn *ret = BB_END (bb);
41093 rtx_insn *prev;
41094 bool replace = false;
41096 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41097 || optimize_bb_for_size_p (bb))
41098 continue;
41099 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41100 if (active_insn_p (prev) || LABEL_P (prev))
41101 break;
41102 if (prev && LABEL_P (prev))
41104 edge e;
41105 edge_iterator ei;
41107 FOR_EACH_EDGE (e, ei, bb->preds)
41108 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41109 && !(e->flags & EDGE_FALLTHRU))
41111 replace = true;
41112 break;
41115 if (!replace)
41117 prev = prev_active_insn (ret);
41118 if (prev
41119 && ((JUMP_P (prev) && any_condjump_p (prev))
41120 || CALL_P (prev)))
41121 replace = true;
41122 /* Empty functions get a branch mispredict even when
41123 the jump destination is not visible to us. */
41124 if (!prev && !optimize_function_for_size_p (cfun))
41125 replace = true;
41127 if (replace)
41129 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41130 delete_insn (ret);
41135 /* Count the minimum number of instructions in BB. Return 4 if the
41136 number of instructions >= 4. */
41138 static int
41139 ix86_count_insn_bb (basic_block bb)
41141 rtx_insn *insn;
41142 int insn_count = 0;
41144 /* Count number of instructions in this block. Return 4 if the number
41145 of instructions >= 4. */
41146 FOR_BB_INSNS (bb, insn)
41148 /* This only happens in exit blocks. */
41149 if (JUMP_P (insn)
41150 && ANY_RETURN_P (PATTERN (insn)))
41151 break;
41153 if (NONDEBUG_INSN_P (insn)
41154 && GET_CODE (PATTERN (insn)) != USE
41155 && GET_CODE (PATTERN (insn)) != CLOBBER)
41157 insn_count++;
41158 if (insn_count >= 4)
41159 return insn_count;
41163 return insn_count;
41167 /* Count the minimum number of instructions in code path in BB.
41168 Return 4 if the number of instructions >= 4. */
41170 static int
41171 ix86_count_insn (basic_block bb)
41173 edge e;
41174 edge_iterator ei;
41175 int min_prev_count;
41177 /* Only bother counting instructions along paths with no
41178 more than 2 basic blocks between entry and exit. Given
41179 that BB has an edge to exit, determine if a predecessor
41180 of BB has an edge from entry. If so, compute the number
41181 of instructions in the predecessor block. If there
41182 happen to be multiple such blocks, compute the minimum. */
41183 min_prev_count = 4;
41184 FOR_EACH_EDGE (e, ei, bb->preds)
41186 edge prev_e;
41187 edge_iterator prev_ei;
41189 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41191 min_prev_count = 0;
41192 break;
41194 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41196 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41198 int count = ix86_count_insn_bb (e->src);
41199 if (count < min_prev_count)
41200 min_prev_count = count;
41201 break;
41206 if (min_prev_count < 4)
41207 min_prev_count += ix86_count_insn_bb (bb);
41209 return min_prev_count;
41212 /* Pad short function to 4 instructions. */
41214 static void
41215 ix86_pad_short_function (void)
41217 edge e;
41218 edge_iterator ei;
41220 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41222 rtx_insn *ret = BB_END (e->src);
41223 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41225 int insn_count = ix86_count_insn (e->src);
41227 /* Pad short function. */
41228 if (insn_count < 4)
41230 rtx_insn *insn = ret;
41232 /* Find epilogue. */
41233 while (insn
41234 && (!NOTE_P (insn)
41235 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41236 insn = PREV_INSN (insn);
41238 if (!insn)
41239 insn = ret;
41241 /* Two NOPs count as one instruction. */
41242 insn_count = 2 * (4 - insn_count);
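/* For instance, a path with only two real instructions yields
   insn_count = 2 * (4 - 2) == 4, so four NOPs are emitted below, which
   count as the two missing instructions.  */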
41243 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41249 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41250 the epilogue, the Windows system unwinder will apply epilogue logic and
41251 produce incorrect offsets. This can be avoided by adding a nop between
41252 the last insn that can throw and the first insn of the epilogue. */
41254 static void
41255 ix86_seh_fixup_eh_fallthru (void)
41257 edge e;
41258 edge_iterator ei;
41260 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41262 rtx_insn *insn, *next;
41264 /* Find the beginning of the epilogue. */
41265 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41266 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41267 break;
41268 if (insn == NULL)
41269 continue;
41271 /* We only care about preceding insns that can throw. */
41272 insn = prev_active_insn (insn);
41273 if (insn == NULL || !can_throw_internal (insn))
41274 continue;
41276 /* Do not separate calls from their debug information. */
41277 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41278 if (NOTE_P (next)
41279 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41280 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41281 insn = next;
41282 else
41283 break;
41285 emit_insn_after (gen_nops (const1_rtx), insn);
41289 /* Given a register number BASE, the lowest of a group of registers, update
41290 regsets IN and OUT with the registers that should be avoided in input
41291 and output operands respectively when trying to avoid generating a modr/m
41292 byte for -fmitigate-rop. */
41294 static void
41295 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41297 SET_HARD_REG_BIT (out, base);
41298 SET_HARD_REG_BIT (out, base + 1);
41299 SET_HARD_REG_BIT (in, base + 2);
41300 SET_HARD_REG_BIT (in, base + 3);
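/* Illustrative effect of the helper above, assuming BASE is the first
   register of a group such as FIRST_SSE_REG: BASE and BASE + 1 become
   risky as outputs, BASE + 2 and BASE + 3 risky as inputs.  Any mapping
   of these offsets to concrete register names (e.g. xmm0..xmm3) is only
   an example.  */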
41303 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
41304 that certain encodings of modr/m bytes do not occur. */
41305 static void
41306 ix86_mitigate_rop (void)
41308 HARD_REG_SET input_risky;
41309 HARD_REG_SET output_risky;
41310 HARD_REG_SET inout_risky;
41312 CLEAR_HARD_REG_SET (output_risky);
41313 CLEAR_HARD_REG_SET (input_risky);
41314 SET_HARD_REG_BIT (output_risky, AX_REG);
41315 SET_HARD_REG_BIT (output_risky, CX_REG);
41316 SET_HARD_REG_BIT (input_risky, BX_REG);
41317 SET_HARD_REG_BIT (input_risky, DX_REG);
41318 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41319 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41320 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41321 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41322 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41323 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41324 COPY_HARD_REG_SET (inout_risky, input_risky);
41325 IOR_HARD_REG_SET (inout_risky, output_risky);
41327 df_note_add_problem ();
41328 /* Fix up what stack-regs did. */
41329 df_insn_rescan_all ();
41330 df_analyze ();
41332 regrename_init (true);
41333 regrename_analyze (NULL);
41335 auto_vec<du_head_p> cands;
41337 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41339 if (!NONDEBUG_INSN_P (insn))
41340 continue;
41342 if (GET_CODE (PATTERN (insn)) == USE
41343 || GET_CODE (PATTERN (insn)) == CLOBBER)
41344 continue;
41346 extract_insn (insn);
41348 int opno0, opno1;
41349 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41350 recog_data.n_operands, &opno0,
41351 &opno1);
41353 if (!ix86_rop_should_change_byte_p (modrm))
41354 continue;
41356 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41358 /* This happens when regrename has to fail a block. */
41359 if (!info->op_info)
41360 continue;
41362 if (info->op_info[opno0].n_chains != 0)
41364 gcc_assert (info->op_info[opno0].n_chains == 1);
41365 du_head_p op0c;
41366 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41367 if (op0c->target_data_1 + op0c->target_data_2 == 0
41368 && !op0c->cannot_rename)
41369 cands.safe_push (op0c);
41371 op0c->target_data_1++;
41373 if (info->op_info[opno1].n_chains != 0)
41375 gcc_assert (info->op_info[opno1].n_chains == 1);
41376 du_head_p op1c;
41377 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41378 if (op1c->target_data_1 + op1c->target_data_2 == 0
41379 && !op1c->cannot_rename)
41380 cands.safe_push (op1c);
41382 op1c->target_data_2++;
41386 int i;
41387 du_head_p head;
41388 FOR_EACH_VEC_ELT (cands, i, head)
41390 int old_reg, best_reg;
41391 HARD_REG_SET unavailable;
41393 CLEAR_HARD_REG_SET (unavailable);
41394 if (head->target_data_1)
41395 IOR_HARD_REG_SET (unavailable, output_risky);
41396 if (head->target_data_2)
41397 IOR_HARD_REG_SET (unavailable, input_risky);
41399 int n_uses;
41400 reg_class superclass = regrename_find_superclass (head, &n_uses,
41401 &unavailable);
41402 old_reg = head->regno;
41403 best_reg = find_rename_reg (head, superclass, &unavailable,
41404 old_reg, false);
41405 bool ok = regrename_do_replace (head, best_reg);
41406 gcc_assert (ok);
41407 if (dump_file)
41408 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41409 reg_names[best_reg], reg_class_names[superclass]);
41413 regrename_finish ();
41415 df_analyze ();
41417 basic_block bb;
41418 regset_head live;
41420 INIT_REG_SET (&live);
41422 FOR_EACH_BB_FN (bb, cfun)
41424 rtx_insn *insn;
41426 COPY_REG_SET (&live, DF_LR_OUT (bb));
41427 df_simulate_initialize_backwards (bb, &live);
41429 FOR_BB_INSNS_REVERSE (bb, insn)
41431 if (!NONDEBUG_INSN_P (insn))
41432 continue;
41434 df_simulate_one_insn_backwards (bb, insn, &live);
41436 if (GET_CODE (PATTERN (insn)) == USE
41437 || GET_CODE (PATTERN (insn)) == CLOBBER)
41438 continue;
41440 extract_insn (insn);
41441 constrain_operands_cached (insn, reload_completed);
41442 int opno0, opno1;
41443 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41444 recog_data.n_operands, &opno0,
41445 &opno1);
41446 if (modrm < 0
41447 || !ix86_rop_should_change_byte_p (modrm)
41448 || opno0 == opno1)
41449 continue;
41451 rtx oldreg = recog_data.operand[opno1];
41452 preprocess_constraints (insn);
41453 const operand_alternative *alt = which_op_alt ();
41455 int i;
41456 for (i = 0; i < recog_data.n_operands; i++)
41457 if (i != opno1
41458 && alt[i].earlyclobber
41459 && reg_overlap_mentioned_p (recog_data.operand[i],
41460 oldreg))
41461 break;
41463 if (i < recog_data.n_operands)
41464 continue;
41466 if (dump_file)
41467 fprintf (dump_file,
41468 "attempting to fix modrm byte in insn %d:"
41469 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41470 reg_class_names[alt[opno1].cl]);
41472 HARD_REG_SET unavailable;
41473 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41474 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41475 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41476 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41477 IOR_HARD_REG_SET (unavailable, output_risky);
41478 IOR_COMPL_HARD_REG_SET (unavailable,
41479 reg_class_contents[alt[opno1].cl]);
41481 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41482 if (!TEST_HARD_REG_BIT (unavailable, i))
41483 break;
41484 if (i == FIRST_PSEUDO_REGISTER)
41486 if (dump_file)
41487 fprintf (dump_file, ", none available\n");
41488 continue;
41490 if (dump_file)
41491 fprintf (dump_file, " -> %d\n", i);
41492 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41493 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41494 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41499 /* Implement machine specific optimizations. We implement padding of returns
41500 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
41501 static void
41502 ix86_reorg (void)
41504 /* We are freeing block_for_insn in the toplev to keep compatibility
41505 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41506 compute_bb_for_insn ();
41508 if (flag_mitigate_rop)
41509 ix86_mitigate_rop ();
41511 if (TARGET_SEH && current_function_has_exception_handlers ())
41512 ix86_seh_fixup_eh_fallthru ();
41514 if (optimize && optimize_function_for_speed_p (cfun))
41516 if (TARGET_PAD_SHORT_FUNCTION)
41517 ix86_pad_short_function ();
41518 else if (TARGET_PAD_RETURNS)
41519 ix86_pad_returns ();
41520 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41521 if (TARGET_FOUR_JUMP_LIMIT)
41522 ix86_avoid_jump_mispredicts ();
41523 #endif
41527 /* Return true when a QImode register that must be represented via a REX prefix
41528 is used. */
41529 bool
41530 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41532 int i;
41533 extract_insn_cached (insn);
41534 for (i = 0; i < recog_data.n_operands; i++)
41535 if (GENERAL_REG_P (recog_data.operand[i])
41536 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41537 return true;
41538 return false;
41541 /* Return true when INSN mentions a register that must be encoded using a REX
41542 prefix. */
41543 bool
41544 x86_extended_reg_mentioned_p (rtx insn)
41546 subrtx_iterator::array_type array;
41547 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41549 const_rtx x = *iter;
41550 if (REG_P (x)
41551 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41552 return true;
41554 return false;
41557 /* If profitable, negate (without causing overflow) integer constant
41558 of mode MODE at location LOC. Return true in this case. */
41559 bool
41560 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41562 HOST_WIDE_INT val;
41564 if (!CONST_INT_P (*loc))
41565 return false;
41567 switch (mode)
41569 case DImode:
41570 /* DImode x86_64 constants must fit in 32 bits. */
41571 gcc_assert (x86_64_immediate_operand (*loc, mode));
41573 mode = SImode;
41574 break;
41576 case SImode:
41577 case HImode:
41578 case QImode:
41579 break;
41581 default:
41582 gcc_unreachable ();
41585 /* Avoid overflows. */
41586 if (mode_signbit_p (mode, *loc))
41587 return false;
41589 val = INTVAL (*loc);
41591 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41592 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
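/* Worked example: "addl $128, %eax" needs a 32-bit immediate because 128
   does not fit in a sign-extended imm8, while the equivalent
   "subl $-128, %eax" does, so val == 128 is negated. Conversely, -128
   already fits in imm8 but 128 would not, so val == -128 is left alone. */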
41593 if ((val < 0 && val != -128)
41594 || val == 128)
41596 *loc = GEN_INT (-val);
41597 return true;
41600 return false;
41603 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41604 optabs would emit if we didn't have TFmode patterns. */
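/* A rough C sketch of the expansion below, for a DImode input (illustrative
   only, assuming the usual round-to-nearest semantics):

     double u64_to_fp (unsigned long long x)
     {
       if ((long long) x >= 0)
         return (double) (long long) x;          // plain signed conversion
       // Halve with a sticky low bit, convert, then double.
       unsigned long long half = (x >> 1) | (x & 1);
       return 2.0 * (double) (long long) half;
     }

   The sticky bit keeps the final rounding correct when the discarded low
   bit would otherwise change the result. */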
41606 void
41607 x86_emit_floatuns (rtx operands[2])
41609 rtx_code_label *neglab, *donelab;
41610 rtx i0, i1, f0, in, out;
41611 machine_mode mode, inmode;
41613 inmode = GET_MODE (operands[1]);
41614 gcc_assert (inmode == SImode || inmode == DImode);
41616 out = operands[0];
41617 in = force_reg (inmode, operands[1]);
41618 mode = GET_MODE (out);
41619 neglab = gen_label_rtx ();
41620 donelab = gen_label_rtx ();
41621 f0 = gen_reg_rtx (mode);
41623 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41625 expand_float (out, in, 0);
41627 emit_jump_insn (gen_jump (donelab));
41628 emit_barrier ();
41630 emit_label (neglab);
41632 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41633 1, OPTAB_DIRECT);
41634 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41635 1, OPTAB_DIRECT);
41636 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41638 expand_float (f0, i0, 0);
41640 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41642 emit_label (donelab);
41645 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41646 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41647 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41648 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41650 /* Get a vector mode of the same size as the original but with elements
41651 twice as wide. This is only guaranteed to apply to integral vectors. */
41653 static inline machine_mode
41654 get_mode_wider_vector (machine_mode o)
41656 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41657 machine_mode n = GET_MODE_WIDER_MODE (o);
41658 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41659 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41660 return n;
41663 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41664 fill target with val via vec_duplicate. */
41666 static bool
41667 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41669 bool ok;
41670 rtx_insn *insn;
41671 rtx dup;
41673 /* First attempt to recognize VAL as-is. */
41674 dup = gen_rtx_VEC_DUPLICATE (mode, val);
41675 insn = emit_insn (gen_rtx_SET (target, dup));
41676 if (recog_memoized (insn) < 0)
41678 rtx_insn *seq;
41679 /* If that fails, force VAL into a register. */
41681 start_sequence ();
41682 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
41683 seq = get_insns ();
41684 end_sequence ();
41685 if (seq)
41686 emit_insn_before (seq, insn);
41688 ok = recog_memoized (insn) >= 0;
41689 gcc_assert (ok);
41691 return true;
41694 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41695 with all elements equal to VAR. Return true if successful. */
41697 static bool
41698 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41699 rtx target, rtx val)
41701 bool ok;
41703 switch (mode)
41705 case V2SImode:
41706 case V2SFmode:
41707 if (!mmx_ok)
41708 return false;
41709 /* FALLTHRU */
41711 case V4DFmode:
41712 case V4DImode:
41713 case V8SFmode:
41714 case V8SImode:
41715 case V2DFmode:
41716 case V2DImode:
41717 case V4SFmode:
41718 case V4SImode:
41719 case V16SImode:
41720 case V8DImode:
41721 case V16SFmode:
41722 case V8DFmode:
41723 return ix86_vector_duplicate_value (mode, target, val);
41725 case V4HImode:
41726 if (!mmx_ok)
41727 return false;
41728 if (TARGET_SSE || TARGET_3DNOW_A)
41730 rtx x;
41732 val = gen_lowpart (SImode, val);
41733 x = gen_rtx_TRUNCATE (HImode, val);
41734 x = gen_rtx_VEC_DUPLICATE (mode, x);
41735 emit_insn (gen_rtx_SET (target, x));
41736 return true;
41738 goto widen;
41740 case V8QImode:
41741 if (!mmx_ok)
41742 return false;
41743 goto widen;
41745 case V8HImode:
41746 if (TARGET_AVX2)
41747 return ix86_vector_duplicate_value (mode, target, val);
41749 if (TARGET_SSE2)
41751 struct expand_vec_perm_d dperm;
41752 rtx tmp1, tmp2;
41754 permute:
41755 memset (&dperm, 0, sizeof (dperm));
41756 dperm.target = target;
41757 dperm.vmode = mode;
41758 dperm.nelt = GET_MODE_NUNITS (mode);
41759 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41760 dperm.one_operand_p = true;
41762 /* Extend to SImode using a paradoxical SUBREG. */
41763 tmp1 = gen_reg_rtx (SImode);
41764 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41766 /* Insert the SImode value as low element of a V4SImode vector. */
41767 tmp2 = gen_reg_rtx (V4SImode);
41768 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41769 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41771 ok = (expand_vec_perm_1 (&dperm)
41772 || expand_vec_perm_broadcast_1 (&dperm));
41773 gcc_assert (ok);
41774 return ok;
41776 goto widen;
41778 case V16QImode:
41779 if (TARGET_AVX2)
41780 return ix86_vector_duplicate_value (mode, target, val);
41782 if (TARGET_SSE2)
41783 goto permute;
41784 goto widen;
41786 widen:
41787 /* Replicate the value once into the next wider mode and recurse. */
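/* Example: for V8QImode and byte value b, this builds the HImode value
   (b << 8) | b and recurses with V4HImode; if needed, another level widens
   that again to SImode and a V2SImode duplicate, which the cases above can
   handle directly. */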
41789 machine_mode smode, wsmode, wvmode;
41790 rtx x;
41792 smode = GET_MODE_INNER (mode);
41793 wvmode = get_mode_wider_vector (mode);
41794 wsmode = GET_MODE_INNER (wvmode);
41796 val = convert_modes (wsmode, smode, val, true);
41797 x = expand_simple_binop (wsmode, ASHIFT, val,
41798 GEN_INT (GET_MODE_BITSIZE (smode)),
41799 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41800 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41802 x = gen_reg_rtx (wvmode);
41803 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41804 gcc_assert (ok);
41805 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41806 return ok;
41809 case V16HImode:
41810 case V32QImode:
41811 if (TARGET_AVX2)
41812 return ix86_vector_duplicate_value (mode, target, val);
41813 else
41815 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41816 rtx x = gen_reg_rtx (hvmode);
41818 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41819 gcc_assert (ok);
41821 x = gen_rtx_VEC_CONCAT (mode, x, x);
41822 emit_insn (gen_rtx_SET (target, x));
41824 return true;
41826 case V64QImode:
41827 case V32HImode:
41828 if (TARGET_AVX512BW)
41829 return ix86_vector_duplicate_value (mode, target, val);
41830 else
41832 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41833 rtx x = gen_reg_rtx (hvmode);
41835 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41836 gcc_assert (ok);
41838 x = gen_rtx_VEC_CONCAT (mode, x, x);
41839 emit_insn (gen_rtx_SET (target, x));
41841 return true;
41843 default:
41844 return false;
41848 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41849 whose ONE_VAR element is VAR, and other elements are zero. Return true
41850 if successful. */
41852 static bool
41853 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41854 rtx target, rtx var, int one_var)
41856 machine_mode vsimode;
41857 rtx new_target;
41858 rtx x, tmp;
41859 bool use_vector_set = false;
41861 switch (mode)
41863 case V2DImode:
41864 /* For SSE4.1, we normally use vector set. But if the second
41865 element is zero and inter-unit moves are OK, we use movq
41866 instead. */
41867 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41868 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41869 && one_var == 0));
41870 break;
41871 case V16QImode:
41872 case V4SImode:
41873 case V4SFmode:
41874 use_vector_set = TARGET_SSE4_1;
41875 break;
41876 case V8HImode:
41877 use_vector_set = TARGET_SSE2;
41878 break;
41879 case V4HImode:
41880 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41881 break;
41882 case V32QImode:
41883 case V16HImode:
41884 case V8SImode:
41885 case V8SFmode:
41886 case V4DFmode:
41887 use_vector_set = TARGET_AVX;
41888 break;
41889 case V4DImode:
41890 /* Use ix86_expand_vector_set in 64bit mode only. */
41891 use_vector_set = TARGET_AVX && TARGET_64BIT;
41892 break;
41893 default:
41894 break;
41897 if (use_vector_set)
41899 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41900 var = force_reg (GET_MODE_INNER (mode), var);
41901 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41902 return true;
41905 switch (mode)
41907 case V2SFmode:
41908 case V2SImode:
41909 if (!mmx_ok)
41910 return false;
41911 /* FALLTHRU */
41913 case V2DFmode:
41914 case V2DImode:
41915 if (one_var != 0)
41916 return false;
41917 var = force_reg (GET_MODE_INNER (mode), var);
41918 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41919 emit_insn (gen_rtx_SET (target, x));
41920 return true;
41922 case V4SFmode:
41923 case V4SImode:
41924 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41925 new_target = gen_reg_rtx (mode);
41926 else
41927 new_target = target;
41928 var = force_reg (GET_MODE_INNER (mode), var);
41929 x = gen_rtx_VEC_DUPLICATE (mode, var);
41930 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41931 emit_insn (gen_rtx_SET (new_target, x));
41932 if (one_var != 0)
41934 /* We need to shuffle the value to the correct position, so
41935 create a new pseudo to store the intermediate result. */
41937 /* With SSE2, we can use the integer shuffle insns. */
41938 if (mode != V4SFmode && TARGET_SSE2)
41940 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41941 const1_rtx,
41942 GEN_INT (one_var == 1 ? 0 : 1),
41943 GEN_INT (one_var == 2 ? 0 : 1),
41944 GEN_INT (one_var == 3 ? 0 : 1)));
41945 if (target != new_target)
41946 emit_move_insn (target, new_target);
41947 return true;
41950 /* Otherwise convert the intermediate result to V4SFmode and
41951 use the SSE1 shuffle instructions. */
41952 if (mode != V4SFmode)
41954 tmp = gen_reg_rtx (V4SFmode);
41955 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41957 else
41958 tmp = new_target;
41960 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41961 const1_rtx,
41962 GEN_INT (one_var == 1 ? 0 : 1),
41963 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41964 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41966 if (mode != V4SFmode)
41967 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41968 else if (tmp != target)
41969 emit_move_insn (target, tmp);
41971 else if (target != new_target)
41972 emit_move_insn (target, new_target);
41973 return true;
41975 case V8HImode:
41976 case V16QImode:
41977 vsimode = V4SImode;
41978 goto widen;
41979 case V4HImode:
41980 case V8QImode:
41981 if (!mmx_ok)
41982 return false;
41983 vsimode = V2SImode;
41984 goto widen;
41985 widen:
41986 if (one_var != 0)
41987 return false;
41989 /* Zero extend the variable element to SImode and recurse. */
41990 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41992 x = gen_reg_rtx (vsimode);
41993 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41994 var, one_var))
41995 gcc_unreachable ();
41997 emit_move_insn (target, gen_lowpart (mode, x));
41998 return true;
42000 default:
42001 return false;
42005 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42006 consisting of the values in VALS. It is known that all elements
42007 except ONE_VAR are constants. Return true if successful. */
42009 static bool
42010 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42011 rtx target, rtx vals, int one_var)
42013 rtx var = XVECEXP (vals, 0, one_var);
42014 machine_mode wmode;
42015 rtx const_vec, x;
42017 const_vec = copy_rtx (vals);
42018 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42019 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42021 switch (mode)
42023 case V2DFmode:
42024 case V2DImode:
42025 case V2SFmode:
42026 case V2SImode:
42027 /* For the two element vectors, it's just as easy to use
42028 the general case. */
42029 return false;
42031 case V4DImode:
42032 /* Use ix86_expand_vector_set in 64bit mode only. */
42033 if (!TARGET_64BIT)
42034 return false;
42035 /* FALLTHRU */
42036 case V4DFmode:
42037 case V8SFmode:
42038 case V8SImode:
42039 case V16HImode:
42040 case V32QImode:
42041 case V4SFmode:
42042 case V4SImode:
42043 case V8HImode:
42044 case V4HImode:
42045 break;
42047 case V16QImode:
42048 if (TARGET_SSE4_1)
42049 break;
42050 wmode = V8HImode;
42051 goto widen;
42052 case V8QImode:
42053 wmode = V4HImode;
42054 goto widen;
42055 widen:
42056 /* There's no way to set one QImode entry easily. Combine
42057 the variable value with its adjacent constant value, and
42058 promote to an HImode set. */
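/* Example: to set byte 3 of a V8QImode vector, the constant byte at index
   2 is kept in the low half and the variable byte is shifted into the high
   half, giving the HImode value (var << 8) | byte2; that value is then
   stored as element 1 of the corresponding V4HImode vector. */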
42059 x = XVECEXP (vals, 0, one_var ^ 1);
42060 if (one_var & 1)
42062 var = convert_modes (HImode, QImode, var, true);
42063 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42064 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42065 x = GEN_INT (INTVAL (x) & 0xff);
42067 else
42069 var = convert_modes (HImode, QImode, var, true);
42070 x = gen_int_mode (INTVAL (x) << 8, HImode);
42072 if (x != const0_rtx)
42073 var = expand_simple_binop (HImode, IOR, var, x, var,
42074 1, OPTAB_LIB_WIDEN);
42076 x = gen_reg_rtx (wmode);
42077 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42078 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42080 emit_move_insn (target, gen_lowpart (mode, x));
42081 return true;
42083 default:
42084 return false;
42087 emit_move_insn (target, const_vec);
42088 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42089 return true;
42092 /* A subroutine of ix86_expand_vector_init_general. Use vector
42093 concatenate to handle the most general case: all values variable,
42094 and none identical. */
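/* Example: an 8-element V8SFmode build pairs the scalars into four
   V2SFmode registers, concatenates those into two V4SFmode registers, and
   a final VEC_CONCAT yields the V8SFmode result. */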
42096 static void
42097 ix86_expand_vector_init_concat (machine_mode mode,
42098 rtx target, rtx *ops, int n)
42100 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42101 rtx first[16], second[8], third[4];
42102 rtvec v;
42103 int i, j;
42105 switch (n)
42107 case 2:
42108 switch (mode)
42110 case V16SImode:
42111 cmode = V8SImode;
42112 break;
42113 case V16SFmode:
42114 cmode = V8SFmode;
42115 break;
42116 case V8DImode:
42117 cmode = V4DImode;
42118 break;
42119 case V8DFmode:
42120 cmode = V4DFmode;
42121 break;
42122 case V8SImode:
42123 cmode = V4SImode;
42124 break;
42125 case V8SFmode:
42126 cmode = V4SFmode;
42127 break;
42128 case V4DImode:
42129 cmode = V2DImode;
42130 break;
42131 case V4DFmode:
42132 cmode = V2DFmode;
42133 break;
42134 case V4SImode:
42135 cmode = V2SImode;
42136 break;
42137 case V4SFmode:
42138 cmode = V2SFmode;
42139 break;
42140 case V2DImode:
42141 cmode = DImode;
42142 break;
42143 case V2SImode:
42144 cmode = SImode;
42145 break;
42146 case V2DFmode:
42147 cmode = DFmode;
42148 break;
42149 case V2SFmode:
42150 cmode = SFmode;
42151 break;
42152 default:
42153 gcc_unreachable ();
42156 if (!register_operand (ops[1], cmode))
42157 ops[1] = force_reg (cmode, ops[1]);
42158 if (!register_operand (ops[0], cmode))
42159 ops[0] = force_reg (cmode, ops[0]);
42160 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42161 ops[1])));
42162 break;
42164 case 4:
42165 switch (mode)
42167 case V4DImode:
42168 cmode = V2DImode;
42169 break;
42170 case V4DFmode:
42171 cmode = V2DFmode;
42172 break;
42173 case V4SImode:
42174 cmode = V2SImode;
42175 break;
42176 case V4SFmode:
42177 cmode = V2SFmode;
42178 break;
42179 default:
42180 gcc_unreachable ();
42182 goto half;
42184 case 8:
42185 switch (mode)
42187 case V8DImode:
42188 cmode = V2DImode;
42189 hmode = V4DImode;
42190 break;
42191 case V8DFmode:
42192 cmode = V2DFmode;
42193 hmode = V4DFmode;
42194 break;
42195 case V8SImode:
42196 cmode = V2SImode;
42197 hmode = V4SImode;
42198 break;
42199 case V8SFmode:
42200 cmode = V2SFmode;
42201 hmode = V4SFmode;
42202 break;
42203 default:
42204 gcc_unreachable ();
42206 goto half;
42208 case 16:
42209 switch (mode)
42211 case V16SImode:
42212 cmode = V2SImode;
42213 hmode = V4SImode;
42214 gmode = V8SImode;
42215 break;
42216 case V16SFmode:
42217 cmode = V2SFmode;
42218 hmode = V4SFmode;
42219 gmode = V8SFmode;
42220 break;
42221 default:
42222 gcc_unreachable ();
42224 goto half;
42226 half:
42227 /* FIXME: We process inputs backward to help RA. PR 36222. */
42228 i = n - 1;
42229 j = (n >> 1) - 1;
42230 for (; i > 0; i -= 2, j--)
42232 first[j] = gen_reg_rtx (cmode);
42233 v = gen_rtvec (2, ops[i - 1], ops[i]);
42234 ix86_expand_vector_init (false, first[j],
42235 gen_rtx_PARALLEL (cmode, v));
42238 n >>= 1;
42239 if (n > 4)
42241 gcc_assert (hmode != VOIDmode);
42242 gcc_assert (gmode != VOIDmode);
42243 for (i = j = 0; i < n; i += 2, j++)
42245 second[j] = gen_reg_rtx (hmode);
42246 ix86_expand_vector_init_concat (hmode, second [j],
42247 &first [i], 2);
42249 n >>= 1;
42250 for (i = j = 0; i < n; i += 2, j++)
42252 third[j] = gen_reg_rtx (gmode);
42253 ix86_expand_vector_init_concat (gmode, third[j],
42254 &second[i], 2);
42256 n >>= 1;
42257 ix86_expand_vector_init_concat (mode, target, third, n);
42259 else if (n > 2)
42261 gcc_assert (hmode != VOIDmode);
42262 for (i = j = 0; i < n; i += 2, j++)
42264 second[j] = gen_reg_rtx (hmode);
42265 ix86_expand_vector_init_concat (hmode, second [j],
42266 &first [i], 2);
42268 n >>= 1;
42269 ix86_expand_vector_init_concat (mode, target, second, n);
42271 else
42272 ix86_expand_vector_init_concat (mode, target, first, n);
42273 break;
42275 default:
42276 gcc_unreachable ();
42280 /* A subroutine of ix86_expand_vector_init_general. Use vector
42281 interleave to handle the most general case: all values variable,
42282 and none identical. */
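/* Example: for a V16QImode build, each intermediate register first holds
   two consecutive byte values in its low positions; interleaving the low
   halves at successively wider element sizes (V8HImode, V4SImode,
   V2DImode) then stitches the pairs together into the full vector. */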
42284 static void
42285 ix86_expand_vector_init_interleave (machine_mode mode,
42286 rtx target, rtx *ops, int n)
42288 machine_mode first_imode, second_imode, third_imode, inner_mode;
42289 int i, j;
42290 rtx op0, op1;
42291 rtx (*gen_load_even) (rtx, rtx, rtx);
42292 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42293 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42295 switch (mode)
42297 case V8HImode:
42298 gen_load_even = gen_vec_setv8hi;
42299 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42300 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42301 inner_mode = HImode;
42302 first_imode = V4SImode;
42303 second_imode = V2DImode;
42304 third_imode = VOIDmode;
42305 break;
42306 case V16QImode:
42307 gen_load_even = gen_vec_setv16qi;
42308 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42309 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42310 inner_mode = QImode;
42311 first_imode = V8HImode;
42312 second_imode = V4SImode;
42313 third_imode = V2DImode;
42314 break;
42315 default:
42316 gcc_unreachable ();
42319 for (i = 0; i < n; i++)
42321 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42322 op0 = gen_reg_rtx (SImode);
42323 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42325 /* Insert the SImode value as low element of V4SImode vector. */
42326 op1 = gen_reg_rtx (V4SImode);
42327 op0 = gen_rtx_VEC_MERGE (V4SImode,
42328 gen_rtx_VEC_DUPLICATE (V4SImode,
42329 op0),
42330 CONST0_RTX (V4SImode),
42331 const1_rtx);
42332 emit_insn (gen_rtx_SET (op1, op0));
42334 /* Cast the V4SImode vector back to a vector in original mode. */
42335 op0 = gen_reg_rtx (mode);
42336 emit_move_insn (op0, gen_lowpart (mode, op1));
42338 /* Load even elements into the second position. */
42339 emit_insn (gen_load_even (op0,
42340 force_reg (inner_mode,
42341 ops [i + i + 1]),
42342 const1_rtx));
42344 /* Cast vector to FIRST_IMODE vector. */
42345 ops[i] = gen_reg_rtx (first_imode);
42346 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42349 /* Interleave low FIRST_IMODE vectors. */
42350 for (i = j = 0; i < n; i += 2, j++)
42352 op0 = gen_reg_rtx (first_imode);
42353 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42355 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42356 ops[j] = gen_reg_rtx (second_imode);
42357 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42360 /* Interleave low SECOND_IMODE vectors. */
42361 switch (second_imode)
42363 case V4SImode:
42364 for (i = j = 0; i < n / 2; i += 2, j++)
42366 op0 = gen_reg_rtx (second_imode);
42367 emit_insn (gen_interleave_second_low (op0, ops[i],
42368 ops[i + 1]));
42370 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42371 vector. */
42372 ops[j] = gen_reg_rtx (third_imode);
42373 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42375 second_imode = V2DImode;
42376 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42377 /* FALLTHRU */
42379 case V2DImode:
42380 op0 = gen_reg_rtx (second_imode);
42381 emit_insn (gen_interleave_second_low (op0, ops[0],
42382 ops[1]));
42384 /* Cast the SECOND_IMODE vector back to a vector in original
42385 mode. */
42386 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42387 break;
42389 default:
42390 gcc_unreachable ();
42394 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42395 all values variable, and none identical. */
42397 static void
42398 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42399 rtx target, rtx vals)
42401 rtx ops[64], op0, op1, op2, op3, op4, op5;
42402 machine_mode half_mode = VOIDmode;
42403 machine_mode quarter_mode = VOIDmode;
42404 int n, i;
42406 switch (mode)
42408 case V2SFmode:
42409 case V2SImode:
42410 if (!mmx_ok && !TARGET_SSE)
42411 break;
42412 /* FALLTHRU */
42414 case V16SImode:
42415 case V16SFmode:
42416 case V8DFmode:
42417 case V8DImode:
42418 case V8SFmode:
42419 case V8SImode:
42420 case V4DFmode:
42421 case V4DImode:
42422 case V4SFmode:
42423 case V4SImode:
42424 case V2DFmode:
42425 case V2DImode:
42426 n = GET_MODE_NUNITS (mode);
42427 for (i = 0; i < n; i++)
42428 ops[i] = XVECEXP (vals, 0, i);
42429 ix86_expand_vector_init_concat (mode, target, ops, n);
42430 return;
42432 case V32QImode:
42433 half_mode = V16QImode;
42434 goto half;
42436 case V16HImode:
42437 half_mode = V8HImode;
42438 goto half;
42440 half:
42441 n = GET_MODE_NUNITS (mode);
42442 for (i = 0; i < n; i++)
42443 ops[i] = XVECEXP (vals, 0, i);
42444 op0 = gen_reg_rtx (half_mode);
42445 op1 = gen_reg_rtx (half_mode);
42446 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42447 n >> 2);
42448 ix86_expand_vector_init_interleave (half_mode, op1,
42449 &ops [n >> 1], n >> 2);
42450 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42451 return;
42453 case V64QImode:
42454 quarter_mode = V16QImode;
42455 half_mode = V32QImode;
42456 goto quarter;
42458 case V32HImode:
42459 quarter_mode = V8HImode;
42460 half_mode = V16HImode;
42461 goto quarter;
42463 quarter:
42464 n = GET_MODE_NUNITS (mode);
42465 for (i = 0; i < n; i++)
42466 ops[i] = XVECEXP (vals, 0, i);
42467 op0 = gen_reg_rtx (quarter_mode);
42468 op1 = gen_reg_rtx (quarter_mode);
42469 op2 = gen_reg_rtx (quarter_mode);
42470 op3 = gen_reg_rtx (quarter_mode);
42471 op4 = gen_reg_rtx (half_mode);
42472 op5 = gen_reg_rtx (half_mode);
42473 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42474 n >> 3);
42475 ix86_expand_vector_init_interleave (quarter_mode, op1,
42476 &ops [n >> 2], n >> 3);
42477 ix86_expand_vector_init_interleave (quarter_mode, op2,
42478 &ops [n >> 1], n >> 3);
42479 ix86_expand_vector_init_interleave (quarter_mode, op3,
42480 &ops [(n >> 1) | (n >> 2)], n >> 3);
42481 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42482 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42483 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42484 return;
42486 case V16QImode:
42487 if (!TARGET_SSE4_1)
42488 break;
42489 /* FALLTHRU */
42491 case V8HImode:
42492 if (!TARGET_SSE2)
42493 break;
42495 /* Don't use ix86_expand_vector_init_interleave if we can't
42496 move from GPR to SSE register directly. */
42497 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42498 break;
42500 n = GET_MODE_NUNITS (mode);
42501 for (i = 0; i < n; i++)
42502 ops[i] = XVECEXP (vals, 0, i);
42503 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42504 return;
42506 case V4HImode:
42507 case V8QImode:
42508 break;
42510 default:
42511 gcc_unreachable ();
42515 int i, j, n_elts, n_words, n_elt_per_word;
42516 machine_mode inner_mode;
42517 rtx words[4], shift;
42519 inner_mode = GET_MODE_INNER (mode);
42520 n_elts = GET_MODE_NUNITS (mode);
42521 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42522 n_elt_per_word = n_elts / n_words;
42523 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42525 for (i = 0; i < n_words; ++i)
42527 rtx word = NULL_RTX;
42529 for (j = 0; j < n_elt_per_word; ++j)
42531 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42532 elt = convert_modes (word_mode, inner_mode, elt, true);
42534 if (j == 0)
42535 word = elt;
42536 else
42538 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42539 word, 1, OPTAB_LIB_WIDEN);
42540 word = expand_simple_binop (word_mode, IOR, word, elt,
42541 word, 1, OPTAB_LIB_WIDEN);
42545 words[i] = word;
42548 if (n_words == 1)
42549 emit_move_insn (target, gen_lowpart (mode, words[0]));
42550 else if (n_words == 2)
42552 rtx tmp = gen_reg_rtx (mode);
42553 emit_clobber (tmp);
42554 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42555 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42556 emit_move_insn (target, tmp);
42558 else if (n_words == 4)
42560 rtx tmp = gen_reg_rtx (V4SImode);
42561 gcc_assert (word_mode == SImode);
42562 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42563 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42564 emit_move_insn (target, gen_lowpart (mode, tmp));
42566 else
42567 gcc_unreachable ();
42571 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42572 instructions unless MMX_OK is true. */
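/* Overview of the strategy used below: an all-constant vector is loaded
   from the constant pool, an all-identical vector is broadcast, a vector
   with a single variable element is built from the pool and patched with
   ix86_expand_vector_set, and everything else goes through the general
   expander. */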
42574 void
42575 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42577 machine_mode mode = GET_MODE (target);
42578 machine_mode inner_mode = GET_MODE_INNER (mode);
42579 int n_elts = GET_MODE_NUNITS (mode);
42580 int n_var = 0, one_var = -1;
42581 bool all_same = true, all_const_zero = true;
42582 int i;
42583 rtx x;
42585 for (i = 0; i < n_elts; ++i)
42587 x = XVECEXP (vals, 0, i);
42588 if (!(CONST_SCALAR_INT_P (x)
42589 || CONST_DOUBLE_P (x)
42590 || CONST_FIXED_P (x)))
42591 n_var++, one_var = i;
42592 else if (x != CONST0_RTX (inner_mode))
42593 all_const_zero = false;
42594 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42595 all_same = false;
42598 /* Constants are best loaded from the constant pool. */
42599 if (n_var == 0)
42601 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42602 return;
42605 /* If all values are identical, broadcast the value. */
42606 if (all_same
42607 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42608 XVECEXP (vals, 0, 0)))
42609 return;
42611 /* Values where only one field is non-constant are best loaded from
42612 the pool and overwritten via move later. */
42613 if (n_var == 1)
42615 if (all_const_zero
42616 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42617 XVECEXP (vals, 0, one_var),
42618 one_var))
42619 return;
42621 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42622 return;
42625 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
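/* Store scalar VAL into element ELT of vector TARGET. Suppress the use of
   MMX instructions unless MMX_OK is true. */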
42628 void
42629 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42631 machine_mode mode = GET_MODE (target);
42632 machine_mode inner_mode = GET_MODE_INNER (mode);
42633 machine_mode half_mode;
42634 bool use_vec_merge = false;
42635 rtx tmp;
42636 static rtx (*gen_extract[6][2]) (rtx, rtx)
42638 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42639 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42640 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42641 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42642 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42643 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42645 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42647 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42648 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42649 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42650 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42651 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42652 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42654 int i, j, n;
42655 machine_mode mmode = VOIDmode;
42656 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42658 switch (mode)
42660 case V2SFmode:
42661 case V2SImode:
42662 if (mmx_ok)
42664 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42665 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42666 if (elt == 0)
42667 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42668 else
42669 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42670 emit_insn (gen_rtx_SET (target, tmp));
42671 return;
42673 break;
42675 case V2DImode:
42676 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42677 if (use_vec_merge)
42678 break;
42680 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42681 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42682 if (elt == 0)
42683 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42684 else
42685 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42686 emit_insn (gen_rtx_SET (target, tmp));
42687 return;
42689 case V2DFmode:
42691 rtx op0, op1;
42693 /* For the two element vectors, we implement a VEC_CONCAT with
42694 the extraction of the other element. */
42696 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42697 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42699 if (elt == 0)
42700 op0 = val, op1 = tmp;
42701 else
42702 op0 = tmp, op1 = val;
42704 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42705 emit_insn (gen_rtx_SET (target, tmp));
42707 return;
42709 case V4SFmode:
42710 use_vec_merge = TARGET_SSE4_1;
42711 if (use_vec_merge)
42712 break;
42714 switch (elt)
42716 case 0:
42717 use_vec_merge = true;
42718 break;
42720 case 1:
42721 /* tmp = target = A B C D */
42722 tmp = copy_to_reg (target);
42723 /* target = A A B B */
42724 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42725 /* target = X A B B */
42726 ix86_expand_vector_set (false, target, val, 0);
42727 /* target = A X C D */
42728 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42729 const1_rtx, const0_rtx,
42730 GEN_INT (2+4), GEN_INT (3+4)));
42731 return;
42733 case 2:
42734 /* tmp = target = A B C D */
42735 tmp = copy_to_reg (target);
42736 /* tmp = X B C D */
42737 ix86_expand_vector_set (false, tmp, val, 0);
42738 /* target = A B X D */
42739 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42740 const0_rtx, const1_rtx,
42741 GEN_INT (0+4), GEN_INT (3+4)));
42742 return;
42744 case 3:
42745 /* tmp = target = A B C D */
42746 tmp = copy_to_reg (target);
42747 /* tmp = X B C D */
42748 ix86_expand_vector_set (false, tmp, val, 0);
42749 /* target = A B C X */
42750 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42751 const0_rtx, const1_rtx,
42752 GEN_INT (2+4), GEN_INT (0+4)));
42753 return;
42755 default:
42756 gcc_unreachable ();
42758 break;
42760 case V4SImode:
42761 use_vec_merge = TARGET_SSE4_1;
42762 if (use_vec_merge)
42763 break;
42765 /* Element 0 handled by vec_merge below. */
42766 if (elt == 0)
42768 use_vec_merge = true;
42769 break;
42772 if (TARGET_SSE2)
42774 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42775 store into element 0, then shuffle them back. */
42777 rtx order[4];
42779 order[0] = GEN_INT (elt);
42780 order[1] = const1_rtx;
42781 order[2] = const2_rtx;
42782 order[3] = GEN_INT (3);
42783 order[elt] = const0_rtx;
42785 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42786 order[1], order[2], order[3]));
42788 ix86_expand_vector_set (false, target, val, 0);
42790 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42791 order[1], order[2], order[3]));
42793 else
42795 /* For SSE1, we have to reuse the V4SF code. */
42796 rtx t = gen_reg_rtx (V4SFmode);
42797 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42798 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42799 emit_move_insn (target, gen_lowpart (mode, t));
42801 return;
42803 case V8HImode:
42804 use_vec_merge = TARGET_SSE2;
42805 break;
42806 case V4HImode:
42807 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42808 break;
42810 case V16QImode:
42811 use_vec_merge = TARGET_SSE4_1;
42812 break;
42814 case V8QImode:
42815 break;
42817 case V32QImode:
42818 half_mode = V16QImode;
42819 j = 0;
42820 n = 16;
42821 goto half;
42823 case V16HImode:
42824 half_mode = V8HImode;
42825 j = 1;
42826 n = 8;
42827 goto half;
42829 case V8SImode:
42830 half_mode = V4SImode;
42831 j = 2;
42832 n = 4;
42833 goto half;
42835 case V4DImode:
42836 half_mode = V2DImode;
42837 j = 3;
42838 n = 2;
42839 goto half;
42841 case V8SFmode:
42842 half_mode = V4SFmode;
42843 j = 4;
42844 n = 4;
42845 goto half;
42847 case V4DFmode:
42848 half_mode = V2DFmode;
42849 j = 5;
42850 n = 2;
42851 goto half;
42853 half:
42854 /* Compute offset. */
42855 i = elt / n;
42856 elt %= n;
42858 gcc_assert (i <= 1);
42860 /* Extract the half. */
42861 tmp = gen_reg_rtx (half_mode);
42862 emit_insn (gen_extract[j][i] (tmp, target));
42864 /* Put val in tmp at elt. */
42865 ix86_expand_vector_set (false, tmp, val, elt);
42867 /* Put it back. */
42868 emit_insn (gen_insert[j][i] (target, target, tmp));
42869 return;
42871 case V8DFmode:
42872 if (TARGET_AVX512F)
42874 mmode = QImode;
42875 gen_blendm = gen_avx512f_blendmv8df;
42877 break;
42879 case V8DImode:
42880 if (TARGET_AVX512F)
42882 mmode = QImode;
42883 gen_blendm = gen_avx512f_blendmv8di;
42885 break;
42887 case V16SFmode:
42888 if (TARGET_AVX512F)
42890 mmode = HImode;
42891 gen_blendm = gen_avx512f_blendmv16sf;
42893 break;
42895 case V16SImode:
42896 if (TARGET_AVX512F)
42898 mmode = HImode;
42899 gen_blendm = gen_avx512f_blendmv16si;
42901 break;
42903 case V32HImode:
42904 if (TARGET_AVX512F && TARGET_AVX512BW)
42906 mmode = SImode;
42907 gen_blendm = gen_avx512bw_blendmv32hi;
42909 break;
42911 case V64QImode:
42912 if (TARGET_AVX512F && TARGET_AVX512BW)
42914 mmode = DImode;
42915 gen_blendm = gen_avx512bw_blendmv64qi;
42917 break;
42919 default:
42920 break;
42923 if (mmode != VOIDmode)
42925 tmp = gen_reg_rtx (mode);
42926 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42927 /* The avx512*_blendm<mode> expanders have different operand order
42928 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42929 elements where the mask is set and the second input operand otherwise,
42930 while in {sse,avx}*_*blend* the first input operand is used for elements
42931 where the mask is clear and second input operand otherwise. */
42932 emit_insn (gen_blendm (target, target, tmp,
42933 force_reg (mmode,
42934 gen_int_mode (1 << elt, mmode))));
42936 else if (use_vec_merge)
42938 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42939 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42940 emit_insn (gen_rtx_SET (target, tmp));
42942 else
42944 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42946 emit_move_insn (mem, target);
42948 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42949 emit_move_insn (tmp, val);
42951 emit_move_insn (target, mem);
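/* Extract element ELT of vector VEC into scalar TARGET. Suppress the use
   of MMX instructions unless MMX_OK is true. */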
42955 void
42956 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42958 machine_mode mode = GET_MODE (vec);
42959 machine_mode inner_mode = GET_MODE_INNER (mode);
42960 bool use_vec_extr = false;
42961 rtx tmp;
42963 switch (mode)
42965 case V2SImode:
42966 case V2SFmode:
42967 if (!mmx_ok)
42968 break;
42969 /* FALLTHRU */
42971 case V2DFmode:
42972 case V2DImode:
42973 use_vec_extr = true;
42974 break;
42976 case V4SFmode:
42977 use_vec_extr = TARGET_SSE4_1;
42978 if (use_vec_extr)
42979 break;
42981 switch (elt)
42983 case 0:
42984 tmp = vec;
42985 break;
42987 case 1:
42988 case 3:
42989 tmp = gen_reg_rtx (mode);
42990 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42991 GEN_INT (elt), GEN_INT (elt),
42992 GEN_INT (elt+4), GEN_INT (elt+4)));
42993 break;
42995 case 2:
42996 tmp = gen_reg_rtx (mode);
42997 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42998 break;
43000 default:
43001 gcc_unreachable ();
43003 vec = tmp;
43004 use_vec_extr = true;
43005 elt = 0;
43006 break;
43008 case V4SImode:
43009 use_vec_extr = TARGET_SSE4_1;
43010 if (use_vec_extr)
43011 break;
43013 if (TARGET_SSE2)
43015 switch (elt)
43017 case 0:
43018 tmp = vec;
43019 break;
43021 case 1:
43022 case 3:
43023 tmp = gen_reg_rtx (mode);
43024 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43025 GEN_INT (elt), GEN_INT (elt),
43026 GEN_INT (elt), GEN_INT (elt)));
43027 break;
43029 case 2:
43030 tmp = gen_reg_rtx (mode);
43031 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43032 break;
43034 default:
43035 gcc_unreachable ();
43037 vec = tmp;
43038 use_vec_extr = true;
43039 elt = 0;
43041 else
43043 /* For SSE1, we have to reuse the V4SF code. */
43044 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43045 gen_lowpart (V4SFmode, vec), elt);
43046 return;
43048 break;
43050 case V8HImode:
43051 use_vec_extr = TARGET_SSE2;
43052 break;
43053 case V4HImode:
43054 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43055 break;
43057 case V16QImode:
43058 use_vec_extr = TARGET_SSE4_1;
43059 break;
43061 case V8SFmode:
43062 if (TARGET_AVX)
43064 tmp = gen_reg_rtx (V4SFmode);
43065 if (elt < 4)
43066 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43067 else
43068 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43069 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43070 return;
43072 break;
43074 case V4DFmode:
43075 if (TARGET_AVX)
43077 tmp = gen_reg_rtx (V2DFmode);
43078 if (elt < 2)
43079 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43080 else
43081 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43082 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43083 return;
43085 break;
43087 case V32QImode:
43088 if (TARGET_AVX)
43090 tmp = gen_reg_rtx (V16QImode);
43091 if (elt < 16)
43092 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43093 else
43094 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43095 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43096 return;
43098 break;
43100 case V16HImode:
43101 if (TARGET_AVX)
43103 tmp = gen_reg_rtx (V8HImode);
43104 if (elt < 8)
43105 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43106 else
43107 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43108 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43109 return;
43111 break;
43113 case V8SImode:
43114 if (TARGET_AVX)
43116 tmp = gen_reg_rtx (V4SImode);
43117 if (elt < 4)
43118 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43119 else
43120 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43121 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43122 return;
43124 break;
43126 case V4DImode:
43127 if (TARGET_AVX)
43129 tmp = gen_reg_rtx (V2DImode);
43130 if (elt < 2)
43131 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43132 else
43133 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43134 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43135 return;
43137 break;
43139 case V32HImode:
43140 if (TARGET_AVX512BW)
43142 tmp = gen_reg_rtx (V16HImode);
43143 if (elt < 16)
43144 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43145 else
43146 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43147 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43148 return;
43150 break;
43152 case V64QImode:
43153 if (TARGET_AVX512BW)
43155 tmp = gen_reg_rtx (V32QImode);
43156 if (elt < 32)
43157 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43158 else
43159 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43160 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43161 return;
43163 break;
43165 case V16SFmode:
43166 tmp = gen_reg_rtx (V8SFmode);
43167 if (elt < 8)
43168 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43169 else
43170 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43171 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43172 return;
43174 case V8DFmode:
43175 tmp = gen_reg_rtx (V4DFmode);
43176 if (elt < 4)
43177 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43178 else
43179 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43180 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43181 return;
43183 case V16SImode:
43184 tmp = gen_reg_rtx (V8SImode);
43185 if (elt < 8)
43186 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43187 else
43188 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43189 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43190 return;
43192 case V8DImode:
43193 tmp = gen_reg_rtx (V4DImode);
43194 if (elt < 4)
43195 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43196 else
43197 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43198 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43199 return;
43201 case V8QImode:
43202 /* ??? Could extract the appropriate HImode element and shift. */
43203 default:
43204 break;
43207 if (use_vec_extr)
43209 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43210 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43212 /* Let the rtl optimizers know about the zero extension performed. */
43213 if (inner_mode == QImode || inner_mode == HImode)
43215 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43216 target = gen_lowpart (SImode, target);
43219 emit_insn (gen_rtx_SET (target, tmp));
43221 else
43223 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43225 emit_move_insn (mem, vec);
43227 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43228 emit_move_insn (target, tmp);
43232 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43233 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43234 The upper bits of DEST are undefined, though they shouldn't cause
43235 exceptions (some bits from src or all zeros are ok). */
43237 static void
43238 emit_reduc_half (rtx dest, rtx src, int i)
43240 rtx tem, d = dest;
43241 switch (GET_MODE (src))
43243 case V4SFmode:
43244 if (i == 128)
43245 tem = gen_sse_movhlps (dest, src, src);
43246 else
43247 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43248 GEN_INT (1 + 4), GEN_INT (1 + 4));
43249 break;
43250 case V2DFmode:
43251 tem = gen_vec_interleave_highv2df (dest, src, src);
43252 break;
43253 case V16QImode:
43254 case V8HImode:
43255 case V4SImode:
43256 case V2DImode:
43257 d = gen_reg_rtx (V1TImode);
43258 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43259 GEN_INT (i / 2));
43260 break;
43261 case V8SFmode:
43262 if (i == 256)
43263 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43264 else
43265 tem = gen_avx_shufps256 (dest, src, src,
43266 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43267 break;
43268 case V4DFmode:
43269 if (i == 256)
43270 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43271 else
43272 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43273 break;
43274 case V32QImode:
43275 case V16HImode:
43276 case V8SImode:
43277 case V4DImode:
43278 if (i == 256)
43280 if (GET_MODE (dest) != V4DImode)
43281 d = gen_reg_rtx (V4DImode);
43282 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43283 gen_lowpart (V4DImode, src),
43284 const1_rtx);
43286 else
43288 d = gen_reg_rtx (V2TImode);
43289 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43290 GEN_INT (i / 2));
43292 break;
43293 case V64QImode:
43294 case V32HImode:
43295 case V16SImode:
43296 case V16SFmode:
43297 case V8DImode:
43298 case V8DFmode:
43299 if (i > 128)
43300 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43301 gen_lowpart (V16SImode, src),
43302 gen_lowpart (V16SImode, src),
43303 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43304 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43305 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43306 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43307 GEN_INT (0xC), GEN_INT (0xD),
43308 GEN_INT (0xE), GEN_INT (0xF),
43309 GEN_INT (0x10), GEN_INT (0x11),
43310 GEN_INT (0x12), GEN_INT (0x13),
43311 GEN_INT (0x14), GEN_INT (0x15),
43312 GEN_INT (0x16), GEN_INT (0x17));
43313 else
43314 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43315 gen_lowpart (V16SImode, src),
43316 GEN_INT (i == 128 ? 0x2 : 0x1),
43317 GEN_INT (0x3),
43318 GEN_INT (0x3),
43319 GEN_INT (0x3),
43320 GEN_INT (i == 128 ? 0x6 : 0x5),
43321 GEN_INT (0x7),
43322 GEN_INT (0x7),
43323 GEN_INT (0x7),
43324 GEN_INT (i == 128 ? 0xA : 0x9),
43325 GEN_INT (0xB),
43326 GEN_INT (0xB),
43327 GEN_INT (0xB),
43328 GEN_INT (i == 128 ? 0xE : 0xD),
43329 GEN_INT (0xF),
43330 GEN_INT (0xF),
43331 GEN_INT (0xF));
43332 break;
43333 default:
43334 gcc_unreachable ();
43336 emit_insn (tem);
43337 if (d != dest)
43338 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43341 /* Expand a vector reduction. FN is the binary pattern to reduce;
43342 DEST is the destination; IN is the input vector. */
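/* Example: reducing a V4SImode vector {a, b, c, d} with an add pattern
   first adds a copy shifted down by half the vector, giving
   {a+c, b+d, x, x}, then adds a copy shifted down by one more element, so
   that element 0 of DEST ends up holding a+b+c+d. */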
43344 void
43345 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43347 rtx half, dst, vec = in;
43348 machine_mode mode = GET_MODE (in);
43349 int i;
43351 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43352 if (TARGET_SSE4_1
43353 && mode == V8HImode
43354 && fn == gen_uminv8hi3)
43356 emit_insn (gen_sse4_1_phminposuw (dest, in));
43357 return;
43360 for (i = GET_MODE_BITSIZE (mode);
43361 i > GET_MODE_UNIT_BITSIZE (mode);
43362 i >>= 1)
43364 half = gen_reg_rtx (mode);
43365 emit_reduc_half (half, vec, i);
43366 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43367 dst = dest;
43368 else
43369 dst = gen_reg_rtx (mode);
43370 emit_insn (fn (dst, half, vec));
43371 vec = dst;
43375 /* Target hook for scalar_mode_supported_p. */
43376 static bool
43377 ix86_scalar_mode_supported_p (machine_mode mode)
43379 if (DECIMAL_FLOAT_MODE_P (mode))
43380 return default_decimal_float_supported_p ();
43381 else if (mode == TFmode)
43382 return true;
43383 else
43384 return default_scalar_mode_supported_p (mode);
43387 /* Implements target hook vector_mode_supported_p. */
43388 static bool
43389 ix86_vector_mode_supported_p (machine_mode mode)
43391 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43392 return true;
43393 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43394 return true;
43395 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43396 return true;
43397 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43398 return true;
43399 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43400 return true;
43401 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43402 return true;
43403 return false;
43406 /* Target hook for c_mode_for_suffix. */
43407 static machine_mode
43408 ix86_c_mode_for_suffix (char suffix)
43410 if (suffix == 'q')
43411 return TFmode;
43412 if (suffix == 'w')
43413 return XFmode;
43415 return VOIDmode;
43418 /* Worker function for TARGET_MD_ASM_ADJUST.
43420 We implement asm flag outputs, and maintain source compatibility
43421 with the old cc0-based compiler. */
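/* An example of the flag-output constraints handled here, as they appear
   in user code (see the GCC extended asm documentation for the full list
   of condition suffixes):

     unsigned long long dst, src;
     int carry;
     asm ("addq %2, %1" : "=@ccc" (carry), "+r" (dst) : "r" (src));

   The "=@ccc" output asks for the carry flag; the loop below maps the "c"
   suffix to CCCmode/EQ, rewrites the first such output to FLAGS_REG, and
   emits a compare-and-zero-extend sequence into the user's variable. */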
43423 static rtx_insn *
43424 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43425 vec<const char *> &constraints,
43426 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43428 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43429 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43431 bool saw_asm_flag = false;
43433 start_sequence ();
43434 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43436 const char *con = constraints[i];
43437 if (strncmp (con, "=@cc", 4) != 0)
43438 continue;
43439 con += 4;
43440 if (strchr (con, ',') != NULL)
43442 error ("alternatives not allowed in asm flag output");
43443 continue;
43446 bool invert = false;
43447 if (con[0] == 'n')
43448 invert = true, con++;
43450 machine_mode mode = CCmode;
43451 rtx_code code = UNKNOWN;
43453 switch (con[0])
43455 case 'a':
43456 if (con[1] == 0)
43457 mode = CCAmode, code = EQ;
43458 else if (con[1] == 'e' && con[2] == 0)
43459 mode = CCCmode, code = NE;
43460 break;
43461 case 'b':
43462 if (con[1] == 0)
43463 mode = CCCmode, code = EQ;
43464 else if (con[1] == 'e' && con[2] == 0)
43465 mode = CCAmode, code = NE;
43466 break;
43467 case 'c':
43468 if (con[1] == 0)
43469 mode = CCCmode, code = EQ;
43470 break;
43471 case 'e':
43472 if (con[1] == 0)
43473 mode = CCZmode, code = EQ;
43474 break;
43475 case 'g':
43476 if (con[1] == 0)
43477 mode = CCGCmode, code = GT;
43478 else if (con[1] == 'e' && con[2] == 0)
43479 mode = CCGCmode, code = GE;
43480 break;
43481 case 'l':
43482 if (con[1] == 0)
43483 mode = CCGCmode, code = LT;
43484 else if (con[1] == 'e' && con[2] == 0)
43485 mode = CCGCmode, code = LE;
43486 break;
43487 case 'o':
43488 if (con[1] == 0)
43489 mode = CCOmode, code = EQ;
43490 break;
43491 case 'p':
43492 if (con[1] == 0)
43493 mode = CCPmode, code = EQ;
43494 break;
43495 case 's':
43496 if (con[1] == 0)
43497 mode = CCSmode, code = EQ;
43498 break;
43499 case 'z':
43500 if (con[1] == 0)
43501 mode = CCZmode, code = EQ;
43502 break;
43504 if (code == UNKNOWN)
43506 error ("unknown asm flag output %qs", constraints[i]);
43507 continue;
43509 if (invert)
43510 code = reverse_condition (code);
43512 rtx dest = outputs[i];
43513 if (!saw_asm_flag)
43515 /* This is the first asm flag output. Here we put the flags
43516 register in as the real output and adjust the condition to
43517 allow it. */
43518 constraints[i] = "=Bf";
43519 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43520 saw_asm_flag = true;
43522 else
43524 /* We don't need the flags register as output twice. */
43525 constraints[i] = "=X";
43526 outputs[i] = gen_rtx_SCRATCH (SImode);
43529 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43530 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43532 machine_mode dest_mode = GET_MODE (dest);
43533 if (!SCALAR_INT_MODE_P (dest_mode))
43535 error ("invalid type for asm flag output");
43536 continue;
43539 if (dest_mode == DImode && !TARGET_64BIT)
43540 dest_mode = SImode;
43542 if (dest_mode != QImode)
43544 rtx destqi = gen_reg_rtx (QImode);
43545 emit_insn (gen_rtx_SET (destqi, x));
43547 if (TARGET_ZERO_EXTEND_WITH_AND
43548 && optimize_function_for_speed_p (cfun))
43550 x = force_reg (dest_mode, const0_rtx);
43552 emit_insn (gen_movstrictqi
43553 (gen_lowpart (QImode, x), destqi));
43555 else
43556 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43559 if (dest_mode != GET_MODE (dest))
43561 rtx tmp = gen_reg_rtx (SImode);
43563 emit_insn (gen_rtx_SET (tmp, x));
43564 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43566 else
43567 emit_insn (gen_rtx_SET (dest, x));
43569 rtx_insn *seq = get_insns ();
43570 end_sequence ();
43572 if (saw_asm_flag)
43573 return seq;
43574 else
43576 /* If we had no asm flag outputs, clobber the flags. */
43577 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43578 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43579 return NULL;
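/* Illustrative usage (added note, not part of the original file): the
   "=@cc<cond>" constraints parsed above come from user inline asm such as

     unsigned long x = 5, y = -1UL;
     unsigned char carry;
     asm ("addq %2, %0" : "+r" (x), "=@ccc" (carry) : "r" (y));

   "=@ccc" requests the carry flag; the loop above turns the first such
   output into the "=Bf" flags-register output and emits a setcc plus
   zero-extend into CARRY, while an "n" prefix such as "=@ccnc" selects
   the inverted condition via reverse_condition.  */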
43583 /* Implements the target hook targetm.encode_section_info. */
43585 static void ATTRIBUTE_UNUSED
43586 ix86_encode_section_info (tree decl, rtx rtl, int first)
43588 default_encode_section_info (decl, rtl, first);
43590 if (ix86_in_large_data_p (decl))
43591 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43594 /* Worker function for REVERSE_CONDITION. */
43596 enum rtx_code
43597 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43599 return (mode != CCFPmode && mode != CCFPUmode
43600 ? reverse_condition (code)
43601 : reverse_condition_maybe_unordered (code));
43604 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43605 to OPERANDS[0]. */
43607 const char *
43608 output_387_reg_move (rtx insn, rtx *operands)
43610 if (REG_P (operands[0]))
43612 if (REG_P (operands[1])
43613 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43615 if (REGNO (operands[0]) == FIRST_STACK_REG)
43616 return output_387_ffreep (operands, 0);
43617 return "fstp\t%y0";
43619 if (STACK_TOP_P (operands[0]))
43620 return "fld%Z1\t%y1";
43621 return "fst\t%y0";
43623 else if (MEM_P (operands[0]))
43625 gcc_assert (REG_P (operands[1]));
43626 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43627 return "fstp%Z0\t%y0";
43628 else
43630 /* There is no non-popping store to memory for XFmode.
43631 So if we need one, follow the store with a load. */
43632 if (GET_MODE (operands[0]) == XFmode)
43633 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43634 else
43635 return "fst%Z0\t%y0";
43638 else
43639 gcc_unreachable ();
43642 /* Output code to perform a conditional jump to LABEL if the C2 flag in
43643 the FP status register is set. */
43645 void
43646 ix86_emit_fp_unordered_jump (rtx label)
43648 rtx reg = gen_reg_rtx (HImode);
43649 rtx temp;
43651 emit_insn (gen_x86_fnstsw_1 (reg));
43653 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43655 emit_insn (gen_x86_sahf_1 (reg));
43657 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43658 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43660 else
43662 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
43664 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43665 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43668 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43669 gen_rtx_LABEL_REF (VOIDmode, label),
43670 pc_rtx);
43671 temp = gen_rtx_SET (pc_rtx, temp);
43673 emit_jump_insn (temp);
43674 predict_jump (REG_BR_PROB_BASE * 10 / 100);
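/* Added note (not in the original): the two paths above emit either
     fnstsw  %ax
     sahf
     jp      label        # SAHF copies C2 (FSW bit 10) into PF
   or, when SAHF is unavailable or not preferred,
     fnstsw  %ax
     testb   $0x04, %ah   # 0x04 in %ah selects FSW bit 10, i.e. C2
     jne     label
   C2 is the flag fsin/fcos/fptan set when the operand is out of range.  */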
43677 /* Output code to perform a log1p XFmode calculation. */
43679 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43681 rtx_code_label *label1 = gen_label_rtx ();
43682 rtx_code_label *label2 = gen_label_rtx ();
43684 rtx tmp = gen_reg_rtx (XFmode);
43685 rtx tmp2 = gen_reg_rtx (XFmode);
43686 rtx test;
43688 emit_insn (gen_absxf2 (tmp, op1));
43689 test = gen_rtx_GE (VOIDmode, tmp,
43690 const_double_from_real_value (
43691 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43692 XFmode));
43693 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43695 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43696 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43697 emit_jump (label2);
43699 emit_label (label1);
43700 emit_move_insn (tmp, CONST1_RTX (XFmode));
43701 emit_insn (gen_addxf3 (tmp, op1, tmp));
43702 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43703 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43705 emit_label (label2);
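/* Added note (not in the original): fyl2xp1 computes Y * log2 (X + 1) and
   is only specified for |X| < 1 - sqrt(2)/2 (~0.2928932), which is the
   threshold tested above.  With Y = ln(2) loaded by fldln2 the result is
   ln (1 + X) directly; for larger |X| the fallback path forms 1 + X
   explicitly and uses fyl2x instead.  */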
43708 /* Emit code for round calculation. */
43709 void ix86_emit_i387_round (rtx op0, rtx op1)
43711 machine_mode inmode = GET_MODE (op1);
43712 machine_mode outmode = GET_MODE (op0);
43713 rtx e1, e2, res, tmp, tmp1, half;
43714 rtx scratch = gen_reg_rtx (HImode);
43715 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43716 rtx_code_label *jump_label = gen_label_rtx ();
43717 rtx insn;
43718 rtx (*gen_abs) (rtx, rtx);
43719 rtx (*gen_neg) (rtx, rtx);
43721 switch (inmode)
43723 case SFmode:
43724 gen_abs = gen_abssf2;
43725 break;
43726 case DFmode:
43727 gen_abs = gen_absdf2;
43728 break;
43729 case XFmode:
43730 gen_abs = gen_absxf2;
43731 break;
43732 default:
43733 gcc_unreachable ();
43736 switch (outmode)
43738 case SFmode:
43739 gen_neg = gen_negsf2;
43740 break;
43741 case DFmode:
43742 gen_neg = gen_negdf2;
43743 break;
43744 case XFmode:
43745 gen_neg = gen_negxf2;
43746 break;
43747 case HImode:
43748 gen_neg = gen_neghi2;
43749 break;
43750 case SImode:
43751 gen_neg = gen_negsi2;
43752 break;
43753 case DImode:
43754 gen_neg = gen_negdi2;
43755 break;
43756 default:
43757 gcc_unreachable ();
43760 e1 = gen_reg_rtx (inmode);
43761 e2 = gen_reg_rtx (inmode);
43762 res = gen_reg_rtx (outmode);
43764 half = const_double_from_real_value (dconsthalf, inmode);
43766 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
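/* Added illustration: this is round-half-away-from-zero, unlike rint, e.g.
     round (2.5)  -> +floor (2.5 + 0.5) =  3.0
     round (-2.5) -> -floor (2.5 + 0.5) = -3.0
   The fxam / testb $0x02 pair below reads C1 (FSW bit 9), which fxam sets
   to the sign of the operand.  */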
43768 /* scratch = fxam(op1) */
43769 emit_insn (gen_rtx_SET (scratch,
43770 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43771 UNSPEC_FXAM)));
43772 /* e1 = fabs(op1) */
43773 emit_insn (gen_abs (e1, op1));
43775 /* e2 = e1 + 0.5 */
43776 half = force_reg (inmode, half);
43777 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43779 /* res = floor(e2) */
43780 if (inmode != XFmode)
43782 tmp1 = gen_reg_rtx (XFmode);
43784 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43786 else
43787 tmp1 = e2;
43789 switch (outmode)
43791 case SFmode:
43792 case DFmode:
43794 rtx tmp0 = gen_reg_rtx (XFmode);
43796 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43798 emit_insn (gen_rtx_SET (res,
43799 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43800 UNSPEC_TRUNC_NOOP)));
43802 break;
43803 case XFmode:
43804 emit_insn (gen_frndintxf2_floor (res, tmp1));
43805 break;
43806 case HImode:
43807 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43808 break;
43809 case SImode:
43810 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43811 break;
43812 case DImode:
43813 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43814 break;
43815 default:
43816 gcc_unreachable ();
43819 /* flags = signbit(a) */
43820 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
43822 /* if (flags) then res = -res */
43823 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43824 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43825 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43826 pc_rtx);
43827 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43828 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43829 JUMP_LABEL (insn) = jump_label;
43831 emit_insn (gen_neg (res, res));
43833 emit_label (jump_label);
43834 LABEL_NUSES (jump_label) = 1;
43836 emit_move_insn (op0, res);
43839 /* Output code to perform a Newton-Raphson approximation of a single precision
43840 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43842 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43844 rtx x0, x1, e0, e1;
43846 x0 = gen_reg_rtx (mode);
43847 e0 = gen_reg_rtx (mode);
43848 e1 = gen_reg_rtx (mode);
43849 x1 = gen_reg_rtx (mode);
43851 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
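/* Added derivation: one Newton-Raphson step for f(x) = 1/x - b is
     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
   which is exactly e1 - e0 below.  The relative error of the estimate
   roughly squares, so the ~12-bit rcp estimate reaches nearly full
   single precision after this single step.  */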
43853 b = force_reg (mode, b);
43855 /* x0 = rcp(b) estimate */
43856 if (mode == V16SFmode || mode == V8DFmode)
43858 if (TARGET_AVX512ER)
43860 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43861 UNSPEC_RCP28)));
43862 /* res = a * x0 */
43863 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43864 return;
43866 else
43867 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43868 UNSPEC_RCP14)));
43870 else
43871 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43872 UNSPEC_RCP)));
43874 /* e0 = x0 * b */
43875 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43877 /* e0 = x0 * e0 */
43878 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43880 /* e1 = x0 + x0 */
43881 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43883 /* x1 = e1 - e0 */
43884 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43886 /* res = a * x1 */
43887 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
43890 /* Output code to perform a Newton-Raphson approximation of a
43891 single precision floating point [reciprocal] square root. */
43893 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43895 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43896 REAL_VALUE_TYPE r;
43897 int unspec;
43899 x0 = gen_reg_rtx (mode);
43900 e0 = gen_reg_rtx (mode);
43901 e1 = gen_reg_rtx (mode);
43902 e2 = gen_reg_rtx (mode);
43903 e3 = gen_reg_rtx (mode);
43905 if (TARGET_AVX512ER && mode == V16SFmode)
43907 if (recip)
43908 /* res = rsqrt28(a) estimate */
43909 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43910 UNSPEC_RSQRT28)));
43911 else
43913 /* x0 = rsqrt28(a) estimate */
43914 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43915 UNSPEC_RSQRT28)));
43916 /* res = rcp28(x0) estimate */
43917 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43918 UNSPEC_RCP28)));
43920 return;
43923 real_from_integer (&r, VOIDmode, -3, SIGNED);
43924 mthree = const_double_from_real_value (r, SFmode);
43926 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43927 mhalf = const_double_from_real_value (r, SFmode);
43928 unspec = UNSPEC_RSQRT;
43930 if (VECTOR_MODE_P (mode))
43932 mthree = ix86_build_const_vector (mode, true, mthree);
43933 mhalf = ix86_build_const_vector (mode, true, mhalf);
43934 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43935 if (GET_MODE_SIZE (mode) == 64)
43936 unspec = UNSPEC_RSQRT14;
43939 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43940 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
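/* Added derivation: one Newton-Raphson step for f(x) = 1/x**2 - a is
     x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3),
   which is the rsqrt line above; multiplying by e0 = a * x0 instead of x0
   in the final product uses sqrt(a) = a * rsqrt(a) to get the sqrt line.  */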
43942 a = force_reg (mode, a);
43944 /* x0 = rsqrt(a) estimate */
43945 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43946 unspec)));
43948 /* If a == 0.0, zero the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
43949 if (!recip)
43951 rtx zero = force_reg (mode, CONST0_RTX(mode));
43952 rtx mask;
43954 /* Handle masked compare. */
43955 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43957 mask = gen_reg_rtx (HImode);
43958 /* Imm value 0x4 corresponds to not-equal comparison. */
43959 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43960 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43962 else
43964 mask = gen_reg_rtx (mode);
43965 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43966 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43970 /* e0 = x0 * a */
43971 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43972 /* e1 = e0 * x0 */
43973 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43975 /* e2 = e1 - 3. */
43976 mthree = force_reg (mode, mthree);
43977 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43979 mhalf = force_reg (mode, mhalf);
43980 if (recip)
43981 /* e3 = -.5 * x0 */
43982 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43983 else
43984 /* e3 = -.5 * e0 */
43985 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43986 /* ret = e2 * e3 */
43987 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
43990 #ifdef TARGET_SOLARIS
43991 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43993 static void
43994 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43995 tree decl)
43997 /* With Binutils 2.15, the "@unwind" marker must be specified on
43998 every occurrence of the ".eh_frame" section, not just the first
43999 one. */
44000 if (TARGET_64BIT
44001 && strcmp (name, ".eh_frame") == 0)
44003 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
44004 flags & SECTION_WRITE ? "aw" : "a");
44005 return;
44008 #ifndef USE_GAS
44009 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44011 solaris_elf_asm_comdat_section (name, flags, decl);
44012 return;
44014 #endif
44016 default_elf_asm_named_section (name, flags, decl);
44018 #endif /* TARGET_SOLARIS */
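/* Added example: on 64-bit Solaris the branch above emits e.g.
     .section .eh_frame,"a",@unwind
   so that the "@unwind" marker appears on every occurrence of the section,
   as the Binutils 2.15 note above requires.  */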
44020 /* Return the mangling of TYPE if it is an extended fundamental type. */
44022 static const char *
44023 ix86_mangle_type (const_tree type)
44025 type = TYPE_MAIN_VARIANT (type);
44027 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44028 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44029 return NULL;
44031 switch (TYPE_MODE (type))
44033 case TFmode:
44034 /* __float128 is "g". */
44035 return "g";
44036 case XFmode:
44037 /* "long double" or __float80 is "e". */
44038 return "e";
44039 default:
44040 return NULL;
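/* Added example: with these manglings, the C++ declarations
     void f (__float128);   // mangles as _Z1fg
     void f (long double);  // mangles as _Z1fe
   get the Itanium ABI codes "g" and "e" returned above.  */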
44044 /* For 32-bit code we can save PIC register setup by using
44045 __stack_chk_fail_local hidden function instead of calling
44046 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44047 register, so it is better to call __stack_chk_fail directly. */
44049 static tree ATTRIBUTE_UNUSED
44050 ix86_stack_protect_fail (void)
44052 return TARGET_64BIT
44053 ? default_external_stack_protect_fail ()
44054 : default_hidden_stack_protect_fail ();
44057 /* Select a format to encode pointers in exception handling data. CODE
44058 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44059 true if the symbol may be affected by dynamic relocations.
44061 ??? All x86 object file formats are capable of representing this.
44062 After all, the relocation needed is the same as for the call insn.
44063 Whether or not a particular assembler allows us to enter such, I
44064 guess we'll have to see. */
44065 int
44066 asm_preferred_eh_data_format (int code, int global)
44068 if (flag_pic)
44070 int type = DW_EH_PE_sdata8;
44071 if (!TARGET_64BIT
44072 || ix86_cmodel == CM_SMALL_PIC
44073 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44074 type = DW_EH_PE_sdata4;
44075 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44077 if (ix86_cmodel == CM_SMALL
44078 || (ix86_cmodel == CM_MEDIUM && code))
44079 return DW_EH_PE_udata4;
44080 return DW_EH_PE_absptr;
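/* Added example: for -fPIC x86-64 code with the default (small) code model
   this returns DW_EH_PE_pcrel | DW_EH_PE_sdata4, plus DW_EH_PE_indirect for
   global symbols; non-PIC small-model code gets DW_EH_PE_udata4, and
   everything else falls back to DW_EH_PE_absptr.  */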
44083 /* Expand copysign: copy the sign bit of SIGN onto the positive value
44084 ABS_VALUE and store the result in RESULT. If MASK is non-null, it is
44085 the mask to use for isolating the sign-bit. */
44086 static void
44087 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44089 machine_mode mode = GET_MODE (sign);
44090 rtx sgn = gen_reg_rtx (mode);
44091 if (mask == NULL_RTX)
44093 machine_mode vmode;
44095 if (mode == SFmode)
44096 vmode = V4SFmode;
44097 else if (mode == DFmode)
44098 vmode = V2DFmode;
44099 else
44100 vmode = mode;
44102 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44103 if (!VECTOR_MODE_P (mode))
44105 /* We need to generate a scalar mode mask in this case. */
44106 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44107 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44108 mask = gen_reg_rtx (mode);
44109 emit_insn (gen_rtx_SET (mask, tmp));
44112 else
44113 mask = gen_rtx_NOT (mode, mask);
44114 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44115 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44118 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44119 mask for masking out the sign-bit is stored in *SMASK, if that is
44120 non-null. */
44121 static rtx
44122 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44124 machine_mode vmode, mode = GET_MODE (op0);
44125 rtx xa, mask;
44127 xa = gen_reg_rtx (mode);
44128 if (mode == SFmode)
44129 vmode = V4SFmode;
44130 else if (mode == DFmode)
44131 vmode = V2DFmode;
44132 else
44133 vmode = mode;
44134 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44135 if (!VECTOR_MODE_P (mode))
44137 /* We need to generate a scalar mode mask in this case. */
44138 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44139 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44140 mask = gen_reg_rtx (mode);
44141 emit_insn (gen_rtx_SET (mask, tmp));
44143 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44145 if (smask)
44146 *smask = mask;
44148 return xa;
44151 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44152 swapping the operands if SWAP_OPERANDS is true. The expanded
44153 code is a forward jump to a newly created label in case the
44154 comparison is true. The generated label rtx is returned. */
44155 static rtx_code_label *
44156 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44157 bool swap_operands)
44159 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
44160 rtx_code_label *label;
44161 rtx tmp;
44163 if (swap_operands)
44164 std::swap (op0, op1);
44166 label = gen_label_rtx ();
44167 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
44168 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
44169 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
44170 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44171 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44172 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44173 JUMP_LABEL (tmp) = label;
44175 return label;
44178 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44179 using comparison code CODE. Operands are swapped for the comparison if
44180 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44181 static rtx
44182 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44183 bool swap_operands)
44185 rtx (*insn)(rtx, rtx, rtx, rtx);
44186 machine_mode mode = GET_MODE (op0);
44187 rtx mask = gen_reg_rtx (mode);
44189 if (swap_operands)
44190 std::swap (op0, op1);
44192 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44194 emit_insn (insn (mask, op0, op1,
44195 gen_rtx_fmt_ee (code, mode, op0, op1)));
44196 return mask;
44199 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44200 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44201 static rtx
44202 ix86_gen_TWO52 (machine_mode mode)
44204 REAL_VALUE_TYPE TWO52r;
44205 rtx TWO52;
44207 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44208 TWO52 = const_double_from_real_value (TWO52r, mode);
44209 TWO52 = force_reg (mode, TWO52);
44211 return TWO52;
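/* Added note: 2**52 (2**23 for SFmode) is the magnitude at which the
   significand has no fraction bits left, so for |x| < 2**52 the sum
   (x + 2**52) - 2**52 rounds x to an integer in the current rounding mode;
   e.g. 2.7 + 2**52 is only representable as 2**52 + 3.0, and subtracting
   2**52 again leaves 3.0.  The expanders below rely on this, guarded by an
   isless (fabs (x), 2**52) test.  */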
44214 /* Expand SSE sequence for computing lround from OP1 storing
44215 into OP0. */
44216 void
44217 ix86_expand_lround (rtx op0, rtx op1)
44219 /* C code for the stuff we're doing below:
44220 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44221 return (long)tmp;
44223 machine_mode mode = GET_MODE (op1);
44224 const struct real_format *fmt;
44225 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44226 rtx adj;
44228 /* load nextafter (0.5, 0.0) */
44229 fmt = REAL_MODE_FORMAT (mode);
44230 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44231 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44233 /* adj = copysign (0.5, op1) */
44234 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44235 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44237 /* adj = op1 + adj */
44238 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44240 /* op0 = (imode)adj */
44241 expand_fix (op0, adj, 0);
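/* Added note: the addend above is nextafter (0.5, 0.0), the largest double
   strictly below 0.5, rather than 0.5 itself.  With a plain 0.5 the input
   0.49999999999999994 (the double just below 0.5) would sum to a value that
   rounds up to 1.0 and lround would wrongly return 1; with the reduced
   addend the sum stays below 1.0 and correctly truncates to 0.  */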
44244 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
44245 into OPERAND0. */
44246 void
44247 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44249 /* C code for the stuff we're doing below (for do_floor):
44250 xi = (long)op1;
44251 xi -= (double)xi > op1 ? 1 : 0;
44252 return xi;
44254 machine_mode fmode = GET_MODE (op1);
44255 machine_mode imode = GET_MODE (op0);
44256 rtx ireg, freg, tmp;
44257 rtx_code_label *label;
44259 /* reg = (long)op1 */
44260 ireg = gen_reg_rtx (imode);
44261 expand_fix (ireg, op1, 0);
44263 /* freg = (double)reg */
44264 freg = gen_reg_rtx (fmode);
44265 expand_float (freg, ireg, 0);
44267 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44268 label = ix86_expand_sse_compare_and_jump (UNLE,
44269 freg, op1, !do_floor);
44270 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44271 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44272 emit_move_insn (ireg, tmp);
44274 emit_label (label);
44275 LABEL_NUSES (label) = 1;
44277 emit_move_insn (op0, ireg);
44280 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
44281 result in OPERAND0. */
44282 void
44283 ix86_expand_rint (rtx operand0, rtx operand1)
44285 /* C code for the stuff we're doing below:
44286 xa = fabs (operand1);
44287 if (!isless (xa, 2**52))
44288 return operand1;
44289 xa = xa + 2**52 - 2**52;
44290 return copysign (xa, operand1);
44292 machine_mode mode = GET_MODE (operand0);
44293 rtx res, xa, TWO52, mask;
44294 rtx_code_label *label;
44296 res = gen_reg_rtx (mode);
44297 emit_move_insn (res, operand1);
44299 /* xa = abs (operand1) */
44300 xa = ix86_expand_sse_fabs (res, &mask);
44302 /* if (!isless (xa, TWO52)) goto label; */
44303 TWO52 = ix86_gen_TWO52 (mode);
44304 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44306 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44307 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44309 ix86_sse_copysign_to_positive (res, xa, res, mask);
44311 emit_label (label);
44312 LABEL_NUSES (label) = 1;
44314 emit_move_insn (operand0, res);
44317 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44318 into OPERAND0. */
44319 void
44320 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44322 /* C code for the stuff we expand below.
44323 double xa = fabs (x), x2;
44324 if (!isless (xa, TWO52))
44325 return x;
44326 xa = xa + TWO52 - TWO52;
44327 x2 = copysign (xa, x);
44328 Compensate. Floor:
44329 if (x2 > x)
44330 x2 -= 1;
44331 Compensate. Ceil:
44332 if (x2 < x)
44333 x2 -= -1;
44334 return x2;
44336 machine_mode mode = GET_MODE (operand0);
44337 rtx xa, TWO52, tmp, one, res, mask;
44338 rtx_code_label *label;
44340 TWO52 = ix86_gen_TWO52 (mode);
44342 /* Temporary for holding the result, initialized to the input
44343 operand to ease control flow. */
44344 res = gen_reg_rtx (mode);
44345 emit_move_insn (res, operand1);
44347 /* xa = abs (operand1) */
44348 xa = ix86_expand_sse_fabs (res, &mask);
44350 /* if (!isless (xa, TWO52)) goto label; */
44351 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44353 /* xa = xa + TWO52 - TWO52; */
44354 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44355 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44357 /* xa = copysign (xa, operand1) */
44358 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44360 /* generate 1.0 or -1.0 */
44361 one = force_reg (mode,
44362 const_double_from_real_value (do_floor
44363 ? dconst1 : dconstm1, mode));
44365 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44366 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44367 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44368 /* We always need to subtract here to preserve signed zero. */
44369 tmp = expand_simple_binop (mode, MINUS,
44370 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44371 emit_move_insn (res, tmp);
44373 emit_label (label);
44374 LABEL_NUSES (label) = 1;
44376 emit_move_insn (operand0, res);
44379 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44380 into OPERAND0. */
44381 void
44382 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44384 /* C code for the stuff we expand below.
44385 double xa = fabs (x), x2;
44386 if (!isless (xa, TWO52))
44387 return x;
44388 x2 = (double)(long)x;
44389 Compensate. Floor:
44390 if (x2 > x)
44391 x2 -= 1;
44392 Compensate. Ceil:
44393 if (x2 < x)
44394 x2 += 1;
44395 if (HONOR_SIGNED_ZEROS (mode))
44396 return copysign (x2, x);
44397 return x2;
44399 machine_mode mode = GET_MODE (operand0);
44400 rtx xa, xi, TWO52, tmp, one, res, mask;
44401 rtx_code_label *label;
44403 TWO52 = ix86_gen_TWO52 (mode);
44405 /* Temporary for holding the result, initialized to the input
44406 operand to ease control flow. */
44407 res = gen_reg_rtx (mode);
44408 emit_move_insn (res, operand1);
44410 /* xa = abs (operand1) */
44411 xa = ix86_expand_sse_fabs (res, &mask);
44413 /* if (!isless (xa, TWO52)) goto label; */
44414 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44416 /* xa = (double)(long)x */
44417 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44418 expand_fix (xi, res, 0);
44419 expand_float (xa, xi, 0);
44421 /* generate 1.0 */
44422 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44424 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44425 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44426 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44427 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44428 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44429 emit_move_insn (res, tmp);
44431 if (HONOR_SIGNED_ZEROS (mode))
44432 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44434 emit_label (label);
44435 LABEL_NUSES (label) = 1;
44437 emit_move_insn (operand0, res);
44440 /* Expand SSE sequence for computing round from OPERAND1 storing
44441 into OPERAND0. The sequence works without relying on DImode truncation
44442 via cvttsd2siq, which is only available on 64-bit targets.
44443 void
44444 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44446 /* C code for the stuff we expand below.
44447 double xa = fabs (x), xa2, x2;
44448 if (!isless (xa, TWO52))
44449 return x;
44450 Using the absolute value and copying back sign makes
44451 -0.0 -> -0.0 correct.
44452 xa2 = xa + TWO52 - TWO52;
44453 Compensate.
44454 dxa = xa2 - xa;
44455 if (dxa <= -0.5)
44456 xa2 += 1;
44457 else if (dxa > 0.5)
44458 xa2 -= 1;
44459 x2 = copysign (xa2, x);
44460 return x2;
44462 machine_mode mode = GET_MODE (operand0);
44463 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44464 rtx_code_label *label;
44466 TWO52 = ix86_gen_TWO52 (mode);
44468 /* Temporary for holding the result, initialized to the input
44469 operand to ease control flow. */
44470 res = gen_reg_rtx (mode);
44471 emit_move_insn (res, operand1);
44473 /* xa = abs (operand1) */
44474 xa = ix86_expand_sse_fabs (res, &mask);
44476 /* if (!isless (xa, TWO52)) goto label; */
44477 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44479 /* xa2 = xa + TWO52 - TWO52; */
44480 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44481 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44483 /* dxa = xa2 - xa; */
44484 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44486 /* generate 0.5, 1.0 and -0.5 */
44487 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44488 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44489 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44490 0, OPTAB_DIRECT);
44492 /* Compensate. */
44493 tmp = gen_reg_rtx (mode);
44494 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44495 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44496 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44497 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44498 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44499 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44500 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44501 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44503 /* res = copysign (xa2, operand1) */
44504 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44506 emit_label (label);
44507 LABEL_NUSES (label) = 1;
44509 emit_move_insn (operand0, res);
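/* Added worked example: for x = 2.5, xa2 = (2.5 + 2**52) - 2**52 = 2.0
   because the addition rounds to even; dxa = 2.0 - 2.5 = -0.5 satisfies
   dxa <= -0.5, so xa2 is bumped back to 3.0 and the result is
   copysign (3.0, x) = 3.0, preserving round-half-away-from-zero despite
   the round-to-even intermediate step.  */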
44512 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44513 into OPERAND0. */
44514 void
44515 ix86_expand_trunc (rtx operand0, rtx operand1)
44517 /* C code for SSE variant we expand below.
44518 double xa = fabs (x), x2;
44519 if (!isless (xa, TWO52))
44520 return x;
44521 x2 = (double)(long)x;
44522 if (HONOR_SIGNED_ZEROS (mode))
44523 return copysign (x2, x);
44524 return x2;
44526 machine_mode mode = GET_MODE (operand0);
44527 rtx xa, xi, TWO52, res, mask;
44528 rtx_code_label *label;
44530 TWO52 = ix86_gen_TWO52 (mode);
44532 /* Temporary for holding the result, initialized to the input
44533 operand to ease control flow. */
44534 res = gen_reg_rtx (mode);
44535 emit_move_insn (res, operand1);
44537 /* xa = abs (operand1) */
44538 xa = ix86_expand_sse_fabs (res, &mask);
44540 /* if (!isless (xa, TWO52)) goto label; */
44541 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44543 /* x = (double)(long)x */
44544 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44545 expand_fix (xi, res, 0);
44546 expand_float (res, xi, 0);
44548 if (HONOR_SIGNED_ZEROS (mode))
44549 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44551 emit_label (label);
44552 LABEL_NUSES (label) = 1;
44554 emit_move_insn (operand0, res);
44557 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44558 into OPERAND0. */
44559 void
44560 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44562 machine_mode mode = GET_MODE (operand0);
44563 rtx xa, mask, TWO52, one, res, smask, tmp;
44564 rtx_code_label *label;
44566 /* C code for SSE variant we expand below.
44567 double xa = fabs (x), x2;
44568 if (!isless (xa, TWO52))
44569 return x;
44570 xa2 = xa + TWO52 - TWO52;
44571 Compensate:
44572 if (xa2 > xa)
44573 xa2 -= 1.0;
44574 x2 = copysign (xa2, x);
44575 return x2;
44578 TWO52 = ix86_gen_TWO52 (mode);
44580 /* Temporary for holding the result, initialized to the input
44581 operand to ease control flow. */
44582 res = gen_reg_rtx (mode);
44583 emit_move_insn (res, operand1);
44585 /* xa = abs (operand1) */
44586 xa = ix86_expand_sse_fabs (res, &smask);
44588 /* if (!isless (xa, TWO52)) goto label; */
44589 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44591 /* res = xa + TWO52 - TWO52; */
44592 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44593 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44594 emit_move_insn (res, tmp);
44596 /* generate 1.0 */
44597 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44599 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44600 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44601 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44602 tmp = expand_simple_binop (mode, MINUS,
44603 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44604 emit_move_insn (res, tmp);
44606 /* res = copysign (res, operand1) */
44607 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44609 emit_label (label);
44610 LABEL_NUSES (label) = 1;
44612 emit_move_insn (operand0, res);
44615 /* Expand SSE sequence for computing round from OPERAND1 storing
44616 into OPERAND0. */
44617 void
44618 ix86_expand_round (rtx operand0, rtx operand1)
44620 /* C code for the stuff we're doing below:
44621 double xa = fabs (x);
44622 if (!isless (xa, TWO52))
44623 return x;
44624 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44625 return copysign (xa, x);
44627 machine_mode mode = GET_MODE (operand0);
44628 rtx res, TWO52, xa, xi, half, mask;
44629 rtx_code_label *label;
44630 const struct real_format *fmt;
44631 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44633 /* Temporary for holding the result, initialized to the input
44634 operand to ease control flow. */
44635 res = gen_reg_rtx (mode);
44636 emit_move_insn (res, operand1);
44638 TWO52 = ix86_gen_TWO52 (mode);
44639 xa = ix86_expand_sse_fabs (res, &mask);
44640 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44642 /* load nextafter (0.5, 0.0) */
44643 fmt = REAL_MODE_FORMAT (mode);
44644 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44645 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44647 /* xa = xa + 0.5 */
44648 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44649 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44651 /* xa = (double)(int64_t)xa */
44652 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44653 expand_fix (xi, xa, 0);
44654 expand_float (xa, xi, 0);
44656 /* res = copysign (xa, operand1) */
44657 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44659 emit_label (label);
44660 LABEL_NUSES (label) = 1;
44662 emit_move_insn (operand0, res);
44665 /* Expand SSE sequence for computing round
44666 from OP1 storing into OP0 using sse4 round insn. */
44667 void
44668 ix86_expand_round_sse4 (rtx op0, rtx op1)
44670 machine_mode mode = GET_MODE (op0);
44671 rtx e1, e2, res, half;
44672 const struct real_format *fmt;
44673 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44674 rtx (*gen_copysign) (rtx, rtx, rtx);
44675 rtx (*gen_round) (rtx, rtx, rtx);
44677 switch (mode)
44679 case SFmode:
44680 gen_copysign = gen_copysignsf3;
44681 gen_round = gen_sse4_1_roundsf2;
44682 break;
44683 case DFmode:
44684 gen_copysign = gen_copysigndf3;
44685 gen_round = gen_sse4_1_rounddf2;
44686 break;
44687 default:
44688 gcc_unreachable ();
44691 /* round (a) = trunc (a + copysign (0.5, a)) */
44693 /* load nextafter (0.5, 0.0) */
44694 fmt = REAL_MODE_FORMAT (mode);
44695 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44696 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44697 half = const_double_from_real_value (pred_half, mode);
44699 /* e1 = copysign (0.5, op1) */
44700 e1 = gen_reg_rtx (mode);
44701 emit_insn (gen_copysign (e1, half, op1));
44703 /* e2 = op1 + e1 */
44704 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44706 /* res = trunc (e2) */
44707 res = gen_reg_rtx (mode);
44708 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44710 emit_move_insn (op0, res);
44714 /* Table of valid machine attributes. */
44715 static const struct attribute_spec ix86_attribute_table[] =
44717 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44718 affects_type_identity } */
44719 /* Stdcall attribute says callee is responsible for popping arguments
44720 if they are not variable. */
44721 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44722 true },
44723 /* Fastcall attribute says callee is responsible for popping arguments
44724 if they are not variable. */
44725 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44726 true },
44727 /* Thiscall attribute says callee is responsible for popping arguments
44728 if they are not variable. */
44729 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44730 true },
44731 /* Cdecl attribute says the callee is a normal C declaration */
44732 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44733 true },
44734 /* Regparm attribute specifies how many integer arguments are to be
44735 passed in registers. */
44736 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44737 true },
44738 /* Sseregparm attribute says we are using x86_64 calling conventions
44739 for FP arguments. */
44740 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44741 true },
44742 /* The transactional memory builtins are implicitly regparm or fastcall
44743 depending on the ABI. Override the generic do-nothing attribute that
44744 these builtins were declared with. */
44745 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44746 true },
44747 /* force_align_arg_pointer says this function realigns the stack at entry. */
44748 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44749 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
44750 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44751 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
44752 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
44753 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44754 false },
44755 #endif
44756 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44757 false },
44758 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44759 false },
44760 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44761 SUBTARGET_ATTRIBUTE_TABLE,
44762 #endif
44763 /* ms_abi and sysv_abi calling convention function attributes. */
44764 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44765 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44766 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
44767 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
44768 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
44769 false },
44770 { "callee_pop_aggregate_return", 1, 1, false, true, true,
44771 ix86_handle_callee_pop_aggregate_return, true },
44772 { "interrupt", 0, 0, false, true, true,
44773 ix86_handle_interrupt_attribute, false },
44774 { "no_caller_saved_registers", 0, 0, false, true, true,
44775 ix86_handle_no_caller_saved_registers_attribute, false },
44777 /* End element. */
44778 { NULL, 0, 0, false, false, false, NULL, false }
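/* Illustrative usage (added note): these attributes are written in user
   code, e.g.

     int  __attribute__ ((regparm (3))) f (int a, int b, int c);
     void __attribute__ ((ms_abi)) g (void);

   and are validated by ix86_handle_cconv_attribute and
   ix86_handle_abi_attribute registered in the table above.  */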
44781 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44782 static int
44783 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44784 tree vectype, int)
44786 switch (type_of_cost)
44788 case scalar_stmt:
44789 return ix86_cost->scalar_stmt_cost;
44791 case scalar_load:
44792 return ix86_cost->scalar_load_cost;
44794 case scalar_store:
44795 return ix86_cost->scalar_store_cost;
44797 case vector_stmt:
44798 return ix86_cost->vec_stmt_cost;
44800 case vector_load:
44801 return ix86_cost->vec_align_load_cost;
44803 case vector_store:
44804 return ix86_cost->vec_store_cost;
44806 case vec_to_scalar:
44807 return ix86_cost->vec_to_scalar_cost;
44809 case scalar_to_vec:
44810 return ix86_cost->scalar_to_vec_cost;
44812 case unaligned_load:
44813 case unaligned_store:
44814 return ix86_cost->vec_unalign_load_cost;
44816 case cond_branch_taken:
44817 return ix86_cost->cond_taken_branch_cost;
44819 case cond_branch_not_taken:
44820 return ix86_cost->cond_not_taken_branch_cost;
44822 case vec_perm:
44823 case vec_promote_demote:
44824 return ix86_cost->vec_stmt_cost;
44826 case vec_construct:
44827 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
44829 default:
44830 gcc_unreachable ();
44834 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44835 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44836 insn every time. */
44838 static GTY(()) rtx_insn *vselect_insn;
44840 /* Initialize vselect_insn. */
44842 static void
44843 init_vselect_insn (void)
44845 unsigned i;
44846 rtx x;
44848 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44849 for (i = 0; i < MAX_VECT_LEN; ++i)
44850 XVECEXP (x, 0, i) = const0_rtx;
44851 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44852 const0_rtx), x);
44853 x = gen_rtx_SET (const0_rtx, x);
44854 start_sequence ();
44855 vselect_insn = emit_insn (x);
44856 end_sequence ();
44859 /* Construct (set target (vec_select op0 (parallel perm))) and
44860 return true if that's a valid instruction in the active ISA. */
44862 static bool
44863 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44864 unsigned nelt, bool testing_p)
44866 unsigned int i;
44867 rtx x, save_vconcat;
44868 int icode;
44870 if (vselect_insn == NULL_RTX)
44871 init_vselect_insn ();
44873 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44874 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44875 for (i = 0; i < nelt; ++i)
44876 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44877 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44878 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44879 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44880 SET_DEST (PATTERN (vselect_insn)) = target;
44881 icode = recog_memoized (vselect_insn);
44883 if (icode >= 0 && !testing_p)
44884 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44886 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44887 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44888 INSN_CODE (vselect_insn) = -1;
44890 return icode >= 0;
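/* Added example: for V4SFmode with perm = {3, 2, 1, 0} the reused insn
   becomes
     (set (reg:V4SF target)
          (vec_select:V4SF (reg:V4SF op0) (parallel [3 2 1 0])))
   which recog_memoized accepts if sse.md has a matching pattern (e.g.
   vpermilps under AVX); otherwise false is returned and the caller tries
   a different strategy.  */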
44893 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44895 static bool
44896 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44897 const unsigned char *perm, unsigned nelt,
44898 bool testing_p)
44900 machine_mode v2mode;
44901 rtx x;
44902 bool ok;
44904 if (vselect_insn == NULL_RTX)
44905 init_vselect_insn ();
44907 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
44908 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44909 PUT_MODE (x, v2mode);
44910 XEXP (x, 0) = op0;
44911 XEXP (x, 1) = op1;
44912 ok = expand_vselect (target, x, perm, nelt, testing_p);
44913 XEXP (x, 0) = const0_rtx;
44914 XEXP (x, 1) = const0_rtx;
44915 return ok;
44918 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44919 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44921 static bool
44922 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44924 machine_mode mmode, vmode = d->vmode;
44925 unsigned i, mask, nelt = d->nelt;
44926 rtx target, op0, op1, maskop, x;
44927 rtx rperm[32], vperm;
44929 if (d->one_operand_p)
44930 return false;
44931 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44932 && (TARGET_AVX512BW
44933 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44935 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44937 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
44939 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
44941 else
44942 return false;
44944 /* This is a blend, not a permute. Elements must stay in their
44945 respective lanes. */
44946 for (i = 0; i < nelt; ++i)
44948 unsigned e = d->perm[i];
44949 if (!(e == i || e == i + nelt))
44950 return false;
44953 if (d->testing_p)
44954 return true;
44956 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
44957 decision should be extracted elsewhere, so that we only try that
44958 sequence once all budget==3 options have been tried. */
44959 target = d->target;
44960 op0 = d->op0;
44961 op1 = d->op1;
44962 mask = 0;
44964 switch (vmode)
44966 case V8DFmode:
44967 case V16SFmode:
44968 case V4DFmode:
44969 case V8SFmode:
44970 case V2DFmode:
44971 case V4SFmode:
44972 case V8HImode:
44973 case V8SImode:
44974 case V32HImode:
44975 case V64QImode:
44976 case V16SImode:
44977 case V8DImode:
44978 for (i = 0; i < nelt; ++i)
44979 mask |= (d->perm[i] >= nelt) << i;
44980 break;
44982 case V2DImode:
44983 for (i = 0; i < 2; ++i)
44984 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
44985 vmode = V8HImode;
44986 goto do_subreg;
44988 case V4SImode:
44989 for (i = 0; i < 4; ++i)
44990 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
44991 vmode = V8HImode;
44992 goto do_subreg;
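/* Added worked example: a V4SImode blend with perm = {0, 5, 2, 7} takes
   elements 1 and 3 from op1, so mask = (3 << 2) | (3 << 6) = 0xcc once
   each dword is widened to two V8HImode words, and the VEC_MERGE emitted
   below matches pblendw with immediate 0xcc.  */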
44994 case V16QImode:
44995 /* See if bytes move in pairs so we can use pblendw with
44996 an immediate argument, rather than pblendvb with a vector
44997 argument. */
44998 for (i = 0; i < 16; i += 2)
44999 if (d->perm[i] + 1 != d->perm[i + 1])
45001 use_pblendvb:
45002 for (i = 0; i < nelt; ++i)
45003 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45005 finish_pblendvb:
45006 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45007 vperm = force_reg (vmode, vperm);
45009 if (GET_MODE_SIZE (vmode) == 16)
45010 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45011 else
45012 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45013 if (target != d->target)
45014 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45015 return true;
45018 for (i = 0; i < 8; ++i)
45019 mask |= (d->perm[i * 2] >= 16) << i;
45020 vmode = V8HImode;
45021 /* FALLTHRU */
45023 do_subreg:
45024 target = gen_reg_rtx (vmode);
45025 op0 = gen_lowpart (vmode, op0);
45026 op1 = gen_lowpart (vmode, op1);
45027 break;
45029 case V32QImode:
45030 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45031 for (i = 0; i < 32; i += 2)
45032 if (d->perm[i] + 1 != d->perm[i + 1])
45033 goto use_pblendvb;
45034 /* See if bytes move in quadruplets. If yes, vpblendd
45035 with immediate can be used. */
45036 for (i = 0; i < 32; i += 4)
45037 if (d->perm[i] + 2 != d->perm[i + 2])
45038 break;
45039 if (i < 32)
45041 /* See if bytes move the same in both lanes. If yes,
45042 vpblendw with immediate can be used. */
45043 for (i = 0; i < 16; i += 2)
45044 if (d->perm[i] + 16 != d->perm[i + 16])
45045 goto use_pblendvb;
45047 /* Use vpblendw. */
45048 for (i = 0; i < 16; ++i)
45049 mask |= (d->perm[i * 2] >= 32) << i;
45050 vmode = V16HImode;
45051 goto do_subreg;
45054 /* Use vpblendd. */
45055 for (i = 0; i < 8; ++i)
45056 mask |= (d->perm[i * 4] >= 32) << i;
45057 vmode = V8SImode;
45058 goto do_subreg;
45060 case V16HImode:
45061 /* See if words move in pairs. If yes, vpblendd can be used. */
45062 for (i = 0; i < 16; i += 2)
45063 if (d->perm[i] + 1 != d->perm[i + 1])
45064 break;
45065 if (i < 16)
45067 /* See if words move the same in both lanes. If not,
45068 vpblendvb must be used. */
45069 for (i = 0; i < 8; i++)
45070 if (d->perm[i] + 8 != d->perm[i + 8])
45072 /* Use vpblendvb. */
45073 for (i = 0; i < 32; ++i)
45074 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45076 vmode = V32QImode;
45077 nelt = 32;
45078 target = gen_reg_rtx (vmode);
45079 op0 = gen_lowpart (vmode, op0);
45080 op1 = gen_lowpart (vmode, op1);
45081 goto finish_pblendvb;
45084 /* Use vpblendw. */
45085 for (i = 0; i < 16; ++i)
45086 mask |= (d->perm[i] >= 16) << i;
45087 break;
45090 /* Use vpblendd. */
45091 for (i = 0; i < 8; ++i)
45092 mask |= (d->perm[i * 2] >= 16) << i;
45093 vmode = V8SImode;
45094 goto do_subreg;
45096 case V4DImode:
45097 /* Use vpblendd. */
45098 for (i = 0; i < 4; ++i)
45099 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45100 vmode = V8SImode;
45101 goto do_subreg;
45103 default:
45104 gcc_unreachable ();
45107 switch (vmode)
45109 case V8DFmode:
45110 case V8DImode:
45111 mmode = QImode;
45112 break;
45113 case V16SFmode:
45114 case V16SImode:
45115 mmode = HImode;
45116 break;
45117 case V32HImode:
45118 mmode = SImode;
45119 break;
45120 case V64QImode:
45121 mmode = DImode;
45122 break;
45123 default:
45124 mmode = VOIDmode;
45127 if (mmode != VOIDmode)
45128 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45129 else
45130 maskop = GEN_INT (mask);
45132 /* This matches five different patterns with the different modes. */
45133 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45134 x = gen_rtx_SET (target, x);
45135 emit_insn (x);
45136 if (target != d->target)
45137 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45139 return true;
45142 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45143 in terms of the variable form of vpermilps.
45145 Note that we will have already failed the immediate input vpermilps,
45146 which requires that the high and low part shuffle be identical; the
45147 variable form doesn't require that. */
45149 static bool
45150 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45152 rtx rperm[8], vperm;
45153 unsigned i;
45155 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45156 return false;
45158 /* We can only permute within the 128-bit lane. */
45159 for (i = 0; i < 8; ++i)
45161 unsigned e = d->perm[i];
45162 if (i < 4 ? e >= 4 : e < 4)
45163 return false;
45166 if (d->testing_p)
45167 return true;
45169 for (i = 0; i < 8; ++i)
45171 unsigned e = d->perm[i];
45173 /* Within each 128-bit lane, the elements of op0 are numbered
45174 from 0 and the elements of op1 are numbered from 4. */
45175 if (e >= 8 + 4)
45176 e -= 8;
45177 else if (e >= 4)
45178 e -= 4;
45180 rperm[i] = GEN_INT (e);
45183 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45184 vperm = force_reg (V8SImode, vperm);
45185 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45187 return true;
45190 /* Return true if permutation D can be performed as VMODE permutation
45191 instead. */
45193 static bool
45194 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45196 unsigned int i, j, chunk;
45198 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45199 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45200 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45201 return false;
45203 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45204 return true;
45206 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45207 for (i = 0; i < d->nelt; i += chunk)
45208 if (d->perm[i] & (chunk - 1))
45209 return false;
45210 else
45211 for (j = 1; j < chunk; ++j)
45212 if (d->perm[i] + j != d->perm[i + j])
45213 return false;
45215 return true;
45218 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45219 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45221 static bool
45222 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45224 unsigned i, nelt, eltsz, mask;
45225 unsigned char perm[64];
45226 machine_mode vmode = V16QImode;
45227 rtx rperm[64], vperm, target, op0, op1;
45229 nelt = d->nelt;
45231 if (!d->one_operand_p)
45233 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45235 if (TARGET_AVX2
45236 && valid_perm_using_mode_p (V2TImode, d))
45238 if (d->testing_p)
45239 return true;
45241 /* Use vperm2i128 insn. The pattern uses
45242 V4DImode instead of V2TImode. */
45243 target = d->target;
45244 if (d->vmode != V4DImode)
45245 target = gen_reg_rtx (V4DImode);
45246 op0 = gen_lowpart (V4DImode, d->op0);
45247 op1 = gen_lowpart (V4DImode, d->op1);
45248 rperm[0]
45249 = GEN_INT ((d->perm[0] / (nelt / 2))
45250 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45251 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45252 if (target != d->target)
45253 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45254 return true;
45256 return false;
45259 else
45261 if (GET_MODE_SIZE (d->vmode) == 16)
45263 if (!TARGET_SSSE3)
45264 return false;
45266 else if (GET_MODE_SIZE (d->vmode) == 32)
45268 if (!TARGET_AVX2)
45269 return false;
45271 /* V4DImode should already have been handled through
45272 expand_vselect using the vpermq instruction. */
45273 gcc_assert (d->vmode != V4DImode);
45275 vmode = V32QImode;
45276 if (d->vmode == V8SImode
45277 || d->vmode == V16HImode
45278 || d->vmode == V32QImode)
45280 /* First see if vpermq can be used for
45281 V8SImode/V16HImode/V32QImode. */
45282 if (valid_perm_using_mode_p (V4DImode, d))
45284 for (i = 0; i < 4; i++)
45285 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45286 if (d->testing_p)
45287 return true;
45288 target = gen_reg_rtx (V4DImode);
45289 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45290 perm, 4, false))
45292 emit_move_insn (d->target,
45293 gen_lowpart (d->vmode, target));
45294 return true;
45296 return false;
45299 /* Next see if vpermd can be used. */
45300 if (valid_perm_using_mode_p (V8SImode, d))
45301 vmode = V8SImode;
45303 /* Or if vpermps can be used. */
45304 else if (d->vmode == V8SFmode)
45305 vmode = V8SImode;
45307 if (vmode == V32QImode)
45309 /* vpshufb only works within 128-bit lanes; it is not
45310 possible to shuffle bytes between the lanes. */
45311 for (i = 0; i < nelt; ++i)
45312 if ((d->perm[i] ^ i) & (nelt / 2))
45313 return false;
45316 else if (GET_MODE_SIZE (d->vmode) == 64)
45318 if (!TARGET_AVX512BW)
45319 return false;
45321 /* If vpermq didn't work, vpshufb won't work either. */
45322 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45323 return false;
45325 vmode = V64QImode;
45326 if (d->vmode == V16SImode
45327 || d->vmode == V32HImode
45328 || d->vmode == V64QImode)
45330 /* First see if vpermq can be used for
45331 V16SImode/V32HImode/V64QImode. */
45332 if (valid_perm_using_mode_p (V8DImode, d))
45334 for (i = 0; i < 8; i++)
45335 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45336 if (d->testing_p)
45337 return true;
45338 target = gen_reg_rtx (V8DImode);
45339 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45340 perm, 8, false))
45342 emit_move_insn (d->target,
45343 gen_lowpart (d->vmode, target));
45344 return true;
45346 return false;
45349 /* Next see if vpermd can be used. */
45350 if (valid_perm_using_mode_p (V16SImode, d))
45351 vmode = V16SImode;
45353 /* Or if vpermps can be used. */
45354 else if (d->vmode == V16SFmode)
45355 vmode = V16SImode;
45356 if (vmode == V64QImode)
45358 /* vpshufb only works within 128-bit lanes; it is not
45359 possible to shuffle bytes between the lanes. */
45360 for (i = 0; i < nelt; ++i)
45361 if ((d->perm[i] ^ i) & (nelt / 4))
45362 return false;
45365 else
45366 return false;
45369 if (d->testing_p)
45370 return true;
45372 if (vmode == V8SImode)
45373 for (i = 0; i < 8; ++i)
45374 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45375 else if (vmode == V16SImode)
45376 for (i = 0; i < 16; ++i)
45377 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45378 else
45380 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45381 if (!d->one_operand_p)
45382 mask = 2 * nelt - 1;
45383 else if (vmode == V16QImode)
45384 mask = nelt - 1;
45385 else if (vmode == V64QImode)
45386 mask = nelt / 4 - 1;
45387 else
45388 mask = nelt / 2 - 1;
45390 for (i = 0; i < nelt; ++i)
45392 unsigned j, e = d->perm[i] & mask;
45393 for (j = 0; j < eltsz; ++j)
45394 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45398 vperm = gen_rtx_CONST_VECTOR (vmode,
45399 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45400 vperm = force_reg (vmode, vperm);
45402 target = d->target;
45403 if (d->vmode != vmode)
45404 target = gen_reg_rtx (vmode);
45405 op0 = gen_lowpart (vmode, d->op0);
45406 if (d->one_operand_p)
45408 if (vmode == V16QImode)
45409 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45410 else if (vmode == V32QImode)
45411 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45412 else if (vmode == V64QImode)
45413 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45414 else if (vmode == V8SFmode)
45415 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45416 else if (vmode == V8SImode)
45417 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45418 else if (vmode == V16SFmode)
45419 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45420 else if (vmode == V16SImode)
45421 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45422 else
45423 gcc_unreachable ();
45425 else
45427 op1 = gen_lowpart (vmode, d->op1);
45428 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45430 if (target != d->target)
45431 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45433 return true;
45436 /* For V*[QHS]Imode permutations, check whether the same permutation
45437 can instead be performed in a 2x, 4x or 8x wider inner mode. */
45439 static bool
45440 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45441 struct expand_vec_perm_d *nd)
45443 int i;
45444 enum machine_mode mode = VOIDmode;
45446 switch (d->vmode)
45448 case V16QImode: mode = V8HImode; break;
45449 case V32QImode: mode = V16HImode; break;
45450 case V64QImode: mode = V32HImode; break;
45451 case V8HImode: mode = V4SImode; break;
45452 case V16HImode: mode = V8SImode; break;
45453 case V32HImode: mode = V16SImode; break;
45454 case V4SImode: mode = V2DImode; break;
45455 case V8SImode: mode = V4DImode; break;
45456 case V16SImode: mode = V8DImode; break;
45457 default: return false;
45459 for (i = 0; i < d->nelt; i += 2)
45460 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45461 return false;
45462 nd->vmode = mode;
45463 nd->nelt = d->nelt / 2;
45464 for (i = 0; i < nd->nelt; i++)
45465 nd->perm[i] = d->perm[2 * i] / 2;
45466 if (GET_MODE_INNER (mode) != DImode)
45467 canonicalize_vector_int_perm (nd, nd);
45468 if (nd != d)
45470 nd->one_operand_p = d->one_operand_p;
45471 nd->testing_p = d->testing_p;
45472 if (d->op0 == d->op1)
45473 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45474 else
45476 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45477 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45479 if (d->testing_p)
45480 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45481 else
45482 nd->target = gen_reg_rtx (nd->vmode);
45484 return true;
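/* Added worked example: the V16QImode permutation
     {2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}
   moves bytes strictly in aligned pairs, so it canonicalizes to the
   V8HImode permutation {1,0,3,2,5,4,7,6}; the recursive call cannot widen
   further to V4SImode because the new element 0 is odd, so the V8HImode
   form is what gets tried.  */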
45487 /* Try to expand one-operand permutation with constant mask. */
45489 static bool
45490 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45492 machine_mode mode = GET_MODE (d->op0);
45493 machine_mode maskmode = mode;
45494 rtx (*gen) (rtx, rtx, rtx) = NULL;
45495 rtx target, op0, mask;
45496 rtx vec[64];
45498 if (!rtx_equal_p (d->op0, d->op1))
45499 return false;
45501 if (!TARGET_AVX512F)
45502 return false;
45504 switch (mode)
45506 case V16SImode:
45507 gen = gen_avx512f_permvarv16si;
45508 break;
45509 case V16SFmode:
45510 gen = gen_avx512f_permvarv16sf;
45511 maskmode = V16SImode;
45512 break;
45513 case V8DImode:
45514 gen = gen_avx512f_permvarv8di;
45515 break;
45516 case V8DFmode:
45517 gen = gen_avx512f_permvarv8df;
45518 maskmode = V8DImode;
45519 break;
45520 default:
45521 return false;
45524 target = d->target;
45525 op0 = d->op0;
45526 for (int i = 0; i < d->nelt; ++i)
45527 vec[i] = GEN_INT (d->perm[i]);
45528 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45529 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45530 return true;
45533 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
45534 in a single instruction. */
45536 static bool
45537 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45539 unsigned i, nelt = d->nelt;
45540 struct expand_vec_perm_d nd;
45542 /* Check plain VEC_SELECT first, because AVX has instructions that could
45543 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45544 input where SEL+CONCAT may not. */
45545 if (d->one_operand_p)
45547 int mask = nelt - 1;
45548 bool identity_perm = true;
45549 bool broadcast_perm = true;
45551 for (i = 0; i < nelt; i++)
45553 nd.perm[i] = d->perm[i] & mask;
45554 if (nd.perm[i] != i)
45555 identity_perm = false;
45556 if (nd.perm[i])
45557 broadcast_perm = false;
45560 if (identity_perm)
45562 if (!d->testing_p)
45563 emit_move_insn (d->target, d->op0);
45564 return true;
45566 else if (broadcast_perm && TARGET_AVX2)
45568 /* Use vpbroadcast{b,w,d}. */
45569 rtx (*gen) (rtx, rtx) = NULL;
45570 switch (d->vmode)
45572 case V64QImode:
45573 if (TARGET_AVX512BW)
45574 gen = gen_avx512bw_vec_dupv64qi_1;
45575 break;
45576 case V32QImode:
45577 gen = gen_avx2_pbroadcastv32qi_1;
45578 break;
45579 case V32HImode:
45580 if (TARGET_AVX512BW)
45581 gen = gen_avx512bw_vec_dupv32hi_1;
45582 break;
45583 case V16HImode:
45584 gen = gen_avx2_pbroadcastv16hi_1;
45585 break;
45586 case V16SImode:
45587 if (TARGET_AVX512F)
45588 gen = gen_avx512f_vec_dupv16si_1;
45589 break;
45590 case V8SImode:
45591 gen = gen_avx2_pbroadcastv8si_1;
45592 break;
45593 case V16QImode:
45594 gen = gen_avx2_pbroadcastv16qi;
45595 break;
45596 case V8HImode:
45597 gen = gen_avx2_pbroadcastv8hi;
45598 break;
45599 case V16SFmode:
45600 if (TARGET_AVX512F)
45601 gen = gen_avx512f_vec_dupv16sf_1;
45602 break;
45603 case V8SFmode:
45604 gen = gen_avx2_vec_dupv8sf_1;
45605 break;
45606 case V8DFmode:
45607 if (TARGET_AVX512F)
45608 gen = gen_avx512f_vec_dupv8df_1;
45609 break;
45610 case V8DImode:
45611 if (TARGET_AVX512F)
45612 gen = gen_avx512f_vec_dupv8di_1;
45613 break;
45614 /* For other modes prefer other shuffles this function creates. */
45615 default: break;
45617 if (gen != NULL)
45619 if (!d->testing_p)
45620 emit_insn (gen (d->target, d->op0));
45621 return true;
45625 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45626 return true;
45628 /* There are plenty of patterns in sse.md that are written for
45629 SEL+CONCAT and are not replicated for a single op. Perhaps
45630 that should be changed, to avoid the nastiness here. */
45632 /* Recognize interleave style patterns, which means incrementing
45633 every other permutation operand. */
45634 for (i = 0; i < nelt; i += 2)
45636 nd.perm[i] = d->perm[i] & mask;
45637 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45639 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45640 d->testing_p))
45641 return true;
45643 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45644 if (nelt >= 4)
45646 for (i = 0; i < nelt; i += 4)
45648 nd.perm[i + 0] = d->perm[i + 0] & mask;
45649 nd.perm[i + 1] = d->perm[i + 1] & mask;
45650 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45651 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45654 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45655 d->testing_p))
45656 return true;
45660 /* Finally, try the fully general two operand permute. */
45661 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45662 d->testing_p))
45663 return true;
45665 /* Recognize interleave style patterns with reversed operands. */
45666 if (!d->one_operand_p)
45668 for (i = 0; i < nelt; ++i)
45670 unsigned e = d->perm[i];
45671 if (e >= nelt)
45672 e -= nelt;
45673 else
45674 e += nelt;
45675 nd.perm[i] = e;
45678 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45679 d->testing_p))
45680 return true;
45683 /* Try the SSE4.1 blend variable merge instructions. */
45684 if (expand_vec_perm_blend (d))
45685 return true;
45687 /* Try one of the AVX vpermil variable permutations. */
45688 if (expand_vec_perm_vpermil (d))
45689 return true;
45691 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45692 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45693 if (expand_vec_perm_pshufb (d))
45694 return true;
45696 /* Try the AVX2 vpalignr instruction. */
45697 if (expand_vec_perm_palignr (d, true))
45698 return true;
45700 /* Try the AVX512F vperm{s,d} instructions. */
45701 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45702 return true;
45704 /* Try the AVX512F vpermi2 instructions. */
45705 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45706 return true;
45708 /* See if we can get the same permutation in different vector integer
45709 mode. */
45710 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45712 if (!d->testing_p)
45713 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45714 return true;
45716 return false;
45719 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
45720 in terms of a pair of pshuflw + pshufhw instructions. */
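/* Worked example (editor's sketch, not referenced by the code): the
   one-operand V8HImode permutation { 2, 0, 3, 1, 5, 7, 4, 6 } draws its
   low four elements from the low quadword and its high four from the
   high quadword, so it is emitted as a pshuflw with
   { 2, 0, 3, 1, 4, 5, 6, 7 } followed by a pshufhw with
   { 0, 1, 2, 3, 5, 7, 4, 6 }.  */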
45722 static bool
45723 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45725 unsigned char perm2[MAX_VECT_LEN];
45726 unsigned i;
45727 bool ok;
45729 if (d->vmode != V8HImode || !d->one_operand_p)
45730 return false;
45732 /* The two permutations only operate in 64-bit lanes. */
45733 for (i = 0; i < 4; ++i)
45734 if (d->perm[i] >= 4)
45735 return false;
45736 for (i = 4; i < 8; ++i)
45737 if (d->perm[i] < 4)
45738 return false;
45740 if (d->testing_p)
45741 return true;
45743 /* Emit the pshuflw. */
45744 memcpy (perm2, d->perm, 4);
45745 for (i = 4; i < 8; ++i)
45746 perm2[i] = i;
45747 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45748 gcc_assert (ok);
45750 /* Emit the pshufhw. */
45751 memcpy (perm2 + 4, d->perm + 4, 4);
45752 for (i = 0; i < 4; ++i)
45753 perm2[i] = i;
45754 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45755 gcc_assert (ok);
45757 return true;
45760 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45761 the permutation using the SSSE3 palignr instruction. This succeeds
45762 when all of the elements in PERM fit within one vector and we merely
45763 need to shift them down so that a single vector permutation has a
45764 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
45765 the vpalignr instruction itself can perform the requested permutation. */
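/* Illustrative case (editor's sketch): for two V16QImode operands with
   perm = { 4, 5, ..., 19 } every selected element lies in the 16-byte
   window starting at byte 4 of the concatenated operands, so a single
   palignr by 4 bytes already produces the desired vector (the in_order
   case); for other windows a one-operand shuffle is emitted afterwards.  */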
45767 static bool
45768 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45770 unsigned i, nelt = d->nelt;
45771 unsigned min, max, minswap, maxswap;
45772 bool in_order, ok, swap = false;
45773 rtx shift, target;
45774 struct expand_vec_perm_d dcopy;
45776 /* Even with AVX, palignr only operates on 128-bit vectors;
45777 with AVX2, palignr operates on each of the two 128-bit lanes. */
45778 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45779 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45780 return false;
45782 min = 2 * nelt;
45783 max = 0;
45784 minswap = 2 * nelt;
45785 maxswap = 0;
45786 for (i = 0; i < nelt; ++i)
45788 unsigned e = d->perm[i];
45789 unsigned eswap = d->perm[i] ^ nelt;
45790 if (GET_MODE_SIZE (d->vmode) == 32)
45792 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45793 eswap = e ^ (nelt / 2);
45795 if (e < min)
45796 min = e;
45797 if (e > max)
45798 max = e;
45799 if (eswap < minswap)
45800 minswap = eswap;
45801 if (eswap > maxswap)
45802 maxswap = eswap;
45804 if (min == 0
45805 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45807 if (d->one_operand_p
45808 || minswap == 0
45809 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45810 ? nelt / 2 : nelt))
45811 return false;
45812 swap = true;
45813 min = minswap;
45814 max = maxswap;
45817 /* Given that we have SSSE3, we know we'll be able to implement the
45818 single operand permutation after the palignr with pshufb for
45819 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45820 first. */
45821 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45822 return true;
45824 dcopy = *d;
45825 if (swap)
45827 dcopy.op0 = d->op1;
45828 dcopy.op1 = d->op0;
45829 for (i = 0; i < nelt; ++i)
45830 dcopy.perm[i] ^= nelt;
45833 in_order = true;
45834 for (i = 0; i < nelt; ++i)
45836 unsigned e = dcopy.perm[i];
45837 if (GET_MODE_SIZE (d->vmode) == 32
45838 && e >= nelt
45839 && (e & (nelt / 2 - 1)) < min)
45840 e = e - min - (nelt / 2);
45841 else
45842 e = e - min;
45843 if (e != i)
45844 in_order = false;
45845 dcopy.perm[i] = e;
45847 dcopy.one_operand_p = true;
45849 if (single_insn_only_p && !in_order)
45850 return false;
45852 /* For AVX2, test whether we can permute the result in one instruction. */
45853 if (d->testing_p)
45855 if (in_order)
45856 return true;
45857 dcopy.op1 = dcopy.op0;
45858 return expand_vec_perm_1 (&dcopy);
45861 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45862 if (GET_MODE_SIZE (d->vmode) == 16)
45864 target = gen_reg_rtx (TImode);
45865 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45866 gen_lowpart (TImode, dcopy.op0), shift));
45868 else
45870 target = gen_reg_rtx (V2TImode);
45871 emit_insn (gen_avx2_palignrv2ti (target,
45872 gen_lowpart (V2TImode, dcopy.op1),
45873 gen_lowpart (V2TImode, dcopy.op0),
45874 shift));
45877 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45879 /* Test for the degenerate case where the alignment by itself
45880 produces the desired permutation. */
45881 if (in_order)
45883 emit_move_insn (d->target, dcopy.op0);
45884 return true;
45887 ok = expand_vec_perm_1 (&dcopy);
45888 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45890 return ok;
45893 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45894 the permutation using the SSE4_1 pblendv instruction. Potentially
45895 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and 1 pblendv. */
45897 static bool
45898 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45900 unsigned i, which, nelt = d->nelt;
45901 struct expand_vec_perm_d dcopy, dcopy1;
45902 machine_mode vmode = d->vmode;
45903 bool ok;
45905 /* Use the same checks as in expand_vec_perm_blend. */
45906 if (d->one_operand_p)
45907 return false;
45908 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45910 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45912 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45914 else
45915 return false;
45917 /* Figure out which permutation elements do not stay in their
45918 respective lanes. */
45919 for (i = 0, which = 0; i < nelt; ++i)
45921 unsigned e = d->perm[i];
45922 if (e != i)
45923 which |= (e < nelt ? 1 : 2);
45925 /* We can pblend the part where elements do not stay in their
45926 respective lanes only when these elements are all in one
45927 half of the permutation.
45928 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
45929 lanes, but both 8 and 9 >= 8;
45930 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
45931 respective lanes and 8 >= 8, but 2 is not. */
45932 if (which != 1 && which != 2)
45933 return false;
45934 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45935 return true;
45937 /* First we apply a one-operand permutation to the part where
45938 elements do not stay in their respective lanes. */
45939 dcopy = *d;
45940 if (which == 2)
45941 dcopy.op0 = dcopy.op1 = d->op1;
45942 else
45943 dcopy.op0 = dcopy.op1 = d->op0;
45944 if (!d->testing_p)
45945 dcopy.target = gen_reg_rtx (vmode);
45946 dcopy.one_operand_p = true;
45948 for (i = 0; i < nelt; ++i)
45949 dcopy.perm[i] = d->perm[i] & (nelt - 1);
45951 ok = expand_vec_perm_1 (&dcopy);
45952 if (GET_MODE_SIZE (vmode) != 16 && !ok)
45953 return false;
45954 else
45955 gcc_assert (ok);
45956 if (d->testing_p)
45957 return true;
45959 /* Next we put permuted elements into their positions. */
45960 dcopy1 = *d;
45961 if (which == 2)
45962 dcopy1.op1 = dcopy.target;
45963 else
45964 dcopy1.op0 = dcopy.target;
45966 for (i = 0; i < nelt; ++i)
45967 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
45969 ok = expand_vec_perm_blend (&dcopy1);
45970 gcc_assert (ok);
45972 return true;
45975 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
45977 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45978 a two vector permutation into a single vector permutation by using
45979 an interleave operation to merge the vectors. */
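/* Illustrative example (editor's sketch, not referenced by the code):
   for two V4SImode operands, perm = { 0, 4, 1, 5 } only uses the two low
   halves, so the remap step selects punpckldq, which already yields
   { 0, 4, 1, 5 }, and the final single-insn permutation degenerates to
   the identity.  */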
45981 static bool
45982 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
45984 struct expand_vec_perm_d dremap, dfinal;
45985 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
45986 unsigned HOST_WIDE_INT contents;
45987 unsigned char remap[2 * MAX_VECT_LEN];
45988 rtx_insn *seq;
45989 bool ok, same_halves = false;
45991 if (GET_MODE_SIZE (d->vmode) == 16)
45993 if (d->one_operand_p)
45994 return false;
45996 else if (GET_MODE_SIZE (d->vmode) == 32)
45998 if (!TARGET_AVX)
45999 return false;
46000 /* For 32-byte modes allow this even for d->one_operand_p.
46001 The lack of cross-lane shuffling in some instructions
46002 might prevent a single insn shuffle. */
46003 dfinal = *d;
46004 dfinal.testing_p = true;
46005 /* If expand_vec_perm_interleave3 can expand this into
46006 a 3 insn sequence, give up and let it be expanded as
46007 a 3 insn sequence. While that is one insn longer,
46008 it doesn't need a memory operand, and in the common
46009 case that both the interleave low and high permutations
46010 with the same operands are adjacent it needs only 4 insns
46011 for both after CSE. */
46012 if (expand_vec_perm_interleave3 (&dfinal))
46013 return false;
46015 else
46016 return false;
46018 /* Examine from whence the elements come. */
46019 contents = 0;
46020 for (i = 0; i < nelt; ++i)
46021 contents |= HOST_WIDE_INT_1U << d->perm[i];
46023 memset (remap, 0xff, sizeof (remap));
46024 dremap = *d;
46026 if (GET_MODE_SIZE (d->vmode) == 16)
46028 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46030 /* Split the two input vectors into 4 halves. */
46031 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46032 h2 = h1 << nelt2;
46033 h3 = h2 << nelt2;
46034 h4 = h3 << nelt2;
46036 /* If the elements are all from the low halves, use interleave low;
46037 similarly for interleave high. If the elements are from mis-matched
46038 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
46039 if ((contents & (h1 | h3)) == contents)
46041 /* punpckl* */
46042 for (i = 0; i < nelt2; ++i)
46044 remap[i] = i * 2;
46045 remap[i + nelt] = i * 2 + 1;
46046 dremap.perm[i * 2] = i;
46047 dremap.perm[i * 2 + 1] = i + nelt;
46049 if (!TARGET_SSE2 && d->vmode == V4SImode)
46050 dremap.vmode = V4SFmode;
46052 else if ((contents & (h2 | h4)) == contents)
46054 /* punpckh* */
46055 for (i = 0; i < nelt2; ++i)
46057 remap[i + nelt2] = i * 2;
46058 remap[i + nelt + nelt2] = i * 2 + 1;
46059 dremap.perm[i * 2] = i + nelt2;
46060 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46062 if (!TARGET_SSE2 && d->vmode == V4SImode)
46063 dremap.vmode = V4SFmode;
46065 else if ((contents & (h1 | h4)) == contents)
46067 /* shufps */
46068 for (i = 0; i < nelt2; ++i)
46070 remap[i] = i;
46071 remap[i + nelt + nelt2] = i + nelt2;
46072 dremap.perm[i] = i;
46073 dremap.perm[i + nelt2] = i + nelt + nelt2;
46075 if (nelt != 4)
46077 /* shufpd */
46078 dremap.vmode = V2DImode;
46079 dremap.nelt = 2;
46080 dremap.perm[0] = 0;
46081 dremap.perm[1] = 3;
46084 else if ((contents & (h2 | h3)) == contents)
46086 /* shufps */
46087 for (i = 0; i < nelt2; ++i)
46089 remap[i + nelt2] = i;
46090 remap[i + nelt] = i + nelt2;
46091 dremap.perm[i] = i + nelt2;
46092 dremap.perm[i + nelt2] = i + nelt;
46094 if (nelt != 4)
46096 /* shufpd */
46097 dremap.vmode = V2DImode;
46098 dremap.nelt = 2;
46099 dremap.perm[0] = 1;
46100 dremap.perm[1] = 2;
46103 else
46104 return false;
46106 else
46108 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46109 unsigned HOST_WIDE_INT q[8];
46110 unsigned int nonzero_halves[4];
46112 /* Split the two input vectors into 8 quarters. */
46113 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46114 for (i = 1; i < 8; ++i)
46115 q[i] = q[0] << (nelt4 * i);
46116 for (i = 0; i < 4; ++i)
46117 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46119 nonzero_halves[nzcnt] = i;
46120 ++nzcnt;
46123 if (nzcnt == 1)
46125 gcc_assert (d->one_operand_p);
46126 nonzero_halves[1] = nonzero_halves[0];
46127 same_halves = true;
46129 else if (d->one_operand_p)
46131 gcc_assert (nonzero_halves[0] == 0);
46132 gcc_assert (nonzero_halves[1] == 1);
46135 if (nzcnt <= 2)
46137 if (d->perm[0] / nelt2 == nonzero_halves[1])
46139 /* Attempt to increase the likelihood that dfinal
46140 shuffle will be intra-lane. */
46141 std::swap (nonzero_halves[0], nonzero_halves[1]);
46144 /* vperm2f128 or vperm2i128. */
46145 for (i = 0; i < nelt2; ++i)
46147 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46148 remap[i + nonzero_halves[0] * nelt2] = i;
46149 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46150 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46153 if (d->vmode != V8SFmode
46154 && d->vmode != V4DFmode
46155 && d->vmode != V8SImode)
46157 dremap.vmode = V8SImode;
46158 dremap.nelt = 8;
46159 for (i = 0; i < 4; ++i)
46161 dremap.perm[i] = i + nonzero_halves[0] * 4;
46162 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46166 else if (d->one_operand_p)
46167 return false;
46168 else if (TARGET_AVX2
46169 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46171 /* vpunpckl* */
46172 for (i = 0; i < nelt4; ++i)
46174 remap[i] = i * 2;
46175 remap[i + nelt] = i * 2 + 1;
46176 remap[i + nelt2] = i * 2 + nelt2;
46177 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46178 dremap.perm[i * 2] = i;
46179 dremap.perm[i * 2 + 1] = i + nelt;
46180 dremap.perm[i * 2 + nelt2] = i + nelt2;
46181 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46184 else if (TARGET_AVX2
46185 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46187 /* vpunpckh* */
46188 for (i = 0; i < nelt4; ++i)
46190 remap[i + nelt4] = i * 2;
46191 remap[i + nelt + nelt4] = i * 2 + 1;
46192 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46193 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46194 dremap.perm[i * 2] = i + nelt4;
46195 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46196 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46197 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46200 else
46201 return false;
46204 /* Use the remapping array set up above to move the elements from their
46205 swizzled locations into their final destinations. */
46206 dfinal = *d;
46207 for (i = 0; i < nelt; ++i)
46209 unsigned e = remap[d->perm[i]];
46210 gcc_assert (e < nelt);
46211 /* If same_halves is true, both halves of the remapped vector are the
46212 same. Avoid cross-lane accesses if possible. */
46213 if (same_halves && i >= nelt2)
46215 gcc_assert (e < nelt2);
46216 dfinal.perm[i] = e + nelt2;
46218 else
46219 dfinal.perm[i] = e;
46221 if (!d->testing_p)
46223 dremap.target = gen_reg_rtx (dremap.vmode);
46224 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46226 dfinal.op1 = dfinal.op0;
46227 dfinal.one_operand_p = true;
46229 /* Test if the final remap can be done with a single insn. For V4SFmode or
46230 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46231 start_sequence ();
46232 ok = expand_vec_perm_1 (&dfinal);
46233 seq = get_insns ();
46234 end_sequence ();
46236 if (!ok)
46237 return false;
46239 if (d->testing_p)
46240 return true;
46242 if (dremap.vmode != dfinal.vmode)
46244 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46245 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46248 ok = expand_vec_perm_1 (&dremap);
46249 gcc_assert (ok);
46251 emit_insn (seq);
46252 return true;
46255 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46256 a single vector cross-lane permutation into vpermq followed
46257 by any of the single insn permutations. */
46259 static bool
46260 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46262 struct expand_vec_perm_d dremap, dfinal;
46263 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46264 unsigned contents[2];
46265 bool ok;
46267 if (!(TARGET_AVX2
46268 && (d->vmode == V32QImode || d->vmode == V16HImode)
46269 && d->one_operand_p))
46270 return false;
46272 contents[0] = 0;
46273 contents[1] = 0;
46274 for (i = 0; i < nelt2; ++i)
46276 contents[0] |= 1u << (d->perm[i] / nelt4);
46277 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46280 for (i = 0; i < 2; ++i)
46282 unsigned int cnt = 0;
46283 for (j = 0; j < 4; ++j)
46284 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46285 return false;
46288 if (d->testing_p)
46289 return true;
46291 dremap = *d;
46292 dremap.vmode = V4DImode;
46293 dremap.nelt = 4;
46294 dremap.target = gen_reg_rtx (V4DImode);
46295 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46296 dremap.op1 = dremap.op0;
46297 dremap.one_operand_p = true;
46298 for (i = 0; i < 2; ++i)
46300 unsigned int cnt = 0;
46301 for (j = 0; j < 4; ++j)
46302 if ((contents[i] & (1u << j)) != 0)
46303 dremap.perm[2 * i + cnt++] = j;
46304 for (; cnt < 2; ++cnt)
46305 dremap.perm[2 * i + cnt] = 0;
46308 dfinal = *d;
46309 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46310 dfinal.op1 = dfinal.op0;
46311 dfinal.one_operand_p = true;
46312 for (i = 0, j = 0; i < nelt; ++i)
46314 if (i == nelt2)
46315 j = 2;
46316 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46317 if ((d->perm[i] / nelt4) == dremap.perm[j])
46319 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46320 dfinal.perm[i] |= nelt4;
46321 else
46322 gcc_unreachable ();
46325 ok = expand_vec_perm_1 (&dremap);
46326 gcc_assert (ok);
46328 ok = expand_vec_perm_1 (&dfinal);
46329 gcc_assert (ok);
46331 return true;
46334 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
46335 a vector permutation using two instructions, vperm2f128 resp.
46336 vperm2i128 followed by any single in-lane permutation. */
46338 static bool
46339 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46341 struct expand_vec_perm_d dfirst, dsecond;
46342 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46343 bool ok;
46345 if (!TARGET_AVX
46346 || GET_MODE_SIZE (d->vmode) != 32
46347 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46348 return false;
46350 dsecond = *d;
46351 dsecond.one_operand_p = false;
46352 dsecond.testing_p = true;
46354 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46355 immediate. For perm < 16 the second permutation uses
46356 d->op0 as first operand, for perm >= 16 it uses d->op1
46357 as first operand. The second operand is the result of
46358 vperm2[fi]128. */
46359 for (perm = 0; perm < 32; perm++)
46361 /* Ignore permutations which do not move anything cross-lane. */
46362 if (perm < 16)
46364 /* The second shuffle for e.g. V4DFmode has
46365 0123 and ABCD operands.
46366 Ignore AB23, as 23 is already in the second lane
46367 of the first operand. */
46368 if ((perm & 0xc) == (1 << 2)) continue;
46369 /* And 01CD, as 01 is in the first lane of the first
46370 operand. */
46371 if ((perm & 3) == 0) continue;
46372 /* And 4567, as then the vperm2[fi]128 doesn't change
46373 anything on the original 4567 second operand. */
46374 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46376 else
46378 /* The second shuffle for e.g. V4DFmode has
46379 4567 and ABCD operands.
46380 Ignore AB67, as 67 is already in the second lane
46381 of the first operand. */
46382 if ((perm & 0xc) == (3 << 2)) continue;
46383 /* And 45CD, as 45 is in the first lane of the first
46384 operand. */
46385 if ((perm & 3) == 2) continue;
46386 /* And 0123, as then the vperm2[fi]128 doesn't change
46387 anything on the original 0123 first operand. */
46388 if ((perm & 0xf) == (1 << 2)) continue;
46391 for (i = 0; i < nelt; i++)
46393 j = d->perm[i] / nelt2;
46394 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46395 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46396 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46397 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46398 else
46399 break;
46402 if (i == nelt)
46404 start_sequence ();
46405 ok = expand_vec_perm_1 (&dsecond);
46406 end_sequence ();
46408 else
46409 ok = false;
46411 if (ok)
46413 if (d->testing_p)
46414 return true;
46416 /* Found a usable second shuffle. dfirst will be
46417 vperm2f128 on d->op0 and d->op1. */
46418 dsecond.testing_p = false;
46419 dfirst = *d;
46420 dfirst.target = gen_reg_rtx (d->vmode);
46421 for (i = 0; i < nelt; i++)
46422 dfirst.perm[i] = (i & (nelt2 - 1))
46423 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46425 canonicalize_perm (&dfirst);
46426 ok = expand_vec_perm_1 (&dfirst);
46427 gcc_assert (ok);
46429 /* And dsecond is some single insn shuffle, taking
46430 d->op0 and result of vperm2f128 (if perm < 16) or
46431 d->op1 and result of vperm2f128 (otherwise). */
46432 if (perm >= 16)
46433 dsecond.op0 = dsecond.op1;
46434 dsecond.op1 = dfirst.target;
46436 ok = expand_vec_perm_1 (&dsecond);
46437 gcc_assert (ok);
46439 return true;
46442 /* For one operand, the only useful vperm2f128 permutation is 0x01
46443 aka lanes swap. */
46444 if (d->one_operand_p)
46445 return false;
46448 return false;
46451 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46452 a two vector permutation using 2 intra-lane interleave insns
46453 and cross-lane shuffle for 32-byte vectors. */
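/* Illustrative example (editor's sketch, not referenced by the code):
   for two V8SImode operands this accepts exactly the interleave-low
   pattern { 0, 8, 1, 9, 2, 10, 3, 11 } and the interleave-high pattern
   { 4, 12, 5, 13, 6, 14, 7, 15 }.  */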
46455 static bool
46456 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46458 unsigned i, nelt;
46459 rtx (*gen) (rtx, rtx, rtx);
46461 if (d->one_operand_p)
46462 return false;
46463 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46465 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46467 else
46468 return false;
46470 nelt = d->nelt;
46471 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46472 return false;
46473 for (i = 0; i < nelt; i += 2)
46474 if (d->perm[i] != d->perm[0] + i / 2
46475 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46476 return false;
46478 if (d->testing_p)
46479 return true;
46481 switch (d->vmode)
46483 case V32QImode:
46484 if (d->perm[0])
46485 gen = gen_vec_interleave_highv32qi;
46486 else
46487 gen = gen_vec_interleave_lowv32qi;
46488 break;
46489 case V16HImode:
46490 if (d->perm[0])
46491 gen = gen_vec_interleave_highv16hi;
46492 else
46493 gen = gen_vec_interleave_lowv16hi;
46494 break;
46495 case V8SImode:
46496 if (d->perm[0])
46497 gen = gen_vec_interleave_highv8si;
46498 else
46499 gen = gen_vec_interleave_lowv8si;
46500 break;
46501 case V4DImode:
46502 if (d->perm[0])
46503 gen = gen_vec_interleave_highv4di;
46504 else
46505 gen = gen_vec_interleave_lowv4di;
46506 break;
46507 case V8SFmode:
46508 if (d->perm[0])
46509 gen = gen_vec_interleave_highv8sf;
46510 else
46511 gen = gen_vec_interleave_lowv8sf;
46512 break;
46513 case V4DFmode:
46514 if (d->perm[0])
46515 gen = gen_vec_interleave_highv4df;
46516 else
46517 gen = gen_vec_interleave_lowv4df;
46518 break;
46519 default:
46520 gcc_unreachable ();
46523 emit_insn (gen (d->target, d->op0, d->op1));
46524 return true;
46527 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
46528 a single vector permutation using a single intra-lane vector
46529 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46530 the non-swapped and swapped vectors together. */
46532 static bool
46533 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46535 struct expand_vec_perm_d dfirst, dsecond;
46536 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46537 rtx_insn *seq;
46538 bool ok;
46539 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46541 if (!TARGET_AVX
46542 || TARGET_AVX2
46543 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46544 || !d->one_operand_p)
46545 return false;
46547 dfirst = *d;
46548 for (i = 0; i < nelt; i++)
46549 dfirst.perm[i] = 0xff;
46550 for (i = 0, msk = 0; i < nelt; i++)
46552 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46553 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46554 return false;
46555 dfirst.perm[j] = d->perm[i];
46556 if (j != i)
46557 msk |= (1 << i);
46559 for (i = 0; i < nelt; i++)
46560 if (dfirst.perm[i] == 0xff)
46561 dfirst.perm[i] = i;
46563 if (!d->testing_p)
46564 dfirst.target = gen_reg_rtx (dfirst.vmode);
46566 start_sequence ();
46567 ok = expand_vec_perm_1 (&dfirst);
46568 seq = get_insns ();
46569 end_sequence ();
46571 if (!ok)
46572 return false;
46574 if (d->testing_p)
46575 return true;
46577 emit_insn (seq);
46579 dsecond = *d;
46580 dsecond.op0 = dfirst.target;
46581 dsecond.op1 = dfirst.target;
46582 dsecond.one_operand_p = true;
46583 dsecond.target = gen_reg_rtx (dsecond.vmode);
46584 for (i = 0; i < nelt; i++)
46585 dsecond.perm[i] = i ^ nelt2;
46587 ok = expand_vec_perm_1 (&dsecond);
46588 gcc_assert (ok);
46590 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46591 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46592 return true;
46595 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
46596 permutation using two vperm2f128, followed by a vshufpd insn blending
46597 the two vectors together. */
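/* Worked example (editor's sketch, not referenced by the code): for the
   V4DFmode permutation { 2, 5, 1, 6 }, dfirst becomes { 2, 3, 0, 1 },
   dsecond becomes { 4, 5, 6, 7 }, and the final shuffle uses
   { 0, 5, 3, 6 }, which picks elements 2, 5, 1 and 6 in that order.  */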
46599 static bool
46600 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46602 struct expand_vec_perm_d dfirst, dsecond, dthird;
46603 bool ok;
46605 if (!TARGET_AVX || (d->vmode != V4DFmode))
46606 return false;
46608 if (d->testing_p)
46609 return true;
46611 dfirst = *d;
46612 dsecond = *d;
46613 dthird = *d;
46615 dfirst.perm[0] = (d->perm[0] & ~1);
46616 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46617 dfirst.perm[2] = (d->perm[2] & ~1);
46618 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46619 dsecond.perm[0] = (d->perm[1] & ~1);
46620 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46621 dsecond.perm[2] = (d->perm[3] & ~1);
46622 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46623 dthird.perm[0] = (d->perm[0] % 2);
46624 dthird.perm[1] = (d->perm[1] % 2) + 4;
46625 dthird.perm[2] = (d->perm[2] % 2) + 2;
46626 dthird.perm[3] = (d->perm[3] % 2) + 6;
46628 dfirst.target = gen_reg_rtx (dfirst.vmode);
46629 dsecond.target = gen_reg_rtx (dsecond.vmode);
46630 dthird.op0 = dfirst.target;
46631 dthird.op1 = dsecond.target;
46632 dthird.one_operand_p = false;
46634 canonicalize_perm (&dfirst);
46635 canonicalize_perm (&dsecond);
46637 ok = expand_vec_perm_1 (&dfirst)
46638 && expand_vec_perm_1 (&dsecond)
46639 && expand_vec_perm_1 (&dthird);
46641 gcc_assert (ok);
46643 return true;
46646 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46647 permutation with two pshufb insns and an ior. We should have already
46648 failed all two instruction sequences. */
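/* Illustrative mask construction (editor's sketch): if element 0 of a
   V16QImode permutation is 18, i.e. byte 2 of the second operand, then
   rperm[0][0] is set to -128 so the first pshufb zeroes that byte and
   rperm[1][0] is set to 2 so the second pshufb fetches it; the ior then
   merges the two partial results.  */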
46650 static bool
46651 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46653 rtx rperm[2][16], vperm, l, h, op, m128;
46654 unsigned int i, nelt, eltsz;
46656 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46657 return false;
46658 gcc_assert (!d->one_operand_p);
46660 if (d->testing_p)
46661 return true;
46663 nelt = d->nelt;
46664 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46666 /* Generate two permutation masks. If the required element is within
46667 the given vector it is shuffled into the proper lane. If the required
46668 element is in the other vector, force a zero into the lane by setting
46669 bit 7 in the permutation mask. */
46670 m128 = GEN_INT (-128);
46671 for (i = 0; i < nelt; ++i)
46673 unsigned j, e = d->perm[i];
46674 unsigned which = (e >= nelt);
46675 if (e >= nelt)
46676 e -= nelt;
46678 for (j = 0; j < eltsz; ++j)
46680 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46681 rperm[1-which][i*eltsz + j] = m128;
46685 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46686 vperm = force_reg (V16QImode, vperm);
46688 l = gen_reg_rtx (V16QImode);
46689 op = gen_lowpart (V16QImode, d->op0);
46690 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46692 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46693 vperm = force_reg (V16QImode, vperm);
46695 h = gen_reg_rtx (V16QImode);
46696 op = gen_lowpart (V16QImode, d->op1);
46697 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46699 op = d->target;
46700 if (d->vmode != V16QImode)
46701 op = gen_reg_rtx (V16QImode);
46702 emit_insn (gen_iorv16qi3 (op, l, h));
46703 if (op != d->target)
46704 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46706 return true;
46709 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46710 with two vpshufb insns, vpermq and vpor. We should have already failed
46711 all two or three instruction sequences. */
46713 static bool
46714 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46716 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46717 unsigned int i, nelt, eltsz;
46719 if (!TARGET_AVX2
46720 || !d->one_operand_p
46721 || (d->vmode != V32QImode && d->vmode != V16HImode))
46722 return false;
46724 if (d->testing_p)
46725 return true;
46727 nelt = d->nelt;
46728 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46730 /* Generate two permutation masks. If the required element is within
46731 the same lane, it is shuffled in. If the required element is from the
46732 other lane, force a zero by setting bit 7 in the permutation mask.
46733 The other mask has a non-negative element wherever an element
46734 is requested from the other lane; that element is also moved to the
46735 other lane, so that the result of vpshufb can have the two V2TImode
46736 halves swapped. */
46737 m128 = GEN_INT (-128);
46738 for (i = 0; i < nelt; ++i)
46740 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46741 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46743 for (j = 0; j < eltsz; ++j)
46745 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46746 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46750 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46751 vperm = force_reg (V32QImode, vperm);
46753 h = gen_reg_rtx (V32QImode);
46754 op = gen_lowpart (V32QImode, d->op0);
46755 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46757 /* Swap the 128-bit lanes of h into hp. */
46758 hp = gen_reg_rtx (V4DImode);
46759 op = gen_lowpart (V4DImode, h);
46760 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46761 const1_rtx));
46763 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46764 vperm = force_reg (V32QImode, vperm);
46766 l = gen_reg_rtx (V32QImode);
46767 op = gen_lowpart (V32QImode, d->op0);
46768 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46770 op = d->target;
46771 if (d->vmode != V32QImode)
46772 op = gen_reg_rtx (V32QImode);
46773 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46774 if (op != d->target)
46775 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46777 return true;
46780 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46781 and extract-odd permutations of two V32QImode or V16HImode operands
46782 with two vpshufb insns, vpor and vpermq. We should have already
46783 failed all two or three instruction sequences. */
46785 static bool
46786 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46788 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46789 unsigned int i, nelt, eltsz;
46791 if (!TARGET_AVX2
46792 || d->one_operand_p
46793 || (d->vmode != V32QImode && d->vmode != V16HImode))
46794 return false;
46796 for (i = 0; i < d->nelt; ++i)
46797 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46798 return false;
46800 if (d->testing_p)
46801 return true;
46803 nelt = d->nelt;
46804 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46806 /* Generate two permutation masks. In the first permutation mask
46807 the first quarter will contain indexes for the first half
46808 of the op0, the second quarter will contain bit 7 set, third quarter
46809 will contain indexes for the second half of the op0 and the
46810 last quarter bit 7 set. In the second permutation mask
46811 the first quarter will contain bit 7 set, the second quarter
46812 indexes for the first half of the op1, the third quarter bit 7 set
46813 and last quarter indexes for the second half of the op1.
46814 I.e. the first mask e.g. for V32QImode extract even will be:
46815 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46816 (all values masked with 0xf except for -128) and second mask
46817 for extract even will be
46818 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46819 m128 = GEN_INT (-128);
46820 for (i = 0; i < nelt; ++i)
46822 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46823 unsigned which = d->perm[i] >= nelt;
46824 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46826 for (j = 0; j < eltsz; ++j)
46828 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46829 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46833 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46834 vperm = force_reg (V32QImode, vperm);
46836 l = gen_reg_rtx (V32QImode);
46837 op = gen_lowpart (V32QImode, d->op0);
46838 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46840 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46841 vperm = force_reg (V32QImode, vperm);
46843 h = gen_reg_rtx (V32QImode);
46844 op = gen_lowpart (V32QImode, d->op1);
46845 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46847 ior = gen_reg_rtx (V32QImode);
46848 emit_insn (gen_iorv32qi3 (ior, l, h));
46850 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46851 op = gen_reg_rtx (V4DImode);
46852 ior = gen_lowpart (V4DImode, ior);
46853 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46854 const1_rtx, GEN_INT (3)));
46855 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46857 return true;
46860 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46861 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46862 with two "and" and "pack" or two "shift" and "pack" insns. We should
46863 have already failed all two instruction sequences. */
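/* Sketch of the V16QImode case (editor's illustration, not referenced by
   the code): for the even permutation { 0, 2, ..., 30 } both operands are
   viewed as V8HImode, masked with 0x00ff so only the even bytes survive,
   and packuswb narrows and concatenates them; for the odd permutation
   { 1, 3, ..., 31 } the mask is replaced by a logical right shift by 8.  */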
46865 static bool
46866 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46868 rtx op, dop0, dop1, t, rperm[16];
46869 unsigned i, odd, c, s, nelt = d->nelt;
46870 bool end_perm = false;
46871 machine_mode half_mode;
46872 rtx (*gen_and) (rtx, rtx, rtx);
46873 rtx (*gen_pack) (rtx, rtx, rtx);
46874 rtx (*gen_shift) (rtx, rtx, rtx);
46876 if (d->one_operand_p)
46877 return false;
46879 switch (d->vmode)
46881 case V8HImode:
46882 /* Required for "pack". */
46883 if (!TARGET_SSE4_1)
46884 return false;
46885 c = 0xffff;
46886 s = 16;
46887 half_mode = V4SImode;
46888 gen_and = gen_andv4si3;
46889 gen_pack = gen_sse4_1_packusdw;
46890 gen_shift = gen_lshrv4si3;
46891 break;
46892 case V16QImode:
46893 /* No check as all instructions are SSE2. */
46894 c = 0xff;
46895 s = 8;
46896 half_mode = V8HImode;
46897 gen_and = gen_andv8hi3;
46898 gen_pack = gen_sse2_packuswb;
46899 gen_shift = gen_lshrv8hi3;
46900 break;
46901 case V16HImode:
46902 if (!TARGET_AVX2)
46903 return false;
46904 c = 0xffff;
46905 s = 16;
46906 half_mode = V8SImode;
46907 gen_and = gen_andv8si3;
46908 gen_pack = gen_avx2_packusdw;
46909 gen_shift = gen_lshrv8si3;
46910 end_perm = true;
46911 break;
46912 case V32QImode:
46913 if (!TARGET_AVX2)
46914 return false;
46915 c = 0xff;
46916 s = 8;
46917 half_mode = V16HImode;
46918 gen_and = gen_andv16hi3;
46919 gen_pack = gen_avx2_packuswb;
46920 gen_shift = gen_lshrv16hi3;
46921 end_perm = true;
46922 break;
46923 default:
46924 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46925 general shuffles. */
46926 return false;
46929 /* Check that permutation is even or odd. */
46930 odd = d->perm[0];
46931 if (odd > 1)
46932 return false;
46934 for (i = 1; i < nelt; ++i)
46935 if (d->perm[i] != 2 * i + odd)
46936 return false;
46938 if (d->testing_p)
46939 return true;
46941 dop0 = gen_reg_rtx (half_mode);
46942 dop1 = gen_reg_rtx (half_mode);
46943 if (odd == 0)
46945 for (i = 0; i < nelt / 2; i++)
46946 rperm[i] = GEN_INT (c);
46947 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
46948 t = force_reg (half_mode, t);
46949 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
46950 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
46952 else
46954 emit_insn (gen_shift (dop0,
46955 gen_lowpart (half_mode, d->op0),
46956 GEN_INT (s)));
46957 emit_insn (gen_shift (dop1,
46958 gen_lowpart (half_mode, d->op1),
46959 GEN_INT (s)));
46961 /* For the AVX2 256-bit case we need to permute the pack result. */
46962 if (TARGET_AVX2 && end_perm)
46964 op = gen_reg_rtx (d->vmode);
46965 t = gen_reg_rtx (V4DImode);
46966 emit_insn (gen_pack (op, dop0, dop1));
46967 emit_insn (gen_avx2_permv4di_1 (t,
46968 gen_lowpart (V4DImode, op),
46969 const0_rtx,
46970 const2_rtx,
46971 const1_rtx,
46972 GEN_INT (3)));
46973 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
46975 else
46976 emit_insn (gen_pack (d->target, dop0, dop1));
46978 return true;
46981 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46982 and extract-odd permutations of two V64QI operands
46983 with two "shifts", two "truncs" and one "concat" insns for "odd"
46984 and two "truncs" and one "concat" insn for "even".
46985 We should have already failed all two instruction sequences. */
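/* Illustrative example (editor's sketch, not referenced by the code):
   for the odd V64QImode permutation { 1, 3, ..., 127 } both operands are
   viewed as V32HImode and shifted right by 8 so the odd bytes land in the
   low byte of each word, the V32HI->V32QI truncation (vpmovwb) narrows
   each result, and the two halves are concatenated.  */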
46987 static bool
46988 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
46990 rtx t1, t2, t3, t4;
46991 unsigned i, odd, nelt = d->nelt;
46993 if (!TARGET_AVX512BW
46994 || d->one_operand_p
46995 || d->vmode != V64QImode)
46996 return false;
46998 /* Check that permutation is even or odd. */
46999 odd = d->perm[0];
47000 if (odd > 1)
47001 return false;
47003 for (i = 1; i < nelt; ++i)
47004 if (d->perm[i] != 2 * i + odd)
47005 return false;
47007 if (d->testing_p)
47008 return true;
47011 if (odd)
47013 t1 = gen_reg_rtx (V32HImode);
47014 t2 = gen_reg_rtx (V32HImode);
47015 emit_insn (gen_lshrv32hi3 (t1,
47016 gen_lowpart (V32HImode, d->op0),
47017 GEN_INT (8)));
47018 emit_insn (gen_lshrv32hi3 (t2,
47019 gen_lowpart (V32HImode, d->op1),
47020 GEN_INT (8)));
47022 else
47024 t1 = gen_lowpart (V32HImode, d->op0);
47025 t2 = gen_lowpart (V32HImode, d->op1);
47028 t3 = gen_reg_rtx (V32QImode);
47029 t4 = gen_reg_rtx (V32QImode);
47030 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47031 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47032 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47034 return true;
47037 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
47038 and extract-odd permutations. */
47040 static bool
47041 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47043 rtx t1, t2, t3, t4, t5;
47045 switch (d->vmode)
47047 case V4DFmode:
47048 if (d->testing_p)
47049 break;
47050 t1 = gen_reg_rtx (V4DFmode);
47051 t2 = gen_reg_rtx (V4DFmode);
47053 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47054 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47055 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47057 /* Now an unpck[lh]pd will produce the result required. */
47058 if (odd)
47059 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47060 else
47061 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47062 emit_insn (t3);
47063 break;
47065 case V8SFmode:
47067 int mask = odd ? 0xdd : 0x88;
47069 if (d->testing_p)
47070 break;
47071 t1 = gen_reg_rtx (V8SFmode);
47072 t2 = gen_reg_rtx (V8SFmode);
47073 t3 = gen_reg_rtx (V8SFmode);
47075 /* Shuffle within the 128-bit lanes to produce:
47076 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47077 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47078 GEN_INT (mask)));
47080 /* Shuffle the lanes around to produce:
47081 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47082 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47083 GEN_INT (0x3)));
47085 /* Shuffle within the 128-bit lanes to produce:
47086 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47087 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47089 /* Shuffle within the 128-bit lanes to produce:
47090 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47091 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47093 /* Shuffle the lanes around to produce:
47094 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47095 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47096 GEN_INT (0x20)));
47098 break;
47100 case V2DFmode:
47101 case V4SFmode:
47102 case V2DImode:
47103 case V4SImode:
47104 /* These are always directly implementable by expand_vec_perm_1. */
47105 gcc_unreachable ();
47107 case V8HImode:
47108 if (TARGET_SSE4_1)
47109 return expand_vec_perm_even_odd_pack (d);
47110 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47111 return expand_vec_perm_pshufb2 (d);
47112 else
47114 if (d->testing_p)
47115 break;
47116 /* We need 2*log2(N)-1 operations to achieve odd/even
47117 with interleave. */
47118 t1 = gen_reg_rtx (V8HImode);
47119 t2 = gen_reg_rtx (V8HImode);
47120 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47121 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47122 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47123 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47124 if (odd)
47125 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47126 else
47127 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47128 emit_insn (t3);
47130 break;
47132 case V16QImode:
47133 return expand_vec_perm_even_odd_pack (d);
47135 case V16HImode:
47136 case V32QImode:
47137 return expand_vec_perm_even_odd_pack (d);
47139 case V64QImode:
47140 return expand_vec_perm_even_odd_trunc (d);
47142 case V4DImode:
47143 if (!TARGET_AVX2)
47145 struct expand_vec_perm_d d_copy = *d;
47146 d_copy.vmode = V4DFmode;
47147 if (d->testing_p)
47148 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47149 else
47150 d_copy.target = gen_reg_rtx (V4DFmode);
47151 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47152 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47153 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47155 if (!d->testing_p)
47156 emit_move_insn (d->target,
47157 gen_lowpart (V4DImode, d_copy.target));
47158 return true;
47160 return false;
47163 if (d->testing_p)
47164 break;
47166 t1 = gen_reg_rtx (V4DImode);
47167 t2 = gen_reg_rtx (V4DImode);
47169 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47170 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47171 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47173 /* Now a vpunpck[lh]qdq will produce the result required. */
47174 if (odd)
47175 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47176 else
47177 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47178 emit_insn (t3);
47179 break;
47181 case V8SImode:
47182 if (!TARGET_AVX2)
47184 struct expand_vec_perm_d d_copy = *d;
47185 d_copy.vmode = V8SFmode;
47186 if (d->testing_p)
47187 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47188 else
47189 d_copy.target = gen_reg_rtx (V8SFmode);
47190 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47191 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47192 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47194 if (!d->testing_p)
47195 emit_move_insn (d->target,
47196 gen_lowpart (V8SImode, d_copy.target));
47197 return true;
47199 return false;
47202 if (d->testing_p)
47203 break;
47205 t1 = gen_reg_rtx (V8SImode);
47206 t2 = gen_reg_rtx (V8SImode);
47207 t3 = gen_reg_rtx (V4DImode);
47208 t4 = gen_reg_rtx (V4DImode);
47209 t5 = gen_reg_rtx (V4DImode);
47211 /* Shuffle the lanes around into
47212 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47213 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47214 gen_lowpart (V4DImode, d->op1),
47215 GEN_INT (0x20)));
47216 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47217 gen_lowpart (V4DImode, d->op1),
47218 GEN_INT (0x31)));
47220 /* Swap the 2nd and 3rd position in each lane into
47221 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47222 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47223 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47224 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47225 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47227 /* Now a vpunpck[lh]qdq will produce
47228 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47229 if (odd)
47230 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47231 gen_lowpart (V4DImode, t2));
47232 else
47233 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47234 gen_lowpart (V4DImode, t2));
47235 emit_insn (t3);
47236 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47237 break;
47239 default:
47240 gcc_unreachable ();
47243 return true;
47246 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
47247 extract-even and extract-odd permutations. */
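/* E.g. (editor's illustration) for V8SImode the accepted permutations
   are { 0, 2, 4, 6, 8, 10, 12, 14 } (even) and
   { 1, 3, 5, 7, 9, 11, 13, 15 } (odd).  */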
47249 static bool
47250 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47252 unsigned i, odd, nelt = d->nelt;
47254 odd = d->perm[0];
47255 if (odd != 0 && odd != 1)
47256 return false;
47258 for (i = 1; i < nelt; ++i)
47259 if (d->perm[i] != 2 * i + odd)
47260 return false;
47262 return expand_vec_perm_even_odd_1 (d, odd);
47265 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
47266 permutations. We assume that expand_vec_perm_1 has already failed. */
47268 static bool
47269 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47271 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47272 machine_mode vmode = d->vmode;
47273 unsigned char perm2[4];
47274 rtx op0 = d->op0, dest;
47275 bool ok;
47277 switch (vmode)
47279 case V4DFmode:
47280 case V8SFmode:
47281 /* These are special-cased in sse.md so that we can optionally
47282 use the vbroadcast instruction. They expand to two insns
47283 if the input happens to be in a register. */
47284 gcc_unreachable ();
47286 case V2DFmode:
47287 case V2DImode:
47288 case V4SFmode:
47289 case V4SImode:
47290 /* These are always implementable using standard shuffle patterns. */
47291 gcc_unreachable ();
47293 case V8HImode:
47294 case V16QImode:
47295 /* These can be implemented via interleave. We save one insn by
47296 stopping once we have promoted to V4SImode and then use pshufd. */
47297 if (d->testing_p)
47298 return true;
47301 rtx dest;
47302 rtx (*gen) (rtx, rtx, rtx)
47303 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47304 : gen_vec_interleave_lowv8hi;
47306 if (elt >= nelt2)
47308 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47309 : gen_vec_interleave_highv8hi;
47310 elt -= nelt2;
47312 nelt2 /= 2;
47314 dest = gen_reg_rtx (vmode);
47315 emit_insn (gen (dest, op0, op0));
47316 vmode = get_mode_wider_vector (vmode);
47317 op0 = gen_lowpart (vmode, dest);
47319 while (vmode != V4SImode);
47321 memset (perm2, elt, 4);
47322 dest = gen_reg_rtx (V4SImode);
47323 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47324 gcc_assert (ok);
47325 if (!d->testing_p)
47326 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47327 return true;
47329 case V64QImode:
47330 case V32QImode:
47331 case V16HImode:
47332 case V8SImode:
47333 case V4DImode:
47334 /* For AVX2 broadcasts of the first element vpbroadcast* or
47335 vpermq should be used by expand_vec_perm_1. */
47336 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47337 return false;
47339 default:
47340 gcc_unreachable ();
47344 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
47345 broadcast permutations. */
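/* E.g. (editor's illustration) the V8HImode permutation
   { 5, 5, 5, 5, 5, 5, 5, 5 } broadcasts element 5 of the operand.  */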
47347 static bool
47348 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47350 unsigned i, elt, nelt = d->nelt;
47352 if (!d->one_operand_p)
47353 return false;
47355 elt = d->perm[0];
47356 for (i = 1; i < nelt; ++i)
47357 if (d->perm[i] != elt)
47358 return false;
47360 return expand_vec_perm_broadcast_1 (d);
47363 /* Implement arbitrary permutations of two V64QImode operands
47364 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
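/* Illustrative example (editor's sketch, not referenced by the code): if
   d->perm[0] is 37, then ds[0].perm[0] becomes 18 (the word holding byte
   37), so the first vpermi2w moves that word into word 0; the vpshufb
   mask entry rperm[0] = 1 then selects the high byte of that word (37 is
   odd) into destination byte 0, and vpor merges it with the odd-position
   bytes produced by the second pair of insns.  */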
47365 static bool
47366 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
47368 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47369 return false;
47371 if (d->testing_p)
47372 return true;
47374 struct expand_vec_perm_d ds[2];
47375 rtx rperm[128], vperm, target0, target1;
47376 unsigned int i, nelt;
47377 machine_mode vmode;
47379 nelt = d->nelt;
47380 vmode = V64QImode;
47382 for (i = 0; i < 2; i++)
47384 ds[i] = *d;
47385 ds[i].vmode = V32HImode;
47386 ds[i].nelt = 32;
47387 ds[i].target = gen_reg_rtx (V32HImode);
47388 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47389 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47392 /* Prepare permutations such that the first one takes care of
47393 putting the even bytes into the right positions or one position
47394 higher (ds[0]) and the second one takes care of
47395 putting the odd bytes into the right positions or one position
47396 lower (ds[1]). */
47398 for (i = 0; i < nelt; i++)
47400 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47401 if (i & 1)
47403 rperm[i] = constm1_rtx;
47404 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47406 else
47408 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47409 rperm[i + 64] = constm1_rtx;
47413 bool ok = expand_vec_perm_1 (&ds[0]);
47414 gcc_assert (ok);
47415 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47417 ok = expand_vec_perm_1 (&ds[1]);
47418 gcc_assert (ok);
47419 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47421 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47422 vperm = force_reg (vmode, vperm);
47423 target0 = gen_reg_rtx (V64QImode);
47424 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47426 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47427 vperm = force_reg (vmode, vperm);
47428 target1 = gen_reg_rtx (V64QImode);
47429 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47431 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47432 return true;
47435 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
47436 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47437 all the shorter instruction sequences. */
47439 static bool
47440 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47442 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47443 unsigned int i, nelt, eltsz;
47444 bool used[4];
47446 if (!TARGET_AVX2
47447 || d->one_operand_p
47448 || (d->vmode != V32QImode && d->vmode != V16HImode))
47449 return false;
47451 if (d->testing_p)
47452 return true;
47454 nelt = d->nelt;
47455 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47457 /* Generate 4 permutation masks. If the required element is within
47458 the same lane, it is shuffled in. If the required element is from the
47459 other lane, force a zero by setting bit 7 in the permutation mask.
47460 The cross-lane masks have non-negative elements wherever an element
47461 is requested from the other lane; those elements are also moved to the
47462 other lane, so that the result of vpshufb can have the two V2TImode
47463 halves swapped. */
47464 m128 = GEN_INT (-128);
47465 for (i = 0; i < 32; ++i)
47467 rperm[0][i] = m128;
47468 rperm[1][i] = m128;
47469 rperm[2][i] = m128;
47470 rperm[3][i] = m128;
47472 used[0] = false;
47473 used[1] = false;
47474 used[2] = false;
47475 used[3] = false;
47476 for (i = 0; i < nelt; ++i)
47478 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47479 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47480 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47482 for (j = 0; j < eltsz; ++j)
47483 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47484 used[which] = true;
47487 for (i = 0; i < 2; ++i)
47489 if (!used[2 * i + 1])
47491 h[i] = NULL_RTX;
47492 continue;
47494 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47495 gen_rtvec_v (32, rperm[2 * i + 1]));
47496 vperm = force_reg (V32QImode, vperm);
47497 h[i] = gen_reg_rtx (V32QImode);
47498 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47499 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47502 /* Swap the 128-bit lanes of h[X]. */
47503 for (i = 0; i < 2; ++i)
47505 if (h[i] == NULL_RTX)
47506 continue;
47507 op = gen_reg_rtx (V4DImode);
47508 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47509 const2_rtx, GEN_INT (3), const0_rtx,
47510 const1_rtx));
47511 h[i] = gen_lowpart (V32QImode, op);
47514 for (i = 0; i < 2; ++i)
47516 if (!used[2 * i])
47518 l[i] = NULL_RTX;
47519 continue;
47521 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47522 vperm = force_reg (V32QImode, vperm);
47523 l[i] = gen_reg_rtx (V32QImode);
47524 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47525 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47528 for (i = 0; i < 2; ++i)
47530 if (h[i] && l[i])
47532 op = gen_reg_rtx (V32QImode);
47533 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47534 l[i] = op;
47536 else if (h[i])
47537 l[i] = h[i];
47540 gcc_assert (l[0] && l[1]);
47541 op = d->target;
47542 if (d->vmode != V32QImode)
47543 op = gen_reg_rtx (V32QImode);
47544 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47545 if (op != d->target)
47546 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47547 return true;
47550 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47551 With all of the interface bits taken care of, perform the expansion
47552 in D and return true on success. */
47554 static bool
47555 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47557 /* Try a single instruction expansion. */
47558 if (expand_vec_perm_1 (d))
47559 return true;
47561 /* Try sequences of two instructions. */
47563 if (expand_vec_perm_pshuflw_pshufhw (d))
47564 return true;
47566 if (expand_vec_perm_palignr (d, false))
47567 return true;
47569 if (expand_vec_perm_interleave2 (d))
47570 return true;
47572 if (expand_vec_perm_broadcast (d))
47573 return true;
47575 if (expand_vec_perm_vpermq_perm_1 (d))
47576 return true;
47578 if (expand_vec_perm_vperm2f128 (d))
47579 return true;
47581 if (expand_vec_perm_pblendv (d))
47582 return true;
47584 /* Try sequences of three instructions. */
47586 if (expand_vec_perm_even_odd_pack (d))
47587 return true;
47589 if (expand_vec_perm_2vperm2f128_vshuf (d))
47590 return true;
47592 if (expand_vec_perm_pshufb2 (d))
47593 return true;
47595 if (expand_vec_perm_interleave3 (d))
47596 return true;
47598 if (expand_vec_perm_vperm2f128_vblend (d))
47599 return true;
47601 /* Try sequences of four instructions. */
47603 if (expand_vec_perm_even_odd_trunc (d))
47604 return true;
47605 if (expand_vec_perm_vpshufb2_vpermq (d))
47606 return true;
47608 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47609 return true;
47611 if (expand_vec_perm_vpermi2_vpshub2 (d))
47612 return true;
47614 /* ??? Look for narrow permutations whose element orderings would
47615 allow the promotion to a wider mode. */
47617 /* ??? Look for sequences of interleave or a wider permute that place
47618 the data into the correct lanes for a half-vector shuffle like
47619 pshuf[lh]w or vpermilps. */
47621 /* ??? Look for sequences of interleave that produce the desired results.
47622 The combinatorics of punpck[lh] get pretty ugly... */
47624 if (expand_vec_perm_even_odd (d))
47625 return true;
47627 /* Even longer sequences. */
47628 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47629 return true;
47631 /* See if we can get the same permutation in different vector integer
47632 mode. */
47633 struct expand_vec_perm_d nd;
47634 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47636 if (!d->testing_p)
47637 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47638 return true;
47641 return false;
47644 /* If a permutation only uses one operand, make it clear. Returns true
47645 if the permutation references both operands. */
47647 static bool
47648 canonicalize_perm (struct expand_vec_perm_d *d)
47650 int i, which, nelt = d->nelt;
47652 for (i = which = 0; i < nelt; ++i)
47653 which |= (d->perm[i] < nelt ? 1 : 2);
47655 d->one_operand_p = true;
47656 switch (which)
47658 default:
47659 gcc_unreachable();
47661 case 3:
47662 if (!rtx_equal_p (d->op0, d->op1))
47664 d->one_operand_p = false;
47665 break;
47667 /* The elements of PERM do not suggest that only the first operand
47668 is used, but both operands are identical. Allow easier matching
47669 of the permutation by folding the permutation into the single
47670 input vector. */
47671 /* FALLTHRU */
47673 case 2:
47674 for (i = 0; i < nelt; ++i)
47675 d->perm[i] &= nelt - 1;
47676 d->op0 = d->op1;
47677 break;
47679 case 1:
47680 d->op1 = d->op0;
47681 break;
47684 return (which == 3);
47687 bool
47688 ix86_expand_vec_perm_const (rtx operands[4])
47690 struct expand_vec_perm_d d;
47691 unsigned char perm[MAX_VECT_LEN];
47692 int i, nelt;
47693 bool two_args;
47694 rtx sel;
47696 d.target = operands[0];
47697 d.op0 = operands[1];
47698 d.op1 = operands[2];
47699 sel = operands[3];
47701 d.vmode = GET_MODE (d.target);
47702 gcc_assert (VECTOR_MODE_P (d.vmode));
47703 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47704 d.testing_p = false;
47706 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47707 gcc_assert (XVECLEN (sel, 0) == nelt);
47708 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47710 for (i = 0; i < nelt; ++i)
47712 rtx e = XVECEXP (sel, 0, i);
47713 int ei = INTVAL (e) & (2 * nelt - 1);
47714 d.perm[i] = ei;
47715 perm[i] = ei;
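/* Keep an unmodified copy of the selector in PERM; canonicalize_perm may
   flatten d.perm in place, and the copy is needed for the retry below.  */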
47718 two_args = canonicalize_perm (&d);
47720 if (ix86_expand_vec_perm_const_1 (&d))
47721 return true;
47723 /* If the selector says both arguments are needed, but the operands are the
47724 same, the above tried to expand with one_operand_p and a flattened selector.
47725 If that didn't work, retry without one_operand_p; we succeeded with that
47726 during testing. */
47727 if (two_args && d.one_operand_p)
47729 d.one_operand_p = false;
47730 memcpy (d.perm, perm, sizeof (perm));
47731 return ix86_expand_vec_perm_const_1 (&d);
47734 return false;
47737 /* Implement targetm.vectorize.vec_perm_const_ok. */
47739 static bool
47740 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
47741 const unsigned char *sel)
47743 struct expand_vec_perm_d d;
47744 unsigned int i, nelt, which;
47745 bool ret;
47747 d.vmode = vmode;
47748 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47749 d.testing_p = true;
47751 /* Given sufficient ISA support we can just return true here
47752 for selected vector modes. */
47753 switch (d.vmode)
47755 case V16SFmode:
47756 case V16SImode:
47757 case V8DImode:
47758 case V8DFmode:
47759 if (TARGET_AVX512F)
47760 /* All implementable with a single vpermi2 insn. */
47761 return true;
47762 break;
47763 case V32HImode:
47764 if (TARGET_AVX512BW)
47765 /* All implementable with a single vpermi2 insn. */
47766 return true;
47767 break;
47768 case V64QImode:
47769 if (TARGET_AVX512BW)
47770 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
47771 return true;
47772 break;
47773 case V8SImode:
47774 case V8SFmode:
47775 case V4DFmode:
47776 case V4DImode:
47777 if (TARGET_AVX512VL)
47778 /* All implementable with a single vpermi2 insn. */
47779 return true;
47780 break;
47781 case V16HImode:
47782 if (TARGET_AVX2)
47783 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47784 return true;
47785 break;
47786 case V32QImode:
47787 if (TARGET_AVX2)
47788 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47789 return true;
47790 break;
47791 case V4SImode:
47792 case V4SFmode:
47793 case V8HImode:
47794 case V16QImode:
47795 /* All implementable with a single vpperm insn. */
47796 if (TARGET_XOP)
47797 return true;
47798 /* All implementable with 2 pshufb + 1 ior. */
47799 if (TARGET_SSSE3)
47800 return true;
47801 break;
47802 case V2DImode:
47803 case V2DFmode:
47804 /* All implementable with shufpd or unpck[lh]pd. */
47805 return true;
47806 default:
47807 return false;
47810 /* Extract the values from the vector CST into the permutation
47811 array in D. */
47812 memcpy (d.perm, sel, nelt);
47813 for (i = which = 0; i < nelt; ++i)
47815 unsigned char e = d.perm[i];
47816 gcc_assert (e < 2 * nelt);
47817 which |= (e < nelt ? 1 : 2);
47820 /* For all elements from the second vector, fold the elements to the first. */
47821 if (which == 2)
47822 for (i = 0; i < nelt; ++i)
47823 d.perm[i] -= nelt;
47825 /* Check whether the mask can be applied to the vector type. */
47826 d.one_operand_p = (which != 3);
47828 /* Implementable with shufps or pshufd. */
47829 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47830 return true;
47832 /* Otherwise we have to go through the motions and see if we can
47833 figure out how to generate the requested permutation. */
47834 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47835 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47836 if (!d.one_operand_p)
47837 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
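/* These raw registers are placeholders only: the expansion runs inside
   start_sequence/end_sequence and the generated insns are discarded; we
   only want to know whether a sequence exists.  */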
47839 start_sequence ();
47840 ret = ix86_expand_vec_perm_const_1 (&d);
47841 end_sequence ();
47843 return ret;
47846 void
47847 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47849 struct expand_vec_perm_d d;
47850 unsigned i, nelt;
47852 d.target = targ;
47853 d.op0 = op0;
47854 d.op1 = op1;
47855 d.vmode = GET_MODE (targ);
47856 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47857 d.one_operand_p = false;
47858 d.testing_p = false;
47860 for (i = 0; i < nelt; ++i)
47861 d.perm[i] = i * 2 + odd;
47863 /* We'll either be able to implement the permutation directly... */
47864 if (expand_vec_perm_1 (&d))
47865 return;
47867 /* ... or we use the special-case patterns. */
47868 expand_vec_perm_even_odd_1 (&d, odd);
47871 static void
47872 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47874 struct expand_vec_perm_d d;
47875 unsigned i, nelt, base;
47876 bool ok;
47878 d.target = targ;
47879 d.op0 = op0;
47880 d.op1 = op1;
47881 d.vmode = GET_MODE (targ);
47882 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47883 d.one_operand_p = false;
47884 d.testing_p = false;
47886 base = high_p ? nelt / 2 : 0;
47887 for (i = 0; i < nelt / 2; ++i)
47889 d.perm[i * 2] = i + base;
47890 d.perm[i * 2 + 1] = i + base + nelt;
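/* E.g. for V4SImode and high_p the selector built here is { 2, 6, 3, 7 },
   i.e. alternating elements from the high halves of op0 and op1.  */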
47893 /* Note that for AVX this isn't one instruction. */
47894 ok = ix86_expand_vec_perm_const_1 (&d);
47895 gcc_assert (ok);
47899 /* Expand a vector operation CODE for a V*QImode in terms of the
47900 same operation on V*HImode. */
47902 void
47903 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47905 machine_mode qimode = GET_MODE (dest);
47906 machine_mode himode;
47907 rtx (*gen_il) (rtx, rtx, rtx);
47908 rtx (*gen_ih) (rtx, rtx, rtx);
47909 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47910 struct expand_vec_perm_d d;
47911 bool ok, full_interleave;
47912 bool uns_p = false;
47913 int i;
47915 switch (qimode)
47917 case V16QImode:
47918 himode = V8HImode;
47919 gen_il = gen_vec_interleave_lowv16qi;
47920 gen_ih = gen_vec_interleave_highv16qi;
47921 break;
47922 case V32QImode:
47923 himode = V16HImode;
47924 gen_il = gen_avx2_interleave_lowv32qi;
47925 gen_ih = gen_avx2_interleave_highv32qi;
47926 break;
47927 case V64QImode:
47928 himode = V32HImode;
47929 gen_il = gen_avx512bw_interleave_lowv64qi;
47930 gen_ih = gen_avx512bw_interleave_highv64qi;
47931 break;
47932 default:
47933 gcc_unreachable ();
47936 op2_l = op2_h = op2;
47937 switch (code)
47939 case MULT:
47940 /* Unpack data such that we've got a source byte in each low byte of
47941 each word. We don't care what goes into the high byte of each word.
47942 Rather than trying to get zero in there, it is most convenient to
47943 let it be a copy of the low byte. */
47944 op2_l = gen_reg_rtx (qimode);
47945 op2_h = gen_reg_rtx (qimode);
47946 emit_insn (gen_il (op2_l, op2, op2));
47947 emit_insn (gen_ih (op2_h, op2, op2));
47948 /* FALLTHRU */
47950 op1_l = gen_reg_rtx (qimode);
47951 op1_h = gen_reg_rtx (qimode);
47952 emit_insn (gen_il (op1_l, op1, op1));
47953 emit_insn (gen_ih (op1_h, op1, op1));
47954 full_interleave = qimode == V16QImode;
47955 break;
47957 case ASHIFT:
47958 case LSHIFTRT:
47959 uns_p = true;
47960 /* FALLTHRU */
47961 case ASHIFTRT:
47962 op1_l = gen_reg_rtx (himode);
47963 op1_h = gen_reg_rtx (himode);
47964 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
47965 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
47966 full_interleave = true;
47967 break;
47968 default:
47969 gcc_unreachable ();
47972 /* Perform the operation. */
47973 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
47974 1, OPTAB_DIRECT);
47975 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
47976 1, OPTAB_DIRECT);
47977 gcc_assert (res_l && res_h);
47979 /* Merge the data back into the right place. */
47980 d.target = dest;
47981 d.op0 = gen_lowpart (qimode, res_l);
47982 d.op1 = gen_lowpart (qimode, res_h);
47983 d.vmode = qimode;
47984 d.nelt = GET_MODE_NUNITS (qimode);
47985 d.one_operand_p = false;
47986 d.testing_p = false;
47988 if (full_interleave)
47990 /* For SSE2, we used a full interleave, so the desired
47991 results are in the even elements. */
47992 for (i = 0; i < d.nelt; ++i)
47993 d.perm[i] = i * 2;
47995 else
47997 /* For AVX, the interleave used above was not cross-lane. So the
47998 extraction is evens but with the second and third quarters swapped.
47999 Happily, that is even one insn shorter than even extraction.
48000 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48001 always first from the first and then from the second source operand;
48002 the index bits above the low 4 bits remain the same.
48003 Thus, for d.nelt == 32 we want permutation
48004 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48005 and for d.nelt == 64 we want permutation
48006 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48007 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48008 for (i = 0; i < d.nelt; ++i)
48009 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48012 ok = ix86_expand_vec_perm_const_1 (&d);
48013 gcc_assert (ok);
48015 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48016 gen_rtx_fmt_ee (code, qimode, op1, op2));
48019 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48020 if op is CONST_VECTOR with all odd elements equal to their
48021 preceding element. */
48023 static bool
48024 const_vector_equal_evenodd_p (rtx op)
48026 machine_mode mode = GET_MODE (op);
48027 int i, nunits = GET_MODE_NUNITS (mode);
48028 if (GET_CODE (op) != CONST_VECTOR
48029 || nunits != CONST_VECTOR_NUNITS (op))
48030 return false;
48031 for (i = 0; i < nunits; i += 2)
48032 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48033 return false;
48034 return true;
48037 void
48038 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48039 bool uns_p, bool odd_p)
48041 machine_mode mode = GET_MODE (op1);
48042 machine_mode wmode = GET_MODE (dest);
48043 rtx x;
48044 rtx orig_op1 = op1, orig_op2 = op2;
48046 if (!nonimmediate_operand (op1, mode))
48047 op1 = force_reg (mode, op1);
48048 if (!nonimmediate_operand (op2, mode))
48049 op2 = force_reg (mode, op2);
48051 /* We only play even/odd games with vectors of SImode. */
48052 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48054 /* If we're looking for the odd results, shift those members down to
48055 the even slots. For some CPUs this is faster than a PSHUFD. */
48056 if (odd_p)
48058 /* For XOP use vpmacsdqh, but only for smult, as it is only
48059 signed. */
48060 if (TARGET_XOP && mode == V4SImode && !uns_p)
48062 x = force_reg (wmode, CONST0_RTX (wmode));
48063 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48064 return;
48067 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48068 if (!const_vector_equal_evenodd_p (orig_op1))
48069 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48070 x, NULL, 1, OPTAB_DIRECT);
48071 if (!const_vector_equal_evenodd_p (orig_op2))
48072 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48073 x, NULL, 1, OPTAB_DIRECT);
48074 op1 = gen_lowpart (mode, op1);
48075 op2 = gen_lowpart (mode, op2);
48078 if (mode == V16SImode)
48080 if (uns_p)
48081 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48082 else
48083 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48085 else if (mode == V8SImode)
48087 if (uns_p)
48088 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48089 else
48090 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48092 else if (uns_p)
48093 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48094 else if (TARGET_SSE4_1)
48095 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48096 else
48098 rtx s1, s2, t0, t1, t2;
48100 /* The easiest way to implement this without PMULDQ is to go through
48101 the motions as if we are performing a full 64-bit multiply, except
48102 that we need to do less shuffling of the elements. */
48104 /* Compute the sign-extension, aka highparts, of the two operands. */
48105 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48106 op1, pc_rtx, pc_rtx);
48107 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48108 op2, pc_rtx, pc_rtx);
48110 /* Multiply LO(A) * HI(B), and vice-versa. */
48111 t1 = gen_reg_rtx (wmode);
48112 t2 = gen_reg_rtx (wmode);
48113 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48114 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48116 /* Multiply LO(A) * LO(B). */
48117 t0 = gen_reg_rtx (wmode);
48118 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48120 /* Combine and shift the highparts into place. */
48121 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48122 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48123 1, OPTAB_DIRECT);
48125 /* Combine high and low parts. */
48126 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48127 return;
48129 emit_insn (x);
48132 void
48133 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48134 bool uns_p, bool high_p)
48136 machine_mode wmode = GET_MODE (dest);
48137 machine_mode mode = GET_MODE (op1);
48138 rtx t1, t2, t3, t4, mask;
48140 switch (mode)
48142 case V4SImode:
48143 t1 = gen_reg_rtx (mode);
48144 t2 = gen_reg_rtx (mode);
48145 if (TARGET_XOP && !uns_p)
48147 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48148 shuffle the elements once so that all elements are in the right
48149 place for immediate use: { A C B D }. */
48150 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48151 const1_rtx, GEN_INT (3)));
48152 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48153 const1_rtx, GEN_INT (3)));
48155 else
48157 /* Put the elements into place for the multiply. */
48158 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48159 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48160 high_p = false;
48162 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48163 break;
48165 case V8SImode:
48166 /* Shuffle the elements between the lanes. After this we
48167 have { A B E F | C D G H } for each operand. */
48168 t1 = gen_reg_rtx (V4DImode);
48169 t2 = gen_reg_rtx (V4DImode);
48170 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48171 const0_rtx, const2_rtx,
48172 const1_rtx, GEN_INT (3)));
48173 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48174 const0_rtx, const2_rtx,
48175 const1_rtx, GEN_INT (3)));
48177 /* Shuffle the elements within the lanes. After this we
48178 have { A A B B | C C D D } or { E E F F | G G H H }. */
48179 t3 = gen_reg_rtx (V8SImode);
48180 t4 = gen_reg_rtx (V8SImode);
48181 mask = GEN_INT (high_p
48182 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48183 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48184 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48185 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48187 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48188 break;
48190 case V8HImode:
48191 case V16HImode:
48192 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48193 uns_p, OPTAB_DIRECT);
48194 t2 = expand_binop (mode,
48195 uns_p ? umul_highpart_optab : smul_highpart_optab,
48196 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48197 gcc_assert (t1 && t2);
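/* t1 now holds the low halves and t2 the high halves of the widened
   products; interleaving the low/high pairs reconstructs the full
   double-width products for the requested half of the vector.  */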
48199 t3 = gen_reg_rtx (mode);
48200 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48201 emit_move_insn (dest, gen_lowpart (wmode, t3));
48202 break;
48204 case V16QImode:
48205 case V32QImode:
48206 case V32HImode:
48207 case V16SImode:
48208 case V64QImode:
48209 t1 = gen_reg_rtx (wmode);
48210 t2 = gen_reg_rtx (wmode);
48211 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48212 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48214 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48215 break;
48217 default:
48218 gcc_unreachable ();
48222 void
48223 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48225 rtx res_1, res_2, res_3, res_4;
48227 res_1 = gen_reg_rtx (V4SImode);
48228 res_2 = gen_reg_rtx (V4SImode);
48229 res_3 = gen_reg_rtx (V2DImode);
48230 res_4 = gen_reg_rtx (V2DImode);
48231 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48232 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
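/* res_3 now holds the 64-bit products of the even SImode elements and
   res_4 those of the odd elements.  */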
48234 /* Move the results in element 2 down to element 1; we don't care
48235 what goes in elements 2 and 3. Then we can merge the parts
48236 back together with an interleave.
48238 Note that two other sequences were tried:
48239 (1) Use interleaves at the start instead of psrldq, which allows
48240 us to use a single shufps to merge things back at the end.
48241 (2) Use shufps here to combine the two vectors, then pshufd to
48242 put the elements in the correct order.
48243 In both cases the cost of the reformatting stall was too high
48244 and the overall sequence slower. */
48246 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48247 const0_rtx, const2_rtx,
48248 const0_rtx, const0_rtx));
48249 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48250 const0_rtx, const2_rtx,
48251 const0_rtx, const0_rtx));
48252 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48254 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48257 void
48258 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48260 machine_mode mode = GET_MODE (op0);
48261 rtx t1, t2, t3, t4, t5, t6;
48263 if (TARGET_AVX512DQ && mode == V8DImode)
48264 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48265 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48266 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48267 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48268 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48269 else if (TARGET_XOP && mode == V2DImode)
48271 /* op1: A,B,C,D, op2: E,F,G,H */
48272 op1 = gen_lowpart (V4SImode, op1);
48273 op2 = gen_lowpart (V4SImode, op2);
48275 t1 = gen_reg_rtx (V4SImode);
48276 t2 = gen_reg_rtx (V4SImode);
48277 t3 = gen_reg_rtx (V2DImode);
48278 t4 = gen_reg_rtx (V2DImode);
48280 /* t1: B,A,D,C */
48281 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48282 GEN_INT (1),
48283 GEN_INT (0),
48284 GEN_INT (3),
48285 GEN_INT (2)));
48287 /* t2: (B*E),(A*F),(D*G),(C*H) */
48288 emit_insn (gen_mulv4si3 (t2, t1, op2));
48290 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48291 emit_insn (gen_xop_phadddq (t3, t2));
48293 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48294 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48296 /* Multiply the lower parts and add all the partial products. */
48297 t5 = gen_reg_rtx (V2DImode);
48298 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48299 gen_lowpart (V4SImode, op1),
48300 gen_lowpart (V4SImode, op2)));
48301 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48304 else
48306 machine_mode nmode;
48307 rtx (*umul) (rtx, rtx, rtx);
48309 if (mode == V2DImode)
48311 umul = gen_vec_widen_umult_even_v4si;
48312 nmode = V4SImode;
48314 else if (mode == V4DImode)
48316 umul = gen_vec_widen_umult_even_v8si;
48317 nmode = V8SImode;
48319 else if (mode == V8DImode)
48321 umul = gen_vec_widen_umult_even_v16si;
48322 nmode = V16SImode;
48324 else
48325 gcc_unreachable ();
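/* The identity used below: for 64-bit elements
   a * b = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32),
   computed with 32x32->64 bit even-element multiplies only.  */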
48328 /* Multiply low parts. */
48329 t1 = gen_reg_rtx (mode);
48330 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48332 /* Shift input vectors right 32 bits so we can multiply high parts. */
48333 t6 = GEN_INT (32);
48334 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48335 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48337 /* Multiply high parts by low parts. */
48338 t4 = gen_reg_rtx (mode);
48339 t5 = gen_reg_rtx (mode);
48340 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48341 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48343 /* Combine and shift the highparts back. */
48344 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48345 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48347 /* Combine high and low parts. */
48348 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48351 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48352 gen_rtx_MULT (mode, op1, op2));
48355 /* Return true if control transfer instruction INSN
48356 should be encoded with the bnd prefix.
48357 If INSN is NULL then return true when control
48358 transfer instructions should be prefixed with
48359 bnd by default for the current function. */
48361 bool
48362 ix86_bnd_prefixed_insn_p (rtx insn)
48364 /* For call insns check special flag. */
48365 if (insn && CALL_P (insn))
48367 rtx call = get_call_rtx_from (insn);
48368 if (call)
48369 return CALL_EXPR_WITH_BOUNDS_P (call);
48372 /* All other insns are prefixed only if function is instrumented. */
48373 return chkp_function_instrumented_p (current_function_decl);
48376 /* Calculate integer abs() using only SSE2 instructions. */
48378 void
48379 ix86_expand_sse2_abs (rtx target, rtx input)
48381 machine_mode mode = GET_MODE (target);
48382 rtx tmp0, tmp1, x;
48384 switch (mode)
48386 /* For 32-bit signed integer X, the best way to calculate the absolute
48387 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
48388 case V4SImode:
48389 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48390 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48391 NULL, 0, OPTAB_DIRECT);
48392 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48393 NULL, 0, OPTAB_DIRECT);
48394 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48395 target, 0, OPTAB_DIRECT);
48396 break;
48398 /* For 16-bit signed integer X, the best way to calculate the absolute
48399 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48400 case V8HImode:
48401 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48403 x = expand_simple_binop (mode, SMAX, tmp0, input,
48404 target, 0, OPTAB_DIRECT);
48405 break;
48407 /* For 8-bit signed integer X, the best way to calculate the absolute
48408 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48409 as SSE2 provides the PMINUB insn. */
48410 case V16QImode:
48411 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48413 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48414 target, 0, OPTAB_DIRECT);
48415 break;
48417 default:
48418 gcc_unreachable ();
48421 if (x != target)
48422 emit_move_insn (target, x);
48425 /* Expand an extract from a vector register through pextr insn.
48426 Return true if successful. */
48428 bool
48429 ix86_expand_pextr (rtx *operands)
48431 rtx dst = operands[0];
48432 rtx src = operands[1];
48434 unsigned int size = INTVAL (operands[2]);
48435 unsigned int pos = INTVAL (operands[3]);
48437 if (SUBREG_P (dst))
48439 /* Reject non-lowpart subregs. */
48440 if (SUBREG_BYTE (dst) > 0)
48441 return false;
48442 dst = SUBREG_REG (dst);
48445 if (SUBREG_P (src))
48447 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48448 src = SUBREG_REG (src);
48451 switch (GET_MODE (src))
48453 case V16QImode:
48454 case V8HImode:
48455 case V4SImode:
48456 case V2DImode:
48457 case V1TImode:
48458 case TImode:
48460 machine_mode srcmode, dstmode;
48461 rtx d, pat;
48463 dstmode = mode_for_size (size, MODE_INT, 0);
48465 switch (dstmode)
48467 case QImode:
48468 if (!TARGET_SSE4_1)
48469 return false;
48470 srcmode = V16QImode;
48471 break;
48473 case HImode:
48474 if (!TARGET_SSE2)
48475 return false;
48476 srcmode = V8HImode;
48477 break;
48479 case SImode:
48480 if (!TARGET_SSE4_1)
48481 return false;
48482 srcmode = V4SImode;
48483 break;
48485 case DImode:
48486 gcc_assert (TARGET_64BIT);
48487 if (!TARGET_SSE4_1)
48488 return false;
48489 srcmode = V2DImode;
48490 break;
48492 default:
48493 return false;
48496 /* Reject extractions from misaligned positions. */
48497 if (pos & (size-1))
48498 return false;
48500 if (GET_MODE (dst) == dstmode)
48501 d = dst;
48502 else
48503 d = gen_reg_rtx (dstmode);
48505 /* Construct insn pattern. */
48506 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48507 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48509 /* Let the rtl optimizers know about the zero extension performed. */
48510 if (dstmode == QImode || dstmode == HImode)
48512 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48513 d = gen_lowpart (SImode, d);
48516 emit_insn (gen_rtx_SET (d, pat));
48518 if (d != dst)
48519 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48520 return true;
48523 default:
48524 return false;
48528 /* Expand an insert into a vector register through pinsr insn.
48529 Return true if successful. */
48531 bool
48532 ix86_expand_pinsr (rtx *operands)
48534 rtx dst = operands[0];
48535 rtx src = operands[3];
48537 unsigned int size = INTVAL (operands[1]);
48538 unsigned int pos = INTVAL (operands[2]);
48540 if (SUBREG_P (dst))
48542 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48543 dst = SUBREG_REG (dst);
48546 switch (GET_MODE (dst))
48548 case V16QImode:
48549 case V8HImode:
48550 case V4SImode:
48551 case V2DImode:
48552 case V1TImode:
48553 case TImode:
48555 machine_mode srcmode, dstmode;
48556 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48557 rtx d;
48559 srcmode = mode_for_size (size, MODE_INT, 0);
48561 switch (srcmode)
48563 case QImode:
48564 if (!TARGET_SSE4_1)
48565 return false;
48566 dstmode = V16QImode;
48567 pinsr = gen_sse4_1_pinsrb;
48568 break;
48570 case HImode:
48571 if (!TARGET_SSE2)
48572 return false;
48573 dstmode = V8HImode;
48574 pinsr = gen_sse2_pinsrw;
48575 break;
48577 case SImode:
48578 if (!TARGET_SSE4_1)
48579 return false;
48580 dstmode = V4SImode;
48581 pinsr = gen_sse4_1_pinsrd;
48582 break;
48584 case DImode:
48585 gcc_assert (TARGET_64BIT);
48586 if (!TARGET_SSE4_1)
48587 return false;
48588 dstmode = V2DImode;
48589 pinsr = gen_sse4_1_pinsrq;
48590 break;
48592 default:
48593 return false;
48596 /* Reject insertions to misaligned positions. */
48597 if (pos & (size-1))
48598 return false;
48600 if (SUBREG_P (src))
48602 unsigned int srcpos = SUBREG_BYTE (src);
48604 if (srcpos > 0)
48606 rtx extr_ops[4];
48608 extr_ops[0] = gen_reg_rtx (srcmode);
48609 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48610 extr_ops[2] = GEN_INT (size);
48611 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48613 if (!ix86_expand_pextr (extr_ops))
48614 return false;
48616 src = extr_ops[0];
48618 else
48619 src = gen_lowpart (srcmode, SUBREG_REG (src));
48622 if (GET_MODE (dst) == dstmode)
48623 d = dst;
48624 else
48625 d = gen_reg_rtx (dstmode);
48627 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48628 gen_lowpart (srcmode, src),
48629 GEN_INT (1 << (pos / size))));
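/* The element index is passed to the pinsr expander encoded as a
   one-hot mask, 1 << element.  */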
48630 if (d != dst)
48631 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48632 return true;
48635 default:
48636 return false;
48640 /* This function returns the calling ABI specific va_list type node.
48641 It returns the FNDECL specific va_list type. */
48643 static tree
48644 ix86_fn_abi_va_list (tree fndecl)
48646 if (!TARGET_64BIT)
48647 return va_list_type_node;
48648 gcc_assert (fndecl != NULL_TREE);
48650 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48651 return ms_va_list_type_node;
48652 else
48653 return sysv_va_list_type_node;
48656 /* Returns the canonical va_list type specified by TYPE. If there
48657 is no valid TYPE provided, it returns NULL_TREE. */
48659 static tree
48660 ix86_canonical_va_list_type (tree type)
48662 if (TARGET_64BIT)
48664 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48665 return ms_va_list_type_node;
48667 if ((TREE_CODE (type) == ARRAY_TYPE
48668 && integer_zerop (array_type_nelts (type)))
48669 || POINTER_TYPE_P (type))
48671 tree elem_type = TREE_TYPE (type);
48672 if (TREE_CODE (elem_type) == RECORD_TYPE
48673 && lookup_attribute ("sysv_abi va_list",
48674 TYPE_ATTRIBUTES (elem_type)))
48675 return sysv_va_list_type_node;
48678 return NULL_TREE;
48681 return std_canonical_va_list_type (type);
48684 /* Iterate through the target-specific builtin types for va_list.
48685 IDX denotes the iterator, *PTREE is set to the result type of
48686 the va_list builtin, and *PNAME to its internal type.
48687 Returns zero if there is no element for this index, otherwise
48688 IDX should be increased upon the next call.
48689 Note, do not iterate a base builtin's name like __builtin_va_list.
48690 Used from c_common_nodes_and_builtins. */
48692 static int
48693 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48695 if (TARGET_64BIT)
48697 switch (idx)
48699 default:
48700 break;
48702 case 0:
48703 *ptree = ms_va_list_type_node;
48704 *pname = "__builtin_ms_va_list";
48705 return 1;
48707 case 1:
48708 *ptree = sysv_va_list_type_node;
48709 *pname = "__builtin_sysv_va_list";
48710 return 1;
48714 return 0;
48717 #undef TARGET_SCHED_DISPATCH
48718 #define TARGET_SCHED_DISPATCH has_dispatch
48719 #undef TARGET_SCHED_DISPATCH_DO
48720 #define TARGET_SCHED_DISPATCH_DO do_dispatch
48721 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48722 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48723 #undef TARGET_SCHED_REORDER
48724 #define TARGET_SCHED_REORDER ix86_sched_reorder
48725 #undef TARGET_SCHED_ADJUST_PRIORITY
48726 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48727 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48728 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48729 ix86_dependencies_evaluation_hook
48731 /* The size of the dispatch window is the total number of bytes of
48732 object code allowed in a window. */
48733 #define DISPATCH_WINDOW_SIZE 16
48735 /* Number of dispatch windows considered for scheduling. */
48736 #define MAX_DISPATCH_WINDOWS 3
48738 /* Maximum number of instructions in a window. */
48739 #define MAX_INSN 4
48741 /* Maximum number of immediate operands in a window. */
48742 #define MAX_IMM 4
48744 /* Maximum number of immediate bits allowed in a window. */
48745 #define MAX_IMM_SIZE 128
48747 /* Maximum number of 32 bit immediates allowed in a window. */
48748 #define MAX_IMM_32 4
48750 /* Maximum number of 64 bit immediates allowed in a window. */
48751 #define MAX_IMM_64 2
48753 /* Maximum total of loads or prefetches allowed in a window. */
48754 #define MAX_LOAD 2
48756 /* Maximum total of stores allowed in a window. */
48757 #define MAX_STORE 1
48759 #undef BIG
48760 #define BIG 100
48763 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
48764 enum dispatch_group {
48765 disp_no_group = 0,
48766 disp_load,
48767 disp_store,
48768 disp_load_store,
48769 disp_prefetch,
48770 disp_imm,
48771 disp_imm_32,
48772 disp_imm_64,
48773 disp_branch,
48774 disp_cmp,
48775 disp_jcc,
48776 disp_last
48779 /* Number of allowable groups in a dispatch window. It is an array
48780 indexed by dispatch_group enum. 100 is used as a big number,
48781 because the number of these kinds of operations does not have any
48782 effect in a dispatch window, but we need them for other reasons in
48783 the table. */
48784 static unsigned int num_allowable_groups[disp_last] = {
48785 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
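/* In dispatch_group order this reads: no_group 0, load 2 (MAX_LOAD),
   store 1 (MAX_STORE), load_store 1, prefetch 2, imm 4 (MAX_IMM),
   imm_32 4 (MAX_IMM_32), imm_64 2 (MAX_IMM_64), branch 1, and cmp/jcc
   effectively unlimited.  */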
48788 char group_name[disp_last + 1][16] = {
48789 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
48790 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
48791 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
48794 /* Instruction path. */
48795 enum insn_path {
48796 no_path = 0,
48797 path_single, /* Single micro op. */
48798 path_double, /* Double micro op. */
48799 path_multi, /* Instructions with more than 2 micro ops. */
48800 last_path
48803 /* sched_insn_info defines a window to the instructions scheduled in
48804 the basic block. It contains a pointer to the insn_info table and
48805 the instruction scheduled.
48807 Windows are allocated for each basic block and are linked
48808 together. */
48809 typedef struct sched_insn_info_s {
48810 rtx insn;
48811 enum dispatch_group group;
48812 enum insn_path path;
48813 int byte_len;
48814 int imm_bytes;
48815 } sched_insn_info;
48817 /* Linked list of dispatch windows. This is a two way list of
48818 dispatch windows of a basic block. It contains information about
48819 the number of uops in the window and the total number of
48820 instructions and of bytes in the object code for this dispatch
48821 window. */
48822 typedef struct dispatch_windows_s {
48823 int num_insn; /* Number of insn in the window. */
48824 int num_uops; /* Number of uops in the window. */
48825 int window_size; /* Number of bytes in the window. */
48826 int window_num; /* Window number, either 0 or 1. */
48827 int num_imm; /* Number of immediates in an insn. */
48828 int num_imm_32; /* Number of 32 bit immediates in an insn. */
48829 int num_imm_64; /* Number of 64 bit immediates in an insn. */
48830 int imm_size; /* Total immediates in the window. */
48831 int num_loads; /* Total memory loads in the window. */
48832 int num_stores; /* Total memory stores in the window. */
48833 int violation; /* Violation exists in window. */
48834 sched_insn_info *window; /* Pointer to the window. */
48835 struct dispatch_windows_s *next;
48836 struct dispatch_windows_s *prev;
48837 } dispatch_windows;
48839 /* Immediate values used in an insn. */
48840 typedef struct imm_info_s
48842 int imm;
48843 int imm32;
48844 int imm64;
48845 } imm_info;
48847 static dispatch_windows *dispatch_window_list;
48848 static dispatch_windows *dispatch_window_list1;
48850 /* Get dispatch group of insn. */
48852 static enum dispatch_group
48853 get_mem_group (rtx_insn *insn)
48855 enum attr_memory memory;
48857 if (INSN_CODE (insn) < 0)
48858 return disp_no_group;
48859 memory = get_attr_memory (insn);
48860 if (memory == MEMORY_STORE)
48861 return disp_store;
48863 if (memory == MEMORY_LOAD)
48864 return disp_load;
48866 if (memory == MEMORY_BOTH)
48867 return disp_load_store;
48869 return disp_no_group;
48872 /* Return true if insn is a compare instruction. */
48874 static bool
48875 is_cmp (rtx_insn *insn)
48877 enum attr_type type;
48879 type = get_attr_type (insn);
48880 return (type == TYPE_TEST
48881 || type == TYPE_ICMP
48882 || type == TYPE_FCMP
48883 || GET_CODE (PATTERN (insn)) == COMPARE);
48886 /* Return true if a dispatch violation was encountered. */
48888 static bool
48889 dispatch_violation (void)
48891 if (dispatch_window_list->next)
48892 return dispatch_window_list->next->violation;
48893 return dispatch_window_list->violation;
48896 /* Return true if insn is a branch instruction. */
48898 static bool
48899 is_branch (rtx_insn *insn)
48901 return (CALL_P (insn) || JUMP_P (insn));
48904 /* Return true if insn is a prefetch instruction. */
48906 static bool
48907 is_prefetch (rtx_insn *insn)
48909 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
48912 /* This function initializes a dispatch window and the list container holding a
48913 pointer to the window. */
48915 static void
48916 init_window (int window_num)
48918 int i;
48919 dispatch_windows *new_list;
48921 if (window_num == 0)
48922 new_list = dispatch_window_list;
48923 else
48924 new_list = dispatch_window_list1;
48926 new_list->num_insn = 0;
48927 new_list->num_uops = 0;
48928 new_list->window_size = 0;
48929 new_list->next = NULL;
48930 new_list->prev = NULL;
48931 new_list->window_num = window_num;
48932 new_list->num_imm = 0;
48933 new_list->num_imm_32 = 0;
48934 new_list->num_imm_64 = 0;
48935 new_list->imm_size = 0;
48936 new_list->num_loads = 0;
48937 new_list->num_stores = 0;
48938 new_list->violation = false;
48940 for (i = 0; i < MAX_INSN; i++)
48942 new_list->window[i].insn = NULL;
48943 new_list->window[i].group = disp_no_group;
48944 new_list->window[i].path = no_path;
48945 new_list->window[i].byte_len = 0;
48946 new_list->window[i].imm_bytes = 0;
48948 return;
48951 /* This function allocates and initializes a dispatch window and the
48952 list container holding a pointer to the window. */
48954 static dispatch_windows *
48955 allocate_window (void)
48957 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
48958 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
48960 return new_list;
48963 /* This routine initializes the dispatch scheduling information. It
48964 initiates building dispatch scheduler tables and constructs the
48965 first dispatch window. */
48967 static void
48968 init_dispatch_sched (void)
48970 /* Allocate a dispatch list and a window. */
48971 dispatch_window_list = allocate_window ();
48972 dispatch_window_list1 = allocate_window ();
48973 init_window (0);
48974 init_window (1);
48977 /* This function returns true if a branch is detected. The end of a basic
48978 block does not have to be a branch, but here we assume only branches
48979 end a window. */
48981 static bool
48982 is_end_basic_block (enum dispatch_group group)
48984 return group == disp_branch;
48987 /* This function is called when the end of a window processing is reached. */
48989 static void
48990 process_end_window (void)
48992 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
48993 if (dispatch_window_list->next)
48995 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
48996 gcc_assert (dispatch_window_list->window_size
48997 + dispatch_window_list1->window_size <= 48);
48998 init_window (1);
49000 init_window (0);
49003 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
49004 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
49005 for 48 bytes of instructions. Note that these windows are not
49006 dispatch windows whose sizes are DISPATCH_WINDOW_SIZE. */
49008 static dispatch_windows *
49009 allocate_next_window (int window_num)
49011 if (window_num == 0)
49013 if (dispatch_window_list->next)
49014 init_window (1);
49015 init_window (0);
49016 return dispatch_window_list;
49019 dispatch_window_list->next = dispatch_window_list1;
49020 dispatch_window_list1->prev = dispatch_window_list;
49022 return dispatch_window_list1;
49025 /* Compute number of immediate operands of an instruction. */
49027 static void
49028 find_constant (rtx in_rtx, imm_info *imm_values)
49030 if (INSN_P (in_rtx))
49031 in_rtx = PATTERN (in_rtx);
49032 subrtx_iterator::array_type array;
49033 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
49034 if (const_rtx x = *iter)
49035 switch (GET_CODE (x))
49037 case CONST:
49038 case SYMBOL_REF:
49039 case CONST_INT:
49040 (imm_values->imm)++;
49041 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
49042 (imm_values->imm32)++;
49043 else
49044 (imm_values->imm64)++;
49045 break;
49047 case CONST_DOUBLE:
49048 case CONST_WIDE_INT:
49049 (imm_values->imm)++;
49050 (imm_values->imm64)++;
49051 break;
49053 case CODE_LABEL:
49054 if (LABEL_KIND (x) == LABEL_NORMAL)
49056 (imm_values->imm)++;
49057 (imm_values->imm32)++;
49059 break;
49061 default:
49062 break;
49066 /* Return the total size of the immediate operands of an instruction along
49067 with the number of corresponding immediate operands. It initializes its
49068 parameters to zero before calling FIND_CONSTANT.
49069 INSN is the input instruction. IMM is the total of immediates.
49070 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
49071 bit immediates. */
49073 static int
49074 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
49076 imm_info imm_values = {0, 0, 0};
49078 find_constant (insn, &imm_values);
49079 *imm = imm_values.imm;
49080 *imm32 = imm_values.imm32;
49081 *imm64 = imm_values.imm64;
49082 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
49085 /* This function indicates whether an instruction has any immediate
49086 operands. */
49088 static bool
49089 has_immediate (rtx_insn *insn)
49091 int num_imm_operand;
49092 int num_imm32_operand;
49093 int num_imm64_operand;
49095 if (insn)
49096 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49097 &num_imm64_operand);
49098 return false;
49101 /* Return single or double path for instructions. */
49103 static enum insn_path
49104 get_insn_path (rtx_insn *insn)
49106 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
49108 if ((int)path == 0)
49109 return path_single;
49111 if ((int)path == 1)
49112 return path_double;
49114 return path_multi;
49117 /* Return insn dispatch group. */
49119 static enum dispatch_group
49120 get_insn_group (rtx_insn *insn)
49122 enum dispatch_group group = get_mem_group (insn);
49123 if (group)
49124 return group;
49126 if (is_branch (insn))
49127 return disp_branch;
49129 if (is_cmp (insn))
49130 return disp_cmp;
49132 if (has_immediate (insn))
49133 return disp_imm;
49135 if (is_prefetch (insn))
49136 return disp_prefetch;
49138 return disp_no_group;
49141 /* Count number of GROUP restricted instructions in a dispatch
49142 window WINDOW_LIST. */
49144 static int
49145 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
49147 enum dispatch_group group = get_insn_group (insn);
49148 int imm_size;
49149 int num_imm_operand;
49150 int num_imm32_operand;
49151 int num_imm64_operand;
49153 if (group == disp_no_group)
49154 return 0;
49156 if (group == disp_imm)
49158 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49159 &num_imm64_operand);
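/* The insn counts as unschedulable (BIG) if adding its immediates would
   overflow any of the per-window immediate limits: operand count, 32-bit
   count, 64-bit count, or total immediate bits.  */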
49160 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
49161 || num_imm_operand + window_list->num_imm > MAX_IMM
49162 || (num_imm32_operand > 0
49163 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
49164 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
49165 || (num_imm64_operand > 0
49166 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
49167 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
49168 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
49169 && num_imm64_operand > 0
49170 && ((window_list->num_imm_64 > 0
49171 && window_list->num_insn >= 2)
49172 || window_list->num_insn >= 3)))
49173 return BIG;
49175 return 1;
49178 if ((group == disp_load_store
49179 && (window_list->num_loads >= MAX_LOAD
49180 || window_list->num_stores >= MAX_STORE))
49181 || ((group == disp_load
49182 || group == disp_prefetch)
49183 && window_list->num_loads >= MAX_LOAD)
49184 || (group == disp_store
49185 && window_list->num_stores >= MAX_STORE))
49186 return BIG;
49188 return 1;
49191 /* This function returns true if insn satisfies dispatch rules on the
49192 last window scheduled. */
49194 static bool
49195 fits_dispatch_window (rtx_insn *insn)
49197 dispatch_windows *window_list = dispatch_window_list;
49198 dispatch_windows *window_list_next = dispatch_window_list->next;
49199 unsigned int num_restrict;
49200 enum dispatch_group group = get_insn_group (insn);
49201 enum insn_path path = get_insn_path (insn);
49202 int sum;
49204 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
49205 instructions should be given the lowest priority in the
49206 scheduling process in the Haifa scheduler to make sure they will be
49207 scheduled in the same dispatch window as the reference to them. */
49208 if (group == disp_jcc || group == disp_cmp)
49209 return false;
49211 /* Check nonrestricted. */
49212 if (group == disp_no_group || group == disp_branch)
49213 return true;
49215 /* Get last dispatch window. */
49216 if (window_list_next)
49217 window_list = window_list_next;
49219 if (window_list->window_num == 1)
49221 sum = window_list->prev->window_size + window_list->window_size;
49223 if (sum == 32
49224 || (min_insn_size (insn) + sum) >= 48)
49225 /* Window 1 is full. Go for next window. */
49226 return true;
49229 num_restrict = count_num_restricted (insn, window_list);
49231 if (num_restrict > num_allowable_groups[group])
49232 return false;
49234 /* See if it fits in the first window. */
49235 if (window_list->window_num == 0)
49237 /* The first window should have only single- and double-path
49238 uops. */
49239 if (path == path_double
49240 && (window_list->num_uops + 2) > MAX_INSN)
49241 return false;
49242 else if (path != path_single)
49243 return false;
49245 return true;
49248 /* Add an instruction INSN with NUM_UOPS micro-operations to the
49249 dispatch window WINDOW_LIST. */
49251 static void
49252 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
49254 int byte_len = min_insn_size (insn);
49255 int num_insn = window_list->num_insn;
49256 int imm_size;
49257 sched_insn_info *window = window_list->window;
49258 enum dispatch_group group = get_insn_group (insn);
49259 enum insn_path path = get_insn_path (insn);
49260 int num_imm_operand;
49261 int num_imm32_operand;
49262 int num_imm64_operand;
49264 if (!window_list->violation && group != disp_cmp
49265 && !fits_dispatch_window (insn))
49266 window_list->violation = true;
49268 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49269 &num_imm64_operand);
49271 /* Initialize window with new instruction. */
49272 window[num_insn].insn = insn;
49273 window[num_insn].byte_len = byte_len;
49274 window[num_insn].group = group;
49275 window[num_insn].path = path;
49276 window[num_insn].imm_bytes = imm_size;
49278 window_list->window_size += byte_len;
49279 window_list->num_insn = num_insn + 1;
49280 window_list->num_uops = window_list->num_uops + num_uops;
49281 window_list->imm_size += imm_size;
49282 window_list->num_imm += num_imm_operand;
49283 window_list->num_imm_32 += num_imm32_operand;
49284 window_list->num_imm_64 += num_imm64_operand;
49286 if (group == disp_store)
49287 window_list->num_stores += 1;
49288 else if (group == disp_load
49289 || group == disp_prefetch)
49290 window_list->num_loads += 1;
49291 else if (group == disp_load_store)
49293 window_list->num_stores += 1;
49294 window_list->num_loads += 1;
49298 /* Adds a scheduled instruction, INSN, to the current dispatch window.
49299 If the total bytes of instructions or the number of instructions in
49300 the window exceed the allowed maximum, it allocates a new window. */
49302 static void
49303 add_to_dispatch_window (rtx_insn *insn)
49305 int byte_len;
49306 dispatch_windows *window_list;
49307 dispatch_windows *next_list;
49308 dispatch_windows *window0_list;
49309 enum insn_path path;
49310 enum dispatch_group insn_group;
49311 bool insn_fits;
49312 int num_insn;
49313 int num_uops;
49314 int window_num;
49315 int insn_num_uops;
49316 int sum;
49318 if (INSN_CODE (insn) < 0)
49319 return;
49321 byte_len = min_insn_size (insn);
49322 window_list = dispatch_window_list;
49323 next_list = window_list->next;
49324 path = get_insn_path (insn);
49325 insn_group = get_insn_group (insn);
49327 /* Get the last dispatch window. */
49328 if (next_list)
49329 window_list = dispatch_window_list->next;
49331 if (path == path_single)
49332 insn_num_uops = 1;
49333 else if (path == path_double)
49334 insn_num_uops = 2;
49335 else
49336 insn_num_uops = (int) path;
49338 /* If the current window is full, get a new window.
49339 Window number zero is full if MAX_INSN uops are scheduled in it.
49340 Window number one is full if window zero's bytes plus window
49341 one's bytes is 32, if the bytes of the new instruction added
49342 to the total make it greater than 48, or if it already has MAX_INSN
49343 instructions in it. */
49344 num_insn = window_list->num_insn;
49345 num_uops = window_list->num_uops;
49346 window_num = window_list->window_num;
49347 insn_fits = fits_dispatch_window (insn);
49349 if (num_insn >= MAX_INSN
49350 || num_uops + insn_num_uops > MAX_INSN
49351 || !(insn_fits))
49353 window_num = ~window_num & 1;
49354 window_list = allocate_next_window (window_num);
49357 if (window_num == 0)
49359 add_insn_window (insn, window_list, insn_num_uops);
49360 if (window_list->num_insn >= MAX_INSN
49361 && insn_group == disp_branch)
49363 process_end_window ();
49364 return;
49367 else if (window_num == 1)
49369 window0_list = window_list->prev;
49370 sum = window0_list->window_size + window_list->window_size;
49371 if (sum == 32
49372 || (byte_len + sum) >= 48)
49374 process_end_window ();
49375 window_list = dispatch_window_list;
49378 add_insn_window (insn, window_list, insn_num_uops);
49380 else
49381 gcc_unreachable ();
49383 if (is_end_basic_block (insn_group))
49385 /* The end of the basic block is reached; do end-of-basic-block processing. */
49386 process_end_window ();
49387 return;
49391 /* Print the dispatch window, WINDOW_NUM, to FILE. */
49393 DEBUG_FUNCTION static void
49394 debug_dispatch_window_file (FILE *file, int window_num)
49396 dispatch_windows *list;
49397 int i;
49399 if (window_num == 0)
49400 list = dispatch_window_list;
49401 else
49402 list = dispatch_window_list1;
49404 fprintf (file, "Window #%d:\n", list->window_num);
49405 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
49406 list->num_insn, list->num_uops, list->window_size);
49407 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49408 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
49410 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
49411 list->num_stores);
49412 fprintf (file, " insn info:\n");
49414 for (i = 0; i < MAX_INSN; i++)
49416 if (!list->window[i].insn)
49417 break;
49418 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
49419 i, group_name[list->window[i].group],
49420 i, (void *)list->window[i].insn,
49421 i, list->window[i].path,
49422 i, list->window[i].byte_len,
49423 i, list->window[i].imm_bytes);
49427 /* Print to stdout a dispatch window. */
49429 DEBUG_FUNCTION void
49430 debug_dispatch_window (int window_num)
49432 debug_dispatch_window_file (stdout, window_num);
49435 /* Print INSN dispatch information to FILE. */
49437 DEBUG_FUNCTION static void
49438 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
49440 int byte_len;
49441 enum insn_path path;
49442 enum dispatch_group group;
49443 int imm_size;
49444 int num_imm_operand;
49445 int num_imm32_operand;
49446 int num_imm64_operand;
49448 if (INSN_CODE (insn) < 0)
49449 return;
49451 byte_len = min_insn_size (insn);
49452 path = get_insn_path (insn);
49453 group = get_insn_group (insn);
49454 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49455 &num_imm64_operand);
49457 fprintf (file, " insn info:\n");
49458 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
49459 group_name[group], path, byte_len);
49460 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49461 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
49464 /* Print to stdout the status of the ready list with respect to
49465 dispatch windows. */
49467 DEBUG_FUNCTION void
49468 debug_ready_dispatch (void)
49470 int i;
49471 int no_ready = number_in_ready ();
49473 fprintf (stdout, "Number of ready: %d\n", no_ready);
49475 for (i = 0; i < no_ready; i++)
49476 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
49479 /* This routine is the driver of the dispatch scheduler. */
49481 static void
49482 do_dispatch (rtx_insn *insn, int mode)
49484 if (mode == DISPATCH_INIT)
49485 init_dispatch_sched ();
49486 else if (mode == ADD_TO_DISPATCH_WINDOW)
49487 add_to_dispatch_window (insn);
49490 /* Return TRUE if Dispatch Scheduling is supported. */
49492 static bool
49493 has_dispatch (rtx_insn *insn, int action)
49495 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
49496 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
49497 switch (action)
49499 default:
49500 return false;
49502 case IS_DISPATCH_ON:
49503 return true;
49504 break;
49506 case IS_CMP:
49507 return is_cmp (insn);
49509 case DISPATCH_VIOLATION:
49510 return dispatch_violation ();
49512 case FITS_DISPATCH_WINDOW:
49513 return fits_dispatch_window (insn);
49516 return false;
49519 /* Implementation of reassociation_width target hook used by
49520 reassoc phase to identify parallelism level in reassociated
49521 tree. The statement's tree_code is passed in OPC. The arguments'
49522 type is passed in MODE.
49524 Currently parallel reassociation is enabled for Atom
49525 processors only and we set reassociation width to be 2
49526 because Atom may issue up to 2 instructions per cycle.
49528 Return value should be fixed if parallel reassociation is
49529 enabled for other processors. */
49531 static int
49532 ix86_reassociation_width (unsigned int, machine_mode mode)
49534 /* Vector part. */
49535 if (VECTOR_MODE_P (mode))
49537 if (TARGET_VECTOR_PARALLEL_EXECUTION)
49538 return 2;
49539 else
49540 return 1;
49543 /* Scalar part. */
49544 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
49545 return 2;
49546 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
49547 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
49548 else
49549 return 1;
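/* For illustration: with a reassociation width of 2, the reassoc pass
   can rewrite a floating-point summation such as

     double f (double a, double b, double c, double d)
     {
       return a + b + c + d;
     }

   which is normally evaluated as ((a + b) + c) + d, into the shallower
   (a + b) + (c + d) (given -ffast-math), so that two additions can issue
   in the same cycle; with a width of 1 the chain stays linear.  */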
49552 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
49553 place emms and femms instructions. */
49555 static machine_mode
49556 ix86_preferred_simd_mode (machine_mode mode)
49558 if (!TARGET_SSE)
49559 return word_mode;
49561 switch (mode)
49563 case QImode:
49564 return TARGET_AVX512BW ? V64QImode :
49565 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
49566 case HImode:
49567 return TARGET_AVX512BW ? V32HImode :
49568 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
49569 case SImode:
49570 return TARGET_AVX512F ? V16SImode :
49571 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
49572 case DImode:
49573 return TARGET_AVX512F ? V8DImode :
49574 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
49576 case SFmode:
49577 if (TARGET_AVX512F)
49578 return V16SFmode;
49579 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49580 return V8SFmode;
49581 else
49582 return V4SFmode;
49584 case DFmode:
49585 if (TARGET_AVX512F)
49586 return V8DFmode;
49587 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49588 return V4DFmode;
49589 else if (TARGET_SSE2)
49590 return V2DFmode;
49591 /* FALLTHRU */
49593 default:
49594 return word_mode;
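/* For example, with -mavx2 (and without -mprefer-avx128) the switch above
   maps SImode to V8SImode and DFmode to V4DFmode, while with plain -msse2
   the same scalar modes map to V4SImode and V2DFmode.  */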
49598 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
49599 vectors. If AVX512F is enabled then try vectorizing with 512bit,
49600 256bit and 128bit vectors. */
49602 static unsigned int
49603 ix86_autovectorize_vector_sizes (void)
49605 return TARGET_AVX512F ? 64 | 32 | 16 :
49606 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
49609 /* Implementation of targetm.vectorize.get_mask_mode. */
49611 static machine_mode
49612 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
49614 unsigned elem_size = vector_size / nunits;
49616 /* Scalar mask case. */
49617 if ((TARGET_AVX512F && vector_size == 64)
49618 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
49620 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
49621 return smallest_mode_for_size (nunits, MODE_INT);
49624 machine_mode elem_mode
49625 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
49627 gcc_assert (elem_size * nunits == vector_size);
49629 return mode_for_vector (elem_mode, nunits);
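/* For example, a 64-byte V16SF vector with AVX512F gets a 16-bit scalar
   mask (one bit per element, HImode above), whereas a 32-byte V8SF vector
   without AVX512VL falls back to a V8SI-style vector mask whose element
   width matches the data elements.  */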
49634 /* Return class of registers which could be used for pseudo of MODE
49635 and of class RCLASS for spilling instead of memory. Return NO_REGS
49636 if it is not possible or non-profitable. */
49638 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
49640 static reg_class_t
49641 ix86_spill_class (reg_class_t rclass, machine_mode mode)
49643 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
49644 && TARGET_SSE2
49645 && TARGET_INTER_UNIT_MOVES_TO_VEC
49646 && TARGET_INTER_UNIT_MOVES_FROM_VEC
49647 && (mode == SImode || (TARGET_64BIT && mode == DImode))
49648 && INTEGER_CLASS_P (rclass))
49649 return ALL_SSE_REGS;
49650 return NO_REGS;
49653 /* Implement targetm.vectorize.init_cost. */
49655 static void *
49656 ix86_init_cost (struct loop *)
49658 unsigned *cost = XNEWVEC (unsigned, 3);
49659 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49660 return cost;
49663 /* Implement targetm.vectorize.add_stmt_cost. */
49665 static unsigned
49666 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49667 struct _stmt_vec_info *stmt_info, int misalign,
49668 enum vect_cost_model_location where)
49670 unsigned *cost = (unsigned *) data;
49671 unsigned retval = 0;
49673 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49674 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49676 /* Penalize DFmode vector operations for !TARGET_VECTORIZE_DOUBLE. */
49677 if (kind == vector_stmt && !TARGET_VECTORIZE_DOUBLE
49678 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49679 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49681 /* Statements in an inner loop relative to the loop being
49682 vectorized are weighted more heavily. The value here is
49683 arbitrary and could potentially be improved with analysis. */
49684 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49685 count *= 50; /* FIXME. */
49687 retval = (unsigned) (count * stmt_cost);
49689 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
49690 for Silvermont, as it has an out-of-order integer pipeline and can execute
49691 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49692 if (TARGET_SILVERMONT || TARGET_INTEL)
49693 if (stmt_info && stmt_info->stmt)
49695 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49696 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49697 retval = (retval * 17) / 10;
49700 cost[where] += retval;
49702 return retval;
49705 /* Implement targetm.vectorize.finish_cost. */
49707 static void
49708 ix86_finish_cost (void *data, unsigned *prologue_cost,
49709 unsigned *body_cost, unsigned *epilogue_cost)
49711 unsigned *cost = (unsigned *) data;
49712 *prologue_cost = cost[vect_prologue];
49713 *body_cost = cost[vect_body];
49714 *epilogue_cost = cost[vect_epilogue];
49717 /* Implement targetm.vectorize.destroy_cost_data. */
49719 static void
49720 ix86_destroy_cost_data (void *data)
49722 free (data);
49725 /* Validate target specific memory model bits in VAL. */
49727 static unsigned HOST_WIDE_INT
49728 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49730 enum memmodel model = memmodel_from_int (val);
49731 bool strong;
49733 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49734 |MEMMODEL_MASK)
49735 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49737 warning (OPT_Winvalid_memory_model,
49738 "Unknown architecture specific memory model");
49739 return MEMMODEL_SEQ_CST;
49741 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49742 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49744 warning (OPT_Winvalid_memory_model,
49745 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49746 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49748 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49750 warning (OPT_Winvalid_memory_model,
49751 "HLE_RELEASE not used with RELEASE or stronger memory model");
49752 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49754 return val;
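/* As an example of the checks above, user code pairs the HLE hint with a
   compatible C11 memory model, e.g. for a plain int lock:

     while (__atomic_exchange_n (&lock, 1,
                                 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     ...critical section...
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Using __ATOMIC_HLE_ACQUIRE without at least an acquire model (or
   __ATOMIC_HLE_RELEASE without at least a release model) is diagnosed
   above and falls back to SEQ_CST plus the requested HLE bit.  */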
49757 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49758 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49759 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49760 or number of vecsize_mangle variants that should be emitted. */
49762 static int
49763 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49764 struct cgraph_simd_clone *clonei,
49765 tree base_type, int num)
49767 int ret = 1;
49769 if (clonei->simdlen
49770 && (clonei->simdlen < 2
49771 || clonei->simdlen > 1024
49772 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49774 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49775 "unsupported simdlen %d", clonei->simdlen);
49776 return 0;
49779 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49780 if (TREE_CODE (ret_type) != VOID_TYPE)
49781 switch (TYPE_MODE (ret_type))
49783 case QImode:
49784 case HImode:
49785 case SImode:
49786 case DImode:
49787 case SFmode:
49788 case DFmode:
49789 /* case SCmode: */
49790 /* case DCmode: */
49791 break;
49792 default:
49793 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49794 "unsupported return type %qT for simd\n", ret_type);
49795 return 0;
49798 tree t;
49799 int i;
49801 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49802 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49803 switch (TYPE_MODE (TREE_TYPE (t)))
49805 case QImode:
49806 case HImode:
49807 case SImode:
49808 case DImode:
49809 case SFmode:
49810 case DFmode:
49811 /* case SCmode: */
49812 /* case DCmode: */
49813 break;
49814 default:
49815 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49816 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49817 return 0;
49820 if (clonei->cilk_elemental)
49822 /* Parse the processor clause here. If not present, default to 'b'. */
49823 clonei->vecsize_mangle = 'b';
49825 else if (!TREE_PUBLIC (node->decl))
49827 /* If the function isn't exported, we can pick up just one ISA
49828 for the clones. */
49829 if (TARGET_AVX512F)
49830 clonei->vecsize_mangle = 'e';
49831 else if (TARGET_AVX2)
49832 clonei->vecsize_mangle = 'd';
49833 else if (TARGET_AVX)
49834 clonei->vecsize_mangle = 'c';
49835 else
49836 clonei->vecsize_mangle = 'b';
49837 ret = 1;
49839 else
49841 clonei->vecsize_mangle = "bcde"[num];
49842 ret = 4;
49844 clonei->mask_mode = VOIDmode;
49845 switch (clonei->vecsize_mangle)
49847 case 'b':
49848 clonei->vecsize_int = 128;
49849 clonei->vecsize_float = 128;
49850 break;
49851 case 'c':
49852 clonei->vecsize_int = 128;
49853 clonei->vecsize_float = 256;
49854 break;
49855 case 'd':
49856 clonei->vecsize_int = 256;
49857 clonei->vecsize_float = 256;
49858 break;
49859 case 'e':
49860 clonei->vecsize_int = 512;
49861 clonei->vecsize_float = 512;
49862 if (TYPE_MODE (base_type) == QImode)
49863 clonei->mask_mode = DImode;
49864 else
49865 clonei->mask_mode = SImode;
49866 break;
49868 if (clonei->simdlen == 0)
49870 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49871 clonei->simdlen = clonei->vecsize_int;
49872 else
49873 clonei->simdlen = clonei->vecsize_float;
49874 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49876 else if (clonei->simdlen > 16)
49878 /* For compatibility with ICC, use the same upper bounds
49879 for simdlen. In particular, for CTYPE below, use the return type,
49880 unless the function returns void, in which case use the characteristic
49881 type. If it is possible for the given SIMDLEN to pass a CTYPE value
49882 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49883 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
49884 emit the corresponding clone. */
49885 tree ctype = ret_type;
49886 if (TREE_CODE (ret_type) == VOID_TYPE)
49887 ctype = base_type;
49888 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49889 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49890 cnt /= clonei->vecsize_int;
49891 else
49892 cnt /= clonei->vecsize_float;
49893 if (cnt > (TARGET_64BIT ? 16 : 8))
49895 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49896 "unsupported simdlen %d", clonei->simdlen);
49897 return 0;
49900 return ret;
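/* For illustration, given user code such as

     #pragma omp declare simd
     double f (double x);

   an exported F is cloned for all four mangles computed above: 'b'
   (SSE2, 128-bit vectors), 'c' (AVX), 'd' (AVX2) and 'e' (AVX512F,
   512-bit vectors), each with simdlen = vecsize_float / 64 for the
   double characteristic type, while a function that is not exported
   gets a single clone for the best ISA enabled at compile time.  */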
49903 /* Add target attribute to SIMD clone NODE if needed. */
49905 static void
49906 ix86_simd_clone_adjust (struct cgraph_node *node)
49908 const char *str = NULL;
49909 gcc_assert (node->decl == cfun->decl);
49910 switch (node->simdclone->vecsize_mangle)
49912 case 'b':
49913 if (!TARGET_SSE2)
49914 str = "sse2";
49915 break;
49916 case 'c':
49917 if (!TARGET_AVX)
49918 str = "avx";
49919 break;
49920 case 'd':
49921 if (!TARGET_AVX2)
49922 str = "avx2";
49923 break;
49924 case 'e':
49925 if (!TARGET_AVX512F)
49926 str = "avx512f";
49927 break;
49928 default:
49929 gcc_unreachable ();
49931 if (str == NULL)
49932 return;
49933 push_cfun (NULL);
49934 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49935 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49936 gcc_assert (ok);
49937 pop_cfun ();
49938 ix86_reset_previous_fndecl ();
49939 ix86_set_current_function (node->decl);
49942 /* If SIMD clone NODE can't be used in a vectorized loop
49943 in current function, return -1, otherwise return a badness of using it
49944 (0 if it is most desirable from vecsize_mangle point of view, 1
49945 slightly less desirable, etc.). */
49947 static int
49948 ix86_simd_clone_usable (struct cgraph_node *node)
49950 switch (node->simdclone->vecsize_mangle)
49952 case 'b':
49953 if (!TARGET_SSE2)
49954 return -1;
49955 if (!TARGET_AVX)
49956 return 0;
49957 return TARGET_AVX2 ? 2 : 1;
49958 case 'c':
49959 if (!TARGET_AVX)
49960 return -1;
49961 return TARGET_AVX2 ? 1 : 0;
49963 case 'd':
49964 if (!TARGET_AVX2)
49965 return -1;
49966 return 0;
49967 case 'e':
49968 if (!TARGET_AVX512F)
49969 return -1;
49970 return 0;
49971 default:
49972 gcc_unreachable ();
49976 /* This function adjusts the unroll factor based on
49977 the hardware capabilities. For example, bdver3 has
49978 a loop buffer which makes unrolling of smaller
49979 loops less important. This function decides the
49980 unroll factor using the number of memory references
49981 (the value 32 is used) as a heuristic. */
49983 static unsigned
49984 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49986 basic_block *bbs;
49987 rtx_insn *insn;
49988 unsigned i;
49989 unsigned mem_count = 0;
49991 if (!TARGET_ADJUST_UNROLL)
49992 return nunroll;
49994 /* Count the number of memory references within the loop body.
49995 This value determines the unrolling factor for bdver3 and bdver4
49996 architectures. */
49997 subrtx_iterator::array_type array;
49998 bbs = get_loop_body (loop);
49999 for (i = 0; i < loop->num_nodes; i++)
50000 FOR_BB_INSNS (bbs[i], insn)
50001 if (NONDEBUG_INSN_P (insn))
50002 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50003 if (const_rtx x = *iter)
50004 if (MEM_P (x))
50006 machine_mode mode = GET_MODE (x);
50007 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50008 if (n_words > 4)
50009 mem_count += 2;
50010 else
50011 mem_count += 1;
50013 free (bbs);
50015 if (mem_count && mem_count <= 32)
50016 return 32 / mem_count;
50018 return nunroll;
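/* Worked example: a loop body with 8 word-sized memory references gives
   mem_count == 8, so the hook above returns an unroll factor of
   32 / 8 == 4; with no memory references, or more than 32 of them, the
   factor NUNROLL chosen by the generic unroller is returned unchanged.  */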
50022 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50024 static bool
50025 ix86_float_exceptions_rounding_supported_p (void)
50027 /* For x87 floating point with standard excess precision handling,
50028 there is no adddf3 pattern (since x87 floating point only has
50029 XFmode operations) so the default hook implementation gets this
50030 wrong. */
50031 return TARGET_80387 || TARGET_SSE_MATH;
50034 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50036 static void
50037 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50039 if (!TARGET_80387 && !TARGET_SSE_MATH)
50040 return;
50041 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50042 if (TARGET_80387)
50044 tree fenv_index_type = build_index_type (size_int (6));
50045 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50046 tree fenv_var = create_tmp_var_raw (fenv_type);
50047 TREE_ADDRESSABLE (fenv_var) = 1;
50048 tree fenv_ptr = build_pointer_type (fenv_type);
50049 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50050 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50051 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50052 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50053 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50054 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50055 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50056 tree hold_fnclex = build_call_expr (fnclex, 0);
50057 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50058 NULL_TREE, NULL_TREE);
50059 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50060 hold_fnclex);
50061 *clear = build_call_expr (fnclex, 0);
50062 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50063 tree fnstsw_call = build_call_expr (fnstsw, 0);
50064 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50065 sw_var, fnstsw_call);
50066 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50067 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50068 exceptions_var, exceptions_x87);
50069 *update = build2 (COMPOUND_EXPR, integer_type_node,
50070 sw_mod, update_mod);
50071 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50072 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50074 if (TARGET_SSE_MATH)
50076 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50077 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50078 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50079 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50080 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50081 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50082 mxcsr_orig_var, stmxcsr_hold_call);
50083 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50084 mxcsr_orig_var,
50085 build_int_cst (unsigned_type_node, 0x1f80));
50086 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50087 build_int_cst (unsigned_type_node, 0xffffffc0));
50088 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50089 mxcsr_mod_var, hold_mod_val);
50090 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50091 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50092 hold_assign_orig, hold_assign_mod);
50093 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50094 ldmxcsr_hold_call);
50095 if (*hold)
50096 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50097 else
50098 *hold = hold_all;
50099 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50100 if (*clear)
50101 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50102 ldmxcsr_clear_call);
50103 else
50104 *clear = ldmxcsr_clear_call;
50105 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50106 tree exceptions_sse = fold_convert (integer_type_node,
50107 stxmcsr_update_call);
50108 if (*update)
50110 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50111 exceptions_var, exceptions_sse);
50112 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50113 exceptions_var, exceptions_mod);
50114 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50115 exceptions_assign);
50117 else
50118 *update = build2 (MODIFY_EXPR, integer_type_node,
50119 exceptions_var, exceptions_sse);
50120 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50121 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50122 ldmxcsr_update_call);
50124 tree atomic_feraiseexcept
50125 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50126 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50127 1, exceptions_var);
50128 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50129 atomic_feraiseexcept_call);
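/* The hold/clear/update triple built above is used by the C front end
   when expanding atomic compound assignment on floating-point operands,
   e.g.

     _Atomic double d;
     ...
     d += 1.0;

   which is compiled as a compare-and-swap loop: the x87 environment
   and/or MXCSR state is saved and the exception flags cleared before
   each attempt (hold/clear), and the exceptions raised by the successful
   iteration are re-raised via __atomic_feraiseexcept and the original
   environment restored afterwards (update).  */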
50132 /* Return mode to be used for bounds or VOIDmode
50133 if bounds are not supported. */
50135 static enum machine_mode
50136 ix86_mpx_bound_mode ()
50138 /* Do not support pointer checker if MPX
50139 is not enabled. */
50140 if (!TARGET_MPX)
50142 if (flag_check_pointer_bounds)
50143 warning (0, "Pointer Checker requires MPX support on this target."
50144 " Use -mmpx options to enable MPX.");
50145 return VOIDmode;
50148 return BNDmode;
50151 /* Return constant used to statically initialize constant bounds.
50153 This function is used to create special bound values. For now
50154 only INIT bounds and NONE bounds are expected. More special
50155 values may be added later. */
50157 static tree
50158 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50160 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50161 : build_zero_cst (pointer_sized_int_node);
50162 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50163 : build_minus_one_cst (pointer_sized_int_node);
50165 /* This function is supposed to be used to create INIT and
50166 NONE bounds only. */
50167 gcc_assert ((lb == 0 && ub == -1)
50168 || (lb == -1 && ub == 0));
50170 return build_complex (NULL, low, high);
50173 /* Generate a list of statements STMTS to initialize pointer bounds
50174 variable VAR with bounds LB and UB. Return the number of generated
50175 statements. */
50177 static int
50178 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50180 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50181 tree lhs, modify, var_p;
50183 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50184 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50186 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50187 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50188 append_to_statement_list (modify, stmts);
50190 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50191 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50192 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50193 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50194 append_to_statement_list (modify, stmts);
50196 return 2;
50199 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50200 /* For i386, a common symbol is local only for non-PIE binaries. For
50201 x86-64, a common symbol is local only for non-PIE binaries or when the
50202 linker supports copy relocations in PIE binaries. */
50204 static bool
50205 ix86_binds_local_p (const_tree exp)
50207 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50208 (!flag_pic
50209 || (TARGET_64BIT
50210 && HAVE_LD_PIE_COPYRELOC != 0)));
50212 #endif
50214 /* If MEM is in the form of [base+offset], extract the two parts
50215 of the address into BASE and OFFSET and return true; otherwise return false. */
50217 static bool
50218 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50220 rtx addr;
50222 gcc_assert (MEM_P (mem));
50224 addr = XEXP (mem, 0);
50226 if (GET_CODE (addr) == CONST)
50227 addr = XEXP (addr, 0);
50229 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50231 *base = addr;
50232 *offset = const0_rtx;
50233 return true;
50236 if (GET_CODE (addr) == PLUS
50237 && (REG_P (XEXP (addr, 0))
50238 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50239 && CONST_INT_P (XEXP (addr, 1)))
50241 *base = XEXP (addr, 0);
50242 *offset = XEXP (addr, 1);
50243 return true;
50246 return false;
50249 /* Given OPERANDS of consecutive load/store, check if we can merge
50250 them into move multiple. LOAD is true if they are load instructions.
50251 MODE is the mode of memory operands. */
50253 bool
50254 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50255 enum machine_mode mode)
50257 HOST_WIDE_INT offval_1, offval_2, msize;
50258 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50260 if (load)
50262 mem_1 = operands[1];
50263 mem_2 = operands[3];
50264 reg_1 = operands[0];
50265 reg_2 = operands[2];
50267 else
50269 mem_1 = operands[0];
50270 mem_2 = operands[2];
50271 reg_1 = operands[1];
50272 reg_2 = operands[3];
50275 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50277 if (REGNO (reg_1) != REGNO (reg_2))
50278 return false;
50280 /* Check if the addresses are in the form of [base+offset]. */
50281 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50282 return false;
50283 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
50284 return false;
50286 /* Check if the bases are the same. */
50287 if (!rtx_equal_p (base_1, base_2))
50288 return false;
50290 offval_1 = INTVAL (offset_1);
50291 offval_2 = INTVAL (offset_2);
50292 msize = GET_MODE_SIZE (mode);
50293 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
50294 if (offval_1 + msize != offval_2)
50295 return false;
50297 return true;
50300 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
50302 static bool
50303 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
50304 optimization_type opt_type)
50306 switch (op)
50308 case asin_optab:
50309 case acos_optab:
50310 case log1p_optab:
50311 case exp_optab:
50312 case exp10_optab:
50313 case exp2_optab:
50314 case expm1_optab:
50315 case ldexp_optab:
50316 case scalb_optab:
50317 case round_optab:
50318 return opt_type == OPTIMIZE_FOR_SPEED;
50320 case rint_optab:
50321 if (SSE_FLOAT_MODE_P (mode1)
50322 && TARGET_SSE_MATH
50323 && !flag_trapping_math
50324 && !TARGET_ROUND)
50325 return opt_type == OPTIMIZE_FOR_SPEED;
50326 return true;
50328 case floor_optab:
50329 case ceil_optab:
50330 case btrunc_optab:
50331 if (SSE_FLOAT_MODE_P (mode1)
50332 && TARGET_SSE_MATH
50333 && !flag_trapping_math
50334 && TARGET_ROUND)
50335 return true;
50336 return opt_type == OPTIMIZE_FOR_SPEED;
50338 case rsqrt_optab:
50339 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
50341 default:
50342 return true;
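/* In practice the switch above means, for instance, that floor () on an
   SSE double is treated as having an inline expansion at any optimization
   level when SSE4.1 ROUNDSD is available and -fno-trapping-math is in
   effect, but only when optimizing for speed otherwise, and that the
   expensive x87-based expansions of asin, exp2 and friends are used only
   when optimizing for speed.  */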
50346 /* Address space support.
50348 This is not "far pointers" in the 16-bit sense, but an easy way
50349 to use %fs and %gs segment prefixes. Therefore:
50351 (a) All address spaces have the same modes,
50352 (b) All address spaces have the same address forms,
50353 (c) While %fs and %gs are technically subsets of the generic
50354 address space, they are probably not subsets of each other.
50355 (d) Since we have no access to the segment base register values
50356 without resorting to a system call, we cannot convert a
50357 non-default address space to a default address space.
50358 Therefore we do not claim %fs or %gs are subsets of generic.
50360 Therefore we can (mostly) use the default hooks. */
50362 /* All use of segmentation is assumed to make address 0 valid. */
50364 static bool
50365 ix86_addr_space_zero_address_valid (addr_space_t as)
50367 return as != ADDR_SPACE_GENERIC;
50369 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50370 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
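/* For reference, these address spaces are exposed to C as the __seg_fs
   and __seg_gs qualifiers, e.g.

     int
     read_gs_slot (__seg_gs int *p)
     {
       return *p;
     }

   which is emitted as a %gs-prefixed load, and, per the hook above,
   dereferencing address 0 in such a space is not assumed to be invalid,
   since it may be a legitimate offset from the segment base.  */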
50372 /* Initialize the GCC target structure. */
50373 #undef TARGET_RETURN_IN_MEMORY
50374 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50376 #undef TARGET_LEGITIMIZE_ADDRESS
50377 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50379 #undef TARGET_ATTRIBUTE_TABLE
50380 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50381 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50382 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50383 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50384 # undef TARGET_MERGE_DECL_ATTRIBUTES
50385 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50386 #endif
50388 #undef TARGET_COMP_TYPE_ATTRIBUTES
50389 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50391 #undef TARGET_INIT_BUILTINS
50392 #define TARGET_INIT_BUILTINS ix86_init_builtins
50393 #undef TARGET_BUILTIN_DECL
50394 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50395 #undef TARGET_EXPAND_BUILTIN
50396 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50398 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50399 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50400 ix86_builtin_vectorized_function
50402 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50403 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50405 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50406 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50408 #undef TARGET_BUILTIN_RECIPROCAL
50409 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50411 #undef TARGET_ASM_FUNCTION_EPILOGUE
50412 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50414 #undef TARGET_ENCODE_SECTION_INFO
50415 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50416 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50417 #else
50418 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50419 #endif
50421 #undef TARGET_ASM_OPEN_PAREN
50422 #define TARGET_ASM_OPEN_PAREN ""
50423 #undef TARGET_ASM_CLOSE_PAREN
50424 #define TARGET_ASM_CLOSE_PAREN ""
50426 #undef TARGET_ASM_BYTE_OP
50427 #define TARGET_ASM_BYTE_OP ASM_BYTE
50429 #undef TARGET_ASM_ALIGNED_HI_OP
50430 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50431 #undef TARGET_ASM_ALIGNED_SI_OP
50432 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50433 #ifdef ASM_QUAD
50434 #undef TARGET_ASM_ALIGNED_DI_OP
50435 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50436 #endif
50438 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50439 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50441 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50442 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50444 #undef TARGET_ASM_UNALIGNED_HI_OP
50445 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50446 #undef TARGET_ASM_UNALIGNED_SI_OP
50447 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50448 #undef TARGET_ASM_UNALIGNED_DI_OP
50449 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50451 #undef TARGET_PRINT_OPERAND
50452 #define TARGET_PRINT_OPERAND ix86_print_operand
50453 #undef TARGET_PRINT_OPERAND_ADDRESS
50454 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50455 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50456 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50457 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50458 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50460 #undef TARGET_SCHED_INIT_GLOBAL
50461 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50462 #undef TARGET_SCHED_ADJUST_COST
50463 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50464 #undef TARGET_SCHED_ISSUE_RATE
50465 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50466 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50467 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50468 ia32_multipass_dfa_lookahead
50469 #undef TARGET_SCHED_MACRO_FUSION_P
50470 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50471 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50472 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50474 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50475 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50477 #undef TARGET_MEMMODEL_CHECK
50478 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50480 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50481 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50483 #ifdef HAVE_AS_TLS
50484 #undef TARGET_HAVE_TLS
50485 #define TARGET_HAVE_TLS true
50486 #endif
50487 #undef TARGET_CANNOT_FORCE_CONST_MEM
50488 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50489 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50490 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50492 #undef TARGET_DELEGITIMIZE_ADDRESS
50493 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50495 #undef TARGET_MS_BITFIELD_LAYOUT_P
50496 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50498 #if TARGET_MACHO
50499 #undef TARGET_BINDS_LOCAL_P
50500 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50501 #else
50502 #undef TARGET_BINDS_LOCAL_P
50503 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50504 #endif
50505 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50506 #undef TARGET_BINDS_LOCAL_P
50507 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50508 #endif
50510 #undef TARGET_ASM_OUTPUT_MI_THUNK
50511 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50512 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50513 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50515 #undef TARGET_ASM_FILE_START
50516 #define TARGET_ASM_FILE_START x86_file_start
50518 #undef TARGET_OPTION_OVERRIDE
50519 #define TARGET_OPTION_OVERRIDE ix86_option_override
50521 #undef TARGET_REGISTER_MOVE_COST
50522 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50523 #undef TARGET_MEMORY_MOVE_COST
50524 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50525 #undef TARGET_RTX_COSTS
50526 #define TARGET_RTX_COSTS ix86_rtx_costs
50527 #undef TARGET_ADDRESS_COST
50528 #define TARGET_ADDRESS_COST ix86_address_cost
50530 #undef TARGET_FIXED_CONDITION_CODE_REGS
50531 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50532 #undef TARGET_CC_MODES_COMPATIBLE
50533 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50535 #undef TARGET_MACHINE_DEPENDENT_REORG
50536 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50538 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50539 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50541 #undef TARGET_BUILD_BUILTIN_VA_LIST
50542 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50544 #undef TARGET_FOLD_BUILTIN
50545 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50547 #undef TARGET_COMPARE_VERSION_PRIORITY
50548 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50550 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50551 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50552 ix86_generate_version_dispatcher_body
50554 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50555 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50556 ix86_get_function_versions_dispatcher
50558 #undef TARGET_ENUM_VA_LIST_P
50559 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50561 #undef TARGET_FN_ABI_VA_LIST
50562 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50564 #undef TARGET_CANONICAL_VA_LIST_TYPE
50565 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50567 #undef TARGET_EXPAND_BUILTIN_VA_START
50568 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50570 #undef TARGET_MD_ASM_ADJUST
50571 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50573 #undef TARGET_PROMOTE_PROTOTYPES
50574 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50575 #undef TARGET_SETUP_INCOMING_VARARGS
50576 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50577 #undef TARGET_MUST_PASS_IN_STACK
50578 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50579 #undef TARGET_FUNCTION_ARG_ADVANCE
50580 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50581 #undef TARGET_FUNCTION_ARG
50582 #define TARGET_FUNCTION_ARG ix86_function_arg
50583 #undef TARGET_INIT_PIC_REG
50584 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50585 #undef TARGET_USE_PSEUDO_PIC_REG
50586 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50587 #undef TARGET_FUNCTION_ARG_BOUNDARY
50588 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50589 #undef TARGET_PASS_BY_REFERENCE
50590 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50591 #undef TARGET_INTERNAL_ARG_POINTER
50592 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50593 #undef TARGET_UPDATE_STACK_BOUNDARY
50594 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50595 #undef TARGET_GET_DRAP_RTX
50596 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50597 #undef TARGET_STRICT_ARGUMENT_NAMING
50598 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50599 #undef TARGET_STATIC_CHAIN
50600 #define TARGET_STATIC_CHAIN ix86_static_chain
50601 #undef TARGET_TRAMPOLINE_INIT
50602 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50603 #undef TARGET_RETURN_POPS_ARGS
50604 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50606 #undef TARGET_LEGITIMATE_COMBINED_INSN
50607 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50609 #undef TARGET_ASAN_SHADOW_OFFSET
50610 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50612 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50613 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50615 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50616 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50618 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50619 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50621 #undef TARGET_C_MODE_FOR_SUFFIX
50622 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50624 #ifdef HAVE_AS_TLS
50625 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50626 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50627 #endif
50629 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50630 #undef TARGET_INSERT_ATTRIBUTES
50631 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50632 #endif
50634 #undef TARGET_MANGLE_TYPE
50635 #define TARGET_MANGLE_TYPE ix86_mangle_type
50637 #if !TARGET_MACHO
50638 #undef TARGET_STACK_PROTECT_FAIL
50639 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50640 #endif
50642 #undef TARGET_FUNCTION_VALUE
50643 #define TARGET_FUNCTION_VALUE ix86_function_value
50645 #undef TARGET_FUNCTION_VALUE_REGNO_P
50646 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50648 #undef TARGET_PROMOTE_FUNCTION_MODE
50649 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50651 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50652 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50654 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50655 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50657 #undef TARGET_INSTANTIATE_DECLS
50658 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50660 #undef TARGET_SECONDARY_RELOAD
50661 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50663 #undef TARGET_CLASS_MAX_NREGS
50664 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50666 #undef TARGET_PREFERRED_RELOAD_CLASS
50667 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50668 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50669 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50670 #undef TARGET_CLASS_LIKELY_SPILLED_P
50671 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50673 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50674 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50675 ix86_builtin_vectorization_cost
50676 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50677 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50678 ix86_vectorize_vec_perm_const_ok
50679 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50680 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50681 ix86_preferred_simd_mode
50682 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50683 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50684 ix86_autovectorize_vector_sizes
50685 #undef TARGET_VECTORIZE_GET_MASK_MODE
50686 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50687 #undef TARGET_VECTORIZE_INIT_COST
50688 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50689 #undef TARGET_VECTORIZE_ADD_STMT_COST
50690 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50691 #undef TARGET_VECTORIZE_FINISH_COST
50692 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50693 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50694 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50696 #undef TARGET_SET_CURRENT_FUNCTION
50697 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50699 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50700 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50702 #undef TARGET_OPTION_SAVE
50703 #define TARGET_OPTION_SAVE ix86_function_specific_save
50705 #undef TARGET_OPTION_RESTORE
50706 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50708 #undef TARGET_OPTION_POST_STREAM_IN
50709 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50711 #undef TARGET_OPTION_PRINT
50712 #define TARGET_OPTION_PRINT ix86_function_specific_print
50714 #undef TARGET_OPTION_FUNCTION_VERSIONS
50715 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
50717 #undef TARGET_CAN_INLINE_P
50718 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50720 #undef TARGET_LEGITIMATE_ADDRESS_P
50721 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50723 #undef TARGET_REGISTER_PRIORITY
50724 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50726 #undef TARGET_REGISTER_USAGE_LEVELING_P
50727 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50729 #undef TARGET_LEGITIMATE_CONSTANT_P
50730 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50732 #undef TARGET_FRAME_POINTER_REQUIRED
50733 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50735 #undef TARGET_CAN_ELIMINATE
50736 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50738 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50739 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50741 #undef TARGET_ASM_CODE_END
50742 #define TARGET_ASM_CODE_END ix86_code_end
50744 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50745 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50747 #if TARGET_MACHO
50748 #undef TARGET_INIT_LIBFUNCS
50749 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
50750 #endif
50752 #undef TARGET_LOOP_UNROLL_ADJUST
50753 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50755 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50756 #undef TARGET_SPILL_CLASS
50757 #define TARGET_SPILL_CLASS ix86_spill_class
50759 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50760 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50761 ix86_simd_clone_compute_vecsize_and_simdlen
50763 #undef TARGET_SIMD_CLONE_ADJUST
50764 #define TARGET_SIMD_CLONE_ADJUST \
50765 ix86_simd_clone_adjust
50767 #undef TARGET_SIMD_CLONE_USABLE
50768 #define TARGET_SIMD_CLONE_USABLE \
50769 ix86_simd_clone_usable
50771 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50772 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50773 ix86_float_exceptions_rounding_supported_p
50775 #undef TARGET_MODE_EMIT
50776 #define TARGET_MODE_EMIT ix86_emit_mode_set
50778 #undef TARGET_MODE_NEEDED
50779 #define TARGET_MODE_NEEDED ix86_mode_needed
50781 #undef TARGET_MODE_AFTER
50782 #define TARGET_MODE_AFTER ix86_mode_after
50784 #undef TARGET_MODE_ENTRY
50785 #define TARGET_MODE_ENTRY ix86_mode_entry
50787 #undef TARGET_MODE_EXIT
50788 #define TARGET_MODE_EXIT ix86_mode_exit
50790 #undef TARGET_MODE_PRIORITY
50791 #define TARGET_MODE_PRIORITY ix86_mode_priority
50793 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50794 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50796 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50797 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50799 #undef TARGET_STORE_BOUNDS_FOR_ARG
50800 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50802 #undef TARGET_LOAD_RETURNED_BOUNDS
50803 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50805 #undef TARGET_STORE_RETURNED_BOUNDS
50806 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50808 #undef TARGET_CHKP_BOUND_MODE
50809 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50811 #undef TARGET_BUILTIN_CHKP_FUNCTION
50812 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50814 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50815 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50817 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50818 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50820 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50821 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50823 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50824 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50826 #undef TARGET_OFFLOAD_OPTIONS
50827 #define TARGET_OFFLOAD_OPTIONS \
50828 ix86_offload_options
50830 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50831 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50833 #undef TARGET_OPTAB_SUPPORTED_P
50834 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50836 #undef TARGET_HARD_REGNO_SCRATCH_OK
50837 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50839 struct gcc_target targetm = TARGET_INITIALIZER;
50841 #include "gt-i386.h"