gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2016 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
86 /* This file should be included last. */
87 #include "target-def.h"
89 static rtx legitimize_dllimport_symbol (rtx, bool);
90 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
91 static rtx legitimize_pe_coff_symbol (rtx, bool);
92 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
94 #ifndef CHECK_STACK_LIMIT
95 #define CHECK_STACK_LIMIT (-1)
96 #endif
98 /* Return index of given mode in mult and division cost tables. */
99 #define MODE_INDEX(mode) \
100 ((mode) == QImode ? 0 \
101 : (mode) == HImode ? 1 \
102 : (mode) == SImode ? 2 \
103 : (mode) == DImode ? 3 \
104 : 4)
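/* Illustrative sketch (not part of the original file): MODE_INDEX is how the
   per-mode arrays in struct processor_costs below are subscripted, assuming
   the mult_init/divide fields declared for it in i386.h.  QImode..DImode map
   to slots 0..3 and every other mode falls into the "other" slot 4, so a
   hypothetical helper that looks up the multiply start cost would read:  */

static inline int
example_mult_init_cost (const struct processor_costs *costs, machine_mode mode)
{
  /* For SImode this evaluates to costs->mult_init[2].  */
  return costs->mult_init[MODE_INDEX (mode)];
}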
106 /* Processor costs (relative to an add) */
107 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
108 #define COSTS_N_BYTES(N) ((N) * 2)
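/* Worked example (a sketch based on the assumption stated above): with
   COSTS_N_INSNS (N) == (N) * 4, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
   i.e. when tuning for size a 2-byte add carries the same weight as one
   generic instruction does when tuning for speed, and COSTS_N_BYTES (3) == 6
   is 1.5 times that unit.  */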
110 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
112 static stringop_algs ix86_size_memcpy[2] = {
113 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
114 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
115 static stringop_algs ix86_size_memset[2] = {
116 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
117 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
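/* Illustrative sketch (not part of the original file): each stringop_algs
   value pairs a default algorithm for blocks of unknown size with a table of
   {max, alg, noalign} entries, and the two-element arrays above hold the
   32-bit tuning in slot [0] and the 64-bit tuning in slot [1].  A
   hypothetical reader of such a table (assuming the field names declared in
   i386.h) would take the first entry whose max covers the block size, with
   -1 meaning "no upper bound":  */
#if 0 /* Example only; the real selection is done by decide_alg later in
	 this file.  */
static enum stringop_alg
example_pick_stringop_alg (const stringop_algs *algs, HOST_WIDE_INT size)
{
  for (int i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || size <= algs->size[i].max)
      return algs->size[i].alg;
  return algs->unknown_size;
}
#endif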
119 const
120 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
121 COSTS_N_BYTES (2), /* cost of an add instruction */
122 COSTS_N_BYTES (3), /* cost of a lea instruction */
123 COSTS_N_BYTES (2), /* variable shift costs */
124 COSTS_N_BYTES (3), /* constant shift costs */
125 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
126 COSTS_N_BYTES (3), /* HI */
127 COSTS_N_BYTES (3), /* SI */
128 COSTS_N_BYTES (3), /* DI */
129 COSTS_N_BYTES (5)}, /* other */
130 0, /* cost of multiply per each bit set */
131 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
132 COSTS_N_BYTES (3), /* HI */
133 COSTS_N_BYTES (3), /* SI */
134 COSTS_N_BYTES (3), /* DI */
135 COSTS_N_BYTES (5)}, /* other */
136 COSTS_N_BYTES (3), /* cost of movsx */
137 COSTS_N_BYTES (3), /* cost of movzx */
138 0, /* "large" insn */
139 2, /* MOVE_RATIO */
140 2, /* cost for loading QImode using movzbl */
141 {2, 2, 2}, /* cost of loading integer registers
142 in QImode, HImode and SImode.
143 Relative to reg-reg move (2). */
144 {2, 2, 2}, /* cost of storing integer registers */
145 2, /* cost of reg,reg fld/fst */
146 {2, 2, 2}, /* cost of loading fp registers
147 in SFmode, DFmode and XFmode */
148 {2, 2, 2}, /* cost of storing fp registers
149 in SFmode, DFmode and XFmode */
150 3, /* cost of moving MMX register */
151 {3, 3}, /* cost of loading MMX registers
152 in SImode and DImode */
153 {3, 3}, /* cost of storing MMX registers
154 in SImode and DImode */
155 3, /* cost of moving SSE register */
156 {3, 3, 3}, /* cost of loading SSE registers
157 in SImode, DImode and TImode */
158 {3, 3, 3}, /* cost of storing SSE registers
159 in SImode, DImode and TImode */
160 3, /* MMX or SSE register to integer */
161 0, /* size of l1 cache */
162 0, /* size of l2 cache */
163 0, /* size of prefetch block */
164 0, /* number of parallel prefetches */
165 2, /* Branch cost */
166 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
167 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
168 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
169 COSTS_N_BYTES (2), /* cost of FABS instruction. */
170 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
171 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
172 ix86_size_memcpy,
173 ix86_size_memset,
174 1, /* scalar_stmt_cost. */
175 1, /* scalar load_cost. */
176 1, /* scalar_store_cost. */
177 1, /* vec_stmt_cost. */
178 1, /* vec_to_scalar_cost. */
179 1, /* scalar_to_vec_cost. */
180 1, /* vec_align_load_cost. */
181 1, /* vec_unalign_load_cost. */
182 1, /* vec_store_cost. */
183 1, /* cond_taken_branch_cost. */
184 1, /* cond_not_taken_branch_cost. */
185 };
187 /* Processor costs (relative to an add) */
188 static stringop_algs i386_memcpy[2] = {
189 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
190 DUMMY_STRINGOP_ALGS};
191 static stringop_algs i386_memset[2] = {
192 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
193 DUMMY_STRINGOP_ALGS};
195 static const
196 struct processor_costs i386_cost = { /* 386 specific costs */
197 COSTS_N_INSNS (1), /* cost of an add instruction */
198 COSTS_N_INSNS (1), /* cost of a lea instruction */
199 COSTS_N_INSNS (3), /* variable shift costs */
200 COSTS_N_INSNS (2), /* constant shift costs */
201 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
202 COSTS_N_INSNS (6), /* HI */
203 COSTS_N_INSNS (6), /* SI */
204 COSTS_N_INSNS (6), /* DI */
205 COSTS_N_INSNS (6)}, /* other */
206 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
207 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
208 COSTS_N_INSNS (23), /* HI */
209 COSTS_N_INSNS (23), /* SI */
210 COSTS_N_INSNS (23), /* DI */
211 COSTS_N_INSNS (23)}, /* other */
212 COSTS_N_INSNS (3), /* cost of movsx */
213 COSTS_N_INSNS (2), /* cost of movzx */
214 15, /* "large" insn */
215 3, /* MOVE_RATIO */
216 4, /* cost for loading QImode using movzbl */
217 {2, 4, 2}, /* cost of loading integer registers
218 in QImode, HImode and SImode.
219 Relative to reg-reg move (2). */
220 {2, 4, 2}, /* cost of storing integer registers */
221 2, /* cost of reg,reg fld/fst */
222 {8, 8, 8}, /* cost of loading fp registers
223 in SFmode, DFmode and XFmode */
224 {8, 8, 8}, /* cost of storing fp registers
225 in SFmode, DFmode and XFmode */
226 2, /* cost of moving MMX register */
227 {4, 8}, /* cost of loading MMX registers
228 in SImode and DImode */
229 {4, 8}, /* cost of storing MMX registers
230 in SImode and DImode */
231 2, /* cost of moving SSE register */
232 {4, 8, 16}, /* cost of loading SSE registers
233 in SImode, DImode and TImode */
234 {4, 8, 16}, /* cost of storing SSE registers
235 in SImode, DImode and TImode */
236 3, /* MMX or SSE register to integer */
237 0, /* size of l1 cache */
238 0, /* size of l2 cache */
239 0, /* size of prefetch block */
240 0, /* number of parallel prefetches */
241 1, /* Branch cost */
242 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
243 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
244 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
245 COSTS_N_INSNS (22), /* cost of FABS instruction. */
246 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
247 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
248 i386_memcpy,
249 i386_memset,
250 1, /* scalar_stmt_cost. */
251 1, /* scalar load_cost. */
252 1, /* scalar_store_cost. */
253 1, /* vec_stmt_cost. */
254 1, /* vec_to_scalar_cost. */
255 1, /* scalar_to_vec_cost. */
256 1, /* vec_align_load_cost. */
257 2, /* vec_unalign_load_cost. */
258 1, /* vec_store_cost. */
259 3, /* cond_taken_branch_cost. */
260 1, /* cond_not_taken_branch_cost. */
261 };
263 static stringop_algs i486_memcpy[2] = {
264 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
265 DUMMY_STRINGOP_ALGS};
266 static stringop_algs i486_memset[2] = {
267 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
268 DUMMY_STRINGOP_ALGS};
270 static const
271 struct processor_costs i486_cost = { /* 486 specific costs */
272 COSTS_N_INSNS (1), /* cost of an add instruction */
273 COSTS_N_INSNS (1), /* cost of a lea instruction */
274 COSTS_N_INSNS (3), /* variable shift costs */
275 COSTS_N_INSNS (2), /* constant shift costs */
276 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
277 COSTS_N_INSNS (12), /* HI */
278 COSTS_N_INSNS (12), /* SI */
279 COSTS_N_INSNS (12), /* DI */
280 COSTS_N_INSNS (12)}, /* other */
281 1, /* cost of multiply per each bit set */
282 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
283 COSTS_N_INSNS (40), /* HI */
284 COSTS_N_INSNS (40), /* SI */
285 COSTS_N_INSNS (40), /* DI */
286 COSTS_N_INSNS (40)}, /* other */
287 COSTS_N_INSNS (3), /* cost of movsx */
288 COSTS_N_INSNS (2), /* cost of movzx */
289 15, /* "large" insn */
290 3, /* MOVE_RATIO */
291 4, /* cost for loading QImode using movzbl */
292 {2, 4, 2}, /* cost of loading integer registers
293 in QImode, HImode and SImode.
294 Relative to reg-reg move (2). */
295 {2, 4, 2}, /* cost of storing integer registers */
296 2, /* cost of reg,reg fld/fst */
297 {8, 8, 8}, /* cost of loading fp registers
298 in SFmode, DFmode and XFmode */
299 {8, 8, 8}, /* cost of storing fp registers
300 in SFmode, DFmode and XFmode */
301 2, /* cost of moving MMX register */
302 {4, 8}, /* cost of loading MMX registers
303 in SImode and DImode */
304 {4, 8}, /* cost of storing MMX registers
305 in SImode and DImode */
306 2, /* cost of moving SSE register */
307 {4, 8, 16}, /* cost of loading SSE registers
308 in SImode, DImode and TImode */
309 {4, 8, 16}, /* cost of storing SSE registers
310 in SImode, DImode and TImode */
311 3, /* MMX or SSE register to integer */
312 4, /* size of l1 cache. 486 has 8kB cache
313 shared for code and data, so 4kB is
314 not really precise. */
315 4, /* size of l2 cache */
316 0, /* size of prefetch block */
317 0, /* number of parallel prefetches */
318 1, /* Branch cost */
319 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
320 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
321 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
322 COSTS_N_INSNS (3), /* cost of FABS instruction. */
323 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
324 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
325 i486_memcpy,
326 i486_memset,
327 1, /* scalar_stmt_cost. */
328 1, /* scalar load_cost. */
329 1, /* scalar_store_cost. */
330 1, /* vec_stmt_cost. */
331 1, /* vec_to_scalar_cost. */
332 1, /* scalar_to_vec_cost. */
333 1, /* vec_align_load_cost. */
334 2, /* vec_unalign_load_cost. */
335 1, /* vec_store_cost. */
336 3, /* cond_taken_branch_cost. */
337 1, /* cond_not_taken_branch_cost. */
338 };
340 static stringop_algs pentium_memcpy[2] = {
341 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
342 DUMMY_STRINGOP_ALGS};
343 static stringop_algs pentium_memset[2] = {
344 {libcall, {{-1, rep_prefix_4_byte, false}}},
345 DUMMY_STRINGOP_ALGS};
347 static const
348 struct processor_costs pentium_cost = {
349 COSTS_N_INSNS (1), /* cost of an add instruction */
350 COSTS_N_INSNS (1), /* cost of a lea instruction */
351 COSTS_N_INSNS (4), /* variable shift costs */
352 COSTS_N_INSNS (1), /* constant shift costs */
353 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
354 COSTS_N_INSNS (11), /* HI */
355 COSTS_N_INSNS (11), /* SI */
356 COSTS_N_INSNS (11), /* DI */
357 COSTS_N_INSNS (11)}, /* other */
358 0, /* cost of multiply per each bit set */
359 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
360 COSTS_N_INSNS (25), /* HI */
361 COSTS_N_INSNS (25), /* SI */
362 COSTS_N_INSNS (25), /* DI */
363 COSTS_N_INSNS (25)}, /* other */
364 COSTS_N_INSNS (3), /* cost of movsx */
365 COSTS_N_INSNS (2), /* cost of movzx */
366 8, /* "large" insn */
367 6, /* MOVE_RATIO */
368 6, /* cost for loading QImode using movzbl */
369 {2, 4, 2}, /* cost of loading integer registers
370 in QImode, HImode and SImode.
371 Relative to reg-reg move (2). */
372 {2, 4, 2}, /* cost of storing integer registers */
373 2, /* cost of reg,reg fld/fst */
374 {2, 2, 6}, /* cost of loading fp registers
375 in SFmode, DFmode and XFmode */
376 {4, 4, 6}, /* cost of storing fp registers
377 in SFmode, DFmode and XFmode */
378 8, /* cost of moving MMX register */
379 {8, 8}, /* cost of loading MMX registers
380 in SImode and DImode */
381 {8, 8}, /* cost of storing MMX registers
382 in SImode and DImode */
383 2, /* cost of moving SSE register */
384 {4, 8, 16}, /* cost of loading SSE registers
385 in SImode, DImode and TImode */
386 {4, 8, 16}, /* cost of storing SSE registers
387 in SImode, DImode and TImode */
388 3, /* MMX or SSE register to integer */
389 8, /* size of l1 cache. */
390 8, /* size of l2 cache */
391 0, /* size of prefetch block */
392 0, /* number of parallel prefetches */
393 2, /* Branch cost */
394 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
395 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
396 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
397 COSTS_N_INSNS (1), /* cost of FABS instruction. */
398 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
399 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
400 pentium_memcpy,
401 pentium_memset,
402 1, /* scalar_stmt_cost. */
403 1, /* scalar load_cost. */
404 1, /* scalar_store_cost. */
405 1, /* vec_stmt_cost. */
406 1, /* vec_to_scalar_cost. */
407 1, /* scalar_to_vec_cost. */
408 1, /* vec_align_load_cost. */
409 2, /* vec_unalign_load_cost. */
410 1, /* vec_store_cost. */
411 3, /* cond_taken_branch_cost. */
412 1, /* cond_not_taken_branch_cost. */
413 };
415 static const
416 struct processor_costs lakemont_cost = {
417 COSTS_N_INSNS (1), /* cost of an add instruction */
418 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
419 COSTS_N_INSNS (1), /* variable shift costs */
420 COSTS_N_INSNS (1), /* constant shift costs */
421 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
422 COSTS_N_INSNS (11), /* HI */
423 COSTS_N_INSNS (11), /* SI */
424 COSTS_N_INSNS (11), /* DI */
425 COSTS_N_INSNS (11)}, /* other */
426 0, /* cost of multiply per each bit set */
427 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
428 COSTS_N_INSNS (25), /* HI */
429 COSTS_N_INSNS (25), /* SI */
430 COSTS_N_INSNS (25), /* DI */
431 COSTS_N_INSNS (25)}, /* other */
432 COSTS_N_INSNS (3), /* cost of movsx */
433 COSTS_N_INSNS (2), /* cost of movzx */
434 8, /* "large" insn */
435 17, /* MOVE_RATIO */
436 6, /* cost for loading QImode using movzbl */
437 {2, 4, 2}, /* cost of loading integer registers
438 in QImode, HImode and SImode.
439 Relative to reg-reg move (2). */
440 {2, 4, 2}, /* cost of storing integer registers */
441 2, /* cost of reg,reg fld/fst */
442 {2, 2, 6}, /* cost of loading fp registers
443 in SFmode, DFmode and XFmode */
444 {4, 4, 6}, /* cost of storing fp registers
445 in SFmode, DFmode and XFmode */
446 8, /* cost of moving MMX register */
447 {8, 8}, /* cost of loading MMX registers
448 in SImode and DImode */
449 {8, 8}, /* cost of storing MMX registers
450 in SImode and DImode */
451 2, /* cost of moving SSE register */
452 {4, 8, 16}, /* cost of loading SSE registers
453 in SImode, DImode and TImode */
454 {4, 8, 16}, /* cost of storing SSE registers
455 in SImode, DImode and TImode */
456 3, /* MMX or SSE register to integer */
457 8, /* size of l1 cache. */
458 8, /* size of l2 cache */
459 0, /* size of prefetch block */
460 0, /* number of parallel prefetches */
461 2, /* Branch cost */
462 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
463 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
464 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
465 COSTS_N_INSNS (1), /* cost of FABS instruction. */
466 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
467 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
468 pentium_memcpy,
469 pentium_memset,
470 1, /* scalar_stmt_cost. */
471 1, /* scalar load_cost. */
472 1, /* scalar_store_cost. */
473 1, /* vec_stmt_cost. */
474 1, /* vec_to_scalar_cost. */
475 1, /* scalar_to_vec_cost. */
476 1, /* vec_align_load_cost. */
477 2, /* vec_unalign_load_cost. */
478 1, /* vec_store_cost. */
479 3, /* cond_taken_branch_cost. */
480 1, /* cond_not_taken_branch_cost. */
481 };
483 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
484 (we ensure the alignment). For small blocks an inline loop is still a
485 noticeable win; for bigger blocks either rep movsl or rep movsb is the
486 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
487 but after 4K the difference is down in the noise. */
488 static stringop_algs pentiumpro_memcpy[2] = {
489 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
490 {8192, rep_prefix_4_byte, false},
491 {-1, rep_prefix_1_byte, false}}},
492 DUMMY_STRINGOP_ALGS};
493 static stringop_algs pentiumpro_memset[2] = {
494 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
495 {8192, rep_prefix_4_byte, false},
496 {-1, libcall, false}}},
497 DUMMY_STRINGOP_ALGS};
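/* Reading the tables above (a sketch): the leading rep_prefix_4_byte is the
   fallback when the block size is unknown; otherwise a 32-bit PentiumPro
   memcpy of at most 128 bytes uses a plain loop, up to 1024 bytes an
   unrolled loop, up to 8192 bytes rep movsl, and anything larger rep movsb.
   The second slot is DUMMY_STRINGOP_ALGS because PentiumPro has no 64-bit
   mode.  */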
498 static const
499 struct processor_costs pentiumpro_cost = {
500 COSTS_N_INSNS (1), /* cost of an add instruction */
501 COSTS_N_INSNS (1), /* cost of a lea instruction */
502 COSTS_N_INSNS (1), /* variable shift costs */
503 COSTS_N_INSNS (1), /* constant shift costs */
504 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
505 COSTS_N_INSNS (4), /* HI */
506 COSTS_N_INSNS (4), /* SI */
507 COSTS_N_INSNS (4), /* DI */
508 COSTS_N_INSNS (4)}, /* other */
509 0, /* cost of multiply per each bit set */
510 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
511 COSTS_N_INSNS (17), /* HI */
512 COSTS_N_INSNS (17), /* SI */
513 COSTS_N_INSNS (17), /* DI */
514 COSTS_N_INSNS (17)}, /* other */
515 COSTS_N_INSNS (1), /* cost of movsx */
516 COSTS_N_INSNS (1), /* cost of movzx */
517 8, /* "large" insn */
518 6, /* MOVE_RATIO */
519 2, /* cost for loading QImode using movzbl */
520 {4, 4, 4}, /* cost of loading integer registers
521 in QImode, HImode and SImode.
522 Relative to reg-reg move (2). */
523 {2, 2, 2}, /* cost of storing integer registers */
524 2, /* cost of reg,reg fld/fst */
525 {2, 2, 6}, /* cost of loading fp registers
526 in SFmode, DFmode and XFmode */
527 {4, 4, 6}, /* cost of storing fp registers
528 in SFmode, DFmode and XFmode */
529 2, /* cost of moving MMX register */
530 {2, 2}, /* cost of loading MMX registers
531 in SImode and DImode */
532 {2, 2}, /* cost of storing MMX registers
533 in SImode and DImode */
534 2, /* cost of moving SSE register */
535 {2, 2, 8}, /* cost of loading SSE registers
536 in SImode, DImode and TImode */
537 {2, 2, 8}, /* cost of storing SSE registers
538 in SImode, DImode and TImode */
539 3, /* MMX or SSE register to integer */
540 8, /* size of l1 cache. */
541 256, /* size of l2 cache */
542 32, /* size of prefetch block */
543 6, /* number of parallel prefetches */
544 2, /* Branch cost */
545 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
546 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
547 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
548 COSTS_N_INSNS (2), /* cost of FABS instruction. */
549 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
550 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
551 pentiumpro_memcpy,
552 pentiumpro_memset,
553 1, /* scalar_stmt_cost. */
554 1, /* scalar load_cost. */
555 1, /* scalar_store_cost. */
556 1, /* vec_stmt_cost. */
557 1, /* vec_to_scalar_cost. */
558 1, /* scalar_to_vec_cost. */
559 1, /* vec_align_load_cost. */
560 2, /* vec_unalign_load_cost. */
561 1, /* vec_store_cost. */
562 3, /* cond_taken_branch_cost. */
563 1, /* cond_not_taken_branch_cost. */
564 };
566 static stringop_algs geode_memcpy[2] = {
567 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
568 DUMMY_STRINGOP_ALGS};
569 static stringop_algs geode_memset[2] = {
570 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
571 DUMMY_STRINGOP_ALGS};
572 static const
573 struct processor_costs geode_cost = {
574 COSTS_N_INSNS (1), /* cost of an add instruction */
575 COSTS_N_INSNS (1), /* cost of a lea instruction */
576 COSTS_N_INSNS (2), /* variable shift costs */
577 COSTS_N_INSNS (1), /* constant shift costs */
578 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
579 COSTS_N_INSNS (4), /* HI */
580 COSTS_N_INSNS (7), /* SI */
581 COSTS_N_INSNS (7), /* DI */
582 COSTS_N_INSNS (7)}, /* other */
583 0, /* cost of multiply per each bit set */
584 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
585 COSTS_N_INSNS (23), /* HI */
586 COSTS_N_INSNS (39), /* SI */
587 COSTS_N_INSNS (39), /* DI */
588 COSTS_N_INSNS (39)}, /* other */
589 COSTS_N_INSNS (1), /* cost of movsx */
590 COSTS_N_INSNS (1), /* cost of movzx */
591 8, /* "large" insn */
592 4, /* MOVE_RATIO */
593 1, /* cost for loading QImode using movzbl */
594 {1, 1, 1}, /* cost of loading integer registers
595 in QImode, HImode and SImode.
596 Relative to reg-reg move (2). */
597 {1, 1, 1}, /* cost of storing integer registers */
598 1, /* cost of reg,reg fld/fst */
599 {1, 1, 1}, /* cost of loading fp registers
600 in SFmode, DFmode and XFmode */
601 {4, 6, 6}, /* cost of storing fp registers
602 in SFmode, DFmode and XFmode */
604 2, /* cost of moving MMX register */
605 {2, 2}, /* cost of loading MMX registers
606 in SImode and DImode */
607 {2, 2}, /* cost of storing MMX registers
608 in SImode and DImode */
609 2, /* cost of moving SSE register */
610 {2, 2, 8}, /* cost of loading SSE registers
611 in SImode, DImode and TImode */
612 {2, 2, 8}, /* cost of storing SSE registers
613 in SImode, DImode and TImode */
614 3, /* MMX or SSE register to integer */
615 64, /* size of l1 cache. */
616 128, /* size of l2 cache. */
617 32, /* size of prefetch block */
618 1, /* number of parallel prefetches */
619 1, /* Branch cost */
620 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
621 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
622 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
623 COSTS_N_INSNS (1), /* cost of FABS instruction. */
624 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
625 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
626 geode_memcpy,
627 geode_memset,
628 1, /* scalar_stmt_cost. */
629 1, /* scalar load_cost. */
630 1, /* scalar_store_cost. */
631 1, /* vec_stmt_cost. */
632 1, /* vec_to_scalar_cost. */
633 1, /* scalar_to_vec_cost. */
634 1, /* vec_align_load_cost. */
635 2, /* vec_unalign_load_cost. */
636 1, /* vec_store_cost. */
637 3, /* cond_taken_branch_cost. */
638 1, /* cond_not_taken_branch_cost. */
639 };
641 static stringop_algs k6_memcpy[2] = {
642 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
643 DUMMY_STRINGOP_ALGS};
644 static stringop_algs k6_memset[2] = {
645 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
646 DUMMY_STRINGOP_ALGS};
647 static const
648 struct processor_costs k6_cost = {
649 COSTS_N_INSNS (1), /* cost of an add instruction */
650 COSTS_N_INSNS (2), /* cost of a lea instruction */
651 COSTS_N_INSNS (1), /* variable shift costs */
652 COSTS_N_INSNS (1), /* constant shift costs */
653 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
654 COSTS_N_INSNS (3), /* HI */
655 COSTS_N_INSNS (3), /* SI */
656 COSTS_N_INSNS (3), /* DI */
657 COSTS_N_INSNS (3)}, /* other */
658 0, /* cost of multiply per each bit set */
659 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
660 COSTS_N_INSNS (18), /* HI */
661 COSTS_N_INSNS (18), /* SI */
662 COSTS_N_INSNS (18), /* DI */
663 COSTS_N_INSNS (18)}, /* other */
664 COSTS_N_INSNS (2), /* cost of movsx */
665 COSTS_N_INSNS (2), /* cost of movzx */
666 8, /* "large" insn */
667 4, /* MOVE_RATIO */
668 3, /* cost for loading QImode using movzbl */
669 {4, 5, 4}, /* cost of loading integer registers
670 in QImode, HImode and SImode.
671 Relative to reg-reg move (2). */
672 {2, 3, 2}, /* cost of storing integer registers */
673 4, /* cost of reg,reg fld/fst */
674 {6, 6, 6}, /* cost of loading fp registers
675 in SFmode, DFmode and XFmode */
676 {4, 4, 4}, /* cost of storing fp registers
677 in SFmode, DFmode and XFmode */
678 2, /* cost of moving MMX register */
679 {2, 2}, /* cost of loading MMX registers
680 in SImode and DImode */
681 {2, 2}, /* cost of storing MMX registers
682 in SImode and DImode */
683 2, /* cost of moving SSE register */
684 {2, 2, 8}, /* cost of loading SSE registers
685 in SImode, DImode and TImode */
686 {2, 2, 8}, /* cost of storing SSE registers
687 in SImode, DImode and TImode */
688 6, /* MMX or SSE register to integer */
689 32, /* size of l1 cache. */
690 32, /* size of l2 cache. Some models
691 have integrated l2 cache, but
692 optimizing for k6 is not important
693 enough to worry about that. */
694 32, /* size of prefetch block */
695 1, /* number of parallel prefetches */
696 1, /* Branch cost */
697 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
698 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
699 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
700 COSTS_N_INSNS (2), /* cost of FABS instruction. */
701 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
702 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
703 k6_memcpy,
704 k6_memset,
705 1, /* scalar_stmt_cost. */
706 1, /* scalar load_cost. */
707 1, /* scalar_store_cost. */
708 1, /* vec_stmt_cost. */
709 1, /* vec_to_scalar_cost. */
710 1, /* scalar_to_vec_cost. */
711 1, /* vec_align_load_cost. */
712 2, /* vec_unalign_load_cost. */
713 1, /* vec_store_cost. */
714 3, /* cond_taken_branch_cost. */
715 1, /* cond_not_taken_branch_cost. */
716 };
718 /* For some reason, Athlon deals better with the REP prefix (relative to
719 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
720 and 128 bytes for memset. */
721 static stringop_algs athlon_memcpy[2] = {
722 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
723 DUMMY_STRINGOP_ALGS};
724 static stringop_algs athlon_memset[2] = {
725 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
726 DUMMY_STRINGOP_ALGS};
727 static const
728 struct processor_costs athlon_cost = {
729 COSTS_N_INSNS (1), /* cost of an add instruction */
730 COSTS_N_INSNS (2), /* cost of a lea instruction */
731 COSTS_N_INSNS (1), /* variable shift costs */
732 COSTS_N_INSNS (1), /* constant shift costs */
733 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
734 COSTS_N_INSNS (5), /* HI */
735 COSTS_N_INSNS (5), /* SI */
736 COSTS_N_INSNS (5), /* DI */
737 COSTS_N_INSNS (5)}, /* other */
738 0, /* cost of multiply per each bit set */
739 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
740 COSTS_N_INSNS (26), /* HI */
741 COSTS_N_INSNS (42), /* SI */
742 COSTS_N_INSNS (74), /* DI */
743 COSTS_N_INSNS (74)}, /* other */
744 COSTS_N_INSNS (1), /* cost of movsx */
745 COSTS_N_INSNS (1), /* cost of movzx */
746 8, /* "large" insn */
747 9, /* MOVE_RATIO */
748 4, /* cost for loading QImode using movzbl */
749 {3, 4, 3}, /* cost of loading integer registers
750 in QImode, HImode and SImode.
751 Relative to reg-reg move (2). */
752 {3, 4, 3}, /* cost of storing integer registers */
753 4, /* cost of reg,reg fld/fst */
754 {4, 4, 12}, /* cost of loading fp registers
755 in SFmode, DFmode and XFmode */
756 {6, 6, 8}, /* cost of storing fp registers
757 in SFmode, DFmode and XFmode */
758 2, /* cost of moving MMX register */
759 {4, 4}, /* cost of loading MMX registers
760 in SImode and DImode */
761 {4, 4}, /* cost of storing MMX registers
762 in SImode and DImode */
763 2, /* cost of moving SSE register */
764 {4, 4, 6}, /* cost of loading SSE registers
765 in SImode, DImode and TImode */
766 {4, 4, 5}, /* cost of storing SSE registers
767 in SImode, DImode and TImode */
768 5, /* MMX or SSE register to integer */
769 64, /* size of l1 cache. */
770 256, /* size of l2 cache. */
771 64, /* size of prefetch block */
772 6, /* number of parallel prefetches */
773 5, /* Branch cost */
774 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
775 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
776 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
777 COSTS_N_INSNS (2), /* cost of FABS instruction. */
778 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
779 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
780 athlon_memcpy,
781 athlon_memset,
782 1, /* scalar_stmt_cost. */
783 1, /* scalar load_cost. */
784 1, /* scalar_store_cost. */
785 1, /* vec_stmt_cost. */
786 1, /* vec_to_scalar_cost. */
787 1, /* scalar_to_vec_cost. */
788 1, /* vec_align_load_cost. */
789 2, /* vec_unalign_load_cost. */
790 1, /* vec_store_cost. */
791 3, /* cond_taken_branch_cost. */
792 1, /* cond_not_taken_branch_cost. */
793 };
795 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
796 small blocks it is better to use a loop. For large blocks, a libcall can
797 do non-temporal accesses and beat the inline code considerably. */
798 static stringop_algs k8_memcpy[2] = {
799 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
800 {-1, rep_prefix_4_byte, false}}},
801 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
802 {-1, libcall, false}}}};
803 static stringop_algs k8_memset[2] = {
804 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
805 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
806 {libcall, {{48, unrolled_loop, false},
807 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
808 static const
809 struct processor_costs k8_cost = {
810 COSTS_N_INSNS (1), /* cost of an add instruction */
811 COSTS_N_INSNS (2), /* cost of a lea instruction */
812 COSTS_N_INSNS (1), /* variable shift costs */
813 COSTS_N_INSNS (1), /* constant shift costs */
814 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
815 COSTS_N_INSNS (4), /* HI */
816 COSTS_N_INSNS (3), /* SI */
817 COSTS_N_INSNS (4), /* DI */
818 COSTS_N_INSNS (5)}, /* other */
819 0, /* cost of multiply per each bit set */
820 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
821 COSTS_N_INSNS (26), /* HI */
822 COSTS_N_INSNS (42), /* SI */
823 COSTS_N_INSNS (74), /* DI */
824 COSTS_N_INSNS (74)}, /* other */
825 COSTS_N_INSNS (1), /* cost of movsx */
826 COSTS_N_INSNS (1), /* cost of movzx */
827 8, /* "large" insn */
828 9, /* MOVE_RATIO */
829 4, /* cost for loading QImode using movzbl */
830 {3, 4, 3}, /* cost of loading integer registers
831 in QImode, HImode and SImode.
832 Relative to reg-reg move (2). */
833 {3, 4, 3}, /* cost of storing integer registers */
834 4, /* cost of reg,reg fld/fst */
835 {4, 4, 12}, /* cost of loading fp registers
836 in SFmode, DFmode and XFmode */
837 {6, 6, 8}, /* cost of storing fp registers
838 in SFmode, DFmode and XFmode */
839 2, /* cost of moving MMX register */
840 {3, 3}, /* cost of loading MMX registers
841 in SImode and DImode */
842 {4, 4}, /* cost of storing MMX registers
843 in SImode and DImode */
844 2, /* cost of moving SSE register */
845 {4, 3, 6}, /* cost of loading SSE registers
846 in SImode, DImode and TImode */
847 {4, 4, 5}, /* cost of storing SSE registers
848 in SImode, DImode and TImode */
849 5, /* MMX or SSE register to integer */
850 64, /* size of l1 cache. */
851 512, /* size of l2 cache. */
852 64, /* size of prefetch block */
853 /* New AMD processors never drop prefetches; if they cannot be performed
854 immediately, they are queued. We set the number of simultaneous prefetches
855 to a large constant to reflect this (it is probably not a good idea to leave
856 the number of prefetches unlimited, as their execution also takes some
857 time). */
858 100, /* number of parallel prefetches */
859 3, /* Branch cost */
860 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
861 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
862 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
863 COSTS_N_INSNS (2), /* cost of FABS instruction. */
864 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
865 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
867 k8_memcpy,
868 k8_memset,
869 4, /* scalar_stmt_cost. */
870 2, /* scalar load_cost. */
871 2, /* scalar_store_cost. */
872 5, /* vec_stmt_cost. */
873 0, /* vec_to_scalar_cost. */
874 2, /* scalar_to_vec_cost. */
875 2, /* vec_align_load_cost. */
876 3, /* vec_unalign_load_cost. */
877 3, /* vec_store_cost. */
878 3, /* cond_taken_branch_cost. */
879 2, /* cond_not_taken_branch_cost. */
880 };
882 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
883 very small blocks it is better to use a loop. For large blocks, a libcall can
884 do non-temporal accesses and beat the inline code considerably. */
885 static stringop_algs amdfam10_memcpy[2] = {
886 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
887 {-1, rep_prefix_4_byte, false}}},
888 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
889 {-1, libcall, false}}}};
890 static stringop_algs amdfam10_memset[2] = {
891 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
892 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
893 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
894 {-1, libcall, false}}}};
895 struct processor_costs amdfam10_cost = {
896 COSTS_N_INSNS (1), /* cost of an add instruction */
897 COSTS_N_INSNS (2), /* cost of a lea instruction */
898 COSTS_N_INSNS (1), /* variable shift costs */
899 COSTS_N_INSNS (1), /* constant shift costs */
900 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
901 COSTS_N_INSNS (4), /* HI */
902 COSTS_N_INSNS (3), /* SI */
903 COSTS_N_INSNS (4), /* DI */
904 COSTS_N_INSNS (5)}, /* other */
905 0, /* cost of multiply per each bit set */
906 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
907 COSTS_N_INSNS (35), /* HI */
908 COSTS_N_INSNS (51), /* SI */
909 COSTS_N_INSNS (83), /* DI */
910 COSTS_N_INSNS (83)}, /* other */
911 COSTS_N_INSNS (1), /* cost of movsx */
912 COSTS_N_INSNS (1), /* cost of movzx */
913 8, /* "large" insn */
914 9, /* MOVE_RATIO */
915 4, /* cost for loading QImode using movzbl */
916 {3, 4, 3}, /* cost of loading integer registers
917 in QImode, HImode and SImode.
918 Relative to reg-reg move (2). */
919 {3, 4, 3}, /* cost of storing integer registers */
920 4, /* cost of reg,reg fld/fst */
921 {4, 4, 12}, /* cost of loading fp registers
922 in SFmode, DFmode and XFmode */
923 {6, 6, 8}, /* cost of storing fp registers
924 in SFmode, DFmode and XFmode */
925 2, /* cost of moving MMX register */
926 {3, 3}, /* cost of loading MMX registers
927 in SImode and DImode */
928 {4, 4}, /* cost of storing MMX registers
929 in SImode and DImode */
930 2, /* cost of moving SSE register */
931 {4, 4, 3}, /* cost of loading SSE registers
932 in SImode, DImode and TImode */
933 {4, 4, 5}, /* cost of storing SSE registers
934 in SImode, DImode and TImode */
935 3, /* MMX or SSE register to integer */
936 /* On K8:
937 MOVD reg64, xmmreg Double FSTORE 4
938 MOVD reg32, xmmreg Double FSTORE 4
939 On AMDFAM10:
940 MOVD reg64, xmmreg Double FADD 3
941 1/1 1/1
942 MOVD reg32, xmmreg Double FADD 3
943 1/1 1/1 */
944 64, /* size of l1 cache. */
945 512, /* size of l2 cache. */
946 64, /* size of prefetch block */
947 /* New AMD processors never drop prefetches; if they cannot be performed
948 immediately, they are queued. We set the number of simultaneous prefetches
949 to a large constant to reflect this (it is probably not a good idea to leave
950 the number of prefetches unlimited, as their execution also takes some
951 time). */
952 100, /* number of parallel prefetches */
953 2, /* Branch cost */
954 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
955 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
956 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
957 COSTS_N_INSNS (2), /* cost of FABS instruction. */
958 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
959 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
961 amdfam10_memcpy,
962 amdfam10_memset,
963 4, /* scalar_stmt_cost. */
964 2, /* scalar load_cost. */
965 2, /* scalar_store_cost. */
966 6, /* vec_stmt_cost. */
967 0, /* vec_to_scalar_cost. */
968 2, /* scalar_to_vec_cost. */
969 2, /* vec_align_load_cost. */
970 2, /* vec_unalign_load_cost. */
971 2, /* vec_store_cost. */
972 2, /* cond_taken_branch_cost. */
973 1, /* cond_not_taken_branch_cost. */
974 };
976 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
977 very small blocks it is better to use a loop. For large blocks, a libcall
978 can do non-temporal accesses and beat the inline code considerably. */
979 static stringop_algs bdver1_memcpy[2] = {
980 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
981 {-1, rep_prefix_4_byte, false}}},
982 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
983 {-1, libcall, false}}}};
984 static stringop_algs bdver1_memset[2] = {
985 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
986 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
987 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
988 {-1, libcall, false}}}};
990 const struct processor_costs bdver1_cost = {
991 COSTS_N_INSNS (1), /* cost of an add instruction */
992 COSTS_N_INSNS (1), /* cost of a lea instruction */
993 COSTS_N_INSNS (1), /* variable shift costs */
994 COSTS_N_INSNS (1), /* constant shift costs */
995 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
996 COSTS_N_INSNS (4), /* HI */
997 COSTS_N_INSNS (4), /* SI */
998 COSTS_N_INSNS (6), /* DI */
999 COSTS_N_INSNS (6)}, /* other */
1000 0, /* cost of multiply per each bit set */
1001 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1002 COSTS_N_INSNS (35), /* HI */
1003 COSTS_N_INSNS (51), /* SI */
1004 COSTS_N_INSNS (83), /* DI */
1005 COSTS_N_INSNS (83)}, /* other */
1006 COSTS_N_INSNS (1), /* cost of movsx */
1007 COSTS_N_INSNS (1), /* cost of movzx */
1008 8, /* "large" insn */
1009 9, /* MOVE_RATIO */
1010 4, /* cost for loading QImode using movzbl */
1011 {5, 5, 4}, /* cost of loading integer registers
1012 in QImode, HImode and SImode.
1013 Relative to reg-reg move (2). */
1014 {4, 4, 4}, /* cost of storing integer registers */
1015 2, /* cost of reg,reg fld/fst */
1016 {5, 5, 12}, /* cost of loading fp registers
1017 in SFmode, DFmode and XFmode */
1018 {4, 4, 8}, /* cost of storing fp registers
1019 in SFmode, DFmode and XFmode */
1020 2, /* cost of moving MMX register */
1021 {4, 4}, /* cost of loading MMX registers
1022 in SImode and DImode */
1023 {4, 4}, /* cost of storing MMX registers
1024 in SImode and DImode */
1025 2, /* cost of moving SSE register */
1026 {4, 4, 4}, /* cost of loading SSE registers
1027 in SImode, DImode and TImode */
1028 {4, 4, 4}, /* cost of storing SSE registers
1029 in SImode, DImode and TImode */
1030 2, /* MMX or SSE register to integer */
1031 /* On K8:
1032 MOVD reg64, xmmreg Double FSTORE 4
1033 MOVD reg32, xmmreg Double FSTORE 4
1034 On AMDFAM10:
1035 MOVD reg64, xmmreg Double FADD 3
1036 1/1 1/1
1037 MOVD reg32, xmmreg Double FADD 3
1038 1/1 1/1 */
1039 16, /* size of l1 cache. */
1040 2048, /* size of l2 cache. */
1041 64, /* size of prefetch block */
1042 /* New AMD processors never drop prefetches; if they cannot be performed
1043 immediately, they are queued. We set the number of simultaneous prefetches
1044 to a large constant to reflect this (it is probably not a good idea to leave
1045 the number of prefetches unlimited, as their execution also takes some
1046 time). */
1047 100, /* number of parallel prefetches */
1048 2, /* Branch cost */
1049 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1050 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1051 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1052 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1053 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1054 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1056 bdver1_memcpy,
1057 bdver1_memset,
1058 6, /* scalar_stmt_cost. */
1059 4, /* scalar load_cost. */
1060 4, /* scalar_store_cost. */
1061 6, /* vec_stmt_cost. */
1062 0, /* vec_to_scalar_cost. */
1063 2, /* scalar_to_vec_cost. */
1064 4, /* vec_align_load_cost. */
1065 4, /* vec_unalign_load_cost. */
1066 4, /* vec_store_cost. */
1067 4, /* cond_taken_branch_cost. */
1068 2, /* cond_not_taken_branch_cost. */
1069 };
1071 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1072 very small blocks it is better to use a loop. For large blocks, a libcall
1073 can do non-temporal accesses and beat the inline code considerably. */
1075 static stringop_algs bdver2_memcpy[2] = {
1076 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1077 {-1, rep_prefix_4_byte, false}}},
1078 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1079 {-1, libcall, false}}}};
1080 static stringop_algs bdver2_memset[2] = {
1081 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1082 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1083 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1084 {-1, libcall, false}}}};
1086 const struct processor_costs bdver2_cost = {
1087 COSTS_N_INSNS (1), /* cost of an add instruction */
1088 COSTS_N_INSNS (1), /* cost of a lea instruction */
1089 COSTS_N_INSNS (1), /* variable shift costs */
1090 COSTS_N_INSNS (1), /* constant shift costs */
1091 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1092 COSTS_N_INSNS (4), /* HI */
1093 COSTS_N_INSNS (4), /* SI */
1094 COSTS_N_INSNS (6), /* DI */
1095 COSTS_N_INSNS (6)}, /* other */
1096 0, /* cost of multiply per each bit set */
1097 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1098 COSTS_N_INSNS (35), /* HI */
1099 COSTS_N_INSNS (51), /* SI */
1100 COSTS_N_INSNS (83), /* DI */
1101 COSTS_N_INSNS (83)}, /* other */
1102 COSTS_N_INSNS (1), /* cost of movsx */
1103 COSTS_N_INSNS (1), /* cost of movzx */
1104 8, /* "large" insn */
1105 9, /* MOVE_RATIO */
1106 4, /* cost for loading QImode using movzbl */
1107 {5, 5, 4}, /* cost of loading integer registers
1108 in QImode, HImode and SImode.
1109 Relative to reg-reg move (2). */
1110 {4, 4, 4}, /* cost of storing integer registers */
1111 2, /* cost of reg,reg fld/fst */
1112 {5, 5, 12}, /* cost of loading fp registers
1113 in SFmode, DFmode and XFmode */
1114 {4, 4, 8}, /* cost of storing fp registers
1115 in SFmode, DFmode and XFmode */
1116 2, /* cost of moving MMX register */
1117 {4, 4}, /* cost of loading MMX registers
1118 in SImode and DImode */
1119 {4, 4}, /* cost of storing MMX registers
1120 in SImode and DImode */
1121 2, /* cost of moving SSE register */
1122 {4, 4, 4}, /* cost of loading SSE registers
1123 in SImode, DImode and TImode */
1124 {4, 4, 4}, /* cost of storing SSE registers
1125 in SImode, DImode and TImode */
1126 2, /* MMX or SSE register to integer */
1127 /* On K8:
1128 MOVD reg64, xmmreg Double FSTORE 4
1129 MOVD reg32, xmmreg Double FSTORE 4
1130 On AMDFAM10:
1131 MOVD reg64, xmmreg Double FADD 3
1132 1/1 1/1
1133 MOVD reg32, xmmreg Double FADD 3
1134 1/1 1/1 */
1135 16, /* size of l1 cache. */
1136 2048, /* size of l2 cache. */
1137 64, /* size of prefetch block */
1138 /* New AMD processors never drop prefetches; if they cannot be performed
1139 immediately, they are queued. We set the number of simultaneous prefetches
1140 to a large constant to reflect this (it is probably not a good idea to leave
1141 the number of prefetches unlimited, as their execution also takes some
1142 time). */
1143 100, /* number of parallel prefetches */
1144 2, /* Branch cost */
1145 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1146 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1147 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1148 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1149 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1150 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1152 bdver2_memcpy,
1153 bdver2_memset,
1154 6, /* scalar_stmt_cost. */
1155 4, /* scalar load_cost. */
1156 4, /* scalar_store_cost. */
1157 6, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 4, /* vec_align_load_cost. */
1161 4, /* vec_unalign_load_cost. */
1162 4, /* vec_store_cost. */
1163 4, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1168 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1169 very small blocks it is better to use a loop. For large blocks, a libcall
1170 can do non-temporal accesses and beat the inline code considerably. */
1171 static stringop_algs bdver3_memcpy[2] = {
1172 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1173 {-1, rep_prefix_4_byte, false}}},
1174 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1175 {-1, libcall, false}}}};
1176 static stringop_algs bdver3_memset[2] = {
1177 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1178 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1179 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1180 {-1, libcall, false}}}};
1181 struct processor_costs bdver3_cost = {
1182 COSTS_N_INSNS (1), /* cost of an add instruction */
1183 COSTS_N_INSNS (1), /* cost of a lea instruction */
1184 COSTS_N_INSNS (1), /* variable shift costs */
1185 COSTS_N_INSNS (1), /* constant shift costs */
1186 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1187 COSTS_N_INSNS (4), /* HI */
1188 COSTS_N_INSNS (4), /* SI */
1189 COSTS_N_INSNS (6), /* DI */
1190 COSTS_N_INSNS (6)}, /* other */
1191 0, /* cost of multiply per each bit set */
1192 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1193 COSTS_N_INSNS (35), /* HI */
1194 COSTS_N_INSNS (51), /* SI */
1195 COSTS_N_INSNS (83), /* DI */
1196 COSTS_N_INSNS (83)}, /* other */
1197 COSTS_N_INSNS (1), /* cost of movsx */
1198 COSTS_N_INSNS (1), /* cost of movzx */
1199 8, /* "large" insn */
1200 9, /* MOVE_RATIO */
1201 4, /* cost for loading QImode using movzbl */
1202 {5, 5, 4}, /* cost of loading integer registers
1203 in QImode, HImode and SImode.
1204 Relative to reg-reg move (2). */
1205 {4, 4, 4}, /* cost of storing integer registers */
1206 2, /* cost of reg,reg fld/fst */
1207 {5, 5, 12}, /* cost of loading fp registers
1208 in SFmode, DFmode and XFmode */
1209 {4, 4, 8}, /* cost of storing fp registers
1210 in SFmode, DFmode and XFmode */
1211 2, /* cost of moving MMX register */
1212 {4, 4}, /* cost of loading MMX registers
1213 in SImode and DImode */
1214 {4, 4}, /* cost of storing MMX registers
1215 in SImode and DImode */
1216 2, /* cost of moving SSE register */
1217 {4, 4, 4}, /* cost of loading SSE registers
1218 in SImode, DImode and TImode */
1219 {4, 4, 4}, /* cost of storing SSE registers
1220 in SImode, DImode and TImode */
1221 2, /* MMX or SSE register to integer */
1222 16, /* size of l1 cache. */
1223 2048, /* size of l2 cache. */
1224 64, /* size of prefetch block */
1225 /* New AMD processors never drop prefetches; if they cannot be performed
1226 immediately, they are queued. We set the number of simultaneous prefetches
1227 to a large constant to reflect this (it is probably not a good idea to leave
1228 the number of prefetches unlimited, as their execution also takes some
1229 time). */
1230 100, /* number of parallel prefetches */
1231 2, /* Branch cost */
1232 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1233 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1234 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1235 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1236 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1237 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1239 bdver3_memcpy,
1240 bdver3_memset,
1241 6, /* scalar_stmt_cost. */
1242 4, /* scalar load_cost. */
1243 4, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 4, /* vec_align_load_cost. */
1248 4, /* vec_unalign_load_cost. */
1249 4, /* vec_store_cost. */
1250 4, /* cond_taken_branch_cost. */
1251 2, /* cond_not_taken_branch_cost. */
1252 };
1254 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1255 very small blocks it is better to use a loop. For large blocks, a libcall
1256 can do non-temporal accesses and beat the inline code considerably. */
1257 static stringop_algs bdver4_memcpy[2] = {
1258 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1259 {-1, rep_prefix_4_byte, false}}},
1260 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1261 {-1, libcall, false}}}};
1262 static stringop_algs bdver4_memset[2] = {
1263 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1264 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1265 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1266 {-1, libcall, false}}}};
1267 struct processor_costs bdver4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (1), /* cost of a lea instruction */
1270 COSTS_N_INSNS (1), /* variable shift costs */
1271 COSTS_N_INSNS (1), /* constant shift costs */
1272 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (4), /* HI */
1274 COSTS_N_INSNS (4), /* SI */
1275 COSTS_N_INSNS (6), /* DI */
1276 COSTS_N_INSNS (6)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (35), /* HI */
1280 COSTS_N_INSNS (51), /* SI */
1281 COSTS_N_INSNS (83), /* DI */
1282 COSTS_N_INSNS (83)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 8, /* "large" insn */
1286 9, /* MOVE_RATIO */
1287 4, /* cost for loading QImode using movzbl */
1288 {5, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {4, 4, 4}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {5, 5, 12}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 8}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {4, 4}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {4, 4}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 2, /* cost of moving SSE register */
1303 {4, 4, 4}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {4, 4, 4}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 2, /* MMX or SSE register to integer */
1308 16, /* size of l1 cache. */
1309 2048, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 /* New AMD processors never drop prefetches; if they cannot be performed
1312 immediately, they are queued. We set the number of simultaneous prefetches
1313 to a large constant to reflect this (it is probably not a good idea to leave
1314 the number of prefetches unlimited, as their execution also takes some
1315 time). */
1316 100, /* number of parallel prefetches */
1317 2, /* Branch cost */
1318 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1319 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1320 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1321 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1322 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1323 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1325 bdver4_memcpy,
1326 bdver4_memset,
1327 6, /* scalar_stmt_cost. */
1328 4, /* scalar load_cost. */
1329 4, /* scalar_store_cost. */
1330 6, /* vec_stmt_cost. */
1331 0, /* vec_to_scalar_cost. */
1332 2, /* scalar_to_vec_cost. */
1333 4, /* vec_align_load_cost. */
1334 4, /* vec_unalign_load_cost. */
1335 4, /* vec_store_cost. */
1336 4, /* cond_taken_branch_cost. */
1337 2, /* cond_not_taken_branch_cost. */
1338 };
1341 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1342 very small blocks it is better to use a loop. For large blocks, a libcall
1343 can do non-temporal accesses and beat the inline code considerably. */
1344 static stringop_algs znver1_memcpy[2] = {
1345 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1346 {-1, rep_prefix_4_byte, false}}},
1347 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1348 {-1, libcall, false}}}};
1349 static stringop_algs znver1_memset[2] = {
1350 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1351 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1352 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1353 {-1, libcall, false}}}};
1354 struct processor_costs znver1_cost = {
1355 COSTS_N_INSNS (1), /* cost of an add instruction. */
1356 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1357 COSTS_N_INSNS (1), /* variable shift costs. */
1358 COSTS_N_INSNS (1), /* constant shift costs. */
1359 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1360 COSTS_N_INSNS (3), /* HI. */
1361 COSTS_N_INSNS (3), /* SI. */
1362 COSTS_N_INSNS (4), /* DI. */
1363 COSTS_N_INSNS (4)}, /* other. */
1364 0, /* cost of multiply per each bit
1365 set. */
1366 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1367 COSTS_N_INSNS (35), /* HI. */
1368 COSTS_N_INSNS (51), /* SI. */
1369 COSTS_N_INSNS (83), /* DI. */
1370 COSTS_N_INSNS (83)}, /* other. */
1371 COSTS_N_INSNS (1), /* cost of movsx. */
1372 COSTS_N_INSNS (1), /* cost of movzx. */
1373 8, /* "large" insn. */
1374 9, /* MOVE_RATIO. */
1375 4, /* cost for loading QImode using
1376 movzbl. */
1377 {5, 5, 4}, /* cost of loading integer registers
1378 in QImode, HImode and SImode.
1379 Relative to reg-reg move (2). */
1380 {4, 4, 4}, /* cost of storing integer
1381 registers. */
1382 2, /* cost of reg,reg fld/fst. */
1383 {5, 5, 12}, /* cost of loading fp registers
1384 in SFmode, DFmode and XFmode. */
1385 {4, 4, 8}, /* cost of storing fp registers
1386 in SFmode, DFmode and XFmode. */
1387 2, /* cost of moving MMX register. */
1388 {4, 4}, /* cost of loading MMX registers
1389 in SImode and DImode. */
1390 {4, 4}, /* cost of storing MMX registers
1391 in SImode and DImode. */
1392 2, /* cost of moving SSE register. */
1393 {4, 4, 4}, /* cost of loading SSE registers
1394 in SImode, DImode and TImode. */
1395 {4, 4, 4}, /* cost of storing SSE registers
1396 in SImode, DImode and TImode. */
1397 2, /* MMX or SSE register to integer. */
1398 32, /* size of l1 cache. */
1399 512, /* size of l2 cache. */
1400 64, /* size of prefetch block. */
1401 /* New AMD processors never drop prefetches; if they cannot be performed
1402 immediately, they are queued. We set the number of simultaneous prefetches
1403 to a large constant to reflect this (it is probably not a good idea to leave
1404 the number of prefetches unlimited, as their execution also takes some
1405 time). */
1406 100, /* number of parallel prefetches. */
1407 2, /* Branch cost. */
1408 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1409 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1410 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1411 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1412 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1413 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1415 znver1_memcpy,
1416 znver1_memset,
1417 6, /* scalar_stmt_cost. */
1418 4, /* scalar load_cost. */
1419 4, /* scalar_store_cost. */
1420 6, /* vec_stmt_cost. */
1421 0, /* vec_to_scalar_cost. */
1422 2, /* scalar_to_vec_cost. */
1423 4, /* vec_align_load_cost. */
1424 4, /* vec_unalign_load_cost. */
1425 4, /* vec_store_cost. */
1426 4, /* cond_taken_branch_cost. */
1427 2, /* cond_not_taken_branch_cost. */
1430 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1431 very small blocks it is better to use a loop. For large blocks, libcall can
1432 do nontemporal accesses and beat inline considerably. */
1433 static stringop_algs btver1_memcpy[2] = {
1434 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1435 {-1, rep_prefix_4_byte, false}}},
1436 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1437 {-1, libcall, false}}}};
1438 static stringop_algs btver1_memset[2] = {
1439 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1440 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1441 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1442 {-1, libcall, false}}}};
1443 const struct processor_costs btver1_cost = {
1444 COSTS_N_INSNS (1), /* cost of an add instruction */
1445 COSTS_N_INSNS (2), /* cost of a lea instruction */
1446 COSTS_N_INSNS (1), /* variable shift costs */
1447 COSTS_N_INSNS (1), /* constant shift costs */
1448 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1449 COSTS_N_INSNS (4), /* HI */
1450 COSTS_N_INSNS (3), /* SI */
1451 COSTS_N_INSNS (4), /* DI */
1452 COSTS_N_INSNS (5)}, /* other */
1453 0, /* cost of multiply per each bit set */
1454 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1455 COSTS_N_INSNS (35), /* HI */
1456 COSTS_N_INSNS (51), /* SI */
1457 COSTS_N_INSNS (83), /* DI */
1458 COSTS_N_INSNS (83)}, /* other */
1459 COSTS_N_INSNS (1), /* cost of movsx */
1460 COSTS_N_INSNS (1), /* cost of movzx */
1461 8, /* "large" insn */
1462 9, /* MOVE_RATIO */
1463 4, /* cost for loading QImode using movzbl */
1464 {3, 4, 3}, /* cost of loading integer registers
1465 in QImode, HImode and SImode.
1466 Relative to reg-reg move (2). */
1467 {3, 4, 3}, /* cost of storing integer registers */
1468 4, /* cost of reg,reg fld/fst */
1469 {4, 4, 12}, /* cost of loading fp registers
1470 in SFmode, DFmode and XFmode */
1471 {6, 6, 8}, /* cost of storing fp registers
1472 in SFmode, DFmode and XFmode */
1473 2, /* cost of moving MMX register */
1474 {3, 3}, /* cost of loading MMX registers
1475 in SImode and DImode */
1476 {4, 4}, /* cost of storing MMX registers
1477 in SImode and DImode */
1478 2, /* cost of moving SSE register */
1479 {4, 4, 3}, /* cost of loading SSE registers
1480 in SImode, DImode and TImode */
1481 {4, 4, 5}, /* cost of storing SSE registers
1482 in SImode, DImode and TImode */
1483 3, /* MMX or SSE register to integer */
1484 /* On K8:
1485 MOVD reg64, xmmreg Double FSTORE 4
1486 MOVD reg32, xmmreg Double FSTORE 4
1487 On AMDFAM10:
1488 MOVD reg64, xmmreg Double FADD 3
1489 1/1 1/1
1490 MOVD reg32, xmmreg Double FADD 3
1491 1/1 1/1 */
1492 32, /* size of l1 cache. */
1493 512, /* size of l2 cache. */
1494 64, /* size of prefetch block */
1495 100, /* number of parallel prefetches */
1496 2, /* Branch cost */
1497 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1498 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1499 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1500 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1501 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1502 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1504 btver1_memcpy,
1505 btver1_memset,
1506 4, /* scalar_stmt_cost. */
1507 2, /* scalar load_cost. */
1508 2, /* scalar_store_cost. */
1509 6, /* vec_stmt_cost. */
1510 0, /* vec_to_scalar_cost. */
1511 2, /* scalar_to_vec_cost. */
1512 2, /* vec_align_load_cost. */
1513 2, /* vec_unalign_load_cost. */
1514 2, /* vec_store_cost. */
1515 2, /* cond_taken_branch_cost. */
1516 1, /* cond_not_taken_branch_cost. */
1519 static stringop_algs btver2_memcpy[2] = {
1520 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1521 {-1, rep_prefix_4_byte, false}}},
1522 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1523 {-1, libcall, false}}}};
1524 static stringop_algs btver2_memset[2] = {
1525 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1526 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1527 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1528 {-1, libcall, false}}}};
1529 const struct processor_costs btver2_cost = {
1530 COSTS_N_INSNS (1), /* cost of an add instruction */
1531 COSTS_N_INSNS (2), /* cost of a lea instruction */
1532 COSTS_N_INSNS (1), /* variable shift costs */
1533 COSTS_N_INSNS (1), /* constant shift costs */
1534 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1535 COSTS_N_INSNS (4), /* HI */
1536 COSTS_N_INSNS (3), /* SI */
1537 COSTS_N_INSNS (4), /* DI */
1538 COSTS_N_INSNS (5)}, /* other */
1539 0, /* cost of multiply per each bit set */
1540 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1541 COSTS_N_INSNS (35), /* HI */
1542 COSTS_N_INSNS (51), /* SI */
1543 COSTS_N_INSNS (83), /* DI */
1544 COSTS_N_INSNS (83)}, /* other */
1545 COSTS_N_INSNS (1), /* cost of movsx */
1546 COSTS_N_INSNS (1), /* cost of movzx */
1547 8, /* "large" insn */
1548 9, /* MOVE_RATIO */
1549 4, /* cost for loading QImode using movzbl */
1550 {3, 4, 3}, /* cost of loading integer registers
1551 in QImode, HImode and SImode.
1552 Relative to reg-reg move (2). */
1553 {3, 4, 3}, /* cost of storing integer registers */
1554 4, /* cost of reg,reg fld/fst */
1555 {4, 4, 12}, /* cost of loading fp registers
1556 in SFmode, DFmode and XFmode */
1557 {6, 6, 8}, /* cost of storing fp registers
1558 in SFmode, DFmode and XFmode */
1559 2, /* cost of moving MMX register */
1560 {3, 3}, /* cost of loading MMX registers
1561 in SImode and DImode */
1562 {4, 4}, /* cost of storing MMX registers
1563 in SImode and DImode */
1564 2, /* cost of moving SSE register */
1565 {4, 4, 3}, /* cost of loading SSE registers
1566 in SImode, DImode and TImode */
1567 {4, 4, 5}, /* cost of storing SSE registers
1568 in SImode, DImode and TImode */
1569 3, /* MMX or SSE register to integer */
1570 /* On K8:
1571 MOVD reg64, xmmreg Double FSTORE 4
1572 MOVD reg32, xmmreg Double FSTORE 4
1573 On AMDFAM10:
1574 MOVD reg64, xmmreg Double FADD 3
1575 1/1 1/1
1576 MOVD reg32, xmmreg Double FADD 3
1577 1/1 1/1 */
1578 32, /* size of l1 cache. */
1579 2048, /* size of l2 cache. */
1580 64, /* size of prefetch block */
1581 100, /* number of parallel prefetches */
1582 2, /* Branch cost */
1583 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1584 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1585 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1586 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1587 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1588 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1589 btver2_memcpy,
1590 btver2_memset,
1591 4, /* scalar_stmt_cost. */
1592 2, /* scalar load_cost. */
1593 2, /* scalar_store_cost. */
1594 6, /* vec_stmt_cost. */
1595 0, /* vec_to_scalar_cost. */
1596 2, /* scalar_to_vec_cost. */
1597 2, /* vec_align_load_cost. */
1598 2, /* vec_unalign_load_cost. */
1599 2, /* vec_store_cost. */
1600 2, /* cond_taken_branch_cost. */
1601 1, /* cond_not_taken_branch_cost. */
1604 static stringop_algs pentium4_memcpy[2] = {
1605 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1606 DUMMY_STRINGOP_ALGS};
1607 static stringop_algs pentium4_memset[2] = {
1608 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1609 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1610 DUMMY_STRINGOP_ALGS};
1612 static const
1613 struct processor_costs pentium4_cost = {
1614 COSTS_N_INSNS (1), /* cost of an add instruction */
1615 COSTS_N_INSNS (3), /* cost of a lea instruction */
1616 COSTS_N_INSNS (4), /* variable shift costs */
1617 COSTS_N_INSNS (4), /* constant shift costs */
1618 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1619 COSTS_N_INSNS (15), /* HI */
1620 COSTS_N_INSNS (15), /* SI */
1621 COSTS_N_INSNS (15), /* DI */
1622 COSTS_N_INSNS (15)}, /* other */
1623 0, /* cost of multiply per each bit set */
1624 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1625 COSTS_N_INSNS (56), /* HI */
1626 COSTS_N_INSNS (56), /* SI */
1627 COSTS_N_INSNS (56), /* DI */
1628 COSTS_N_INSNS (56)}, /* other */
1629 COSTS_N_INSNS (1), /* cost of movsx */
1630 COSTS_N_INSNS (1), /* cost of movzx */
1631 16, /* "large" insn */
1632 6, /* MOVE_RATIO */
1633 2, /* cost for loading QImode using movzbl */
1634 {4, 5, 4}, /* cost of loading integer registers
1635 in QImode, HImode and SImode.
1636 Relative to reg-reg move (2). */
1637 {2, 3, 2}, /* cost of storing integer registers */
1638 2, /* cost of reg,reg fld/fst */
1639 {2, 2, 6}, /* cost of loading fp registers
1640 in SFmode, DFmode and XFmode */
1641 {4, 4, 6}, /* cost of storing fp registers
1642 in SFmode, DFmode and XFmode */
1643 2, /* cost of moving MMX register */
1644 {2, 2}, /* cost of loading MMX registers
1645 in SImode and DImode */
1646 {2, 2}, /* cost of storing MMX registers
1647 in SImode and DImode */
1648 12, /* cost of moving SSE register */
1649 {12, 12, 12}, /* cost of loading SSE registers
1650 in SImode, DImode and TImode */
1651 {2, 2, 8}, /* cost of storing SSE registers
1652 in SImode, DImode and TImode */
1653 10, /* MMX or SSE register to integer */
1654 8, /* size of l1 cache. */
1655 256, /* size of l2 cache. */
1656 64, /* size of prefetch block */
1657 6, /* number of parallel prefetches */
1658 2, /* Branch cost */
1659 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1660 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1661 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1662 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1663 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1664 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1665 pentium4_memcpy,
1666 pentium4_memset,
1667 1, /* scalar_stmt_cost. */
1668 1, /* scalar load_cost. */
1669 1, /* scalar_store_cost. */
1670 1, /* vec_stmt_cost. */
1671 1, /* vec_to_scalar_cost. */
1672 1, /* scalar_to_vec_cost. */
1673 1, /* vec_align_load_cost. */
1674 2, /* vec_unalign_load_cost. */
1675 1, /* vec_store_cost. */
1676 3, /* cond_taken_branch_cost. */
1677 1, /* cond_not_taken_branch_cost. */
1680 static stringop_algs nocona_memcpy[2] = {
1681 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1682 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1683 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1685 static stringop_algs nocona_memset[2] = {
1686 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1687 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1688 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1689 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1691 static const
1692 struct processor_costs nocona_cost = {
1693 COSTS_N_INSNS (1), /* cost of an add instruction */
1694 COSTS_N_INSNS (1), /* cost of a lea instruction */
1695 COSTS_N_INSNS (1), /* variable shift costs */
1696 COSTS_N_INSNS (1), /* constant shift costs */
1697 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1698 COSTS_N_INSNS (10), /* HI */
1699 COSTS_N_INSNS (10), /* SI */
1700 COSTS_N_INSNS (10), /* DI */
1701 COSTS_N_INSNS (10)}, /* other */
1702 0, /* cost of multiply per each bit set */
1703 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1704 COSTS_N_INSNS (66), /* HI */
1705 COSTS_N_INSNS (66), /* SI */
1706 COSTS_N_INSNS (66), /* DI */
1707 COSTS_N_INSNS (66)}, /* other */
1708 COSTS_N_INSNS (1), /* cost of movsx */
1709 COSTS_N_INSNS (1), /* cost of movzx */
1710 16, /* "large" insn */
1711 17, /* MOVE_RATIO */
1712 4, /* cost for loading QImode using movzbl */
1713 {4, 4, 4}, /* cost of loading integer registers
1714 in QImode, HImode and SImode.
1715 Relative to reg-reg move (2). */
1716 {4, 4, 4}, /* cost of storing integer registers */
1717 3, /* cost of reg,reg fld/fst */
1718 {12, 12, 12}, /* cost of loading fp registers
1719 in SFmode, DFmode and XFmode */
1720 {4, 4, 4}, /* cost of storing fp registers
1721 in SFmode, DFmode and XFmode */
1722 6, /* cost of moving MMX register */
1723 {12, 12}, /* cost of loading MMX registers
1724 in SImode and DImode */
1725 {12, 12}, /* cost of storing MMX registers
1726 in SImode and DImode */
1727 6, /* cost of moving SSE register */
1728 {12, 12, 12}, /* cost of loading SSE registers
1729 in SImode, DImode and TImode */
1730 {12, 12, 12}, /* cost of storing SSE registers
1731 in SImode, DImode and TImode */
1732 8, /* MMX or SSE register to integer */
1733 8, /* size of l1 cache. */
1734 1024, /* size of l2 cache. */
1735 64, /* size of prefetch block */
1736 8, /* number of parallel prefetches */
1737 1, /* Branch cost */
1738 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1739 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1740 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1741 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1742 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1743 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1744 nocona_memcpy,
1745 nocona_memset,
1746 1, /* scalar_stmt_cost. */
1747 1, /* scalar load_cost. */
1748 1, /* scalar_store_cost. */
1749 1, /* vec_stmt_cost. */
1750 1, /* vec_to_scalar_cost. */
1751 1, /* scalar_to_vec_cost. */
1752 1, /* vec_align_load_cost. */
1753 2, /* vec_unalign_load_cost. */
1754 1, /* vec_store_cost. */
1755 3, /* cond_taken_branch_cost. */
1756 1, /* cond_not_taken_branch_cost. */
1759 static stringop_algs atom_memcpy[2] = {
1760 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1761 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1762 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1763 static stringop_algs atom_memset[2] = {
1764 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1765 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1766 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1767 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1768 static const
1769 struct processor_costs atom_cost = {
1770 COSTS_N_INSNS (1), /* cost of an add instruction */
1771 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1772 COSTS_N_INSNS (1), /* variable shift costs */
1773 COSTS_N_INSNS (1), /* constant shift costs */
1774 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1775 COSTS_N_INSNS (4), /* HI */
1776 COSTS_N_INSNS (3), /* SI */
1777 COSTS_N_INSNS (4), /* DI */
1778 COSTS_N_INSNS (2)}, /* other */
1779 0, /* cost of multiply per each bit set */
1780 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1781 COSTS_N_INSNS (26), /* HI */
1782 COSTS_N_INSNS (42), /* SI */
1783 COSTS_N_INSNS (74), /* DI */
1784 COSTS_N_INSNS (74)}, /* other */
1785 COSTS_N_INSNS (1), /* cost of movsx */
1786 COSTS_N_INSNS (1), /* cost of movzx */
1787 8, /* "large" insn */
1788 17, /* MOVE_RATIO */
1789 4, /* cost for loading QImode using movzbl */
1790 {4, 4, 4}, /* cost of loading integer registers
1791 in QImode, HImode and SImode.
1792 Relative to reg-reg move (2). */
1793 {4, 4, 4}, /* cost of storing integer registers */
1794 4, /* cost of reg,reg fld/fst */
1795 {12, 12, 12}, /* cost of loading fp registers
1796 in SFmode, DFmode and XFmode */
1797 {6, 6, 8}, /* cost of storing fp registers
1798 in SFmode, DFmode and XFmode */
1799 2, /* cost of moving MMX register */
1800 {8, 8}, /* cost of loading MMX registers
1801 in SImode and DImode */
1802 {8, 8}, /* cost of storing MMX registers
1803 in SImode and DImode */
1804 2, /* cost of moving SSE register */
1805 {8, 8, 8}, /* cost of loading SSE registers
1806 in SImode, DImode and TImode */
1807 {8, 8, 8}, /* cost of storing SSE registers
1808 in SImode, DImode and TImode */
1809 5, /* MMX or SSE register to integer */
1810 32, /* size of l1 cache. */
1811 256, /* size of l2 cache. */
1812 64, /* size of prefetch block */
1813 6, /* number of parallel prefetches */
1814 3, /* Branch cost */
1815 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1816 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1817 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1818 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1819 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1820 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1821 atom_memcpy,
1822 atom_memset,
1823 1, /* scalar_stmt_cost. */
1824 1, /* scalar load_cost. */
1825 1, /* scalar_store_cost. */
1826 1, /* vec_stmt_cost. */
1827 1, /* vec_to_scalar_cost. */
1828 1, /* scalar_to_vec_cost. */
1829 1, /* vec_align_load_cost. */
1830 2, /* vec_unalign_load_cost. */
1831 1, /* vec_store_cost. */
1832 3, /* cond_taken_branch_cost. */
1833 1, /* cond_not_taken_branch_cost. */
1836 static stringop_algs slm_memcpy[2] = {
1837 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1838 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1839 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1840 static stringop_algs slm_memset[2] = {
1841 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1842 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1843 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1844 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1845 static const
1846 struct processor_costs slm_cost = {
1847 COSTS_N_INSNS (1), /* cost of an add instruction */
1848 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1849 COSTS_N_INSNS (1), /* variable shift costs */
1850 COSTS_N_INSNS (1), /* constant shift costs */
1851 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1852 COSTS_N_INSNS (3), /* HI */
1853 COSTS_N_INSNS (3), /* SI */
1854 COSTS_N_INSNS (4), /* DI */
1855 COSTS_N_INSNS (2)}, /* other */
1856 0, /* cost of multiply per each bit set */
1857 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1858 COSTS_N_INSNS (26), /* HI */
1859 COSTS_N_INSNS (42), /* SI */
1860 COSTS_N_INSNS (74), /* DI */
1861 COSTS_N_INSNS (74)}, /* other */
1862 COSTS_N_INSNS (1), /* cost of movsx */
1863 COSTS_N_INSNS (1), /* cost of movzx */
1864 8, /* "large" insn */
1865 17, /* MOVE_RATIO */
1866 4, /* cost for loading QImode using movzbl */
1867 {4, 4, 4}, /* cost of loading integer registers
1868 in QImode, HImode and SImode.
1869 Relative to reg-reg move (2). */
1870 {4, 4, 4}, /* cost of storing integer registers */
1871 4, /* cost of reg,reg fld/fst */
1872 {12, 12, 12}, /* cost of loading fp registers
1873 in SFmode, DFmode and XFmode */
1874 {6, 6, 8}, /* cost of storing fp registers
1875 in SFmode, DFmode and XFmode */
1876 2, /* cost of moving MMX register */
1877 {8, 8}, /* cost of loading MMX registers
1878 in SImode and DImode */
1879 {8, 8}, /* cost of storing MMX registers
1880 in SImode and DImode */
1881 2, /* cost of moving SSE register */
1882 {8, 8, 8}, /* cost of loading SSE registers
1883 in SImode, DImode and TImode */
1884 {8, 8, 8}, /* cost of storing SSE registers
1885 in SImode, DImode and TImode */
1886 5, /* MMX or SSE register to integer */
1887 32, /* size of l1 cache. */
1888 256, /* size of l2 cache. */
1889 64, /* size of prefetch block */
1890 6, /* number of parallel prefetches */
1891 3, /* Branch cost */
1892 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1893 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1894 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1895 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1896 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1897 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1898 slm_memcpy,
1899 slm_memset,
1900 1, /* scalar_stmt_cost. */
1901 1, /* scalar load_cost. */
1902 1, /* scalar_store_cost. */
1903 1, /* vec_stmt_cost. */
1904 4, /* vec_to_scalar_cost. */
1905 1, /* scalar_to_vec_cost. */
1906 1, /* vec_align_load_cost. */
1907 2, /* vec_unalign_load_cost. */
1908 1, /* vec_store_cost. */
1909 3, /* cond_taken_branch_cost. */
1910 1, /* cond_not_taken_branch_cost. */
1913 static stringop_algs intel_memcpy[2] = {
1914 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1915 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1916 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1917 static stringop_algs intel_memset[2] = {
1918 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1919 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1920 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1921 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1922 static const
1923 struct processor_costs intel_cost = {
1924 COSTS_N_INSNS (1), /* cost of an add instruction */
1925 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1926 COSTS_N_INSNS (1), /* variable shift costs */
1927 COSTS_N_INSNS (1), /* constant shift costs */
1928 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1929 COSTS_N_INSNS (3), /* HI */
1930 COSTS_N_INSNS (3), /* SI */
1931 COSTS_N_INSNS (4), /* DI */
1932 COSTS_N_INSNS (2)}, /* other */
1933 0, /* cost of multiply per each bit set */
1934 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1935 COSTS_N_INSNS (26), /* HI */
1936 COSTS_N_INSNS (42), /* SI */
1937 COSTS_N_INSNS (74), /* DI */
1938 COSTS_N_INSNS (74)}, /* other */
1939 COSTS_N_INSNS (1), /* cost of movsx */
1940 COSTS_N_INSNS (1), /* cost of movzx */
1941 8, /* "large" insn */
1942 17, /* MOVE_RATIO */
1943 4, /* cost for loading QImode using movzbl */
1944 {4, 4, 4}, /* cost of loading integer registers
1945 in QImode, HImode and SImode.
1946 Relative to reg-reg move (2). */
1947 {4, 4, 4}, /* cost of storing integer registers */
1948 4, /* cost of reg,reg fld/fst */
1949 {12, 12, 12}, /* cost of loading fp registers
1950 in SFmode, DFmode and XFmode */
1951 {6, 6, 8}, /* cost of storing fp registers
1952 in SFmode, DFmode and XFmode */
1953 2, /* cost of moving MMX register */
1954 {8, 8}, /* cost of loading MMX registers
1955 in SImode and DImode */
1956 {8, 8}, /* cost of storing MMX registers
1957 in SImode and DImode */
1958 2, /* cost of moving SSE register */
1959 {8, 8, 8}, /* cost of loading SSE registers
1960 in SImode, DImode and TImode */
1961 {8, 8, 8}, /* cost of storing SSE registers
1962 in SImode, DImode and TImode */
1963 5, /* MMX or SSE register to integer */
1964 32, /* size of l1 cache. */
1965 256, /* size of l2 cache. */
1966 64, /* size of prefetch block */
1967 6, /* number of parallel prefetches */
1968 3, /* Branch cost */
1969 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1970 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1971 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1972 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1973 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1974 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1975 intel_memcpy,
1976 intel_memset,
1977 1, /* scalar_stmt_cost. */
1978 1, /* scalar load_cost. */
1979 1, /* scalar_store_cost. */
1980 1, /* vec_stmt_cost. */
1981 4, /* vec_to_scalar_cost. */
1982 1, /* scalar_to_vec_cost. */
1983 1, /* vec_align_load_cost. */
1984 2, /* vec_unalign_load_cost. */
1985 1, /* vec_store_cost. */
1986 3, /* cond_taken_branch_cost. */
1987 1, /* cond_not_taken_branch_cost. */
1990 /* Generic should produce code tuned for Core i7 and btver1
1991 (and newer chips). */
1993 static stringop_algs generic_memcpy[2] = {
1994 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1995 {-1, libcall, false}}},
1996 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1997 {-1, libcall, false}}}};
1998 static stringop_algs generic_memset[2] = {
1999 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2000 {-1, libcall, false}}},
2001 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2002 {-1, libcall, false}}}};
2003 static const
2004 struct processor_costs generic_cost = {
2005 COSTS_N_INSNS (1), /* cost of an add instruction */
2006 /* On all chips taken into consideration, lea takes 2 cycles or more. With
2007 that cost, however, our current implementation of synth_mult ends up
2008 using unnecessary temporary registers, causing regressions on several
2009 SPECfp benchmarks. */
2010 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2011 COSTS_N_INSNS (1), /* variable shift costs */
2012 COSTS_N_INSNS (1), /* constant shift costs */
2013 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2014 COSTS_N_INSNS (4), /* HI */
2015 COSTS_N_INSNS (3), /* SI */
2016 COSTS_N_INSNS (4), /* DI */
2017 COSTS_N_INSNS (2)}, /* other */
2018 0, /* cost of multiply per each bit set */
2019 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2020 COSTS_N_INSNS (26), /* HI */
2021 COSTS_N_INSNS (42), /* SI */
2022 COSTS_N_INSNS (74), /* DI */
2023 COSTS_N_INSNS (74)}, /* other */
2024 COSTS_N_INSNS (1), /* cost of movsx */
2025 COSTS_N_INSNS (1), /* cost of movzx */
2026 8, /* "large" insn */
2027 17, /* MOVE_RATIO */
2028 4, /* cost for loading QImode using movzbl */
2029 {4, 4, 4}, /* cost of loading integer registers
2030 in QImode, HImode and SImode.
2031 Relative to reg-reg move (2). */
2032 {4, 4, 4}, /* cost of storing integer registers */
2033 4, /* cost of reg,reg fld/fst */
2034 {12, 12, 12}, /* cost of loading fp registers
2035 in SFmode, DFmode and XFmode */
2036 {6, 6, 8}, /* cost of storing fp registers
2037 in SFmode, DFmode and XFmode */
2038 2, /* cost of moving MMX register */
2039 {8, 8}, /* cost of loading MMX registers
2040 in SImode and DImode */
2041 {8, 8}, /* cost of storing MMX registers
2042 in SImode and DImode */
2043 2, /* cost of moving SSE register */
2044 {8, 8, 8}, /* cost of loading SSE registers
2045 in SImode, DImode and TImode */
2046 {8, 8, 8}, /* cost of storing SSE registers
2047 in SImode, DImode and TImode */
2048 5, /* MMX or SSE register to integer */
2049 32, /* size of l1 cache. */
2050 512, /* size of l2 cache. */
2051 64, /* size of prefetch block */
2052 6, /* number of parallel prefetches */
2053 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2054 value is increased to the perhaps more appropriate value of 5. */
2055 3, /* Branch cost */
2056 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2057 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2058 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2059 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2060 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2061 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2062 generic_memcpy,
2063 generic_memset,
2064 1, /* scalar_stmt_cost. */
2065 1, /* scalar load_cost. */
2066 1, /* scalar_store_cost. */
2067 1, /* vec_stmt_cost. */
2068 1, /* vec_to_scalar_cost. */
2069 1, /* scalar_to_vec_cost. */
2070 1, /* vec_align_load_cost. */
2071 2, /* vec_unalign_load_cost. */
2072 1, /* vec_store_cost. */
2073 3, /* cond_taken_branch_cost. */
2074 1, /* cond_not_taken_branch_cost. */
2077 /* core_cost should produce code tuned for the Core family of CPUs. */
2078 static stringop_algs core_memcpy[2] = {
2079 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2080 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2081 {-1, libcall, false}}}};
2082 static stringop_algs core_memset[2] = {
2083 {libcall, {{6, loop_1_byte, true},
2084 {24, loop, true},
2085 {8192, rep_prefix_4_byte, true},
2086 {-1, libcall, false}}},
2087 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2088 {-1, libcall, false}}}};
2090 static const
2091 struct processor_costs core_cost = {
2092 COSTS_N_INSNS (1), /* cost of an add instruction */
2093 /* On all chips taken into consideration, lea takes 2 cycles or more. With
2094 that cost, however, our current implementation of synth_mult ends up
2095 using unnecessary temporary registers, causing regressions on several
2096 SPECfp benchmarks. */
2097 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2098 COSTS_N_INSNS (1), /* variable shift costs */
2099 COSTS_N_INSNS (1), /* constant shift costs */
2100 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2101 COSTS_N_INSNS (4), /* HI */
2102 COSTS_N_INSNS (3), /* SI */
2103 COSTS_N_INSNS (4), /* DI */
2104 COSTS_N_INSNS (2)}, /* other */
2105 0, /* cost of multiply per each bit set */
2106 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2107 COSTS_N_INSNS (26), /* HI */
2108 COSTS_N_INSNS (42), /* SI */
2109 COSTS_N_INSNS (74), /* DI */
2110 COSTS_N_INSNS (74)}, /* other */
2111 COSTS_N_INSNS (1), /* cost of movsx */
2112 COSTS_N_INSNS (1), /* cost of movzx */
2113 8, /* "large" insn */
2114 17, /* MOVE_RATIO */
2115 4, /* cost for loading QImode using movzbl */
2116 {4, 4, 4}, /* cost of loading integer registers
2117 in QImode, HImode and SImode.
2118 Relative to reg-reg move (2). */
2119 {4, 4, 4}, /* cost of storing integer registers */
2120 4, /* cost of reg,reg fld/fst */
2121 {12, 12, 12}, /* cost of loading fp registers
2122 in SFmode, DFmode and XFmode */
2123 {6, 6, 8}, /* cost of storing fp registers
2124 in SFmode, DFmode and XFmode */
2125 2, /* cost of moving MMX register */
2126 {8, 8}, /* cost of loading MMX registers
2127 in SImode and DImode */
2128 {8, 8}, /* cost of storing MMX registers
2129 in SImode and DImode */
2130 2, /* cost of moving SSE register */
2131 {8, 8, 8}, /* cost of loading SSE registers
2132 in SImode, DImode and TImode */
2133 {8, 8, 8}, /* cost of storing SSE registers
2134 in SImode, DImode and TImode */
2135 5, /* MMX or SSE register to integer */
2136 64, /* size of l1 cache. */
2137 512, /* size of l2 cache. */
2138 64, /* size of prefetch block */
2139 6, /* number of parallel prefetches */
2140 /* FIXME: perhaps a more appropriate value is 5. */
2141 3, /* Branch cost */
2142 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2143 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2144 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2145 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2146 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2147 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2148 core_memcpy,
2149 core_memset,
2150 1, /* scalar_stmt_cost. */
2151 1, /* scalar load_cost. */
2152 1, /* scalar_store_cost. */
2153 1, /* vec_stmt_cost. */
2154 1, /* vec_to_scalar_cost. */
2155 1, /* scalar_to_vec_cost. */
2156 1, /* vec_align_load_cost. */
2157 2, /* vec_unalign_load_cost. */
2158 1, /* vec_store_cost. */
2159 3, /* cond_taken_branch_cost. */
2160 1, /* cond_not_taken_branch_cost. */
2164 /* Set by -mtune. */
2165 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2167 /* Set by -mtune or -Os. */
2168 const struct processor_costs *ix86_cost = &pentium_cost;
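/* A hedged sketch of how these pointers are reassigned during option
   override (see ix86_option_override_internal; exact details may
   differ):

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     ix86_cost = optimize_size ? &ix86_size_cost : ix86_tune_cost;

   so the rtx-cost and vectorizer-cost hooks below consult the table
   matching the selected -mtune, or the size table under -Os.  */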
2170 /* Processor feature/optimization bitmasks. */
2171 #define m_386 (1U<<PROCESSOR_I386)
2172 #define m_486 (1U<<PROCESSOR_I486)
2173 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2174 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2175 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2176 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2177 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2178 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2179 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2180 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2181 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2182 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2183 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2184 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2185 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2186 #define m_KNL (1U<<PROCESSOR_KNL)
2187 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2188 #define m_INTEL (1U<<PROCESSOR_INTEL)
2190 #define m_GEODE (1U<<PROCESSOR_GEODE)
2191 #define m_K6 (1U<<PROCESSOR_K6)
2192 #define m_K6_GEODE (m_K6 | m_GEODE)
2193 #define m_K8 (1U<<PROCESSOR_K8)
2194 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2195 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2196 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2197 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2198 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2199 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2200 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2201 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2202 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2203 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2204 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2205 #define m_BTVER (m_BTVER1 | m_BTVER2)
2206 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2207 | m_ZNVER1)
2209 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
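/* These masks are OR'ed together to form the selector argument of the
   DEF_TUNE entries in x86-tune.def.  An illustrative (not necessarily
   verbatim) entry:

     DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
	       m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)

   A tuning applies to the active -mtune when the selector has the bit
   for that processor set.  */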
2211 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2212 #undef DEF_TUNE
2213 #define DEF_TUNE(tune, name, selector) name,
2214 #include "x86-tune.def"
2215 #undef DEF_TUNE
2218 /* Feature tests against the various tunings. */
2219 unsigned char ix86_tune_features[X86_TUNE_LAST];
2221 /* Feature tests against the various tunings used to create ix86_tune_features
2222 based on the processor mask. */
2223 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2224 #undef DEF_TUNE
2225 #define DEF_TUNE(tune, name, selector) selector,
2226 #include "x86-tune.def"
2227 #undef DEF_TUNE
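/* Sketch of how the selectors above become per-feature flags during
   option override (shown for orientation only):

     unsigned int ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   The TARGET_* tuning macros in i386.h then simply index
   ix86_tune_features.  */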
2230 /* Feature tests against the various architecture variations. */
2231 unsigned char ix86_arch_features[X86_ARCH_LAST];
2233 /* Feature tests against the various architecture variations, used to create
2234 ix86_arch_features based on the processor mask. */
2235 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2236 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2237 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2239 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2240 ~m_386,
2242 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2243 ~(m_386 | m_486),
2245 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2246 ~m_386,
2248 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2249 ~m_386,
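/* ix86_arch_features is filled analogously during option override,
   using the -march processor as the mask (sketch):

     unsigned int ix86_arch_mask = 1u << ix86_arch;
     for (i = 0; i < X86_ARCH_LAST; ++i)
       ix86_arch_features[i]
	 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
 */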
2252 /* If the average insn count for a single function invocation is
2253 lower than this constant, emit fast (but longer) prologue and
2254 epilogue code. */
2255 #define FAST_PROLOGUE_INSN_COUNT 20
2257 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2258 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2259 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2260 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2262 /* Array of the smallest class containing reg number REGNO, indexed by
2263 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2265 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2267 /* ax, dx, cx, bx */
2268 AREG, DREG, CREG, BREG,
2269 /* si, di, bp, sp */
2270 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2271 /* FP registers */
2272 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2273 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2274 /* arg pointer */
2275 NON_Q_REGS,
2276 /* flags, fpsr, fpcr, frame */
2277 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2278 /* SSE registers */
2279 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2280 SSE_REGS, SSE_REGS,
2281 /* MMX registers */
2282 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2283 MMX_REGS, MMX_REGS,
2284 /* REX registers */
2285 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2286 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2287 /* SSE REX registers */
2288 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2289 SSE_REGS, SSE_REGS,
2290 /* AVX-512 SSE registers */
2291 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2292 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2293 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2294 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2295 /* Mask registers. */
2296 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2297 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2298 /* MPX bound registers */
2299 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
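/* i386.h consumes this map roughly as

     #define REGNO_REG_CLASS(REGNO) (regclass_map[(REGNO)])

   so e.g. REGNO_REG_CLASS (AX_REG) is AREG, and the AVX-512 zmm16-31
   registers come back as EVEX_SSE_REGS.  */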
2302 /* The "default" register map used in 32bit mode. */
2304 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2306 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2307 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2308 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2309 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2310 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2311 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2312 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2313 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2314 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2315 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2316 101, 102, 103, 104, /* bound registers */
2319 /* The "default" register map used in 64bit mode. */
2321 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2323 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2324 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2325 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2326 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2327 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2328 8,9,10,11,12,13,14,15, /* extended integer registers */
2329 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2330 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2331 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2332 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2333 126, 127, 128, 129, /* bound registers */
2336 /* Define the register numbers to be used in Dwarf debugging information.
2337 The SVR4 reference port C compiler uses the following register numbers
2338 in its Dwarf output code:
2339 0 for %eax (gcc regno = 0)
2340 1 for %ecx (gcc regno = 2)
2341 2 for %edx (gcc regno = 1)
2342 3 for %ebx (gcc regno = 3)
2343 4 for %esp (gcc regno = 7)
2344 5 for %ebp (gcc regno = 6)
2345 6 for %esi (gcc regno = 4)
2346 7 for %edi (gcc regno = 5)
2347 The following three DWARF register numbers are never generated by
2348 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2349 believes these numbers have these meanings.
2350 8 for %eip (no gcc equivalent)
2351 9 for %eflags (gcc regno = 17)
2352 10 for %trapno (no gcc equivalent)
2353 It is not at all clear how we should number the FP stack registers
2354 for the x86 architecture. If the version of SDB on x86/svr4 were
2355 a bit less brain dead with respect to floating-point then we would
2356 have a precedent to follow with respect to DWARF register numbers
2357 for x86 FP registers, but the SDB on x86/svr4 is so completely
2358 broken with respect to FP registers that it is hardly worth thinking
2359 of it as something to strive for compatibility with.
2360 The version of x86/svr4 SDB I have at the moment does (partially)
2361 seem to believe that DWARF register number 11 is associated with
2362 the x86 register %st(0), but that's about all. Higher DWARF
2363 register numbers don't seem to be associated with anything in
2364 particular, and even for DWARF regno 11, SDB only seems to under-
2365 stand that it should say that a variable lives in %st(0) (when
2366 asked via an `=' command) if we said it was in DWARF regno 11,
2367 but SDB still prints garbage when asked for the value of the
2368 variable in question (via a `/' command).
2369 (Also note that the labels SDB prints for various FP stack regs
2370 when doing an `x' command are all wrong.)
2371 Note that these problems generally don't affect the native SVR4
2372 C compiler because it doesn't allow the use of -O with -g and
2373 because when it is *not* optimizing, it allocates a memory
2374 location for each floating-point variable, and the memory
2375 location is what gets described in the DWARF AT_location
2376 attribute for the variable in question.
2377 Regardless of the severe mental illness of the x86/svr4 SDB, we
2378 do something sensible here and we use the following DWARF
2379 register numbers. Note that these are all stack-top-relative
2380 numbers.
2381 11 for %st(0) (gcc regno = 8)
2382 12 for %st(1) (gcc regno = 9)
2383 13 for %st(2) (gcc regno = 10)
2384 14 for %st(3) (gcc regno = 11)
2385 15 for %st(4) (gcc regno = 12)
2386 16 for %st(5) (gcc regno = 13)
2387 17 for %st(6) (gcc regno = 14)
2388 18 for %st(7) (gcc regno = 15)
2390 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2392 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2393 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2394 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2395 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2396 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2397 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2398 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2399 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2400 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2401 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2402 101, 102, 103, 104, /* bound registers */
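/* These maps are consumed through DBX_REGISTER_NUMBER (and the related
   DWARF frame-register macros) in the i386 headers; roughly, 64-bit
   targets use dbx64_register_map while 32-bit SVR4-style targets use
   svr4_dbx_register_map, so on those targets %ebp is emitted as DWARF
   register 5.  */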
2405 /* Define parameter passing and return registers. */
2407 static int const x86_64_int_parameter_registers[6] =
2409 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2412 static int const x86_64_ms_abi_int_parameter_registers[4] =
2414 CX_REG, DX_REG, R8_REG, R9_REG
2417 static int const x86_64_int_return_registers[4] =
2419 AX_REG, DX_REG, DI_REG, SI_REG
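/* For orientation: under the SysV x86-64 ABI a call such as

     long f (long a, long b, long c, long d, long e, long g);

   receives its six integer arguments in rdi, rsi, rdx, rcx, r8 and r9
   (the order of x86_64_int_parameter_registers) and returns in rax,
   with rdx carrying the second half of a 128-bit integer return.  The
   MS ABI table covers only four register arguments: rcx, rdx, r8, r9.  */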
2422 /* Additional registers that are clobbered by SYSV calls. */
2424 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2426 SI_REG, DI_REG,
2427 XMM6_REG, XMM7_REG,
2428 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2429 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2432 /* Define the structure for the machine field in struct function. */
2434 struct GTY(()) stack_local_entry {
2435 unsigned short mode;
2436 unsigned short n;
2437 rtx rtl;
2438 struct stack_local_entry *next;
2441 /* Structure describing stack frame layout.
2442 Stack grows downward:
2444 [arguments]
2445 <- ARG_POINTER
2446 saved pc
2448 saved static chain if ix86_static_chain_on_stack
2450 saved frame pointer if frame_pointer_needed
2451 <- HARD_FRAME_POINTER
2452 [saved regs]
2453 <- regs_save_offset
2454 [padding0]
2456 [saved SSE regs]
2457 <- sse_regs_save_offset
2458 [padding1] |
2459 | <- FRAME_POINTER
2460 [va_arg registers] |
2462 [frame] |
2464 [padding2] | = to_allocate
2465 <- STACK_POINTER
2467 struct ix86_frame
2469 int nsseregs;
2470 int nregs;
2471 int va_arg_size;
2472 int red_zone_size;
2473 int outgoing_arguments_size;
2475 /* The offsets relative to ARG_POINTER. */
2476 HOST_WIDE_INT frame_pointer_offset;
2477 HOST_WIDE_INT hard_frame_pointer_offset;
2478 HOST_WIDE_INT stack_pointer_offset;
2479 HOST_WIDE_INT hfp_save_offset;
2480 HOST_WIDE_INT reg_save_offset;
2481 HOST_WIDE_INT sse_reg_save_offset;
2483 /* When save_regs_using_mov is set, emit prologue using
2484 move instead of push instructions. */
2485 bool save_regs_using_mov;
2488 /* Which cpu are we scheduling for. */
2489 enum attr_cpu ix86_schedule;
2491 /* Which cpu are we optimizing for. */
2492 enum processor_type ix86_tune;
2494 /* Which instruction set architecture to use. */
2495 enum processor_type ix86_arch;
2497 /* True if processor has SSE prefetch instruction. */
2498 unsigned char x86_prefetch_sse;
2500 /* -mstackrealign option */
2501 static const char ix86_force_align_arg_pointer_string[]
2502 = "force_align_arg_pointer";
2504 static rtx (*ix86_gen_leave) (void);
2505 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2506 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2507 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2508 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2509 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2510 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2511 static rtx (*ix86_gen_clzero) (rtx);
2512 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2513 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2514 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2515 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2516 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2517 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2519 /* Preferred alignment for stack boundary in bits. */
2520 unsigned int ix86_preferred_stack_boundary;
2522 /* Alignment for incoming stack boundary in bits specified at
2523 command line. */
2524 static unsigned int ix86_user_incoming_stack_boundary;
2526 /* Default alignment for incoming stack boundary in bits. */
2527 static unsigned int ix86_default_incoming_stack_boundary;
2529 /* Alignment for incoming stack boundary in bits. */
2530 unsigned int ix86_incoming_stack_boundary;
2532 /* Calling-ABI-specific va_list type nodes. */
2533 static GTY(()) tree sysv_va_list_type_node;
2534 static GTY(()) tree ms_va_list_type_node;
2536 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2537 char internal_label_prefix[16];
2538 int internal_label_prefix_len;
2540 /* Fence to use after loop using movnt. */
2541 tree x86_mfence;
2543 /* Register class used for passing a given 64-bit part of the argument.
2544 These represent classes as documented by the psABI, with the exception
2545 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2546 just uses an SFmode or DFmode move instead of DImode to avoid
2548 reformatting penalties. Similarly we play games with INTEGERSI_CLASS to
2549 use cheaper SImode moves whenever possible (the upper half does contain padding). */
2550 enum x86_64_reg_class
2552 X86_64_NO_CLASS,
2553 X86_64_INTEGER_CLASS,
2554 X86_64_INTEGERSI_CLASS,
2555 X86_64_SSE_CLASS,
2556 X86_64_SSESF_CLASS,
2557 X86_64_SSEDF_CLASS,
2558 X86_64_SSEUP_CLASS,
2559 X86_64_X87_CLASS,
2560 X86_64_X87UP_CLASS,
2561 X86_64_COMPLEX_X87_CLASS,
2562 X86_64_MEMORY_CLASS
2565 #define MAX_CLASSES 8
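/* An illustrative classification under these rules (classify_argument
   below implements the real algorithm; this is only an example):

     struct s { double d; long l; };

   occupies two eightbytes and classifies as { X86_64_SSEDF_CLASS,
   X86_64_INTEGER_CLASS }, so it is passed in one SSE register and one
   integer register.  */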
2567 /* Table of constants used by fldpi, fldln2, etc.... */
2568 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2569 static bool ext_80387_constants_init = 0;
2572 static struct machine_function * ix86_init_machine_status (void);
2573 static rtx ix86_function_value (const_tree, const_tree, bool);
2574 static bool ix86_function_value_regno_p (const unsigned int);
2575 static unsigned int ix86_function_arg_boundary (machine_mode,
2576 const_tree);
2577 static rtx ix86_static_chain (const_tree, bool);
2578 static int ix86_function_regparm (const_tree, const_tree);
2579 static void ix86_compute_frame_layout (struct ix86_frame *);
2580 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2581 rtx, rtx, int);
2582 static void ix86_add_new_builtins (HOST_WIDE_INT);
2583 static tree ix86_canonical_va_list_type (tree);
2584 static void predict_jump (int);
2585 static unsigned int split_stack_prologue_scratch_regno (void);
2586 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2588 enum ix86_function_specific_strings
2590 IX86_FUNCTION_SPECIFIC_ARCH,
2591 IX86_FUNCTION_SPECIFIC_TUNE,
2592 IX86_FUNCTION_SPECIFIC_MAX
2595 static char *ix86_target_string (HOST_WIDE_INT, int, int, const char *,
2596 const char *, enum fpmath_unit, bool);
2597 static void ix86_function_specific_save (struct cl_target_option *,
2598 struct gcc_options *opts);
2599 static void ix86_function_specific_restore (struct gcc_options *opts,
2600 struct cl_target_option *);
2601 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2602 static void ix86_function_specific_print (FILE *, int,
2603 struct cl_target_option *);
2604 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2605 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2606 struct gcc_options *,
2607 struct gcc_options *,
2608 struct gcc_options *);
2609 static bool ix86_can_inline_p (tree, tree);
2610 static void ix86_set_current_function (tree);
2611 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2613 static enum calling_abi ix86_function_abi (const_tree);
2616 #ifndef SUBTARGET32_DEFAULT_CPU
2617 #define SUBTARGET32_DEFAULT_CPU "i386"
2618 #endif
2620 /* Whether -mtune= or -march= were specified. */
2621 static int ix86_tune_defaulted;
2622 static int ix86_arch_specified;
2624 /* Vectorization library interface and handlers. */
2625 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2627 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2628 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2630 /* Processor target table, indexed by processor number */
2631 struct ptt
2633 const char *const name; /* processor name */
2634 const struct processor_costs *cost; /* Processor costs */
2635 const int align_loop; /* Default alignments. */
2636 const int align_loop_max_skip;
2637 const int align_jump;
2638 const int align_jump_max_skip;
2639 const int align_func;
2642 /* This table must be in sync with enum processor_type in i386.h. */
2643 static const struct ptt processor_target_table[PROCESSOR_max] =
2645 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2646 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2647 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2648 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2649 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2650 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2651 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2652 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2653 {"core2", &core_cost, 16, 10, 16, 10, 16},
2654 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2655 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2656 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2657 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2658 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2659 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2660 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2661 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2662 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2663 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2664 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2665 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2666 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2667 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2668 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2669 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2670 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2671 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2672 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2673 {"znver1", &znver1_cost, 16, 10, 16, 7, 11}
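/* Sketch of how the alignment columns are used when the user gave no
   explicit -falign-* options (the real code is in
   ix86_option_override_internal and may differ in detail):

     if (opts->x_align_loops == 0)
       {
	 opts->x_align_loops
	   = processor_target_table[ix86_tune].align_loop;
	 opts->x_align_loops_max_skip
	   = processor_target_table[ix86_tune].align_loop_max_skip;
       }
 */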
2676 static unsigned int
2677 rest_of_handle_insert_vzeroupper (void)
2679 int i;
2681 /* vzeroupper instructions are inserted immediately after reload to
2682 account for possible spills from 256-bit registers. The pass
2683 reuses the mode switching infrastructure by re-running the mode
2684 insertion pass, so disable entities that have already been processed. */
2685 for (i = 0; i < MAX_386_ENTITIES; i++)
2686 ix86_optimize_mode_switching[i] = 0;
2688 ix86_optimize_mode_switching[AVX_U128] = 1;
2690 /* Call optimize_mode_switching. */
2691 g->get_passes ()->execute_pass_mode_switching ();
2692 return 0;
2695 /* Return true if INSN uses or defines a hard register.
2696 Hard register uses in a memory address are ignored.
2697 Clobbers and flags definitions are ignored. */
2699 static bool
2700 has_non_address_hard_reg (rtx_insn *insn)
2702 df_ref ref;
2703 FOR_EACH_INSN_DEF (ref, insn)
2704 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2705 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2706 && DF_REF_REGNO (ref) != FLAGS_REG)
2707 return true;
2709 FOR_EACH_INSN_USE (ref, insn)
2710 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2711 return true;
2713 return false;
2716 /* Check if comparison INSN may be transformed
2717 into a vector comparison. Currently we transform
2718 only zero checks which look like:
2720 (set (reg:CCZ 17 flags)
2721 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2722 (subreg:SI (reg:DI x) 0))
2723 (const_int 0 [0]))) */
2725 static bool
2726 convertible_comparison_p (rtx_insn *insn)
2728 if (!TARGET_SSE4_1)
2729 return false;
2731 rtx def_set = single_set (insn);
2733 gcc_assert (def_set);
2735 rtx src = SET_SRC (def_set);
2736 rtx dst = SET_DEST (def_set);
2738 gcc_assert (GET_CODE (src) == COMPARE);
2740 if (GET_CODE (dst) != REG
2741 || REGNO (dst) != FLAGS_REG
2742 || GET_MODE (dst) != CCZmode)
2743 return false;
2745 rtx op1 = XEXP (src, 0);
2746 rtx op2 = XEXP (src, 1);
2748 if (op2 != CONST0_RTX (GET_MODE (op2)))
2749 return false;
2751 if (GET_CODE (op1) != IOR)
2752 return false;
2754 op2 = XEXP (op1, 1);
2755 op1 = XEXP (op1, 0);
2757 if (!SUBREG_P (op1)
2758 || !SUBREG_P (op2)
2759 || GET_MODE (op1) != SImode
2760 || GET_MODE (op2) != SImode
2761 || ((SUBREG_BYTE (op1) != 0
2762 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
2763 && (SUBREG_BYTE (op2) != 0
2764 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
2765 return false;
2767 op1 = SUBREG_REG (op1);
2768 op2 = SUBREG_REG (op2);
2770 if (op1 != op2
2771 || !REG_P (op1)
2772 || GET_MODE (op1) != DImode)
2773 return false;
2775 return true;
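/* For orientation, the pattern above typically arises from 32-bit code
   testing a 64-bit value for zero, e.g.

     long long x;
     if (x == 0) ...

   which is expanded as an OR of the two SImode halves compared against
   zero; accepting only that shape lets the STV pass replace it with a
   single vector test (presumably why SSE4.1 is required above).  */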
2778 /* The DImode version of scalar_to_vector_candidate_p. */
2780 static bool
2781 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
2783 rtx def_set = single_set (insn);
2785 if (!def_set)
2786 return false;
2788 if (has_non_address_hard_reg (insn))
2789 return false;
2791 rtx src = SET_SRC (def_set);
2792 rtx dst = SET_DEST (def_set);
2794 if (GET_CODE (src) == COMPARE)
2795 return convertible_comparison_p (insn);
2797 /* We are interested in DImode promotion only. */
2798 if ((GET_MODE (src) != DImode
2799 && !CONST_INT_P (src))
2800 || GET_MODE (dst) != DImode)
2801 return false;
2803 if (!REG_P (dst) && !MEM_P (dst))
2804 return false;
2806 switch (GET_CODE (src))
2808 case ASHIFT:
2809 case LSHIFTRT:
2810 /* Consider only shifts by a constant amount smaller
2811 than the general register width. */
2812 if (!(CONST_INT_P (XEXP (src, 1))
2813 && IN_RANGE (INTVAL (XEXP (src, 1)), 0, 31)))
2814 return false;
2815 break;
2817 case PLUS:
2818 case MINUS:
2819 case IOR:
2820 case XOR:
2821 case AND:
2822 if (!REG_P (XEXP (src, 1))
2823 && !MEM_P (XEXP (src, 1))
2824 && !CONST_INT_P (XEXP (src, 1)))
2825 return false;
2826 break;
2828 case REG:
2829 return true;
2831 case MEM:
2832 case CONST_INT:
2833 return REG_P (dst);
2835 default:
2836 return false;
2839 if (!REG_P (XEXP (src, 0))
2840 && !MEM_P (XEXP (src, 0))
2841 && !CONST_INT_P (XEXP (src, 0))
2842 /* Check for andnot case. */
2843 && (GET_CODE (src) != AND
2844 || GET_CODE (XEXP (src, 0)) != NOT
2845 || !REG_P (XEXP (XEXP (src, 0), 0))))
2846 return false;
2848 if ((GET_MODE (XEXP (src, 0)) != DImode
2849 && !CONST_INT_P (XEXP (src, 0)))
2850 || (GET_MODE (XEXP (src, 1)) != DImode
2851 && !CONST_INT_P (XEXP (src, 1))))
2852 return false;
2854 return true;
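/* An example of the kind of statement accepted here when !TARGET_64BIT,
   where DImode values otherwise live in register pairs:

     unsigned long long a, b, c;
     c = a | b;

   Bitwise AND/OR/XOR, PLUS, MINUS, moves, and constant shifts by 0..31
   on DImode values are all candidates; the STV pass below converts the
   accepted chains to V2DImode SSE operations.  */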
2857 /* The TImode version of scalar_to_vector_candidate_p. */
2859 static bool
2860 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2862 rtx def_set = single_set (insn);
2864 if (!def_set)
2865 return false;
2867 if (has_non_address_hard_reg (insn))
2868 return false;
2870 rtx src = SET_SRC (def_set);
2871 rtx dst = SET_DEST (def_set);
2873 /* Only TImode load and store are allowed. */
2874 if (GET_MODE (dst) != TImode)
2875 return false;
2877 if (MEM_P (dst))
2879 /* Check for a store. The memory must be aligned, or unaligned
2880 stores must be optimal on this target. Only support stores from a
2881 register, a standard SSE constant, or a CONST_WIDE_INT generated from a piecewise store.
2883 ??? Verify performance impact before enabling CONST_INT for
2884 __int128 store. */
2885 if (misaligned_operand (dst, TImode)
2886 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2887 return false;
2889 switch (GET_CODE (src))
2891 default:
2892 return false;
2894 case REG:
2895 case CONST_WIDE_INT:
2896 return true;
2898 case CONST_INT:
2899 return standard_sse_constant_p (src, TImode);
2902 else if (MEM_P (src))
2904 /* Check for a load. The memory must be aligned, or unaligned loads
2905 must be optimal on this target. */
2906 return (REG_P (dst)
2907 && (!misaligned_operand (src, TImode)
2908 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2911 return false;
2914 /* Return true if INSN may be converted into a vector
2915 instruction. */
2917 static bool
2918 scalar_to_vector_candidate_p (rtx_insn *insn)
2920 if (TARGET_64BIT)
2921 return timode_scalar_to_vector_candidate_p (insn);
2922 else
2923 return dimode_scalar_to_vector_candidate_p (insn);
2926 /* The DImode version of remove_non_convertible_regs. */
2928 static void
2929 dimode_remove_non_convertible_regs (bitmap candidates)
2931 bitmap_iterator bi;
2932 unsigned id;
2933 bitmap regs = BITMAP_ALLOC (NULL);
2935 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2937 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
2938 rtx reg = SET_DEST (def_set);
2940 if (!REG_P (reg)
2941 || bitmap_bit_p (regs, REGNO (reg))
2942 || HARD_REGISTER_P (reg))
2943 continue;
2945 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
2946 def;
2947 def = DF_REF_NEXT_REG (def))
2949 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2951 if (dump_file)
2952 fprintf (dump_file,
2953 "r%d has non convertible definition in insn %d\n",
2954 REGNO (reg), DF_REF_INSN_UID (def));
2956 bitmap_set_bit (regs, REGNO (reg));
2957 break;
2962 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2964 for (df_ref def = DF_REG_DEF_CHAIN (id);
2965 def;
2966 def = DF_REF_NEXT_REG (def))
2967 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2969 if (dump_file)
2970 fprintf (dump_file, "Removing insn %d from candidates list\n",
2971 DF_REF_INSN_UID (def));
2973 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2977 BITMAP_FREE (regs);
2980 /* For a register REGNO, scan instructions for its defs and uses.
2981 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2983 static void
2984 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2985 unsigned int regno)
2987 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2988 def;
2989 def = DF_REF_NEXT_REG (def))
2991 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2993 if (dump_file)
2994 fprintf (dump_file,
2995 "r%d has non convertible def in insn %d\n",
2996 regno, DF_REF_INSN_UID (def));
2998 bitmap_set_bit (regs, regno);
2999 break;
3003 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3004 ref;
3005 ref = DF_REF_NEXT_REG (ref))
3007 /* Debug instructions are skipped. */
3008 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3009 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3011 if (dump_file)
3012 fprintf (dump_file,
3013 "r%d has non convertible use in insn %d\n",
3014 regno, DF_REF_INSN_UID (ref));
3016 bitmap_set_bit (regs, regno);
3017 break;
3022 /* The TImode version of remove_non_convertible_regs. */
3024 static void
3025 timode_remove_non_convertible_regs (bitmap candidates)
3027 bitmap_iterator bi;
3028 unsigned id;
3029 bitmap regs = BITMAP_ALLOC (NULL);
3031 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3033 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3034 rtx dest = SET_DEST (def_set);
3035 rtx src = SET_SRC (def_set);
3037 if ((!REG_P (dest)
3038 || bitmap_bit_p (regs, REGNO (dest))
3039 || HARD_REGISTER_P (dest))
3040 && (!REG_P (src)
3041 || bitmap_bit_p (regs, REGNO (src))
3042 || HARD_REGISTER_P (src)))
3043 continue;
3045 if (REG_P (dest))
3046 timode_check_non_convertible_regs (candidates, regs,
3047 REGNO (dest));
3049 if (REG_P (src))
3050 timode_check_non_convertible_regs (candidates, regs,
3051 REGNO (src));
3054 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3056 for (df_ref def = DF_REG_DEF_CHAIN (id);
3057 def;
3058 def = DF_REF_NEXT_REG (def))
3059 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3061 if (dump_file)
3062 fprintf (dump_file, "Removing insn %d from candidates list\n",
3063 DF_REF_INSN_UID (def));
3065 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3068 for (df_ref ref = DF_REG_USE_CHAIN (id);
3069 ref;
3070 ref = DF_REF_NEXT_REG (ref))
3071 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3073 if (dump_file)
3074 fprintf (dump_file, "Removing insn %d from candidates list\n",
3075 DF_REF_INSN_UID (ref));
3077 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3081 BITMAP_FREE (regs);
3084 /* For a given bitmap of insn UIDs, scan all instructions and
3085 remove an insn from CANDIDATES if it has both convertible
3086 and non-convertible definitions.
3088 All insns in a bitmap are conversion candidates according to
3089 scalar_to_vector_candidate_p. Currently it implies all insns
3090 are single_set. */
3092 static void
3093 remove_non_convertible_regs (bitmap candidates)
3095 if (TARGET_64BIT)
3096 timode_remove_non_convertible_regs (candidates);
3097 else
3098 dimode_remove_non_convertible_regs (candidates);
3101 class scalar_chain
3103 public:
3104 scalar_chain ();
3105 virtual ~scalar_chain ();
3107 static unsigned max_id;
3109 /* ID of a chain. */
3110 unsigned int chain_id;
3111 /* A queue of instructions to be included into a chain. */
3112 bitmap queue;
3113 /* Instructions included into a chain. */
3114 bitmap insns;
3115 /* All registers defined by a chain. */
3116 bitmap defs;
3117 /* Registers used in both vector and scalar modes. */
3118 bitmap defs_conv;
3120 void build (bitmap candidates, unsigned insn_uid);
3121 virtual int compute_convert_gain () = 0;
3122 int convert ();
3124 protected:
3125 void add_to_queue (unsigned insn_uid);
3126 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3128 private:
3129 void add_insn (bitmap candidates, unsigned insn_uid);
3130 void analyze_register_chain (bitmap candidates, df_ref ref);
3131 virtual void mark_dual_mode_def (df_ref def) = 0;
3132 virtual void convert_insn (rtx_insn *insn) = 0;
3133 virtual void convert_registers () = 0;
3136 class dimode_scalar_chain : public scalar_chain
3138 public:
3139 int compute_convert_gain ();
3140 private:
3141 void mark_dual_mode_def (df_ref def);
3142 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3143 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3144 void convert_insn (rtx_insn *insn);
3145 void convert_op (rtx *op, rtx_insn *insn);
3146 void convert_reg (unsigned regno);
3147 void make_vector_copies (unsigned regno);
3148 void convert_registers ();
3149 int vector_const_cost (rtx exp);
3152 class timode_scalar_chain : public scalar_chain
3154 public:
3155 /* Conversion from TImode to V1TImode is always faster. */
3156 int compute_convert_gain () { return 1; }
3158 private:
3159 void mark_dual_mode_def (df_ref def);
3160 void fix_debug_reg_uses (rtx reg);
3161 void convert_insn (rtx_insn *insn);
3162 /* We don't convert registers to a different size. */
3163 void convert_registers () {}
3166 unsigned scalar_chain::max_id = 0;
3168 /* Initialize new chain. */
3170 scalar_chain::scalar_chain ()
3172 chain_id = ++max_id;
3174 if (dump_file)
3175 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3177 bitmap_obstack_initialize (NULL);
3178 insns = BITMAP_ALLOC (NULL);
3179 defs = BITMAP_ALLOC (NULL);
3180 defs_conv = BITMAP_ALLOC (NULL);
3181 queue = NULL;
3184 /* Free chain's data. */
3186 scalar_chain::~scalar_chain ()
3188 BITMAP_FREE (insns);
3189 BITMAP_FREE (defs);
3190 BITMAP_FREE (defs_conv);
3191 bitmap_obstack_release (NULL);
3194 /* Add instruction into the chain's queue. */
3196 void
3197 scalar_chain::add_to_queue (unsigned insn_uid)
3199 if (bitmap_bit_p (insns, insn_uid)
3200 || bitmap_bit_p (queue, insn_uid))
3201 return;
3203 if (dump_file)
3204 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3205 insn_uid, chain_id);
3206 bitmap_set_bit (queue, insn_uid);
3209 /* For DImode conversion, mark register defined by DEF as requiring
3210 conversion. */
3212 void
3213 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3215 gcc_assert (DF_REF_REG_DEF_P (def));
3217 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3218 return;
3220 if (dump_file)
3221 fprintf (dump_file,
3222 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3223 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3225 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3228 /* For TImode conversion, it is unused. */
3230 void
3231 timode_scalar_chain::mark_dual_mode_def (df_ref)
3233 gcc_unreachable ();
3236 /* Check REF's chain to add new insns into a queue
3237 and find registers requiring conversion. */
3239 void
3240 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3242 df_link *chain;
3244 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3245 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3246 add_to_queue (DF_REF_INSN_UID (ref));
3248 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3250 unsigned uid = DF_REF_INSN_UID (chain->ref);
3252 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3253 continue;
3255 if (!DF_REF_REG_MEM_P (chain->ref))
3257 if (bitmap_bit_p (insns, uid))
3258 continue;
3260 if (bitmap_bit_p (candidates, uid))
3262 add_to_queue (uid);
3263 continue;
3267 if (DF_REF_REG_DEF_P (chain->ref))
3269 if (dump_file)
3270 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3271 DF_REF_REGNO (chain->ref), uid);
3272 mark_dual_mode_def (chain->ref);
3274 else
3276 if (dump_file)
3277 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3278 DF_REF_REGNO (chain->ref), uid);
3279 mark_dual_mode_def (ref);
3284 /* Add instruction into a chain. */
3286 void
3287 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3289 if (bitmap_bit_p (insns, insn_uid))
3290 return;
3292 if (dump_file)
3293 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3295 bitmap_set_bit (insns, insn_uid);
3297 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3298 rtx def_set = single_set (insn);
3299 if (def_set && REG_P (SET_DEST (def_set))
3300 && !HARD_REGISTER_P (SET_DEST (def_set)))
3301 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3303 df_ref ref;
3304 df_ref def;
3305 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3306 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3307 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3308 def;
3309 def = DF_REF_NEXT_REG (def))
3310 analyze_register_chain (candidates, def);
3311 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3312 if (!DF_REF_REG_MEM_P (ref))
3313 analyze_register_chain (candidates, ref);
3316 /* Build new chain starting from insn INSN_UID recursively
3317 adding all dependent uses and definitions. */
3319 void
3320 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3322 queue = BITMAP_ALLOC (NULL);
3323 bitmap_set_bit (queue, insn_uid);
3325 if (dump_file)
3326 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3328 while (!bitmap_empty_p (queue))
3330 insn_uid = bitmap_first_set_bit (queue);
3331 bitmap_clear_bit (queue, insn_uid);
3332 bitmap_clear_bit (candidates, insn_uid);
3333 add_insn (candidates, insn_uid);
3336 if (dump_file)
3338 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3339 fprintf (dump_file, " insns: ");
3340 dump_bitmap (dump_file, insns);
3341 if (!bitmap_empty_p (defs_conv))
3343 bitmap_iterator bi;
3344 unsigned id;
3345 const char *comma = "";
3346 fprintf (dump_file, " defs to convert: ");
3347 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3349 fprintf (dump_file, "%sr%d", comma, id);
3350 comma = ", ";
3352 fprintf (dump_file, "\n");
3356 BITMAP_FREE (queue);
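/* The loop above is a standard worklist algorithm: each insn taken off
   QUEUE is removed from CANDIDATES and added to the chain, and add_insn
   then queues every insn reachable through the def-use chains of its
   registers, so the chain closes over all connected candidate insns.  */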
3359 /* Return the cost of building a vector constant
3360 instead of using a scalar one. */
3363 dimode_scalar_chain::vector_const_cost (rtx exp)
3365 gcc_assert (CONST_INT_P (exp));
3367 if (standard_sse_constant_p (exp, V2DImode))
3368 return COSTS_N_INSNS (1);
3369 return ix86_cost->sse_load[1];
3372 /* Compute a gain for chain conversion. */
3375 dimode_scalar_chain::compute_convert_gain ()
3377 bitmap_iterator bi;
3378 unsigned insn_uid;
3379 int gain = 0;
3380 int cost = 0;
3382 if (dump_file)
3383 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3385 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3387 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3388 rtx def_set = single_set (insn);
3389 rtx src = SET_SRC (def_set);
3390 rtx dst = SET_DEST (def_set);
3392 if (REG_P (src) && REG_P (dst))
3393 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3394 else if (REG_P (src) && MEM_P (dst))
3395 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3396 else if (MEM_P (src) && REG_P (dst))
3397 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3398 else if (GET_CODE (src) == ASHIFT
3399 || GET_CODE (src) == LSHIFTRT)
3401 gain += ix86_cost->add;
3402 if (CONST_INT_P (XEXP (src, 0)))
3403 gain -= vector_const_cost (XEXP (src, 0));
3405 else if (GET_CODE (src) == PLUS
3406 || GET_CODE (src) == MINUS
3407 || GET_CODE (src) == IOR
3408 || GET_CODE (src) == XOR
3409 || GET_CODE (src) == AND)
3411 gain += ix86_cost->add;
3412 if (CONST_INT_P (XEXP (src, 0)))
3413 gain -= vector_const_cost (XEXP (src, 0));
3414 if (CONST_INT_P (XEXP (src, 1)))
3415 gain -= vector_const_cost (XEXP (src, 1));
3417 else if (GET_CODE (src) == COMPARE)
3419 /* Assume comparison cost is the same. */
3421 else if (GET_CODE (src) == CONST_INT)
3423 if (REG_P (dst))
3424 gain += COSTS_N_INSNS (2);
3425 else if (MEM_P (dst))
3426 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3427 gain -= vector_const_cost (src);
3429 else
3430 gcc_unreachable ();
3433 if (dump_file)
3434 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3436 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3437 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3439 if (dump_file)
3440 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3442 gain -= cost;
3444 if (dump_file)
3445 fprintf (dump_file, " Total gain: %d\n", gain);
3447 return gain;
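/* Roughly: each converted arithmetic insn contributes ix86_cost->add to
   the gain, moves and loads/stores are costed against the integer vs.
   SSE move tables, and every register that must live in both modes
   costs DF_REG_DEF_COUNT * ix86_cost->mmxsse_to_integer.  The chain is
   converted only when the resulting total is positive.  */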
3450 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3453 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3455 if (x == reg)
3456 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3458 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3459 int i, j;
3460 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3462 if (fmt[i] == 'e')
3463 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3464 else if (fmt[i] == 'E')
3465 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3466 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3467 reg, new_reg);
3470 return x;
3473 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3475 void
3476 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3477 rtx reg, rtx new_reg)
3479 replace_with_subreg (single_set (insn), reg, new_reg);
3482 /* Insert generated conversion instruction sequence INSNS
3483 after instruction AFTER.  A new BB may be required if the
3484 instruction has an EH region attached. */
3486 void
3487 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3489 if (!control_flow_insn_p (after))
3491 emit_insn_after (insns, after);
3492 return;
3495 basic_block bb = BLOCK_FOR_INSN (after);
3496 edge e = find_fallthru_edge (bb->succs);
3497 gcc_assert (e);
3499 basic_block new_bb = split_edge (e);
3500 emit_insn_after (insns, BB_HEAD (new_bb));
3503 /* Make vector copies for all definitions of register REGNO
3504 and replace its uses in the chain. */
3506 void
3507 dimode_scalar_chain::make_vector_copies (unsigned regno)
3509 rtx reg = regno_reg_rtx[regno];
3510 rtx vreg = gen_reg_rtx (DImode);
3511 df_ref ref;
3513 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3514 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3516 rtx_insn *insn = DF_REF_INSN (ref);
3518 start_sequence ();
3519 if (TARGET_SSE4_1)
3521 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3522 CONST0_RTX (V4SImode),
3523 gen_rtx_SUBREG (SImode, reg, 0)));
3524 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3525 gen_rtx_SUBREG (V4SImode, vreg, 0),
3526 gen_rtx_SUBREG (SImode, reg, 4),
3527 GEN_INT (2)));
3529 else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
3531 rtx tmp = gen_reg_rtx (DImode);
3532 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3533 CONST0_RTX (V4SImode),
3534 gen_rtx_SUBREG (SImode, reg, 0)));
3535 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3536 CONST0_RTX (V4SImode),
3537 gen_rtx_SUBREG (SImode, reg, 4)));
3538 emit_insn (gen_vec_interleave_lowv4si
3539 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3540 gen_rtx_SUBREG (V4SImode, vreg, 0),
3541 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3543 else
3545 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3546 emit_move_insn (adjust_address (tmp, SImode, 0),
3547 gen_rtx_SUBREG (SImode, reg, 0));
3548 emit_move_insn (adjust_address (tmp, SImode, 4),
3549 gen_rtx_SUBREG (SImode, reg, 4));
3550 emit_move_insn (vreg, tmp);
3552 rtx_insn *seq = get_insns ();
3553 end_sequence ();
3554 emit_conversion_insns (seq, insn);
3556 if (dump_file)
3557 fprintf (dump_file,
3558 " Copied r%d to a vector register r%d for insn %d\n",
3559 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3562 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3563 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3565 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
3567 if (dump_file)
3568 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3569 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3573 /* Convert all definitions of register REGNO
3574 and fix its uses.  Scalar copies may be created
3575 if the register is used in a non-convertible insn. */
3577 void
3578 dimode_scalar_chain::convert_reg (unsigned regno)
3580 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3581 rtx reg = regno_reg_rtx[regno];
3582 rtx scopy = NULL_RTX;
3583 df_ref ref;
3584 bitmap conv;
3586 conv = BITMAP_ALLOC (NULL);
3587 bitmap_copy (conv, insns);
3589 if (scalar_copy)
3590 scopy = gen_reg_rtx (DImode);
3592 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3594 rtx_insn *insn = DF_REF_INSN (ref);
3595 rtx def_set = single_set (insn);
3596 rtx src = SET_SRC (def_set);
3597 rtx reg = DF_REF_REG (ref);
3599 if (!MEM_P (src))
3601 replace_with_subreg_in_insn (insn, reg, reg);
3602 bitmap_clear_bit (conv, INSN_UID (insn));
3605 if (scalar_copy)
3607 rtx vcopy = gen_reg_rtx (V2DImode);
3609 start_sequence ();
3610 if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
3612 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3613 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3614 gen_rtx_SUBREG (SImode, vcopy, 0));
3615 emit_move_insn (vcopy,
3616 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3617 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3618 gen_rtx_SUBREG (SImode, vcopy, 0));
3620 else
3622 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3623 emit_move_insn (tmp, reg);
3624 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3625 adjust_address (tmp, SImode, 0));
3626 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3627 adjust_address (tmp, SImode, 4));
3629 rtx_insn *seq = get_insns ();
3630 end_sequence ();
3631 emit_conversion_insns (seq, insn);
3633 if (dump_file)
3634 fprintf (dump_file,
3635 " Copied r%d to a scalar register r%d for insn %d\n",
3636 regno, REGNO (scopy), INSN_UID (insn));
3640 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3641 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3643 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3645 rtx def_set = single_set (DF_REF_INSN (ref));
3646 if (!MEM_P (SET_DEST (def_set))
3647 || !REG_P (SET_SRC (def_set)))
3648 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
3649 bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
3652 /* Skip debug insns and uninitialized uses. */
3653 else if (DF_REF_CHAIN (ref)
3654 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
3656 gcc_assert (scopy);
3657 replace_rtx (DF_REF_INSN (ref), reg, scopy);
3658 df_insn_rescan (DF_REF_INSN (ref));
3661 BITMAP_FREE (conv);
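/* Note the structure above: definitions of REGNO inside the chain are
   rewritten to V2DImode subregs first; uses inside the chain are then
   fixed up, while uses outside the chain are redirected to the scalar
   copy SCOPY emitted next to each definition when SCALAR_COPY is set.  */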
3664 /* Convert operand OP in INSN. We should handle
3665 memory operands and uninitialized registers.
3666 All other register uses are converted during
3667 register conversion. */
3669 void
3670 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
3672 *op = copy_rtx_if_shared (*op);
3674 if (GET_CODE (*op) == NOT)
3676 convert_op (&XEXP (*op, 0), insn);
3677 PUT_MODE (*op, V2DImode);
3679 else if (MEM_P (*op))
3681 rtx tmp = gen_reg_rtx (DImode);
3683 emit_insn_before (gen_move_insn (tmp, *op), insn);
3684 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
3686 if (dump_file)
3687 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
3688 INSN_UID (insn), REGNO (tmp));
3690 else if (REG_P (*op))
3692 /* We may not have converted this register use if the
3693 register has no definition.  Otherwise it should
3694 already have been converted in convert_reg. */
3695 df_ref ref;
3696 FOR_EACH_INSN_USE (ref, insn)
3697 if (DF_REF_REGNO (ref) == REGNO (*op))
3699 gcc_assert (!DF_REF_CHAIN (ref));
3700 break;
3702 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
3704 else if (CONST_INT_P (*op))
3706 rtx vec_cst;
3707 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
3709 /* Prefer all ones vector in case of -1. */
3710 if (constm1_operand (*op, GET_MODE (*op)))
3711 vec_cst = CONSTM1_RTX (V2DImode);
3712 else
3713 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
3714 gen_rtvec (2, *op, const0_rtx));
3716 if (!standard_sse_constant_p (vec_cst, V2DImode))
3718 start_sequence ();
3719 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
3720 rtx_insn *seq = get_insns ();
3721 end_sequence ();
3722 emit_insn_before (seq, insn);
3725 emit_insn_before (gen_move_insn (tmp, vec_cst), insn);
3726 *op = tmp;
3728 else
3730 gcc_assert (SUBREG_P (*op));
3731 gcc_assert (GET_MODE (*op) == V2DImode);
3735 /* Convert INSN to vector mode. */
3737 void
3738 dimode_scalar_chain::convert_insn (rtx_insn *insn)
3740 rtx def_set = single_set (insn);
3741 rtx src = SET_SRC (def_set);
3742 rtx dst = SET_DEST (def_set);
3743 rtx subreg;
3745 if (MEM_P (dst) && !REG_P (src))
3747 /* Unlike scalar integer insns, vector insns cannot operate directly
3748 on a memory destination, so a temporary register is required. */
3749 rtx tmp = gen_reg_rtx (DImode);
3750 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
3751 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
3754 switch (GET_CODE (src))
3756 case ASHIFT:
3757 case LSHIFTRT:
3758 convert_op (&XEXP (src, 0), insn);
3759 PUT_MODE (src, V2DImode);
3760 break;
3762 case PLUS:
3763 case MINUS:
3764 case IOR:
3765 case XOR:
3766 case AND:
3767 convert_op (&XEXP (src, 0), insn);
3768 convert_op (&XEXP (src, 1), insn);
3769 PUT_MODE (src, V2DImode);
3770 break;
3772 case MEM:
3773 if (!REG_P (dst))
3774 convert_op (&src, insn);
3775 break;
3777 case REG:
3778 if (!MEM_P (dst))
3779 convert_op (&src, insn);
3780 break;
3782 case SUBREG:
3783 gcc_assert (GET_MODE (src) == V2DImode);
3784 break;
3786 case COMPARE:
3787 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
3789 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
3790 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
3792 if (REG_P (src))
3793 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
3794 else
3795 subreg = copy_rtx_if_shared (src);
3796 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
3797 copy_rtx_if_shared (subreg),
3798 copy_rtx_if_shared (subreg)),
3799 insn);
3800 dst = gen_rtx_REG (CCmode, FLAGS_REG);
3801 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
3802 copy_rtx_if_shared (src)),
3803 UNSPEC_PTEST);
3804 break;
3806 case CONST_INT:
3807 convert_op (&src, insn);
3808 break;
3810 default:
3811 gcc_unreachable ();
3814 SET_SRC (def_set) = src;
3815 SET_DEST (def_set) = dst;
3817 /* Drop possible dead definitions. */
3818 PATTERN (insn) = def_set;
3820 INSN_CODE (insn) = -1;
3821 recog_memoized (insn);
3822 df_insn_rescan (insn);
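/* The net effect is that, roughly,

     (set (reg:DI 100) (plus:DI (reg:DI 100) (reg:DI 101)))

   is rewritten into

     (set (subreg:V2DI (reg:DI 100) 0)
          (plus:V2DI (subreg:V2DI (reg:DI 100) 0)
                     (subreg:V2DI (reg:DI 101) 0)))

   so a single SSE instruction performs the 64-bit operation on a
   32-bit target.  */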
3825 /* Fix uses of converted REG in debug insns. */
3827 void
3828 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
3830 if (!flag_var_tracking)
3831 return;
3833 df_ref ref;
3834 for (ref = DF_REG_USE_CHAIN (REGNO (reg));
3835 ref;
3836 ref = DF_REF_NEXT_REG (ref))
3838 rtx_insn *insn = DF_REF_INSN (ref);
3839 if (DEBUG_INSN_P (insn))
3841 /* It may be a debug insn with a TImode variable in
3842 register. */
3843 rtx val = PATTERN (insn);
3844 if (GET_MODE (val) != TImode)
3845 continue;
3846 gcc_assert (GET_CODE (val) == VAR_LOCATION);
3847 rtx loc = PAT_VAR_LOCATION_LOC (val);
3848 /* It may have been converted to TImode already. */
3849 if (GET_MODE (loc) == TImode)
3850 continue;
3851 gcc_assert (REG_P (loc)
3852 && GET_MODE (loc) == V1TImode);
3853 /* Convert V1TImode register, which has been updated by a SET
3854 insn before, to SUBREG TImode. */
3855 PAT_VAR_LOCATION_LOC (val) = gen_rtx_SUBREG (TImode, loc, 0);
3856 df_insn_rescan (insn);
3861 /* Convert INSN from TImode to V1TImode. */
3863 void
3864 timode_scalar_chain::convert_insn (rtx_insn *insn)
3866 rtx def_set = single_set (insn);
3867 rtx src = SET_SRC (def_set);
3868 rtx dst = SET_DEST (def_set);
3870 switch (GET_CODE (dst))
3872 case REG:
3874 rtx tmp = find_reg_equal_equiv_note (insn);
3875 if (tmp)
3876 PUT_MODE (XEXP (tmp, 0), V1TImode);
3877 PUT_MODE (dst, V1TImode);
3878 fix_debug_reg_uses (dst);
3880 break;
3881 case MEM:
3882 PUT_MODE (dst, V1TImode);
3883 break;
3885 default:
3886 gcc_unreachable ();
3889 switch (GET_CODE (src))
3891 case REG:
3892 PUT_MODE (src, V1TImode);
3893 /* Call fix_debug_reg_uses only if SRC is never defined. */
3894 if (!DF_REG_DEF_CHAIN (REGNO (src)))
3895 fix_debug_reg_uses (src);
3896 break;
3898 case MEM:
3899 PUT_MODE (src, V1TImode);
3900 break;
3902 case CONST_WIDE_INT:
3903 if (NONDEBUG_INSN_P (insn))
3905 /* Since there are no instructions to store a 128-bit constant
3906 directly, a temporary register is required. */
3907 rtx tmp = gen_reg_rtx (V1TImode);
3908 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
3909 src = validize_mem (force_const_mem (V1TImode, src));
3910 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3911 dst = tmp;
3913 break;
3915 case CONST_INT:
3916 switch (standard_sse_constant_p (src, TImode))
3918 case 1:
3919 src = CONST0_RTX (GET_MODE (dst));
3920 break;
3921 case 2:
3922 src = CONSTM1_RTX (GET_MODE (dst));
3923 break;
3924 default:
3925 gcc_unreachable ();
3927 if (NONDEBUG_INSN_P (insn))
3929 rtx tmp = gen_reg_rtx (V1TImode);
3930 /* Since there are no instructions to store a standard SSE
3931 constant directly, a temporary register is required. */
3932 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3933 dst = tmp;
3935 break;
3937 default:
3938 gcc_unreachable ();
3941 SET_SRC (def_set) = src;
3942 SET_DEST (def_set) = dst;
3944 /* Drop possible dead definitions. */
3945 PATTERN (insn) = def_set;
3947 INSN_CODE (insn) = -1;
3948 recog_memoized (insn);
3949 df_insn_rescan (insn);
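/* In effect a TImode (__int128) move such as

     (set (reg:TI 100) (mem:TI ...))

   becomes a V1TImode move, so a single 128-bit SSE load or store can be
   used instead of what would typically be a pair of 64-bit integer
   moves.  */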
3952 void
3953 dimode_scalar_chain::convert_registers ()
3955 bitmap_iterator bi;
3956 unsigned id;
3958 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
3959 convert_reg (id);
3961 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
3962 make_vector_copies (id);
3965 /* Convert the whole chain, creating the required register
3966 conversions and copies. */
3969 scalar_chain::convert ()
3971 bitmap_iterator bi;
3972 unsigned id;
3973 int converted_insns = 0;
3975 if (!dbg_cnt (stv_conversion))
3976 return 0;
3978 if (dump_file)
3979 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
3981 convert_registers ();
3983 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
3985 convert_insn (DF_INSN_UID_GET (id)->insn);
3986 converted_insns++;
3989 return converted_insns;
3992 /* Main STV pass function. Find and convert scalar
3993 instructions into vector mode when profitable. */
3995 static unsigned int
3996 convert_scalars_to_vector ()
3998 basic_block bb;
3999 bitmap candidates;
4000 int converted_insns = 0;
4002 bitmap_obstack_initialize (NULL);
4003 candidates = BITMAP_ALLOC (NULL);
4005 calculate_dominance_info (CDI_DOMINATORS);
4006 df_set_flags (DF_DEFER_INSN_RESCAN);
4007 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4008 df_md_add_problem ();
4009 df_analyze ();
4011 /* Find all instructions we want to convert into vector mode. */
4012 if (dump_file)
4013 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4015 FOR_EACH_BB_FN (bb, cfun)
4017 rtx_insn *insn;
4018 FOR_BB_INSNS (bb, insn)
4019 if (scalar_to_vector_candidate_p (insn))
4021 if (dump_file)
4022 fprintf (dump_file, " insn %d is marked as a candidate\n",
4023 INSN_UID (insn));
4025 bitmap_set_bit (candidates, INSN_UID (insn));
4029 remove_non_convertible_regs (candidates);
4031 if (bitmap_empty_p (candidates))
4032 if (dump_file)
4033 fprintf (dump_file, "There are no candidates for optimization.\n");
4035 while (!bitmap_empty_p (candidates))
4037 unsigned uid = bitmap_first_set_bit (candidates);
4038 scalar_chain *chain;
4040 if (TARGET_64BIT)
4041 chain = new timode_scalar_chain;
4042 else
4043 chain = new dimode_scalar_chain;
4045 /* Find the instruction chain we want to convert to vector mode.
4046 Check all uses and definitions to estimate all required
4047 conversions. */
4048 chain->build (candidates, uid);
4050 if (chain->compute_convert_gain () > 0)
4051 converted_insns += chain->convert ();
4052 else
4053 if (dump_file)
4054 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4055 chain->chain_id);
4057 delete chain;
4060 if (dump_file)
4061 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4063 BITMAP_FREE (candidates);
4064 bitmap_obstack_release (NULL);
4065 df_process_deferred_rescans ();
4067 /* Conversion may introduce 128-bit register spills/fills,
4068 which require an aligned stack. */
4069 if (converted_insns)
4071 if (crtl->stack_alignment_needed < 128)
4072 crtl->stack_alignment_needed = 128;
4073 if (crtl->stack_alignment_estimated < 128)
4074 crtl->stack_alignment_estimated = 128;
4077 return 0;
4080 namespace {
4082 const pass_data pass_data_insert_vzeroupper =
4084 RTL_PASS, /* type */
4085 "vzeroupper", /* name */
4086 OPTGROUP_NONE, /* optinfo_flags */
4087 TV_MACH_DEP, /* tv_id */
4088 0, /* properties_required */
4089 0, /* properties_provided */
4090 0, /* properties_destroyed */
4091 0, /* todo_flags_start */
4092 TODO_df_finish, /* todo_flags_finish */
4095 class pass_insert_vzeroupper : public rtl_opt_pass
4097 public:
4098 pass_insert_vzeroupper(gcc::context *ctxt)
4099 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4102 /* opt_pass methods: */
4103 virtual bool gate (function *)
4105 return TARGET_AVX && !TARGET_AVX512F
4106 && TARGET_VZEROUPPER && flag_expensive_optimizations
4107 && !optimize_size;
4110 virtual unsigned int execute (function *)
4112 return rest_of_handle_insert_vzeroupper ();
4115 }; // class pass_insert_vzeroupper
4117 const pass_data pass_data_stv =
4119 RTL_PASS, /* type */
4120 "stv", /* name */
4121 OPTGROUP_NONE, /* optinfo_flags */
4122 TV_MACH_DEP, /* tv_id */
4123 0, /* properties_required */
4124 0, /* properties_provided */
4125 0, /* properties_destroyed */
4126 0, /* todo_flags_start */
4127 TODO_df_finish, /* todo_flags_finish */
4130 class pass_stv : public rtl_opt_pass
4132 public:
4133 pass_stv (gcc::context *ctxt)
4134 : rtl_opt_pass (pass_data_stv, ctxt),
4135 timode_p (false)
4138 /* opt_pass methods: */
4139 virtual bool gate (function *)
4141 return (timode_p == !!TARGET_64BIT
4142 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4145 virtual unsigned int execute (function *)
4147 return convert_scalars_to_vector ();
4150 opt_pass *clone ()
4152 return new pass_stv (m_ctxt);
4155 void set_pass_param (unsigned int n, bool param)
4157 gcc_assert (n == 0);
4158 timode_p = param;
4161 private:
4162 bool timode_p;
4163 }; // class pass_stv
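/* The pass is parameterized by TIMODE_P (via set_pass_param): when set
   it gates only for TARGET_64BIT and converts TImode chains, when clear
   it gates only for 32-bit code and converts DImode chains (see
   scalar_to_vector_candidate_p and convert_scalars_to_vector).  */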
4165 } // anon namespace
4167 rtl_opt_pass *
4168 make_pass_insert_vzeroupper (gcc::context *ctxt)
4170 return new pass_insert_vzeroupper (ctxt);
4173 rtl_opt_pass *
4174 make_pass_stv (gcc::context *ctxt)
4176 return new pass_stv (ctxt);
4179 /* Return true if a red-zone is in use. */
4181 bool
4182 ix86_using_red_zone (void)
4184 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4187 /* Return a string that documents the current -m options. The caller is
4188 responsible for freeing the string. */
4190 static char *
4191 ix86_target_string (HOST_WIDE_INT isa, int flags, int ix86_flags,
4192 const char *arch, const char *tune,
4193 enum fpmath_unit fpmath, bool add_nl_p)
4195 struct ix86_target_opts
4197 const char *option; /* option string */
4198 HOST_WIDE_INT mask; /* isa mask options */
4201 /* This table is ordered so that options like -msse4.2, which imply
4202 other options, are matched first. */
4203 static struct ix86_target_opts isa_opts[] =
4205 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4206 { "-mfma", OPTION_MASK_ISA_FMA },
4207 { "-mxop", OPTION_MASK_ISA_XOP },
4208 { "-mlwp", OPTION_MASK_ISA_LWP },
4209 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4210 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4211 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4212 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4213 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4214 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4215 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4216 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4217 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4218 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4219 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4220 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4221 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4222 { "-msse3", OPTION_MASK_ISA_SSE3 },
4223 { "-msse2", OPTION_MASK_ISA_SSE2 },
4224 { "-msse", OPTION_MASK_ISA_SSE },
4225 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4226 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4227 { "-mmmx", OPTION_MASK_ISA_MMX },
4228 { "-mabm", OPTION_MASK_ISA_ABM },
4229 { "-mbmi", OPTION_MASK_ISA_BMI },
4230 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4231 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4232 { "-mhle", OPTION_MASK_ISA_HLE },
4233 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4234 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4235 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4236 { "-madx", OPTION_MASK_ISA_ADX },
4237 { "-mtbm", OPTION_MASK_ISA_TBM },
4238 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4239 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4240 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4241 { "-maes", OPTION_MASK_ISA_AES },
4242 { "-msha", OPTION_MASK_ISA_SHA },
4243 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4244 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4245 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4246 { "-mf16c", OPTION_MASK_ISA_F16C },
4247 { "-mrtm", OPTION_MASK_ISA_RTM },
4248 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4249 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4250 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4251 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4252 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4253 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4254 { "-mmpx", OPTION_MASK_ISA_MPX },
4255 { "-mclwb", OPTION_MASK_ISA_CLWB },
4256 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4257 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4258 { "-mpku", OPTION_MASK_ISA_PKU },
4261 /* Flag options. */
4262 static struct ix86_target_opts flag_opts[] =
4264 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4265 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4266 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4267 { "-m80387", MASK_80387 },
4268 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4269 { "-malign-double", MASK_ALIGN_DOUBLE },
4270 { "-mcld", MASK_CLD },
4271 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4272 { "-mieee-fp", MASK_IEEE_FP },
4273 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4274 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4275 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4276 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4277 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4278 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4279 { "-mno-red-zone", MASK_NO_RED_ZONE },
4280 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4281 { "-mrecip", MASK_RECIP },
4282 { "-mrtd", MASK_RTD },
4283 { "-msseregparm", MASK_SSEREGPARM },
4284 { "-mstack-arg-probe", MASK_STACK_PROBE },
4285 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4286 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4287 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4288 { "-mvzeroupper", MASK_VZEROUPPER },
4289 { "-mstv", MASK_STV},
4290 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
4291 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
4292 { "-mprefer-avx128", MASK_PREFER_AVX128},
4295 /* Additional flag options. */
4296 static struct ix86_target_opts ix86_flag_opts[] =
4298 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4301 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts)
4302 + ARRAY_SIZE (ix86_flag_opts) + 6][2];
4304 char isa_other[40];
4305 char target_other[40];
4306 char ix86_target_other[40];
4307 unsigned num = 0;
4308 unsigned i, j;
4309 char *ret;
4310 char *ptr;
4311 size_t len;
4312 size_t line_len;
4313 size_t sep_len;
4314 const char *abi;
4316 memset (opts, '\0', sizeof (opts));
4318 /* Add -march= option. */
4319 if (arch)
4321 opts[num][0] = "-march=";
4322 opts[num++][1] = arch;
4325 /* Add -mtune= option. */
4326 if (tune)
4328 opts[num][0] = "-mtune=";
4329 opts[num++][1] = tune;
4332 /* Add -m32/-m64/-mx32. */
4333 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4335 if ((isa & OPTION_MASK_ABI_64) != 0)
4336 abi = "-m64";
4337 else
4338 abi = "-mx32";
4339 isa &= ~ (OPTION_MASK_ISA_64BIT
4340 | OPTION_MASK_ABI_64
4341 | OPTION_MASK_ABI_X32);
4343 else
4344 abi = "-m32";
4345 opts[num++][0] = abi;
4347 /* Pick out the ISA options that are set. */
4348 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4350 if ((isa & isa_opts[i].mask) != 0)
4352 opts[num++][0] = isa_opts[i].option;
4353 isa &= ~ isa_opts[i].mask;
4357 if (isa && add_nl_p)
4359 opts[num++][0] = isa_other;
4360 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
4361 isa);
4364 /* Add flag options. */
4365 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4367 if ((flags & flag_opts[i].mask) != 0)
4369 opts[num++][0] = flag_opts[i].option;
4370 flags &= ~ flag_opts[i].mask;
4374 if (flags && add_nl_p)
4376 opts[num++][0] = target_other;
4377 sprintf (target_other, "(other flags: %#x)", flags);
4380 /* Add additional flag options. */
4381 for (i = 0; i < ARRAY_SIZE (ix86_flag_opts); i++)
4383 if ((ix86_flags & ix86_flag_opts[i].mask) != 0)
4385 opts[num++][0] = ix86_flag_opts[i].option;
4386 ix86_flags &= ~ ix86_flag_opts[i].mask;
4390 if (ix86_flags && add_nl_p)
4392 opts[num++][0] = ix86_target_other;
4393 sprintf (ix86_target_other, "(other flags: %#x)", ix86_flags);
4396 /* Add -fpmath= option. */
4397 if (fpmath)
4399 opts[num][0] = "-mfpmath=";
4400 switch ((int) fpmath)
4402 case FPMATH_387:
4403 opts[num++][1] = "387";
4404 break;
4406 case FPMATH_SSE:
4407 opts[num++][1] = "sse";
4408 break;
4410 case FPMATH_387 | FPMATH_SSE:
4411 opts[num++][1] = "sse+387";
4412 break;
4414 default:
4415 gcc_unreachable ();
4419 /* Any options? */
4420 if (num == 0)
4421 return NULL;
4423 gcc_assert (num < ARRAY_SIZE (opts));
4425 /* Size the string. */
4426 len = 0;
4427 sep_len = (add_nl_p) ? 3 : 1;
4428 for (i = 0; i < num; i++)
4430 len += sep_len;
4431 for (j = 0; j < 2; j++)
4432 if (opts[i][j])
4433 len += strlen (opts[i][j]);
4436 /* Build the string. */
4437 ret = ptr = (char *) xmalloc (len);
4438 line_len = 0;
4440 for (i = 0; i < num; i++)
4442 size_t len2[2];
4444 for (j = 0; j < 2; j++)
4445 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4447 if (i != 0)
4449 *ptr++ = ' ';
4450 line_len++;
4452 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4454 *ptr++ = '\\';
4455 *ptr++ = '\n';
4456 line_len = 0;
4460 for (j = 0; j < 2; j++)
4461 if (opts[i][j])
4463 memcpy (ptr, opts[i][j], len2[j]);
4464 ptr += len2[j];
4465 line_len += len2[j];
4469 *ptr = '\0';
4470 gcc_assert (ret + len >= ptr);
4472 return ret;
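/* A typical result (illustrative values) might look like

     "-march=haswell -mtune=generic -m64 -msse4.2 ... -mfpmath=sse"

   i.e. the -march=/-mtune= strings, the ABI switch, the decoded ISA and
   flag options, and finally the -mfpmath= setting, separated by spaces
   and broken across lines when ADD_NL_P.  */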
4475 /* Return true if profiling code should be emitted before the
4476 prologue, otherwise false.  On x86 this is the case only
4477 when -mfentry is in effect. */
4478 static bool
4479 ix86_profile_before_prologue (void)
4481 return flag_fentry != 0;
4484 /* Function that is callable from the debugger to print the current
4485 options. */
4486 void ATTRIBUTE_UNUSED
4487 ix86_debug_options (void)
4489 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
4490 ix86_target_flags,
4491 ix86_arch_string, ix86_tune_string,
4492 ix86_fpmath, true);
4494 if (opts)
4496 fprintf (stderr, "%s\n\n", opts);
4497 free (opts);
4499 else
4500 fputs ("<no options>\n\n", stderr);
4502 return;
4505 /* Return true if T is one of the bytes we should avoid with
4506 -fmitigate-rop. */
4508 static bool
4509 ix86_rop_should_change_byte_p (int t)
4511 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
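/* 0xc3/0xc2 and 0xcb/0xca encode the near and far RET instructions
   (with and without an immediate pop count), the bytes ROP gadgets
   typically end with, which is why -fmitigate-rop tries to avoid
   emitting them.  */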
4514 static const char *stringop_alg_names[] = {
4515 #define DEF_ENUM
4516 #define DEF_ALG(alg, name) #name,
4517 #include "stringop.def"
4518 #undef DEF_ENUM
4519 #undef DEF_ALG
4522 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4523 The string is of the following form (or comma separated list of it):
4525 strategy_alg:max_size:[align|noalign]
4527 where the full size range for the strategy is either [0, max_size] or
4528 [min_size, max_size], in which min_size is the max_size + 1 of the
4529 preceding range. The last size range must have max_size == -1.
4531 Examples:
4534 -mmemcpy-strategy=libcall:-1:noalign
4536 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
4540 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
4542 This is to tell the compiler to use the following strategy for memset
4543 1) when the expected size is between [1, 16], use rep_8byte strategy;
4544 2) when the size is between [17, 2048], use vector_loop;
4545 3) when the size is > 2048, use libcall. */
4547 struct stringop_size_range
4549 int max;
4550 stringop_alg alg;
4551 bool noalign;
4554 static void
4555 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
4557 const struct stringop_algs *default_algs;
4558 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
4559 char *curr_range_str, *next_range_str;
4560 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
4561 int i = 0, n = 0;
4563 if (is_memset)
4564 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
4565 else
4566 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
4568 curr_range_str = strategy_str;
4572 int maxs;
4573 char alg_name[128];
4574 char align[16];
4575 next_range_str = strchr (curr_range_str, ',');
4576 if (next_range_str)
4577 *next_range_str++ = '\0';
4579 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
4580 alg_name, &maxs, align))
4582 error ("wrong argument %qs to option %qs", curr_range_str, opt);
4583 return;
4586 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
4588 error ("size ranges of option %qs should be increasing", opt);
4589 return;
4592 for (i = 0; i < last_alg; i++)
4593 if (!strcmp (alg_name, stringop_alg_names[i]))
4594 break;
4596 if (i == last_alg)
4598 error ("wrong strategy name %qs specified for option %qs",
4599 alg_name, opt);
4601 auto_vec <const char *> candidates;
4602 for (i = 0; i < last_alg; i++)
4603 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
4604 candidates.safe_push (stringop_alg_names[i]);
4606 char *s;
4607 const char *hint
4608 = candidates_list_and_hint (alg_name, s, candidates);
4609 if (hint)
4610 inform (input_location,
4611 "valid arguments to %qs are: %s; did you mean %qs?",
4612 opt, s, hint);
4613 else
4614 inform (input_location, "valid arguments to %qs are: %s",
4615 opt, s);
4616 XDELETEVEC (s);
4617 return;
4620 if ((stringop_alg) i == rep_prefix_8_byte
4621 && !TARGET_64BIT)
4623 /* rep; movq isn't available in 32-bit code. */
4624 error ("strategy name %qs specified for option %qs "
4625 "not supported for 32-bit code", alg_name, opt);
4626 return;
4629 input_ranges[n].max = maxs;
4630 input_ranges[n].alg = (stringop_alg) i;
4631 if (!strcmp (align, "align"))
4632 input_ranges[n].noalign = false;
4633 else if (!strcmp (align, "noalign"))
4634 input_ranges[n].noalign = true;
4635 else
4637 error ("unknown alignment %qs specified for option %qs", align, opt);
4638 return;
4640 n++;
4641 curr_range_str = next_range_str;
4643 while (curr_range_str);
4645 if (input_ranges[n - 1].max != -1)
4647 error ("the max value for the last size range should be -1"
4648 " for option %qs", opt);
4649 return;
4652 if (n > MAX_STRINGOP_ALGS)
4654 error ("too many size ranges specified in option %qs", opt);
4655 return;
4658 /* Now override the default algs array. */
4659 for (i = 0; i < n; i++)
4661 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
4662 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
4663 = input_ranges[i].alg;
4664 *const_cast<int *>(&default_algs->size[i].noalign)
4665 = input_ranges[i].noalign;
4670 /* Parse the -mtune-ctrl= option.  When DUMP is true,
4671 print the features that are explicitly set. */
4673 static void
4674 parse_mtune_ctrl_str (bool dump)
4676 if (!ix86_tune_ctrl_string)
4677 return;
4679 char *next_feature_string = NULL;
4680 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
4681 char *orig = curr_feature_string;
4682 int i;
4685 bool clear = false;
4687 next_feature_string = strchr (curr_feature_string, ',');
4688 if (next_feature_string)
4689 *next_feature_string++ = '\0';
4690 if (*curr_feature_string == '^')
4692 curr_feature_string++;
4693 clear = true;
4695 for (i = 0; i < X86_TUNE_LAST; i++)
4697 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
4699 ix86_tune_features[i] = !clear;
4700 if (dump)
4701 fprintf (stderr, "Explicitly %s feature %s\n",
4702 clear ? "clear" : "set", ix86_tune_feature_names[i]);
4703 break;
4706 if (i == X86_TUNE_LAST)
4707 error ("Unknown parameter to option -mtune-ctrl: %s",
4708 clear ? curr_feature_string - 1 : curr_feature_string);
4709 curr_feature_string = next_feature_string;
4711 while (curr_feature_string);
4712 free (orig);
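/* Usage sketch: the argument is a comma-separated list of feature names
   from ix86_tune_feature_names, each optionally prefixed with '^' to
   clear it, e.g. -mtune-ctrl=FEATURE1,^FEATURE2 (placeholder names)
   sets FEATURE1 and clears FEATURE2.  */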
4715 /* Helper function to set ix86_tune_features. IX86_TUNE is the
4716 processor type. */
4718 static void
4719 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
4721 unsigned int ix86_tune_mask = 1u << ix86_tune;
4722 int i;
4724 for (i = 0; i < X86_TUNE_LAST; ++i)
4726 if (ix86_tune_no_default)
4727 ix86_tune_features[i] = 0;
4728 else
4729 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4732 if (dump)
4734 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
4735 for (i = 0; i < X86_TUNE_LAST; i++)
4736 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
4737 ix86_tune_features[i] ? "on" : "off");
4740 parse_mtune_ctrl_str (dump);
4744 /* Default align_* from the processor table. */
4746 static void
4747 ix86_default_align (struct gcc_options *opts)
4749 if (opts->x_align_loops == 0)
4751 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
4752 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
4754 if (opts->x_align_jumps == 0)
4756 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
4757 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
4759 if (opts->x_align_functions == 0)
4761 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
4765 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
4767 static void
4768 ix86_override_options_after_change (void)
4770 ix86_default_align (&global_options);
4773 /* Override various settings based on options. If MAIN_ARGS_P, the
4774 options are from the command line, otherwise they are from
4775 attributes.  Return true if there's an error related to the
4776 -march option. */
4778 static bool
4779 ix86_option_override_internal (bool main_args_p,
4780 struct gcc_options *opts,
4781 struct gcc_options *opts_set)
4783 int i;
4784 unsigned int ix86_arch_mask;
4785 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
4787 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
4788 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
4789 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
4790 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
4791 #define PTA_AES (HOST_WIDE_INT_1 << 4)
4792 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
4793 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
4794 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
4795 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
4796 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
4797 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
4798 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
4799 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
4800 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
4801 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
4802 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
4803 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
4804 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
4805 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
4806 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
4807 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
4808 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
4809 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
4810 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
4811 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
4812 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
4813 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
4814 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
4815 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
4816 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
4817 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
4818 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
4819 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
4820 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
4821 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
4822 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
4823 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
4824 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
4825 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
4826 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
4827 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
4828 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
4829 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
4830 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
4831 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
4832 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
4833 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
4834 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
4835 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
4836 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
4837 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
4838 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
4839 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
4840 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
4841 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
4842 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
4843 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
4844 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
4845 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
4846 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
4848 #define PTA_CORE2 \
4849 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
4850 | PTA_CX16 | PTA_FXSR)
4851 #define PTA_NEHALEM \
4852 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
4853 #define PTA_WESTMERE \
4854 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
4855 #define PTA_SANDYBRIDGE \
4856 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
4857 #define PTA_IVYBRIDGE \
4858 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
4859 #define PTA_HASWELL \
4860 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
4861 | PTA_FMA | PTA_MOVBE | PTA_HLE)
4862 #define PTA_BROADWELL \
4863 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
4864 #define PTA_SKYLAKE \
4865 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
4866 #define PTA_SKYLAKE_AVX512 \
4867 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
4868 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
4869 #define PTA_KNL \
4870 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
4871 #define PTA_BONNELL \
4872 (PTA_CORE2 | PTA_MOVBE)
4873 #define PTA_SILVERMONT \
4874 (PTA_WESTMERE | PTA_MOVBE)
4876 /* If this reaches 64, the flags field of struct pta below needs to be widened. */
4878 static struct pta
4880 const char *const name; /* processor name or nickname. */
4881 const enum processor_type processor;
4882 const enum attr_cpu schedule;
4883 const unsigned HOST_WIDE_INT flags;
4885 const processor_alias_table[] =
4887 {"i386", PROCESSOR_I386, CPU_NONE, 0},
4888 {"i486", PROCESSOR_I486, CPU_NONE, 0},
4889 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4890 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4891 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
4892 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
4893 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
4894 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4895 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4896 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4897 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4898 PTA_MMX | PTA_SSE | PTA_FXSR},
4899 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4900 PTA_MMX | PTA_SSE | PTA_FXSR},
4901 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4902 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4903 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4904 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4905 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4906 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4907 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
4908 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4909 PTA_MMX | PTA_SSE | PTA_FXSR},
4910 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4911 PTA_MMX | PTA_SSE | PTA_FXSR},
4912 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4913 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4914 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
4915 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
4916 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
4917 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4918 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
4919 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4920 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
4921 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4922 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
4923 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
4924 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4925 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4926 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
4927 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4928 PTA_SANDYBRIDGE},
4929 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4930 PTA_SANDYBRIDGE},
4931 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4932 PTA_IVYBRIDGE},
4933 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4934 PTA_IVYBRIDGE},
4935 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4936 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4937 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
4938 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
4939 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
4940 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4941 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4942 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4943 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4944 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
4945 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
4946 {"geode", PROCESSOR_GEODE, CPU_GEODE,
4947 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4948 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
4949 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4950 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4951 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
4952 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4953 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
4954 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4955 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
4956 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4957 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
4958 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4959 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
4960 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4961 {"x86-64", PROCESSOR_K8, CPU_K8,
4962 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4963 {"eden-x2", PROCESSOR_K8, CPU_K8,
4964 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4965 {"nano", PROCESSOR_K8, CPU_K8,
4966 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4967 | PTA_SSSE3 | PTA_FXSR},
4968 {"nano-1000", PROCESSOR_K8, CPU_K8,
4969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4970 | PTA_SSSE3 | PTA_FXSR},
4971 {"nano-2000", PROCESSOR_K8, CPU_K8,
4972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4973 | PTA_SSSE3 | PTA_FXSR},
4974 {"nano-3000", PROCESSOR_K8, CPU_K8,
4975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4977 {"nano-x2", PROCESSOR_K8, CPU_K8,
4978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4980 {"eden-x4", PROCESSOR_K8, CPU_K8,
4981 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4982 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4983 {"nano-x4", PROCESSOR_K8, CPU_K8,
4984 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4985 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4986 {"k8", PROCESSOR_K8, CPU_K8,
4987 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4988 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4989 {"k8-sse3", PROCESSOR_K8, CPU_K8,
4990 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4991 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4992 {"opteron", PROCESSOR_K8, CPU_K8,
4993 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4994 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4995 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
4996 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4997 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4998 {"athlon64", PROCESSOR_K8, CPU_K8,
4999 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5000 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5001 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5002 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5003 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5004 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5005 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5006 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5007 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5008 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5009 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5010 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5011 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5012 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5013 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5014 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5015 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5016 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5017 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5018 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5019 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5020 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5021 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5022 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5023 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5024 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5025 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5026 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5027 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5028 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5029 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5030 | PTA_XSAVEOPT | PTA_FSGSBASE},
5031 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5032 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5033 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5034 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5035 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5036 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5037 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5038 | PTA_MOVBE | PTA_MWAITX},
5039 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5043 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5044 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5045 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5046 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5047 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5048 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5049 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5050 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
5051 | PTA_FXSR | PTA_XSAVE},
5052 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5053 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5054 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
5055 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5056 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5057 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5059 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5060 PTA_64BIT
5061 | PTA_HLE /* flags are only used for -march switch. */ },
5064 /* -mrecip options. */
5065 static struct
5067 const char *string; /* option name */
5068 unsigned int mask; /* mask bits to set */
5070 const recip_options[] =
5072 { "all", RECIP_MASK_ALL },
5073 { "none", RECIP_MASK_NONE },
5074 { "div", RECIP_MASK_DIV },
5075 { "sqrt", RECIP_MASK_SQRT },
5076 { "vec-div", RECIP_MASK_VEC_DIV },
5077 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5080 int const pta_size = ARRAY_SIZE (processor_alias_table);
5082 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5083 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5084 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5085 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5086 #ifdef TARGET_BI_ARCH
5087 else
5089 #if TARGET_BI_ARCH == 1
5090 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5091 is on and OPTION_MASK_ABI_X32 is off. We turn off
5092 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5093 -mx32. */
5094 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5095 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5096 #else
5097 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5098 on and OPTION_MASK_ABI_64 is off. We turn off
5099 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5100 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5101 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5102 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5103 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5104 #endif
5105 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5106 && TARGET_IAMCU_P (opts->x_target_flags))
5107 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5108 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5110 #endif
5112 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5114 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5115 OPTION_MASK_ABI_64 for TARGET_X32. */
5116 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5117 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5119 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5120 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5121 | OPTION_MASK_ABI_X32
5122 | OPTION_MASK_ABI_64);
5123 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5125 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5126 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5127 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5128 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
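/* For example, after the adjustments above -mx32 leaves OPTION_MASK_ISA_64BIT
set with OPTION_MASK_ABI_64 clear, -m64 leaves OPTION_MASK_ISA_64BIT set with
OPTION_MASK_ABI_X32 clear, and -m16 clears all three bits, so the ISA and ABI
masks stay mutually consistent from this point on. */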
5131 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5132 SUBTARGET_OVERRIDE_OPTIONS;
5133 #endif
5135 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5136 SUBSUBTARGET_OVERRIDE_OPTIONS;
5137 #endif
5139 /* -fPIC is the default for x86_64. */
5140 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5141 opts->x_flag_pic = 2;
5143 /* Need to check -mtune=generic first. */
5144 if (opts->x_ix86_tune_string)
5146 /* As special support for cross compilers we read -mtune=native
5147 as -mtune=generic. With native compilers we won't see the
5148 -mtune=native, as it was changed by the driver. */
5149 if (!strcmp (opts->x_ix86_tune_string, "native"))
5151 opts->x_ix86_tune_string = "generic";
5153 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5154 warning (OPT_Wdeprecated,
5155 main_args_p
5156 ? "%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5157 "or %<-mtune=generic%> instead as appropriate"
5158 : "%<target(\"tune=x86-64\")%> is deprecated; use "
5159 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%> "
5160 "instead as appropriate");
5162 else
5164 if (opts->x_ix86_arch_string)
5165 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5166 if (!opts->x_ix86_tune_string)
5168 opts->x_ix86_tune_string
5169 = processor_target_table[TARGET_CPU_DEFAULT].name;
5170 ix86_tune_defaulted = 1;
5173 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5174 or defaulted. We need to use a sensible tune option. */
5175 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5177 opts->x_ix86_tune_string = "generic";
5181 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5182 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5184 /* rep; movq isn't available in 32-bit code. */
5185 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5186 opts->x_ix86_stringop_alg = no_stringop;
5189 if (!opts->x_ix86_arch_string)
5190 opts->x_ix86_arch_string
5191 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5192 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5193 else
5194 ix86_arch_specified = 1;
5196 if (opts_set->x_ix86_pmode)
5198 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5199 && opts->x_ix86_pmode == PMODE_SI)
5200 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5201 && opts->x_ix86_pmode == PMODE_DI))
5202 error ("address mode %qs not supported in the %s bit mode",
5203 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5204 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5206 else
5207 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5208 ? PMODE_DI : PMODE_SI;
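/* Thus plain -m64 defaults Pmode to DImode, while -m32 and -mx32 default it
to SImode whenever the address mode was not set explicitly on the command
line (opts_set->x_ix86_pmode is false). */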
5210 if (!opts_set->x_ix86_abi)
5211 opts->x_ix86_abi = DEFAULT_ABI;
5213 /* For targets using ms ABI enable ms-extensions, if not
5214 explicitly turned off. For non-ms ABI we turn off this
5215 option. */
5216 if (!opts_set->x_flag_ms_extensions)
5217 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5219 if (opts_set->x_ix86_cmodel)
5221 switch (opts->x_ix86_cmodel)
5223 case CM_SMALL:
5224 case CM_SMALL_PIC:
5225 if (opts->x_flag_pic)
5226 opts->x_ix86_cmodel = CM_SMALL_PIC;
5227 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5228 error ("code model %qs not supported in the %s bit mode",
5229 "small", "32");
5230 break;
5232 case CM_MEDIUM:
5233 case CM_MEDIUM_PIC:
5234 if (opts->x_flag_pic)
5235 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5236 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5237 error ("code model %qs not supported in the %s bit mode",
5238 "medium", "32");
5239 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5240 error ("code model %qs not supported in x32 mode",
5241 "medium");
5242 break;
5244 case CM_LARGE:
5245 case CM_LARGE_PIC:
5246 if (opts->x_flag_pic)
5247 opts->x_ix86_cmodel = CM_LARGE_PIC;
5248 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5249 error ("code model %qs not supported in the %s bit mode",
5250 "large", "32");
5251 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5252 error ("code model %qs not supported in x32 mode",
5253 "large");
5254 break;
5256 case CM_32:
5257 if (opts->x_flag_pic)
5258 error ("code model %s does not support PIC mode", "32");
5259 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5260 error ("code model %qs not supported in the %s bit mode",
5261 "32", "64");
5262 break;
5264 case CM_KERNEL:
5265 if (opts->x_flag_pic)
5267 error ("code model %s does not support PIC mode", "kernel");
5268 opts->x_ix86_cmodel = CM_32;
5270 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5271 error ("code model %qs not supported in the %s bit mode",
5272 "kernel", "32");
5273 break;
5275 default:
5276 gcc_unreachable ();
5279 else
5281 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5282 use of rip-relative addressing. This eliminates fixups that
5283 would otherwise be needed if this object is to be placed in a
5284 DLL, and is essentially just as efficient as direct addressing. */
5285 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5286 && (TARGET_RDOS || TARGET_PECOFF))
5287 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5288 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5289 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5290 else
5291 opts->x_ix86_cmodel = CM_32;
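/* In other words, when no -mcmodel= was given: 64-bit RDOS/PE-COFF targets
get the medium PIC model (and PIC is forced on), other 64-bit targets get
CM_SMALL or CM_SMALL_PIC depending on -fpic, and 32-bit code always uses
CM_32. */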
5293 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5295 error ("-masm=intel not supported in this configuration");
5296 opts->x_ix86_asm_dialect = ASM_ATT;
5298 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5299 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5300 sorry ("%i-bit mode not compiled in",
5301 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5303 for (i = 0; i < pta_size; i++)
5304 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5306 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5308 error (main_args_p
5309 ? "%<generic%> CPU can be used only for %<-mtune=%> switch"
5310 : "%<generic%> CPU can be used only for "
5311 "%<target(\"tune=\")%> attribute");
5312 return false;
5314 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5316 error (main_args_p
5317 ? "%<intel%> CPU can be used only for %<-mtune=%> switch"
5318 : "%<intel%> CPU can be used only for "
5319 "%<target(\"tune=\")%> attribute");
5320 return false;
5323 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5324 && !(processor_alias_table[i].flags & PTA_64BIT))
5326 error ("CPU you selected does not support x86-64 "
5327 "instruction set");
5328 return false;
5331 ix86_schedule = processor_alias_table[i].schedule;
5332 ix86_arch = processor_alias_table[i].processor;
5333 /* Default cpu tuning to the architecture. */
5334 ix86_tune = ix86_arch;
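/* Each PTA_* capability flag of the selected -march CPU enables the
corresponding OPTION_MASK_ISA_* bit below, unless the user already set that
ISA flag explicitly on the command line (opts->x_ix86_isa_flags_explicit). */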
5336 if (processor_alias_table[i].flags & PTA_MMX
5337 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5338 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5339 if (processor_alias_table[i].flags & PTA_3DNOW
5340 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5341 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5342 if (processor_alias_table[i].flags & PTA_3DNOW_A
5343 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5344 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5345 if (processor_alias_table[i].flags & PTA_SSE
5346 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5347 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5348 if (processor_alias_table[i].flags & PTA_SSE2
5349 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5350 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5351 if (processor_alias_table[i].flags & PTA_SSE3
5352 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5353 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5354 if (processor_alias_table[i].flags & PTA_SSSE3
5355 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5356 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5357 if (processor_alias_table[i].flags & PTA_SSE4_1
5358 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5359 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5360 if (processor_alias_table[i].flags & PTA_SSE4_2
5361 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5362 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5363 if (processor_alias_table[i].flags & PTA_AVX
5364 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5365 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5366 if (processor_alias_table[i].flags & PTA_AVX2
5367 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5368 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5369 if (processor_alias_table[i].flags & PTA_FMA
5370 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5371 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5372 if (processor_alias_table[i].flags & PTA_SSE4A
5373 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5374 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5375 if (processor_alias_table[i].flags & PTA_FMA4
5376 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5377 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5378 if (processor_alias_table[i].flags & PTA_XOP
5379 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5380 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5381 if (processor_alias_table[i].flags & PTA_LWP
5382 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5383 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5384 if (processor_alias_table[i].flags & PTA_ABM
5385 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5386 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5387 if (processor_alias_table[i].flags & PTA_BMI
5388 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5389 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5390 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5391 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5392 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5393 if (processor_alias_table[i].flags & PTA_TBM
5394 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5395 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5396 if (processor_alias_table[i].flags & PTA_BMI2
5397 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5398 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5399 if (processor_alias_table[i].flags & PTA_CX16
5400 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5401 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5402 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5403 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5404 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5405 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5406 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5407 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5408 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5409 if (processor_alias_table[i].flags & PTA_MOVBE
5410 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5411 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5412 if (processor_alias_table[i].flags & PTA_AES
5413 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5414 ix86_isa_flags |= OPTION_MASK_ISA_AES;
5415 if (processor_alias_table[i].flags & PTA_SHA
5416 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5417 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5418 if (processor_alias_table[i].flags & PTA_PCLMUL
5419 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5420 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5421 if (processor_alias_table[i].flags & PTA_FSGSBASE
5422 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5423 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5424 if (processor_alias_table[i].flags & PTA_RDRND
5425 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5426 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5427 if (processor_alias_table[i].flags & PTA_F16C
5428 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5429 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5430 if (processor_alias_table[i].flags & PTA_RTM
5431 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5432 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5433 if (processor_alias_table[i].flags & PTA_HLE
5434 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5435 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5436 if (processor_alias_table[i].flags & PTA_PRFCHW
5437 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5438 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5439 if (processor_alias_table[i].flags & PTA_RDSEED
5440 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5441 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5442 if (processor_alias_table[i].flags & PTA_ADX
5443 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5444 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5445 if (processor_alias_table[i].flags & PTA_FXSR
5446 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5447 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5448 if (processor_alias_table[i].flags & PTA_XSAVE
5449 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5450 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5451 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5452 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5453 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5454 if (processor_alias_table[i].flags & PTA_AVX512F
5455 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5456 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5457 if (processor_alias_table[i].flags & PTA_AVX512ER
5458 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5459 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5460 if (processor_alias_table[i].flags & PTA_AVX512PF
5461 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5462 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5463 if (processor_alias_table[i].flags & PTA_AVX512CD
5464 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5465 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5466 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5467 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5468 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5469 if (processor_alias_table[i].flags & PTA_CLWB
5470 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5471 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5472 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5473 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5474 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5475 if (processor_alias_table[i].flags & PTA_CLZERO
5476 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5477 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5478 if (processor_alias_table[i].flags & PTA_XSAVEC
5479 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5480 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5481 if (processor_alias_table[i].flags & PTA_XSAVES
5482 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5483 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5484 if (processor_alias_table[i].flags & PTA_AVX512DQ
5485 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5486 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5487 if (processor_alias_table[i].flags & PTA_AVX512BW
5488 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5489 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5490 if (processor_alias_table[i].flags & PTA_AVX512VL
5491 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5492 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5493 if (processor_alias_table[i].flags & PTA_MPX
5494 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5495 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5496 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5497 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5498 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5499 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5502 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5503 x86_prefetch_sse = true;
5504 if (processor_alias_table[i].flags & PTA_MWAITX
5505 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5506 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5507 if (processor_alias_table[i].flags & PTA_PKU
5508 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5509 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
5511 /* Don't enable x87 instructions if only
5512 general registers are allowed. */
5513 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
5514 && !(opts_set->x_target_flags & MASK_80387))
5516 if (processor_alias_table[i].flags & PTA_NO_80387)
5517 opts->x_target_flags &= ~MASK_80387;
5518 else
5519 opts->x_target_flags |= MASK_80387;
5521 break;
5524 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
5525 error ("Intel MPX does not support x32");
5530 if (i == pta_size)
5532 error (main_args_p
5533 ? "bad value (%qs) for %<-march=%> switch"
5534 : "bad value (%qs) for %<target(\"arch=\")%> attribute",
5535 opts->x_ix86_arch_string);
5537 auto_vec <const char *> candidates;
5538 for (i = 0; i < pta_size; i++)
5539 if (strcmp (processor_alias_table[i].name, "generic")
5540 && strcmp (processor_alias_table[i].name, "intel")
5541 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5542 || (processor_alias_table[i].flags & PTA_64BIT)))
5543 candidates.safe_push (processor_alias_table[i].name);
5545 char *s;
5546 const char *hint
5547 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
5548 if (hint)
5549 inform (input_location,
5550 main_args_p
5551 ? "valid arguments to %<-march=%> switch are: "
5552 "%s; did you mean %qs?"
5553 : "valid arguments to %<target(\"arch=\")%> attribute are: "
5554 "%s; did you mean %qs?", s, hint);
5555 else
5556 inform (input_location,
5557 main_args_p
5558 ? "valid arguments to %<-march=%> switch are: %s"
5559 : "valid arguments to %<target(\"arch=\")%> attribute are: %s",
5560 s);
5561 XDELETEVEC (s);
5564 ix86_arch_mask = 1u << ix86_arch;
5565 for (i = 0; i < X86_ARCH_LAST; ++i)
5566 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5568 for (i = 0; i < pta_size; i++)
5569 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
5571 ix86_schedule = processor_alias_table[i].schedule;
5572 ix86_tune = processor_alias_table[i].processor;
5573 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5575 if (!(processor_alias_table[i].flags & PTA_64BIT))
5577 if (ix86_tune_defaulted)
5579 opts->x_ix86_tune_string = "x86-64";
5580 for (i = 0; i < pta_size; i++)
5581 if (! strcmp (opts->x_ix86_tune_string,
5582 processor_alias_table[i].name))
5583 break;
5584 ix86_schedule = processor_alias_table[i].schedule;
5585 ix86_tune = processor_alias_table[i].processor;
5587 else
5588 error ("CPU you selected does not support x86-64 "
5589 "instruction set");
5592 /* Intel CPUs have always interpreted SSE prefetch instructions as
5593 NOPs; so, we can enable SSE prefetch instructions even when
5594 -mtune (rather than -march) points us to a processor that has them.
5595 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
5596 higher processors. */
5597 if (TARGET_CMOV
5598 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
5599 x86_prefetch_sse = true;
5600 break;
5603 if (ix86_tune_specified && i == pta_size)
5605 error (main_args_p
5606 ? "bad value (%qs) for %<-mtune=%> switch"
5607 : "bad value (%qs) for %<target(\"tune=\")%> attribute",
5608 opts->x_ix86_tune_string);
5610 auto_vec <const char *> candidates;
5611 for (i = 0; i < pta_size; i++)
5612 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5613 || (processor_alias_table[i].flags & PTA_64BIT))
5614 candidates.safe_push (processor_alias_table[i].name);
5616 char *s;
5617 const char *hint
5618 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
5619 if (hint)
5620 inform (input_location,
5621 main_args_p
5622 ? "valid arguments to %<-mtune=%> switch are: "
5623 "%s; did you mean %qs?"
5624 : "valid arguments to %<target(\"tune=\")%> attribute are: "
5625 "%s; did you mean %qs?", s, hint);
5626 else
5627 inform (input_location,
5628 main_args_p
5629 ? "valid arguments to %<-mtune=%> switch are: %s"
5630 : "valid arguments to %<target(\"tune=\")%> attribute are: %s",
5631 s);
5632 XDELETEVEC (s);
5635 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
5637 #ifndef USE_IX86_FRAME_POINTER
5638 #define USE_IX86_FRAME_POINTER 0
5639 #endif
5641 #ifndef USE_X86_64_FRAME_POINTER
5642 #define USE_X86_64_FRAME_POINTER 0
5643 #endif
5645 /* Set the default values for switches whose default depends on TARGET_64BIT
5646 in case they weren't overwritten by command line options. */
5647 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5649 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5650 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
5651 if (opts->x_flag_asynchronous_unwind_tables
5652 && !opts_set->x_flag_unwind_tables
5653 && TARGET_64BIT_MS_ABI)
5654 opts->x_flag_unwind_tables = 1;
5655 if (opts->x_flag_asynchronous_unwind_tables == 2)
5656 opts->x_flag_unwind_tables
5657 = opts->x_flag_asynchronous_unwind_tables = 1;
5658 if (opts->x_flag_pcc_struct_return == 2)
5659 opts->x_flag_pcc_struct_return = 0;
5661 else
5663 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5664 opts->x_flag_omit_frame_pointer
5665 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
5666 if (opts->x_flag_asynchronous_unwind_tables == 2)
5667 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
5668 if (opts->x_flag_pcc_struct_return == 2)
5670 /* Intel MCU psABI specifies that -freg-struct-return should
5671 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
5672 we check -miamcu so that -freg-struct-return is always
5673 turned on if -miamcu is used. */
5674 if (TARGET_IAMCU_P (opts->x_target_flags))
5675 opts->x_flag_pcc_struct_return = 0;
5676 else
5677 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
5681 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5682 /* TODO: ix86_cost should be chosen at instruction or function granularity
5683 so for cold code we use size_cost even in !optimize_size compilation. */
5684 if (opts->x_optimize_size)
5685 ix86_cost = &ix86_size_cost;
5686 else
5687 ix86_cost = ix86_tune_cost;
5689 /* Arrange to set up i386_stack_locals for all functions. */
5690 init_machine_status = ix86_init_machine_status;
5692 /* Validate -mregparm= value. */
5693 if (opts_set->x_ix86_regparm)
5695 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5696 warning (0, "-mregparm is ignored in 64-bit mode");
5697 else if (TARGET_IAMCU_P (opts->x_target_flags))
5698 warning (0, "-mregparm is ignored for Intel MCU psABI");
5699 if (opts->x_ix86_regparm > REGPARM_MAX)
5701 error ("-mregparm=%d is not between 0 and %d",
5702 opts->x_ix86_regparm, REGPARM_MAX);
5703 opts->x_ix86_regparm = 0;
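/* The Intel MCU psABI and the 64-bit ABIs fix the register-passing
convention, so force the maximum regparm value for them below regardless of
any -mregparm= on the command line. */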
5706 if (TARGET_IAMCU_P (opts->x_target_flags)
5707 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
5708 opts->x_ix86_regparm = REGPARM_MAX;
5710 /* Default align_* from the processor table. */
5711 ix86_default_align (opts);
5713 /* Provide default for -mbranch-cost= value. */
5714 if (!opts_set->x_ix86_branch_cost)
5715 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
5717 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5719 opts->x_target_flags
5720 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
5722 /* Enable by default the SSE and MMX builtins. Do allow the user to
5723 explicitly disable any of these. In particular, disabling SSE and
5724 MMX for kernel code is extremely useful. */
5725 if (!ix86_arch_specified)
5726 opts->x_ix86_isa_flags
5727 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
5728 | TARGET_SUBTARGET64_ISA_DEFAULT)
5729 & ~opts->x_ix86_isa_flags_explicit);
5731 if (TARGET_RTD_P (opts->x_target_flags))
5732 warning (0,
5733 main_args_p ? "%<-mrtd%> is ignored in 64-bit mode"
5734 : "%<target(\"rtd\")%> is ignored in 64-bit mode");
5736 else
5738 opts->x_target_flags
5739 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
5741 if (!ix86_arch_specified)
5742 opts->x_ix86_isa_flags
5743 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
5745 /* The i386 ABI does not specify a red zone. It still makes sense to use it
5746 when the programmer takes care to keep the stack from being destroyed. */
5747 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
5748 opts->x_target_flags |= MASK_NO_RED_ZONE;
5751 /* Keep nonleaf frame pointers. */
5752 if (opts->x_flag_omit_frame_pointer)
5753 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
5754 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
5755 opts->x_flag_omit_frame_pointer = 1;
5757 /* If we're doing fast math, we don't care about comparison order
5758 wrt NaNs. This lets us use a shorter comparison sequence. */
5759 if (opts->x_flag_finite_math_only)
5760 opts->x_target_flags &= ~MASK_IEEE_FP;
5762 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
5763 since the insns won't need emulation. */
5764 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
5765 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
5767 /* Likewise, if the target doesn't have a 387, or we've specified
5768 software floating point, don't use 387 inline intrinsics. */
5769 if (!TARGET_80387_P (opts->x_target_flags))
5770 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
5772 /* Turn on MMX builtins for -msse. */
5773 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
5774 opts->x_ix86_isa_flags
5775 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
5777 /* Enable SSE prefetch. */
5778 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
5779 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
5780 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
5781 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
5782 x86_prefetch_sse = true;
5784 /* Enable popcnt instruction for -msse4.2 or -mabm. */
5785 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
5786 || TARGET_ABM_P (opts->x_ix86_isa_flags))
5787 opts->x_ix86_isa_flags
5788 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
5790 /* Enable lzcnt instruction for -mabm. */
5791 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
5792 opts->x_ix86_isa_flags
5793 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
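/* So, for example, -mabm alone implies both popcnt and lzcnt unless either
was disabled explicitly, and -msse4.2 implies popcnt. */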
5795 /* Validate -mpreferred-stack-boundary= value or default it to
5796 PREFERRED_STACK_BOUNDARY_DEFAULT. */
5797 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
5798 if (opts_set->x_ix86_preferred_stack_boundary_arg)
5800 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5801 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
5802 int max = (TARGET_SEH ? 4 : 12);
5804 if (opts->x_ix86_preferred_stack_boundary_arg < min
5805 || opts->x_ix86_preferred_stack_boundary_arg > max)
5807 if (min == max)
5808 error ("-mpreferred-stack-boundary is not supported "
5809 "for this target");
5810 else
5811 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
5812 opts->x_ix86_preferred_stack_boundary_arg, min, max);
5814 else
5815 ix86_preferred_stack_boundary
5816 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
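/* For example, -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
= 128 bits, i.e. 16-byte stack alignment. */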
5819 /* Set the default value for -mstackrealign. */
5820 if (opts->x_ix86_force_align_arg_pointer == -1)
5821 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
5823 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
5825 /* Validate -mincoming-stack-boundary= value or default it to
5826 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
5827 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
5828 if (opts_set->x_ix86_incoming_stack_boundary_arg)
5830 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
5832 if (opts->x_ix86_incoming_stack_boundary_arg < min
5833 || opts->x_ix86_incoming_stack_boundary_arg > 12)
5834 error ("-mincoming-stack-boundary=%d is not between %d and 12",
5835 opts->x_ix86_incoming_stack_boundary_arg, min);
5836 else
5838 ix86_user_incoming_stack_boundary
5839 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
5840 ix86_incoming_stack_boundary
5841 = ix86_user_incoming_stack_boundary;
5845 #ifndef NO_PROFILE_COUNTERS
5846 if (flag_nop_mcount)
5847 error ("-mnop-mcount is not compatible with this target");
5848 #endif
5849 if (flag_nop_mcount && flag_pic)
5850 error ("-mnop-mcount is not implemented for -fPIC");
5852 /* Accept -msseregparm only if at least SSE support is enabled. */
5853 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
5854 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
5855 error (main_args_p
5856 ? "%<-msseregparm%> used without SSE enabled"
5857 : "%<target(\"sseregparm\")%> used without SSE enabled");
5859 if (opts_set->x_ix86_fpmath)
5861 if (opts->x_ix86_fpmath & FPMATH_SSE)
5863 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
5865 if (TARGET_80387_P (opts->x_target_flags))
5867 warning (0, "SSE instruction set disabled, using 387 arithmetics");
5868 opts->x_ix86_fpmath = FPMATH_387;
5871 else if ((opts->x_ix86_fpmath & FPMATH_387)
5872 && !TARGET_80387_P (opts->x_target_flags))
5874 warning (0, "387 instruction set disabled, using SSE arithmetics");
5875 opts->x_ix86_fpmath = FPMATH_SSE;
5879 /* For all chips supporting SSE2, -mfpmath=sse performs better than
5880 fpmath=387. The latter is nevertheless the default on many targets,
5881 since the extra 80-bit precision of temporaries is considered part of the ABI.
5882 Overwrite the default at least for -ffast-math.
5883 TODO: -mfpmath=both seems to produce equally performing code with
5884 slightly smaller binaries. It is, however, not clear whether register
5885 allocation is ready for this setting.
5886 Also, -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
5887 codegen. We may switch to 387 with -ffast-math for size-optimized
5888 functions. */
5889 else if (fast_math_flags_set_p (&global_options)
5890 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
5891 opts->x_ix86_fpmath = FPMATH_SSE;
5892 else
5893 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
5895 /* Use external vectorized library in vectorizing intrinsics. */
5896 if (opts_set->x_ix86_veclibabi_type)
5897 switch (opts->x_ix86_veclibabi_type)
5899 case ix86_veclibabi_type_svml:
5900 ix86_veclib_handler = ix86_veclibabi_svml;
5901 break;
5903 case ix86_veclibabi_type_acml:
5904 ix86_veclib_handler = ix86_veclibabi_acml;
5905 break;
5907 default:
5908 gcc_unreachable ();
5911 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
5912 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5913 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5915 /* If stack probes are required, the space used for large function
5916 arguments on the stack must also be probed, so enable
5917 -maccumulate-outgoing-args so this happens in the prologue. */
5918 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
5919 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5921 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5922 warning (0,
5923 main_args_p
5924 ? "stack probing requires %<-maccumulate-outgoing-args%> "
5925 "for correctness"
5926 : "stack probing requires "
5927 "%<target(\"accumulate-outgoing-args\")%> for correctness");
5928 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5931 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
5932 so enable -maccumulate-outgoing-args when %ebp is fixed. */
5933 if (fixed_regs[BP_REG]
5934 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5936 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5937 warning (0,
5938 main_args_p
5939 ? "fixed ebp register requires %<-maccumulate-outgoing-args%>"
5940 : "fixed ebp register requires "
5941 "%<target(\"accumulate-outgoing-args\")%>");
5942 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5945 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
5947 char *p;
5948 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
5949 p = strchr (internal_label_prefix, 'X');
5950 internal_label_prefix_len = p - internal_label_prefix;
5951 *p = '\0';
5954 /* When scheduling description is not available, disable scheduler pass
5955 so it won't slow down the compilation and make x87 code slower. */
5956 if (!TARGET_SCHEDULE)
5957 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
5959 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
5960 ix86_tune_cost->simultaneous_prefetches,
5961 opts->x_param_values,
5962 opts_set->x_param_values);
5963 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
5964 ix86_tune_cost->prefetch_block,
5965 opts->x_param_values,
5966 opts_set->x_param_values);
5967 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
5968 ix86_tune_cost->l1_cache_size,
5969 opts->x_param_values,
5970 opts_set->x_param_values);
5971 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
5972 ix86_tune_cost->l2_cache_size,
5973 opts->x_param_values,
5974 opts_set->x_param_values);
5976 /* Restrict number of if-converted SET insns to 1. */
5977 if (TARGET_ONE_IF_CONV_INSN)
5978 maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
5979 1,
5980 opts->x_param_values,
5981 opts_set->x_param_values);
5983 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
5984 if (opts->x_flag_prefetch_loop_arrays < 0
5985 && HAVE_prefetch
5986 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
5987 && !opts->x_optimize_size
5988 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
5989 opts->x_flag_prefetch_loop_arrays = 1;
5991 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
5992 can be optimized to ap = __builtin_next_arg (0). */
5993 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
5994 targetm.expand_builtin_va_start = NULL;
5996 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5998 ix86_gen_leave = gen_leave_rex64;
5999 if (Pmode == DImode)
6001 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6002 ix86_gen_tls_local_dynamic_base_64
6003 = gen_tls_local_dynamic_base_64_di;
6005 else
6007 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6008 ix86_gen_tls_local_dynamic_base_64
6009 = gen_tls_local_dynamic_base_64_si;
6012 else
6013 ix86_gen_leave = gen_leave;
6015 if (Pmode == DImode)
6017 ix86_gen_add3 = gen_adddi3;
6018 ix86_gen_sub3 = gen_subdi3;
6019 ix86_gen_sub3_carry = gen_subdi3_carry;
6020 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6021 ix86_gen_andsp = gen_anddi3;
6022 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6023 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6024 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6025 ix86_gen_monitor = gen_sse3_monitor_di;
6026 ix86_gen_monitorx = gen_monitorx_di;
6027 ix86_gen_clzero = gen_clzero_di;
6029 else
6031 ix86_gen_add3 = gen_addsi3;
6032 ix86_gen_sub3 = gen_subsi3;
6033 ix86_gen_sub3_carry = gen_subsi3_carry;
6034 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6035 ix86_gen_andsp = gen_andsi3;
6036 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6037 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6038 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6039 ix86_gen_monitor = gen_sse3_monitor_si;
6040 ix86_gen_monitorx = gen_monitorx_si;
6041 ix86_gen_clzero = gen_clzero_si;
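/* The ix86_gen_* hooks selected above let the rest of the back end emit
Pmode-sized stack, TLS and string operations without re-checking Pmode at
every call site. */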
6044 #ifdef USE_IX86_CLD
6045 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6046 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6047 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6048 #endif
6050 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6052 if (opts->x_flag_fentry > 0)
6053 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6054 "with -fpic");
6055 opts->x_flag_fentry = 0;
6057 else if (TARGET_SEH)
6059 if (opts->x_flag_fentry == 0)
6060 sorry ("-mno-fentry isn%'t compatible with SEH");
6061 opts->x_flag_fentry = 1;
6063 else if (opts->x_flag_fentry < 0)
6065 #if defined(PROFILE_BEFORE_PROLOGUE)
6066 opts->x_flag_fentry = 1;
6067 #else
6068 opts->x_flag_fentry = 0;
6069 #endif
6072 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6073 opts->x_target_flags |= MASK_VZEROUPPER;
6074 if (!(opts_set->x_target_flags & MASK_STV))
6075 opts->x_target_flags |= MASK_STV;
6076 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6077 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
6078 stack realignment would be an extra cost the pass doesn't take into
6079 account, and the pass can't realign the stack. */
6080 if (ix86_preferred_stack_boundary < 128
6081 || ix86_incoming_stack_boundary < 128
6082 || opts->x_ix86_force_align_arg_pointer)
6083 opts->x_target_flags &= ~MASK_STV;
6084 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6085 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6086 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6087 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6088 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6089 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6090 /* Enable 128-bit AVX instruction generation
6091 for the auto-vectorizer. */
6092 if (TARGET_AVX128_OPTIMAL
6093 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6094 opts->x_target_flags |= MASK_PREFER_AVX128;
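/* Parse -mrecip=. The argument is a comma-separated list of the option
names in recip_options above, each optionally prefixed with '!' to clear the
corresponding mask instead of setting it; "default" stands for
RECIP_MASK_ALL. For example, -mrecip=all,!sqrt enables every reciprocal
estimate except the scalar square root. */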
6096 if (opts->x_ix86_recip_name)
6098 char *p = ASTRDUP (opts->x_ix86_recip_name);
6099 char *q;
6100 unsigned int mask, i;
6101 bool invert;
6103 while ((q = strtok (p, ",")) != NULL)
6105 p = NULL;
6106 if (*q == '!')
6108 invert = true;
6109 q++;
6111 else
6112 invert = false;
6114 if (!strcmp (q, "default"))
6115 mask = RECIP_MASK_ALL;
6116 else
6118 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6119 if (!strcmp (q, recip_options[i].string))
6121 mask = recip_options[i].mask;
6122 break;
6125 if (i == ARRAY_SIZE (recip_options))
6127 error ("unknown option for -mrecip=%s", q);
6128 invert = false;
6129 mask = RECIP_MASK_NONE;
6133 opts->x_recip_mask_explicit |= mask;
6134 if (invert)
6135 opts->x_recip_mask &= ~mask;
6136 else
6137 opts->x_recip_mask |= mask;
6141 if (TARGET_RECIP_P (opts->x_target_flags))
6142 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6143 else if (opts_set->x_target_flags & MASK_RECIP)
6144 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6146 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6147 for 64-bit Bionic. Also default long double to 64-bit for Intel
6148 MCU psABI. */
6149 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6150 && !(opts_set->x_target_flags
6151 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6152 opts->x_target_flags |= (TARGET_64BIT
6153 ? MASK_LONG_DOUBLE_128
6154 : MASK_LONG_DOUBLE_64);
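/* That is, 32-bit Bionic and the Intel MCU psABI default long double to
64 bits, while 64-bit Bionic defaults it to the 128-bit (__float128) type. */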
6156 /* Only one of them can be active. */
6157 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6158 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6160 /* Save the initial options in case the user later uses function-specific
6161 options. */
6162 if (main_args_p)
6163 target_option_default_node = target_option_current_node
6164 = build_target_option_node (opts);
6166 /* Handle stack protector */
6167 if (!opts_set->x_ix86_stack_protector_guard)
6168 opts->x_ix86_stack_protector_guard
6169 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6171 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6172 if (opts->x_ix86_tune_memcpy_strategy)
6174 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6175 ix86_parse_stringop_strategy_string (str, false);
6176 free (str);
6179 if (opts->x_ix86_tune_memset_strategy)
6181 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6182 ix86_parse_stringop_strategy_string (str, true);
6183 free (str);
6186 return true;
6189 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6191 static void
6192 ix86_option_override (void)
6194 ix86_option_override_internal (true, &global_options, &global_options_set);
6197 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6198 static char *
6199 ix86_offload_options (void)
6201 if (TARGET_LP64)
6202 return xstrdup ("-foffload-abi=lp64");
6203 return xstrdup ("-foffload-abi=ilp32");
6206 /* Update register usage after having seen the compiler flags. */
6208 static void
6209 ix86_conditional_register_usage (void)
6211 int i, c_mask;
6213 /* If there are no caller-saved registers, preserve all registers
6214 except fixed_regs and registers used for the function return value,
6215 since aggregate_value_p checks call_used_regs[regno] on the return
6216 value. */
6217 if (cfun && cfun->machine->no_caller_saved_registers)
6218 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6219 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6220 call_used_regs[i] = 0;
6222 /* For 32-bit targets, squash the REX registers. */
6223 if (! TARGET_64BIT)
6225 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6226 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6227 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6228 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6229 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6230 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6233 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6234 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6236 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6238 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6240 /* Set/reset conditionally defined registers from
6241 CALL_USED_REGISTERS initializer. */
6242 if (call_used_regs[i] > 1)
6243 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6245 /* Calculate registers of CLOBBERED_REGS register set
6246 as call used registers from GENERAL_REGS register set. */
6247 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6248 && call_used_regs[i])
6249 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6252 /* If MMX is disabled, squash the registers. */
6253 if (! TARGET_MMX)
6254 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6255 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6256 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6258 /* If SSE is disabled, squash the registers. */
6259 if (! TARGET_SSE)
6260 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6261 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6262 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6264 /* If the FPU is disabled, squash the registers. */
6265 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6266 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6267 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6268 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6270 /* If AVX512F is disabled, squash the registers. */
6271 if (! TARGET_AVX512F)
6273 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6274 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6276 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6277 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6280 /* If MPX is disabled, squash the registers. */
6281 if (! TARGET_MPX)
6282 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6283 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6287 /* Save the current options */
6289 static void
6290 ix86_function_specific_save (struct cl_target_option *ptr,
6291 struct gcc_options *opts)
6293 ptr->arch = ix86_arch;
6294 ptr->schedule = ix86_schedule;
6295 ptr->prefetch_sse = x86_prefetch_sse;
6296 ptr->tune = ix86_tune;
6297 ptr->branch_cost = ix86_branch_cost;
6298 ptr->tune_defaulted = ix86_tune_defaulted;
6299 ptr->arch_specified = ix86_arch_specified;
6300 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6301 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6302 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6303 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6304 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6305 ptr->x_ix86_abi = opts->x_ix86_abi;
6306 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6307 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6308 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6309 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6310 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6311 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6312 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6313 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6314 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6315 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6316 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6317 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6318 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6319 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6320 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6321 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6322 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6323 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6324 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6325 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6327 /* The fields are char but the variables are not; make sure the
6328 values fit in the fields. */
6329 gcc_assert (ptr->arch == ix86_arch);
6330 gcc_assert (ptr->schedule == ix86_schedule);
6331 gcc_assert (ptr->tune == ix86_tune);
6332 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6335 /* Restore the current options */
6337 static void
6338 ix86_function_specific_restore (struct gcc_options *opts,
6339 struct cl_target_option *ptr)
6341 enum processor_type old_tune = ix86_tune;
6342 enum processor_type old_arch = ix86_arch;
6343 unsigned int ix86_arch_mask;
6344 int i;
6346 /* We don't change -fPIC. */
6347 opts->x_flag_pic = flag_pic;
6349 ix86_arch = (enum processor_type) ptr->arch;
6350 ix86_schedule = (enum attr_cpu) ptr->schedule;
6351 ix86_tune = (enum processor_type) ptr->tune;
6352 x86_prefetch_sse = ptr->prefetch_sse;
6353 opts->x_ix86_branch_cost = ptr->branch_cost;
6354 ix86_tune_defaulted = ptr->tune_defaulted;
6355 ix86_arch_specified = ptr->arch_specified;
6356 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6357 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6358 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6359 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6360 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6361 opts->x_ix86_abi = ptr->x_ix86_abi;
6362 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6363 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6364 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6365 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6366 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6367 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6368 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6369 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6370 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6371 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6372 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6373 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6374 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6375 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6376 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6377 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6378 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6379 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6380 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6381 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6382 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6383 /* TODO: ix86_cost should be chosen at instruction or function granularity
6384 so for cold code we use size_cost even in !optimize_size compilation. */
6385 if (opts->x_optimize_size)
6386 ix86_cost = &ix86_size_cost;
6387 else
6388 ix86_cost = ix86_tune_cost;
6390 /* Recreate the arch feature tests if the arch changed */
6391 if (old_arch != ix86_arch)
6393 ix86_arch_mask = 1u << ix86_arch;
6394 for (i = 0; i < X86_ARCH_LAST; ++i)
6395 ix86_arch_features[i]
6396 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6399 /* Recreate the tune optimization tests */
6400 if (old_tune != ix86_tune)
6401 set_ix86_tune_features (ix86_tune, false);
6404 /* Adjust target options after streaming them in. This is mainly about
6405 reconciling them with global options. */
6407 static void
6408 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6410 /* flag_pic is a global option, but ix86_cmodel is target saved option
6411 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6412 for PIC, or error out. */
6413 if (flag_pic)
6414 switch (ptr->x_ix86_cmodel)
6416 case CM_SMALL:
6417 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6418 break;
6420 case CM_MEDIUM:
6421 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6422 break;
6424 case CM_LARGE:
6425 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6426 break;
6428 case CM_KERNEL:
6429 error ("code model %s does not support PIC mode", "kernel");
6430 break;
6432 default:
6433 break;
6435 else
6436 switch (ptr->x_ix86_cmodel)
6438 case CM_SMALL_PIC:
6439 ptr->x_ix86_cmodel = CM_SMALL;
6440 break;
6442 case CM_MEDIUM_PIC:
6443 ptr->x_ix86_cmodel = CM_MEDIUM;
6444 break;
6446 case CM_LARGE_PIC:
6447 ptr->x_ix86_cmodel = CM_LARGE;
6448 break;
6450 default:
6451 break;
6455 /* Print the current options */
6457 static void
6458 ix86_function_specific_print (FILE *file, int indent,
6459 struct cl_target_option *ptr)
6461 char *target_string
6462 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
6463 ptr->x_ix86_target_flags, NULL, NULL,
6464 ptr->x_ix86_fpmath, false);
6466 gcc_assert (ptr->arch < PROCESSOR_max);
6467 fprintf (file, "%*sarch = %d (%s)\n",
6468 indent, "",
6469 ptr->arch, processor_target_table[ptr->arch].name);
6471 gcc_assert (ptr->tune < PROCESSOR_max);
6472 fprintf (file, "%*stune = %d (%s)\n",
6473 indent, "",
6474 ptr->tune, processor_target_table[ptr->tune].name);
6476 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6478 if (target_string)
6480 fprintf (file, "%*s%s\n", indent, "", target_string);
6481 free (target_string);
6486 /* Inner function to process the attribute((target(...))), take an argument and
6487 set the current options from the argument. If we have a list, recursively go
6488 over the list. */
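/* Illustrative example (not part of the original sources): a declaration
   such as
     int foo (void) __attribute__ ((target ("arch=core2,sse4.1,no-fma")));
   reaches this function with ARGS holding the string
   "arch=core2,sse4.1,no-fma"; each comma-separated token is matched
   against the attrs[] table below, and a "no-" prefix clears the
   corresponding option instead of setting it.  */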
6490 static bool
6491 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6492 struct gcc_options *opts,
6493 struct gcc_options *opts_set,
6494 struct gcc_options *enum_opts_set)
6496 char *next_optstr;
6497 bool ret = true;
6499 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6500 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6501 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6502 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6503 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6505 enum ix86_opt_type
6507 ix86_opt_unknown,
6508 ix86_opt_yes,
6509 ix86_opt_no,
6510 ix86_opt_str,
6511 ix86_opt_enum,
6512 ix86_opt_isa
6515 static const struct
6517 const char *string;
6518 size_t len;
6519 enum ix86_opt_type type;
6520 int opt;
6521 int mask;
6522 } attrs[] = {
6523 /* isa options */
6524 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
6525 IX86_ATTR_ISA ("abm", OPT_mabm),
6526 IX86_ATTR_ISA ("bmi", OPT_mbmi),
6527 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
6528 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
6529 IX86_ATTR_ISA ("tbm", OPT_mtbm),
6530 IX86_ATTR_ISA ("aes", OPT_maes),
6531 IX86_ATTR_ISA ("sha", OPT_msha),
6532 IX86_ATTR_ISA ("avx", OPT_mavx),
6533 IX86_ATTR_ISA ("avx2", OPT_mavx2),
6534 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
6535 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
6536 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
6537 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
6538 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
6539 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
6540 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
6541 IX86_ATTR_ISA ("mmx", OPT_mmmx),
6542 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
6543 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
6544 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
6545 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
6546 IX86_ATTR_ISA ("sse", OPT_msse),
6547 IX86_ATTR_ISA ("sse2", OPT_msse2),
6548 IX86_ATTR_ISA ("sse3", OPT_msse3),
6549 IX86_ATTR_ISA ("sse4", OPT_msse4),
6550 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
6551 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
6552 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
6553 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
6554 IX86_ATTR_ISA ("fma4", OPT_mfma4),
6555 IX86_ATTR_ISA ("fma", OPT_mfma),
6556 IX86_ATTR_ISA ("xop", OPT_mxop),
6557 IX86_ATTR_ISA ("lwp", OPT_mlwp),
6558 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
6559 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
6560 IX86_ATTR_ISA ("f16c", OPT_mf16c),
6561 IX86_ATTR_ISA ("rtm", OPT_mrtm),
6562 IX86_ATTR_ISA ("hle", OPT_mhle),
6563 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
6564 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
6565 IX86_ATTR_ISA ("adx", OPT_madx),
6566 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
6567 IX86_ATTR_ISA ("xsave", OPT_mxsave),
6568 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
6569 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
6570 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
6571 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
6572 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
6573 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
6574 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
6575 IX86_ATTR_ISA ("clwb", OPT_mclwb),
6576 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
6577 IX86_ATTR_ISA ("clzero", OPT_mclzero),
6578 IX86_ATTR_ISA ("pku", OPT_mpku),
6580 /* enum options */
6581 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
6583 /* string options */
6584 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
6585 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
6587 /* flag options */
6588 IX86_ATTR_YES ("cld",
6589 OPT_mcld,
6590 MASK_CLD),
6592 IX86_ATTR_NO ("fancy-math-387",
6593 OPT_mfancy_math_387,
6594 MASK_NO_FANCY_MATH_387),
6596 IX86_ATTR_YES ("ieee-fp",
6597 OPT_mieee_fp,
6598 MASK_IEEE_FP),
6600 IX86_ATTR_YES ("inline-all-stringops",
6601 OPT_minline_all_stringops,
6602 MASK_INLINE_ALL_STRINGOPS),
6604 IX86_ATTR_YES ("inline-stringops-dynamically",
6605 OPT_minline_stringops_dynamically,
6606 MASK_INLINE_STRINGOPS_DYNAMICALLY),
6608 IX86_ATTR_NO ("align-stringops",
6609 OPT_mno_align_stringops,
6610 MASK_NO_ALIGN_STRINGOPS),
6612 IX86_ATTR_YES ("recip",
6613 OPT_mrecip,
6614 MASK_RECIP),
6618 /* If this is a list, recurse to get the options. */
6619 if (TREE_CODE (args) == TREE_LIST)
6621 bool ret = true;
6623 for (; args; args = TREE_CHAIN (args))
6624 if (TREE_VALUE (args)
6625 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
6626 p_strings, opts, opts_set,
6627 enum_opts_set))
6628 ret = false;
6630 return ret;
6633 else if (TREE_CODE (args) != STRING_CST)
6635 error ("attribute %<target%> argument not a string");
6636 return false;
6639 /* Handle multiple arguments separated by commas. */
6640 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
6642 while (next_optstr && *next_optstr != '\0')
6644 char *p = next_optstr;
6645 char *orig_p = p;
6646 char *comma = strchr (next_optstr, ',');
6647 const char *opt_string;
6648 size_t len, opt_len;
6649 int opt;
6650 bool opt_set_p;
6651 char ch;
6652 unsigned i;
6653 enum ix86_opt_type type = ix86_opt_unknown;
6654 int mask = 0;
6656 if (comma)
6658 *comma = '\0';
6659 len = comma - next_optstr;
6660 next_optstr = comma + 1;
6662 else
6664 len = strlen (p);
6665 next_optstr = NULL;
6668 /* Recognize no-xxx. */
6669 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
6671 opt_set_p = false;
6672 p += 3;
6673 len -= 3;
6675 else
6676 opt_set_p = true;
6678 /* Find the option. */
6679 ch = *p;
6680 opt = N_OPTS;
6681 for (i = 0; i < ARRAY_SIZE (attrs); i++)
6683 type = attrs[i].type;
6684 opt_len = attrs[i].len;
6685 if (ch == attrs[i].string[0]
6686 && ((type != ix86_opt_str && type != ix86_opt_enum)
6687 ? len == opt_len
6688 : len > opt_len)
6689 && memcmp (p, attrs[i].string, opt_len) == 0)
6691 opt = attrs[i].opt;
6692 mask = attrs[i].mask;
6693 opt_string = attrs[i].string;
6694 break;
6698 /* Process the option. */
6699 if (opt == N_OPTS)
6701 error ("attribute(target(\"%s\")) is unknown", orig_p);
6702 ret = false;
6705 else if (type == ix86_opt_isa)
6707 struct cl_decoded_option decoded;
6709 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
6710 ix86_handle_option (opts, opts_set,
6711 &decoded, input_location);
6714 else if (type == ix86_opt_yes || type == ix86_opt_no)
6716 if (type == ix86_opt_no)
6717 opt_set_p = !opt_set_p;
6719 if (opt_set_p)
6720 opts->x_target_flags |= mask;
6721 else
6722 opts->x_target_flags &= ~mask;
6725 else if (type == ix86_opt_str)
6727 if (p_strings[opt])
6729 error ("option(\"%s\") was already specified", opt_string);
6730 ret = false;
6732 else
6733 p_strings[opt] = xstrdup (p + opt_len);
6736 else if (type == ix86_opt_enum)
6738 bool arg_ok;
6739 int value;
6741 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
6742 if (arg_ok)
6743 set_option (opts, enum_opts_set, opt, value,
6744 p + opt_len, DK_UNSPECIFIED, input_location,
6745 global_dc);
6746 else
6748 error ("attribute(target(\"%s\")) is unknown", orig_p);
6749 ret = false;
6753 else
6754 gcc_unreachable ();
6757 return ret;
6760 /* Release allocated strings. */
6761 static void
6762 release_options_strings (char **option_strings)
6764 /* Free up memory allocated to hold the strings */
6765 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
6766 free (option_strings[i]);
6769 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
6771 tree
6772 ix86_valid_target_attribute_tree (tree args,
6773 struct gcc_options *opts,
6774 struct gcc_options *opts_set)
6776 const char *orig_arch_string = opts->x_ix86_arch_string;
6777 const char *orig_tune_string = opts->x_ix86_tune_string;
6778 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
6779 int orig_tune_defaulted = ix86_tune_defaulted;
6780 int orig_arch_specified = ix86_arch_specified;
6781 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
6782 tree t = NULL_TREE;
6783 struct cl_target_option *def
6784 = TREE_TARGET_OPTION (target_option_default_node);
6785 struct gcc_options enum_opts_set;
6787 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
6789 /* Process each of the options on the chain. */
6790 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
6791 opts_set, &enum_opts_set))
6792 return error_mark_node;
6794 /* If the changed options are different from the default, rerun
6795 ix86_option_override_internal, and then save the options away.
6796 The string options are attribute options, and will be undone
6797 when we copy the save structure. */
6798 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
6799 || opts->x_target_flags != def->x_target_flags
6800 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
6801 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
6802 || enum_opts_set.x_ix86_fpmath)
6804 /* If we are using the default tune= or arch=, undo the string assigned,
6805 and use the default. */
6806 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
6808 opts->x_ix86_arch_string
6809 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
6811 /* If arch= is set, clear all bits in x_ix86_isa_flags,
6812 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
6813 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
6814 | OPTION_MASK_ABI_64
6815 | OPTION_MASK_ABI_X32
6816 | OPTION_MASK_CODE16);
6819 else if (!orig_arch_specified)
6820 opts->x_ix86_arch_string = NULL;
6822 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
6823 opts->x_ix86_tune_string
6824 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
6825 else if (orig_tune_defaulted)
6826 opts->x_ix86_tune_string = NULL;
6828 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
6829 if (enum_opts_set.x_ix86_fpmath)
6830 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6831 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6832 && TARGET_SSE_P (opts->x_ix86_isa_flags))
6834 if (TARGET_80387_P (opts->x_target_flags))
6835 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
6836 | FPMATH_387);
6837 else
6838 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
6839 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6842 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
6843 bool r = ix86_option_override_internal (false, opts, opts_set);
6844 if (!r)
6846 release_options_strings (option_strings);
6847 return error_mark_node;
6850 /* Add any builtin functions with the new isa if any. */
6851 ix86_add_new_builtins (opts->x_ix86_isa_flags);
6853 /* Save the current options unless we are validating options for
6854 #pragma. */
6855 t = build_target_option_node (opts);
6857 opts->x_ix86_arch_string = orig_arch_string;
6858 opts->x_ix86_tune_string = orig_tune_string;
6859 opts_set->x_ix86_fpmath = orig_fpmath_set;
6861 release_options_strings (option_strings);
6864 return t;
6867 /* Hook to validate attribute((target("string"))). */
6869 static bool
6870 ix86_valid_target_attribute_p (tree fndecl,
6871 tree ARG_UNUSED (name),
6872 tree args,
6873 int ARG_UNUSED (flags))
6875 struct gcc_options func_options;
6876 tree new_target, new_optimize;
6877 bool ret = true;
6879 /* attribute((target("default"))) does nothing, beyond
6880 affecting multi-versioning. */
6881 if (TREE_VALUE (args)
6882 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
6883 && TREE_CHAIN (args) == NULL_TREE
6884 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
6885 return true;
6887 tree old_optimize = build_optimization_node (&global_options);
6889 /* Get the optimization options of the current function. */
6890 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
6892 if (!func_optimize)
6893 func_optimize = old_optimize;
6895 /* Init func_options. */
6896 memset (&func_options, 0, sizeof (func_options));
6897 init_options_struct (&func_options, NULL);
6898 lang_hooks.init_options_struct (&func_options);
6900 cl_optimization_restore (&func_options,
6901 TREE_OPTIMIZATION (func_optimize));
6903 /* Initialize func_options to the default before its target options can
6904 be set. */
6905 cl_target_option_restore (&func_options,
6906 TREE_TARGET_OPTION (target_option_default_node));
6908 new_target = ix86_valid_target_attribute_tree (args, &func_options,
6909 &global_options_set);
6911 new_optimize = build_optimization_node (&func_options);
6913 if (new_target == error_mark_node)
6914 ret = false;
6916 else if (fndecl && new_target)
6918 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
6920 if (old_optimize != new_optimize)
6921 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
6924 finalize_options_struct (&func_options);
6926 return ret;
6930 /* Hook to determine if one function can safely inline another. */
6932 static bool
6933 ix86_can_inline_p (tree caller, tree callee)
6935 bool ret = false;
6936 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
6937 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
6939 /* If callee has no option attributes, then it is ok to inline. */
6940 if (!callee_tree)
6941 ret = true;
6943 /* If caller has no option attributes, but callee does then it is not ok to
6944 inline. */
6945 else if (!caller_tree)
6946 ret = false;
6948 else
6950 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
6951 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
6953 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4 function
6954 can inline an SSE2 function but an SSE2 function can't inline an SSE4
6955 function. */
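/* Illustrative example (not from the original sources): a caller compiled
   with -mavx2 may inline an SSE2-only callee, since the caller's ISA flag
   set contains the callee's, but an SSE2-only caller may not inline an
   AVX2 callee.  */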
6956 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
6957 != callee_opts->x_ix86_isa_flags)
6958 ret = false;
6960 /* See if we have the same non-isa options. */
6961 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
6962 ret = false;
6964 /* See if arch, tune, etc. are the same. */
6965 else if (caller_opts->arch != callee_opts->arch)
6966 ret = false;
6968 else if (caller_opts->tune != callee_opts->tune)
6969 ret = false;
6971 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
6972 ret = false;
6974 else if (caller_opts->branch_cost != callee_opts->branch_cost)
6975 ret = false;
6977 else
6978 ret = true;
6981 return ret;
6985 /* Remember the last target of ix86_set_current_function. */
6986 static GTY(()) tree ix86_previous_fndecl;
6988 /* Set target globals to the default (or current #pragma GCC target
6989 if active). Invalidate ix86_previous_fndecl cache. */
6991 void
6992 ix86_reset_previous_fndecl (void)
6994 tree new_tree = target_option_current_node;
6995 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6996 if (TREE_TARGET_GLOBALS (new_tree))
6997 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6998 else if (new_tree == target_option_default_node)
6999 restore_target_globals (&default_target_globals);
7000 else
7001 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7002 ix86_previous_fndecl = NULL_TREE;
7005 /* Set the func_type field from the function FNDECL. */
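/* Illustrative example (assumed declaration, not from the original
   sources): a handler written along the lines of
     void __attribute__ ((interrupt))
     handler (struct interrupt_frame *frame, uword_t error_code);
   has two arguments and is classified as TYPE_EXCEPTION below, while a
   single-argument handler becomes TYPE_INTERRUPT.  */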
7007 static void
7008 ix86_set_func_type (tree fndecl)
7010 if (cfun->machine->func_type == TYPE_UNKNOWN)
7012 if (lookup_attribute ("interrupt",
7013 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7015 int nargs = 0;
7016 for (tree arg = DECL_ARGUMENTS (fndecl);
7017 arg;
7018 arg = TREE_CHAIN (arg))
7019 nargs++;
7020 cfun->machine->no_caller_saved_registers = true;
7021 cfun->machine->func_type
7022 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7024 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7026 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7027 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7028 sorry ("Only DWARF debug format is supported for interrupt "
7029 "service routine.");
7031 else
7033 cfun->machine->func_type = TYPE_NORMAL;
7034 if (lookup_attribute ("no_caller_saved_registers",
7035 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7036 cfun->machine->no_caller_saved_registers = true;
7041 /* Establish appropriate back-end context for processing the function
7042 FNDECL. The argument might be NULL to indicate processing at top
7043 level, outside of any function scope. */
7044 static void
7045 ix86_set_current_function (tree fndecl)
7047 /* Only change the context if the function changes. This hook is called
7048 several times in the course of compiling a function, and we don't want to
7049 slow things down too much or call target_reinit when it isn't safe. */
7050 if (fndecl == ix86_previous_fndecl)
7052 /* There may be 2 function bodies for the same function FNDECL,
7053 one is extern inline and one isn't. Call ix86_set_func_type
7054 to set the func_type field. */
7055 if (fndecl != NULL_TREE)
7056 ix86_set_func_type (fndecl);
7057 return;
7060 tree old_tree;
7061 if (ix86_previous_fndecl == NULL_TREE)
7062 old_tree = target_option_current_node;
7063 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7064 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7065 else
7066 old_tree = target_option_default_node;
7068 if (fndecl == NULL_TREE)
7070 if (old_tree != target_option_current_node)
7071 ix86_reset_previous_fndecl ();
7072 return;
7075 ix86_set_func_type (fndecl);
7077 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7078 if (new_tree == NULL_TREE)
7079 new_tree = target_option_default_node;
7081 if (old_tree != new_tree)
7083 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7084 if (TREE_TARGET_GLOBALS (new_tree))
7085 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7086 else if (new_tree == target_option_default_node)
7087 restore_target_globals (&default_target_globals);
7088 else
7089 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7091 ix86_previous_fndecl = fndecl;
7093 static bool prev_no_caller_saved_registers;
7095 /* 64-bit MS and SYSV ABIs have different sets of call-used registers.
7096 Avoid expensive re-initialization of init_regs each time we switch
7097 function context. */
7098 if (TARGET_64BIT
7099 && (call_used_regs[SI_REG]
7100 == (cfun->machine->call_abi == MS_ABI)))
7101 reinit_regs ();
7102 /* Need to re-initialize init_regs if caller-saved registers are
7103 changed. */
7104 else if (prev_no_caller_saved_registers
7105 != cfun->machine->no_caller_saved_registers)
7106 reinit_regs ();
7108 if (cfun->machine->func_type != TYPE_NORMAL
7109 || cfun->machine->no_caller_saved_registers)
7111 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7112 may change processor state. */
7113 const char *isa;
7114 if (TARGET_MPX)
7115 isa = "MPX";
7116 else if (TARGET_SSE)
7117 isa = "SSE";
7118 else if (TARGET_MMX)
7119 isa = "MMX/3Dnow";
7120 else if (TARGET_80387)
7121 isa = "80387";
7122 else
7123 isa = NULL;
7124 if (isa != NULL)
7126 if (cfun->machine->func_type != TYPE_NORMAL)
7127 sorry ("%s instructions aren't allowed in %s service routine",
7128 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7129 ? "exception" : "interrupt"));
7130 else
7131 sorry ("%s instructions aren't allowed in function with "
7132 "no_caller_saved_registers attribute", isa);
7133 /* Don't issue the same error twice. */
7134 cfun->machine->func_type = TYPE_NORMAL;
7135 cfun->machine->no_caller_saved_registers = false;
7139 prev_no_caller_saved_registers
7140 = cfun->machine->no_caller_saved_registers;
7144 /* Return true if this goes in large data/bss. */
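/* Illustrative example (assuming the usual defaults): with -mcmodel=medium
   and -mlarge-data-threshold at its default of 65536, a global array
   larger than 64 KiB is treated as large data and ends up in .ldata or
   .lbss rather than .data or .bss.  */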
7146 static bool
7147 ix86_in_large_data_p (tree exp)
7149 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7150 return false;
7152 if (exp == NULL_TREE)
7153 return false;
7155 /* Functions are never large data. */
7156 if (TREE_CODE (exp) == FUNCTION_DECL)
7157 return false;
7159 /* Automatic variables are never large data. */
7160 if (VAR_P (exp) && !is_global_var (exp))
7161 return false;
7163 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7165 const char *section = DECL_SECTION_NAME (exp);
7166 if (strcmp (section, ".ldata") == 0
7167 || strcmp (section, ".lbss") == 0)
7168 return true;
7169 return false;
7171 else
7173 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7175 /* If this is an incomplete type with size 0, then we can't put it
7176 in data because it might be too big when completed. Also,
7177 int_size_in_bytes returns -1 if size can vary or is larger than
7178 an integer, in which case it is also safer to assume that it goes in
7179 large data. */
7180 if (size <= 0 || size > ix86_section_threshold)
7181 return true;
7184 return false;
7187 /* i386-specific section flag to mark large sections. */
7188 #define SECTION_LARGE SECTION_MACH_DEP
7190 /* Switch to the appropriate section for output of DECL.
7191 DECL is either a `VAR_DECL' node or a constant of some sort.
7192 RELOC indicates whether forming the initial value of DECL requires
7193 link-time relocations. */
7195 ATTRIBUTE_UNUSED static section *
7196 x86_64_elf_select_section (tree decl, int reloc,
7197 unsigned HOST_WIDE_INT align)
7199 if (ix86_in_large_data_p (decl))
7201 const char *sname = NULL;
7202 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7203 switch (categorize_decl_for_section (decl, reloc))
7205 case SECCAT_DATA:
7206 sname = ".ldata";
7207 break;
7208 case SECCAT_DATA_REL:
7209 sname = ".ldata.rel";
7210 break;
7211 case SECCAT_DATA_REL_LOCAL:
7212 sname = ".ldata.rel.local";
7213 break;
7214 case SECCAT_DATA_REL_RO:
7215 sname = ".ldata.rel.ro";
7216 break;
7217 case SECCAT_DATA_REL_RO_LOCAL:
7218 sname = ".ldata.rel.ro.local";
7219 break;
7220 case SECCAT_BSS:
7221 sname = ".lbss";
7222 flags |= SECTION_BSS;
7223 break;
7224 case SECCAT_RODATA:
7225 case SECCAT_RODATA_MERGE_STR:
7226 case SECCAT_RODATA_MERGE_STR_INIT:
7227 case SECCAT_RODATA_MERGE_CONST:
7228 sname = ".lrodata";
7229 flags &= ~SECTION_WRITE;
7230 break;
7231 case SECCAT_SRODATA:
7232 case SECCAT_SDATA:
7233 case SECCAT_SBSS:
7234 gcc_unreachable ();
7235 case SECCAT_TEXT:
7236 case SECCAT_TDATA:
7237 case SECCAT_TBSS:
7238 /* We don't split these for the medium model. Place them into
7239 default sections and hope for the best. */
7240 break;
7242 if (sname)
7244 /* We might get called with string constants, but get_named_section
7245 doesn't like them as they are not DECLs. Also, we need to set
7246 flags in that case. */
7247 if (!DECL_P (decl))
7248 return get_section (sname, flags, NULL);
7249 return get_named_section (decl, sname, reloc);
7252 return default_elf_select_section (decl, reloc, align);
7255 /* Select a set of attributes for section NAME based on the properties
7256 of DECL and whether or not RELOC indicates that DECL's initializer
7257 might contain runtime relocations. */
7259 static unsigned int ATTRIBUTE_UNUSED
7260 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7262 unsigned int flags = default_section_type_flags (decl, name, reloc);
7264 if (ix86_in_large_data_p (decl))
7265 flags |= SECTION_LARGE;
7267 if (decl == NULL_TREE
7268 && (strcmp (name, ".ldata.rel.ro") == 0
7269 || strcmp (name, ".ldata.rel.ro.local") == 0))
7270 flags |= SECTION_RELRO;
7272 if (strcmp (name, ".lbss") == 0
7273 || strncmp (name, ".lbss.", 5) == 0
7274 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
7275 flags |= SECTION_BSS;
7277 return flags;
7280 /* Build up a unique section name, expressed as a
7281 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7282 RELOC indicates whether the initial value of DECL requires
7283 link-time relocations. */
7285 static void ATTRIBUTE_UNUSED
7286 x86_64_elf_unique_section (tree decl, int reloc)
7288 if (ix86_in_large_data_p (decl))
7290 const char *prefix = NULL;
7291 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7292 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7294 switch (categorize_decl_for_section (decl, reloc))
7296 case SECCAT_DATA:
7297 case SECCAT_DATA_REL:
7298 case SECCAT_DATA_REL_LOCAL:
7299 case SECCAT_DATA_REL_RO:
7300 case SECCAT_DATA_REL_RO_LOCAL:
7301 prefix = one_only ? ".ld" : ".ldata";
7302 break;
7303 case SECCAT_BSS:
7304 prefix = one_only ? ".lb" : ".lbss";
7305 break;
7306 case SECCAT_RODATA:
7307 case SECCAT_RODATA_MERGE_STR:
7308 case SECCAT_RODATA_MERGE_STR_INIT:
7309 case SECCAT_RODATA_MERGE_CONST:
7310 prefix = one_only ? ".lr" : ".lrodata";
7311 break;
7312 case SECCAT_SRODATA:
7313 case SECCAT_SDATA:
7314 case SECCAT_SBSS:
7315 gcc_unreachable ();
7316 case SECCAT_TEXT:
7317 case SECCAT_TDATA:
7318 case SECCAT_TBSS:
7319 /* We don't split these for the medium model. Place them into
7320 default sections and hope for the best. */
7321 break;
7323 if (prefix)
7325 const char *name, *linkonce;
7326 char *string;
7328 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7329 name = targetm.strip_name_encoding (name);
7331 /* If we're using one_only, then there needs to be a .gnu.linkonce
7332 prefix to the section name. */
7333 linkonce = one_only ? ".gnu.linkonce" : "";
7335 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7337 set_decl_section_name (decl, string);
7338 return;
7341 default_unique_section (decl, reloc);
7344 #ifdef COMMON_ASM_OP
7346 #ifndef LARGECOMM_SECTION_ASM_OP
7347 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7348 #endif
7350 /* This says how to output assembler code to declare an
7351 uninitialized external linkage data object.
7353 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
7354 for large objects. */
7355 void
7356 x86_elf_aligned_decl_common (FILE *file, tree decl,
7357 const char *name, unsigned HOST_WIDE_INT size,
7358 int align)
7360 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7361 && size > (unsigned int)ix86_section_threshold)
7363 switch_to_section (get_named_section (decl, ".lbss", 0));
7364 fputs (LARGECOMM_SECTION_ASM_OP, file);
7366 else
7367 fputs (COMMON_ASM_OP, file);
7368 assemble_name (file, name);
7369 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7370 size, align / BITS_PER_UNIT);
7372 #endif
7374 /* Utility function for targets to use in implementing
7375 ASM_OUTPUT_ALIGNED_BSS. */
7377 void
7378 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7379 unsigned HOST_WIDE_INT size, int align)
7381 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7382 && size > (unsigned int)ix86_section_threshold)
7383 switch_to_section (get_named_section (decl, ".lbss", 0));
7384 else
7385 switch_to_section (bss_section);
7386 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7387 #ifdef ASM_DECLARE_OBJECT_NAME
7388 last_assemble_variable_decl = decl;
7389 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7390 #else
7391 /* Standard thing is just to output a label for the object. */
7392 ASM_OUTPUT_LABEL (file, name);
7393 #endif /* ASM_DECLARE_OBJECT_NAME */
7394 ASM_OUTPUT_SKIP (file, size ? size : 1);
7397 /* Decide whether we must probe the stack before any space allocation
7398 on this target. It's essentially TARGET_STACK_PROBE except when
7399 -fstack-check causes the stack to be already probed differently. */
7401 bool
7402 ix86_target_stack_probe (void)
7404 /* Do not probe the stack twice if static stack checking is enabled. */
7405 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7406 return false;
7408 return TARGET_STACK_PROBE;
7411 /* Decide whether we can make a sibling call to a function. DECL is the
7412 declaration of the function being targeted by the call and EXP is the
7413 CALL_EXPR representing the call. */
7415 static bool
7416 ix86_function_ok_for_sibcall (tree decl, tree exp)
7418 tree type, decl_or_type;
7419 rtx a, b;
7420 bool bind_global = decl && !targetm.binds_local_p (decl);
7422 /* Sibling call isn't OK if there are no caller-saved registers
7423 since all registers must be preserved before return. */
7424 if (cfun->machine->no_caller_saved_registers)
7425 return false;
7427 /* If we are generating position-independent code, we cannot sibcall
7428 optimize direct calls to global functions, as the PLT requires
7429 %ebx be live. (Darwin does not have a PLT.) */
7430 if (!TARGET_MACHO
7431 && !TARGET_64BIT
7432 && flag_pic
7433 && flag_plt
7434 && bind_global)
7435 return false;
7437 /* If we need to align the outgoing stack, then sibcalling would
7438 unalign the stack, which may break the called function. */
7439 if (ix86_minimum_incoming_stack_boundary (true)
7440 < PREFERRED_STACK_BOUNDARY)
7441 return false;
7443 if (decl)
7445 decl_or_type = decl;
7446 type = TREE_TYPE (decl);
7448 else
7450 /* We're looking at the CALL_EXPR, we need the type of the function. */
7451 type = CALL_EXPR_FN (exp); /* pointer expression */
7452 type = TREE_TYPE (type); /* pointer type */
7453 type = TREE_TYPE (type); /* function type */
7454 decl_or_type = type;
7457 /* Check that the return value locations are the same. Like
7458 if we are returning floats on the 80387 register stack, we cannot
7459 make a sibcall from a function that doesn't return a float to a
7460 function that does or, conversely, from a function that does return
7461 a float to a function that doesn't; the necessary stack adjustment
7462 would not be executed. This is also the place we notice
7463 differences in the return value ABI. Note that it is ok for one
7464 of the functions to have void return type as long as the return
7465 value of the other is passed in a register. */
7466 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7467 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7468 cfun->decl, false);
7469 if (STACK_REG_P (a) || STACK_REG_P (b))
7471 if (!rtx_equal_p (a, b))
7472 return false;
7474 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7476 else if (!rtx_equal_p (a, b))
7477 return false;
7479 if (TARGET_64BIT)
7481 /* The SYSV ABI has more call-clobbered registers;
7482 disallow sibcalls from MS to SYSV. */
7483 if (cfun->machine->call_abi == MS_ABI
7484 && ix86_function_type_abi (type) == SYSV_ABI)
7485 return false;
7487 else
7489 /* If this call is indirect, we'll need to be able to use a
7490 call-clobbered register for the address of the target function.
7491 Make sure that all such registers are not used for passing
7492 parameters. Note that DLLIMPORT functions and calls to global
7493 functions via the GOT slot are indirect. */
7494 if (!decl
7495 || (bind_global && flag_pic && !flag_plt)
7496 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
7498 /* Check if regparm >= 3 since arg_reg_available is set to
7499 false if regparm == 0. If regparm is 1 or 2, there is
7500 always a call-clobbered register available.
7502 ??? The symbol indirect call doesn't need a call-clobbered
7503 register. But we don't know if this is a symbol indirect
7504 call or not here. */
7505 if (ix86_function_regparm (type, NULL) >= 3
7506 && !cfun->machine->arg_reg_available)
7507 return false;
7511 /* Otherwise okay. That also includes certain types of indirect calls. */
7512 return true;
7515 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
7516 and "sseregparm" calling convention attributes;
7517 arguments as in struct attribute_spec.handler. */
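/* Illustrative example (not from the original sources): a declaration such
   as
     void __attribute__ ((fastcall, stdcall)) f (void);
   is rejected by the checks below with "fastcall and stdcall attributes
   are not compatible", whereas combining cdecl with regparm or sseregparm
   is accepted.  */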
7519 static tree
7520 ix86_handle_cconv_attribute (tree *node, tree name,
7521 tree args,
7522 int,
7523 bool *no_add_attrs)
7525 if (TREE_CODE (*node) != FUNCTION_TYPE
7526 && TREE_CODE (*node) != METHOD_TYPE
7527 && TREE_CODE (*node) != FIELD_DECL
7528 && TREE_CODE (*node) != TYPE_DECL)
7530 warning (OPT_Wattributes, "%qE attribute only applies to functions",
7531 name);
7532 *no_add_attrs = true;
7533 return NULL_TREE;
7536 /* Can combine regparm with all attributes but fastcall and thiscall. */
7537 if (is_attribute_p ("regparm", name))
7539 tree cst;
7541 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7543 error ("fastcall and regparm attributes are not compatible");
7546 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7548 error ("regparam and thiscall attributes are not compatible");
7551 cst = TREE_VALUE (args);
7552 if (TREE_CODE (cst) != INTEGER_CST)
7554 warning (OPT_Wattributes,
7555 "%qE attribute requires an integer constant argument",
7556 name);
7557 *no_add_attrs = true;
7559 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
7561 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
7562 name, REGPARM_MAX);
7563 *no_add_attrs = true;
7566 return NULL_TREE;
7569 if (TARGET_64BIT)
7571 /* Do not warn when emulating the MS ABI. */
7572 if ((TREE_CODE (*node) != FUNCTION_TYPE
7573 && TREE_CODE (*node) != METHOD_TYPE)
7574 || ix86_function_type_abi (*node) != MS_ABI)
7575 warning (OPT_Wattributes, "%qE attribute ignored",
7576 name);
7577 *no_add_attrs = true;
7578 return NULL_TREE;
7581 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
7582 if (is_attribute_p ("fastcall", name))
7584 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7586 error ("fastcall and cdecl attributes are not compatible");
7588 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7590 error ("fastcall and stdcall attributes are not compatible");
7592 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
7594 error ("fastcall and regparm attributes are not compatible");
7596 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7598 error ("fastcall and thiscall attributes are not compatible");
7602 /* Can combine stdcall with fastcall (redundant), regparm and
7603 sseregparm. */
7604 else if (is_attribute_p ("stdcall", name))
7606 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7608 error ("stdcall and cdecl attributes are not compatible");
7610 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7612 error ("stdcall and fastcall attributes are not compatible");
7614 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7616 error ("stdcall and thiscall attributes are not compatible");
7620 /* Can combine cdecl with regparm and sseregparm. */
7621 else if (is_attribute_p ("cdecl", name))
7623 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7625 error ("stdcall and cdecl attributes are not compatible");
7627 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7629 error ("fastcall and cdecl attributes are not compatible");
7631 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7633 error ("cdecl and thiscall attributes are not compatible");
7636 else if (is_attribute_p ("thiscall", name))
7638 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
7639 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
7640 name);
7641 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7643 error ("stdcall and thiscall attributes are not compatible");
7645 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7647 error ("fastcall and thiscall attributes are not compatible");
7649 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7651 error ("cdecl and thiscall attributes are not compatible");
7655 /* Can combine sseregparm with all attributes. */
7657 return NULL_TREE;
7660 /* The transactional memory builtins are implicitly regparm or fastcall
7661 depending on the ABI. Override the generic do-nothing attribute that
7662 these builtins were declared with, and replace it with one of the two
7663 attributes that we expect elsewhere. */
7665 static tree
7666 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
7667 int flags, bool *no_add_attrs)
7669 tree alt;
7671 /* In no case do we want to add the placeholder attribute. */
7672 *no_add_attrs = true;
7674 /* The 64-bit ABI is unchanged for transactional memory. */
7675 if (TARGET_64BIT)
7676 return NULL_TREE;
7678 /* ??? Is there a better way to validate 32-bit windows? We have
7679 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
7680 if (CHECK_STACK_LIMIT > 0)
7681 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
7682 else
7684 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
7685 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
7687 decl_attributes (node, alt, flags);
7689 return NULL_TREE;
7692 /* This function determines from TYPE the calling-convention. */
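/* Illustrative example (not from the original sources): on 32-bit targets
   a function type carrying __attribute__ ((fastcall)) yields
   IX86_CALLCVT_FASTCALL here, while with -mrtd in effect a prototyped
   non-stdarg function defaults to IX86_CALLCVT_STDCALL; on 64-bit targets
   everything is reported as IX86_CALLCVT_CDECL.  */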
7694 unsigned int
7695 ix86_get_callcvt (const_tree type)
7697 unsigned int ret = 0;
7698 bool is_stdarg;
7699 tree attrs;
7701 if (TARGET_64BIT)
7702 return IX86_CALLCVT_CDECL;
7704 attrs = TYPE_ATTRIBUTES (type);
7705 if (attrs != NULL_TREE)
7707 if (lookup_attribute ("cdecl", attrs))
7708 ret |= IX86_CALLCVT_CDECL;
7709 else if (lookup_attribute ("stdcall", attrs))
7710 ret |= IX86_CALLCVT_STDCALL;
7711 else if (lookup_attribute ("fastcall", attrs))
7712 ret |= IX86_CALLCVT_FASTCALL;
7713 else if (lookup_attribute ("thiscall", attrs))
7714 ret |= IX86_CALLCVT_THISCALL;
7716 /* Regparm isn't allowed for thiscall and fastcall. */
7717 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
7719 if (lookup_attribute ("regparm", attrs))
7720 ret |= IX86_CALLCVT_REGPARM;
7721 if (lookup_attribute ("sseregparm", attrs))
7722 ret |= IX86_CALLCVT_SSEREGPARM;
7725 if (IX86_BASE_CALLCVT(ret) != 0)
7726 return ret;
7729 is_stdarg = stdarg_p (type);
7730 if (TARGET_RTD && !is_stdarg)
7731 return IX86_CALLCVT_STDCALL | ret;
7733 if (ret != 0
7734 || is_stdarg
7735 || TREE_CODE (type) != METHOD_TYPE
7736 || ix86_function_type_abi (type) != MS_ABI)
7737 return IX86_CALLCVT_CDECL | ret;
7739 return IX86_CALLCVT_THISCALL;
7742 /* Return 0 if the attributes for two types are incompatible, 1 if they
7743 are compatible, and 2 if they are nearly compatible (which causes a
7744 warning to be generated). */
7746 static int
7747 ix86_comp_type_attributes (const_tree type1, const_tree type2)
7749 unsigned int ccvt1, ccvt2;
7751 if (TREE_CODE (type1) != FUNCTION_TYPE
7752 && TREE_CODE (type1) != METHOD_TYPE)
7753 return 1;
7755 ccvt1 = ix86_get_callcvt (type1);
7756 ccvt2 = ix86_get_callcvt (type2);
7757 if (ccvt1 != ccvt2)
7758 return 0;
7759 if (ix86_function_regparm (type1, NULL)
7760 != ix86_function_regparm (type2, NULL))
7761 return 0;
7763 return 1;
7766 /* Return the regparm value for a function with the indicated TYPE and DECL.
7767 DECL may be NULL when calling function indirectly
7768 or considering a libcall. */
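/* Illustrative example (assumed declaration, not from the original
   sources): a 32-bit function declared as
     int __attribute__ ((regparm (3))) f (int a, int b, int c);
   makes this function return 3, so A, B and C are passed in EAX, EDX and
   ECX rather than on the stack.  */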
7770 static int
7771 ix86_function_regparm (const_tree type, const_tree decl)
7773 tree attr;
7774 int regparm;
7775 unsigned int ccvt;
7777 if (TARGET_64BIT)
7778 return (ix86_function_type_abi (type) == SYSV_ABI
7779 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
7780 ccvt = ix86_get_callcvt (type);
7781 regparm = ix86_regparm;
7783 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
7785 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
7786 if (attr)
7788 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
7789 return regparm;
7792 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7793 return 2;
7794 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7795 return 1;
7797 /* Use register calling convention for local functions when possible. */
7798 if (decl
7799 && TREE_CODE (decl) == FUNCTION_DECL)
7801 cgraph_node *target = cgraph_node::get (decl);
7802 if (target)
7803 target = target->function_symbol ();
7805 /* Caller and callee must agree on the calling convention, so
7806 checking just the optimize flag here would mean that with
7807 __attribute__((optimize (...))) the caller could use the regparm convention
7808 and the callee not, or vice versa. Instead look at whether the callee
7809 is optimized or not. */
7810 if (target && opt_for_fn (target->decl, optimize)
7811 && !(profile_flag && !flag_fentry))
7813 cgraph_local_info *i = &target->local;
7814 if (i && i->local && i->can_change_signature)
7816 int local_regparm, globals = 0, regno;
7818 /* Make sure no regparm register is taken by a
7819 fixed register variable. */
7820 for (local_regparm = 0; local_regparm < REGPARM_MAX;
7821 local_regparm++)
7822 if (fixed_regs[local_regparm])
7823 break;
7825 /* We don't want to use regparm(3) for nested functions as
7826 these use a static chain pointer in the third argument. */
7827 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
7828 local_regparm = 2;
7830 /* Save a register for the split stack. */
7831 if (local_regparm == 3 && flag_split_stack)
7832 local_regparm = 2;
7834 /* Each fixed register usage increases register pressure,
7835 so fewer registers should be used for argument passing.
7836 This functionality can be overridden by an explicit
7837 regparm value. */
7838 for (regno = AX_REG; regno <= DI_REG; regno++)
7839 if (fixed_regs[regno])
7840 globals++;
7842 local_regparm
7843 = globals < local_regparm ? local_regparm - globals : 0;
7845 if (local_regparm > regparm)
7846 regparm = local_regparm;
7851 return regparm;
7854 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
7855 DFmode (2) arguments in SSE registers for a function with the
7856 indicated TYPE and DECL. DECL may be NULL when calling function
7857 indirectly or considering a libcall. Return -1 if any FP parameter
7858 should be rejected by error. This is used in situations where we imply the SSE
7859 calling convention but the function is called from another function with
7860 SSE disabled. Otherwise return 0. */
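/* Illustrative example (assumed declaration, not from the original
   sources): a 32-bit function declared as
     double __attribute__ ((sseregparm)) f (double x);
   makes this function return 2 when SSE2 is enabled, so X arrives in an
   SSE register; without SSE the attribute is diagnosed below.  */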
7862 static int
7863 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
7865 gcc_assert (!TARGET_64BIT);
7867 /* Use SSE registers to pass SFmode and DFmode arguments if requested
7868 by the sseregparm attribute. */
7869 if (TARGET_SSEREGPARM
7870 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
7872 if (!TARGET_SSE)
7874 if (warn)
7876 if (decl)
7877 error ("calling %qD with attribute sseregparm without "
7878 "SSE/SSE2 enabled", decl);
7879 else
7880 error ("calling %qT with attribute sseregparm without "
7881 "SSE/SSE2 enabled", type);
7883 return 0;
7886 return 2;
7889 if (!decl)
7890 return 0;
7892 cgraph_node *target = cgraph_node::get (decl);
7893 if (target)
7894 target = target->function_symbol ();
7896 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
7897 (and DFmode for SSE2) arguments in SSE registers. */
7898 if (target
7899 /* TARGET_SSE_MATH */
7900 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
7901 && opt_for_fn (target->decl, optimize)
7902 && !(profile_flag && !flag_fentry))
7904 cgraph_local_info *i = &target->local;
7905 if (i && i->local && i->can_change_signature)
7907 /* Refuse to produce wrong code when local function with SSE enabled
7908 is called from SSE disabled function.
7909 FIXME: We need a way to detect these cases cross-ltrans partition
7910 and avoid using SSE calling conventions on local functions called
7911 from function with SSE disabled. For now at least delay the
7912 warning until we know we are going to produce wrong code.
7913 See PR66047 */
7914 if (!TARGET_SSE && warn)
7915 return -1;
7916 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
7917 ->x_ix86_isa_flags) ? 2 : 1;
7921 return 0;
7924 /* Return true if EAX is live at the start of the function. Used by
7925 ix86_expand_prologue to determine if we need special help before
7926 calling allocate_stack_worker. */
7928 static bool
7929 ix86_eax_live_at_start_p (void)
7931 /* Cheat. Don't bother working forward from ix86_function_regparm
7932 to the function type to whether an actual argument is located in
7933 eax. Instead just look at cfg info, which is still close enough
7934 to correct at this point. This gives false positives for broken
7935 functions that might use uninitialized data that happens to be
7936 allocated in eax, but who cares? */
7937 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
7940 static bool
7941 ix86_keep_aggregate_return_pointer (tree fntype)
7943 tree attr;
7945 if (!TARGET_64BIT)
7947 attr = lookup_attribute ("callee_pop_aggregate_return",
7948 TYPE_ATTRIBUTES (fntype));
7949 if (attr)
7950 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
7952 /* For 32-bit MS-ABI the default is to keep aggregate
7953 return pointer. */
7954 if (ix86_function_type_abi (fntype) == MS_ABI)
7955 return true;
7957 return KEEP_AGGREGATE_RETURN_POINTER != 0;
7960 /* Value is the number of bytes of arguments automatically
7961 popped when returning from a subroutine call.
7962 FUNDECL is the declaration node of the function (as a tree),
7963 FUNTYPE is the data type of the function (as a tree),
7964 or for a library call it is an identifier node for the subroutine name.
7965 SIZE is the number of bytes of arguments passed on the stack.
7967 On the 80386, the RTD insn may be used to pop them if the number
7968 of args is fixed, but if the number is variable then the caller
7969 must pop them all. RTD can't be used for library calls now
7970 because the library is compiled with the Unix compiler.
7971 Use of RTD is a selectable option, since it is incompatible with
7972 standard Unix calling sequences. If the option is not selected,
7973 the caller must always pop the args.
7975 The attribute stdcall is equivalent to RTD on a per module basis. */
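/* Illustrative example (not from the original sources): for a 32-bit
   function declared
     void __attribute__ ((stdcall)) f (int a, int b);
   SIZE is 8 and this hook returns 8, so the callee pops its own
   arguments; for a plain cdecl function it returns 0 and the caller
   adjusts the stack.  */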
7977 static int
7978 ix86_return_pops_args (tree fundecl, tree funtype, int size)
7980 unsigned int ccvt;
7982 /* None of the 64-bit ABIs pop arguments. */
7983 if (TARGET_64BIT)
7984 return 0;
7986 ccvt = ix86_get_callcvt (funtype);
7988 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
7989 | IX86_CALLCVT_THISCALL)) != 0
7990 && ! stdarg_p (funtype))
7991 return size;
7993 /* Lose any fake structure return argument if it is passed on the stack. */
7994 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
7995 && !ix86_keep_aggregate_return_pointer (funtype))
7997 int nregs = ix86_function_regparm (funtype, fundecl);
7998 if (nregs == 0)
7999 return GET_MODE_SIZE (Pmode);
8002 return 0;
8005 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8007 static bool
8008 ix86_legitimate_combined_insn (rtx_insn *insn)
8010 /* Check operand constraints in case hard registers were propagated
8011 into insn pattern. This check prevents combine pass from
8012 generating insn patterns with invalid hard register operands.
8013 These invalid insns can eventually confuse reload to error out
8014 with a spill failure. See also PRs 46829 and 46843. */
8015 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
8017 int i;
8019 extract_insn (insn);
8020 preprocess_constraints (insn);
8022 int n_operands = recog_data.n_operands;
8023 int n_alternatives = recog_data.n_alternatives;
8024 for (i = 0; i < n_operands; i++)
8026 rtx op = recog_data.operand[i];
8027 machine_mode mode = GET_MODE (op);
8028 const operand_alternative *op_alt;
8029 int offset = 0;
8030 bool win;
8031 int j;
8033 /* A unary operator may be accepted by the predicate, but it
8034 is irrelevant for matching constraints. */
8035 if (UNARY_P (op))
8036 op = XEXP (op, 0);
8038 if (SUBREG_P (op))
8040 if (REG_P (SUBREG_REG (op))
8041 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8042 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8043 GET_MODE (SUBREG_REG (op)),
8044 SUBREG_BYTE (op),
8045 GET_MODE (op));
8046 op = SUBREG_REG (op);
8049 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8050 continue;
8052 op_alt = recog_op_alt;
8054 /* Operand has no constraints, anything is OK. */
8055 win = !n_alternatives;
8057 alternative_mask preferred = get_preferred_alternatives (insn);
8058 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8060 if (!TEST_BIT (preferred, j))
8061 continue;
8062 if (op_alt[i].anything_ok
8063 || (op_alt[i].matches != -1
8064 && operands_match_p
8065 (recog_data.operand[i],
8066 recog_data.operand[op_alt[i].matches]))
8067 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8069 win = true;
8070 break;
8074 if (!win)
8075 return false;
8079 return true;
8082 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
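/* Illustrative note: AddressSanitizer maps an application address A to the
   shadow byte at (A >> 3) + offset, so with the LP64 Linux value below the
   shadow of A lives at (A >> 3) + 0x7fff8000.  */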
8084 static unsigned HOST_WIDE_INT
8085 ix86_asan_shadow_offset (void)
8087 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8088 : HOST_WIDE_INT_C (0x7fff8000))
8089 : (HOST_WIDE_INT_1 << 29);
8092 /* Argument support functions. */
8094 /* Return true when register may be used to pass function parameters. */
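/* Illustrative example: under the 64-bit SYSV ABI the integer parameter
   registers checked below are RDI, RSI, RDX, RCX, R8 and R9, while the MS
   ABI uses only RCX, RDX, R8 and R9 (see the parm_regs tables selected in
   the body).  */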
8095 bool
8096 ix86_function_arg_regno_p (int regno)
8098 int i;
8099 enum calling_abi call_abi;
8100 const int *parm_regs;
8102 if (TARGET_MPX && BND_REGNO_P (regno))
8103 return true;
8105 if (!TARGET_64BIT)
8107 if (TARGET_MACHO)
8108 return (regno < REGPARM_MAX
8109 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8110 else
8111 return (regno < REGPARM_MAX
8112 || (TARGET_MMX && MMX_REGNO_P (regno)
8113 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8114 || (TARGET_SSE && SSE_REGNO_P (regno)
8115 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8118 if (TARGET_SSE && SSE_REGNO_P (regno)
8119 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8120 return true;
8122 /* TODO: The function should depend on current function ABI but
8123 builtins.c would need updating then. Therefore we use the
8124 default ABI. */
8125 call_abi = ix86_cfun_abi ();
8127 /* RAX is used as hidden argument to va_arg functions. */
8128 if (call_abi == SYSV_ABI && regno == AX_REG)
8129 return true;
8131 if (call_abi == MS_ABI)
8132 parm_regs = x86_64_ms_abi_int_parameter_registers;
8133 else
8134 parm_regs = x86_64_int_parameter_registers;
8136 for (i = 0; i < (call_abi == MS_ABI
8137 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8138 if (regno == parm_regs[i])
8139 return true;
8140 return false;
8143 /* Return true if we do not know how to pass TYPE solely in registers. */
8145 static bool
8146 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8148 if (must_pass_in_stack_var_size_or_pad (mode, type))
8149 return true;
8151 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8152 The layout_type routine is crafty and tries to trick us into passing
8153 currently unsupported vector types on the stack by using TImode. */
8154 return (!TARGET_64BIT && mode == TImode
8155 && type && TREE_CODE (type) != VECTOR_TYPE);
8158 /* Return the size, in bytes, of the area reserved for arguments passed
8159 in registers for the function represented by FNDECL, depending on the
8160 ABI used. */
8161 int
8162 ix86_reg_parm_stack_space (const_tree fndecl)
8164 enum calling_abi call_abi = SYSV_ABI;
8165 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8166 call_abi = ix86_function_abi (fndecl);
8167 else
8168 call_abi = ix86_function_type_abi (fndecl);
8169 if (TARGET_64BIT && call_abi == MS_ABI)
8170 return 32;
8171 return 0;
8174 /* We add this as a workaround in order to use libc_has_function
8175 hook in i386.md. */
8176 bool
8177 ix86_libc_has_function (enum function_class fn_class)
8179 return targetm.libc_has_function (fn_class);
8182 /* Returns value SYSV_ABI, MS_ABI dependent on fntype,
8183 specifying the call abi used. */
8184 enum calling_abi
8185 ix86_function_type_abi (const_tree fntype)
8187 enum calling_abi abi = ix86_abi;
8189 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8190 return abi;
8192 if (abi == SYSV_ABI
8193 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8195 if (TARGET_X32)
8196 error ("X32 does not support ms_abi attribute");
8198 abi = MS_ABI;
8200 else if (abi == MS_ABI
8201 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8202 abi = SYSV_ABI;
8204 return abi;
8207 static enum calling_abi
8208 ix86_function_abi (const_tree fndecl)
8210 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8213 /* Returns value SYSV_ABI, MS_ABI dependent on cfun,
8214 specifying the call abi used. */
8215 enum calling_abi
8216 ix86_cfun_abi (void)
8218 return cfun ? cfun->machine->call_abi : ix86_abi;
8221 static bool
8222 ix86_function_ms_hook_prologue (const_tree fn)
8224 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8226 if (decl_function_context (fn) != NULL_TREE)
8227 error_at (DECL_SOURCE_LOCATION (fn),
8228 "ms_hook_prologue is not compatible with nested function");
8229 else
8230 return true;
8232 return false;
8235 /* Write the extra assembler code needed to declare a function properly. */
8237 void
8238 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8239 tree decl)
8241 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8243 if (is_ms_hook)
8245 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8246 unsigned int filler_cc = 0xcccccccc;
8248 for (i = 0; i < filler_count; i += 4)
8249 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8252 #ifdef SUBTARGET_ASM_UNWIND_INIT
8253 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8254 #endif
8256 ASM_OUTPUT_LABEL (asm_out_file, fname);
8258 /* Output magic byte marker, if hot-patch attribute is set. */
8259 if (is_ms_hook)
8261 if (TARGET_64BIT)
8263 /* leaq [%rsp + 0], %rsp */
8264 asm_fprintf (asm_out_file, ASM_BYTE
8265 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8267 else
8269 /* movl.s %edi, %edi
8270 push %ebp
8271 movl.s %esp, %ebp */
8272 asm_fprintf (asm_out_file, ASM_BYTE
8273 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8278 /* regclass.c */
8279 extern void init_regs (void);
8281 /* Implementation of the call ABI switching target hook. The call
8282 register sets specific to FNDECL are put into effect. See also
8283 ix86_conditional_register_usage for more details. */
8284 void
8285 ix86_call_abi_override (const_tree fndecl)
8287 cfun->machine->call_abi = ix86_function_abi (fndecl);
8290 /* Return true if a pseudo register should be created and used to hold
8291 the GOT address for PIC code. */
8292 bool
8293 ix86_use_pseudo_pic_reg (void)
8295 if ((TARGET_64BIT
8296 && (ix86_cmodel == CM_SMALL_PIC
8297 || TARGET_PECOFF))
8298 || !flag_pic)
8299 return false;
8300 return true;
8303 /* Initialize large model PIC register. */
8305 static void
8306 ix86_init_large_pic_reg (unsigned int tmp_regno)
8308 rtx_code_label *label;
8309 rtx tmp_reg;
8311 gcc_assert (Pmode == DImode);
8312 label = gen_label_rtx ();
8313 emit_label (label);
8314 LABEL_PRESERVE_P (label) = 1;
8315 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8316 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8317 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8318 label));
8319 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8320 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8321 pic_offset_table_rtx, tmp_reg));
8324 /* Create and initialize PIC register if required. */
8325 static void
8326 ix86_init_pic_reg (void)
8328 edge entry_edge;
8329 rtx_insn *seq;
8331 if (!ix86_use_pseudo_pic_reg ())
8332 return;
8334 start_sequence ();
8336 if (TARGET_64BIT)
8338 if (ix86_cmodel == CM_LARGE_PIC)
8339 ix86_init_large_pic_reg (R11_REG);
8340 else
8341 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8343 else
8345 /* If there is a future mcount call in the function it is more profitable
8346 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8347 rtx reg = crtl->profile
8348 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8349 : pic_offset_table_rtx;
8350 rtx_insn *insn = emit_insn (gen_set_got (reg));
8351 RTX_FRAME_RELATED_P (insn) = 1;
8352 if (crtl->profile)
8353 emit_move_insn (pic_offset_table_rtx, reg);
8354 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8357 seq = get_insns ();
8358 end_sequence ();
8360 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8361 insert_insn_on_edge (seq, entry_edge);
8362 commit_one_edge_insertion (entry_edge);
8365 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8366 for a call to a function whose data type is FNTYPE.
8367 For a library call, FNTYPE is 0. */
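/* Illustrative example (not from the original sources): for a 32-bit
   function type carrying the fastcall attribute, the code below sets
   cum->nregs to 2, so the first two integral arguments are passed in ECX
   and EDX.  */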
8369 void
8370 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8371 tree fntype, /* tree ptr for function decl */
8372 rtx libname, /* SYMBOL_REF of library name or 0 */
8373 tree fndecl,
8374 int caller)
8376 struct cgraph_local_info *i = NULL;
8377 struct cgraph_node *target = NULL;
8379 memset (cum, 0, sizeof (*cum));
8381 if (fndecl)
8383 target = cgraph_node::get (fndecl);
8384 if (target)
8386 target = target->function_symbol ();
8387 i = cgraph_node::local_info (target->decl);
8388 cum->call_abi = ix86_function_abi (target->decl);
8390 else
8391 cum->call_abi = ix86_function_abi (fndecl);
8393 else
8394 cum->call_abi = ix86_function_type_abi (fntype);
8396 cum->caller = caller;
8398 /* Set up the number of registers to use for passing arguments. */
8399 cum->nregs = ix86_regparm;
8400 if (TARGET_64BIT)
8402 cum->nregs = (cum->call_abi == SYSV_ABI
8403 ? X86_64_REGPARM_MAX
8404 : X86_64_MS_REGPARM_MAX);
8406 if (TARGET_SSE)
8408 cum->sse_nregs = SSE_REGPARM_MAX;
8409 if (TARGET_64BIT)
8411 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8412 ? X86_64_SSE_REGPARM_MAX
8413 : X86_64_MS_SSE_REGPARM_MAX);
8416 if (TARGET_MMX)
8417 cum->mmx_nregs = MMX_REGPARM_MAX;
8418 cum->warn_avx512f = true;
8419 cum->warn_avx = true;
8420 cum->warn_sse = true;
8421 cum->warn_mmx = true;
8423 /* Because the type might mismatch between caller and callee, we need to
8424 use the actual type of the function for local calls.
8425 FIXME: cgraph_analyze can be told to actually record if function uses
8426 va_start so for local functions maybe_vaarg can be made more aggressive,
8427 helping K&R code.
8428 FIXME: once the type system is fixed, we won't need this code anymore. */
8429 if (i && i->local && i->can_change_signature)
8430 fntype = TREE_TYPE (target->decl);
8431 cum->stdarg = stdarg_p (fntype);
8432 cum->maybe_vaarg = (fntype
8433 ? (!prototype_p (fntype) || stdarg_p (fntype))
8434 : !libname);
8436 cum->bnd_regno = FIRST_BND_REG;
8437 cum->bnds_in_bt = 0;
8438 cum->force_bnd_pass = 0;
8439 cum->decl = fndecl;
8441 if (!TARGET_64BIT)
8443 /* If there are variable arguments, then we won't pass anything
8444 in registers in 32-bit mode. */
8445 if (stdarg_p (fntype))
8447 cum->nregs = 0;
8448 /* Since in 32-bit, variable arguments are always passed on
8449 stack, there is scratch register available for indirect
8450 sibcall. */
8451 cfun->machine->arg_reg_available = true;
8452 cum->sse_nregs = 0;
8453 cum->mmx_nregs = 0;
8454 cum->warn_avx512f = false;
8455 cum->warn_avx = false;
8456 cum->warn_sse = false;
8457 cum->warn_mmx = false;
8458 return;
8461 /* Use ecx and edx registers if function has fastcall attribute,
8462 else look for regparm information. */
8463 if (fntype)
8465 unsigned int ccvt = ix86_get_callcvt (fntype);
8466 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8468 cum->nregs = 1;
8469 cum->fastcall = 1; /* Same first register as in fastcall. */
8471 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8473 cum->nregs = 2;
8474 cum->fastcall = 1;
8476 else
8477 cum->nregs = ix86_function_regparm (fntype, fndecl);
8480 /* Set up the number of SSE registers used for passing SFmode
8481 and DFmode arguments. Warn for mismatching ABI. */
8482 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8485 cfun->machine->arg_reg_available = (cum->nregs > 0);
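/* Illustrative sketch of the 32-bit register-passing setup above, assuming
   default options:

     int __attribute__ ((fastcall)) f (int a, int b, int c);

   gets cum->nregs = 2 with cum->fastcall set, so A goes in %ecx, B in %edx
   and C on the stack, whereas a plain cdecl function typically gets
   cum->nregs = 0 unless a regparm value is given.  */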
8488 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8489 But in the case of vector types, it is some vector mode.
8491 When we have only some of our vector isa extensions enabled, then there
8492 are some modes for which vector_mode_supported_p is false. For these
8493 modes, the generic vector support in gcc will choose some non-vector mode
8494 in order to implement the type. By computing the natural mode, we'll
8495 select the proper ABI location for the operand and not depend on whatever
8496 the middle-end decides to do with these vector types.
8498 The middle-end can't deal with vector types > 16 bytes. In this
8499 case, we return the original mode and warn about the ABI change if CUM
8500 isn't NULL.
8502 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
8503 available for the function return value. */
8505 static machine_mode
8506 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
8507 bool in_return)
8509 machine_mode mode = TYPE_MODE (type);
8511 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
8513 HOST_WIDE_INT size = int_size_in_bytes (type);
8514 if ((size == 8 || size == 16 || size == 32 || size == 64)
8515 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
8516 && TYPE_VECTOR_SUBPARTS (type) > 1)
8518 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
8520 /* There are no XFmode vector modes. */
8521 if (innermode == XFmode)
8522 return mode;
8524 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
8525 mode = MIN_MODE_VECTOR_FLOAT;
8526 else
8527 mode = MIN_MODE_VECTOR_INT;
8529 /* Get the mode which has this inner mode and number of units. */
8530 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
8531 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
8532 && GET_MODE_INNER (mode) == innermode)
8534 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
8536 static bool warnedavx512f;
8537 static bool warnedavx512f_ret;
8539 if (cum && cum->warn_avx512f && !warnedavx512f)
8541 if (warning (OPT_Wpsabi, "AVX512F vector argument "
8542 "without AVX512F enabled changes the ABI"))
8543 warnedavx512f = true;
8545 else if (in_return && !warnedavx512f_ret)
8547 if (warning (OPT_Wpsabi, "AVX512F vector return "
8548 "without AVX512F enabled changes the ABI"))
8549 warnedavx512f_ret = true;
8552 return TYPE_MODE (type);
8554 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
8556 static bool warnedavx;
8557 static bool warnedavx_ret;
8559 if (cum && cum->warn_avx && !warnedavx)
8561 if (warning (OPT_Wpsabi, "AVX vector argument "
8562 "without AVX enabled changes the ABI"))
8563 warnedavx = true;
8565 else if (in_return && !warnedavx_ret)
8567 if (warning (OPT_Wpsabi, "AVX vector return "
8568 "without AVX enabled changes the ABI"))
8569 warnedavx_ret = true;
8572 return TYPE_MODE (type);
8574 else if (((size == 8 && TARGET_64BIT) || size == 16)
8575 && !TARGET_SSE
8576 && !TARGET_IAMCU)
8578 static bool warnedsse;
8579 static bool warnedsse_ret;
8581 if (cum && cum->warn_sse && !warnedsse)
8583 if (warning (OPT_Wpsabi, "SSE vector argument "
8584 "without SSE enabled changes the ABI"))
8585 warnedsse = true;
8587 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
8589 if (warning (OPT_Wpsabi, "SSE vector return "
8590 "without SSE enabled changes the ABI"))
8591 warnedsse_ret = true;
8594 else if ((size == 8 && !TARGET_64BIT)
8595 && (!cfun
8596 || cfun->machine->func_type == TYPE_NORMAL)
8597 && !TARGET_MMX
8598 && !TARGET_IAMCU)
8600 static bool warnedmmx;
8601 static bool warnedmmx_ret;
8603 if (cum && cum->warn_mmx && !warnedmmx)
8605 if (warning (OPT_Wpsabi, "MMX vector argument "
8606 "without MMX enabled changes the ABI"))
8607 warnedmmx = true;
8609 else if (in_return && !warnedmmx_ret)
8611 if (warning (OPT_Wpsabi, "MMX vector return "
8612 "without MMX enabled changes the ABI"))
8613 warnedmmx_ret = true;
8616 return mode;
8619 gcc_unreachable ();
8623 return mode;
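/* Illustrative sketch, assuming a GNU vector type and default options:

     typedef int v8si __attribute__ ((vector_size (32)));

   is a 32-byte VECTOR_TYPE; with -mavx enabled type_natural_mode returns
   V8SImode, while without AVX the code above keeps TYPE_MODE and may emit
   the -Wpsabi "AVX vector argument ... changes the ABI" warning.  */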
8626 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
8627 this may not agree with the mode that the type system has chosen for the
8628 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
8629 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
8631 static rtx
8632 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
8633 unsigned int regno)
8635 rtx tmp;
8637 if (orig_mode != BLKmode)
8638 tmp = gen_rtx_REG (orig_mode, regno);
8639 else
8641 tmp = gen_rtx_REG (mode, regno);
8642 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
8643 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
8646 return tmp;
8649 /* x86-64 register passing implementation. See the x86-64 ABI for details.
8650 The goal of this code is to classify each eightbyte of an incoming argument
8651 by register class and assign registers accordingly. */
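/* A worked example of the scheme that follows, assuming the SysV x86-64
   psABI: struct S { double d; long l; } spans two eightbytes classified
   SSE(DF) and INTEGER, so when passed as the first argument D ends up in
   %xmm0 and L in %rdi, while an aggregate larger than 16 bytes that is not
   a single vector is classified MEMORY (classify_argument returns 0) and
   goes on the stack.  */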
8653 /* Return the union class of CLASS1 and CLASS2.
8654 See the x86-64 PS ABI for details. */
8656 static enum x86_64_reg_class
8657 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
8659 /* Rule #1: If both classes are equal, this is the resulting class. */
8660 if (class1 == class2)
8661 return class1;
8663 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
8664 the other class. */
8665 if (class1 == X86_64_NO_CLASS)
8666 return class2;
8667 if (class2 == X86_64_NO_CLASS)
8668 return class1;
8670 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
8671 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
8672 return X86_64_MEMORY_CLASS;
8674 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
8675 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
8676 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
8677 return X86_64_INTEGERSI_CLASS;
8678 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
8679 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
8680 return X86_64_INTEGER_CLASS;
8682 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
8683 MEMORY is used. */
8684 if (class1 == X86_64_X87_CLASS
8685 || class1 == X86_64_X87UP_CLASS
8686 || class1 == X86_64_COMPLEX_X87_CLASS
8687 || class2 == X86_64_X87_CLASS
8688 || class2 == X86_64_X87UP_CLASS
8689 || class2 == X86_64_COMPLEX_X87_CLASS)
8690 return X86_64_MEMORY_CLASS;
8692 /* Rule #6: Otherwise class SSE is used. */
8693 return X86_64_SSE_CLASS;
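/* For illustration of the merge rules above: the single eightbyte of
   union U { float f; int i; } classifies as SSESF from F and as INTEGERSI
   from I; rule #4 merges these to INTEGERSI, so the union is passed in a
   general-purpose register.  */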
8696 /* Classify the argument of type TYPE and mode MODE.
8697 CLASSES will be filled by the register class used to pass each word
8698 of the operand. The number of words is returned. In case the parameter
8699 should be passed in memory, 0 is returned. As a special case for zero
8700 sized containers, classes[0] will be NO_CLASS and 1 is returned.
8702 BIT_OFFSET is used internally for handling records; it specifies the
8703 offset in bits modulo 512 to avoid overflow cases.
8705 See the x86-64 PS ABI for details.
8708 static int
8709 classify_argument (machine_mode mode, const_tree type,
8710 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
8712 HOST_WIDE_INT bytes =
8713 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8714 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
8716 /* Variable sized entities are always passed/returned in memory. */
8717 if (bytes < 0)
8718 return 0;
8720 if (mode != VOIDmode
8721 && targetm.calls.must_pass_in_stack (mode, type))
8722 return 0;
8724 if (type && AGGREGATE_TYPE_P (type))
8726 int i;
8727 tree field;
8728 enum x86_64_reg_class subclasses[MAX_CLASSES];
8730 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
8731 if (bytes > 64)
8732 return 0;
8734 for (i = 0; i < words; i++)
8735 classes[i] = X86_64_NO_CLASS;
8737 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
8738 signal the memory class, so handle this as a special case. */
8739 if (!words)
8741 classes[0] = X86_64_NO_CLASS;
8742 return 1;
8745 /* Classify each field of record and merge classes. */
8746 switch (TREE_CODE (type))
8748 case RECORD_TYPE:
8749 /* And now merge the fields of structure. */
8750 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8752 if (TREE_CODE (field) == FIELD_DECL)
8754 int num;
8756 if (TREE_TYPE (field) == error_mark_node)
8757 continue;
8759 /* Bitfields are always classified as integer. Handle them
8760 early, since later code would consider them to be
8761 misaligned integers. */
8762 if (DECL_BIT_FIELD (field))
8764 for (i = (int_bit_position (field)
8765 + (bit_offset % 64)) / 8 / 8;
8766 i < ((int_bit_position (field) + (bit_offset % 64))
8767 + tree_to_shwi (DECL_SIZE (field))
8768 + 63) / 8 / 8; i++)
8769 classes[i] =
8770 merge_classes (X86_64_INTEGER_CLASS,
8771 classes[i]);
8773 else
8775 int pos;
8777 type = TREE_TYPE (field);
8779 /* Flexible array member is ignored. */
8780 if (TYPE_MODE (type) == BLKmode
8781 && TREE_CODE (type) == ARRAY_TYPE
8782 && TYPE_SIZE (type) == NULL_TREE
8783 && TYPE_DOMAIN (type) != NULL_TREE
8784 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
8785 == NULL_TREE))
8787 static bool warned;
8789 if (!warned && warn_psabi)
8791 warned = true;
8792 inform (input_location,
8793 "the ABI of passing struct with"
8794 " a flexible array member has"
8795 " changed in GCC 4.4");
8797 continue;
8799 num = classify_argument (TYPE_MODE (type), type,
8800 subclasses,
8801 (int_bit_position (field)
8802 + bit_offset) % 512);
8803 if (!num)
8804 return 0;
8805 pos = (int_bit_position (field)
8806 + (bit_offset % 64)) / 8 / 8;
8807 for (i = 0; i < num && (i + pos) < words; i++)
8808 classes[i + pos] =
8809 merge_classes (subclasses[i], classes[i + pos]);
8813 break;
8815 case ARRAY_TYPE:
8816 /* Arrays are handled as small records. */
8818 int num;
8819 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
8820 TREE_TYPE (type), subclasses, bit_offset);
8821 if (!num)
8822 return 0;
8824 /* The partial classes are now full classes. */
8825 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
8826 subclasses[0] = X86_64_SSE_CLASS;
8827 if (subclasses[0] == X86_64_INTEGERSI_CLASS
8828 && !((bit_offset % 64) == 0 && bytes == 4))
8829 subclasses[0] = X86_64_INTEGER_CLASS;
8831 for (i = 0; i < words; i++)
8832 classes[i] = subclasses[i % num];
8834 break;
8836 case UNION_TYPE:
8837 case QUAL_UNION_TYPE:
8838 /* Unions are similar to RECORD_TYPE but offset is always 0.
8840 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8842 if (TREE_CODE (field) == FIELD_DECL)
8844 int num;
8846 if (TREE_TYPE (field) == error_mark_node)
8847 continue;
8849 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
8850 TREE_TYPE (field), subclasses,
8851 bit_offset);
8852 if (!num)
8853 return 0;
8854 for (i = 0; i < num && i < words; i++)
8855 classes[i] = merge_classes (subclasses[i], classes[i]);
8858 break;
8860 default:
8861 gcc_unreachable ();
8864 if (words > 2)
8866 /* When size > 16 bytes, if the first one isn't
8867 X86_64_SSE_CLASS or any other ones aren't
8868 X86_64_SSEUP_CLASS, everything should be passed in
8869 memory. */
8870 if (classes[0] != X86_64_SSE_CLASS)
8871 return 0;
8873 for (i = 1; i < words; i++)
8874 if (classes[i] != X86_64_SSEUP_CLASS)
8875 return 0;
8878 /* Final merger cleanup. */
8879 for (i = 0; i < words; i++)
8881 /* If one class is MEMORY, everything should be passed in
8882 memory. */
8883 if (classes[i] == X86_64_MEMORY_CLASS)
8884 return 0;
8886 /* The X86_64_SSEUP_CLASS should be always preceded by
8887 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
8888 if (classes[i] == X86_64_SSEUP_CLASS
8889 && classes[i - 1] != X86_64_SSE_CLASS
8890 && classes[i - 1] != X86_64_SSEUP_CLASS)
8892 /* The first one should never be X86_64_SSEUP_CLASS. */
8893 gcc_assert (i != 0);
8894 classes[i] = X86_64_SSE_CLASS;
8897 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
8898 everything should be passed in memory. */
8899 if (classes[i] == X86_64_X87UP_CLASS
8900 && (classes[i - 1] != X86_64_X87_CLASS))
8902 static bool warned;
8904 /* The first one should never be X86_64_X87UP_CLASS. */
8905 gcc_assert (i != 0);
8906 if (!warned && warn_psabi)
8908 warned = true;
8909 inform (input_location,
8910 "the ABI of passing union with long double"
8911 " has changed in GCC 4.4");
8913 return 0;
8916 return words;
8919 /* Compute alignment needed. We align all types to natural boundaries with
8920 exception of XFmode that is aligned to 64bits. */
8921 if (mode != VOIDmode && mode != BLKmode)
8923 int mode_alignment = GET_MODE_BITSIZE (mode);
8925 if (mode == XFmode)
8926 mode_alignment = 128;
8927 else if (mode == XCmode)
8928 mode_alignment = 256;
8929 if (COMPLEX_MODE_P (mode))
8930 mode_alignment /= 2;
8931 /* Misaligned fields are always returned in memory. */
8932 if (bit_offset % mode_alignment)
8933 return 0;
8936 /* For V1xx modes, just use the base mode. */
8937 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
8938 && GET_MODE_UNIT_SIZE (mode) == bytes)
8939 mode = GET_MODE_INNER (mode);
8941 /* Classification of atomic types. */
8942 switch (mode)
8944 case SDmode:
8945 case DDmode:
8946 classes[0] = X86_64_SSE_CLASS;
8947 return 1;
8948 case TDmode:
8949 classes[0] = X86_64_SSE_CLASS;
8950 classes[1] = X86_64_SSEUP_CLASS;
8951 return 2;
8952 case DImode:
8953 case SImode:
8954 case HImode:
8955 case QImode:
8956 case CSImode:
8957 case CHImode:
8958 case CQImode:
8960 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
8962 /* Analyze last 128 bits only. */
8963 size = (size - 1) & 0x7f;
8965 if (size < 32)
8967 classes[0] = X86_64_INTEGERSI_CLASS;
8968 return 1;
8970 else if (size < 64)
8972 classes[0] = X86_64_INTEGER_CLASS;
8973 return 1;
8975 else if (size < 64+32)
8977 classes[0] = X86_64_INTEGER_CLASS;
8978 classes[1] = X86_64_INTEGERSI_CLASS;
8979 return 2;
8981 else if (size < 64+64)
8983 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8984 return 2;
8986 else
8987 gcc_unreachable ();
8989 case CDImode:
8990 case TImode:
8991 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8992 return 2;
8993 case COImode:
8994 case OImode:
8995 /* OImode shouldn't be used directly. */
8996 gcc_unreachable ();
8997 case CTImode:
8998 return 0;
8999 case SFmode:
9000 if (!(bit_offset % 64))
9001 classes[0] = X86_64_SSESF_CLASS;
9002 else
9003 classes[0] = X86_64_SSE_CLASS;
9004 return 1;
9005 case DFmode:
9006 classes[0] = X86_64_SSEDF_CLASS;
9007 return 1;
9008 case XFmode:
9009 classes[0] = X86_64_X87_CLASS;
9010 classes[1] = X86_64_X87UP_CLASS;
9011 return 2;
9012 case TFmode:
9013 classes[0] = X86_64_SSE_CLASS;
9014 classes[1] = X86_64_SSEUP_CLASS;
9015 return 2;
9016 case SCmode:
9017 classes[0] = X86_64_SSE_CLASS;
9018 if (!(bit_offset % 64))
9019 return 1;
9020 else
9022 static bool warned;
9024 if (!warned && warn_psabi)
9026 warned = true;
9027 inform (input_location,
9028 "the ABI of passing structure with complex float"
9029 " member has changed in GCC 4.4");
9031 classes[1] = X86_64_SSESF_CLASS;
9032 return 2;
9034 case DCmode:
9035 classes[0] = X86_64_SSEDF_CLASS;
9036 classes[1] = X86_64_SSEDF_CLASS;
9037 return 2;
9038 case XCmode:
9039 classes[0] = X86_64_COMPLEX_X87_CLASS;
9040 return 1;
9041 case TCmode:
9042 /* This mode is larger than 16 bytes. */
9043 return 0;
9044 case V8SFmode:
9045 case V8SImode:
9046 case V32QImode:
9047 case V16HImode:
9048 case V4DFmode:
9049 case V4DImode:
9050 classes[0] = X86_64_SSE_CLASS;
9051 classes[1] = X86_64_SSEUP_CLASS;
9052 classes[2] = X86_64_SSEUP_CLASS;
9053 classes[3] = X86_64_SSEUP_CLASS;
9054 return 4;
9055 case V8DFmode:
9056 case V16SFmode:
9057 case V8DImode:
9058 case V16SImode:
9059 case V32HImode:
9060 case V64QImode:
9061 classes[0] = X86_64_SSE_CLASS;
9062 classes[1] = X86_64_SSEUP_CLASS;
9063 classes[2] = X86_64_SSEUP_CLASS;
9064 classes[3] = X86_64_SSEUP_CLASS;
9065 classes[4] = X86_64_SSEUP_CLASS;
9066 classes[5] = X86_64_SSEUP_CLASS;
9067 classes[6] = X86_64_SSEUP_CLASS;
9068 classes[7] = X86_64_SSEUP_CLASS;
9069 return 8;
9070 case V4SFmode:
9071 case V4SImode:
9072 case V16QImode:
9073 case V8HImode:
9074 case V2DFmode:
9075 case V2DImode:
9076 classes[0] = X86_64_SSE_CLASS;
9077 classes[1] = X86_64_SSEUP_CLASS;
9078 return 2;
9079 case V1TImode:
9080 case V1DImode:
9081 case V2SFmode:
9082 case V2SImode:
9083 case V4HImode:
9084 case V8QImode:
9085 classes[0] = X86_64_SSE_CLASS;
9086 return 1;
9087 case BLKmode:
9088 case VOIDmode:
9089 return 0;
9090 default:
9091 gcc_assert (VECTOR_MODE_P (mode));
9093 if (bytes > 16)
9094 return 0;
9096 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9098 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9099 classes[0] = X86_64_INTEGERSI_CLASS;
9100 else
9101 classes[0] = X86_64_INTEGER_CLASS;
9102 classes[1] = X86_64_INTEGER_CLASS;
9103 return 1 + (bytes > 8);
9107 /* Examine the argument and set the number of registers required in each
9108 class. Return true iff the parameter should be passed in memory. */
9110 static bool
9111 examine_argument (machine_mode mode, const_tree type, int in_return,
9112 int *int_nregs, int *sse_nregs)
9114 enum x86_64_reg_class regclass[MAX_CLASSES];
9115 int n = classify_argument (mode, type, regclass, 0);
9117 *int_nregs = 0;
9118 *sse_nregs = 0;
9120 if (!n)
9121 return true;
9122 for (n--; n >= 0; n--)
9123 switch (regclass[n])
9125 case X86_64_INTEGER_CLASS:
9126 case X86_64_INTEGERSI_CLASS:
9127 (*int_nregs)++;
9128 break;
9129 case X86_64_SSE_CLASS:
9130 case X86_64_SSESF_CLASS:
9131 case X86_64_SSEDF_CLASS:
9132 (*sse_nregs)++;
9133 break;
9134 case X86_64_NO_CLASS:
9135 case X86_64_SSEUP_CLASS:
9136 break;
9137 case X86_64_X87_CLASS:
9138 case X86_64_X87UP_CLASS:
9139 case X86_64_COMPLEX_X87_CLASS:
9140 if (!in_return)
9141 return true;
9142 break;
9143 case X86_64_MEMORY_CLASS:
9144 gcc_unreachable ();
9147 return false;
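/* For illustration, assuming the classification above: a long double
   (XFmode) argument classifies as X86_64_X87_CLASS + X86_64_X87UP_CLASS,
   so examine_argument returns true for it as a parameter (pass it in
   memory), while as a return value it returns false and the value comes
   back in %st(0).  */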
9150 /* Construct container for the argument used by GCC interface. See
9151 FUNCTION_ARG for the detailed description. */
9153 static rtx
9154 construct_container (machine_mode mode, machine_mode orig_mode,
9155 const_tree type, int in_return, int nintregs, int nsseregs,
9156 const int *intreg, int sse_regno)
9158 /* The following variables hold the static issued_error state. */
9159 static bool issued_sse_arg_error;
9160 static bool issued_sse_ret_error;
9161 static bool issued_x87_ret_error;
9163 machine_mode tmpmode;
9164 int bytes =
9165 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9166 enum x86_64_reg_class regclass[MAX_CLASSES];
9167 int n;
9168 int i;
9169 int nexps = 0;
9170 int needed_sseregs, needed_intregs;
9171 rtx exp[MAX_CLASSES];
9172 rtx ret;
9174 n = classify_argument (mode, type, regclass, 0);
9175 if (!n)
9176 return NULL;
9177 if (examine_argument (mode, type, in_return, &needed_intregs,
9178 &needed_sseregs))
9179 return NULL;
9180 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9181 return NULL;
9183 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9184 some less clueful developer tries to use floating-point anyway. */
9185 if (needed_sseregs && !TARGET_SSE)
9187 if (in_return)
9189 if (!issued_sse_ret_error)
9191 error ("SSE register return with SSE disabled");
9192 issued_sse_ret_error = true;
9195 else if (!issued_sse_arg_error)
9197 error ("SSE register argument with SSE disabled");
9198 issued_sse_arg_error = true;
9200 return NULL;
9203 /* Likewise, error if the ABI requires us to return values in the
9204 x87 registers and the user specified -mno-80387. */
9205 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9206 for (i = 0; i < n; i++)
9207 if (regclass[i] == X86_64_X87_CLASS
9208 || regclass[i] == X86_64_X87UP_CLASS
9209 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9211 if (!issued_x87_ret_error)
9213 error ("x87 register return with x87 disabled");
9214 issued_x87_ret_error = true;
9216 return NULL;
9219 /* First construct simple cases. Avoid SCmode, since we want to use
9220 single register to pass this type. */
9221 if (n == 1 && mode != SCmode)
9222 switch (regclass[0])
9224 case X86_64_INTEGER_CLASS:
9225 case X86_64_INTEGERSI_CLASS:
9226 return gen_rtx_REG (mode, intreg[0]);
9227 case X86_64_SSE_CLASS:
9228 case X86_64_SSESF_CLASS:
9229 case X86_64_SSEDF_CLASS:
9230 if (mode != BLKmode)
9231 return gen_reg_or_parallel (mode, orig_mode,
9232 SSE_REGNO (sse_regno));
9233 break;
9234 case X86_64_X87_CLASS:
9235 case X86_64_COMPLEX_X87_CLASS:
9236 return gen_rtx_REG (mode, FIRST_STACK_REG);
9237 case X86_64_NO_CLASS:
9238 /* Zero sized array, struct or class. */
9239 return NULL;
9240 default:
9241 gcc_unreachable ();
9243 if (n == 2
9244 && regclass[0] == X86_64_SSE_CLASS
9245 && regclass[1] == X86_64_SSEUP_CLASS
9246 && mode != BLKmode)
9247 return gen_reg_or_parallel (mode, orig_mode,
9248 SSE_REGNO (sse_regno));
9249 if (n == 4
9250 && regclass[0] == X86_64_SSE_CLASS
9251 && regclass[1] == X86_64_SSEUP_CLASS
9252 && regclass[2] == X86_64_SSEUP_CLASS
9253 && regclass[3] == X86_64_SSEUP_CLASS
9254 && mode != BLKmode)
9255 return gen_reg_or_parallel (mode, orig_mode,
9256 SSE_REGNO (sse_regno));
9257 if (n == 8
9258 && regclass[0] == X86_64_SSE_CLASS
9259 && regclass[1] == X86_64_SSEUP_CLASS
9260 && regclass[2] == X86_64_SSEUP_CLASS
9261 && regclass[3] == X86_64_SSEUP_CLASS
9262 && regclass[4] == X86_64_SSEUP_CLASS
9263 && regclass[5] == X86_64_SSEUP_CLASS
9264 && regclass[6] == X86_64_SSEUP_CLASS
9265 && regclass[7] == X86_64_SSEUP_CLASS
9266 && mode != BLKmode)
9267 return gen_reg_or_parallel (mode, orig_mode,
9268 SSE_REGNO (sse_regno));
9269 if (n == 2
9270 && regclass[0] == X86_64_X87_CLASS
9271 && regclass[1] == X86_64_X87UP_CLASS)
9272 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9274 if (n == 2
9275 && regclass[0] == X86_64_INTEGER_CLASS
9276 && regclass[1] == X86_64_INTEGER_CLASS
9277 && (mode == CDImode || mode == TImode)
9278 && intreg[0] + 1 == intreg[1])
9279 return gen_rtx_REG (mode, intreg[0]);
9281 /* Otherwise figure out the entries of the PARALLEL. */
9282 for (i = 0; i < n; i++)
9284 int pos;
9286 switch (regclass[i])
9288 case X86_64_NO_CLASS:
9289 break;
9290 case X86_64_INTEGER_CLASS:
9291 case X86_64_INTEGERSI_CLASS:
9292 /* Merge TImodes on aligned occasions here too. */
9293 if (i * 8 + 8 > bytes)
9294 tmpmode
9295 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9296 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9297 tmpmode = SImode;
9298 else
9299 tmpmode = DImode;
9300 /* We've requested 24 bytes for which we
9301 don't have a mode. Use DImode. */
9302 if (tmpmode == BLKmode)
9303 tmpmode = DImode;
9304 exp [nexps++]
9305 = gen_rtx_EXPR_LIST (VOIDmode,
9306 gen_rtx_REG (tmpmode, *intreg),
9307 GEN_INT (i*8));
9308 intreg++;
9309 break;
9310 case X86_64_SSESF_CLASS:
9311 exp [nexps++]
9312 = gen_rtx_EXPR_LIST (VOIDmode,
9313 gen_rtx_REG (SFmode,
9314 SSE_REGNO (sse_regno)),
9315 GEN_INT (i*8));
9316 sse_regno++;
9317 break;
9318 case X86_64_SSEDF_CLASS:
9319 exp [nexps++]
9320 = gen_rtx_EXPR_LIST (VOIDmode,
9321 gen_rtx_REG (DFmode,
9322 SSE_REGNO (sse_regno)),
9323 GEN_INT (i*8));
9324 sse_regno++;
9325 break;
9326 case X86_64_SSE_CLASS:
9327 pos = i;
9328 switch (n)
9330 case 1:
9331 tmpmode = DImode;
9332 break;
9333 case 2:
9334 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9336 tmpmode = TImode;
9337 i++;
9339 else
9340 tmpmode = DImode;
9341 break;
9342 case 4:
9343 gcc_assert (i == 0
9344 && regclass[1] == X86_64_SSEUP_CLASS
9345 && regclass[2] == X86_64_SSEUP_CLASS
9346 && regclass[3] == X86_64_SSEUP_CLASS);
9347 tmpmode = OImode;
9348 i += 3;
9349 break;
9350 case 8:
9351 gcc_assert (i == 0
9352 && regclass[1] == X86_64_SSEUP_CLASS
9353 && regclass[2] == X86_64_SSEUP_CLASS
9354 && regclass[3] == X86_64_SSEUP_CLASS
9355 && regclass[4] == X86_64_SSEUP_CLASS
9356 && regclass[5] == X86_64_SSEUP_CLASS
9357 && regclass[6] == X86_64_SSEUP_CLASS
9358 && regclass[7] == X86_64_SSEUP_CLASS);
9359 tmpmode = XImode;
9360 i += 7;
9361 break;
9362 default:
9363 gcc_unreachable ();
9365 exp [nexps++]
9366 = gen_rtx_EXPR_LIST (VOIDmode,
9367 gen_rtx_REG (tmpmode,
9368 SSE_REGNO (sse_regno)),
9369 GEN_INT (pos*8));
9370 sse_regno++;
9371 break;
9372 default:
9373 gcc_unreachable ();
9377 /* Empty aligned struct, union or class. */
9378 if (nexps == 0)
9379 return NULL;
9381 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9382 for (i = 0; i < nexps; i++)
9383 XVECEXP (ret, 0, i) = exp [i];
9384 return ret;
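/* Illustrative sketch of the PARALLEL built above, assuming struct
   S { double d; long l; } passed as the first named argument; the result
   is roughly:

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)  (const_int 8))])

   i.e. the SSEDF eightbyte at offset 0 goes in %xmm0 and the INTEGER
   eightbyte at offset 8 goes in %rdi.  */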
9387 /* Update the data in CUM to advance over an argument of mode MODE
9388 and data type TYPE. (TYPE is null for libcalls where that information
9389 may not be available.)
9391 Return the number of integer registers advanced over. */
9393 static int
9394 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9395 const_tree type, HOST_WIDE_INT bytes,
9396 HOST_WIDE_INT words)
9398 int res = 0;
9399 bool error_p = false;
9401 if (TARGET_IAMCU)
9403 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9404 bytes in registers. */
9405 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9406 goto pass_in_reg;
9407 return res;
9410 switch (mode)
9412 default:
9413 break;
9415 case BLKmode:
9416 if (bytes < 0)
9417 break;
9418 /* FALLTHRU */
9420 case DImode:
9421 case SImode:
9422 case HImode:
9423 case QImode:
9424 pass_in_reg:
9425 cum->words += words;
9426 cum->nregs -= words;
9427 cum->regno += words;
9428 if (cum->nregs >= 0)
9429 res = words;
9430 if (cum->nregs <= 0)
9432 cum->nregs = 0;
9433 cfun->machine->arg_reg_available = false;
9434 cum->regno = 0;
9436 break;
9438 case OImode:
9439 /* OImode shouldn't be used directly. */
9440 gcc_unreachable ();
9442 case DFmode:
9443 if (cum->float_in_sse == -1)
9444 error_p = 1;
9445 if (cum->float_in_sse < 2)
9446 break;
9447 /* FALLTHRU */
9448 case SFmode:
9449 if (cum->float_in_sse == -1)
9450 error_p = 1;
9451 if (cum->float_in_sse < 1)
9452 break;
9453 /* FALLTHRU */
9455 case V8SFmode:
9456 case V8SImode:
9457 case V64QImode:
9458 case V32HImode:
9459 case V16SImode:
9460 case V8DImode:
9461 case V16SFmode:
9462 case V8DFmode:
9463 case V32QImode:
9464 case V16HImode:
9465 case V4DFmode:
9466 case V4DImode:
9467 case TImode:
9468 case V16QImode:
9469 case V8HImode:
9470 case V4SImode:
9471 case V2DImode:
9472 case V4SFmode:
9473 case V2DFmode:
9474 if (!type || !AGGREGATE_TYPE_P (type))
9476 cum->sse_words += words;
9477 cum->sse_nregs -= 1;
9478 cum->sse_regno += 1;
9479 if (cum->sse_nregs <= 0)
9481 cum->sse_nregs = 0;
9482 cum->sse_regno = 0;
9485 break;
9487 case V8QImode:
9488 case V4HImode:
9489 case V2SImode:
9490 case V2SFmode:
9491 case V1TImode:
9492 case V1DImode:
9493 if (!type || !AGGREGATE_TYPE_P (type))
9495 cum->mmx_words += words;
9496 cum->mmx_nregs -= 1;
9497 cum->mmx_regno += 1;
9498 if (cum->mmx_nregs <= 0)
9500 cum->mmx_nregs = 0;
9501 cum->mmx_regno = 0;
9504 break;
9506 if (error_p)
9508 cum->float_in_sse = 0;
9509 error ("calling %qD with SSE calling convention without "
9510 "SSE/SSE2 enabled", cum->decl);
9511 sorry ("this is a GCC bug that can be worked around by adding "
9512 "attribute used to function called");
9515 return res;
9518 static int
9519 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
9520 const_tree type, HOST_WIDE_INT words, bool named)
9522 int int_nregs, sse_nregs;
9524 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
9525 if (!named && (VALID_AVX512F_REG_MODE (mode)
9526 || VALID_AVX256_REG_MODE (mode)))
9527 return 0;
9529 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
9530 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
9532 cum->nregs -= int_nregs;
9533 cum->sse_nregs -= sse_nregs;
9534 cum->regno += int_nregs;
9535 cum->sse_regno += sse_nregs;
9536 return int_nregs;
9538 else
9540 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
9541 cum->words = ROUND_UP (cum->words, align);
9542 cum->words += words;
9543 return 0;
9547 static int
9548 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
9549 HOST_WIDE_INT words)
9551 /* Otherwise, this should be passed indirect. */
9552 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
9554 cum->words += words;
9555 if (cum->nregs > 0)
9557 cum->nregs -= 1;
9558 cum->regno += 1;
9559 return 1;
9561 return 0;
9564 /* Update the data in CUM to advance over an argument of mode MODE and
9565 data type TYPE. (TYPE is null for libcalls where that information
9566 may not be available.) */
9568 static void
9569 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
9570 const_tree type, bool named)
9572 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9573 HOST_WIDE_INT bytes, words;
9574 int nregs;
9576 /* The argument of interrupt handler is a special case and is
9577 handled in ix86_function_arg. */
9578 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9579 return;
9581 if (mode == BLKmode)
9582 bytes = int_size_in_bytes (type);
9583 else
9584 bytes = GET_MODE_SIZE (mode);
9585 words = CEIL (bytes, UNITS_PER_WORD);
9587 if (type)
9588 mode = type_natural_mode (type, NULL, false);
9590 if ((type && POINTER_BOUNDS_TYPE_P (type))
9591 || POINTER_BOUNDS_MODE_P (mode))
9594 /* If we pass bounds in BT then just update the remaining bounds count. */
9594 if (cum->bnds_in_bt)
9596 cum->bnds_in_bt--;
9597 return;
9600 /* Update the remaining number of bounds to force. */
9601 if (cum->force_bnd_pass)
9602 cum->force_bnd_pass--;
9604 cum->bnd_regno++;
9606 return;
9609 /* The first arg not going to Bounds Tables resets this counter. */
9610 cum->bnds_in_bt = 0;
9611 /* For unnamed args we always pass bounds to avoid bounds mess when
9612 passed and received types do not match. If bounds do not follow
9613 unnamed arg, still pretend required number of bounds were passed. */
9614 if (cum->force_bnd_pass)
9616 cum->bnd_regno += cum->force_bnd_pass;
9617 cum->force_bnd_pass = 0;
9620 if (TARGET_64BIT)
9622 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9624 if (call_abi == MS_ABI)
9625 nregs = function_arg_advance_ms_64 (cum, bytes, words);
9626 else
9627 nregs = function_arg_advance_64 (cum, mode, type, words, named);
9629 else
9630 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
9632 /* For stdarg we expect bounds to be passed for each value passed
9633 in register. */
9634 if (cum->stdarg)
9635 cum->force_bnd_pass = nregs;
9636 /* For pointers passed in memory we expect bounds passed in Bounds
9637 Table. */
9638 if (!nregs)
9639 cum->bnds_in_bt = chkp_type_bounds_count (type);
9642 /* Define where to put the arguments to a function.
9643 Value is zero to push the argument on the stack,
9644 or a hard register in which to store the argument.
9646 MODE is the argument's machine mode.
9647 TYPE is the data type of the argument (as a tree).
9648 This is null for libcalls where that information may
9649 not be available.
9650 CUM is a variable of type CUMULATIVE_ARGS which gives info about
9651 the preceding args and about the function being called.
9652 NAMED is nonzero if this argument is a named parameter
9653 (otherwise it is an extra parameter matching an ellipsis). */
9655 static rtx
9656 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9657 machine_mode orig_mode, const_tree type,
9658 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
9660 bool error_p = false;
9661 /* Avoid the AL settings for the Unix64 ABI. */
9662 if (mode == VOIDmode)
9663 return constm1_rtx;
9665 if (TARGET_IAMCU)
9667 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9668 bytes in registers. */
9669 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9670 goto pass_in_reg;
9671 return NULL_RTX;
9674 switch (mode)
9676 default:
9677 break;
9679 case BLKmode:
9680 if (bytes < 0)
9681 break;
9682 /* FALLTHRU */
9683 case DImode:
9684 case SImode:
9685 case HImode:
9686 case QImode:
9687 pass_in_reg:
9688 if (words <= cum->nregs)
9690 int regno = cum->regno;
9692 /* Fastcall allocates the first two DWORD (SImode) or
9693 smaller arguments to ECX and EDX if it isn't an
9694 aggregate type. */
9695 if (cum->fastcall)
9697 if (mode == BLKmode
9698 || mode == DImode
9699 || (type && AGGREGATE_TYPE_P (type)))
9700 break;
9702 /* ECX, not EAX, is the first allocated register. */
9703 if (regno == AX_REG)
9704 regno = CX_REG;
9706 return gen_rtx_REG (mode, regno);
9708 break;
9710 case DFmode:
9711 if (cum->float_in_sse == -1)
9712 error_p = 1;
9713 if (cum->float_in_sse < 2)
9714 break;
9715 /* FALLTHRU */
9716 case SFmode:
9717 if (cum->float_in_sse == -1)
9718 error_p = 1;
9719 if (cum->float_in_sse < 1)
9720 break;
9721 /* FALLTHRU */
9722 case TImode:
9723 /* In 32bit, we pass TImode in xmm registers. */
9724 case V16QImode:
9725 case V8HImode:
9726 case V4SImode:
9727 case V2DImode:
9728 case V4SFmode:
9729 case V2DFmode:
9730 if (!type || !AGGREGATE_TYPE_P (type))
9732 if (cum->sse_nregs)
9733 return gen_reg_or_parallel (mode, orig_mode,
9734 cum->sse_regno + FIRST_SSE_REG);
9736 break;
9738 case OImode:
9739 case XImode:
9740 /* OImode and XImode shouldn't be used directly. */
9741 gcc_unreachable ();
9743 case V64QImode:
9744 case V32HImode:
9745 case V16SImode:
9746 case V8DImode:
9747 case V16SFmode:
9748 case V8DFmode:
9749 case V8SFmode:
9750 case V8SImode:
9751 case V32QImode:
9752 case V16HImode:
9753 case V4DFmode:
9754 case V4DImode:
9755 if (!type || !AGGREGATE_TYPE_P (type))
9757 if (cum->sse_nregs)
9758 return gen_reg_or_parallel (mode, orig_mode,
9759 cum->sse_regno + FIRST_SSE_REG);
9761 break;
9763 case V8QImode:
9764 case V4HImode:
9765 case V2SImode:
9766 case V2SFmode:
9767 case V1TImode:
9768 case V1DImode:
9769 if (!type || !AGGREGATE_TYPE_P (type))
9771 if (cum->mmx_nregs)
9772 return gen_reg_or_parallel (mode, orig_mode,
9773 cum->mmx_regno + FIRST_MMX_REG);
9775 break;
9777 if (error_p)
9779 cum->float_in_sse = 0;
9780 error ("calling %qD with SSE calling convention without "
9781 "SSE/SSE2 enabled", cum->decl);
9782 sorry ("this is a GCC bug that can be worked around by adding "
9783 "attribute used to function called");
9786 return NULL_RTX;
9789 static rtx
9790 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9791 machine_mode orig_mode, const_tree type, bool named)
9793 /* Handle a hidden AL argument containing the number of vector
9794 registers used by varargs x86-64 functions. */
9795 if (mode == VOIDmode)
9796 return GEN_INT (cum->maybe_vaarg
9797 ? (cum->sse_nregs < 0
9798 ? X86_64_SSE_REGPARM_MAX
9799 : cum->sse_regno)
9800 : -1);
9802 switch (mode)
9804 default:
9805 break;
9807 case V8SFmode:
9808 case V8SImode:
9809 case V32QImode:
9810 case V16HImode:
9811 case V4DFmode:
9812 case V4DImode:
9813 case V16SFmode:
9814 case V16SImode:
9815 case V64QImode:
9816 case V32HImode:
9817 case V8DFmode:
9818 case V8DImode:
9819 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9820 if (!named)
9821 return NULL;
9822 break;
9825 return construct_container (mode, orig_mode, type, 0, cum->nregs,
9826 cum->sse_nregs,
9827 &x86_64_int_parameter_registers [cum->regno],
9828 cum->sse_regno);
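/* For illustration of the hidden AL argument handled above: for a varargs
   call such as printf ("%f", x) under the SysV ABI, the VOIDmode query
   yields cum->sse_regno (here 1), which the call expander loads into %al
   so the callee knows how many vector registers were used.  */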
9831 static rtx
9832 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9833 machine_mode orig_mode, bool named,
9834 HOST_WIDE_INT bytes)
9836 unsigned int regno;
9838 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
9839 We use the value -2 to specify that the current function call is MSABI. */
9840 if (mode == VOIDmode)
9841 return GEN_INT (-2);
9843 /* If we've run out of registers, it goes on the stack. */
9844 if (cum->nregs == 0)
9845 return NULL_RTX;
9847 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
9849 /* Only floating point modes are passed in anything but integer regs. */
9850 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
9852 if (named)
9853 regno = cum->regno + FIRST_SSE_REG;
9854 else
9856 rtx t1, t2;
9858 /* Unnamed floating parameters are passed in both the
9859 SSE and integer registers. */
9860 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
9861 t2 = gen_rtx_REG (mode, regno);
9862 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
9863 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
9864 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
9867 /* Handle aggregated types passed in register. */
9868 if (orig_mode == BLKmode)
9870 if (bytes > 0 && bytes <= 8)
9871 mode = (bytes > 4 ? DImode : SImode);
9872 if (mode == BLKmode)
9873 mode = DImode;
9876 return gen_reg_or_parallel (mode, orig_mode, regno);
9879 /* Return where to put the arguments to a function.
9880 Return zero to push the argument on the stack, or a hard register in which to store the argument.
9882 MODE is the argument's machine mode. TYPE is the data type of the
9883 argument. It is null for libcalls where that information may not be
9884 available. CUM gives information about the preceding args and about
9885 the function being called. NAMED is nonzero if this argument is a
9886 named parameter (otherwise it is an extra parameter matching an
9887 ellipsis). */
9889 static rtx
9890 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
9891 const_tree type, bool named)
9893 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9894 machine_mode mode = omode;
9895 HOST_WIDE_INT bytes, words;
9896 rtx arg;
9898 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9900 gcc_assert (type != NULL_TREE);
9901 if (POINTER_TYPE_P (type))
9903 /* This is the pointer argument. */
9904 gcc_assert (TYPE_MODE (type) == Pmode);
9905 if (cfun->machine->func_type == TYPE_INTERRUPT)
9906 /* -WORD(AP) in the current frame in interrupt handler. */
9907 arg = plus_constant (Pmode, arg_pointer_rtx,
9908 -UNITS_PER_WORD);
9909 else
9910 /* (AP) in the current frame in exception handler. */
9911 arg = arg_pointer_rtx;
9913 else
9915 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
9916 && TREE_CODE (type) == INTEGER_TYPE
9917 && TYPE_MODE (type) == word_mode);
9918 /* The integer argument is the error code at -WORD(AP) in
9919 the current frame in exception handler. */
9920 arg = gen_rtx_MEM (word_mode,
9921 plus_constant (Pmode,
9922 arg_pointer_rtx,
9923 -UNITS_PER_WORD));
9925 return arg;
9928 /* All pointer bounds arguments are handled separately here. */
9929 if ((type && POINTER_BOUNDS_TYPE_P (type))
9930 || POINTER_BOUNDS_MODE_P (mode))
9932 /* Return NULL if bounds are forced to go in Bounds Table. */
9933 if (cum->bnds_in_bt)
9934 arg = NULL;
9935 /* Return the next available bound reg if any. */
9936 else if (cum->bnd_regno <= LAST_BND_REG)
9937 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
9938 /* Return the next special slot number otherwise. */
9939 else
9940 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
9942 return arg;
9945 if (mode == BLKmode)
9946 bytes = int_size_in_bytes (type);
9947 else
9948 bytes = GET_MODE_SIZE (mode);
9949 words = CEIL (bytes, UNITS_PER_WORD);
9951 /* To simplify the code below, represent vector types with a vector mode
9952 even if MMX/SSE are not active. */
9953 if (type && TREE_CODE (type) == VECTOR_TYPE)
9954 mode = type_natural_mode (type, cum, false);
9956 if (TARGET_64BIT)
9958 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9960 if (call_abi == MS_ABI)
9961 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
9962 else
9963 arg = function_arg_64 (cum, mode, omode, type, named);
9965 else
9966 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
9968 return arg;
9971 /* A C expression that indicates when an argument must be passed by
9972 reference. If nonzero for an argument, a copy of that argument is
9973 made in memory and a pointer to the argument is passed instead of
9974 the argument itself. The pointer is passed in whatever way is
9975 appropriate for passing a pointer to that type. */
9977 static bool
9978 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
9979 const_tree type, bool)
9981 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9983 /* Bounds are never passed by reference. */
9984 if ((type && POINTER_BOUNDS_TYPE_P (type))
9985 || POINTER_BOUNDS_MODE_P (mode))
9986 return false;
9988 if (TARGET_64BIT)
9990 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9992 /* See Windows x64 Software Convention. */
9993 if (call_abi == MS_ABI)
9995 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
9997 if (type)
9999 /* Arrays are passed by reference. */
10000 if (TREE_CODE (type) == ARRAY_TYPE)
10001 return true;
10003 if (RECORD_OR_UNION_TYPE_P (type))
10005 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10006 are passed by reference. */
10007 msize = int_size_in_bytes (type);
10011 /* __m128 is passed by reference. */
10012 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10014 else if (type && int_size_in_bytes (type) == -1)
10015 return true;
10018 return false;
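/* Illustrative behaviour of the check above, assuming the MS x64 ABI:
   struct { int a; } (4 bytes) is passed by value in a register, while a
   16-byte aggregate such as struct { double a, b; } or a variable-sized
   type is copied to memory and a pointer to the copy is passed instead.  */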
10021 /* Return true when TYPE should be 128bit aligned for 32bit argument
10022 passing ABI. XXX: This function is obsolete and is only used for
10023 checking psABI compatibility with previous versions of GCC. */
10025 static bool
10026 ix86_compat_aligned_value_p (const_tree type)
10028 machine_mode mode = TYPE_MODE (type);
10029 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10030 || mode == TDmode
10031 || mode == TFmode
10032 || mode == TCmode)
10033 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10034 return true;
10035 if (TYPE_ALIGN (type) < 128)
10036 return false;
10038 if (AGGREGATE_TYPE_P (type))
10040 /* Walk the aggregates recursively. */
10041 switch (TREE_CODE (type))
10043 case RECORD_TYPE:
10044 case UNION_TYPE:
10045 case QUAL_UNION_TYPE:
10047 tree field;
10049 /* Walk all the structure fields. */
10050 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10052 if (TREE_CODE (field) == FIELD_DECL
10053 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10054 return true;
10056 break;
10059 case ARRAY_TYPE:
10060 /* Just for use if some languages pass arrays by value. */
10061 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10062 return true;
10063 break;
10065 default:
10066 gcc_unreachable ();
10069 return false;
10072 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10073 XXX: This function is obsolete and is only used for checking psABI
10074 compatibility with previous versions of GCC. */
10076 static unsigned int
10077 ix86_compat_function_arg_boundary (machine_mode mode,
10078 const_tree type, unsigned int align)
10080 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10081 natural boundaries. */
10082 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10084 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10085 make an exception for SSE modes since these require 128bit
10086 alignment.
10088 The handling here differs from field_alignment. ICC aligns MMX
10089 arguments to 4 byte boundaries, while structure fields are aligned
10090 to 8 byte boundaries. */
10091 if (!type)
10093 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10094 align = PARM_BOUNDARY;
10096 else
10098 if (!ix86_compat_aligned_value_p (type))
10099 align = PARM_BOUNDARY;
10102 if (align > BIGGEST_ALIGNMENT)
10103 align = BIGGEST_ALIGNMENT;
10104 return align;
10107 /* Return true when TYPE should be 128bit aligned for 32bit argument
10108 passing ABI. */
10110 static bool
10111 ix86_contains_aligned_value_p (const_tree type)
10113 machine_mode mode = TYPE_MODE (type);
10115 if (mode == XFmode || mode == XCmode)
10116 return false;
10118 if (TYPE_ALIGN (type) < 128)
10119 return false;
10121 if (AGGREGATE_TYPE_P (type))
10123 /* Walk the aggregates recursively. */
10124 switch (TREE_CODE (type))
10126 case RECORD_TYPE:
10127 case UNION_TYPE:
10128 case QUAL_UNION_TYPE:
10130 tree field;
10132 /* Walk all the structure fields. */
10133 for (field = TYPE_FIELDS (type);
10134 field;
10135 field = DECL_CHAIN (field))
10137 if (TREE_CODE (field) == FIELD_DECL
10138 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10139 return true;
10141 break;
10144 case ARRAY_TYPE:
10145 /* Just for use if some languages pass arrays by value. */
10146 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10147 return true;
10148 break;
10150 default:
10151 gcc_unreachable ();
10154 else
10155 return TYPE_ALIGN (type) >= 128;
10157 return false;
10160 /* Gives the alignment boundary, in bits, of an argument with the
10161 specified mode and type. */
10163 static unsigned int
10164 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10166 unsigned int align;
10167 if (type)
10169 /* Since the main variant type is used for the call, convert the type
10170 to its main variant. */
10171 type = TYPE_MAIN_VARIANT (type);
10172 align = TYPE_ALIGN (type);
10174 else
10175 align = GET_MODE_ALIGNMENT (mode);
10176 if (align < PARM_BOUNDARY)
10177 align = PARM_BOUNDARY;
10178 else
10180 static bool warned;
10181 unsigned int saved_align = align;
10183 if (!TARGET_64BIT)
10185 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10186 if (!type)
10188 if (mode == XFmode || mode == XCmode)
10189 align = PARM_BOUNDARY;
10191 else if (!ix86_contains_aligned_value_p (type))
10192 align = PARM_BOUNDARY;
10194 if (align < 128)
10195 align = PARM_BOUNDARY;
10198 if (warn_psabi
10199 && !warned
10200 && align != ix86_compat_function_arg_boundary (mode, type,
10201 saved_align))
10203 warned = true;
10204 inform (input_location,
10205 "The ABI for passing parameters with %d-byte"
10206 " alignment has changed in GCC 4.6",
10207 align / BITS_PER_UNIT);
10211 return align;
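/* For illustration of the 32-bit rules above: a double argument is aligned
   to PARM_BOUNDARY (32 bits), while an __m128 / V4SF argument keeps its
   natural 128-bit alignment because ix86_contains_aligned_value_p is true
   for it.  */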
10214 /* Return true if N is a possible register number of function value. */
10216 static bool
10217 ix86_function_value_regno_p (const unsigned int regno)
10219 switch (regno)
10221 case AX_REG:
10222 return true;
10223 case DX_REG:
10224 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10225 case DI_REG:
10226 case SI_REG:
10227 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10229 case BND0_REG:
10230 case BND1_REG:
10231 return chkp_function_instrumented_p (current_function_decl);
10233 /* Complex values are returned in %st(0)/%st(1) pair. */
10234 case ST0_REG:
10235 case ST1_REG:
10236 /* TODO: The function should depend on current function ABI but
10237 builtins.c would need updating then. Therefore we use the
10238 default ABI. */
10239 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10240 return false;
10241 return TARGET_FLOAT_RETURNS_IN_80387;
10243 /* Complex values are returned in %xmm0/%xmm1 pair. */
10244 case XMM0_REG:
10245 case XMM1_REG:
10246 return TARGET_SSE;
10248 case MM0_REG:
10249 if (TARGET_MACHO || TARGET_64BIT)
10250 return false;
10251 return TARGET_MMX;
10254 return false;
10257 /* Define how to find the value returned by a function.
10258 VALTYPE is the data type of the value (as a tree).
10259 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10260 otherwise, FUNC is 0. */
10262 static rtx
10263 function_value_32 (machine_mode orig_mode, machine_mode mode,
10264 const_tree fntype, const_tree fn)
10266 unsigned int regno;
10268 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10269 we normally prevent this case when mmx is not available. However
10270 some ABIs may require the result to be returned like DImode. */
10271 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10272 regno = FIRST_MMX_REG;
10274 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10275 we prevent this case when sse is not available. However some ABIs
10276 may require the result to be returned like integer TImode. */
10277 else if (mode == TImode
10278 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10279 regno = FIRST_SSE_REG;
10281 /* 32-byte vector modes in %ymm0. */
10282 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10283 regno = FIRST_SSE_REG;
10285 /* 64-byte vector modes in %zmm0. */
10286 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10287 regno = FIRST_SSE_REG;
10289 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10290 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10291 regno = FIRST_FLOAT_REG;
10292 else
10293 /* Most things go in %eax. */
10294 regno = AX_REG;
10296 /* Override FP return register with %xmm0 for local functions when
10297 SSE math is enabled or for functions with sseregparm attribute. */
10298 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10300 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10301 if (sse_level == -1)
10303 error ("calling %qD with SSE caling convention without "
10304 "SSE/SSE2 enabled", fn);
10305 sorry ("this is a GCC bug that can be worked around by adding "
10306 "attribute used to function called");
10308 else if ((sse_level >= 1 && mode == SFmode)
10309 || (sse_level == 2 && mode == DFmode))
10310 regno = FIRST_SSE_REG;
10313 /* OImode shouldn't be used directly. */
10314 gcc_assert (mode != OImode);
10316 return gen_rtx_REG (orig_mode, regno);
10319 static rtx
10320 function_value_64 (machine_mode orig_mode, machine_mode mode,
10321 const_tree valtype)
10323 rtx ret;
10325 /* Handle libcalls, which don't provide a type node. */
10326 if (valtype == NULL)
10328 unsigned int regno;
10330 switch (mode)
10332 case SFmode:
10333 case SCmode:
10334 case DFmode:
10335 case DCmode:
10336 case TFmode:
10337 case SDmode:
10338 case DDmode:
10339 case TDmode:
10340 regno = FIRST_SSE_REG;
10341 break;
10342 case XFmode:
10343 case XCmode:
10344 regno = FIRST_FLOAT_REG;
10345 break;
10346 case TCmode:
10347 return NULL;
10348 default:
10349 regno = AX_REG;
10352 return gen_rtx_REG (mode, regno);
10354 else if (POINTER_TYPE_P (valtype))
10356 /* Pointers are always returned in word_mode. */
10357 mode = word_mode;
10360 ret = construct_container (mode, orig_mode, valtype, 1,
10361 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10362 x86_64_int_return_registers, 0);
10364 /* For zero sized structures, construct_container returns NULL, but we
10365 need to keep the rest of the compiler happy by returning a meaningful value. */
10366 if (!ret)
10367 ret = gen_rtx_REG (orig_mode, AX_REG);
10369 return ret;
10372 static rtx
10373 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10374 const_tree valtype)
10376 unsigned int regno = AX_REG;
10378 if (TARGET_SSE)
10380 switch (GET_MODE_SIZE (mode))
10382 case 16:
10383 if (valtype != NULL_TREE
10384 && !VECTOR_INTEGER_TYPE_P (valtype)
10385 && !VECTOR_INTEGER_TYPE_P (valtype)
10386 && !INTEGRAL_TYPE_P (valtype)
10387 && !VECTOR_FLOAT_TYPE_P (valtype))
10388 break;
10389 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10390 && !COMPLEX_MODE_P (mode))
10391 regno = FIRST_SSE_REG;
10392 break;
10393 case 8:
10394 case 4:
10395 if (mode == SFmode || mode == DFmode)
10396 regno = FIRST_SSE_REG;
10397 break;
10398 default:
10399 break;
10402 return gen_rtx_REG (orig_mode, regno);
10405 static rtx
10406 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10407 machine_mode orig_mode, machine_mode mode)
10409 const_tree fn, fntype;
10411 fn = NULL_TREE;
10412 if (fntype_or_decl && DECL_P (fntype_or_decl))
10413 fn = fntype_or_decl;
10414 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10416 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10417 || POINTER_BOUNDS_MODE_P (mode))
10418 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10419 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10420 return function_value_ms_64 (orig_mode, mode, valtype);
10421 else if (TARGET_64BIT)
10422 return function_value_64 (orig_mode, mode, valtype);
10423 else
10424 return function_value_32 (orig_mode, mode, fntype, fn);
10427 static rtx
10428 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10430 machine_mode mode, orig_mode;
10432 orig_mode = TYPE_MODE (valtype);
10433 mode = type_natural_mode (valtype, NULL, true);
10434 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10437 /* Return an RTX representing a place where a function returns
10438 or receives pointer bounds, or NULL if no bounds are returned.
10440 VALTYPE is a data type of a value returned by the function.
10442 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10443 or FUNCTION_TYPE of the function.
10445 If OUTGOING is false, return a place in which the caller will
10446 see the return value. Otherwise, return a place where a
10447 function returns a value. */
10449 static rtx
10450 ix86_function_value_bounds (const_tree valtype,
10451 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10452 bool outgoing ATTRIBUTE_UNUSED)
10454 rtx res = NULL_RTX;
10456 if (BOUNDED_TYPE_P (valtype))
10457 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10458 else if (chkp_type_has_pointer (valtype))
10460 bitmap slots;
10461 rtx bounds[2];
10462 bitmap_iterator bi;
10463 unsigned i, bnd_no = 0;
10465 bitmap_obstack_initialize (NULL);
10466 slots = BITMAP_ALLOC (NULL);
10467 chkp_find_bound_slots (valtype, slots);
10469 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10471 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10472 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10473 gcc_assert (bnd_no < 2);
10474 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10477 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10479 BITMAP_FREE (slots);
10480 bitmap_obstack_release (NULL);
10482 else
10483 res = NULL_RTX;
10485 return res;
10488 /* Pointer function arguments and return values are promoted to
10489 word_mode for normal functions. */
10491 static machine_mode
10492 ix86_promote_function_mode (const_tree type, machine_mode mode,
10493 int *punsignedp, const_tree fntype,
10494 int for_return)
10496 if (cfun->machine->func_type == TYPE_NORMAL
10497 && type != NULL_TREE
10498 && POINTER_TYPE_P (type))
10500 *punsignedp = POINTERS_EXTEND_UNSIGNED;
10501 return word_mode;
10503 return default_promote_function_mode (type, mode, punsignedp, fntype,
10504 for_return);
10507 /* Return true if a structure, union or array with MODE containing FIELD
10508 should be accessed using BLKmode. */
10510 static bool
10511 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
10513 /* Union with XFmode must be in BLKmode. */
10514 return (mode == XFmode
10515 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
10516 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
10520 ix86_libcall_value (machine_mode mode)
10522 return ix86_function_value_1 (NULL, NULL, mode, mode);
10525 /* Return true iff type is returned in memory. */
10527 static bool
10528 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
10530 #ifdef SUBTARGET_RETURN_IN_MEMORY
10531 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
10532 #else
10533 const machine_mode mode = type_natural_mode (type, NULL, true);
10534 HOST_WIDE_INT size;
10536 if (POINTER_BOUNDS_TYPE_P (type))
10537 return false;
10539 if (TARGET_64BIT)
10541 if (ix86_function_type_abi (fntype) == MS_ABI)
10543 size = int_size_in_bytes (type);
10545 /* __m128 is returned in xmm0. */
10546 if ((!type || VECTOR_INTEGER_TYPE_P (type)
10547 || INTEGRAL_TYPE_P (type)
10548 || VECTOR_FLOAT_TYPE_P (type))
10549 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10550 && !COMPLEX_MODE_P (mode)
10551 && (GET_MODE_SIZE (mode) == 16 || size == 16))
10552 return false;
10554 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
10555 return size != 1 && size != 2 && size != 4 && size != 8;
10557 else
10559 int needed_intregs, needed_sseregs;
10561 return examine_argument (mode, type, 1,
10562 &needed_intregs, &needed_sseregs);
10565 else
10567 size = int_size_in_bytes (type);
10569 /* Intel MCU psABI returns scalars and aggregates no larger than 8
10570 bytes in registers. */
10571 if (TARGET_IAMCU)
10572 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
10574 if (mode == BLKmode)
10575 return true;
10577 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
10578 return false;
10580 if (VECTOR_MODE_P (mode) || mode == TImode)
10582 /* User-created vectors small enough to fit in EAX. */
10583 if (size < 8)
10584 return false;
10586 /* Unless the ABI prescribes otherwise,
10587 MMX/3dNow values are returned in MM0 if available. */
10589 if (size == 8)
10590 return TARGET_VECT8_RETURNS || !TARGET_MMX;
10592 /* SSE values are returned in XMM0 if available. */
10593 if (size == 16)
10594 return !TARGET_SSE;
10596 /* AVX values are returned in YMM0 if available. */
10597 if (size == 32)
10598 return !TARGET_AVX;
10600 /* AVX512F values are returned in ZMM0 if available. */
10601 if (size == 64)
10602 return !TARGET_AVX512F;
10605 if (mode == XFmode)
10606 return false;
10608 if (size > 12)
10609 return true;
10611 /* OImode shouldn't be used directly. */
10612 gcc_assert (mode != OImode);
10614 return false;
10616 #endif
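/* Illustrative examples, assuming default options; the checks above reduce
   to roughly the following for the 64-bit MS ABI branch:
     struct { char c[8]; }   size 8   -> returned in RAX
     struct { char c[12]; }  size 12  -> returned in memory
     __m128                  size 16  -> returned in XMM0
   while on 32-bit targets the cut-off is 12 bytes, with 8/16/32/64-byte
   vectors using MM0/XMM0/YMM0/ZMM0 when the corresponding ISA is enabled.  */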
10620 /* Create the va_list data type. */
10622 static tree
10623 ix86_build_builtin_va_list_64 (void)
10625 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
10627 record = lang_hooks.types.make_type (RECORD_TYPE);
10628 type_decl = build_decl (BUILTINS_LOCATION,
10629 TYPE_DECL, get_identifier ("__va_list_tag"), record);
10631 f_gpr = build_decl (BUILTINS_LOCATION,
10632 FIELD_DECL, get_identifier ("gp_offset"),
10633 unsigned_type_node);
10634 f_fpr = build_decl (BUILTINS_LOCATION,
10635 FIELD_DECL, get_identifier ("fp_offset"),
10636 unsigned_type_node);
10637 f_ovf = build_decl (BUILTINS_LOCATION,
10638 FIELD_DECL, get_identifier ("overflow_arg_area"),
10639 ptr_type_node);
10640 f_sav = build_decl (BUILTINS_LOCATION,
10641 FIELD_DECL, get_identifier ("reg_save_area"),
10642 ptr_type_node);
10644 va_list_gpr_counter_field = f_gpr;
10645 va_list_fpr_counter_field = f_fpr;
10647 DECL_FIELD_CONTEXT (f_gpr) = record;
10648 DECL_FIELD_CONTEXT (f_fpr) = record;
10649 DECL_FIELD_CONTEXT (f_ovf) = record;
10650 DECL_FIELD_CONTEXT (f_sav) = record;
10652 TYPE_STUB_DECL (record) = type_decl;
10653 TYPE_NAME (record) = type_decl;
10654 TYPE_FIELDS (record) = f_gpr;
10655 DECL_CHAIN (f_gpr) = f_fpr;
10656 DECL_CHAIN (f_fpr) = f_ovf;
10657 DECL_CHAIN (f_ovf) = f_sav;
10659 layout_type (record);
10661 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
10662 NULL_TREE, TYPE_ATTRIBUTES (record));
10664 /* The correct type is an array type of one element. */
10665 return build_array_type (record, build_index_type (size_zero_node));
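/* Illustrative sketch of the user-visible layout built above, assuming the
   SysV x86-64 ABI; the type names below are hypothetical, the real record
   is tagged __va_list_tag:  */
typedef struct illustrative_va_list_tag
{
  unsigned int gp_offset;	/* byte offset of next GPR slot in reg_save_area  */
  unsigned int fp_offset;	/* byte offset of next SSE slot in reg_save_area  */
  void *overflow_arg_area;	/* stack arguments beyond the named ones  */
  void *reg_save_area;		/* register save block laid out by the prologue  */
} illustrative_va_list[1];	/* an array type of one element, as returned above  */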
10668 /* Set up the builtin va_list data type and, for 64-bit, the additional
10669 calling-convention-specific va_list data types. */
10671 static tree
10672 ix86_build_builtin_va_list (void)
10674 if (TARGET_64BIT)
10676 /* Initialize ABI specific va_list builtin types.
10678 In lto1, we can encounter two va_list types:
10679 - one as a result of the type-merge across TUs, and
10680 - the one constructed here.
10681 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
10682 a type identity check in canonical_va_list_type based on
10683 TYPE_MAIN_VARIANT (which we used to have) will not work.
10684 Instead, we tag each va_list_type_node with its unique attribute, and
10685 look for the attribute in the type identity check in
10686 canonical_va_list_type.
10688 Tagging sysv_va_list_type_node directly with the attribute is
10689 problematic since it's an array of one record, which will degrade into a
10690 pointer to record when used as parameter (see build_va_arg comments for
10691 an example), dropping the attribute in the process. So we tag the
10692 record instead. */
10694 /* For SYSV_ABI we use an array of one record. */
10695 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
10697 /* For MS_ABI we use plain pointer to argument area. */
10698 tree char_ptr_type = build_pointer_type (char_type_node);
10699 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
10700 TYPE_ATTRIBUTES (char_ptr_type));
10701 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
10703 return ((ix86_abi == MS_ABI)
10704 ? ms_va_list_type_node
10705 : sysv_va_list_type_node);
10707 else
10709 /* For i386 we use plain pointer to argument area. */
10710 return build_pointer_type (char_type_node);
10714 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
10716 static void
10717 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
10719 rtx save_area, mem;
10720 alias_set_type set;
10721 int i, max;
10723 /* GPR size of varargs save area. */
10724 if (cfun->va_list_gpr_size)
10725 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
10726 else
10727 ix86_varargs_gpr_size = 0;
10729 /* FPR size of varargs save area. We don't need it if we don't pass
10730 anything in SSE registers. */
10731 if (TARGET_SSE && cfun->va_list_fpr_size)
10732 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
10733 else
10734 ix86_varargs_fpr_size = 0;
10736 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
10737 return;
10739 save_area = frame_pointer_rtx;
10740 set = get_varargs_alias_set ();
10742 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10743 if (max > X86_64_REGPARM_MAX)
10744 max = X86_64_REGPARM_MAX;
10746 for (i = cum->regno; i < max; i++)
10748 mem = gen_rtx_MEM (word_mode,
10749 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
10750 MEM_NOTRAP_P (mem) = 1;
10751 set_mem_alias_set (mem, set);
10752 emit_move_insn (mem,
10753 gen_rtx_REG (word_mode,
10754 x86_64_int_parameter_registers[i]));
10757 if (ix86_varargs_fpr_size)
10759 machine_mode smode;
10760 rtx_code_label *label;
10761 rtx test;
10763 /* Now emit code to save SSE registers. The AX parameter contains number
10764 of SSE parameter registers used to call this function, though all we
10765 actually check here is the zero/non-zero status. */
10767 label = gen_label_rtx ();
10768 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
10769 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
10770 label));
10772 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
10773 we used movdqa (i.e. TImode) instead? Perhaps even better would
10774 be if we could determine the real mode of the data, via a hook
10775 into pass_stdarg. Ignore all that for now. */
10776 smode = V4SFmode;
10777 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
10778 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
10780 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
10781 if (max > X86_64_SSE_REGPARM_MAX)
10782 max = X86_64_SSE_REGPARM_MAX;
10784 for (i = cum->sse_regno; i < max; ++i)
10786 mem = plus_constant (Pmode, save_area,
10787 i * 16 + ix86_varargs_gpr_size);
10788 mem = gen_rtx_MEM (smode, mem);
10789 MEM_NOTRAP_P (mem) = 1;
10790 set_mem_alias_set (mem, set);
10791 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
10793 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
10796 emit_label (label);
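/* Illustrative sketch, assuming the default X86_64_REGPARM_MAX == 6 and
   X86_64_SSE_REGPARM_MAX == 8: the register save area laid out above is
   48 bytes of GPR slots (rdi, rsi, rdx, rcx, r8, r9) followed by 128 bytes
   of 16-byte SSE slots (xmm0..xmm7); gp_offset and fp_offset in the
   va_list index into this block.  */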
10800 static void
10801 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
10803 alias_set_type set = get_varargs_alias_set ();
10804 int i;
10806 /* Reset to zero, as there might be a sysv va_arg used
10807 before. */
10808 ix86_varargs_gpr_size = 0;
10809 ix86_varargs_fpr_size = 0;
10811 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
10813 rtx reg, mem;
10815 mem = gen_rtx_MEM (Pmode,
10816 plus_constant (Pmode, virtual_incoming_args_rtx,
10817 i * UNITS_PER_WORD));
10818 MEM_NOTRAP_P (mem) = 1;
10819 set_mem_alias_set (mem, set);
10821 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
10822 emit_move_insn (mem, reg);
10826 static void
10827 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10828 tree type, int *, int no_rtl)
10830 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10831 CUMULATIVE_ARGS next_cum;
10832 tree fntype;
10834 /* This argument doesn't appear to be used anymore. Which is good,
10835 because the old code here didn't suppress rtl generation. */
10836 gcc_assert (!no_rtl);
10838 if (!TARGET_64BIT)
10839 return;
10841 fntype = TREE_TYPE (current_function_decl);
10843 /* For varargs, we do not want to skip the dummy va_dcl argument.
10844 For stdargs, we do want to skip the last named argument. */
10845 next_cum = *cum;
10846 if (stdarg_p (fntype))
10847 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10848 true);
10850 if (cum->call_abi == MS_ABI)
10851 setup_incoming_varargs_ms_64 (&next_cum);
10852 else
10853 setup_incoming_varargs_64 (&next_cum);
10856 static void
10857 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
10858 enum machine_mode mode,
10859 tree type,
10860 int *pretend_size ATTRIBUTE_UNUSED,
10861 int no_rtl)
10863 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10864 CUMULATIVE_ARGS next_cum;
10865 tree fntype;
10866 rtx save_area;
10867 int bnd_reg, i, max;
10869 gcc_assert (!no_rtl);
10871 /* Do nothing if we use plain pointer to argument area. */
10872 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
10873 return;
10875 fntype = TREE_TYPE (current_function_decl);
10877 /* For varargs, we do not want to skip the dummy va_dcl argument.
10878 For stdargs, we do want to skip the last named argument. */
10879 next_cum = *cum;
10880 if (stdarg_p (fntype))
10881 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10882 true);
10883 save_area = frame_pointer_rtx;
10885 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10886 if (max > X86_64_REGPARM_MAX)
10887 max = X86_64_REGPARM_MAX;
10889 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
10890 if (chkp_function_instrumented_p (current_function_decl))
10891 for (i = cum->regno; i < max; i++)
10893 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
10894 rtx ptr = gen_rtx_REG (Pmode,
10895 x86_64_int_parameter_registers[i]);
10896 rtx bounds;
10898 if (bnd_reg <= LAST_BND_REG)
10899 bounds = gen_rtx_REG (BNDmode, bnd_reg);
10900 else
10902 rtx ldx_addr =
10903 plus_constant (Pmode, arg_pointer_rtx,
10904 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
10905 bounds = gen_reg_rtx (BNDmode);
10906 emit_insn (BNDmode == BND64mode
10907 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
10908 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
10911 emit_insn (BNDmode == BND64mode
10912 ? gen_bnd64_stx (addr, ptr, bounds)
10913 : gen_bnd32_stx (addr, ptr, bounds));
10915 bnd_reg++;
10920 /* Checks if TYPE is of kind va_list char *. */
10922 static bool
10923 is_va_list_char_pointer (tree type)
10925 tree canonic;
10927 /* For 32-bit it is always true. */
10928 if (!TARGET_64BIT)
10929 return true;
10930 canonic = ix86_canonical_va_list_type (type);
10931 return (canonic == ms_va_list_type_node
10932 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
10935 /* Implement va_start. */
10937 static void
10938 ix86_va_start (tree valist, rtx nextarg)
10940 HOST_WIDE_INT words, n_gpr, n_fpr;
10941 tree f_gpr, f_fpr, f_ovf, f_sav;
10942 tree gpr, fpr, ovf, sav, t;
10943 tree type;
10944 rtx ovf_rtx;
10946 if (flag_split_stack
10947 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10949 unsigned int scratch_regno;
10951 /* When we are splitting the stack, we can't refer to the stack
10952 arguments using internal_arg_pointer, because they may be on
10953 the old stack. The split stack prologue will arrange to
10954 leave a pointer to the old stack arguments in a scratch
10955 register, which we here copy to a pseudo-register. The split
10956 stack prologue can't set the pseudo-register directly because
10957 it (the prologue) runs before any registers have been saved. */
10959 scratch_regno = split_stack_prologue_scratch_regno ();
10960 if (scratch_regno != INVALID_REGNUM)
10962 rtx reg;
10963 rtx_insn *seq;
10965 reg = gen_reg_rtx (Pmode);
10966 cfun->machine->split_stack_varargs_pointer = reg;
10968 start_sequence ();
10969 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
10970 seq = get_insns ();
10971 end_sequence ();
10973 push_topmost_sequence ();
10974 emit_insn_after (seq, entry_of_function ());
10975 pop_topmost_sequence ();
10979 /* Only 64bit target needs something special. */
10980 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10982 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10983 std_expand_builtin_va_start (valist, nextarg);
10984 else
10986 rtx va_r, next;
10988 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
10989 next = expand_binop (ptr_mode, add_optab,
10990 cfun->machine->split_stack_varargs_pointer,
10991 crtl->args.arg_offset_rtx,
10992 NULL_RTX, 0, OPTAB_LIB_WIDEN);
10993 convert_move (va_r, next, 0);
10995 /* Store zero bounds for va_list. */
10996 if (chkp_function_instrumented_p (current_function_decl))
10997 chkp_expand_bounds_reset_for_mem (valist,
10998 make_tree (TREE_TYPE (valist),
10999 next));
11002 return;
11005 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11006 f_fpr = DECL_CHAIN (f_gpr);
11007 f_ovf = DECL_CHAIN (f_fpr);
11008 f_sav = DECL_CHAIN (f_ovf);
11010 valist = build_simple_mem_ref (valist);
11011 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11012 /* The following should be folded into the MEM_REF offset. */
11013 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11014 f_gpr, NULL_TREE);
11015 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11016 f_fpr, NULL_TREE);
11017 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11018 f_ovf, NULL_TREE);
11019 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11020 f_sav, NULL_TREE);
11022 /* Count number of gp and fp argument registers used. */
11023 words = crtl->args.info.words;
11024 n_gpr = crtl->args.info.regno;
11025 n_fpr = crtl->args.info.sse_regno;
11027 if (cfun->va_list_gpr_size)
11029 type = TREE_TYPE (gpr);
11030 t = build2 (MODIFY_EXPR, type,
11031 gpr, build_int_cst (type, n_gpr * 8));
11032 TREE_SIDE_EFFECTS (t) = 1;
11033 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11036 if (TARGET_SSE && cfun->va_list_fpr_size)
11038 type = TREE_TYPE (fpr);
11039 t = build2 (MODIFY_EXPR, type, fpr,
11040 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11041 TREE_SIDE_EFFECTS (t) = 1;
11042 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11045 /* Find the overflow area. */
11046 type = TREE_TYPE (ovf);
11047 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11048 ovf_rtx = crtl->args.internal_arg_pointer;
11049 else
11050 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11051 t = make_tree (type, ovf_rtx);
11052 if (words != 0)
11053 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11055 /* Store zero bounds for overflow area pointer. */
11056 if (chkp_function_instrumented_p (current_function_decl))
11057 chkp_expand_bounds_reset_for_mem (ovf, t);
11059 t = build2 (MODIFY_EXPR, type, ovf, t);
11060 TREE_SIDE_EFFECTS (t) = 1;
11061 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11063 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11065 /* Find the register save area.
11066 The prologue of the function saves it right above the stack frame. */
11067 type = TREE_TYPE (sav);
11068 t = make_tree (type, frame_pointer_rtx);
11069 if (!ix86_varargs_gpr_size)
11070 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11072 /* Store zero bounds for save area pointer. */
11073 if (chkp_function_instrumented_p (current_function_decl))
11074 chkp_expand_bounds_reset_for_mem (sav, t);
11076 t = build2 (MODIFY_EXPR, type, sav, t);
11077 TREE_SIDE_EFFECTS (t) = 1;
11078 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11082 /* Implement va_arg. */
11084 static tree
11085 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11086 gimple_seq *post_p)
11088 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11089 tree f_gpr, f_fpr, f_ovf, f_sav;
11090 tree gpr, fpr, ovf, sav, t;
11091 int size, rsize;
11092 tree lab_false, lab_over = NULL_TREE;
11093 tree addr, t2;
11094 rtx container;
11095 int indirect_p = 0;
11096 tree ptrtype;
11097 machine_mode nat_mode;
11098 unsigned int arg_boundary;
11100 /* Only 64bit target needs something special. */
11101 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11102 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11104 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11105 f_fpr = DECL_CHAIN (f_gpr);
11106 f_ovf = DECL_CHAIN (f_fpr);
11107 f_sav = DECL_CHAIN (f_ovf);
11109 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11110 valist, f_gpr, NULL_TREE);
11112 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11113 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11114 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11116 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11117 if (indirect_p)
11118 type = build_pointer_type (type);
11119 size = int_size_in_bytes (type);
11120 rsize = CEIL (size, UNITS_PER_WORD);
11122 nat_mode = type_natural_mode (type, NULL, false);
11123 switch (nat_mode)
11125 case V8SFmode:
11126 case V8SImode:
11127 case V32QImode:
11128 case V16HImode:
11129 case V4DFmode:
11130 case V4DImode:
11131 case V16SFmode:
11132 case V16SImode:
11133 case V64QImode:
11134 case V32HImode:
11135 case V8DFmode:
11136 case V8DImode:
11137 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
11138 if (!TARGET_64BIT_MS_ABI)
11140 container = NULL;
11141 break;
11143 /* FALLTHRU */
11145 default:
11146 container = construct_container (nat_mode, TYPE_MODE (type),
11147 type, 0, X86_64_REGPARM_MAX,
11148 X86_64_SSE_REGPARM_MAX, intreg,
11150 break;
11153 /* Pull the value out of the saved registers. */
11155 addr = create_tmp_var (ptr_type_node, "addr");
11157 if (container)
11159 int needed_intregs, needed_sseregs;
11160 bool need_temp;
11161 tree int_addr, sse_addr;
11163 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11164 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11166 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11168 need_temp = (!REG_P (container)
11169 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11170 || TYPE_ALIGN (type) > 128));
11172 /* In case we are passing a structure, verify that it is a consecutive block
11173 in the register save area. If not, we need to do moves. */
11174 if (!need_temp && !REG_P (container))
11176 /* Verify that all registers are strictly consecutive */
11177 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11179 int i;
11181 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11183 rtx slot = XVECEXP (container, 0, i);
11184 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11185 || INTVAL (XEXP (slot, 1)) != i * 16)
11186 need_temp = true;
11189 else
11191 int i;
11193 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11195 rtx slot = XVECEXP (container, 0, i);
11196 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11197 || INTVAL (XEXP (slot, 1)) != i * 8)
11198 need_temp = true;
11202 if (!need_temp)
11204 int_addr = addr;
11205 sse_addr = addr;
11207 else
11209 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11210 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11213 /* First ensure that we fit completely in registers. */
11214 if (needed_intregs)
11216 t = build_int_cst (TREE_TYPE (gpr),
11217 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11218 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11219 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11220 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11221 gimplify_and_add (t, pre_p);
11223 if (needed_sseregs)
11225 t = build_int_cst (TREE_TYPE (fpr),
11226 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11227 + X86_64_REGPARM_MAX * 8);
11228 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11229 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11230 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11231 gimplify_and_add (t, pre_p);
11234 /* Compute index to start of area used for integer regs. */
11235 if (needed_intregs)
11237 /* int_addr = gpr + sav; */
11238 t = fold_build_pointer_plus (sav, gpr);
11239 gimplify_assign (int_addr, t, pre_p);
11241 if (needed_sseregs)
11243 /* sse_addr = fpr + sav; */
11244 t = fold_build_pointer_plus (sav, fpr);
11245 gimplify_assign (sse_addr, t, pre_p);
11247 if (need_temp)
11249 int i, prev_size = 0;
11250 tree temp = create_tmp_var (type, "va_arg_tmp");
11252 /* addr = &temp; */
11253 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11254 gimplify_assign (addr, t, pre_p);
11256 for (i = 0; i < XVECLEN (container, 0); i++)
11258 rtx slot = XVECEXP (container, 0, i);
11259 rtx reg = XEXP (slot, 0);
11260 machine_mode mode = GET_MODE (reg);
11261 tree piece_type;
11262 tree addr_type;
11263 tree daddr_type;
11264 tree src_addr, src;
11265 int src_offset;
11266 tree dest_addr, dest;
11267 int cur_size = GET_MODE_SIZE (mode);
11269 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11270 prev_size = INTVAL (XEXP (slot, 1));
11271 if (prev_size + cur_size > size)
11273 cur_size = size - prev_size;
11274 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11275 if (mode == BLKmode)
11276 mode = QImode;
11278 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11279 if (mode == GET_MODE (reg))
11280 addr_type = build_pointer_type (piece_type);
11281 else
11282 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11283 true);
11284 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11285 true);
11287 if (SSE_REGNO_P (REGNO (reg)))
11289 src_addr = sse_addr;
11290 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11292 else
11294 src_addr = int_addr;
11295 src_offset = REGNO (reg) * 8;
11297 src_addr = fold_convert (addr_type, src_addr);
11298 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11300 dest_addr = fold_convert (daddr_type, addr);
11301 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11302 if (cur_size == GET_MODE_SIZE (mode))
11304 src = build_va_arg_indirect_ref (src_addr);
11305 dest = build_va_arg_indirect_ref (dest_addr);
11307 gimplify_assign (dest, src, pre_p);
11309 else
11311 tree copy
11312 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11313 3, dest_addr, src_addr,
11314 size_int (cur_size));
11315 gimplify_and_add (copy, pre_p);
11317 prev_size += cur_size;
11321 if (needed_intregs)
11323 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11324 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11325 gimplify_assign (gpr, t, pre_p);
11328 if (needed_sseregs)
11330 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11331 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11332 gimplify_assign (unshare_expr (fpr), t, pre_p);
11335 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11337 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11340 /* ... otherwise out of the overflow area. */
11342 /* When the caller aligns a parameter on the stack, a parameter
11343 alignment beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
11344 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
11345 caller. */
11346 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11347 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11348 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11350 /* Care for on-stack alignment if needed. */
11351 if (arg_boundary <= 64 || size == 0)
11352 t = ovf;
11353 else
11355 HOST_WIDE_INT align = arg_boundary / 8;
11356 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11357 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11358 build_int_cst (TREE_TYPE (t), -align));
11361 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11362 gimplify_assign (addr, t, pre_p);
11364 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11365 gimplify_assign (unshare_expr (ovf), t, pre_p);
11367 if (container)
11368 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11370 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11371 addr = fold_convert (ptrtype, addr);
11373 if (indirect_p)
11374 addr = build_va_arg_indirect_ref (addr);
11375 return build_va_arg_indirect_ref (addr);
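/* Illustrative sketch, assuming a plain "va_arg (ap, int)" and the default
   X86_64_REGPARM_MAX == 6; the helper name and parameters are hypothetical,
   but the control flow mirrors the GIMPLE emitted above:  */
static void *
illustrative_va_arg_int_addr (unsigned int *gp_offset, char *reg_save_area,
			      char **overflow_arg_area)
{
  void *addr;

  if (*gp_offset < 6 * 8)		/* An integer register slot is left.  */
    {
      addr = reg_save_area + *gp_offset;
      *gp_offset += 8;
    }
  else					/* ... otherwise take it off the stack.  */
    {
      addr = *overflow_arg_area;
      *overflow_arg_area += 8;
    }
  return addr;
}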
11378 /* Return true if OPNUM's MEM should be matched
11379 in movabs* patterns. */
11381 bool
11382 ix86_check_movabs (rtx insn, int opnum)
11384 rtx set, mem;
11386 set = PATTERN (insn);
11387 if (GET_CODE (set) == PARALLEL)
11388 set = XVECEXP (set, 0, 0);
11389 gcc_assert (GET_CODE (set) == SET);
11390 mem = XEXP (set, opnum);
11391 while (SUBREG_P (mem))
11392 mem = SUBREG_REG (mem);
11393 gcc_assert (MEM_P (mem));
11394 return volatile_ok || !MEM_VOLATILE_P (mem);
11397 /* Return false if INSN contains a MEM with a non-default address space. */
11398 bool
11399 ix86_check_no_addr_space (rtx insn)
11401 subrtx_var_iterator::array_type array;
11402 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11404 rtx x = *iter;
11405 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11406 return false;
11408 return true;
11411 /* Initialize the table of extra 80387 mathematical constants. */
11413 static void
11414 init_ext_80387_constants (void)
11416 static const char * cst[5] =
11418 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11419 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11420 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11421 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11422 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11424 int i;
11426 for (i = 0; i < 5; i++)
11428 real_from_string (&ext_80387_constants_table[i], cst[i]);
11429 /* Ensure each constant is rounded to XFmode precision. */
11430 real_convert (&ext_80387_constants_table[i],
11431 XFmode, &ext_80387_constants_table[i]);
11434 ext_80387_constants_init = 1;
11437 /* Return non-zero if the constant is something that
11438 can be loaded with a special instruction. */
11441 standard_80387_constant_p (rtx x)
11443 machine_mode mode = GET_MODE (x);
11445 const REAL_VALUE_TYPE *r;
11447 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11448 return -1;
11450 if (x == CONST0_RTX (mode))
11451 return 1;
11452 if (x == CONST1_RTX (mode))
11453 return 2;
11455 r = CONST_DOUBLE_REAL_VALUE (x);
11457 /* For XFmode constants, try to find a special 80387 instruction when
11458 optimizing for size or on those CPUs that benefit from them. */
11459 if (mode == XFmode
11460 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11462 int i;
11464 if (! ext_80387_constants_init)
11465 init_ext_80387_constants ();
11467 for (i = 0; i < 5; i++)
11468 if (real_identical (r, &ext_80387_constants_table[i]))
11469 return i + 3;
11472 /* Load of the constant -0.0 or -1.0 will be split as
11473 fldz;fchs or fld1;fchs sequence. */
11474 if (real_isnegzero (r))
11475 return 8;
11476 if (real_identical (r, &dconstm1))
11477 return 9;
11479 return 0;
11482 /* Return the opcode of the special instruction to be used to load
11483 the constant X. */
11485 const char *
11486 standard_80387_constant_opcode (rtx x)
11488 switch (standard_80387_constant_p (x))
11490 case 1:
11491 return "fldz";
11492 case 2:
11493 return "fld1";
11494 case 3:
11495 return "fldlg2";
11496 case 4:
11497 return "fldln2";
11498 case 5:
11499 return "fldl2e";
11500 case 6:
11501 return "fldl2t";
11502 case 7:
11503 return "fldpi";
11504 case 8:
11505 case 9:
11506 return "#";
11507 default:
11508 gcc_unreachable ();
11512 /* Return the CONST_DOUBLE representing the 80387 constant that is
11513 loaded by the specified special instruction. The argument IDX
11514 matches the return value from standard_80387_constant_p. */
11517 standard_80387_constant_rtx (int idx)
11519 int i;
11521 if (! ext_80387_constants_init)
11522 init_ext_80387_constants ();
11524 switch (idx)
11526 case 3:
11527 case 4:
11528 case 5:
11529 case 6:
11530 case 7:
11531 i = idx - 3;
11532 break;
11534 default:
11535 gcc_unreachable ();
11538 return const_double_from_real_value (ext_80387_constants_table[i],
11539 XFmode);
11542 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
11543 in supported SSE/AVX vector mode. */
11546 standard_sse_constant_p (rtx x, machine_mode pred_mode)
11548 machine_mode mode;
11550 if (!TARGET_SSE)
11551 return 0;
11553 mode = GET_MODE (x);
11555 if (x == const0_rtx || const0_operand (x, mode))
11556 return 1;
11558 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11560 /* VOIDmode integer constant, get mode from the predicate. */
11561 if (mode == VOIDmode)
11562 mode = pred_mode;
11564 switch (GET_MODE_SIZE (mode))
11566 case 64:
11567 if (TARGET_AVX512F)
11568 return 2;
11569 break;
11570 case 32:
11571 if (TARGET_AVX2)
11572 return 2;
11573 break;
11574 case 16:
11575 if (TARGET_SSE2)
11576 return 2;
11577 break;
11578 case 0:
11579 /* VOIDmode */
11580 gcc_unreachable ();
11581 default:
11582 break;
11586 return 0;
11589 /* Return the opcode of the special instruction to be used to load
11590 the constant X. */
11592 const char *
11593 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
11595 machine_mode mode;
11597 gcc_assert (TARGET_SSE);
11599 mode = GET_MODE (x);
11601 if (x == const0_rtx || const0_operand (x, mode))
11603 switch (get_attr_mode (insn))
11605 case MODE_XI:
11606 return "vpxord\t%g0, %g0, %g0";
11607 case MODE_OI:
11608 return (TARGET_AVX512VL
11609 ? "vpxord\t%x0, %x0, %x0"
11610 : "vpxor\t%x0, %x0, %x0");
11611 case MODE_TI:
11612 return (TARGET_AVX512VL
11613 ? "vpxord\t%t0, %t0, %t0"
11614 : "%vpxor\t%0, %d0");
11616 case MODE_V8DF:
11617 return (TARGET_AVX512DQ
11618 ? "vxorpd\t%g0, %g0, %g0"
11619 : "vpxorq\t%g0, %g0, %g0");
11620 case MODE_V4DF:
11621 return "vxorpd\t%x0, %x0, %x0";
11622 case MODE_V2DF:
11623 return "%vxorpd\t%0, %d0";
11625 case MODE_V16SF:
11626 return (TARGET_AVX512DQ
11627 ? "vxorps\t%g0, %g0, %g0"
11628 : "vpxord\t%g0, %g0, %g0");
11629 case MODE_V8SF:
11630 return "vxorps\t%x0, %x0, %x0";
11631 case MODE_V4SF:
11632 return "%vxorps\t%0, %d0";
11634 default:
11635 gcc_unreachable ();
11638 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11640 enum attr_mode insn_mode = get_attr_mode (insn);
11642 switch (insn_mode)
11644 case MODE_XI:
11645 case MODE_V8DF:
11646 case MODE_V16SF:
11647 gcc_assert (TARGET_AVX512F);
11648 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
11650 case MODE_OI:
11651 case MODE_V4DF:
11652 case MODE_V8SF:
11653 gcc_assert (TARGET_AVX2);
11654 /* FALLTHRU */
11655 case MODE_TI:
11656 case MODE_V2DF:
11657 case MODE_V4SF:
11658 gcc_assert (TARGET_SSE2);
11659 return (TARGET_AVX
11660 ? "vpcmpeqd\t%0, %0, %0"
11661 : "pcmpeqd\t%0, %0");
11663 default:
11664 gcc_unreachable ();
11668 gcc_unreachable ();
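/* Illustrative note, assuming a plain SSE2 target without AVX: an all-zeros
   V4SF constant is emitted above as "xorps %xmm0, %xmm0" and an all-ones
   V4SI constant as "pcmpeqd %xmm0, %xmm0", both avoiding a constant-pool
   load.  */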
11671 /* Returns true if INSN can be transformed from a memory load
11672 to a supported FP constant load. */
11674 bool
11675 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
11677 rtx src = find_constant_src (insn);
11679 gcc_assert (REG_P (dst));
11681 if (src == NULL
11682 || (SSE_REGNO_P (REGNO (dst))
11683 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
11684 || (STACK_REGNO_P (REGNO (dst))
11685 && standard_80387_constant_p (src) < 1))
11686 return false;
11688 return true;
11691 /* Returns true if OP contains a symbol reference */
11693 bool
11694 symbolic_reference_mentioned_p (rtx op)
11696 const char *fmt;
11697 int i;
11699 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
11700 return true;
11702 fmt = GET_RTX_FORMAT (GET_CODE (op));
11703 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
11705 if (fmt[i] == 'E')
11707 int j;
11709 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
11710 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
11711 return true;
11714 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
11715 return true;
11718 return false;
11721 /* Return true if it is appropriate to emit `ret' instructions in the
11722 body of a function. Do this only if the epilogue is simple, needing a
11723 couple of insns. Prior to reloading, we can't tell how many registers
11724 must be saved, so return false then. Return false if there is no frame
11725 marker to de-allocate. */
11727 bool
11728 ix86_can_use_return_insn_p (void)
11730 struct ix86_frame frame;
11732 /* Don't use `ret' instruction in interrupt handler. */
11733 if (! reload_completed
11734 || frame_pointer_needed
11735 || cfun->machine->func_type != TYPE_NORMAL)
11736 return 0;
11738 /* Don't allow more than 32k pop, since that's all we can do
11739 with one instruction. */
11740 if (crtl->args.pops_args && crtl->args.size >= 32768)
11741 return 0;
11743 ix86_compute_frame_layout (&frame);
11744 return (frame.stack_pointer_offset == UNITS_PER_WORD
11745 && (frame.nregs + frame.nsseregs) == 0);
11748 /* Value should be nonzero if functions must have frame pointers.
11749 Zero means the frame pointer need not be set up (and parms may
11750 be accessed via the stack pointer) in functions that seem suitable. */
11752 static bool
11753 ix86_frame_pointer_required (void)
11755 /* If we accessed previous frames, then the generated code expects
11756 to be able to access the saved ebp value in our frame. */
11757 if (cfun->machine->accesses_prev_frame)
11758 return true;
11760 /* Several x86 OSes need a frame pointer for other reasons,
11761 usually pertaining to setjmp. */
11762 if (SUBTARGET_FRAME_POINTER_REQUIRED)
11763 return true;
11765 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
11766 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
11767 return true;
11769 /* Win64 SEH, very large frames need a frame-pointer as maximum stack
11770 allocation is 4GB. */
11771 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
11772 return true;
11774 /* SSE saves require frame-pointer when stack is misaligned. */
11775 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
11776 return true;
11778 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
11779 turns off the frame pointer by default. Turn it back on now if
11780 we've not got a leaf function. */
11781 if (TARGET_OMIT_LEAF_FRAME_POINTER
11782 && (!crtl->is_leaf
11783 || ix86_current_function_calls_tls_descriptor))
11784 return true;
11786 if (crtl->profile && !flag_fentry)
11787 return true;
11789 return false;
11792 /* Record that the current function accesses previous call frames. */
11794 void
11795 ix86_setup_frame_addresses (void)
11797 cfun->machine->accesses_prev_frame = 1;
11800 #ifndef USE_HIDDEN_LINKONCE
11801 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
11802 # define USE_HIDDEN_LINKONCE 1
11803 # else
11804 # define USE_HIDDEN_LINKONCE 0
11805 # endif
11806 #endif
11808 static int pic_labels_used;
11810 /* Fills in the label name that should be used for a pc thunk for
11811 the given register. */
11813 static void
11814 get_pc_thunk_name (char name[32], unsigned int regno)
11816 gcc_assert (!TARGET_64BIT);
11818 if (USE_HIDDEN_LINKONCE)
11819 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11820 else
11821 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11825 /* This function generates code for -fpic that loads %ebx with
11826 the return address of the caller and then returns. */
11828 static void
11829 ix86_code_end (void)
11831 rtx xops[2];
11832 int regno;
11834 for (regno = AX_REG; regno <= SP_REG; regno++)
11836 char name[32];
11837 tree decl;
11839 if (!(pic_labels_used & (1 << regno)))
11840 continue;
11842 get_pc_thunk_name (name, regno);
11844 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11845 get_identifier (name),
11846 build_function_type_list (void_type_node, NULL_TREE));
11847 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11848 NULL_TREE, void_type_node);
11849 TREE_PUBLIC (decl) = 1;
11850 TREE_STATIC (decl) = 1;
11851 DECL_IGNORED_P (decl) = 1;
11853 #if TARGET_MACHO
11854 if (TARGET_MACHO)
11856 switch_to_section (darwin_sections[text_coal_section]);
11857 fputs ("\t.weak_definition\t", asm_out_file);
11858 assemble_name (asm_out_file, name);
11859 fputs ("\n\t.private_extern\t", asm_out_file);
11860 assemble_name (asm_out_file, name);
11861 putc ('\n', asm_out_file);
11862 ASM_OUTPUT_LABEL (asm_out_file, name);
11863 DECL_WEAK (decl) = 1;
11865 else
11866 #endif
11867 if (USE_HIDDEN_LINKONCE)
11869 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11871 targetm.asm_out.unique_section (decl, 0);
11872 switch_to_section (get_named_section (decl, NULL, 0));
11874 targetm.asm_out.globalize_label (asm_out_file, name);
11875 fputs ("\t.hidden\t", asm_out_file);
11876 assemble_name (asm_out_file, name);
11877 putc ('\n', asm_out_file);
11878 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11880 else
11882 switch_to_section (text_section);
11883 ASM_OUTPUT_LABEL (asm_out_file, name);
11886 DECL_INITIAL (decl) = make_node (BLOCK);
11887 current_function_decl = decl;
11888 allocate_struct_function (decl, false);
11889 init_function_start (decl);
11890 first_function_block_is_cold = false;
11891 /* Make sure unwind info is emitted for the thunk if needed. */
11892 final_start_function (emit_barrier (), asm_out_file, 1);
11894 /* Pad stack IP move with 4 instructions (two NOPs count
11895 as one instruction). */
11896 if (TARGET_PAD_SHORT_FUNCTION)
11898 int i = 8;
11900 while (i--)
11901 fputs ("\tnop\n", asm_out_file);
11904 xops[0] = gen_rtx_REG (Pmode, regno);
11905 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11906 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11907 output_asm_insn ("%!ret", NULL);
11908 final_end_function ();
11909 init_insn_lengths ();
11910 free_after_compilation (cfun);
11911 set_cfun (NULL);
11912 current_function_decl = NULL;
11915 if (flag_split_stack)
11916 file_end_indicate_split_stack ();
11919 /* Emit code for the SET_GOT patterns. */
11921 const char *
11922 output_set_got (rtx dest, rtx label)
11924 rtx xops[3];
11926 xops[0] = dest;
11928 if (TARGET_VXWORKS_RTP && flag_pic)
11930 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11931 xops[2] = gen_rtx_MEM (Pmode,
11932 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11933 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11935 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11936 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11937 an unadorned address. */
11938 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11939 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11940 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11941 return "";
11944 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11946 if (flag_pic)
11948 char name[32];
11949 get_pc_thunk_name (name, REGNO (dest));
11950 pic_labels_used |= 1 << REGNO (dest);
11952 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11953 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11954 output_asm_insn ("%!call\t%X2", xops);
11956 #if TARGET_MACHO
11957 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11958 This is what will be referenced by the Mach-O PIC subsystem. */
11959 if (machopic_should_output_picbase_label () || !label)
11960 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11962 /* When we are restoring the pic base at the site of a nonlocal label,
11963 and we decided to emit the pic base above, we will still output a
11964 local label used for calculating the correction offset (even though
11965 the offset will be 0 in that case). */
11966 if (label)
11967 targetm.asm_out.internal_label (asm_out_file, "L",
11968 CODE_LABEL_NUMBER (label));
11969 #endif
11971 else
11973 if (TARGET_MACHO)
11974 /* We don't need a pic base, we're not producing pic. */
11975 gcc_unreachable ();
11977 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11978 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11979 targetm.asm_out.internal_label (asm_out_file, "L",
11980 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11983 if (!TARGET_MACHO)
11984 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11986 return "";
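/* Illustrative note, assuming 32-bit PIC code with %ebx as the PIC register:
   the call emitted above together with the thunk produced by ix86_code_end
   typically assembles to
     call __x86.get_pc_thunk.bx
     addl $_GLOBAL_OFFSET_TABLE_, %ebx
   where the thunk body is simply "movl (%esp), %ebx; ret".  */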
11989 /* Generate a "push" pattern for input ARG. */
11991 static rtx
11992 gen_push (rtx arg)
11994 struct machine_function *m = cfun->machine;
11996 if (m->fs.cfa_reg == stack_pointer_rtx)
11997 m->fs.cfa_offset += UNITS_PER_WORD;
11998 m->fs.sp_offset += UNITS_PER_WORD;
12000 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12001 arg = gen_rtx_REG (word_mode, REGNO (arg));
12003 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12004 gen_rtx_PRE_DEC (Pmode,
12005 stack_pointer_rtx)),
12006 arg);
12009 /* Generate a "pop" pattern for input ARG. */
12011 static rtx
12012 gen_pop (rtx arg)
12014 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12015 arg = gen_rtx_REG (word_mode, REGNO (arg));
12017 return gen_rtx_SET (arg,
12018 gen_rtx_MEM (word_mode,
12019 gen_rtx_POST_INC (Pmode,
12020 stack_pointer_rtx)));
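/* Illustrative note, assuming a 64-bit target where word_mode == DImode:
   gen_push and gen_pop build RTL of roughly this shape for %rax,
     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI ax))
     (set (reg:DI ax) (mem:DI (post_inc:DI (reg:DI sp))))
   which the move patterns later emit as "pushq %rax" and "popq %rax".  */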
12023 /* Return >= 0 if there is an unused call-clobbered register available
12024 for the entire function. */
12026 static unsigned int
12027 ix86_select_alt_pic_regnum (void)
12029 if (ix86_use_pseudo_pic_reg ())
12030 return INVALID_REGNUM;
12032 if (crtl->is_leaf
12033 && !crtl->profile
12034 && !ix86_current_function_calls_tls_descriptor)
12036 int i, drap;
12037 /* Can't use the same register for both PIC and DRAP. */
12038 if (crtl->drap_reg)
12039 drap = REGNO (crtl->drap_reg);
12040 else
12041 drap = -1;
12042 for (i = 2; i >= 0; --i)
12043 if (i != drap && !df_regs_ever_live_p (i))
12044 return i;
12047 return INVALID_REGNUM;
12050 /* Return true if REGNO is used by the epilogue. */
12052 bool
12053 ix86_epilogue_uses (int regno)
12055 /* If there are no caller-saved registers, we preserve all registers,
12056 except for MMX and x87 registers which aren't supported when saving
12057 and restoring registers. Don't explicitly save SP register since
12058 it is always preserved. */
12059 return (epilogue_completed
12060 && cfun->machine->no_caller_saved_registers
12061 && !fixed_regs[regno]
12062 && !STACK_REGNO_P (regno)
12063 && !MMX_REGNO_P (regno));
12066 /* Return nonzero if register REGNO can be used as a scratch register
12067 in peephole2. */
12069 static bool
12070 ix86_hard_regno_scratch_ok (unsigned int regno)
12072 /* If there are no caller-saved registers, we can't use any register
12073 as a scratch register after epilogue and use REGNO as scratch
12074 register only if it has been used before to avoid saving and
12075 restoring it. */
12076 return (!cfun->machine->no_caller_saved_registers
12077 || (!epilogue_completed
12078 && df_regs_ever_live_p (regno)));
12081 /* Return TRUE if we need to save REGNO. */
12083 static bool
12084 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
12086 /* If there are no caller-saved registers, we preserve all registers,
12087 except for MMX and x87 registers which aren't supported when saving
12088 and restoring registers. Don't explicitly save SP register since
12089 it is always preserved. */
12090 if (cfun->machine->no_caller_saved_registers)
12092 /* Don't preserve registers used for function return value. */
12093 rtx reg = crtl->return_rtx;
12094 if (reg)
12096 unsigned int i = REGNO (reg);
12097 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12098 while (nregs-- > 0)
12099 if ((i + nregs) == regno)
12100 return false;
12102 reg = crtl->return_bnd;
12103 if (reg)
12105 i = REGNO (reg);
12106 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12107 while (nregs-- > 0)
12108 if ((i + nregs) == regno)
12109 return false;
12113 return (df_regs_ever_live_p (regno)
12114 && !fixed_regs[regno]
12115 && !STACK_REGNO_P (regno)
12116 && !MMX_REGNO_P (regno)
12117 && (regno != HARD_FRAME_POINTER_REGNUM
12118 || !frame_pointer_needed));
12121 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12122 && pic_offset_table_rtx)
12124 if (ix86_use_pseudo_pic_reg ())
12126 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12127 _mcount in prologue. */
12128 if (!TARGET_64BIT && flag_pic && crtl->profile)
12129 return true;
12131 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12132 || crtl->profile
12133 || crtl->calls_eh_return
12134 || crtl->uses_const_pool
12135 || cfun->has_nonlocal_label)
12136 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12139 if (crtl->calls_eh_return && maybe_eh_return)
12141 unsigned i;
12142 for (i = 0; ; i++)
12144 unsigned test = EH_RETURN_DATA_REGNO (i);
12145 if (test == INVALID_REGNUM)
12146 break;
12147 if (test == regno)
12148 return true;
12152 if (crtl->drap_reg
12153 && regno == REGNO (crtl->drap_reg)
12154 && !cfun->machine->no_drap_save_restore)
12155 return true;
12157 return (df_regs_ever_live_p (regno)
12158 && !call_used_regs[regno]
12159 && !fixed_regs[regno]
12160 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12163 /* Return the number of saved general purpose registers. */
12165 static int
12166 ix86_nsaved_regs (void)
12168 int nregs = 0;
12169 int regno;
12171 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12172 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12173 nregs ++;
12174 return nregs;
12177 /* Return number of saved SSE registers. */
12179 static int
12180 ix86_nsaved_sseregs (void)
12182 int nregs = 0;
12183 int regno;
12185 if (!TARGET_64BIT_MS_ABI)
12186 return 0;
12187 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12188 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12189 nregs ++;
12190 return nregs;
12193 /* Given FROM and TO register numbers, say whether this elimination is
12194 allowed. If stack alignment is needed, we can only replace argument
12195 pointer with hard frame pointer, or replace frame pointer with stack
12196 pointer. Otherwise, frame pointer elimination is automatically
12197 handled and all other eliminations are valid. */
12199 static bool
12200 ix86_can_eliminate (const int from, const int to)
12202 if (stack_realign_fp)
12203 return ((from == ARG_POINTER_REGNUM
12204 && to == HARD_FRAME_POINTER_REGNUM)
12205 || (from == FRAME_POINTER_REGNUM
12206 && to == STACK_POINTER_REGNUM));
12207 else
12208 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12211 /* Return the offset between two registers, one to be eliminated, and the other
12212 its replacement, at the start of a routine. */
12214 HOST_WIDE_INT
12215 ix86_initial_elimination_offset (int from, int to)
12217 struct ix86_frame frame;
12218 ix86_compute_frame_layout (&frame);
12220 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12221 return frame.hard_frame_pointer_offset;
12222 else if (from == FRAME_POINTER_REGNUM
12223 && to == HARD_FRAME_POINTER_REGNUM)
12224 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12225 else
12227 gcc_assert (to == STACK_POINTER_REGNUM);
12229 if (from == ARG_POINTER_REGNUM)
12230 return frame.stack_pointer_offset;
12232 gcc_assert (from == FRAME_POINTER_REGNUM);
12233 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12237 /* In a dynamically-aligned function, we can't know the offset from
12238 stack pointer to frame pointer, so we must ensure that setjmp
12239 eliminates fp against the hard fp (%ebp) rather than trying to
12240 index from %esp up to the top of the frame across a gap that is
12241 of unknown (at compile-time) size. */
12242 static rtx
12243 ix86_builtin_setjmp_frame_value (void)
12245 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12248 /* When using -fsplit-stack, the allocation routines set a field in
12249 the TCB to the bottom of the stack plus this much space, measured
12250 in bytes. */
12252 #define SPLIT_STACK_AVAILABLE 256
12254 /* Fill structure ix86_frame about frame of currently computed function. */
12256 static void
12257 ix86_compute_frame_layout (struct ix86_frame *frame)
12259 unsigned HOST_WIDE_INT stack_alignment_needed;
12260 HOST_WIDE_INT offset;
12261 unsigned HOST_WIDE_INT preferred_alignment;
12262 HOST_WIDE_INT size = get_frame_size ();
12263 HOST_WIDE_INT to_allocate;
12265 frame->nregs = ix86_nsaved_regs ();
12266 frame->nsseregs = ix86_nsaved_sseregs ();
12268 /* The 64-bit MS ABI seems to require stack alignment to be always 16,
12269 except for function prologues, leaf functions and when the default
12270 incoming stack boundary is overridden at the command line or via the
12271 force_align_arg_pointer attribute. */
12272 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12273 && (!crtl->is_leaf || cfun->calls_alloca != 0
12274 || ix86_current_function_calls_tls_descriptor
12275 || ix86_incoming_stack_boundary < 128))
12277 crtl->preferred_stack_boundary = 128;
12278 crtl->stack_alignment_needed = 128;
12281 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12282 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12284 gcc_assert (!size || stack_alignment_needed);
12285 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12286 gcc_assert (preferred_alignment <= stack_alignment_needed);
12288 /* For SEH we have to limit the amount of code movement into the prologue.
12289 At present we do this via a BLOCKAGE, at which point there's very little
12290 scheduling that can be done, which means that there's very little point
12291 in doing anything except PUSHs. */
12292 if (TARGET_SEH)
12293 cfun->machine->use_fast_prologue_epilogue = false;
12295 /* During reload iteration the number of registers saved can change.
12296 Recompute the value as needed. Do not recompute when the number of registers
12297 didn't change as reload does multiple calls to the function and does not
12298 expect the decision to change within single iteration. */
12299 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
12300 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
12302 int count = frame->nregs;
12303 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12305 cfun->machine->use_fast_prologue_epilogue_nregs = count;
12307 /* The fast prologue uses move instead of push to save registers. This
12308 is significantly longer, but also executes faster as modern hardware
12309 can execute the moves in parallel, but can't do that for push/pop.
12311 Be careful about choosing which prologue to emit: when the function takes
12312 many instructions to execute we may use the slow version, as well as when
12313 the function is known to be outside a hot spot (this is known only with
12314 profile feedback). Weight the size of the function by the number of registers
12315 to save as it is cheap to use one or two push instructions but very
12316 slow to use many of them. */
12317 if (count)
12318 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12319 if (node->frequency < NODE_FREQUENCY_NORMAL
12320 || (flag_branch_probabilities
12321 && node->frequency < NODE_FREQUENCY_HOT))
12322 cfun->machine->use_fast_prologue_epilogue = false;
12323 else
12324 cfun->machine->use_fast_prologue_epilogue
12325 = !expensive_function_p (count);
12328 frame->save_regs_using_mov
12329 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
12330 /* If static stack checking is enabled and done with probes,
12331 the registers need to be saved before allocating the frame. */
12332 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12334 /* Skip return address. */
12335 offset = UNITS_PER_WORD;
12337 /* Skip pushed static chain. */
12338 if (ix86_static_chain_on_stack)
12339 offset += UNITS_PER_WORD;
12341 /* Skip saved base pointer. */
12342 if (frame_pointer_needed)
12343 offset += UNITS_PER_WORD;
12344 frame->hfp_save_offset = offset;
12346 /* The traditional frame pointer location is at the top of the frame. */
12347 frame->hard_frame_pointer_offset = offset;
12349 /* Register save area */
12350 offset += frame->nregs * UNITS_PER_WORD;
12351 frame->reg_save_offset = offset;
12353 /* On SEH target, registers are pushed just before the frame pointer
12354 location. */
12355 if (TARGET_SEH)
12356 frame->hard_frame_pointer_offset = offset;
12358 /* Align and set SSE register save area. */
12359 if (frame->nsseregs)
12361 /* The only ABI that has saved SSE registers (Win64) also has a
12362 16-byte aligned default stack, and thus we don't need to be
12363 within the re-aligned local stack frame to save them. In case
12364 incoming stack boundary is aligned to less than 16 bytes,
12365 unaligned move of SSE register will be emitted, so there is
12366 no point to round up the SSE register save area outside the
12367 re-aligned local stack frame to 16 bytes. */
12368 if (ix86_incoming_stack_boundary >= 128)
12369 offset = ROUND_UP (offset, 16);
12370 offset += frame->nsseregs * 16;
12372 frame->sse_reg_save_offset = offset;
12374 /* The re-aligned stack starts here. Values before this point are not
12375 directly comparable with values below this point. In order to make
12376 sure that no value happens to be the same before and after, force
12377 the alignment computation below to add a non-zero value. */
12378 if (stack_realign_fp)
12379 offset = ROUND_UP (offset, stack_alignment_needed);
12381 /* Va-arg area */
12382 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12383 offset += frame->va_arg_size;
12385 /* Align start of frame for local function. */
12386 if (stack_realign_fp
12387 || offset != frame->sse_reg_save_offset
12388 || size != 0
12389 || !crtl->is_leaf
12390 || cfun->calls_alloca
12391 || ix86_current_function_calls_tls_descriptor)
12392 offset = ROUND_UP (offset, stack_alignment_needed);
12394 /* Frame pointer points here. */
12395 frame->frame_pointer_offset = offset;
12397 offset += size;
12399 /* Add outgoing arguments area. Can be skipped if we eliminated
12400 all the function calls as dead code.
12401 Skipping is however impossible when the function calls alloca. The alloca
12402 expander assumes that the last crtl->outgoing_args_size bytes
12403 of the stack frame are unused. */
12404 if (ACCUMULATE_OUTGOING_ARGS
12405 && (!crtl->is_leaf || cfun->calls_alloca
12406 || ix86_current_function_calls_tls_descriptor))
12408 offset += crtl->outgoing_args_size;
12409 frame->outgoing_arguments_size = crtl->outgoing_args_size;
12411 else
12412 frame->outgoing_arguments_size = 0;
12414 /* Align stack boundary. Only needed if we're calling another function
12415 or using alloca. */
12416 if (!crtl->is_leaf || cfun->calls_alloca
12417 || ix86_current_function_calls_tls_descriptor)
12418 offset = ROUND_UP (offset, preferred_alignment);
12420 /* We've reached end of stack frame. */
12421 frame->stack_pointer_offset = offset;
12423 /* Size prologue needs to allocate. */
12424 to_allocate = offset - frame->sse_reg_save_offset;
12426 if ((!to_allocate && frame->nregs <= 1)
12427 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
12428 frame->save_regs_using_mov = false;
12430 if (ix86_using_red_zone ()
12431 && crtl->sp_is_unchanging
12432 && crtl->is_leaf
12433 && !ix86_pc_thunk_call_expanded
12434 && !ix86_current_function_calls_tls_descriptor)
12436 frame->red_zone_size = to_allocate;
12437 if (frame->save_regs_using_mov)
12438 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
12439 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
12440 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
12442 else
12443 frame->red_zone_size = 0;
12444 frame->stack_pointer_offset -= frame->red_zone_size;
12446 /* The SEH frame pointer location is near the bottom of the frame.
12447 This is enforced by the fact that the difference between the
12448 stack pointer and the frame pointer is limited to 240 bytes in
12449 the unwind data structure. */
12450 if (TARGET_SEH)
12452 HOST_WIDE_INT diff;
12454 /* If we can leave the frame pointer where it is, do so. Also, returns
12455 the establisher frame for __builtin_frame_address (0). */
12456 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
12457 if (diff <= SEH_MAX_FRAME_SIZE
12458 && (diff > 240 || (diff & 15) != 0)
12459 && !crtl->accesses_prior_frames)
12461 /* Ideally we'd determine what portion of the local stack frame
12462 (within the constraint of the lowest 240) is most heavily used.
12463 But without that complication, simply bias the frame pointer
12464 by 128 bytes so as to maximize the amount of the local stack
12465 frame that is addressable with 8-bit offsets. */
12466 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
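	  /* For example, with the frame pointer placed 128 bytes above the
	     final stack pointer, signed 8-bit displacements [-128, 127]
	     reach the 256 bytes of the frame just above the stack pointer.  */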
12471 /* This is semi-inlined memory_address_length, but simplified
12472 since we know that we're always dealing with reg+offset, and
12473 to avoid having to create and discard all that rtl. */
12475 static inline int
12476 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
12478 int len = 4;
12480 if (offset == 0)
12482 /* EBP and R13 cannot be encoded without an offset. */
12483 len = (regno == BP_REG || regno == R13_REG);
12485 else if (IN_RANGE (offset, -128, 127))
12486 len = 1;
12488 /* ESP and R12 must be encoded with a SIB byte. */
12489 if (regno == SP_REG || regno == R12_REG)
12490 len++;
12492 return len;
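/* For example, on x86-64 an access such as 16(%rbp) needs only a disp8
   (len 1), 16(%rsp) needs a disp8 plus a SIB byte (len 2), 0(%rbp) still
   needs a zero disp8 (len 1), and 1024(%rbp) needs a disp32 (len 4);
   these extra address bytes are what this function counts.  */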
12495 /* Return an RTX that points to CFA_OFFSET within the stack frame.
12496 The valid base registers are taken from CFUN->MACHINE->FS. */
12498 static rtx
12499 choose_baseaddr (HOST_WIDE_INT cfa_offset)
12501 const struct machine_function *m = cfun->machine;
12502 rtx base_reg = NULL;
12503 HOST_WIDE_INT base_offset = 0;
12505 if (m->use_fast_prologue_epilogue)
12507 /* Choose the base register most likely to allow the most scheduling
12508 opportunities. Generally FP is valid throughout the function,
12509 while DRAP must be reloaded within the epilogue. But choose either
12510 over the SP due to increased encoding size. */
12512 if (m->fs.fp_valid)
12514 base_reg = hard_frame_pointer_rtx;
12515 base_offset = m->fs.fp_offset - cfa_offset;
12517 else if (m->fs.drap_valid)
12519 base_reg = crtl->drap_reg;
12520 base_offset = 0 - cfa_offset;
12522 else if (m->fs.sp_valid)
12524 base_reg = stack_pointer_rtx;
12525 base_offset = m->fs.sp_offset - cfa_offset;
12528 else
12530 HOST_WIDE_INT toffset;
12531 int len = 16, tlen;
12533 /* Choose the base register with the smallest address encoding.
12534 With a tie, choose FP > DRAP > SP. */
12535 if (m->fs.sp_valid)
12537 base_reg = stack_pointer_rtx;
12538 base_offset = m->fs.sp_offset - cfa_offset;
12539 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12541 if (m->fs.drap_valid)
12543 toffset = 0 - cfa_offset;
12544 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12545 if (tlen <= len)
12547 base_reg = crtl->drap_reg;
12548 base_offset = toffset;
12549 len = tlen;
12552 if (m->fs.fp_valid)
12554 toffset = m->fs.fp_offset - cfa_offset;
12555 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12556 if (tlen <= len)
12558 base_reg = hard_frame_pointer_rtx;
12559 base_offset = toffset;
12560 len = tlen;
12564 gcc_assert (base_reg != NULL);
12566 return plus_constant (Pmode, base_reg, base_offset);
12569 /* Emit code to save registers in the prologue. */
12571 static void
12572 ix86_emit_save_regs (void)
12574 unsigned int regno;
12575 rtx_insn *insn;
12577 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12578 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12580 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12581 RTX_FRAME_RELATED_P (insn) = 1;
12585 /* Emit a single register save at CFA - CFA_OFFSET. */
12587 static void
12588 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12589 HOST_WIDE_INT cfa_offset)
12591 struct machine_function *m = cfun->machine;
12592 rtx reg = gen_rtx_REG (mode, regno);
12593 rtx mem, addr, base, insn;
12594 unsigned int align;
12596 addr = choose_baseaddr (cfa_offset);
12597 mem = gen_frame_mem (mode, addr);
12599 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
12600 align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
12601 set_mem_align (mem, align);
12603 insn = emit_insn (gen_rtx_SET (mem, reg));
12604 RTX_FRAME_RELATED_P (insn) = 1;
12606 base = addr;
12607 if (GET_CODE (base) == PLUS)
12608 base = XEXP (base, 0);
12609 gcc_checking_assert (REG_P (base));
12611 /* When saving registers into a re-aligned local stack frame, avoid
12612 any tricky guessing by dwarf2out. */
12613 if (m->fs.realigned)
12615 gcc_checking_assert (stack_realign_drap);
12617 if (regno == REGNO (crtl->drap_reg))
12619 /* A bit of a hack. We force the DRAP register to be saved in
12620 the re-aligned stack frame, which provides us with a copy
12621 of the CFA that will last past the prologue. Install it. */
12622 gcc_checking_assert (cfun->machine->fs.fp_valid);
12623 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12624 cfun->machine->fs.fp_offset - cfa_offset);
12625 mem = gen_rtx_MEM (mode, addr);
12626 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12628 else
12630 /* The frame pointer is a stable reference within the
12631 aligned frame. Use it. */
12632 gcc_checking_assert (cfun->machine->fs.fp_valid);
12633 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12634 cfun->machine->fs.fp_offset - cfa_offset);
12635 mem = gen_rtx_MEM (mode, addr);
12636 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12640 /* The memory may not be relative to the current CFA register,
12641 which means that we may need to generate a new pattern for
12642 use by the unwind info. */
12643 else if (base != m->fs.cfa_reg)
12645 addr = plus_constant (Pmode, m->fs.cfa_reg,
12646 m->fs.cfa_offset - cfa_offset);
12647 mem = gen_rtx_MEM (mode, addr);
12648 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12652 /* Emit code to save registers using MOV insns.
12653 First register is stored at CFA - CFA_OFFSET. */
12654 static void
12655 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12657 unsigned int regno;
12659 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12660 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12662 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12663 cfa_offset -= UNITS_PER_WORD;
12667 /* Emit code to save SSE registers using MOV insns.
12668 First register is stored at CFA - CFA_OFFSET. */
12669 static void
12670 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12672 unsigned int regno;
12674 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12675 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12677 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12678 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12682 static GTY(()) rtx queued_cfa_restores;
12684 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
12685 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12686 Don't add the note if the previously saved value will be left untouched
12687 within the stack red zone until return, as unwinders can find the same value
12688 in the register and on the stack. */
12690 static void
12691 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12693 if (!crtl->shrink_wrapped
12694 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12695 return;
12697 if (insn)
12699 add_reg_note (insn, REG_CFA_RESTORE, reg);
12700 RTX_FRAME_RELATED_P (insn) = 1;
12702 else
12703 queued_cfa_restores
12704 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12707 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12709 static void
12710 ix86_add_queued_cfa_restore_notes (rtx insn)
12712 rtx last;
12713 if (!queued_cfa_restores)
12714 return;
12715 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12717 XEXP (last, 1) = REG_NOTES (insn);
12718 REG_NOTES (insn) = queued_cfa_restores;
12719 queued_cfa_restores = NULL_RTX;
12720 RTX_FRAME_RELATED_P (insn) = 1;
12723 /* Expand prologue or epilogue stack adjustment.
12724 The pattern exists to put a dependency on all ebp-based memory accesses.
12725 STYLE should be negative if instructions should be marked as frame related,
12726 zero if %r11 register is live and cannot be freely used and positive
12727 otherwise. */
12729 static void
12730 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12731 int style, bool set_cfa)
12733 struct machine_function *m = cfun->machine;
12734 rtx insn;
12735 bool add_frame_related_expr = false;
12737 if (Pmode == SImode)
12738 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12739 else if (x86_64_immediate_operand (offset, DImode))
12740 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12741 else
12743 rtx tmp;
12744 /* r11 is used by indirect sibcall return as well, set before the
12745 epilogue and used after the epilogue. */
12746 if (style)
12747 tmp = gen_rtx_REG (DImode, R11_REG);
12748 else
12750 gcc_assert (src != hard_frame_pointer_rtx
12751 && dest != hard_frame_pointer_rtx);
12752 tmp = hard_frame_pointer_rtx;
12754 insn = emit_insn (gen_rtx_SET (tmp, offset));
12755 if (style < 0)
12756 add_frame_related_expr = true;
12758 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12761 insn = emit_insn (insn);
12762 if (style >= 0)
12763 ix86_add_queued_cfa_restore_notes (insn);
12765 if (set_cfa)
12767 rtx r;
12769 gcc_assert (m->fs.cfa_reg == src);
12770 m->fs.cfa_offset += INTVAL (offset);
12771 m->fs.cfa_reg = dest;
12773 r = gen_rtx_PLUS (Pmode, src, offset);
12774 r = gen_rtx_SET (dest, r);
12775 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12776 RTX_FRAME_RELATED_P (insn) = 1;
12778 else if (style < 0)
12780 RTX_FRAME_RELATED_P (insn) = 1;
12781 if (add_frame_related_expr)
12783 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12784 r = gen_rtx_SET (dest, r);
12785 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12789 if (dest == stack_pointer_rtx)
12791 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12792 bool valid = m->fs.sp_valid;
12794 if (src == hard_frame_pointer_rtx)
12796 valid = m->fs.fp_valid;
12797 ooffset = m->fs.fp_offset;
12799 else if (src == crtl->drap_reg)
12801 valid = m->fs.drap_valid;
12802 ooffset = 0;
12804 else
12806 /* Else there are two possibilities: SP itself, which we set
12807 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
12808 taken care of by hand along the eh_return path. */
12809 gcc_checking_assert (src == stack_pointer_rtx
12810 || offset == const0_rtx);
12813 m->fs.sp_offset = ooffset - INTVAL (offset);
12814 m->fs.sp_valid = valid;
12818 /* Find an available register to be used as the dynamic realign argument
12819 pointer register. Such a register will be written in the prologue and
12820 used at the beginning of the body, so it must not be
12821 1. parameter passing register.
12822 2. GOT pointer.
12823 We reuse static-chain register if it is available. Otherwise, we
12824 use DI for i386 and R13 for x86-64. We chose R13 since it has
12825 shorter encoding.
12827 Return: the regno of chosen register. */
12829 static unsigned int
12830 find_drap_reg (void)
12832 tree decl = cfun->decl;
12834 /* Always use callee-saved register if there are no caller-saved
12835 registers. */
12836 if (TARGET_64BIT)
12838 /* Use R13 for a nested function or a function that needs a static chain.
12839 Since a function with a tail call may use any caller-saved
12840 register in the epilogue, DRAP must not use a caller-saved
12841 register in such a case. */
12842 if (DECL_STATIC_CHAIN (decl)
12843 || cfun->machine->no_caller_saved_registers
12844 || crtl->tail_call_emit)
12845 return R13_REG;
12847 return R10_REG;
12849 else
12851 /* Use DI for a nested function or a function that needs a static chain.
12852 Since a function with a tail call may use any caller-saved
12853 register in the epilogue, DRAP must not use a caller-saved
12854 register in such a case. */
12855 if (DECL_STATIC_CHAIN (decl)
12856 || cfun->machine->no_caller_saved_registers
12857 || crtl->tail_call_emit)
12858 return DI_REG;
12860 /* Reuse static chain register if it isn't used for parameter
12861 passing. */
12862 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12864 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12865 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12866 return CX_REG;
12868 return DI_REG;
12872 /* Handle a "force_align_arg_pointer" attribute. */
12874 static tree
12875 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12876 tree, int, bool *no_add_attrs)
12878 if (TREE_CODE (*node) != FUNCTION_TYPE
12879 && TREE_CODE (*node) != METHOD_TYPE
12880 && TREE_CODE (*node) != FIELD_DECL
12881 && TREE_CODE (*node) != TYPE_DECL)
12883 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12884 name);
12885 *no_add_attrs = true;
12888 return NULL_TREE;
12891 /* Return minimum incoming stack alignment. */
12893 static unsigned int
12894 ix86_minimum_incoming_stack_boundary (bool sibcall)
12896 unsigned int incoming_stack_boundary;
12898 /* The stack of an interrupt handler is always aligned to MIN_STACK_BOUNDARY. */
12900 if (cfun->machine->func_type != TYPE_NORMAL)
12901 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12902 /* Prefer the one specified at command line. */
12903 else if (ix86_user_incoming_stack_boundary)
12904 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12905 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12906 if -mstackrealign is used, this isn't a sibcall check, and the
12907 estimated stack alignment is 128 bits. */
12908 else if (!sibcall
12909 && ix86_force_align_arg_pointer
12910 && crtl->stack_alignment_estimated == 128)
12911 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12912 else
12913 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12915 /* Incoming stack alignment can be changed on individual functions
12916 via force_align_arg_pointer attribute. We use the smallest
12917 incoming stack boundary. */
12918 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12919 && lookup_attribute (ix86_force_align_arg_pointer_string,
12920 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12921 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12923 /* The incoming stack frame has to be aligned at least at
12924 parm_stack_boundary. */
12925 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12926 incoming_stack_boundary = crtl->parm_stack_boundary;
12928 /* The stack at the entry point of main is aligned by the runtime. We use the
12929 smallest incoming stack boundary. */
12930 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12931 && DECL_NAME (current_function_decl)
12932 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12933 && DECL_FILE_SCOPE_P (current_function_decl))
12934 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12936 return incoming_stack_boundary;
12939 /* Update incoming stack boundary and estimated stack alignment. */
12941 static void
12942 ix86_update_stack_boundary (void)
12944 ix86_incoming_stack_boundary
12945 = ix86_minimum_incoming_stack_boundary (false);
12947 /* x86_64 varargs functions need 16-byte stack alignment for the register
12948 save area. */
12949 if (TARGET_64BIT
12950 && cfun->stdarg
12951 && crtl->stack_alignment_estimated < 128)
12952 crtl->stack_alignment_estimated = 128;
12954 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12955 if (ix86_tls_descriptor_calls_expanded_in_cfun
12956 && crtl->preferred_stack_boundary < 128)
12957 crtl->preferred_stack_boundary = 128;
12960 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12961 needed or an rtx for DRAP otherwise. */
12963 static rtx
12964 ix86_get_drap_rtx (void)
12966 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
12967 crtl->need_drap = true;
12969 if (stack_realign_drap)
12971 /* Assign DRAP to vDRAP and return vDRAP. */
12972 unsigned int regno = find_drap_reg ();
12973 rtx drap_vreg;
12974 rtx arg_ptr;
12975 rtx_insn *seq, *insn;
12977 arg_ptr = gen_rtx_REG (Pmode, regno);
12978 crtl->drap_reg = arg_ptr;
12980 start_sequence ();
12981 drap_vreg = copy_to_reg (arg_ptr);
12982 seq = get_insns ();
12983 end_sequence ();
12985 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12986 if (!optimize)
12988 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12989 RTX_FRAME_RELATED_P (insn) = 1;
12991 return drap_vreg;
12993 else
12994 return NULL;
12997 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12999 static rtx
13000 ix86_internal_arg_pointer (void)
13002 return virtual_incoming_args_rtx;
13005 struct scratch_reg {
13006 rtx reg;
13007 bool saved;
13010 /* Return a short-lived scratch register for use on function entry.
13011 In 32-bit mode, it is valid only after the registers are saved
13012 in the prologue. This register must be released by means of
13013 release_scratch_register_on_entry once it is dead. */
13015 static void
13016 get_scratch_register_on_entry (struct scratch_reg *sr)
13018 int regno;
13020 sr->saved = false;
13022 if (TARGET_64BIT)
13024 /* We always use R11 in 64-bit mode. */
13025 regno = R11_REG;
13027 else
13029 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13030 bool fastcall_p
13031 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13032 bool thiscall_p
13033 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13034 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13035 int regparm = ix86_function_regparm (fntype, decl);
13036 int drap_regno
13037 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13039 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13040 for the static chain register. */
13041 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13042 && drap_regno != AX_REG)
13043 regno = AX_REG;
13044 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13045 for the static chain register. */
13046 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13047 regno = AX_REG;
13048 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13049 regno = DX_REG;
13050 /* ecx is the static chain register. */
13051 else if (regparm < 3 && !fastcall_p && !thiscall_p
13052 && !static_chain_p
13053 && drap_regno != CX_REG)
13054 regno = CX_REG;
13055 else if (ix86_save_reg (BX_REG, true))
13056 regno = BX_REG;
13057 /* esi is the static chain register. */
13058 else if (!(regparm == 3 && static_chain_p)
13059 && ix86_save_reg (SI_REG, true))
13060 regno = SI_REG;
13061 else if (ix86_save_reg (DI_REG, true))
13062 regno = DI_REG;
13063 else
13065 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13066 sr->saved = true;
13070 sr->reg = gen_rtx_REG (Pmode, regno);
13071 if (sr->saved)
13073 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13074 RTX_FRAME_RELATED_P (insn) = 1;
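/* In short, the 32-bit selection above prefers a call-clobbered register
   that is not carrying an argument, the static chain or DRAP (eax, then
   edx, then ecx), falls back to a call-saved register the prologue saves
   anyway (ebx, esi, edi), and only as a last resort pushes and later pops
   a scratch register.  */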
13078 /* Release a scratch register obtained from the preceding function. */
13080 static void
13081 release_scratch_register_on_entry (struct scratch_reg *sr)
13083 if (sr->saved)
13085 struct machine_function *m = cfun->machine;
13086 rtx x, insn = emit_insn (gen_pop (sr->reg));
13088 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13089 RTX_FRAME_RELATED_P (insn) = 1;
13090 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13091 x = gen_rtx_SET (stack_pointer_rtx, x);
13092 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13093 m->fs.sp_offset -= UNITS_PER_WORD;
13097 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
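/* With the usual STACK_CHECK_PROBE_INTERVAL_EXP of 12, this probes the
   stack every 4096 bytes, i.e. once per page.  */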
13099 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13101 static void
13102 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13104 /* We skip the probe for the first interval + a small dope of 4 words and
13105 probe that many bytes past the specified size to maintain a protection
13106 area at the bottom of the stack. */
13107 const int dope = 4 * UNITS_PER_WORD;
13108 rtx size_rtx = GEN_INT (size), last;
13110 /* See if we have a constant small number of probes to generate. If so,
13111 that's the easy case. The run-time loop is made up of 9 insns in the
13112 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13113 for n # of intervals. */
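  /* For instance, at size == 4 * PROBE_INTERVAL the unrolled sequence is
     3 + 2*3 = 9 insns, the same count as the generic loop, which is
     presumably why the cutoff below sits at four intervals.  */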
13114 if (size <= 4 * PROBE_INTERVAL)
13116 HOST_WIDE_INT i, adjust;
13117 bool first_probe = true;
13119 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13120 values of N from 1 until it exceeds SIZE. If only one probe is
13121 needed, this will not generate any code. Then adjust and probe
13122 to PROBE_INTERVAL + SIZE. */
13123 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13125 if (first_probe)
13127 adjust = 2 * PROBE_INTERVAL + dope;
13128 first_probe = false;
13130 else
13131 adjust = PROBE_INTERVAL;
13133 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13134 plus_constant (Pmode, stack_pointer_rtx,
13135 -adjust)));
13136 emit_stack_probe (stack_pointer_rtx);
13139 if (first_probe)
13140 adjust = size + PROBE_INTERVAL + dope;
13141 else
13142 adjust = size + PROBE_INTERVAL - i;
13144 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13145 plus_constant (Pmode, stack_pointer_rtx,
13146 -adjust)));
13147 emit_stack_probe (stack_pointer_rtx);
13149 /* Adjust back to account for the additional first interval. */
13150 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13151 plus_constant (Pmode, stack_pointer_rtx,
13152 PROBE_INTERVAL + dope)));
13155 /* Otherwise, do the same as above, but in a loop. Note that we must be
13156 extra careful with variables wrapping around because we might be at
13157 the very top (or the very bottom) of the address space and we have
13158 to be able to handle this case properly; in particular, we use an
13159 equality test for the loop condition. */
13160 else
13162 HOST_WIDE_INT rounded_size;
13163 struct scratch_reg sr;
13165 get_scratch_register_on_entry (&sr);
13168 /* Step 1: round SIZE to the previous multiple of the interval. */
13170 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13173 /* Step 2: compute initial and final value of the loop counter. */
13175 /* SP = SP_0 + PROBE_INTERVAL. */
13176 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13177 plus_constant (Pmode, stack_pointer_rtx,
13178 - (PROBE_INTERVAL + dope))));
13180 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13181 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13182 emit_insn (gen_rtx_SET (sr.reg,
13183 plus_constant (Pmode, stack_pointer_rtx,
13184 -rounded_size)));
13185 else
13187 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13188 emit_insn (gen_rtx_SET (sr.reg,
13189 gen_rtx_PLUS (Pmode, sr.reg,
13190 stack_pointer_rtx)));
13194 /* Step 3: the loop
13198 do { SP = SP + PROBE_INTERVAL
13199 probe at SP }
13201 while (SP != LAST_ADDR)
13203 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13204 values of N from 1 until it is equal to ROUNDED_SIZE. */
13206 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13209 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13210 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13212 if (size != rounded_size)
13214 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13215 plus_constant (Pmode, stack_pointer_rtx,
13216 rounded_size - size)));
13217 emit_stack_probe (stack_pointer_rtx);
13220 /* Adjust back to account for the additional first interval. */
13221 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13222 plus_constant (Pmode, stack_pointer_rtx,
13223 PROBE_INTERVAL + dope)));
13225 release_scratch_register_on_entry (&sr);
13228 /* Even if the stack pointer isn't the CFA register, we need to correctly
13229 describe the adjustments made to it, in particular differentiate the
13230 frame-related ones from the frame-unrelated ones. */
13231 if (size > 0)
13233 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13234 XVECEXP (expr, 0, 0)
13235 = gen_rtx_SET (stack_pointer_rtx,
13236 plus_constant (Pmode, stack_pointer_rtx, -size));
13237 XVECEXP (expr, 0, 1)
13238 = gen_rtx_SET (stack_pointer_rtx,
13239 plus_constant (Pmode, stack_pointer_rtx,
13240 PROBE_INTERVAL + dope + size));
13241 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13242 RTX_FRAME_RELATED_P (last) = 1;
13244 cfun->machine->fs.sp_offset += size;
13247 /* Make sure nothing is scheduled before we are done. */
13248 emit_insn (gen_blockage ());
13251 /* Adjust the stack pointer up to REG while probing it. */
13253 const char *
13254 output_adjust_stack_and_probe (rtx reg)
13256 static int labelno = 0;
13257 char loop_lab[32];
13258 rtx xops[2];
13260 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13262 /* Loop. */
13263 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13265 /* SP = SP + PROBE_INTERVAL. */
13266 xops[0] = stack_pointer_rtx;
13267 xops[1] = GEN_INT (PROBE_INTERVAL);
13268 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13270 /* Probe at SP. */
13271 xops[1] = const0_rtx;
13272 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13274 /* Test if SP == LAST_ADDR. */
13275 xops[0] = stack_pointer_rtx;
13276 xops[1] = reg;
13277 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13279 /* Branch. */
13280 fputs ("\tjne\t", asm_out_file);
13281 assemble_name_raw (asm_out_file, loop_lab);
13282 fputc ('\n', asm_out_file);
13284 return "";
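/* Roughly, for 64-bit ELF code with a 4096-byte interval and %r11 holding
   LAST_ADDR, the loop printed above looks like:

	.LPSRL0:
	   subq	$4096, %rsp
	   orq	$0, (%rsp)
	   cmpq	%r11, %rsp
	   jne	.LPSRL0
   */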
13287 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13288 inclusive. These are offsets from the current stack pointer. */
13290 static void
13291 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13293 /* See if we have a constant small number of probes to generate. If so,
13294 that's the easy case. The run-time loop is made up of 6 insns in the
13295 generic case while the compile-time loop is made up of n insns for n #
13296 of intervals. */
13297 if (size <= 6 * PROBE_INTERVAL)
13299 HOST_WIDE_INT i;
13301 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13302 it exceeds SIZE. If only one probe is needed, this will not
13303 generate any code. Then probe at FIRST + SIZE. */
13304 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13305 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13306 -(first + i)));
13308 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13309 -(first + size)));
13312 /* Otherwise, do the same as above, but in a loop. Note that we must be
13313 extra careful with variables wrapping around because we might be at
13314 the very top (or the very bottom) of the address space and we have
13315 to be able to handle this case properly; in particular, we use an
13316 equality test for the loop condition. */
13317 else
13319 HOST_WIDE_INT rounded_size, last;
13320 struct scratch_reg sr;
13322 get_scratch_register_on_entry (&sr);
13325 /* Step 1: round SIZE to the previous multiple of the interval. */
13327 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13330 /* Step 2: compute initial and final value of the loop counter. */
13332 /* TEST_OFFSET = FIRST. */
13333 emit_move_insn (sr.reg, GEN_INT (-first));
13335 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13336 last = first + rounded_size;
13339 /* Step 3: the loop
13343 do { TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13344 probe at TEST_ADDR }
13346 while (TEST_ADDR != LAST_ADDR)
13348 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13349 until it is equal to ROUNDED_SIZE. */
13351 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13354 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13355 that SIZE is equal to ROUNDED_SIZE. */
13357 if (size != rounded_size)
13358 emit_stack_probe (plus_constant (Pmode,
13359 gen_rtx_PLUS (Pmode,
13360 stack_pointer_rtx,
13361 sr.reg),
13362 rounded_size - size));
13364 release_scratch_register_on_entry (&sr);
13367 /* Make sure nothing is scheduled before we are done. */
13368 emit_insn (gen_blockage ());
13371 /* Probe a range of stack addresses from REG to END, inclusive. These are
13372 offsets from the current stack pointer. */
13374 const char *
13375 output_probe_stack_range (rtx reg, rtx end)
13377 static int labelno = 0;
13378 char loop_lab[32];
13379 rtx xops[3];
13381 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13383 /* Loop. */
13384 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13386 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13387 xops[0] = reg;
13388 xops[1] = GEN_INT (PROBE_INTERVAL);
13389 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13391 /* Probe at TEST_ADDR. */
13392 xops[0] = stack_pointer_rtx;
13393 xops[1] = reg;
13394 xops[2] = const0_rtx;
13395 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13397 /* Test if TEST_ADDR == LAST_ADDR. */
13398 xops[0] = reg;
13399 xops[1] = end;
13400 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13402 /* Branch. */
13403 fputs ("\tjne\t", asm_out_file);
13404 assemble_name_raw (asm_out_file, loop_lab);
13405 fputc ('\n', asm_out_file);
13407 return "";
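/* Roughly, the corresponding 64-bit loop keeps %rsp fixed and walks a
   negative offset in the scratch register (%r11), e.g. with a 4096-byte
   interval and LAST = FIRST + ROUNDED_SIZE:

	.LPSRL1:
	   subq	$4096, %r11
	   orq	$0, (%rsp,%r11)
	   cmpq	$-LAST, %r11
	   jne	.LPSRL1
   */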
13410 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
13411 to be generated in correct form. */
13412 static void
13413 ix86_finalize_stack_realign_flags (void)
13415 /* Check if stack realignment is really needed after reload, and
13416 store the result in cfun. */
13417 unsigned int incoming_stack_boundary
13418 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13419 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13420 unsigned int stack_realign
13421 = (incoming_stack_boundary
13422 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13423 ? crtl->max_used_stack_slot_alignment
13424 : crtl->stack_alignment_needed));
13426 if (crtl->stack_realign_finalized)
13428 /* After stack_realign_needed is finalized, we can no longer
13429 change it. */
13430 gcc_assert (crtl->stack_realign_needed == stack_realign);
13431 return;
13434 /* If the only reason for frame_pointer_needed is that we conservatively
13435 assumed stack realignment might be needed, but in the end nothing that
13436 needed the stack alignment had been spilled, clear frame_pointer_needed
13437 and say we don't need stack realignment. */
13438 if (stack_realign
13439 && frame_pointer_needed
13440 && crtl->is_leaf
13441 && flag_omit_frame_pointer
13442 && crtl->sp_is_unchanging
13443 && !ix86_current_function_calls_tls_descriptor
13444 && !crtl->accesses_prior_frames
13445 && !cfun->calls_alloca
13446 && !crtl->calls_eh_return
13447 /* See ira_setup_eliminable_regset for the rationale. */
13448 && !(STACK_CHECK_MOVING_SP
13449 && flag_stack_check
13450 && flag_exceptions
13451 && cfun->can_throw_non_call_exceptions)
13452 && !ix86_frame_pointer_required ()
13453 && get_frame_size () == 0
13454 && ix86_nsaved_sseregs () == 0
13455 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13457 HARD_REG_SET set_up_by_prologue, prologue_used;
13458 basic_block bb;
13460 CLEAR_HARD_REG_SET (prologue_used);
13461 CLEAR_HARD_REG_SET (set_up_by_prologue);
13462 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13463 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13464 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13465 HARD_FRAME_POINTER_REGNUM);
13466 FOR_EACH_BB_FN (bb, cfun)
13468 rtx_insn *insn;
13469 FOR_BB_INSNS (bb, insn)
13470 if (NONDEBUG_INSN_P (insn)
13471 && requires_stack_frame_p (insn, prologue_used,
13472 set_up_by_prologue))
13474 crtl->stack_realign_needed = stack_realign;
13475 crtl->stack_realign_finalized = true;
13476 return;
13480 /* If drap has been set, but it actually isn't live at the start
13481 of the function, there is no reason to set it up. */
13482 if (crtl->drap_reg)
13484 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13485 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
13487 crtl->drap_reg = NULL_RTX;
13488 crtl->need_drap = false;
13491 else
13492 cfun->machine->no_drap_save_restore = true;
13494 frame_pointer_needed = false;
13495 stack_realign = false;
13496 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13497 crtl->stack_alignment_needed = incoming_stack_boundary;
13498 crtl->stack_alignment_estimated = incoming_stack_boundary;
13499 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13500 crtl->preferred_stack_boundary = incoming_stack_boundary;
13501 df_finish_pass (true);
13502 df_scan_alloc (NULL);
13503 df_scan_blocks ();
13504 df_compute_regs_ever_live (true);
13505 df_analyze ();
13508 crtl->stack_realign_needed = stack_realign;
13509 crtl->stack_realign_finalized = true;
13512 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13514 static void
13515 ix86_elim_entry_set_got (rtx reg)
13517 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13518 rtx_insn *c_insn = BB_HEAD (bb);
13519 if (!NONDEBUG_INSN_P (c_insn))
13520 c_insn = next_nonnote_nondebug_insn (c_insn);
13521 if (c_insn && NONJUMP_INSN_P (c_insn))
13523 rtx pat = PATTERN (c_insn);
13524 if (GET_CODE (pat) == PARALLEL)
13526 rtx vec = XVECEXP (pat, 0, 0);
13527 if (GET_CODE (vec) == SET
13528 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13529 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13530 delete_insn (c_insn);
13535 /* Expand the prologue into a bunch of separate insns. */
13537 void
13538 ix86_expand_prologue (void)
13540 struct machine_function *m = cfun->machine;
13541 rtx insn, t;
13542 struct ix86_frame frame;
13543 HOST_WIDE_INT allocate;
13544 bool int_registers_saved;
13545 bool sse_registers_saved;
13546 rtx static_chain = NULL_RTX;
13548 ix86_finalize_stack_realign_flags ();
13550 /* DRAP should not coexist with stack_realign_fp */
13551 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13553 memset (&m->fs, 0, sizeof (m->fs));
13555 /* Initialize CFA state for before the prologue. */
13556 m->fs.cfa_reg = stack_pointer_rtx;
13557 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13559 /* Track SP offset to the CFA. We continue tracking this after we've
13560 swapped the CFA register away from SP. In the case of re-alignment
13561 this is fudged; we're interested in offsets within the local frame. */
13562 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13563 m->fs.sp_valid = true;
13565 ix86_compute_frame_layout (&frame);
13567 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13569 /* We should have already generated an error for any use of
13570 ms_hook on a nested function. */
13571 gcc_checking_assert (!ix86_static_chain_on_stack);
13573 /* Check if profiling is active and we shall use the profiling-before-
13574 prologue variant. If so, sorry. */
13575 if (crtl->profile && flag_fentry != 0)
13576 sorry ("ms_hook_prologue attribute isn%'t compatible "
13577 "with -mfentry for 32-bit");
13579 /* In ix86_asm_output_function_label we emitted:
13580 8b ff movl.s %edi,%edi
13581 55 push %ebp
13582 8b ec movl.s %esp,%ebp
13584 This matches the hookable function prologue in Win32 API
13585 functions in Microsoft Windows XP Service Pack 2 and newer.
13586 Wine uses this to enable Windows apps to hook the Win32 API
13587 functions provided by Wine.
13589 What that means is that we've already set up the frame pointer. */
13591 if (frame_pointer_needed
13592 && !(crtl->drap_reg && crtl->stack_realign_needed))
13594 rtx push, mov;
13596 /* We've decided to use the frame pointer already set up.
13597 Describe this to the unwinder by pretending that both
13598 push and mov insns happen right here.
13600 Putting the unwind info here at the end of the ms_hook
13601 is done so that we can make absolutely certain we get
13602 the required byte sequence at the start of the function,
13603 rather than relying on an assembler that can produce
13604 the exact encoding required.
13606 However, it does mean (in the unpatched case) that we have
13607 a 1-insn window where the asynchronous unwind info is
13608 incorrect. But if we placed the unwind info at
13609 its correct location we would have incorrect unwind info
13610 in the patched case. Which is probably all moot since
13611 I don't expect Wine generates dwarf2 unwind info for the
13612 system libraries that use this feature. */
13614 insn = emit_insn (gen_blockage ());
13616 push = gen_push (hard_frame_pointer_rtx);
13617 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13618 stack_pointer_rtx);
13619 RTX_FRAME_RELATED_P (push) = 1;
13620 RTX_FRAME_RELATED_P (mov) = 1;
13622 RTX_FRAME_RELATED_P (insn) = 1;
13623 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13624 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13626 /* Note that gen_push incremented m->fs.cfa_offset, even
13627 though we didn't emit the push insn here. */
13628 m->fs.cfa_reg = hard_frame_pointer_rtx;
13629 m->fs.fp_offset = m->fs.cfa_offset;
13630 m->fs.fp_valid = true;
13632 else
13634 /* The frame pointer is not needed so pop %ebp again.
13635 This leaves us with a pristine state. */
13636 emit_insn (gen_pop (hard_frame_pointer_rtx));
13640 /* The first insn of a function that accepts its static chain on the
13641 stack is to push the register that would be filled in by a direct
13642 call. This insn will be skipped by the trampoline. */
13643 else if (ix86_static_chain_on_stack)
13645 static_chain = ix86_static_chain (cfun->decl, false);
13646 insn = emit_insn (gen_push (static_chain));
13647 emit_insn (gen_blockage ());
13649 /* We don't want to interpret this push insn as a register save,
13650 only as a stack adjustment. The real copy of the register as
13651 a save will be done later, if needed. */
13652 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13653 t = gen_rtx_SET (stack_pointer_rtx, t);
13654 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13655 RTX_FRAME_RELATED_P (insn) = 1;
13658 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13659 DRAP is needed and stack realignment is really needed after reload. */
13660 if (stack_realign_drap)
13662 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13664 /* Can't use DRAP in interrupt function. */
13665 if (cfun->machine->func_type != TYPE_NORMAL)
13666 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13667 "in interrupt service routine. This may be worked "
13668 "around by avoiding functions with aggregate return.");
13670 /* Only need to push parameter pointer reg if it is caller saved. */
13671 if (!call_used_regs[REGNO (crtl->drap_reg)])
13673 /* Push arg pointer reg */
13674 insn = emit_insn (gen_push (crtl->drap_reg));
13675 RTX_FRAME_RELATED_P (insn) = 1;
13678 /* Grab the argument pointer. */
13679 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13680 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13681 RTX_FRAME_RELATED_P (insn) = 1;
13682 m->fs.cfa_reg = crtl->drap_reg;
13683 m->fs.cfa_offset = 0;
13685 /* Align the stack. */
13686 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13687 stack_pointer_rtx,
13688 GEN_INT (-align_bytes)));
13689 RTX_FRAME_RELATED_P (insn) = 1;
13691 /* Replicate the return address on the stack so that return
13692 address can be reached via (argp - 1) slot. This is needed
13693 to implement macro RETURN_ADDR_RTX and intrinsic function
13694 expand_builtin_return_addr etc. */
13695 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13696 t = gen_frame_mem (word_mode, t);
13697 insn = emit_insn (gen_push (t));
13698 RTX_FRAME_RELATED_P (insn) = 1;
13700 /* For the purposes of frame and register save area addressing,
13701 we've started over with a new frame. */
13702 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13703 m->fs.realigned = true;
13705 if (static_chain)
13707 /* Replicate static chain on the stack so that static chain
13708 can be reached via (argp - 2) slot. This is needed for
13709 nested function with stack realignment. */
13710 insn = emit_insn (gen_push (static_chain));
13711 RTX_FRAME_RELATED_P (insn) = 1;
13715 int_registers_saved = (frame.nregs == 0);
13716 sse_registers_saved = (frame.nsseregs == 0);
13718 if (frame_pointer_needed && !m->fs.fp_valid)
13720 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13721 slower on all targets. Also sdb doesn't like it. */
13722 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13723 RTX_FRAME_RELATED_P (insn) = 1;
13725 /* Push registers now, before setting the frame pointer
13726 on SEH target. */
13727 if (!int_registers_saved
13728 && TARGET_SEH
13729 && !frame.save_regs_using_mov)
13731 ix86_emit_save_regs ();
13732 int_registers_saved = true;
13733 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13736 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13738 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13739 RTX_FRAME_RELATED_P (insn) = 1;
13741 if (m->fs.cfa_reg == stack_pointer_rtx)
13742 m->fs.cfa_reg = hard_frame_pointer_rtx;
13743 m->fs.fp_offset = m->fs.sp_offset;
13744 m->fs.fp_valid = true;
13748 if (!int_registers_saved)
13750 /* If saving registers via PUSH, do so now. */
13751 if (!frame.save_regs_using_mov)
13753 ix86_emit_save_regs ();
13754 int_registers_saved = true;
13755 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13758 /* When using the red zone we may start saving registers before allocating
13759 the stack frame, saving one cycle of the prologue. However, avoid
13760 doing this if we have to probe the stack; at least on x86_64 the
13761 stack probe can turn into a call that clobbers a red zone location. */
13762 else if (ix86_using_red_zone ()
13763 && (! TARGET_STACK_PROBE
13764 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13766 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13767 int_registers_saved = true;
13771 if (stack_realign_fp)
13773 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13774 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13776 /* The computation of the size of the re-aligned stack frame means
13777 that we must allocate the size of the register save area before
13778 performing the actual alignment. Otherwise we cannot guarantee
13779 that there's enough storage above the realignment point. */
13780 if (m->fs.sp_offset != frame.sse_reg_save_offset)
13781 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13782 GEN_INT (m->fs.sp_offset
13783 - frame.sse_reg_save_offset),
13784 -1, false);
13786 /* Align the stack. */
13787 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13788 stack_pointer_rtx,
13789 GEN_INT (-align_bytes)));
13791 /* For the purposes of register save area addressing, the stack
13792 pointer is no longer valid. As for the value of sp_offset,
13793 see ix86_compute_frame_layout, which we need to match in order
13794 to pass verification of stack_pointer_offset at the end. */
13795 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13796 m->fs.sp_valid = false;
13799 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13801 if (flag_stack_usage_info)
13803 /* We start to count from ARG_POINTER. */
13804 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13806 /* If it was realigned, take into account the fake frame. */
13807 if (stack_realign_drap)
13809 if (ix86_static_chain_on_stack)
13810 stack_size += UNITS_PER_WORD;
13812 if (!call_used_regs[REGNO (crtl->drap_reg)])
13813 stack_size += UNITS_PER_WORD;
13815 /* This over-estimates by 1 minimal-stack-alignment-unit but
13816 mitigates that by counting in the new return address slot. */
13817 current_function_dynamic_stack_size
13818 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13821 current_function_static_stack_size = stack_size;
13824 /* On SEH target with very large frame size, allocate an area to save
13825 SSE registers (as the very large allocation won't be described). */
13826 if (TARGET_SEH
13827 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13828 && !sse_registers_saved)
13830 HOST_WIDE_INT sse_size =
13831 frame.sse_reg_save_offset - frame.reg_save_offset;
13833 gcc_assert (int_registers_saved);
13835 /* No need to do stack checking as the area will be immediately
13836 written. */
13837 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13838 GEN_INT (-sse_size), -1,
13839 m->fs.cfa_reg == stack_pointer_rtx);
13840 allocate -= sse_size;
13841 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13842 sse_registers_saved = true;
13845 /* The stack has already been decremented by the instruction calling us
13846 so probe if the size is non-negative to preserve the protection area. */
13847 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
13849 /* We expect the registers to be saved when probes are used. */
13850 gcc_assert (int_registers_saved);
13852 if (STACK_CHECK_MOVING_SP)
13854 if (!(crtl->is_leaf && !cfun->calls_alloca
13855 && allocate <= PROBE_INTERVAL))
13857 ix86_adjust_stack_and_probe (allocate);
13858 allocate = 0;
13861 else
13863 HOST_WIDE_INT size = allocate;
13865 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13866 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
13868 if (TARGET_STACK_PROBE)
13870 if (crtl->is_leaf && !cfun->calls_alloca)
13872 if (size > PROBE_INTERVAL)
13873 ix86_emit_probe_stack_range (0, size);
13875 else
13876 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
13878 else
13880 if (crtl->is_leaf && !cfun->calls_alloca)
13882 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
13883 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
13884 size - STACK_CHECK_PROTECT);
13886 else
13887 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
13892 if (allocate == 0)
13894 else if (!ix86_target_stack_probe ()
13895 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13897 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13898 GEN_INT (-allocate), -1,
13899 m->fs.cfa_reg == stack_pointer_rtx);
13901 else
13903 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13904 rtx r10 = NULL;
13905 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13906 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13907 bool eax_live = ix86_eax_live_at_start_p ();
13908 bool r10_live = false;
13910 if (TARGET_64BIT)
13911 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13913 if (eax_live)
13915 insn = emit_insn (gen_push (eax));
13916 allocate -= UNITS_PER_WORD;
13917 /* Note that SEH directives need to continue tracking the stack
13918 pointer even after the frame pointer has been set up. */
13919 if (sp_is_cfa_reg || TARGET_SEH)
13921 if (sp_is_cfa_reg)
13922 m->fs.cfa_offset += UNITS_PER_WORD;
13923 RTX_FRAME_RELATED_P (insn) = 1;
13924 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13925 gen_rtx_SET (stack_pointer_rtx,
13926 plus_constant (Pmode, stack_pointer_rtx,
13927 -UNITS_PER_WORD)));
13931 if (r10_live)
13933 r10 = gen_rtx_REG (Pmode, R10_REG);
13934 insn = emit_insn (gen_push (r10));
13935 allocate -= UNITS_PER_WORD;
13936 if (sp_is_cfa_reg || TARGET_SEH)
13938 if (sp_is_cfa_reg)
13939 m->fs.cfa_offset += UNITS_PER_WORD;
13940 RTX_FRAME_RELATED_P (insn) = 1;
13941 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13942 gen_rtx_SET (stack_pointer_rtx,
13943 plus_constant (Pmode, stack_pointer_rtx,
13944 -UNITS_PER_WORD)));
13948 emit_move_insn (eax, GEN_INT (allocate));
13949 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13951 /* Use the fact that AX still contains ALLOCATE. */
13952 adjust_stack_insn = (Pmode == DImode
13953 ? gen_pro_epilogue_adjust_stack_di_sub
13954 : gen_pro_epilogue_adjust_stack_si_sub);
13956 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13957 stack_pointer_rtx, eax));
13959 if (sp_is_cfa_reg || TARGET_SEH)
13961 if (sp_is_cfa_reg)
13962 m->fs.cfa_offset += allocate;
13963 RTX_FRAME_RELATED_P (insn) = 1;
13964 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13965 gen_rtx_SET (stack_pointer_rtx,
13966 plus_constant (Pmode, stack_pointer_rtx,
13967 -allocate)));
13969 m->fs.sp_offset += allocate;
13971 /* Use stack_pointer_rtx for relative addressing so that code
13972 works for realigned stack, too. */
13973 if (r10_live && eax_live)
13975 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13976 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13977 gen_frame_mem (word_mode, t));
13978 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13979 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13980 gen_frame_mem (word_mode, t));
13982 else if (eax_live || r10_live)
13984 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13985 emit_move_insn (gen_rtx_REG (word_mode,
13986 (eax_live ? AX_REG : R10_REG)),
13987 gen_frame_mem (word_mode, t));
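      /* The push/reload dance above exists because the allocation helper
	 takes the size in %eax and clobbers it, and on 64-bit targets %r10
	 may be carrying the static chain; both are therefore saved on the
	 stack first and reloaded from the newly allocated frame afterwards.  */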
13990 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13992 /* If we haven't already set up the frame pointer, do so now. */
13993 if (frame_pointer_needed && !m->fs.fp_valid)
13995 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13996 GEN_INT (frame.stack_pointer_offset
13997 - frame.hard_frame_pointer_offset));
13998 insn = emit_insn (insn);
13999 RTX_FRAME_RELATED_P (insn) = 1;
14000 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14002 if (m->fs.cfa_reg == stack_pointer_rtx)
14003 m->fs.cfa_reg = hard_frame_pointer_rtx;
14004 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14005 m->fs.fp_valid = true;
14008 if (!int_registers_saved)
14009 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14010 if (!sse_registers_saved)
14011 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14013 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
14014 in PROLOGUE. */
14015 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14017 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14018 insn = emit_insn (gen_set_got (pic));
14019 RTX_FRAME_RELATED_P (insn) = 1;
14020 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14021 emit_insn (gen_prologue_use (pic));
14022 /* Delete the already emitted SET_GOT, if it exists and is allocated to
14023 REAL_PIC_OFFSET_TABLE_REGNUM. */
14024 ix86_elim_entry_set_got (pic);
14027 if (crtl->drap_reg && !crtl->stack_realign_needed)
14029 /* vDRAP is set up, but after reload it turns out stack realignment
14030 isn't necessary; here we emit the prologue to set up DRAP
14031 without the stack realignment adjustment. */
14032 t = choose_baseaddr (0);
14033 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14036 /* Prevent instructions from being scheduled into register save push
14037 sequence when access to the redzone area is done through frame pointer.
14038 The offset between the frame pointer and the stack pointer is calculated
14039 relative to the value of the stack pointer at the end of the function
14040 prologue, and moving instructions that access redzone area via frame
14041 pointer inside push sequence violates this assumption. */
14042 if (frame_pointer_needed && frame.red_zone_size)
14043 emit_insn (gen_memory_blockage ());
14045 /* SEH requires that the prologue end within 256 bytes of the start of
14046 the function. Prevent instruction schedules that would extend that.
14047 Further, prevent alloca modifications to the stack pointer from being
14048 combined with prologue modifications. */
14049 if (TARGET_SEH)
14050 emit_insn (gen_prologue_use (stack_pointer_rtx));
14053 /* Emit code to restore REG using a POP insn. */
14055 static void
14056 ix86_emit_restore_reg_using_pop (rtx reg)
14058 struct machine_function *m = cfun->machine;
14059 rtx_insn *insn = emit_insn (gen_pop (reg));
14061 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14062 m->fs.sp_offset -= UNITS_PER_WORD;
14064 if (m->fs.cfa_reg == crtl->drap_reg
14065 && REGNO (reg) == REGNO (crtl->drap_reg))
14067 /* Previously we'd represented the CFA as an expression
14068 like *(%ebp - 8). We've just popped that value from
14069 the stack, which means we need to reset the CFA to
14070 the drap register. This will remain until we restore
14071 the stack pointer. */
14072 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14073 RTX_FRAME_RELATED_P (insn) = 1;
14075 /* This means that the DRAP register is valid for addressing too. */
14076 m->fs.drap_valid = true;
14077 return;
14080 if (m->fs.cfa_reg == stack_pointer_rtx)
14082 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14083 x = gen_rtx_SET (stack_pointer_rtx, x);
14084 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14085 RTX_FRAME_RELATED_P (insn) = 1;
14087 m->fs.cfa_offset -= UNITS_PER_WORD;
14090 /* When the frame pointer is the CFA, and we pop it, we are
14091 swapping back to the stack pointer as the CFA. This happens
14092 for stack frames that don't allocate other data, so we assume
14093 the stack pointer is now pointing at the return address, i.e.
14094 the function entry state, which makes the offset be 1 word. */
14095 if (reg == hard_frame_pointer_rtx)
14097 m->fs.fp_valid = false;
14098 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14100 m->fs.cfa_reg = stack_pointer_rtx;
14101 m->fs.cfa_offset -= UNITS_PER_WORD;
14103 add_reg_note (insn, REG_CFA_DEF_CFA,
14104 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14105 GEN_INT (m->fs.cfa_offset)));
14106 RTX_FRAME_RELATED_P (insn) = 1;
14111 /* Emit code to restore saved registers using POP insns. */
14113 static void
14114 ix86_emit_restore_regs_using_pop (void)
14116 unsigned int regno;
14118 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14119 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false))
14120 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14123 /* Emit code and notes for the LEAVE instruction. */
14125 static void
14126 ix86_emit_leave (void)
14128 struct machine_function *m = cfun->machine;
14129 rtx_insn *insn = emit_insn (ix86_gen_leave ());
14131 ix86_add_queued_cfa_restore_notes (insn);
14133 gcc_assert (m->fs.fp_valid);
14134 m->fs.sp_valid = true;
14135 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14136 m->fs.fp_valid = false;
14138 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14140 m->fs.cfa_reg = stack_pointer_rtx;
14141 m->fs.cfa_offset = m->fs.sp_offset;
14143 add_reg_note (insn, REG_CFA_DEF_CFA,
14144 plus_constant (Pmode, stack_pointer_rtx,
14145 m->fs.sp_offset));
14146 RTX_FRAME_RELATED_P (insn) = 1;
14148 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14149 m->fs.fp_offset);
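/* Note: "leave" is equivalent to "mov %rbp, %rsp; pop %rbp", which is why,
   above, the stack pointer becomes valid at fp_offset - UNITS_PER_WORD and
   the frame pointer stops being valid.  */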
14152 /* Emit code to restore saved registers using MOV insns.
14153 First register is restored from CFA - CFA_OFFSET. */
14154 static void
14155 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14156 bool maybe_eh_return)
14158 struct machine_function *m = cfun->machine;
14159 unsigned int regno;
14161 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14162 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14164 rtx reg = gen_rtx_REG (word_mode, regno);
14165 rtx mem;
14166 rtx_insn *insn;
14168 mem = choose_baseaddr (cfa_offset);
14169 mem = gen_frame_mem (word_mode, mem);
14170 insn = emit_move_insn (reg, mem);
14172 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14174 /* Previously we'd represented the CFA as an expression
14175 like *(%ebp - 8). We've just popped that value from
14176 the stack, which means we need to reset the CFA to
14177 the drap register. This will remain until we restore
14178 the stack pointer. */
14179 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14180 RTX_FRAME_RELATED_P (insn) = 1;
14182 /* This means that the DRAP register is valid for addressing. */
14183 m->fs.drap_valid = true;
14185 else
14186 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14188 cfa_offset -= UNITS_PER_WORD;
14192 /* Emit code to restore saved registers using MOV insns.
14193 First register is restored from CFA - CFA_OFFSET. */
14194 static void
14195 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14196 bool maybe_eh_return)
14198 unsigned int regno;
14200 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14201 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14203 rtx reg = gen_rtx_REG (V4SFmode, regno);
14204 rtx mem;
14205 unsigned int align;
14207 mem = choose_baseaddr (cfa_offset);
14208 mem = gen_rtx_MEM (V4SFmode, mem);
14210 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
14211 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
14212 set_mem_align (mem, align);
14213 emit_insn (gen_rtx_SET (reg, mem));
14215 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14217 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14221 /* Restore function stack, frame, and registers. */
14223 void
14224 ix86_expand_epilogue (int style)
14226 struct machine_function *m = cfun->machine;
14227 struct machine_frame_state frame_state_save = m->fs;
14228 struct ix86_frame frame;
14229 bool restore_regs_via_mov;
14230 bool using_drap;
14232 ix86_finalize_stack_realign_flags ();
14233 ix86_compute_frame_layout (&frame);
14235 m->fs.sp_valid = (!frame_pointer_needed
14236 || (crtl->sp_is_unchanging
14237 && !stack_realign_fp));
14238 gcc_assert (!m->fs.sp_valid
14239 || m->fs.sp_offset == frame.stack_pointer_offset);
14241 /* The FP must be valid if the frame pointer is present. */
14242 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14243 gcc_assert (!m->fs.fp_valid
14244 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14246 /* We must have *some* valid pointer to the stack frame. */
14247 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14249 /* The DRAP is never valid at this point. */
14250 gcc_assert (!m->fs.drap_valid);
14252 /* See the comment about red zone and frame
14253 pointer usage in ix86_expand_prologue. */
14254 if (frame_pointer_needed && frame.red_zone_size)
14255 emit_insn (gen_memory_blockage ());
14257 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14258 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14260 /* Determine the CFA offset of the end of the red-zone. */
14261 m->fs.red_zone_offset = 0;
14262 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14264 /* The red-zone begins below the return address. */
14265 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
14267 /* When the register save area is in the aligned portion of
14268 the stack, determine the maximum runtime displacement that
14269 matches up with the aligned frame. */
14270 if (stack_realign_drap)
14271 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14272 + UNITS_PER_WORD);
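/* Worked example (editor's note, realignment figure hypothetical): on
   x86-64, RED_ZONE_SIZE == 128 and UNITS_PER_WORD == 8, so the plain case
   above gives red_zone_offset = 128 + 8 = 136.  If the frame was realigned
   via DRAP to a 32-byte boundary, the adjustment yields
   136 - (32 + 8) = 96.  */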
14275 /* Special care must be taken for the normal return case of a function
14276 using eh_return: the eax and edx registers are marked as saved, but
14277 not restored along this path. Adjust the save location to match. */
14278 if (crtl->calls_eh_return && style != 2)
14279 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
14281 /* EH_RETURN requires the use of moves to function properly. */
14282 if (crtl->calls_eh_return)
14283 restore_regs_via_mov = true;
14284 /* SEH requires the use of pops to identify the epilogue. */
14285 else if (TARGET_SEH)
14286 restore_regs_via_mov = false;
14287 /* If we're only restoring one register and sp is not valid, then
14288 use a move instruction to restore the register, since it's
14289 less work than reloading sp and popping the register. */
14290 else if (!m->fs.sp_valid && frame.nregs <= 1)
14291 restore_regs_via_mov = true;
14292 else if (TARGET_EPILOGUE_USING_MOVE
14293 && cfun->machine->use_fast_prologue_epilogue
14294 && (frame.nregs > 1
14295 || m->fs.sp_offset != frame.reg_save_offset))
14296 restore_regs_via_mov = true;
14297 else if (frame_pointer_needed
14298 && !frame.nregs
14299 && m->fs.sp_offset != frame.reg_save_offset)
14300 restore_regs_via_mov = true;
14301 else if (frame_pointer_needed
14302 && TARGET_USE_LEAVE
14303 && cfun->machine->use_fast_prologue_epilogue
14304 && frame.nregs == 1)
14305 restore_regs_via_mov = true;
14306 else
14307 restore_regs_via_mov = false;
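/* Example of the selection above (editor's sketch): a function using
   eh_return always restores via MOV; a function compiled for SEH always
   restores via POP; and a frame-pointer function with exactly one saved
   register on a TARGET_USE_LEAVE CPU with fast prologues also prefers MOV,
   leaving the final "leave" to do the rest.  */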
14309 if (restore_regs_via_mov || frame.nsseregs)
14311 /* Ensure that the entire register save area is addressable via
14312 the stack pointer, if we will restore via sp. */
14313 if (TARGET_64BIT
14314 && m->fs.sp_offset > 0x7fffffff
14315 && !(m->fs.fp_valid || m->fs.drap_valid)
14316 && (frame.nsseregs + frame.nregs) != 0)
14318 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14319 GEN_INT (m->fs.sp_offset
14320 - frame.sse_reg_save_offset),
14321 style,
14322 m->fs.cfa_reg == stack_pointer_rtx);
14326 /* If there are any SSE registers to restore, then we have to do it
14327 via moves, since there's obviously no pop for SSE regs. */
14328 if (frame.nsseregs)
14329 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14330 style == 2);
14332 if (restore_regs_via_mov)
14334 rtx t;
14336 if (frame.nregs)
14337 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14339 /* eh_return epilogues need %ecx added to the stack pointer. */
14340 if (style == 2)
14342 rtx sa = EH_RETURN_STACKADJ_RTX;
14343 rtx_insn *insn;
14345 /* %ecx can't be used for both DRAP register and eh_return. */
14346 if (crtl->drap_reg)
14347 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14349 /* regparm nested functions don't work with eh_return. */
14350 gcc_assert (!ix86_static_chain_on_stack);
14352 if (frame_pointer_needed)
14354 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14355 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14356 emit_insn (gen_rtx_SET (sa, t));
14358 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14359 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14361 /* Note that we use SA as a temporary CFA, as the return
14362 address is at the proper place relative to it. We
14363 pretend this happens at the FP restore insn because
14364 prior to this insn the FP would be stored at the wrong
14365 offset relative to SA, and after this insn we have no
14366 other reasonable register to use for the CFA. We don't
14367 bother resetting the CFA to the SP for the duration of
14368 the return insn. */
14369 add_reg_note (insn, REG_CFA_DEF_CFA,
14370 plus_constant (Pmode, sa, UNITS_PER_WORD));
14371 ix86_add_queued_cfa_restore_notes (insn);
14372 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14373 RTX_FRAME_RELATED_P (insn) = 1;
14375 m->fs.cfa_reg = sa;
14376 m->fs.cfa_offset = UNITS_PER_WORD;
14377 m->fs.fp_valid = false;
14379 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14380 const0_rtx, style, false);
14382 else
14384 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14385 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14386 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14387 ix86_add_queued_cfa_restore_notes (insn);
14389 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14390 if (m->fs.cfa_offset != UNITS_PER_WORD)
14392 m->fs.cfa_offset = UNITS_PER_WORD;
14393 add_reg_note (insn, REG_CFA_DEF_CFA,
14394 plus_constant (Pmode, stack_pointer_rtx,
14395 UNITS_PER_WORD));
14396 RTX_FRAME_RELATED_P (insn) = 1;
14399 m->fs.sp_offset = UNITS_PER_WORD;
14400 m->fs.sp_valid = true;
14403 else
14405 /* SEH requires that the function end with (1) a stack adjustment
14406 if necessary, (2) a sequence of pops, and (3) a return or
14407 jump instruction. Prevent insns from the function body from
14408 being scheduled into this sequence. */
14409 if (TARGET_SEH)
14411 /* Prevent a catch region from being adjacent to the standard
14412 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14413 several other flags that would be interesting to test are
14414 set up yet. */
14415 if (flag_non_call_exceptions)
14416 emit_insn (gen_nops (const1_rtx));
14417 else
14418 emit_insn (gen_blockage ());
14421 /* The first step is to deallocate the stack frame so that we can
14422 pop the registers. Also do it on SEH targets for a very large
14423 frame, as the emitted instructions aren't allowed by the ABI in
14424 epilogues. */
14425 if (!m->fs.sp_valid
14426 || (TARGET_SEH
14427 && (m->fs.sp_offset - frame.reg_save_offset
14428 >= SEH_MAX_FRAME_SIZE)))
14430 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14431 GEN_INT (m->fs.fp_offset
14432 - frame.reg_save_offset),
14433 style, false);
14435 else if (m->fs.sp_offset != frame.reg_save_offset)
14437 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14438 GEN_INT (m->fs.sp_offset
14439 - frame.reg_save_offset),
14440 style,
14441 m->fs.cfa_reg == stack_pointer_rtx);
14444 ix86_emit_restore_regs_using_pop ();
14447 /* If we used a frame pointer and haven't already got rid of it,
14448 then do so now. */
14449 if (m->fs.fp_valid)
14451 /* If the stack pointer is valid and pointing at the frame
14452 pointer store address, then we only need a pop. */
14453 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
14454 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14455 /* Leave results in shorter dependency chains on CPUs that are
14456 able to grok it fast. */
14457 else if (TARGET_USE_LEAVE
14458 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14459 || !cfun->machine->use_fast_prologue_epilogue)
14460 ix86_emit_leave ();
14461 else
14463 pro_epilogue_adjust_stack (stack_pointer_rtx,
14464 hard_frame_pointer_rtx,
14465 const0_rtx, style, !using_drap);
14466 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14470 if (using_drap)
14472 int param_ptr_offset = UNITS_PER_WORD;
14473 rtx_insn *insn;
14475 gcc_assert (stack_realign_drap);
14477 if (ix86_static_chain_on_stack)
14478 param_ptr_offset += UNITS_PER_WORD;
14479 if (!call_used_regs[REGNO (crtl->drap_reg)])
14480 param_ptr_offset += UNITS_PER_WORD;
14482 insn = emit_insn (gen_rtx_SET
14483 (stack_pointer_rtx,
14484 gen_rtx_PLUS (Pmode,
14485 crtl->drap_reg,
14486 GEN_INT (-param_ptr_offset))));
14487 m->fs.cfa_reg = stack_pointer_rtx;
14488 m->fs.cfa_offset = param_ptr_offset;
14489 m->fs.sp_offset = param_ptr_offset;
14490 m->fs.realigned = false;
14492 add_reg_note (insn, REG_CFA_DEF_CFA,
14493 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14494 GEN_INT (param_ptr_offset)));
14495 RTX_FRAME_RELATED_P (insn) = 1;
14497 if (!call_used_regs[REGNO (crtl->drap_reg)])
14498 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14501 /* At this point the stack pointer must be valid, and we must have
14502 restored all of the registers. We may not have deallocated the
14503 entire stack frame. We've delayed this until now because it may
14504 be possible to merge the local stack deallocation with the
14505 deallocation forced by ix86_static_chain_on_stack. */
14506 gcc_assert (m->fs.sp_valid);
14507 gcc_assert (!m->fs.fp_valid);
14508 gcc_assert (!m->fs.realigned);
14509 if (m->fs.sp_offset != UNITS_PER_WORD)
14511 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14512 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14513 style, true);
14515 else
14516 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14518 /* Sibcall epilogues don't want a return instruction. */
14519 if (style == 0)
14521 m->fs = frame_state_save;
14522 return;
14525 if (cfun->machine->func_type != TYPE_NORMAL)
14527 /* Return with the "IRET" instruction from an interrupt handler.
14528 Pop the 'ERROR_CODE' off the stack before the 'IRET'
14529 instruction in an exception handler. */
14530 if (cfun->machine->func_type == TYPE_EXCEPTION)
14532 rtx r = plus_constant (Pmode, stack_pointer_rtx,
14533 UNITS_PER_WORD);
14534 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
14536 emit_jump_insn (gen_interrupt_return ());
14538 else if (crtl->args.pops_args && crtl->args.size)
14540 rtx popc = GEN_INT (crtl->args.pops_args);
14542 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
14543 address, do an explicit add, and jump indirectly to the caller. */
14545 if (crtl->args.pops_args >= 65536)
14547 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14548 rtx_insn *insn;
14550 /* There is no "pascal" calling convention in any 64bit ABI. */
14551 gcc_assert (!TARGET_64BIT);
14553 insn = emit_insn (gen_pop (ecx));
14554 m->fs.cfa_offset -= UNITS_PER_WORD;
14555 m->fs.sp_offset -= UNITS_PER_WORD;
14557 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14558 x = gen_rtx_SET (stack_pointer_rtx, x);
14559 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14560 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14561 RTX_FRAME_RELATED_P (insn) = 1;
14563 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14564 popc, -1, true);
14565 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14567 else
14568 emit_jump_insn (gen_simple_return_pop_internal (popc));
14570 else
14571 emit_jump_insn (gen_simple_return_internal ());
14573 /* Restore the state back to the state from the prologue,
14574 so that it's correct for the next epilogue. */
14575 m->fs = frame_state_save;
14578 /* Reset from the function's potential modifications. */
14580 static void
14581 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
14583 if (pic_offset_table_rtx
14584 && !ix86_use_pseudo_pic_reg ())
14585 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14586 #if TARGET_MACHO
14587 /* Mach-O doesn't support labels at the end of objects, so if
14588 it looks like we might want one, insert a NOP. */
14590 rtx_insn *insn = get_last_insn ();
14591 rtx_insn *deleted_debug_label = NULL;
14592 while (insn
14593 && NOTE_P (insn)
14594 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14596 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14597 notes; only set their CODE_LABEL_NUMBER to -1,
14598 otherwise there would be code generation differences
14599 between -g and -g0. */
14600 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14601 deleted_debug_label = insn;
14602 insn = PREV_INSN (insn);
14604 if (insn
14605 && (LABEL_P (insn)
14606 || (NOTE_P (insn)
14607 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
14608 fputs ("\tnop\n", file);
14609 else if (deleted_debug_label)
14610 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14611 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14612 CODE_LABEL_NUMBER (insn) = -1;
14614 #endif
14618 /* Return a scratch register to use in the split stack prologue. The
14619 split stack prologue is used for -fsplit-stack. It is the first
14620 instructions in the function, even before the regular prologue.
14621 The scratch register can be any caller-saved register which is not
14622 used for parameters or for the static chain. */
14624 static unsigned int
14625 split_stack_prologue_scratch_regno (void)
14627 if (TARGET_64BIT)
14628 return R11_REG;
14629 else
14631 bool is_fastcall, is_thiscall;
14632 int regparm;
14634 is_fastcall = (lookup_attribute ("fastcall",
14635 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14636 != NULL);
14637 is_thiscall = (lookup_attribute ("thiscall",
14638 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14639 != NULL);
14640 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14642 if (is_fastcall)
14644 if (DECL_STATIC_CHAIN (cfun->decl))
14646 sorry ("-fsplit-stack does not support fastcall with "
14647 "nested function");
14648 return INVALID_REGNUM;
14650 return AX_REG;
14652 else if (is_thiscall)
14654 if (!DECL_STATIC_CHAIN (cfun->decl))
14655 return DX_REG;
14656 return AX_REG;
14658 else if (regparm < 3)
14660 if (!DECL_STATIC_CHAIN (cfun->decl))
14661 return CX_REG;
14662 else
14664 if (regparm >= 2)
14666 sorry ("-fsplit-stack does not support 2 register "
14667 "parameters for a nested function");
14668 return INVALID_REGNUM;
14670 return DX_REG;
14673 else
14675 /* FIXME: We could make this work by pushing a register
14676 around the addition and comparison. */
14677 sorry ("-fsplit-stack does not support 3 register parameters");
14678 return INVALID_REGNUM;
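/* Editor's summary example of the selection above: a 64-bit function gets
   %r11; a 32-bit fastcall function without a static chain gets %eax; a
   thiscall function gets %edx (or %eax when a static chain is present); an
   ordinary function with fewer than 3 register parameters gets %ecx,
   falling back to %edx when a static chain occupies it.  */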
14683 /* A SYMBOL_REF for the function which allocates new stack space for
14684 -fsplit-stack. */
14686 static GTY(()) rtx split_stack_fn;
14688 /* A SYMBOL_REF for the __morestack function when using the large
14689 model. */
14691 static GTY(()) rtx split_stack_fn_large;
14693 /* Handle -fsplit-stack. These are the first instructions in the
14694 function, even before the regular prologue. */
14696 void
14697 ix86_expand_split_stack_prologue (void)
14699 struct ix86_frame frame;
14700 HOST_WIDE_INT allocate;
14701 unsigned HOST_WIDE_INT args_size;
14702 rtx_code_label *label;
14703 rtx limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
14704 rtx scratch_reg = NULL_RTX;
14705 rtx_code_label *varargs_label = NULL;
14706 rtx fn;
14708 gcc_assert (flag_split_stack && reload_completed);
14710 ix86_finalize_stack_realign_flags ();
14711 ix86_compute_frame_layout (&frame);
14712 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14714 /* This is the label we will branch to if we have enough stack
14715 space. We expect the basic block reordering pass to reverse this
14716 branch if optimizing, so that we branch in the unlikely case. */
14717 label = gen_label_rtx ();
14719 /* We need to compare the stack pointer minus the frame size with
14720 the stack boundary in the TCB. The stack boundary always gives
14721 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14722 can compare directly. Otherwise we need to do an addition. */
14724 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
14725 UNSPEC_STACK_CHECK);
14726 limit = gen_rtx_CONST (Pmode, limit);
14727 limit = gen_rtx_MEM (Pmode, limit);
14728 if (allocate < SPLIT_STACK_AVAILABLE)
14729 current = stack_pointer_rtx;
14730 else
14732 unsigned int scratch_regno;
14733 rtx offset;
14735 /* We need a scratch register to hold the stack pointer minus
14736 the required frame size. Since this is the very start of the
14737 function, the scratch register can be any caller-saved
14738 register which is not used for parameters. */
14739 offset = GEN_INT (- allocate);
14740 scratch_regno = split_stack_prologue_scratch_regno ();
14741 if (scratch_regno == INVALID_REGNUM)
14742 return;
14743 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14744 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14746 /* We don't use ix86_gen_add3 in this case because it will
14747 want to split to lea, but when not optimizing the insn
14748 will not be split after this point. */
14749 emit_insn (gen_rtx_SET (scratch_reg,
14750 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14751 offset)));
14753 else
14755 emit_move_insn (scratch_reg, offset);
14756 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14757 stack_pointer_rtx));
14759 current = scratch_reg;
14762 ix86_expand_branch (GEU, current, limit, label);
14763 jump_insn = get_last_insn ();
14764 JUMP_LABEL (jump_insn) = label;
14766 /* Mark the jump as very likely to be taken. */
14767 add_int_reg_note (jump_insn, REG_BR_PROB,
14768 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
14770 if (split_stack_fn == NULL_RTX)
14772 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14773 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14775 fn = split_stack_fn;
14777 /* Get more stack space. We pass in the desired stack space and the
14778 size of the arguments to copy to the new stack. In 32-bit mode
14779 we push the parameters; __morestack will return on a new stack
14780 anyhow. In 64-bit mode we pass the parameters in r10 and
14781 r11. */
14782 allocate_rtx = GEN_INT (allocate);
14783 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14784 call_fusage = NULL_RTX;
14785 if (TARGET_64BIT)
14787 rtx reg10, reg11;
14789 reg10 = gen_rtx_REG (Pmode, R10_REG);
14790 reg11 = gen_rtx_REG (Pmode, R11_REG);
14792 /* If this function uses a static chain, it will be in %r10.
14793 Preserve it across the call to __morestack. */
14794 if (DECL_STATIC_CHAIN (cfun->decl))
14796 rtx rax;
14798 rax = gen_rtx_REG (word_mode, AX_REG);
14799 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14800 use_reg (&call_fusage, rax);
14803 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14804 && !TARGET_PECOFF)
14806 HOST_WIDE_INT argval;
14808 gcc_assert (Pmode == DImode);
14809 /* When using the large model we need to load the address
14810 into a register, and we've run out of registers. So we
14811 switch to a different calling convention, and we call a
14812 different function: __morestack_large_model. We pass the
14813 argument size in the upper 32 bits of r10 and pass the
14814 frame size in the lower 32 bits. */
14815 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14816 gcc_assert ((args_size & 0xffffffff) == args_size);
14818 if (split_stack_fn_large == NULL_RTX)
14820 split_stack_fn_large =
14821 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14822 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14824 if (ix86_cmodel == CM_LARGE_PIC)
14826 rtx_code_label *label;
14827 rtx x;
14829 label = gen_label_rtx ();
14830 emit_label (label);
14831 LABEL_PRESERVE_P (label) = 1;
14832 emit_insn (gen_set_rip_rex64 (reg10, label));
14833 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14834 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14835 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14836 UNSPEC_GOT);
14837 x = gen_rtx_CONST (Pmode, x);
14838 emit_move_insn (reg11, x);
14839 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14840 x = gen_const_mem (Pmode, x);
14841 emit_move_insn (reg11, x);
14843 else
14844 emit_move_insn (reg11, split_stack_fn_large);
14846 fn = reg11;
14848 argval = ((args_size << 16) << 16) + allocate;
14849 emit_move_insn (reg10, GEN_INT (argval));
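/* Worked example (editor's note, values hypothetical): with
   args_size == 0x20 and allocate == 0x1000, the double shift above forms
   argval == 0x0000002000001000, i.e. the argument size in the upper
   32 bits of %r10 and the frame size in the lower 32 bits, which is the
   layout __morestack_large_model expects.  */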
14851 else
14853 emit_move_insn (reg10, allocate_rtx);
14854 emit_move_insn (reg11, GEN_INT (args_size));
14855 use_reg (&call_fusage, reg11);
14858 use_reg (&call_fusage, reg10);
14860 else
14862 emit_insn (gen_push (GEN_INT (args_size)));
14863 emit_insn (gen_push (allocate_rtx));
14865 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14866 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14867 NULL_RTX, false);
14868 add_function_usage_to (call_insn, call_fusage);
14870 /* In order to make call/return prediction work right, we now need
14871 to execute a return instruction. See
14872 libgcc/config/i386/morestack.S for the details on how this works.
14874 For flow purposes gcc must not see this as a return
14875 instruction--we need control flow to continue at the subsequent
14876 label. Therefore, we use an unspec. */
14877 gcc_assert (crtl->args.pops_args < 65536);
14878 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14880 /* If we are in 64-bit mode and this function uses a static chain,
14881 we saved %r10 in %rax before calling __morestack. */
14882 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14883 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14884 gen_rtx_REG (word_mode, AX_REG));
14886 /* If this function calls va_start, we need to store a pointer to
14887 the arguments on the old stack, because they may not have been
14888 all copied to the new stack. At this point the old stack can be
14889 found at the frame pointer value used by __morestack, because
14890 __morestack has set that up before calling back to us. Here we
14891 store that pointer in a scratch register, and in
14892 ix86_expand_prologue we store the scratch register in a stack
14893 slot. */
14894 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14896 unsigned int scratch_regno;
14897 rtx frame_reg;
14898 int words;
14900 scratch_regno = split_stack_prologue_scratch_regno ();
14901 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14902 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14904 /* 64-bit:
14905 fp -> old fp value
14906 return address within this function
14907 return address of caller of this function
14908 stack arguments
14909 So we add three words to get to the stack arguments.
14911 32-bit:
14912 fp -> old fp value
14913 return address within this function
14914 first argument to __morestack
14915 second argument to __morestack
14916 return address of caller of this function
14917 stack arguments
14918 So we add five words to get to the stack arguments. */
14920 words = TARGET_64BIT ? 3 : 5;
14921 emit_insn (gen_rtx_SET (scratch_reg,
14922 gen_rtx_PLUS (Pmode, frame_reg,
14923 GEN_INT (words * UNITS_PER_WORD))));
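/* Editor's illustration: in 64-bit mode words == 3 and UNITS_PER_WORD == 8,
   so the scratch register is set to %rbp + 24, just past the three words
   listed above; in 32-bit mode it is %ebp + 20 (5 words of 4 bytes).  */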
14925 varargs_label = gen_label_rtx ();
14926 emit_jump_insn (gen_jump (varargs_label));
14927 JUMP_LABEL (get_last_insn ()) = varargs_label;
14929 emit_barrier ();
14932 emit_label (label);
14933 LABEL_NUSES (label) = 1;
14935 /* If this function calls va_start, we now have to set the scratch
14936 register for the case where we do not call __morestack. In this
14937 case we need to set it based on the stack pointer. */
14938 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14940 emit_insn (gen_rtx_SET (scratch_reg,
14941 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14942 GEN_INT (UNITS_PER_WORD))));
14944 emit_label (varargs_label);
14945 LABEL_NUSES (varargs_label) = 1;
14949 /* We may have to tell the dataflow pass that the split stack prologue
14950 is initializing a scratch register. */
14952 static void
14953 ix86_live_on_entry (bitmap regs)
14955 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14957 gcc_assert (flag_split_stack);
14958 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14962 /* Extract the parts of an RTL expression that is a valid memory address
14963 for an instruction. Return 0 if the structure of the address is
14964 grossly off. Return -1 if the address contains ASHIFT, so it is not
14965 strictly valid, but still used for computing the length of an lea instruction. */
14967 int
14968 ix86_decompose_address (rtx addr, struct ix86_address *out)
14970 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14971 rtx base_reg, index_reg;
14972 HOST_WIDE_INT scale = 1;
14973 rtx scale_rtx = NULL_RTX;
14974 rtx tmp;
14975 int retval = 1;
14976 addr_space_t seg = ADDR_SPACE_GENERIC;
14978 /* Allow zero-extended SImode addresses,
14979 they will be emitted with addr32 prefix. */
14980 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14982 if (GET_CODE (addr) == ZERO_EXTEND
14983 && GET_MODE (XEXP (addr, 0)) == SImode)
14985 addr = XEXP (addr, 0);
14986 if (CONST_INT_P (addr))
14987 return 0;
14989 else if (GET_CODE (addr) == AND
14990 && const_32bit_mask (XEXP (addr, 1), DImode))
14992 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14993 if (addr == NULL_RTX)
14994 return 0;
14996 if (CONST_INT_P (addr))
14997 return 0;
15001 /* Allow SImode subregs of DImode addresses,
15002 they will be emitted with addr32 prefix. */
15003 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15005 if (SUBREG_P (addr)
15006 && GET_MODE (SUBREG_REG (addr)) == DImode)
15008 addr = SUBREG_REG (addr);
15009 if (CONST_INT_P (addr))
15010 return 0;
15014 if (REG_P (addr))
15015 base = addr;
15016 else if (SUBREG_P (addr))
15018 if (REG_P (SUBREG_REG (addr)))
15019 base = addr;
15020 else
15021 return 0;
15023 else if (GET_CODE (addr) == PLUS)
15025 rtx addends[4], op;
15026 int n = 0, i;
15028 op = addr;
15031 if (n >= 4)
15032 return 0;
15033 addends[n++] = XEXP (op, 1);
15034 op = XEXP (op, 0);
15036 while (GET_CODE (op) == PLUS);
15037 if (n >= 4)
15038 return 0;
15039 addends[n] = op;
15041 for (i = n; i >= 0; --i)
15043 op = addends[i];
15044 switch (GET_CODE (op))
15046 case MULT:
15047 if (index)
15048 return 0;
15049 index = XEXP (op, 0);
15050 scale_rtx = XEXP (op, 1);
15051 break;
15053 case ASHIFT:
15054 if (index)
15055 return 0;
15056 index = XEXP (op, 0);
15057 tmp = XEXP (op, 1);
15058 if (!CONST_INT_P (tmp))
15059 return 0;
15060 scale = INTVAL (tmp);
15061 if ((unsigned HOST_WIDE_INT) scale > 3)
15062 return 0;
15063 scale = 1 << scale;
15064 break;
15066 case ZERO_EXTEND:
15067 op = XEXP (op, 0);
15068 if (GET_CODE (op) != UNSPEC)
15069 return 0;
15070 /* FALLTHRU */
15072 case UNSPEC:
15073 if (XINT (op, 1) == UNSPEC_TP
15074 && TARGET_TLS_DIRECT_SEG_REFS
15075 && seg == ADDR_SPACE_GENERIC)
15076 seg = DEFAULT_TLS_SEG_REG;
15077 else
15078 return 0;
15079 break;
15081 case SUBREG:
15082 if (!REG_P (SUBREG_REG (op)))
15083 return 0;
15084 /* FALLTHRU */
15086 case REG:
15087 if (!base)
15088 base = op;
15089 else if (!index)
15090 index = op;
15091 else
15092 return 0;
15093 break;
15095 case CONST:
15096 case CONST_INT:
15097 case SYMBOL_REF:
15098 case LABEL_REF:
15099 if (disp)
15100 return 0;
15101 disp = op;
15102 break;
15104 default:
15105 return 0;
15109 else if (GET_CODE (addr) == MULT)
15111 index = XEXP (addr, 0); /* index*scale */
15112 scale_rtx = XEXP (addr, 1);
15114 else if (GET_CODE (addr) == ASHIFT)
15116 /* We're called for lea too, which implements ashift on occasion. */
15117 index = XEXP (addr, 0);
15118 tmp = XEXP (addr, 1);
15119 if (!CONST_INT_P (tmp))
15120 return 0;
15121 scale = INTVAL (tmp);
15122 if ((unsigned HOST_WIDE_INT) scale > 3)
15123 return 0;
15124 scale = 1 << scale;
15125 retval = -1;
15127 else
15128 disp = addr; /* displacement */
15130 if (index)
15132 if (REG_P (index))
15134 else if (SUBREG_P (index)
15135 && REG_P (SUBREG_REG (index)))
15137 else
15138 return 0;
15141 /* Extract the integral value of scale. */
15142 if (scale_rtx)
15144 if (!CONST_INT_P (scale_rtx))
15145 return 0;
15146 scale = INTVAL (scale_rtx);
15149 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15150 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15152 /* Avoid useless 0 displacement. */
15153 if (disp == const0_rtx && (base || index))
15154 disp = NULL_RTX;
15156 /* Allow arg pointer and stack pointer as index if there is no scaling. */
15157 if (base_reg && index_reg && scale == 1
15158 && (index_reg == arg_pointer_rtx
15159 || index_reg == frame_pointer_rtx
15160 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
15162 std::swap (base, index);
15163 std::swap (base_reg, index_reg);
15166 /* Special case: %ebp cannot be encoded as a base without a displacement.
15167 Similarly %r13. */
15168 if (!disp
15169 && base_reg
15170 && (base_reg == hard_frame_pointer_rtx
15171 || base_reg == frame_pointer_rtx
15172 || base_reg == arg_pointer_rtx
15173 || (REG_P (base_reg)
15174 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
15175 || REGNO (base_reg) == R13_REG))))
15176 disp = const0_rtx;
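/* Editor's note on the encoding constraint above: in the ModR/M byte,
   mod == 00 with a base of 101b (EBP/R13) is repurposed to mean "disp32
   with no base", so a plain [%ebp] or [%r13] must instead be emitted as
   [%ebp+0] / [%r13+0] using an 8-bit zero displacement.  */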
15178 /* Special case: on K6, [%esi] makes the instruction vector decoded.
15179 Avoid this by transforming to [%esi+0].
15180 Reload calls address legitimization without cfun defined, so we need
15181 to test cfun for being non-NULL. */
15182 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15183 && base_reg && !index_reg && !disp
15184 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
15185 disp = const0_rtx;
15187 /* Special case: encode reg+reg instead of reg*2. */
15188 if (!base && index && scale == 2)
15189 base = index, base_reg = index_reg, scale = 1;
15191 /* Special case: scaling cannot be encoded without base or displacement. */
15192 if (!base && !disp && index && scale != 1)
15193 disp = const0_rtx;
15195 out->base = base;
15196 out->index = index;
15197 out->disp = disp;
15198 out->scale = scale;
15199 out->seg = seg;
15201 return retval;
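/* Editor's example of the decomposition above (hypothetical RTL): an
   address such as

     (plus:DI (plus:DI (mult:DI (reg:DI %rbx) (const_int 4))
		       (reg:DI %rax))
	      (const_int 8))

   fills OUT with base = %rax, index = %rbx, scale = 4, disp = 8 and
   returns 1.  A bare (ashift:DI (reg:DI %rbx) (const_int 2)) decomposes
   to index = %rbx, scale = 4 but returns -1, marking it as usable only
   for lea length computation.  */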
15204 /* Return cost of the memory address x.
15205 For i386, it is better to use a complex address than let gcc copy
15206 the address into a reg and make a new pseudo. But not if the address
15207 requires two regs - that would mean more pseudos with longer
15208 lifetimes. */
15209 static int
15210 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15212 struct ix86_address parts;
15213 int cost = 1;
15214 int ok = ix86_decompose_address (x, &parts);
15216 gcc_assert (ok);
15218 if (parts.base && SUBREG_P (parts.base))
15219 parts.base = SUBREG_REG (parts.base);
15220 if (parts.index && SUBREG_P (parts.index))
15221 parts.index = SUBREG_REG (parts.index);
15223 /* Attempt to minimize number of registers in the address by increasing
15224 address cost for each used register. We don't increase address cost
15225 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
15226 is not invariant itself it most likely means that base or index is not
15227 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15228 which is not profitable for x86. */
15229 if (parts.base
15230 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15231 && (current_pass->type == GIMPLE_PASS
15232 || !pic_offset_table_rtx
15233 || !REG_P (parts.base)
15234 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15235 cost++;
15237 if (parts.index
15238 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15239 && (current_pass->type == GIMPLE_PASS
15240 || !pic_offset_table_rtx
15241 || !REG_P (parts.index)
15242 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15243 cost++;
15245 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15246 since its predecode logic can't detect the length of instructions
15247 and decoding degenerates to vector decode. Increase the cost of such
15248 addresses here. The penalty is at least 2 cycles. It may be worthwhile
15249 to split such addresses or even refuse them at all.
15251 The following addressing modes are affected:
15252 [base+scale*index]
15253 [scale*index+disp]
15254 [base+index]
15256 The first and last cases may be avoidable by explicitly coding the zero in
15257 the memory address, but I don't have an AMD-K6 machine handy to check this
15258 theory. */
15260 if (TARGET_K6
15261 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15262 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15263 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15264 cost += 10;
15266 return cost;
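/* Editor's example of the cost computation above (hypothetical operands):
   for (plus (reg pseudo-A) (mult (reg pseudo-B) (const_int 4))) both the
   base and the index are pseudo registers, so the cost is 1 + 1 + 1 = 3;
   on TARGET_K6 the same [base + 4*index] form with no displacement also
   hits the ModR/M 00_xxx_100b penalty and costs 13.  */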
15269 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15270 this is used to form addresses to local data when -fPIC is in
15271 use. */
15273 static bool
15274 darwin_local_data_pic (rtx disp)
15276 return (GET_CODE (disp) == UNSPEC
15277 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15280 /* True if operand X should be loaded from GOT. */
15282 bool
15283 ix86_force_load_from_GOT_p (rtx x)
15285 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15286 && !TARGET_PECOFF && !TARGET_MACHO
15287 && !flag_plt && !flag_pic
15288 && ix86_cmodel != CM_LARGE
15289 && GET_CODE (x) == SYMBOL_REF
15290 && SYMBOL_REF_FUNCTION_P (x)
15291 && !SYMBOL_REF_LOCAL_P (x));
15294 /* Determine if a given RTX is a valid constant. We already know this
15295 satisfies CONSTANT_P. */
15297 static bool
15298 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15300 /* Pointer bounds constants are not valid. */
15301 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15302 return false;
15304 switch (GET_CODE (x))
15306 case CONST:
15307 x = XEXP (x, 0);
15309 if (GET_CODE (x) == PLUS)
15311 if (!CONST_INT_P (XEXP (x, 1)))
15312 return false;
15313 x = XEXP (x, 0);
15316 if (TARGET_MACHO && darwin_local_data_pic (x))
15317 return true;
15319 /* Only some unspecs are valid as "constants". */
15320 if (GET_CODE (x) == UNSPEC)
15321 switch (XINT (x, 1))
15323 case UNSPEC_GOT:
15324 case UNSPEC_GOTOFF:
15325 case UNSPEC_PLTOFF:
15326 return TARGET_64BIT;
15327 case UNSPEC_TPOFF:
15328 case UNSPEC_NTPOFF:
15329 x = XVECEXP (x, 0, 0);
15330 return (GET_CODE (x) == SYMBOL_REF
15331 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15332 case UNSPEC_DTPOFF:
15333 x = XVECEXP (x, 0, 0);
15334 return (GET_CODE (x) == SYMBOL_REF
15335 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15336 default:
15337 return false;
15340 /* We must have drilled down to a symbol. */
15341 if (GET_CODE (x) == LABEL_REF)
15342 return true;
15343 if (GET_CODE (x) != SYMBOL_REF)
15344 return false;
15345 /* FALLTHRU */
15347 case SYMBOL_REF:
15348 /* TLS symbols are never valid. */
15349 if (SYMBOL_REF_TLS_MODEL (x))
15350 return false;
15352 /* DLLIMPORT symbols are never valid. */
15353 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15354 && SYMBOL_REF_DLLIMPORT_P (x))
15355 return false;
15357 #if TARGET_MACHO
15358 /* mdynamic-no-pic */
15359 if (MACHO_DYNAMIC_NO_PIC_P)
15360 return machopic_symbol_defined_p (x);
15361 #endif
15363 /* An external function address should be loaded
15364 via the GOT slot to avoid the PLT. */
15365 if (ix86_force_load_from_GOT_p (x))
15366 return false;
15368 break;
15370 CASE_CONST_SCALAR_INT:
15371 switch (mode)
15373 case TImode:
15374 if (TARGET_64BIT)
15375 return true;
15376 /* FALLTHRU */
15377 case OImode:
15378 case XImode:
15379 if (!standard_sse_constant_p (x, mode))
15380 return false;
15381 default:
15382 break;
15384 break;
15386 case CONST_VECTOR:
15387 if (!standard_sse_constant_p (x, mode))
15388 return false;
15390 default:
15391 break;
15394 /* Otherwise we handle everything else in the move patterns. */
15395 return true;
15398 /* Determine if it's legal to put X into the constant pool. This
15399 is not possible for the address of thread-local symbols, which
15400 is checked above. */
15402 static bool
15403 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15405 /* We can put any immediate constant in memory. */
15406 switch (GET_CODE (x))
15408 CASE_CONST_ANY:
15409 return false;
15411 default:
15412 break;
15415 return !ix86_legitimate_constant_p (mode, x);
15418 /* Return true if the symbol is marked as dllimport or as a stub-variable,
15419 false otherwise. */
15421 static bool
15422 is_imported_p (rtx x)
15424 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15425 || GET_CODE (x) != SYMBOL_REF)
15426 return false;
15428 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15432 /* Nonzero if the constant value X is a legitimate general operand
15433 when generating PIC code. It is given that flag_pic is on and
15434 that X satisfies CONSTANT_P. */
15436 bool
15437 legitimate_pic_operand_p (rtx x)
15439 rtx inner;
15441 switch (GET_CODE (x))
15443 case CONST:
15444 inner = XEXP (x, 0);
15445 if (GET_CODE (inner) == PLUS
15446 && CONST_INT_P (XEXP (inner, 1)))
15447 inner = XEXP (inner, 0);
15449 /* Only some unspecs are valid as "constants". */
15450 if (GET_CODE (inner) == UNSPEC)
15451 switch (XINT (inner, 1))
15453 case UNSPEC_GOT:
15454 case UNSPEC_GOTOFF:
15455 case UNSPEC_PLTOFF:
15456 return TARGET_64BIT;
15457 case UNSPEC_TPOFF:
15458 x = XVECEXP (inner, 0, 0);
15459 return (GET_CODE (x) == SYMBOL_REF
15460 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15461 case UNSPEC_MACHOPIC_OFFSET:
15462 return legitimate_pic_address_disp_p (x);
15463 default:
15464 return false;
15466 /* FALLTHRU */
15468 case SYMBOL_REF:
15469 case LABEL_REF:
15470 return legitimate_pic_address_disp_p (x);
15472 default:
15473 return true;
15477 /* Determine if a given CONST RTX is a valid memory displacement
15478 in PIC mode. */
15480 bool
15481 legitimate_pic_address_disp_p (rtx disp)
15483 bool saw_plus;
15485 /* In 64bit mode we can allow direct addresses of symbols and labels
15486 when they are not dynamic symbols. */
15487 if (TARGET_64BIT)
15489 rtx op0 = disp, op1;
15491 switch (GET_CODE (disp))
15493 case LABEL_REF:
15494 return true;
15496 case CONST:
15497 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15498 break;
15499 op0 = XEXP (XEXP (disp, 0), 0);
15500 op1 = XEXP (XEXP (disp, 0), 1);
15501 if (!CONST_INT_P (op1)
15502 || INTVAL (op1) >= 16*1024*1024
15503 || INTVAL (op1) < -16*1024*1024)
15504 break;
15505 if (GET_CODE (op0) == LABEL_REF)
15506 return true;
15507 if (GET_CODE (op0) == CONST
15508 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15509 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15510 return true;
15511 if (GET_CODE (op0) == UNSPEC
15512 && XINT (op0, 1) == UNSPEC_PCREL)
15513 return true;
15514 if (GET_CODE (op0) != SYMBOL_REF)
15515 break;
15516 /* FALLTHRU */
15518 case SYMBOL_REF:
15519 /* TLS references should always be enclosed in UNSPEC.
15520 A dllimported symbol always needs to be resolved. */
15521 if (SYMBOL_REF_TLS_MODEL (op0)
15522 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15523 return false;
15525 if (TARGET_PECOFF)
15527 if (is_imported_p (op0))
15528 return true;
15530 if (SYMBOL_REF_FAR_ADDR_P (op0)
15531 || !SYMBOL_REF_LOCAL_P (op0))
15532 break;
15534 /* Function symbols need to be resolved only for
15535 the large model.
15536 For the small model we don't need to resolve anything
15537 here. */
15538 if ((ix86_cmodel != CM_LARGE_PIC
15539 && SYMBOL_REF_FUNCTION_P (op0))
15540 || ix86_cmodel == CM_SMALL_PIC)
15541 return true;
15542 /* Non-external symbols don't need to be resolved for
15543 the large and medium models. */
15544 if ((ix86_cmodel == CM_LARGE_PIC
15545 || ix86_cmodel == CM_MEDIUM_PIC)
15546 && !SYMBOL_REF_EXTERNAL_P (op0))
15547 return true;
15549 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15550 && (SYMBOL_REF_LOCAL_P (op0)
15551 || (HAVE_LD_PIE_COPYRELOC
15552 && flag_pie
15553 && !SYMBOL_REF_WEAK (op0)
15554 && !SYMBOL_REF_FUNCTION_P (op0)))
15555 && ix86_cmodel != CM_LARGE_PIC)
15556 return true;
15557 break;
15559 default:
15560 break;
15563 if (GET_CODE (disp) != CONST)
15564 return false;
15565 disp = XEXP (disp, 0);
15567 if (TARGET_64BIT)
15569 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
15570 of GOT tables. We should not need these anyway. */
15571 if (GET_CODE (disp) != UNSPEC
15572 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15573 && XINT (disp, 1) != UNSPEC_GOTOFF
15574 && XINT (disp, 1) != UNSPEC_PCREL
15575 && XINT (disp, 1) != UNSPEC_PLTOFF))
15576 return false;
15578 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15579 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15580 return false;
15581 return true;
15584 saw_plus = false;
15585 if (GET_CODE (disp) == PLUS)
15587 if (!CONST_INT_P (XEXP (disp, 1)))
15588 return false;
15589 disp = XEXP (disp, 0);
15590 saw_plus = true;
15593 if (TARGET_MACHO && darwin_local_data_pic (disp))
15594 return true;
15596 if (GET_CODE (disp) != UNSPEC)
15597 return false;
15599 switch (XINT (disp, 1))
15601 case UNSPEC_GOT:
15602 if (saw_plus)
15603 return false;
15604 /* We need to check for both symbols and labels because VxWorks loads
15605 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15606 details. */
15607 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15608 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15609 case UNSPEC_GOTOFF:
15610 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15611 While the ABI also specifies a 32bit relocation, we don't produce it in
15612 the small PIC model at all. */
15613 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15614 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15615 && !TARGET_64BIT)
15616 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15617 return false;
15618 case UNSPEC_GOTTPOFF:
15619 case UNSPEC_GOTNTPOFF:
15620 case UNSPEC_INDNTPOFF:
15621 if (saw_plus)
15622 return false;
15623 disp = XVECEXP (disp, 0, 0);
15624 return (GET_CODE (disp) == SYMBOL_REF
15625 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15626 case UNSPEC_NTPOFF:
15627 disp = XVECEXP (disp, 0, 0);
15628 return (GET_CODE (disp) == SYMBOL_REF
15629 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15630 case UNSPEC_DTPOFF:
15631 disp = XVECEXP (disp, 0, 0);
15632 return (GET_CODE (disp) == SYMBOL_REF
15633 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15636 return false;
15639 /* Determine if op is a suitable RTX for an address register.
15640 Return the naked register if a register or a register subreg is
15641 found, otherwise return NULL_RTX. */
15643 static rtx
15644 ix86_validate_address_register (rtx op)
15646 machine_mode mode = GET_MODE (op);
15648 /* Only SImode or DImode registers can form the address. */
15649 if (mode != SImode && mode != DImode)
15650 return NULL_RTX;
15652 if (REG_P (op))
15653 return op;
15654 else if (SUBREG_P (op))
15656 rtx reg = SUBREG_REG (op);
15658 if (!REG_P (reg))
15659 return NULL_RTX;
15661 mode = GET_MODE (reg);
15663 /* Don't allow SUBREGs that span more than a word. It can
15664 lead to spill failures when the register is one word out
15665 of a two word structure. */
15666 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15667 return NULL_RTX;
15669 /* Allow only SUBREGs of non-eliminable hard registers. */
15670 if (register_no_elim_operand (reg, mode))
15671 return reg;
15674 /* Op is not a register. */
15675 return NULL_RTX;
15678 /* Recognizes RTL expressions that are valid memory addresses for an
15679 instruction. The MODE argument is the machine mode for the MEM
15680 expression that wants to use this address.
15682 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15683 convert common non-canonical forms to canonical form so that they will
15684 be recognized. */
15686 static bool
15687 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15689 struct ix86_address parts;
15690 rtx base, index, disp;
15691 HOST_WIDE_INT scale;
15692 addr_space_t seg;
15694 if (ix86_decompose_address (addr, &parts) <= 0)
15695 /* Decomposition failed. */
15696 return false;
15698 base = parts.base;
15699 index = parts.index;
15700 disp = parts.disp;
15701 scale = parts.scale;
15702 seg = parts.seg;
15704 /* Validate base register. */
15705 if (base)
15707 rtx reg = ix86_validate_address_register (base);
15709 if (reg == NULL_RTX)
15710 return false;
15712 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15713 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15714 /* Base is not valid. */
15715 return false;
15718 /* Validate index register. */
15719 if (index)
15721 rtx reg = ix86_validate_address_register (index);
15723 if (reg == NULL_RTX)
15724 return false;
15726 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15727 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15728 /* Index is not valid. */
15729 return false;
15732 /* Index and base should have the same mode. */
15733 if (base && index
15734 && GET_MODE (base) != GET_MODE (index))
15735 return false;
15737 /* Address override works only on the (%reg) part of %fs:(%reg). */
15738 if (seg != ADDR_SPACE_GENERIC
15739 && ((base && GET_MODE (base) != word_mode)
15740 || (index && GET_MODE (index) != word_mode)))
15741 return false;
15743 /* Validate scale factor. */
15744 if (scale != 1)
15746 if (!index)
15747 /* Scale without index. */
15748 return false;
15750 if (scale != 2 && scale != 4 && scale != 8)
15751 /* Scale is not a valid multiplier. */
15752 return false;
15755 /* Validate displacement. */
15756 if (disp)
15758 if (GET_CODE (disp) == CONST
15759 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15760 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15761 switch (XINT (XEXP (disp, 0), 1))
15763 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15764 when used. While the ABI also specifies 32bit relocations, we
15765 don't produce them at all and use IP-relative addressing instead.
15766 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15767 should be loaded via the GOT. */
15768 case UNSPEC_GOT:
15769 if (!TARGET_64BIT
15770 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15771 goto is_legitimate_pic;
15772 /* FALLTHRU */
15773 case UNSPEC_GOTOFF:
15774 gcc_assert (flag_pic);
15775 if (!TARGET_64BIT)
15776 goto is_legitimate_pic;
15778 /* 64bit address unspec. */
15779 return false;
15781 case UNSPEC_GOTPCREL:
15782 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15783 goto is_legitimate_pic;
15784 /* FALLTHRU */
15785 case UNSPEC_PCREL:
15786 gcc_assert (flag_pic);
15787 goto is_legitimate_pic;
15789 case UNSPEC_GOTTPOFF:
15790 case UNSPEC_GOTNTPOFF:
15791 case UNSPEC_INDNTPOFF:
15792 case UNSPEC_NTPOFF:
15793 case UNSPEC_DTPOFF:
15794 break;
15796 case UNSPEC_STACK_CHECK:
15797 gcc_assert (flag_split_stack);
15798 break;
15800 default:
15801 /* Invalid address unspec. */
15802 return false;
15805 else if (SYMBOLIC_CONST (disp)
15806 && (flag_pic
15807 || (TARGET_MACHO
15808 #if TARGET_MACHO
15809 && MACHOPIC_INDIRECT
15810 && !machopic_operand_p (disp)
15811 #endif
15815 is_legitimate_pic:
15816 if (TARGET_64BIT && (index || base))
15818 /* foo@dtpoff(%rX) is ok. */
15819 if (GET_CODE (disp) != CONST
15820 || GET_CODE (XEXP (disp, 0)) != PLUS
15821 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15822 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15823 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15824 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15825 /* Non-constant pic memory reference. */
15826 return false;
15828 else if ((!TARGET_MACHO || flag_pic)
15829 && ! legitimate_pic_address_disp_p (disp))
15830 /* Displacement is an invalid pic construct. */
15831 return false;
15832 #if TARGET_MACHO
15833 else if (MACHO_DYNAMIC_NO_PIC_P
15834 && !ix86_legitimate_constant_p (Pmode, disp))
15835 /* Displacement must be referenced via non_lazy_pointer. */
15836 return false;
15837 #endif
15839 /* This code used to verify that a symbolic pic displacement
15840 includes the pic_offset_table_rtx register.
15842 While this is a good idea, unfortunately these constructs may
15843 be created by the "adds using lea" optimization for incorrect
15844 code like:
15846 int a;
15847 int foo (int i)
15848 {
15849 return *(&a+i);
15850 }
15852 This code is nonsensical, but results in addressing the
15853 GOT table with a pic_offset_table_rtx base. We can't
15854 just refuse it easily, since it gets matched by the
15855 "addsi3" pattern, which later gets split to lea in the
15856 case the output register differs from the input. While this
15857 could be handled by a separate addsi pattern for this case
15858 that never results in lea, disabling this test seems to be the
15859 easier and correct fix for the crash. */
15861 else if (GET_CODE (disp) != LABEL_REF
15862 && !CONST_INT_P (disp)
15863 && (GET_CODE (disp) != CONST
15864 || !ix86_legitimate_constant_p (Pmode, disp))
15865 && (GET_CODE (disp) != SYMBOL_REF
15866 || !ix86_legitimate_constant_p (Pmode, disp)))
15867 /* Displacement is not constant. */
15868 return false;
15869 else if (TARGET_64BIT
15870 && !x86_64_immediate_operand (disp, VOIDmode))
15871 /* Displacement is out of range. */
15872 return false;
15873 /* In x32 mode, constant addresses are sign extended to 64bit, so
15874 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15875 else if (TARGET_X32 && !(index || base)
15876 && CONST_INT_P (disp)
15877 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15878 return false;
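/* Editor's example: the bare constant address 0x80000000 has the SImode
   sign bit set, so as a 64-bit effective address it would become
   0xffffffff80000000 rather than the intended low-2GB location; the check
   above therefore rejects it when there is no base or index.  */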
15881 /* Everything looks valid. */
15882 return true;
15885 /* Determine if a given RTX is a valid constant address. */
15887 bool
15888 constant_address_p (rtx x)
15890 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15893 /* Return a unique alias set for the GOT. */
15895 static alias_set_type
15896 ix86_GOT_alias_set (void)
15898 static alias_set_type set = -1;
15899 if (set == -1)
15900 set = new_alias_set ();
15901 return set;
15904 /* Return a legitimate reference for ORIG (an address) using the
15905 register REG. If REG is 0, a new pseudo is generated.
15907 There are two types of references that must be handled:
15909 1. Global data references must load the address from the GOT, via
15910 the PIC reg. An insn is emitted to do this load, and the reg is
15911 returned.
15913 2. Static data references, constant pool addresses, and code labels
15914 compute the address as an offset from the GOT, whose base is in
15915 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15916 differentiate them from global data objects. The returned
15917 address is the PIC reg + an unspec constant.
15919 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15920 reg also appears in the address. */
15922 static rtx
15923 legitimize_pic_address (rtx orig, rtx reg)
15925 rtx addr = orig;
15926 rtx new_rtx = orig;
15928 #if TARGET_MACHO
15929 if (TARGET_MACHO && !TARGET_64BIT)
15931 if (reg == 0)
15932 reg = gen_reg_rtx (Pmode);
15933 /* Use the generic Mach-O PIC machinery. */
15934 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15936 #endif
15938 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15940 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15941 if (tmp)
15942 return tmp;
15945 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15946 new_rtx = addr;
15947 else if ((!TARGET_64BIT
15948 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15949 && !TARGET_PECOFF
15950 && gotoff_operand (addr, Pmode))
15952 /* This symbol may be referenced via a displacement
15953 from the PIC base address (@GOTOFF). */
15954 if (GET_CODE (addr) == CONST)
15955 addr = XEXP (addr, 0);
15957 if (GET_CODE (addr) == PLUS)
15959 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15960 UNSPEC_GOTOFF);
15961 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15963 else
15964 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15966 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15968 if (TARGET_64BIT)
15969 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15971 if (reg != 0)
15973 gcc_assert (REG_P (reg));
15974 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15975 new_rtx, reg, 1, OPTAB_DIRECT);
15977 else
15978 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15980 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15981 /* We can't use @GOTOFF for text labels
15982 on VxWorks, see gotoff_operand. */
15983 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15985 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15986 if (tmp)
15987 return tmp;
15989 /* For x64 PE-COFF there is no GOT table,
15990 so we use the address directly. */
15991 if (TARGET_64BIT && TARGET_PECOFF)
15993 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15994 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15996 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15998 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15999 UNSPEC_GOTPCREL);
16000 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16001 new_rtx = gen_const_mem (Pmode, new_rtx);
16002 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16004 else
16006 /* This symbol must be referenced via a load
16007 from the Global Offset Table (@GOT). */
16008 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16009 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16010 if (TARGET_64BIT)
16011 new_rtx = force_reg (Pmode, new_rtx);
16012 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16013 new_rtx = gen_const_mem (Pmode, new_rtx);
16014 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16017 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
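/* Editor's illustration (assembly only approximate): on 32-bit PIC the
   UNSPEC_GOT path above typically ends up as
       movl  sym@GOT(%ebx), %reg
   while the 64-bit small-PIC UNSPEC_GOTPCREL path becomes
       movq  sym@GOTPCREL(%rip), %reg
   i.e. the symbol's address is loaded from its GOT slot rather than formed
   directly.  */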
16019 else
16021 if (CONST_INT_P (addr)
16022 && !x86_64_immediate_operand (addr, VOIDmode))
16023 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16024 else if (GET_CODE (addr) == CONST)
16026 addr = XEXP (addr, 0);
16028 /* We must match stuff we generate before. Assume the only
16029 unspecs that can get here are ours. Not that we could do
16030 anything with them anyway.... */
16031 if (GET_CODE (addr) == UNSPEC
16032 || (GET_CODE (addr) == PLUS
16033 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16034 return orig;
16035 gcc_assert (GET_CODE (addr) == PLUS);
16038 if (GET_CODE (addr) == PLUS)
16040 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16042 /* Check first to see if this is a constant
16043 offset from a @GOTOFF symbol reference. */
16044 if (!TARGET_PECOFF
16045 && gotoff_operand (op0, Pmode)
16046 && CONST_INT_P (op1))
16048 if (!TARGET_64BIT)
16050 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16051 UNSPEC_GOTOFF);
16052 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16053 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16055 if (reg != 0)
16057 gcc_assert (REG_P (reg));
16058 new_rtx = expand_simple_binop (Pmode, PLUS,
16059 pic_offset_table_rtx,
16060 new_rtx, reg, 1,
16061 OPTAB_DIRECT);
16063 else
16064 new_rtx
16065 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16067 else
16069 if (INTVAL (op1) < -16*1024*1024
16070 || INTVAL (op1) >= 16*1024*1024)
16072 if (!x86_64_immediate_operand (op1, Pmode))
16073 op1 = force_reg (Pmode, op1);
16075 new_rtx
16076 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16080 else
16082 rtx base = legitimize_pic_address (op0, reg);
16083 machine_mode mode = GET_MODE (base);
16084 new_rtx
16085 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16087 if (CONST_INT_P (new_rtx))
16089 if (INTVAL (new_rtx) < -16*1024*1024
16090 || INTVAL (new_rtx) >= 16*1024*1024)
16092 if (!x86_64_immediate_operand (new_rtx, mode))
16093 new_rtx = force_reg (mode, new_rtx);
16095 new_rtx
16096 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16098 else
16099 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16101 else
16103 /* For %rip addressing, we have to use
16104 just disp32, neither base nor index. */
16105 if (TARGET_64BIT
16106 && (GET_CODE (base) == SYMBOL_REF
16107 || GET_CODE (base) == LABEL_REF))
16108 base = force_reg (mode, base);
16109 if (GET_CODE (new_rtx) == PLUS
16110 && CONSTANT_P (XEXP (new_rtx, 1)))
16112 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16113 new_rtx = XEXP (new_rtx, 1);
16115 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16120 return new_rtx;
16123 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16125 static rtx
16126 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16128 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16130 if (GET_MODE (tp) != tp_mode)
16132 gcc_assert (GET_MODE (tp) == SImode);
16133 gcc_assert (tp_mode == DImode);
16135 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16138 if (to_reg)
16139 tp = copy_to_mode_reg (tp_mode, tp);
16141 return tp;
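/* Editor's note: the UNSPEC_TP built above is what later prints as a
   segment-relative access -- typically %fs:0 on 64-bit GNU/Linux and %gs:0
   on 32-bit -- so "movq %fs:0, %rax" is the usual way the thread pointer
   materializes when TO_REG is true.  */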
16144 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16146 static GTY(()) rtx ix86_tls_symbol;
16148 static rtx
16149 ix86_tls_get_addr (void)
16151 if (!ix86_tls_symbol)
16153 const char *sym
16154 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16155 ? "___tls_get_addr" : "__tls_get_addr");
16157 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16160 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16162 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16163 UNSPEC_PLTOFF);
16164 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16165 gen_rtx_CONST (Pmode, unspec));
16168 return ix86_tls_symbol;
16171 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16173 static GTY(()) rtx ix86_tls_module_base_symbol;
16176 ix86_tls_module_base (void)
16178 if (!ix86_tls_module_base_symbol)
16180 ix86_tls_module_base_symbol
16181 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16183 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16184 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16187 return ix86_tls_module_base_symbol;
16190 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16191 false if we expect this to be used for a memory address and true if
16192 we expect to load the address into a register. */
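/* Rough sketch of what each model expands to on x86-64 GNU/Linux
   (illustrative only; "x" is a placeholder TLS symbol):
     global-dynamic:  leaq x@tlsgd(%rip), %rdi ; call __tls_get_addr
     local-dynamic:   one __tls_get_addr call for the module base,
                      then x@dtpoff offsets from that base
     initial-exec:    movq x@gottpoff(%rip), %rax ; add the thread pointer %fs:0
     local-exec:      movq %fs:x@tpoff, %rax
   The 32-bit sequences are analogous but use %gs and the GOT register.  */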
16194 static rtx
16195 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16197 rtx dest, base, off;
16198 rtx pic = NULL_RTX, tp = NULL_RTX;
16199 machine_mode tp_mode = Pmode;
16200 int type;
16202 /* Fall back to the global dynamic model if the toolchain cannot
16203 support local dynamic. */
16204 if (TARGET_SUN_TLS && !TARGET_64BIT
16205 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16206 && model == TLS_MODEL_LOCAL_DYNAMIC)
16207 model = TLS_MODEL_GLOBAL_DYNAMIC;
16209 switch (model)
16211 case TLS_MODEL_GLOBAL_DYNAMIC:
16212 dest = gen_reg_rtx (Pmode);
16214 if (!TARGET_64BIT)
16216 if (flag_pic && !TARGET_PECOFF)
16217 pic = pic_offset_table_rtx;
16218 else
16220 pic = gen_reg_rtx (Pmode);
16221 emit_insn (gen_set_got (pic));
16225 if (TARGET_GNU2_TLS)
16227 if (TARGET_64BIT)
16228 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16229 else
16230 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16232 tp = get_thread_pointer (Pmode, true);
16233 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16235 if (GET_MODE (x) != Pmode)
16236 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16238 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16240 else
16242 rtx caddr = ix86_tls_get_addr ();
16244 if (TARGET_64BIT)
16246 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16247 rtx_insn *insns;
16249 start_sequence ();
16250 emit_call_insn
16251 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16252 insns = get_insns ();
16253 end_sequence ();
16255 if (GET_MODE (x) != Pmode)
16256 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16258 RTL_CONST_CALL_P (insns) = 1;
16259 emit_libcall_block (insns, dest, rax, x);
16261 else
16262 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16264 break;
16266 case TLS_MODEL_LOCAL_DYNAMIC:
16267 base = gen_reg_rtx (Pmode);
16269 if (!TARGET_64BIT)
16271 if (flag_pic)
16272 pic = pic_offset_table_rtx;
16273 else
16275 pic = gen_reg_rtx (Pmode);
16276 emit_insn (gen_set_got (pic));
16280 if (TARGET_GNU2_TLS)
16282 rtx tmp = ix86_tls_module_base ();
16284 if (TARGET_64BIT)
16285 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16286 else
16287 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16289 tp = get_thread_pointer (Pmode, true);
16290 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16291 gen_rtx_MINUS (Pmode, tmp, tp));
16293 else
16295 rtx caddr = ix86_tls_get_addr ();
16297 if (TARGET_64BIT)
16299 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16300 rtx_insn *insns;
16301 rtx eqv;
16303 start_sequence ();
16304 emit_call_insn
16305 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16306 insns = get_insns ();
16307 end_sequence ();
16309 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16310 share the LD_BASE result with other LD model accesses. */
16311 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16312 UNSPEC_TLS_LD_BASE);
16314 RTL_CONST_CALL_P (insns) = 1;
16315 emit_libcall_block (insns, base, rax, eqv);
16317 else
16318 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16321 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16322 off = gen_rtx_CONST (Pmode, off);
16324 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16326 if (TARGET_GNU2_TLS)
16328 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16330 if (GET_MODE (x) != Pmode)
16331 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16333 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16335 break;
16337 case TLS_MODEL_INITIAL_EXEC:
16338 if (TARGET_64BIT)
16340 if (TARGET_SUN_TLS && !TARGET_X32)
16342 /* The Sun linker took the AMD64 TLS spec literally
16343 and can only handle %rax as the destination of the
16344 initial-exec code sequence. */
16346 dest = gen_reg_rtx (DImode);
16347 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16348 return dest;
16351 /* Generate DImode references to avoid %fs:(%reg32)
16352 problems and the linker IE->LE relaxation bug. */
16353 tp_mode = DImode;
16354 pic = NULL;
16355 type = UNSPEC_GOTNTPOFF;
16357 else if (flag_pic)
16359 pic = pic_offset_table_rtx;
16360 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16362 else if (!TARGET_ANY_GNU_TLS)
16364 pic = gen_reg_rtx (Pmode);
16365 emit_insn (gen_set_got (pic));
16366 type = UNSPEC_GOTTPOFF;
16368 else
16370 pic = NULL;
16371 type = UNSPEC_INDNTPOFF;
16374 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16375 off = gen_rtx_CONST (tp_mode, off);
16376 if (pic)
16377 off = gen_rtx_PLUS (tp_mode, pic, off);
16378 off = gen_const_mem (tp_mode, off);
16379 set_mem_alias_set (off, ix86_GOT_alias_set ());
16381 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16383 base = get_thread_pointer (tp_mode,
16384 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16385 off = force_reg (tp_mode, off);
16386 dest = gen_rtx_PLUS (tp_mode, base, off);
16387 if (tp_mode != Pmode)
16388 dest = convert_to_mode (Pmode, dest, 1);
16390 else
16392 base = get_thread_pointer (Pmode, true);
16393 dest = gen_reg_rtx (Pmode);
16394 emit_insn (ix86_gen_sub3 (dest, base, off));
16396 break;
16398 case TLS_MODEL_LOCAL_EXEC:
16399 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16400 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16401 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16402 off = gen_rtx_CONST (Pmode, off);
16404 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16406 base = get_thread_pointer (Pmode,
16407 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16408 return gen_rtx_PLUS (Pmode, base, off);
16410 else
16412 base = get_thread_pointer (Pmode, true);
16413 dest = gen_reg_rtx (Pmode);
16414 emit_insn (ix86_gen_sub3 (dest, base, off));
16416 break;
16418 default:
16419 gcc_unreachable ();
16422 return dest;
16425 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16426 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16427 unique refptr-DECL symbol corresponding to symbol DECL. */
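/* Illustrative example (placeholder symbol "foo"): a dllimported reference to
   foo is redirected through the import slot __imp_foo (or __imp__foo with a
   decorated 32-bit name), i.e. the generated code first loads the real address
   of foo from that slot and then dereferences it.  */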
16429 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16431 static inline hashval_t hash (tree_map *m) { return m->hash; }
16432 static inline bool
16433 equal (tree_map *a, tree_map *b)
16435 return a->base.from == b->base.from;
16438 static int
16439 keep_cache_entry (tree_map *&m)
16441 return ggc_marked_p (m->base.from);
16445 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16447 static tree
16448 get_dllimport_decl (tree decl, bool beimport)
16450 struct tree_map *h, in;
16451 const char *name;
16452 const char *prefix;
16453 size_t namelen, prefixlen;
16454 char *imp_name;
16455 tree to;
16456 rtx rtl;
16458 if (!dllimport_map)
16459 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16461 in.hash = htab_hash_pointer (decl);
16462 in.base.from = decl;
16463 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16464 h = *loc;
16465 if (h)
16466 return h->to;
16468 *loc = h = ggc_alloc<tree_map> ();
16469 h->hash = in.hash;
16470 h->base.from = decl;
16471 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16472 VAR_DECL, NULL, ptr_type_node);
16473 DECL_ARTIFICIAL (to) = 1;
16474 DECL_IGNORED_P (to) = 1;
16475 DECL_EXTERNAL (to) = 1;
16476 TREE_READONLY (to) = 1;
16478 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16479 name = targetm.strip_name_encoding (name);
16480 if (beimport)
16481 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16482 ? "*__imp_" : "*__imp__";
16483 else
16484 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16485 namelen = strlen (name);
16486 prefixlen = strlen (prefix);
16487 imp_name = (char *) alloca (namelen + prefixlen + 1);
16488 memcpy (imp_name, prefix, prefixlen);
16489 memcpy (imp_name + prefixlen, name, namelen + 1);
16491 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16492 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16493 SET_SYMBOL_REF_DECL (rtl, to);
16494 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16495 if (!beimport)
16497 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16498 #ifdef SUB_TARGET_RECORD_STUB
16499 SUB_TARGET_RECORD_STUB (name);
16500 #endif
16503 rtl = gen_const_mem (Pmode, rtl);
16504 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16506 SET_DECL_RTL (to, rtl);
16507 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16509 return to;
16512 /* Expand SYMBOL into its corresponding far-addressed symbol.
16513 WANT_REG is true if we require the result be a register. */
16515 static rtx
16516 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16518 tree imp_decl;
16519 rtx x;
16521 gcc_assert (SYMBOL_REF_DECL (symbol));
16522 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16524 x = DECL_RTL (imp_decl);
16525 if (want_reg)
16526 x = force_reg (Pmode, x);
16527 return x;
16530 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16531 true if we require the result be a register. */
16533 static rtx
16534 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16536 tree imp_decl;
16537 rtx x;
16539 gcc_assert (SYMBOL_REF_DECL (symbol));
16540 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16542 x = DECL_RTL (imp_decl);
16543 if (want_reg)
16544 x = force_reg (Pmode, x);
16545 return x;
16548 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. INREG
16549 is true if we require the result be a register. */
16551 static rtx
16552 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16554 if (!TARGET_PECOFF)
16555 return NULL_RTX;
16557 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16559 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16560 return legitimize_dllimport_symbol (addr, inreg);
16561 if (GET_CODE (addr) == CONST
16562 && GET_CODE (XEXP (addr, 0)) == PLUS
16563 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16564 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16566 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16567 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16571 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16572 return NULL_RTX;
16573 if (GET_CODE (addr) == SYMBOL_REF
16574 && !is_imported_p (addr)
16575 && SYMBOL_REF_EXTERNAL_P (addr)
16576 && SYMBOL_REF_DECL (addr))
16577 return legitimize_pe_coff_extern_decl (addr, inreg);
16579 if (GET_CODE (addr) == CONST
16580 && GET_CODE (XEXP (addr, 0)) == PLUS
16581 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16582 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16583 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16584 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16586 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16587 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16589 return NULL_RTX;
16592 /* Try machine-dependent ways of modifying an illegitimate address
16593 to be legitimate. If we find one, return the new, valid address.
16594 This macro is used in only one place: `memory_address' in explow.c.
16596 OLDX is the address as it was before break_out_memory_refs was called.
16597 In some cases it is useful to look at this to decide what needs to be done.
16599 It is always safe for this macro to do nothing. It exists to recognize
16600 opportunities to optimize the output.
16602 For the 80386, we handle X+REG by loading X into a register R and
16603 using R+REG. R will go in a general reg and indexing will be used.
16604 However, if REG is a broken-out memory address or multiplication,
16605 nothing needs to be done because REG can certainly go in a general reg.
16607 When -fpic is used, special handling is needed for symbolic references.
16608 See comments by legitimize_pic_address in i386.c for details. */
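/* Illustrative example: an address such as
     (plus (ashift (reg A) (const_int 2)) (reg B))
   is canonicalized below into
     (plus (mult (reg A) (const_int 4)) (reg B))
   which fits the base + index*scale addressing mode directly.  */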
16610 static rtx
16611 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16613 bool changed = false;
16614 unsigned log;
16616 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16617 if (log)
16618 return legitimize_tls_address (x, (enum tls_model) log, false);
16619 if (GET_CODE (x) == CONST
16620 && GET_CODE (XEXP (x, 0)) == PLUS
16621 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16622 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16624 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16625 (enum tls_model) log, false);
16626 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16629 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16631 rtx tmp = legitimize_pe_coff_symbol (x, true);
16632 if (tmp)
16633 return tmp;
16636 if (flag_pic && SYMBOLIC_CONST (x))
16637 return legitimize_pic_address (x, 0);
16639 #if TARGET_MACHO
16640 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16641 return machopic_indirect_data_reference (x, 0);
16642 #endif
16644 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
16645 if (GET_CODE (x) == ASHIFT
16646 && CONST_INT_P (XEXP (x, 1))
16647 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16649 changed = true;
16650 log = INTVAL (XEXP (x, 1));
16651 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16652 GEN_INT (1 << log));
16655 if (GET_CODE (x) == PLUS)
16657 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16659 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16660 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16661 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16663 changed = true;
16664 log = INTVAL (XEXP (XEXP (x, 0), 1));
16665 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16666 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16667 GEN_INT (1 << log));
16670 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16671 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16672 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16674 changed = true;
16675 log = INTVAL (XEXP (XEXP (x, 1), 1));
16676 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16677 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16678 GEN_INT (1 << log));
16681 /* Put multiply first if it isn't already. */
16682 if (GET_CODE (XEXP (x, 1)) == MULT)
16684 std::swap (XEXP (x, 0), XEXP (x, 1));
16685 changed = true;
16688 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16689 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16690 created by virtual register instantiation, register elimination, and
16691 similar optimizations. */
16692 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16694 changed = true;
16695 x = gen_rtx_PLUS (Pmode,
16696 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16697 XEXP (XEXP (x, 1), 0)),
16698 XEXP (XEXP (x, 1), 1));
16701 /* Canonicalize
16702 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16703 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16704 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16705 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16706 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16707 && CONSTANT_P (XEXP (x, 1)))
16709 rtx constant;
16710 rtx other = NULL_RTX;
16712 if (CONST_INT_P (XEXP (x, 1)))
16714 constant = XEXP (x, 1);
16715 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16717 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16719 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16720 other = XEXP (x, 1);
16722 else
16723 constant = 0;
16725 if (constant)
16727 changed = true;
16728 x = gen_rtx_PLUS (Pmode,
16729 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16730 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16731 plus_constant (Pmode, other,
16732 INTVAL (constant)));
16736 if (changed && ix86_legitimate_address_p (mode, x, false))
16737 return x;
16739 if (GET_CODE (XEXP (x, 0)) == MULT)
16741 changed = true;
16742 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16745 if (GET_CODE (XEXP (x, 1)) == MULT)
16747 changed = true;
16748 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16751 if (changed
16752 && REG_P (XEXP (x, 1))
16753 && REG_P (XEXP (x, 0)))
16754 return x;
16756 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16758 changed = true;
16759 x = legitimize_pic_address (x, 0);
16762 if (changed && ix86_legitimate_address_p (mode, x, false))
16763 return x;
16765 if (REG_P (XEXP (x, 0)))
16767 rtx temp = gen_reg_rtx (Pmode);
16768 rtx val = force_operand (XEXP (x, 1), temp);
16769 if (val != temp)
16771 val = convert_to_mode (Pmode, val, 1);
16772 emit_move_insn (temp, val);
16775 XEXP (x, 1) = temp;
16776 return x;
16779 else if (REG_P (XEXP (x, 1)))
16781 rtx temp = gen_reg_rtx (Pmode);
16782 rtx val = force_operand (XEXP (x, 0), temp);
16783 if (val != temp)
16785 val = convert_to_mode (Pmode, val, 1);
16786 emit_move_insn (temp, val);
16789 XEXP (x, 0) = temp;
16790 return x;
16794 return x;
16797 /* Print an integer constant expression in assembler syntax. Addition
16798 and subtraction are the only arithmetic that may appear in these
16799 expressions. FILE is the stdio stream to write to, X is the rtx, and
16800 CODE is the operand print code from the output string. */
16802 static void
16803 output_pic_addr_const (FILE *file, rtx x, int code)
16805 char buf[256];
16807 switch (GET_CODE (x))
16809 case PC:
16810 gcc_assert (flag_pic);
16811 putc ('.', file);
16812 break;
16814 case SYMBOL_REF:
16815 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16816 output_addr_const (file, x);
16817 else
16819 const char *name = XSTR (x, 0);
16821 /* Mark the decl as referenced so that cgraph will
16822 output the function. */
16823 if (SYMBOL_REF_DECL (x))
16824 mark_decl_referenced (SYMBOL_REF_DECL (x));
16826 #if TARGET_MACHO
16827 if (MACHOPIC_INDIRECT
16828 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16829 name = machopic_indirection_name (x, /*stub_p=*/true);
16830 #endif
16831 assemble_name (file, name);
16833 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16834 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16835 fputs ("@PLT", file);
16836 break;
16838 case LABEL_REF:
16839 x = XEXP (x, 0);
16840 /* FALLTHRU */
16841 case CODE_LABEL:
16842 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16843 assemble_name (asm_out_file, buf);
16844 break;
16846 case CONST_INT:
16847 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16848 break;
16850 case CONST:
16851 /* This used to output parentheses around the expression,
16852 but that does not work on the 386 (either ATT or BSD assembler). */
16853 output_pic_addr_const (file, XEXP (x, 0), code);
16854 break;
16856 case CONST_DOUBLE:
16857 /* We can't handle floating point constants;
16858 TARGET_PRINT_OPERAND must handle them. */
16859 output_operand_lossage ("floating constant misused");
16860 break;
16862 case PLUS:
16863 /* Some assemblers need integer constants to appear first. */
16864 if (CONST_INT_P (XEXP (x, 0)))
16866 output_pic_addr_const (file, XEXP (x, 0), code);
16867 putc ('+', file);
16868 output_pic_addr_const (file, XEXP (x, 1), code);
16870 else
16872 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16873 output_pic_addr_const (file, XEXP (x, 1), code);
16874 putc ('+', file);
16875 output_pic_addr_const (file, XEXP (x, 0), code);
16877 break;
16879 case MINUS:
16880 if (!TARGET_MACHO)
16881 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16882 output_pic_addr_const (file, XEXP (x, 0), code);
16883 putc ('-', file);
16884 output_pic_addr_const (file, XEXP (x, 1), code);
16885 if (!TARGET_MACHO)
16886 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16887 break;
16889 case UNSPEC:
16890 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
16892 bool f = i386_asm_output_addr_const_extra (file, x);
16893 gcc_assert (f);
16894 break;
16897 gcc_assert (XVECLEN (x, 0) == 1);
16898 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16899 switch (XINT (x, 1))
16901 case UNSPEC_GOT:
16902 fputs ("@GOT", file);
16903 break;
16904 case UNSPEC_GOTOFF:
16905 fputs ("@GOTOFF", file);
16906 break;
16907 case UNSPEC_PLTOFF:
16908 fputs ("@PLTOFF", file);
16909 break;
16910 case UNSPEC_PCREL:
16911 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16912 "(%rip)" : "[rip]", file);
16913 break;
16914 case UNSPEC_GOTPCREL:
16915 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16916 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16917 break;
16918 case UNSPEC_GOTTPOFF:
16919 /* FIXME: This might be @TPOFF in Sun ld too. */
16920 fputs ("@gottpoff", file);
16921 break;
16922 case UNSPEC_TPOFF:
16923 fputs ("@tpoff", file);
16924 break;
16925 case UNSPEC_NTPOFF:
16926 if (TARGET_64BIT)
16927 fputs ("@tpoff", file);
16928 else
16929 fputs ("@ntpoff", file);
16930 break;
16931 case UNSPEC_DTPOFF:
16932 fputs ("@dtpoff", file);
16933 break;
16934 case UNSPEC_GOTNTPOFF:
16935 if (TARGET_64BIT)
16936 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16937 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16938 else
16939 fputs ("@gotntpoff", file);
16940 break;
16941 case UNSPEC_INDNTPOFF:
16942 fputs ("@indntpoff", file);
16943 break;
16944 #if TARGET_MACHO
16945 case UNSPEC_MACHOPIC_OFFSET:
16946 putc ('-', file);
16947 machopic_output_function_base_name (file);
16948 break;
16949 #endif
16950 default:
16951 output_operand_lossage ("invalid UNSPEC as operand");
16952 break;
16954 break;
16956 default:
16957 output_operand_lossage ("invalid expression as operand");
16961 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16962 We need to emit DTP-relative relocations. */
16964 static void ATTRIBUTE_UNUSED
16965 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16967 fputs (ASM_LONG, file);
16968 output_addr_const (file, x);
16969 fputs ("@dtpoff", file);
16970 switch (size)
16972 case 4:
16973 break;
16974 case 8:
16975 fputs (", 0", file);
16976 break;
16977 default:
16978 gcc_unreachable ();
16982 /* Return true if X is a representation of the PIC register. This copes
16983 with calls from ix86_find_base_term, where the register might have
16984 been replaced by a cselib value. */
16986 static bool
16987 ix86_pic_register_p (rtx x)
16989 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16990 return (pic_offset_table_rtx
16991 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16992 else if (!REG_P (x))
16993 return false;
16994 else if (pic_offset_table_rtx)
16996 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16997 return true;
16998 if (HARD_REGISTER_P (x)
16999 && !HARD_REGISTER_P (pic_offset_table_rtx)
17000 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17001 return true;
17002 return false;
17004 else
17005 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17008 /* Helper function for ix86_delegitimize_address.
17009 Attempt to delegitimize TLS local-exec accesses. */
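/* Illustrative example (placeholder symbol "x"): a local-exec access whose
   address is  (plus (reg) (const (unspec [x] UNSPEC_NTPOFF)))  with the TLS
   segment override (printed as x@tpoff or x@ntpoff elsewhere in this file)
   is mapped back to the plain SYMBOL_REF x, re-adding any base, index and
   scale from the decomposed address.  */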
17011 static rtx
17012 ix86_delegitimize_tls_address (rtx orig_x)
17014 rtx x = orig_x, unspec;
17015 struct ix86_address addr;
17017 if (!TARGET_TLS_DIRECT_SEG_REFS)
17018 return orig_x;
17019 if (MEM_P (x))
17020 x = XEXP (x, 0);
17021 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17022 return orig_x;
17023 if (ix86_decompose_address (x, &addr) == 0
17024 || addr.seg != DEFAULT_TLS_SEG_REG
17025 || addr.disp == NULL_RTX
17026 || GET_CODE (addr.disp) != CONST)
17027 return orig_x;
17028 unspec = XEXP (addr.disp, 0);
17029 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17030 unspec = XEXP (unspec, 0);
17031 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17032 return orig_x;
17033 x = XVECEXP (unspec, 0, 0);
17034 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17035 if (unspec != XEXP (addr.disp, 0))
17036 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17037 if (addr.index)
17039 rtx idx = addr.index;
17040 if (addr.scale != 1)
17041 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17042 x = gen_rtx_PLUS (Pmode, idx, x);
17044 if (addr.base)
17045 x = gen_rtx_PLUS (Pmode, addr.base, x);
17046 if (MEM_P (orig_x))
17047 x = replace_equiv_address_nv (orig_x, x);
17048 return x;
17051 /* In the name of slightly smaller debug output, and to cater to
17052 general assembler lossage, recognize PIC+GOTOFF and turn it back
17053 into a direct symbol reference.
17055 On Darwin, this is necessary to avoid a crash, because Darwin
17056 has a different PIC label for each routine but the DWARF debugging
17057 information is not associated with any particular routine, so it's
17058 necessary to remove references to the PIC label from RTL stored by
17059 the DWARF output code. */
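/* Illustrative example (placeholder symbol "foo"): an address like
     (plus (reg) (const (unspec [foo] UNSPEC_GOTOFF)))
   where the register is the PIC register is turned back into the plain
   SYMBOL_REF foo (plus any constant addend), which is all the debug
   information needs to see.  */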
17061 static rtx
17062 ix86_delegitimize_address (rtx x)
17064 rtx orig_x = delegitimize_mem_from_attrs (x);
17065 /* addend is NULL or some rtx if x is something+GOTOFF where
17066 something doesn't include the PIC register. */
17067 rtx addend = NULL_RTX;
17068 /* reg_addend is NULL or a multiple of some register. */
17069 rtx reg_addend = NULL_RTX;
17070 /* const_addend is NULL or a const_int. */
17071 rtx const_addend = NULL_RTX;
17072 /* This is the result, or NULL. */
17073 rtx result = NULL_RTX;
17075 x = orig_x;
17077 if (MEM_P (x))
17078 x = XEXP (x, 0);
17080 if (TARGET_64BIT)
17082 if (GET_CODE (x) == CONST
17083 && GET_CODE (XEXP (x, 0)) == PLUS
17084 && GET_MODE (XEXP (x, 0)) == Pmode
17085 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17086 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17087 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17089 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17090 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17091 if (MEM_P (orig_x))
17092 x = replace_equiv_address_nv (orig_x, x);
17093 return x;
17096 if (GET_CODE (x) == CONST
17097 && GET_CODE (XEXP (x, 0)) == UNSPEC
17098 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17099 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17100 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17102 x = XVECEXP (XEXP (x, 0), 0, 0);
17103 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17105 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17106 if (x == NULL_RTX)
17107 return orig_x;
17109 return x;
17112 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17113 return ix86_delegitimize_tls_address (orig_x);
17115 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17116 and -mcmodel=medium -fpic. */
17119 if (GET_CODE (x) != PLUS
17120 || GET_CODE (XEXP (x, 1)) != CONST)
17121 return ix86_delegitimize_tls_address (orig_x);
17123 if (ix86_pic_register_p (XEXP (x, 0)))
17124 /* %ebx + GOT/GOTOFF */
17126 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17128 /* %ebx + %reg * scale + GOT/GOTOFF */
17129 reg_addend = XEXP (x, 0);
17130 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17131 reg_addend = XEXP (reg_addend, 1);
17132 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17133 reg_addend = XEXP (reg_addend, 0);
17134 else
17136 reg_addend = NULL_RTX;
17137 addend = XEXP (x, 0);
17140 else
17141 addend = XEXP (x, 0);
17143 x = XEXP (XEXP (x, 1), 0);
17144 if (GET_CODE (x) == PLUS
17145 && CONST_INT_P (XEXP (x, 1)))
17147 const_addend = XEXP (x, 1);
17148 x = XEXP (x, 0);
17151 if (GET_CODE (x) == UNSPEC
17152 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17153 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17154 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17155 && !MEM_P (orig_x) && !addend)))
17156 result = XVECEXP (x, 0, 0);
17158 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17159 && !MEM_P (orig_x))
17160 result = XVECEXP (x, 0, 0);
17162 if (! result)
17163 return ix86_delegitimize_tls_address (orig_x);
17165 if (const_addend)
17166 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17167 if (reg_addend)
17168 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17169 if (addend)
17171 /* If the rest of original X doesn't involve the PIC register, add
17172 addend and subtract pic_offset_table_rtx. This can happen e.g.
17173 for code like:
17174 leal (%ebx, %ecx, 4), %ecx
17176 movl foo@GOTOFF(%ecx), %edx
17177 in which case we return (%ecx - %ebx) + foo
17178 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17179 and reload has completed. */
17180 if (pic_offset_table_rtx
17181 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17182 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17183 pic_offset_table_rtx),
17184 result);
17185 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
17187 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17188 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17189 result = gen_rtx_PLUS (Pmode, tmp, result);
17191 else
17192 return orig_x;
17194 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17196 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17197 if (result == NULL_RTX)
17198 return orig_x;
17200 return result;
17203 /* If X is a machine specific address (i.e. a symbol or label being
17204 referenced as a displacement from the GOT implemented using an
17205 UNSPEC), then return the base term. Otherwise return X. */
17208 ix86_find_base_term (rtx x)
17210 rtx term;
17212 if (TARGET_64BIT)
17214 if (GET_CODE (x) != CONST)
17215 return x;
17216 term = XEXP (x, 0);
17217 if (GET_CODE (term) == PLUS
17218 && CONST_INT_P (XEXP (term, 1)))
17219 term = XEXP (term, 0);
17220 if (GET_CODE (term) != UNSPEC
17221 || (XINT (term, 1) != UNSPEC_GOTPCREL
17222 && XINT (term, 1) != UNSPEC_PCREL))
17223 return x;
17225 return XVECEXP (term, 0, 0);
17228 return ix86_delegitimize_address (x);
17231 static void
17232 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17233 bool fp, FILE *file)
17235 const char *suffix;
17237 if (mode == CCFPmode || mode == CCFPUmode)
17239 code = ix86_fp_compare_code_to_integer (code);
17240 mode = CCmode;
17242 if (reverse)
17243 code = reverse_condition (code);
17245 switch (code)
17247 case EQ:
17248 switch (mode)
17250 case CCAmode:
17251 suffix = "a";
17252 break;
17253 case CCCmode:
17254 suffix = "c";
17255 break;
17256 case CCOmode:
17257 suffix = "o";
17258 break;
17259 case CCPmode:
17260 suffix = "p";
17261 break;
17262 case CCSmode:
17263 suffix = "s";
17264 break;
17265 default:
17266 suffix = "e";
17267 break;
17269 break;
17270 case NE:
17271 switch (mode)
17273 case CCAmode:
17274 suffix = "na";
17275 break;
17276 case CCCmode:
17277 suffix = "nc";
17278 break;
17279 case CCOmode:
17280 suffix = "no";
17281 break;
17282 case CCPmode:
17283 suffix = "np";
17284 break;
17285 case CCSmode:
17286 suffix = "ns";
17287 break;
17288 default:
17289 suffix = "ne";
17290 break;
17292 break;
17293 case GT:
17294 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17295 suffix = "g";
17296 break;
17297 case GTU:
17298 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17299 Those same assemblers have the same but opposite lossage on cmov. */
17300 if (mode == CCmode)
17301 suffix = fp ? "nbe" : "a";
17302 else
17303 gcc_unreachable ();
17304 break;
17305 case LT:
17306 switch (mode)
17308 case CCNOmode:
17309 case CCGOCmode:
17310 suffix = "s";
17311 break;
17313 case CCmode:
17314 case CCGCmode:
17315 suffix = "l";
17316 break;
17318 default:
17319 gcc_unreachable ();
17321 break;
17322 case LTU:
17323 if (mode == CCmode)
17324 suffix = "b";
17325 else if (mode == CCCmode)
17326 suffix = fp ? "b" : "c";
17327 else
17328 gcc_unreachable ();
17329 break;
17330 case GE:
17331 switch (mode)
17333 case CCNOmode:
17334 case CCGOCmode:
17335 suffix = "ns";
17336 break;
17338 case CCmode:
17339 case CCGCmode:
17340 suffix = "ge";
17341 break;
17343 default:
17344 gcc_unreachable ();
17346 break;
17347 case GEU:
17348 if (mode == CCmode)
17349 suffix = "nb";
17350 else if (mode == CCCmode)
17351 suffix = fp ? "nb" : "nc";
17352 else
17353 gcc_unreachable ();
17354 break;
17355 case LE:
17356 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17357 suffix = "le";
17358 break;
17359 case LEU:
17360 if (mode == CCmode)
17361 suffix = "be";
17362 else
17363 gcc_unreachable ();
17364 break;
17365 case UNORDERED:
17366 suffix = fp ? "u" : "p";
17367 break;
17368 case ORDERED:
17369 suffix = fp ? "nu" : "np";
17370 break;
17371 default:
17372 gcc_unreachable ();
17374 fputs (suffix, file);
17377 /* Print the name of register X to FILE based on its machine mode and number.
17378 If CODE is 'w', pretend the mode is HImode.
17379 If CODE is 'b', pretend the mode is QImode.
17380 If CODE is 'k', pretend the mode is SImode.
17381 If CODE is 'q', pretend the mode is DImode.
17382 If CODE is 'x', pretend the mode is V4SFmode.
17383 If CODE is 't', pretend the mode is V8SFmode.
17384 If CODE is 'g', pretend the mode is V16SFmode.
17385 If CODE is 'h', pretend the reg is the 'high' byte register.
17386 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
17387 If CODE is 'd', duplicate the operand for an AVX instruction.
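/* Illustrative example: for the hard register AX, code 'b' prints "al",
   'h' prints "ah", 'w' prints "ax", 'k' prints "eax" and 'q' prints "rax"
   (with a leading '%' in AT&T syntax).  */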
17390 void
17391 print_reg (rtx x, int code, FILE *file)
17393 const char *reg;
17394 int msize;
17395 unsigned int regno;
17396 bool duplicated;
17398 if (ASSEMBLER_DIALECT == ASM_ATT)
17399 putc ('%', file);
17401 if (x == pc_rtx)
17403 gcc_assert (TARGET_64BIT);
17404 fputs ("rip", file);
17405 return;
17408 if (code == 'y' && STACK_TOP_P (x))
17410 fputs ("st(0)", file);
17411 return;
17414 if (code == 'w')
17415 msize = 2;
17416 else if (code == 'b')
17417 msize = 1;
17418 else if (code == 'k')
17419 msize = 4;
17420 else if (code == 'q')
17421 msize = 8;
17422 else if (code == 'h')
17423 msize = 0;
17424 else if (code == 'x')
17425 msize = 16;
17426 else if (code == 't')
17427 msize = 32;
17428 else if (code == 'g')
17429 msize = 64;
17430 else
17431 msize = GET_MODE_SIZE (GET_MODE (x));
17433 regno = true_regnum (x);
17435 gcc_assert (regno != ARG_POINTER_REGNUM
17436 && regno != FRAME_POINTER_REGNUM
17437 && regno != FPSR_REG
17438 && regno != FPCR_REG);
17440 if (regno == FLAGS_REG)
17442 output_operand_lossage ("invalid use of asm flag output");
17443 return;
17446 duplicated = code == 'd' && TARGET_AVX;
17448 switch (msize)
17450 case 8:
17451 case 4:
17452 if (LEGACY_INT_REGNO_P (regno))
17453 putc (msize == 8 && TARGET_64BIT ? 'r' : 'e', file);
17454 /* FALLTHRU */
17455 case 16:
17456 case 12:
17457 case 2:
17458 normal:
17459 reg = hi_reg_name[regno];
17460 break;
17461 case 1:
17462 if (regno >= ARRAY_SIZE (qi_reg_name))
17463 goto normal;
17464 reg = qi_reg_name[regno];
17465 break;
17466 case 0:
17467 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17468 goto normal;
17469 reg = qi_high_reg_name[regno];
17470 break;
17471 case 32:
17472 case 64:
17473 if (SSE_REGNO_P (regno))
17475 gcc_assert (!duplicated);
17476 putc (msize == 32 ? 'y' : 'z', file);
17477 reg = hi_reg_name[regno] + 1;
17478 break;
17480 goto normal;
17481 default:
17482 gcc_unreachable ();
17485 fputs (reg, file);
17487 /* Irritatingly, AMD extended registers use a
17488 different naming convention: "r%d[bwd]". */
17489 if (REX_INT_REGNO_P (regno))
17491 gcc_assert (TARGET_64BIT);
17492 switch (msize)
17494 case 0:
17495 error ("extended registers have no high halves");
17496 break;
17497 case 1:
17498 putc ('b', file);
17499 break;
17500 case 2:
17501 putc ('w', file);
17502 break;
17503 case 4:
17504 putc ('d', file);
17505 break;
17506 case 8:
17507 /* no suffix */
17508 break;
17509 default:
17510 error ("unsupported operand size for extended register");
17511 break;
17513 return;
17516 if (duplicated)
17518 if (ASSEMBLER_DIALECT == ASM_ATT)
17519 fprintf (file, ", %%%s", reg);
17520 else
17521 fprintf (file, ", %s", reg);
17525 /* Meaning of CODE:
17526 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17527 C -- print opcode suffix for set/cmov insn.
17528 c -- like C, but print reversed condition
17529 F,f -- likewise, but for floating-point.
17530 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17531 otherwise nothing
17532 R -- print embedded rounding and sae.
17533 r -- print only sae.
17534 z -- print the opcode suffix for the size of the current operand.
17535 Z -- likewise, with special suffixes for x87 instructions.
17536 * -- print a star (in certain assembler syntax)
17537 A -- print an absolute memory reference.
17538 E -- print address with DImode register names if TARGET_64BIT.
17539 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17540 s -- print a shift double count, followed by the assembler's argument
17541 delimiter.
17542 b -- print the QImode name of the register for the indicated operand.
17543 %b0 would print %al if operands[0] is reg 0.
17544 w -- likewise, print the HImode name of the register.
17545 k -- likewise, print the SImode name of the register.
17546 q -- likewise, print the DImode name of the register.
17547 x -- likewise, print the V4SFmode name of the register.
17548 t -- likewise, print the V8SFmode name of the register.
17549 g -- likewise, print the V16SFmode name of the register.
17550 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17551 y -- print "st(0)" instead of "st" as a register.
17552 d -- print duplicated register operand for AVX instruction.
17553 D -- print condition for SSE cmp instruction.
17554 P -- if PIC, print an @PLT suffix.
17555 p -- print raw symbol name.
17556 X -- don't print any sort of PIC '@' suffix for a symbol.
17557 & -- print some in-use local-dynamic symbol name.
17558 H -- print a memory address offset by 8; used for sse high-parts
17559 Y -- print condition for XOP pcom* instruction.
17560 + -- print a branch hint as 'cs' or 'ds' prefix
17561 ; -- print a semicolon (after prefixes, due to a bug in older gas).
17562 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17563 @ -- print the segment register of a thread base pointer load
17564 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17565 ! -- print MPX prefix for jxx/call/ret instructions if required.
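/* Illustrative example of how these codes appear in insn output templates:
   "%k1" prints operand 1 as its 32-bit (SImode) register name, "%b0" prints
   operand 0 as its QImode name, and "%z0" appends the size suffix implied by
   operand 0's mode ('b', 'w', 'l' or 'q' in AT&T syntax).  */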
17568 void
17569 ix86_print_operand (FILE *file, rtx x, int code)
17571 if (code)
17573 switch (code)
17575 case 'A':
17576 switch (ASSEMBLER_DIALECT)
17578 case ASM_ATT:
17579 putc ('*', file);
17580 break;
17582 case ASM_INTEL:
17583 /* Intel syntax. For absolute addresses, registers should not
17584 be surrounded by braces. */
17585 if (!REG_P (x))
17587 putc ('[', file);
17588 ix86_print_operand (file, x, 0);
17589 putc (']', file);
17590 return;
17592 break;
17594 default:
17595 gcc_unreachable ();
17598 ix86_print_operand (file, x, 0);
17599 return;
17601 case 'E':
17602 /* Wrap address in an UNSPEC to declare special handling. */
17603 if (TARGET_64BIT)
17604 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17606 output_address (VOIDmode, x);
17607 return;
17609 case 'L':
17610 if (ASSEMBLER_DIALECT == ASM_ATT)
17611 putc ('l', file);
17612 return;
17614 case 'W':
17615 if (ASSEMBLER_DIALECT == ASM_ATT)
17616 putc ('w', file);
17617 return;
17619 case 'B':
17620 if (ASSEMBLER_DIALECT == ASM_ATT)
17621 putc ('b', file);
17622 return;
17624 case 'Q':
17625 if (ASSEMBLER_DIALECT == ASM_ATT)
17626 putc ('l', file);
17627 return;
17629 case 'S':
17630 if (ASSEMBLER_DIALECT == ASM_ATT)
17631 putc ('s', file);
17632 return;
17634 case 'T':
17635 if (ASSEMBLER_DIALECT == ASM_ATT)
17636 putc ('t', file);
17637 return;
17639 case 'O':
17640 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17641 if (ASSEMBLER_DIALECT != ASM_ATT)
17642 return;
17644 switch (GET_MODE_SIZE (GET_MODE (x)))
17646 case 2:
17647 putc ('w', file);
17648 break;
17650 case 4:
17651 putc ('l', file);
17652 break;
17654 case 8:
17655 putc ('q', file);
17656 break;
17658 default:
17659 output_operand_lossage
17660 ("invalid operand size for operand code 'O'");
17661 return;
17664 putc ('.', file);
17665 #endif
17666 return;
17668 case 'z':
17669 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17671 /* Opcodes don't get size suffixes when using Intel syntax. */
17672 if (ASSEMBLER_DIALECT == ASM_INTEL)
17673 return;
17675 switch (GET_MODE_SIZE (GET_MODE (x)))
17677 case 1:
17678 putc ('b', file);
17679 return;
17681 case 2:
17682 putc ('w', file);
17683 return;
17685 case 4:
17686 putc ('l', file);
17687 return;
17689 case 8:
17690 putc ('q', file);
17691 return;
17693 default:
17694 output_operand_lossage
17695 ("invalid operand size for operand code 'z'");
17696 return;
17700 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17701 warning
17702 (0, "non-integer operand used with operand code 'z'");
17703 /* FALLTHRU */
17705 case 'Z':
17706 /* 387 opcodes don't get size suffixes when using Intel syntax. */
17707 if (ASSEMBLER_DIALECT == ASM_INTEL)
17708 return;
17710 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17712 switch (GET_MODE_SIZE (GET_MODE (x)))
17714 case 2:
17715 #ifdef HAVE_AS_IX86_FILDS
17716 putc ('s', file);
17717 #endif
17718 return;
17720 case 4:
17721 putc ('l', file);
17722 return;
17724 case 8:
17725 #ifdef HAVE_AS_IX86_FILDQ
17726 putc ('q', file);
17727 #else
17728 fputs ("ll", file);
17729 #endif
17730 return;
17732 default:
17733 break;
17736 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17738 /* 387 opcodes don't get size suffixes
17739 if the operands are registers. */
17740 if (STACK_REG_P (x))
17741 return;
17743 switch (GET_MODE_SIZE (GET_MODE (x)))
17745 case 4:
17746 putc ('s', file);
17747 return;
17749 case 8:
17750 putc ('l', file);
17751 return;
17753 case 12:
17754 case 16:
17755 putc ('t', file);
17756 return;
17758 default:
17759 break;
17762 else
17764 output_operand_lossage
17765 ("invalid operand type used with operand code 'Z'");
17766 return;
17769 output_operand_lossage
17770 ("invalid operand size for operand code 'Z'");
17771 return;
17773 case 'd':
17774 case 'b':
17775 case 'w':
17776 case 'k':
17777 case 'q':
17778 case 'h':
17779 case 't':
17780 case 'g':
17781 case 'y':
17782 case 'x':
17783 case 'X':
17784 case 'P':
17785 case 'p':
17786 break;
17788 case 's':
17789 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17791 ix86_print_operand (file, x, 0);
17792 fputs (", ", file);
17794 return;
17796 case 'Y':
17797 switch (GET_CODE (x))
17799 case NE:
17800 fputs ("neq", file);
17801 break;
17802 case EQ:
17803 fputs ("eq", file);
17804 break;
17805 case GE:
17806 case GEU:
17807 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17808 break;
17809 case GT:
17810 case GTU:
17811 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17812 break;
17813 case LE:
17814 case LEU:
17815 fputs ("le", file);
17816 break;
17817 case LT:
17818 case LTU:
17819 fputs ("lt", file);
17820 break;
17821 case UNORDERED:
17822 fputs ("unord", file);
17823 break;
17824 case ORDERED:
17825 fputs ("ord", file);
17826 break;
17827 case UNEQ:
17828 fputs ("ueq", file);
17829 break;
17830 case UNGE:
17831 fputs ("nlt", file);
17832 break;
17833 case UNGT:
17834 fputs ("nle", file);
17835 break;
17836 case UNLE:
17837 fputs ("ule", file);
17838 break;
17839 case UNLT:
17840 fputs ("ult", file);
17841 break;
17842 case LTGT:
17843 fputs ("une", file);
17844 break;
17845 default:
17846 output_operand_lossage ("operand is not a condition code, "
17847 "invalid operand code 'Y'");
17848 return;
17850 return;
17852 case 'D':
17853 /* Little bit of braindamage here. The SSE compare instructions
17854 use completely different names for the comparisons than the
17855 fp conditional moves do. */
17856 switch (GET_CODE (x))
17858 case UNEQ:
17859 if (TARGET_AVX)
17861 fputs ("eq_us", file);
17862 break;
17864 /* FALLTHRU */
17865 case EQ:
17866 fputs ("eq", file);
17867 break;
17868 case UNLT:
17869 if (TARGET_AVX)
17871 fputs ("nge", file);
17872 break;
17874 /* FALLTHRU */
17875 case LT:
17876 fputs ("lt", file);
17877 break;
17878 case UNLE:
17879 if (TARGET_AVX)
17881 fputs ("ngt", file);
17882 break;
17884 /* FALLTHRU */
17885 case LE:
17886 fputs ("le", file);
17887 break;
17888 case UNORDERED:
17889 fputs ("unord", file);
17890 break;
17891 case LTGT:
17892 if (TARGET_AVX)
17894 fputs ("neq_oq", file);
17895 break;
17897 /* FALLTHRU */
17898 case NE:
17899 fputs ("neq", file);
17900 break;
17901 case GE:
17902 if (TARGET_AVX)
17904 fputs ("ge", file);
17905 break;
17907 /* FALLTHRU */
17908 case UNGE:
17909 fputs ("nlt", file);
17910 break;
17911 case GT:
17912 if (TARGET_AVX)
17914 fputs ("gt", file);
17915 break;
17917 /* FALLTHRU */
17918 case UNGT:
17919 fputs ("nle", file);
17920 break;
17921 case ORDERED:
17922 fputs ("ord", file);
17923 break;
17924 default:
17925 output_operand_lossage ("operand is not a condition code, "
17926 "invalid operand code 'D'");
17927 return;
17929 return;
17931 case 'F':
17932 case 'f':
17933 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17934 if (ASSEMBLER_DIALECT == ASM_ATT)
17935 putc ('.', file);
17936 gcc_fallthrough ();
17937 #endif
17939 case 'C':
17940 case 'c':
17941 if (!COMPARISON_P (x))
17943 output_operand_lossage ("operand is not a condition code, "
17944 "invalid operand code '%c'", code);
17945 return;
17947 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17948 code == 'c' || code == 'f',
17949 code == 'F' || code == 'f',
17950 file);
17951 return;
17953 case 'H':
17954 if (!offsettable_memref_p (x))
17956 output_operand_lossage ("operand is not an offsettable memory "
17957 "reference, invalid operand code 'H'");
17958 return;
17960 /* It doesn't actually matter what mode we use here, as we're
17961 only going to use this for printing. */
17962 x = adjust_address_nv (x, DImode, 8);
17963 /* Output 'qword ptr' for intel assembler dialect. */
17964 if (ASSEMBLER_DIALECT == ASM_INTEL)
17965 code = 'q';
17966 break;
17968 case 'K':
17969 gcc_assert (CONST_INT_P (x));
17971 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17972 #ifdef HAVE_AS_IX86_HLE
17973 fputs ("xacquire ", file);
17974 #else
17975 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17976 #endif
17977 else if (INTVAL (x) & IX86_HLE_RELEASE)
17978 #ifdef HAVE_AS_IX86_HLE
17979 fputs ("xrelease ", file);
17980 #else
17981 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17982 #endif
17983 /* We do not want to print value of the operand. */
17984 return;
17986 case 'N':
17987 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17988 fputs ("{z}", file);
17989 return;
17991 case 'r':
17992 gcc_assert (CONST_INT_P (x));
17993 gcc_assert (INTVAL (x) == ROUND_SAE);
17995 if (ASSEMBLER_DIALECT == ASM_INTEL)
17996 fputs (", ", file);
17998 fputs ("{sae}", file);
18000 if (ASSEMBLER_DIALECT == ASM_ATT)
18001 fputs (", ", file);
18003 return;
18005 case 'R':
18006 gcc_assert (CONST_INT_P (x));
18008 if (ASSEMBLER_DIALECT == ASM_INTEL)
18009 fputs (", ", file);
18011 switch (INTVAL (x))
18013 case ROUND_NEAREST_INT | ROUND_SAE:
18014 fputs ("{rn-sae}", file);
18015 break;
18016 case ROUND_NEG_INF | ROUND_SAE:
18017 fputs ("{rd-sae}", file);
18018 break;
18019 case ROUND_POS_INF | ROUND_SAE:
18020 fputs ("{ru-sae}", file);
18021 break;
18022 case ROUND_ZERO | ROUND_SAE:
18023 fputs ("{rz-sae}", file);
18024 break;
18025 default:
18026 gcc_unreachable ();
18029 if (ASSEMBLER_DIALECT == ASM_ATT)
18030 fputs (", ", file);
18032 return;
18034 case '*':
18035 if (ASSEMBLER_DIALECT == ASM_ATT)
18036 putc ('*', file);
18037 return;
18039 case '&':
18041 const char *name = get_some_local_dynamic_name ();
18042 if (name == NULL)
18043 output_operand_lossage ("'%%&' used without any "
18044 "local dynamic TLS references");
18045 else
18046 assemble_name (file, name);
18047 return;
18050 case '+':
18052 rtx x;
18054 if (!optimize
18055 || optimize_function_for_size_p (cfun)
18056 || !TARGET_BRANCH_PREDICTION_HINTS)
18057 return;
18059 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18060 if (x)
18062 int pred_val = XINT (x, 0);
18064 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18065 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18067 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18068 bool cputaken
18069 = final_forward_branch_p (current_output_insn) == 0;
18071 /* Emit hints only when the default branch prediction
18072 heuristics would fail. */
18073 if (taken != cputaken)
18075 /* We use 3e (DS) prefix for taken branches and
18076 2e (CS) prefix for not taken branches. */
18077 if (taken)
18078 fputs ("ds ; ", file);
18079 else
18080 fputs ("cs ; ", file);
18084 return;
18087 case ';':
18088 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18089 putc (';', file);
18090 #endif
18091 return;
18093 case '@':
18094 if (ASSEMBLER_DIALECT == ASM_ATT)
18095 putc ('%', file);
18097 /* The kernel uses a different segment register for performance
18098 reasons; a system call would not have to trash the userspace
18099 segment register, which would be expensive. */
18100 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
18101 fputs ("fs", file);
18102 else
18103 fputs ("gs", file);
18104 return;
18106 case '~':
18107 putc (TARGET_AVX2 ? 'i' : 'f', file);
18108 return;
18110 case '^':
18111 if (TARGET_64BIT && Pmode != word_mode)
18112 fputs ("addr32 ", file);
18113 return;
18115 case '!':
18116 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18117 fputs ("bnd ", file);
18118 return;
18120 default:
18121 output_operand_lossage ("invalid operand code '%c'", code);
18125 if (REG_P (x))
18126 print_reg (x, code, file);
18128 else if (MEM_P (x))
18130 rtx addr = XEXP (x, 0);
18132 /* No `byte ptr' prefix for call instructions ... */
18133 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18135 machine_mode mode = GET_MODE (x);
18136 const char *size;
18138 /* Check for explicit size override codes. */
18139 if (code == 'b')
18140 size = "BYTE";
18141 else if (code == 'w')
18142 size = "WORD";
18143 else if (code == 'k')
18144 size = "DWORD";
18145 else if (code == 'q')
18146 size = "QWORD";
18147 else if (code == 'x')
18148 size = "XMMWORD";
18149 else if (code == 't')
18150 size = "YMMWORD";
18151 else if (code == 'g')
18152 size = "ZMMWORD";
18153 else if (mode == BLKmode)
18154 /* ... or BLKmode operands, when not overridden. */
18155 size = NULL;
18156 else
18157 switch (GET_MODE_SIZE (mode))
18159 case 1: size = "BYTE"; break;
18160 case 2: size = "WORD"; break;
18161 case 4: size = "DWORD"; break;
18162 case 8: size = "QWORD"; break;
18163 case 12: size = "TBYTE"; break;
18164 case 16:
18165 if (mode == XFmode)
18166 size = "TBYTE";
18167 else
18168 size = "XMMWORD";
18169 break;
18170 case 32: size = "YMMWORD"; break;
18171 case 64: size = "ZMMWORD"; break;
18172 default:
18173 gcc_unreachable ();
18175 if (size)
18177 fputs (size, file);
18178 fputs (" PTR ", file);
18182 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18183 output_operand_lossage ("invalid constraints for operand");
18184 else
18185 ix86_print_operand_address_as
18186 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18189 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18191 long l;
18193 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18195 if (ASSEMBLER_DIALECT == ASM_ATT)
18196 putc ('$', file);
18197 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18198 if (code == 'q')
18199 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18200 (unsigned long long) (int) l);
18201 else
18202 fprintf (file, "0x%08x", (unsigned int) l);
18205 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18207 long l[2];
18209 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18211 if (ASSEMBLER_DIALECT == ASM_ATT)
18212 putc ('$', file);
18213 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18216 /* These float cases don't actually occur as immediate operands. */
18217 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18219 char dstr[30];
18221 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18222 fputs (dstr, file);
18225 else
18227 /* We have patterns that allow zero sets of memory, for instance.
18228 In 64-bit mode, we should probably support all 8-byte vectors,
18229 since we can in fact encode that into an immediate. */
18230 if (GET_CODE (x) == CONST_VECTOR)
18232 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18233 x = const0_rtx;
18236 if (code != 'P' && code != 'p')
18238 if (CONST_INT_P (x))
18240 if (ASSEMBLER_DIALECT == ASM_ATT)
18241 putc ('$', file);
18243 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18244 || GET_CODE (x) == LABEL_REF)
18246 if (ASSEMBLER_DIALECT == ASM_ATT)
18247 putc ('$', file);
18248 else
18249 fputs ("OFFSET FLAT:", file);
18252 if (CONST_INT_P (x))
18253 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18254 else if (flag_pic || MACHOPIC_INDIRECT)
18255 output_pic_addr_const (file, x, code);
18256 else
18257 output_addr_const (file, x);
18261 static bool
18262 ix86_print_operand_punct_valid_p (unsigned char code)
18264 return (code == '@' || code == '*' || code == '+' || code == '&'
18265 || code == ';' || code == '~' || code == '^' || code == '!');
18268 /* Print a memory operand whose address is ADDR. */
18270 static void
18271 ix86_print_operand_address_as (FILE *file, rtx addr,
18272 addr_space_t as, bool no_rip)
18274 struct ix86_address parts;
18275 rtx base, index, disp;
18276 int scale;
18277 int ok;
18278 bool vsib = false;
18279 int code = 0;
18281 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18283 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18284 gcc_assert (parts.index == NULL_RTX);
18285 parts.index = XVECEXP (addr, 0, 1);
18286 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18287 addr = XVECEXP (addr, 0, 0);
18288 vsib = true;
18290 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18292 gcc_assert (TARGET_64BIT);
18293 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18294 code = 'q';
18296 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18298 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18299 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18300 if (parts.base != NULL_RTX)
18302 parts.index = parts.base;
18303 parts.scale = 1;
18305 parts.base = XVECEXP (addr, 0, 0);
18306 addr = XVECEXP (addr, 0, 0);
18308 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18310 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18311 gcc_assert (parts.index == NULL_RTX);
18312 parts.index = XVECEXP (addr, 0, 1);
18313 addr = XVECEXP (addr, 0, 0);
18315 else
18316 ok = ix86_decompose_address (addr, &parts);
18318 gcc_assert (ok);
18320 base = parts.base;
18321 index = parts.index;
18322 disp = parts.disp;
18323 scale = parts.scale;
18325 if (ADDR_SPACE_GENERIC_P (as))
18326 as = parts.seg;
18327 else
18328 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18330 if (!ADDR_SPACE_GENERIC_P (as))
18332 const char *string;
18334 if (as == ADDR_SPACE_SEG_FS)
18335 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18336 else if (as == ADDR_SPACE_SEG_GS)
18337 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18338 else
18339 gcc_unreachable ();
18340 fputs (string, file);
18343 /* Use the one-byte-shorter RIP-relative addressing in 64-bit mode. */
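/* Illustrative example (placeholder symbol "foo"): with no base and no index,
   a symbolic displacement is printed as  foo(%rip)  in AT&T syntax (or
   [rip+foo] for Intel), which encodes one byte shorter than the absolute
   disp32 form because the latter needs a SIB byte in 64-bit mode.  */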
18344 if (TARGET_64BIT && !base && !index && !no_rip)
18346 rtx symbol = disp;
18348 if (GET_CODE (disp) == CONST
18349 && GET_CODE (XEXP (disp, 0)) == PLUS
18350 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18351 symbol = XEXP (XEXP (disp, 0), 0);
18353 if (GET_CODE (symbol) == LABEL_REF
18354 || (GET_CODE (symbol) == SYMBOL_REF
18355 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18356 base = pc_rtx;
18359 if (!base && !index)
18361 /* Displacement only requires special attention. */
18362 if (CONST_INT_P (disp))
18364 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
18365 fputs ("ds:", file);
18366 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18368 /* Load the external function address via the GOT slot to avoid PLT. */
18369 else if (GET_CODE (disp) == CONST
18370 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18371 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18372 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18373 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18374 output_pic_addr_const (file, disp, 0);
18375 else if (flag_pic)
18376 output_pic_addr_const (file, disp, 0);
18377 else
18378 output_addr_const (file, disp);
18380 else
18382 /* Print SImode register names to force addr32 prefix. */
18383 if (SImode_address_operand (addr, VOIDmode))
18385 if (flag_checking)
18387 gcc_assert (TARGET_64BIT);
18388 switch (GET_CODE (addr))
18390 case SUBREG:
18391 gcc_assert (GET_MODE (addr) == SImode);
18392 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18393 break;
18394 case ZERO_EXTEND:
18395 case AND:
18396 gcc_assert (GET_MODE (addr) == DImode);
18397 break;
18398 default:
18399 gcc_unreachable ();
18402 gcc_assert (!code);
18403 code = 'k';
18405 else if (code == 0
18406 && TARGET_X32
18407 && disp
18408 && CONST_INT_P (disp)
18409 && INTVAL (disp) < -16*1024*1024)
18411 /* X32 runs in 64-bit mode, where displacement, DISP, in
18412 address DISP(%r64), is encoded as 32-bit immediate sign-
18413 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18414 address is %r64 + 0xffffffffbffffd00. When %r64 <
18415 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18416 which is invalid for x32. The correct address is %r64
18417 - 0x40000300 == 0xf7ffdd64. To properly encode
18418 -0x40000300(%r64) for x32, we zero-extend negative
18419 displacement by forcing addr32 prefix which truncates
18420 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18421 zero-extend all negative displacements, including -1(%rsp).
18422 However, for small negative displacements, sign-extension
18423 won't cause overflow. We only zero-extend negative
18424 displacements if they are < -16*1024*1024, which is also the bound used
18425 to check legitimate address displacements for PIC. */
18426 code = 'k';
18429 if (ASSEMBLER_DIALECT == ASM_ATT)
18431 if (disp)
18433 if (flag_pic)
18434 output_pic_addr_const (file, disp, 0);
18435 else if (GET_CODE (disp) == LABEL_REF)
18436 output_asm_label (disp);
18437 else
18438 output_addr_const (file, disp);
18441 putc ('(', file);
18442 if (base)
18443 print_reg (base, code, file);
18444 if (index)
18446 putc (',', file);
18447 print_reg (index, vsib ? 0 : code, file);
18448 if (scale != 1 || vsib)
18449 fprintf (file, ",%d", scale);
18451 putc (')', file);
18453 else
18455 rtx offset = NULL_RTX;
18457 if (disp)
18459 /* Pull out the offset of a symbol; print any symbol itself. */
18460 if (GET_CODE (disp) == CONST
18461 && GET_CODE (XEXP (disp, 0)) == PLUS
18462 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18464 offset = XEXP (XEXP (disp, 0), 1);
18465 disp = gen_rtx_CONST (VOIDmode,
18466 XEXP (XEXP (disp, 0), 0));
18469 if (flag_pic)
18470 output_pic_addr_const (file, disp, 0);
18471 else if (GET_CODE (disp) == LABEL_REF)
18472 output_asm_label (disp);
18473 else if (CONST_INT_P (disp))
18474 offset = disp;
18475 else
18476 output_addr_const (file, disp);
18479 putc ('[', file);
18480 if (base)
18482 print_reg (base, code, file);
18483 if (offset)
18485 if (INTVAL (offset) >= 0)
18486 putc ('+', file);
18487 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18490 else if (offset)
18491 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18492 else
18493 putc ('0', file);
18495 if (index)
18497 putc ('+', file);
18498 print_reg (index, vsib ? 0 : code, file);
18499 if (scale != 1 || vsib)
18500 fprintf (file, "*%d", scale);
18502 putc (']', file);
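/* Illustrative only (not from the original source): with base %rax,
   index %rbx, scale 4 and displacement 16, and assuming no segment
   override and no VSIB operand, the AT&T branch above would print
   "16(%rax,%rbx,4)" while the Intel branch would print "[rax+16+rbx*4]".  */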
18507 static void
18508 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18510 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18513 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18515 static bool
18516 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18518 rtx op;
18520 if (GET_CODE (x) != UNSPEC)
18521 return false;
18523 op = XVECEXP (x, 0, 0);
18524 switch (XINT (x, 1))
18526 case UNSPEC_GOTTPOFF:
18527 output_addr_const (file, op);
18528 /* FIXME: This might be @TPOFF in Sun ld. */
18529 fputs ("@gottpoff", file);
18530 break;
18531 case UNSPEC_TPOFF:
18532 output_addr_const (file, op);
18533 fputs ("@tpoff", file);
18534 break;
18535 case UNSPEC_NTPOFF:
18536 output_addr_const (file, op);
18537 if (TARGET_64BIT)
18538 fputs ("@tpoff", file);
18539 else
18540 fputs ("@ntpoff", file);
18541 break;
18542 case UNSPEC_DTPOFF:
18543 output_addr_const (file, op);
18544 fputs ("@dtpoff", file);
18545 break;
18546 case UNSPEC_GOTNTPOFF:
18547 output_addr_const (file, op);
18548 if (TARGET_64BIT)
18549 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18550 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18551 else
18552 fputs ("@gotntpoff", file);
18553 break;
18554 case UNSPEC_INDNTPOFF:
18555 output_addr_const (file, op);
18556 fputs ("@indntpoff", file);
18557 break;
18558 #if TARGET_MACHO
18559 case UNSPEC_MACHOPIC_OFFSET:
18560 output_addr_const (file, op);
18561 putc ('-', file);
18562 machopic_output_function_base_name (file);
18563 break;
18564 #endif
18566 case UNSPEC_STACK_CHECK:
18568 int offset;
18570 gcc_assert (flag_split_stack);
18572 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
18573 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
18574 #else
18575 gcc_unreachable ();
18576 #endif
18578 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
18580 break;
18582 default:
18583 return false;
18586 return true;
18589 /* Split one or more double-mode RTL references into pairs of half-mode
18590 references. The RTL can be REG, offsettable MEM, integer constant, or
18591 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18592 split and "num" is its length. lo_half and hi_half are output arrays
18593 that parallel "operands". */
18595 void
18596 split_double_mode (machine_mode mode, rtx operands[],
18597 int num, rtx lo_half[], rtx hi_half[])
18599 machine_mode half_mode;
18600 unsigned int byte;
18602 switch (mode)
18604 case TImode:
18605 half_mode = DImode;
18606 break;
18607 case DImode:
18608 half_mode = SImode;
18609 break;
18610 default:
18611 gcc_unreachable ();
18614 byte = GET_MODE_SIZE (half_mode);
18616 while (num--)
18618 rtx op = operands[num];
18620 /* simplify_subreg refuses to split volatile memory addresses,
18621 but we still have to handle them. */
18622 if (MEM_P (op))
18624 lo_half[num] = adjust_address (op, half_mode, 0);
18625 hi_half[num] = adjust_address (op, half_mode, byte);
18627 else
18629 lo_half[num] = simplify_gen_subreg (half_mode, op,
18630 GET_MODE (op) == VOIDmode
18631 ? mode : GET_MODE (op), 0);
18632 hi_half[num] = simplify_gen_subreg (half_mode, op,
18633 GET_MODE (op) == VOIDmode
18634 ? mode : GET_MODE (op), byte);
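/* A rough worked example, for illustration and assuming a DImode pseudo
   on a 32-bit target: half_mode is SImode and byte is 4, so lo_half[n]
   becomes (subreg:SI (reg:DI n) 0) and hi_half[n] becomes
   (subreg:SI (reg:DI n) 4); a MEM operand is instead split with
   adjust_address at offsets 0 and 4.  */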
18639 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18640 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18641 is the expression of the binary operation. The output may either be
18642 emitted here, or returned to the caller, like all output_* functions.
18644 There is no guarantee that the operands are the same mode, as they
18645 might be within FLOAT or FLOAT_EXTEND expressions. */
18647 #ifndef SYSV386_COMPAT
18648 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18649 wants to fix the assemblers because that causes incompatibility
18650 with gcc. No-one wants to fix gcc because that causes
18651 incompatibility with assemblers... You can use the option of
18652 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18653 #define SYSV386_COMPAT 1
18654 #endif
18656 const char *
18657 output_387_binary_op (rtx insn, rtx *operands)
18659 static char buf[40];
18660 const char *p;
18661 const char *ssep;
18662 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
18664 /* Even if we do not want to check the inputs, this documents the input
18665 constraints, which helps in understanding the following code. */
18666 if (flag_checking)
18668 if (STACK_REG_P (operands[0])
18669 && ((REG_P (operands[1])
18670 && REGNO (operands[0]) == REGNO (operands[1])
18671 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18672 || (REG_P (operands[2])
18673 && REGNO (operands[0]) == REGNO (operands[2])
18674 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18675 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18676 ; /* ok */
18677 else
18678 gcc_assert (is_sse);
18681 switch (GET_CODE (operands[3]))
18683 case PLUS:
18684 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18685 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18686 p = "fiadd";
18687 else
18688 p = "fadd";
18689 ssep = "vadd";
18690 break;
18692 case MINUS:
18693 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18694 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18695 p = "fisub";
18696 else
18697 p = "fsub";
18698 ssep = "vsub";
18699 break;
18701 case MULT:
18702 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18703 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18704 p = "fimul";
18705 else
18706 p = "fmul";
18707 ssep = "vmul";
18708 break;
18710 case DIV:
18711 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18712 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18713 p = "fidiv";
18714 else
18715 p = "fdiv";
18716 ssep = "vdiv";
18717 break;
18719 default:
18720 gcc_unreachable ();
18723 if (is_sse)
18725 if (TARGET_AVX)
18727 strcpy (buf, ssep);
18728 if (GET_MODE (operands[0]) == SFmode)
18729 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
18730 else
18731 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
18733 else
18735 strcpy (buf, ssep + 1);
18736 if (GET_MODE (operands[0]) == SFmode)
18737 strcat (buf, "ss\t{%2, %0|%0, %2}");
18738 else
18739 strcat (buf, "sd\t{%2, %0|%0, %2}");
18741 return buf;
18743 strcpy (buf, p);
18745 switch (GET_CODE (operands[3]))
18747 case MULT:
18748 case PLUS:
18749 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18750 std::swap (operands[1], operands[2]);
18752 /* We know operands[0] == operands[1]. */
18754 if (MEM_P (operands[2]))
18756 p = "%Z2\t%2";
18757 break;
18760 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18762 if (STACK_TOP_P (operands[0]))
18763 /* How is it that we are storing to a dead operand[2]?
18764 Well, presumably operands[1] is dead too. We can't
18765 store the result to st(0) as st(0) gets popped on this
18766 instruction. Instead store to operands[2] (which I
18767 think has to be st(1)). st(1) will be popped later.
18768 gcc <= 2.8.1 didn't have this check and generated
18769 assembly code that the Unixware assembler rejected. */
18770 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18771 else
18772 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18773 break;
18776 if (STACK_TOP_P (operands[0]))
18777 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18778 else
18779 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18780 break;
18782 case MINUS:
18783 case DIV:
18784 if (MEM_P (operands[1]))
18786 p = "r%Z1\t%1";
18787 break;
18790 if (MEM_P (operands[2]))
18792 p = "%Z2\t%2";
18793 break;
18796 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18798 #if SYSV386_COMPAT
18799 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18800 derived assemblers, confusingly reverse the direction of
18801 the operation for fsub{r} and fdiv{r} when the
18802 destination register is not st(0). The Intel assembler
18803 doesn't have this brain damage. Read !SYSV386_COMPAT to
18804 figure out what the hardware really does. */
18805 if (STACK_TOP_P (operands[0]))
18806 p = "{p\t%0, %2|rp\t%2, %0}";
18807 else
18808 p = "{rp\t%2, %0|p\t%0, %2}";
18809 #else
18810 if (STACK_TOP_P (operands[0]))
18811 /* As above for fmul/fadd, we can't store to st(0). */
18812 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18813 else
18814 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18815 #endif
18816 break;
18819 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18821 #if SYSV386_COMPAT
18822 if (STACK_TOP_P (operands[0]))
18823 p = "{rp\t%0, %1|p\t%1, %0}";
18824 else
18825 p = "{p\t%1, %0|rp\t%0, %1}";
18826 #else
18827 if (STACK_TOP_P (operands[0]))
18828 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18829 else
18830 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18831 #endif
18832 break;
18835 if (STACK_TOP_P (operands[0]))
18837 if (STACK_TOP_P (operands[1]))
18838 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18839 else
18840 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18841 break;
18843 else if (STACK_TOP_P (operands[1]))
18845 #if SYSV386_COMPAT
18846 p = "{\t%1, %0|r\t%0, %1}";
18847 #else
18848 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18849 #endif
18851 else
18853 #if SYSV386_COMPAT
18854 p = "{r\t%2, %0|\t%0, %2}";
18855 #else
18856 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18857 #endif
18859 break;
18861 default:
18862 gcc_unreachable ();
18865 strcat (buf, p);
18866 return buf;
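/* Sketch of typical results, for illustration only: a PLUS with SFmode
   SSE operands under AVX yields "vaddss\t{%2, %1, %0|%0, %1, %2}", while
   an x87 PLUS whose operands[2] is a MEM yields "fadd%Z2\t%2".  */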
18869 /* Return needed mode for entity in optimize_mode_switching pass. */
18871 static int
18872 ix86_dirflag_mode_needed (rtx_insn *insn)
18874 if (CALL_P (insn))
18876 if (cfun->machine->func_type == TYPE_NORMAL)
18877 return X86_DIRFLAG_ANY;
18878 else
18879 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18880 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18883 if (recog_memoized (insn) < 0)
18884 return X86_DIRFLAG_ANY;
18886 if (get_attr_type (insn) == TYPE_STR)
18888 /* Emit cld instruction if stringops are used in the function. */
18889 if (cfun->machine->func_type == TYPE_NORMAL)
18890 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18891 else
18892 return X86_DIRFLAG_RESET;
18895 return X86_DIRFLAG_ANY;
18898 /* Check if a 256bit AVX register is referenced inside of EXP. */
18900 static bool
18901 ix86_check_avx256_register (const_rtx exp)
18903 if (SUBREG_P (exp))
18904 exp = SUBREG_REG (exp);
18906 return (REG_P (exp)
18907 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
18910 /* Return needed mode for entity in optimize_mode_switching pass. */
18912 static int
18913 ix86_avx_u128_mode_needed (rtx_insn *insn)
18915 if (CALL_P (insn))
18917 rtx link;
18919 /* Needed mode is set to AVX_U128_CLEAN if there are
18920 no 256bit modes used in function arguments. */
18921 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18922 link;
18923 link = XEXP (link, 1))
18925 if (GET_CODE (XEXP (link, 0)) == USE)
18927 rtx arg = XEXP (XEXP (link, 0), 0);
18929 if (ix86_check_avx256_register (arg))
18930 return AVX_U128_DIRTY;
18934 return AVX_U128_CLEAN;
18937 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
18938 changes state only when a 256bit register is written to, but we need
18939 to prevent the compiler from moving the optimal insertion point above
18940 an eventual read from a 256bit register. */
18941 subrtx_iterator::array_type array;
18942 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18943 if (ix86_check_avx256_register (*iter))
18944 return AVX_U128_DIRTY;
18946 return AVX_U128_ANY;
18949 /* Return mode that i387 must be switched into
18950 prior to the execution of insn. */
18952 static int
18953 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18955 enum attr_i387_cw mode;
18957 /* The mode UNINITIALIZED is used to store the control word after a
18958 function call or ASM pattern. The mode ANY specifies that the function
18959 has no requirements on the control word and makes no changes to the
18960 bits we are interested in. */
18962 if (CALL_P (insn)
18963 || (NONJUMP_INSN_P (insn)
18964 && (asm_noperands (PATTERN (insn)) >= 0
18965 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18966 return I387_CW_UNINITIALIZED;
18968 if (recog_memoized (insn) < 0)
18969 return I387_CW_ANY;
18971 mode = get_attr_i387_cw (insn);
18973 switch (entity)
18975 case I387_TRUNC:
18976 if (mode == I387_CW_TRUNC)
18977 return mode;
18978 break;
18980 case I387_FLOOR:
18981 if (mode == I387_CW_FLOOR)
18982 return mode;
18983 break;
18985 case I387_CEIL:
18986 if (mode == I387_CW_CEIL)
18987 return mode;
18988 break;
18990 case I387_MASK_PM:
18991 if (mode == I387_CW_MASK_PM)
18992 return mode;
18993 break;
18995 default:
18996 gcc_unreachable ();
18999 return I387_CW_ANY;
19002 /* Return mode that entity must be switched into
19003 prior to the execution of insn. */
19005 static int
19006 ix86_mode_needed (int entity, rtx_insn *insn)
19008 switch (entity)
19010 case X86_DIRFLAG:
19011 return ix86_dirflag_mode_needed (insn);
19012 case AVX_U128:
19013 return ix86_avx_u128_mode_needed (insn);
19014 case I387_TRUNC:
19015 case I387_FLOOR:
19016 case I387_CEIL:
19017 case I387_MASK_PM:
19018 return ix86_i387_mode_needed (entity, insn);
19019 default:
19020 gcc_unreachable ();
19022 return 0;
19025 /* Check if a 256bit AVX register is referenced in stores. */
19027 static void
19028 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
19030 if (ix86_check_avx256_register (dest))
19032 bool *used = (bool *) data;
19033 *used = true;
19037 /* Calculate mode of upper 128bit AVX registers after the insn. */
19039 static int
19040 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19042 rtx pat = PATTERN (insn);
19044 if (vzeroupper_operation (pat, VOIDmode)
19045 || vzeroall_operation (pat, VOIDmode))
19046 return AVX_U128_CLEAN;
19048 /* We know that the state is clean after a CALL insn if no 256bit
19049 registers are used in the function return register. */
19050 if (CALL_P (insn))
19052 bool avx_reg256_found = false;
19053 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
19055 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19058 /* Otherwise, return current mode. Remember that if insn
19059 references AVX 256bit registers, the mode was already changed
19060 to DIRTY from MODE_NEEDED. */
19061 return mode;
19064 /* Return the mode that an insn results in. */
19066 static int
19067 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19069 switch (entity)
19071 case X86_DIRFLAG:
19072 return mode;
19073 case AVX_U128:
19074 return ix86_avx_u128_mode_after (mode, insn);
19075 case I387_TRUNC:
19076 case I387_FLOOR:
19077 case I387_CEIL:
19078 case I387_MASK_PM:
19079 return mode;
19080 default:
19081 gcc_unreachable ();
19085 static int
19086 ix86_dirflag_mode_entry (void)
19088 /* For TARGET_CLD or in the interrupt handler we can't assume
19089 direction flag state at function entry. */
19090 if (TARGET_CLD
19091 || cfun->machine->func_type != TYPE_NORMAL)
19092 return X86_DIRFLAG_ANY;
19094 return X86_DIRFLAG_RESET;
19097 static int
19098 ix86_avx_u128_mode_entry (void)
19100 tree arg;
19102 /* Entry mode is set to AVX_U128_DIRTY if there are
19103 256bit modes used in function arguments. */
19104 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19105 arg = TREE_CHAIN (arg))
19107 rtx incoming = DECL_INCOMING_RTL (arg);
19109 if (incoming && ix86_check_avx256_register (incoming))
19110 return AVX_U128_DIRTY;
19113 return AVX_U128_CLEAN;
19116 /* Return a mode that ENTITY is assumed to be
19117 switched to at function entry. */
19119 static int
19120 ix86_mode_entry (int entity)
19122 switch (entity)
19124 case X86_DIRFLAG:
19125 return ix86_dirflag_mode_entry ();
19126 case AVX_U128:
19127 return ix86_avx_u128_mode_entry ();
19128 case I387_TRUNC:
19129 case I387_FLOOR:
19130 case I387_CEIL:
19131 case I387_MASK_PM:
19132 return I387_CW_ANY;
19133 default:
19134 gcc_unreachable ();
19138 static int
19139 ix86_avx_u128_mode_exit (void)
19141 rtx reg = crtl->return_rtx;
19143 /* Exit mode is set to AVX_U128_DIRTY if there are
19144 256bit modes used in the function return register. */
19145 if (reg && ix86_check_avx256_register (reg))
19146 return AVX_U128_DIRTY;
19148 return AVX_U128_CLEAN;
19151 /* Return a mode that ENTITY is assumed to be
19152 switched to at function exit. */
19154 static int
19155 ix86_mode_exit (int entity)
19157 switch (entity)
19159 case X86_DIRFLAG:
19160 return X86_DIRFLAG_ANY;
19161 case AVX_U128:
19162 return ix86_avx_u128_mode_exit ();
19163 case I387_TRUNC:
19164 case I387_FLOOR:
19165 case I387_CEIL:
19166 case I387_MASK_PM:
19167 return I387_CW_ANY;
19168 default:
19169 gcc_unreachable ();
19173 static int
19174 ix86_mode_priority (int, int n)
19176 return n;
19179 /* Output code to initialize control word copies used by trunc?f?i and
19180 rounding patterns. CURRENT_MODE is set to current control word,
19181 while NEW_MODE is set to new control word. */
19183 static void
19184 emit_i387_cw_initialization (int mode)
19186 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19187 rtx new_mode;
19189 enum ix86_stack_slot slot;
19191 rtx reg = gen_reg_rtx (HImode);
19193 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19194 emit_move_insn (reg, copy_rtx (stored_mode));
19196 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19197 || optimize_insn_for_size_p ())
19199 switch (mode)
19201 case I387_CW_TRUNC:
19202 /* round toward zero (truncate) */
19203 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19204 slot = SLOT_CW_TRUNC;
19205 break;
19207 case I387_CW_FLOOR:
19208 /* round down toward -oo */
19209 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19210 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19211 slot = SLOT_CW_FLOOR;
19212 break;
19214 case I387_CW_CEIL:
19215 /* round up toward +oo */
19216 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19217 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19218 slot = SLOT_CW_CEIL;
19219 break;
19221 case I387_CW_MASK_PM:
19222 /* mask precision exception for nearbyint() */
19223 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19224 slot = SLOT_CW_MASK_PM;
19225 break;
19227 default:
19228 gcc_unreachable ();
19231 else
19233 switch (mode)
19235 case I387_CW_TRUNC:
19236 /* round toward zero (truncate) */
19237 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19238 slot = SLOT_CW_TRUNC;
19239 break;
19241 case I387_CW_FLOOR:
19242 /* round down toward -oo */
19243 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19244 slot = SLOT_CW_FLOOR;
19245 break;
19247 case I387_CW_CEIL:
19248 /* round up toward +oo */
19249 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19250 slot = SLOT_CW_CEIL;
19251 break;
19253 case I387_CW_MASK_PM:
19254 /* mask precision exception for nearbyint() */
19255 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19256 slot = SLOT_CW_MASK_PM;
19257 break;
19259 default:
19260 gcc_unreachable ();
19264 gcc_assert (slot < MAX_386_STACK_LOCALS);
19266 new_mode = assign_386_stack_local (HImode, slot);
19267 emit_move_insn (new_mode, reg);
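/* For reference, a sketch of the x87 control word layout this relies on:
   bits 10-11 form the rounding control field (00 = nearest, 01 = down,
   10 = up, 11 = truncate) and bit 5 is the precision exception mask, so
   OR-ing in 0x0c00 selects truncation, AND ~0x0c00 then OR 0x0400 selects
   floor, AND ~0x0c00 then OR 0x0800 selects ceil, and OR 0x0020 masks the
   precision exception for nearbyint.  */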
19270 /* Emit vzeroupper. */
19272 void
19273 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19275 int i;
19277 /* Cancel automatic vzeroupper insertion if there are
19278 live call-saved SSE registers at the insertion point. */
19280 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19281 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19282 return;
19284 if (TARGET_64BIT)
19285 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19286 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19287 return;
19289 emit_insn (gen_avx_vzeroupper ());
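/* Illustration, not part of the original code: if a function computes in
   a 256-bit ymm register and then reaches a point requiring the clean
   state (for example a call into code using legacy SSE encodings), the
   AVX_U128 entity goes DIRTY -> CLEAN and a vzeroupper is emitted here to
   avoid the AVX/SSE transition penalty; the insertion is cancelled above
   when call-saved SSE registers are live at this point.  */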
19294 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19295 is the set of hard registers live at the point where the insn(s)
19296 are to be inserted. */
19298 static void
19299 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19300 HARD_REG_SET regs_live)
19302 switch (entity)
19304 case X86_DIRFLAG:
19305 if (mode == X86_DIRFLAG_RESET)
19306 emit_insn (gen_cld ());
19307 break;
19308 case AVX_U128:
19309 if (mode == AVX_U128_CLEAN)
19310 ix86_avx_emit_vzeroupper (regs_live);
19311 break;
19312 case I387_TRUNC:
19313 case I387_FLOOR:
19314 case I387_CEIL:
19315 case I387_MASK_PM:
19316 if (mode != I387_CW_ANY
19317 && mode != I387_CW_UNINITIALIZED)
19318 emit_i387_cw_initialization (mode);
19319 break;
19320 default:
19321 gcc_unreachable ();
19325 /* Output code for INSN to convert a float to a signed int. OPERANDS
19326 are the insn operands. The output may be [HSD]Imode and the input
19327 operand may be [SDX]Fmode. */
19329 const char *
19330 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19332 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19333 int dimode_p = GET_MODE (operands[0]) == DImode;
19334 int round_mode = get_attr_i387_cw (insn);
19336 /* Jump through a hoop or two for DImode, since the hardware has no
19337 non-popping instruction. We used to do this a different way, but
19338 that was somewhat fragile and broke with post-reload splitters. */
19339 if ((dimode_p || fisttp) && !stack_top_dies)
19340 output_asm_insn ("fld\t%y1", operands);
19342 gcc_assert (STACK_TOP_P (operands[1]));
19343 gcc_assert (MEM_P (operands[0]));
19344 gcc_assert (GET_MODE (operands[1]) != TFmode);
19346 if (fisttp)
19347 output_asm_insn ("fisttp%Z0\t%0", operands);
19348 else
19350 if (round_mode != I387_CW_ANY)
19351 output_asm_insn ("fldcw\t%3", operands);
19352 if (stack_top_dies || dimode_p)
19353 output_asm_insn ("fistp%Z0\t%0", operands);
19354 else
19355 output_asm_insn ("fist%Z0\t%0", operands);
19356 if (round_mode != I387_CW_ANY)
19357 output_asm_insn ("fldcw\t%2", operands);
19360 return "";
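/* Roughly, for a DImode store without fisttp while the stack top is still
   live, the sequence emitted above looks like (an illustrative sketch):
     fld %y1     - duplicate the value, because fistp will pop it
     fldcw %3    - load the truncating control word
     fistp%Z0 %0
     fldcw %2    - restore the original control word  */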
19363 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19364 have the values zero or one, indicates the ffreep insn's operand
19365 from the OPERANDS array. */
19367 static const char *
19368 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19370 if (TARGET_USE_FFREEP)
19371 #ifdef HAVE_AS_IX86_FFREEP
19372 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19373 #else
19375 static char retval[32];
19376 int regno = REGNO (operands[opno]);
19378 gcc_assert (STACK_REGNO_P (regno));
19380 regno -= FIRST_STACK_REG;
19382 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19383 return retval;
19385 #endif
19387 return opno ? "fstp\t%y1" : "fstp\t%y0";
19391 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19392 should be used. UNORDERED_P is true when fucom should be used. */
19394 const char *
19395 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
19397 int stack_top_dies;
19398 rtx cmp_op0, cmp_op1;
19399 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
19401 if (eflags_p)
19403 cmp_op0 = operands[0];
19404 cmp_op1 = operands[1];
19406 else
19408 cmp_op0 = operands[1];
19409 cmp_op1 = operands[2];
19412 if (is_sse)
19414 if (GET_MODE (operands[0]) == SFmode)
19415 if (unordered_p)
19416 return "%vucomiss\t{%1, %0|%0, %1}";
19417 else
19418 return "%vcomiss\t{%1, %0|%0, %1}";
19419 else
19420 if (unordered_p)
19421 return "%vucomisd\t{%1, %0|%0, %1}";
19422 else
19423 return "%vcomisd\t{%1, %0|%0, %1}";
19426 gcc_assert (STACK_TOP_P (cmp_op0));
19428 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19430 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
19432 if (stack_top_dies)
19434 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
19435 return output_387_ffreep (operands, 1);
19437 else
19438 return "ftst\n\tfnstsw\t%0";
19441 if (STACK_REG_P (cmp_op1)
19442 && stack_top_dies
19443 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
19444 && REGNO (cmp_op1) != FIRST_STACK_REG)
19446 /* If the top of the 387 stack dies, and the other operand
19447 is also a stack register that dies, then this must be a
19448 `fcompp' float compare. */
19450 if (eflags_p)
19452 /* There is no double popping fcomi variant. Fortunately,
19453 eflags is immune from the fstp's cc clobbering. */
19454 if (unordered_p)
19455 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
19456 else
19457 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
19458 return output_387_ffreep (operands, 0);
19460 else
19462 if (unordered_p)
19463 return "fucompp\n\tfnstsw\t%0";
19464 else
19465 return "fcompp\n\tfnstsw\t%0";
19468 else
19470 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
19472 static const char * const alt[16] =
19474 "fcom%Z2\t%y2\n\tfnstsw\t%0",
19475 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
19476 "fucom%Z2\t%y2\n\tfnstsw\t%0",
19477 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
19479 "ficom%Z2\t%y2\n\tfnstsw\t%0",
19480 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
19481 NULL,
19482 NULL,
19484 "fcomi\t{%y1, %0|%0, %y1}",
19485 "fcomip\t{%y1, %0|%0, %y1}",
19486 "fucomi\t{%y1, %0|%0, %y1}",
19487 "fucomip\t{%y1, %0|%0, %y1}",
19489 NULL,
19490 NULL,
19491 NULL,
19492 NULL
19495 int mask;
19496 const char *ret;
19498 mask = eflags_p << 3;
19499 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
19500 mask |= unordered_p << 1;
19501 mask |= stack_top_dies;
19503 gcc_assert (mask < 16);
19504 ret = alt[mask];
19505 gcc_assert (ret);
19507 return ret;
19511 void
19512 ix86_output_addr_vec_elt (FILE *file, int value)
19514 const char *directive = ASM_LONG;
19516 #ifdef ASM_QUAD
19517 if (TARGET_LP64)
19518 directive = ASM_QUAD;
19519 #else
19520 gcc_assert (!TARGET_64BIT);
19521 #endif
19523 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19526 void
19527 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19529 const char *directive = ASM_LONG;
19531 #ifdef ASM_QUAD
19532 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19533 directive = ASM_QUAD;
19534 #else
19535 gcc_assert (!TARGET_64BIT);
19536 #endif
19537 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19538 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19539 fprintf (file, "%s%s%d-%s%d\n",
19540 directive, LPREFIX, value, LPREFIX, rel);
19541 else if (HAVE_AS_GOTOFF_IN_DATA)
19542 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19543 #if TARGET_MACHO
19544 else if (TARGET_MACHO)
19546 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19547 machopic_output_function_base_name (file);
19548 putc ('\n', file);
19550 #endif
19551 else
19552 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19553 GOT_SYMBOL_NAME, LPREFIX, value);
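/* By way of example only (label numbers made up), assuming LPREFIX is
   ".L": on x86-64 or VxWorks RTP this emits something like
   ".long .L5-.L2" (".quad" when the case vector mode is DImode), while
   32-bit PIC code with GOTOFF-in-data support emits ".long .L5@GOTOFF".  */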
19556 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19557 for the target. */
19559 void
19560 ix86_expand_clear (rtx dest)
19562 rtx tmp;
19564 /* We play register width games, which are only valid after reload. */
19565 gcc_assert (reload_completed);
19567 /* Avoid HImode and its attendant prefix byte. */
19568 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19569 dest = gen_rtx_REG (SImode, REGNO (dest));
19570 tmp = gen_rtx_SET (dest, const0_rtx);
19572 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19574 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19575 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19578 emit_insn (tmp);
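/* Concretely, an illustrative example: clearing %eax normally becomes
   "xorl %eax, %eax" wrapped in a PARALLEL with a (clobber (reg:CC flags)),
   since the xor form is shorter; only when TARGET_USE_MOV0 is set and we
   are not optimizing for size is the flag-preserving "movl $0, %eax"
   form kept.  */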
19581 /* X is an unchanging MEM. If it is a constant pool reference, return
19582 the constant pool rtx, else NULL. */
19584 rtx
19585 maybe_get_pool_constant (rtx x)
19587 x = ix86_delegitimize_address (XEXP (x, 0));
19589 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19590 return get_pool_constant (x);
19592 return NULL_RTX;
19595 void
19596 ix86_expand_move (machine_mode mode, rtx operands[])
19598 rtx op0, op1;
19599 rtx tmp, addend = NULL_RTX;
19600 enum tls_model model;
19602 op0 = operands[0];
19603 op1 = operands[1];
19605 switch (GET_CODE (op1))
19607 case CONST:
19608 tmp = XEXP (op1, 0);
19610 if (GET_CODE (tmp) != PLUS
19611 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19612 break;
19614 op1 = XEXP (tmp, 0);
19615 addend = XEXP (tmp, 1);
19616 /* FALLTHRU */
19618 case SYMBOL_REF:
19619 model = SYMBOL_REF_TLS_MODEL (op1);
19621 if (model)
19622 op1 = legitimize_tls_address (op1, model, true);
19623 else if (ix86_force_load_from_GOT_p (op1))
19625 /* Load the external function address via GOT slot to avoid PLT. */
19626 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19627 (TARGET_64BIT
19628 ? UNSPEC_GOTPCREL
19629 : UNSPEC_GOT));
19630 op1 = gen_rtx_CONST (Pmode, op1);
19631 op1 = gen_const_mem (Pmode, op1);
19632 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19634 else
19636 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19637 if (tmp)
19639 op1 = tmp;
19640 if (!addend)
19641 break;
19643 else
19645 op1 = operands[1];
19646 break;
19650 if (addend)
19652 op1 = force_operand (op1, NULL_RTX);
19653 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19654 op0, 1, OPTAB_DIRECT);
19656 else
19657 op1 = force_operand (op1, op0);
19659 if (op1 == op0)
19660 return;
19662 op1 = convert_to_mode (mode, op1, 1);
19664 default:
19665 break;
19668 if ((flag_pic || MACHOPIC_INDIRECT)
19669 && symbolic_operand (op1, mode))
19671 if (TARGET_MACHO && !TARGET_64BIT)
19673 #if TARGET_MACHO
19674 /* dynamic-no-pic */
19675 if (MACHOPIC_INDIRECT)
19677 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19678 ? op0 : gen_reg_rtx (Pmode);
19679 op1 = machopic_indirect_data_reference (op1, temp);
19680 if (MACHOPIC_PURE)
19681 op1 = machopic_legitimize_pic_address (op1, mode,
19682 temp == op1 ? 0 : temp);
19684 if (op0 != op1 && GET_CODE (op0) != MEM)
19686 rtx insn = gen_rtx_SET (op0, op1);
19687 emit_insn (insn);
19688 return;
19690 if (GET_CODE (op0) == MEM)
19691 op1 = force_reg (Pmode, op1);
19692 else
19694 rtx temp = op0;
19695 if (GET_CODE (temp) != REG)
19696 temp = gen_reg_rtx (Pmode);
19697 temp = legitimize_pic_address (op1, temp);
19698 if (temp == op0)
19699 return;
19700 op1 = temp;
19702 /* dynamic-no-pic */
19703 #endif
19705 else
19707 if (MEM_P (op0))
19708 op1 = force_reg (mode, op1);
19709 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19711 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19712 op1 = legitimize_pic_address (op1, reg);
19713 if (op0 == op1)
19714 return;
19715 op1 = convert_to_mode (mode, op1, 1);
19719 else
19721 if (MEM_P (op0)
19722 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19723 || !push_operand (op0, mode))
19724 && MEM_P (op1))
19725 op1 = force_reg (mode, op1);
19727 if (push_operand (op0, mode)
19728 && ! general_no_elim_operand (op1, mode))
19729 op1 = copy_to_mode_reg (mode, op1);
19731 /* Force large constants in 64bit compilation into register
19732 to get them CSEed. */
19733 if (can_create_pseudo_p ()
19734 && (mode == DImode) && TARGET_64BIT
19735 && immediate_operand (op1, mode)
19736 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19737 && !register_operand (op0, mode)
19738 && optimize)
19739 op1 = copy_to_mode_reg (mode, op1);
19741 if (can_create_pseudo_p ()
19742 && CONST_DOUBLE_P (op1))
19744 /* If we are loading a floating point constant to a register,
19745 force the value to memory now, since we'll get better code
19746 out the back end. */
19748 op1 = validize_mem (force_const_mem (mode, op1));
19749 if (!register_operand (op0, mode))
19751 rtx temp = gen_reg_rtx (mode);
19752 emit_insn (gen_rtx_SET (temp, op1));
19753 emit_move_insn (op0, temp);
19754 return;
19759 emit_insn (gen_rtx_SET (op0, op1));
19762 void
19763 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19765 rtx op0 = operands[0], op1 = operands[1];
19766 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
19767 psABI, since the biggest alignment is 4 bytes for the IA MCU psABI. */
19768 unsigned int align = (TARGET_IAMCU
19769 ? GET_MODE_BITSIZE (mode)
19770 : GET_MODE_ALIGNMENT (mode));
19772 if (push_operand (op0, VOIDmode))
19773 op0 = emit_move_resolve_push (mode, op0);
19775 /* Force constants other than zero into memory. We do not know how
19776 the instructions used to build constants modify the upper 64 bits
19777 of the register; once we have that information we may be able
19778 to handle some of them more efficiently. */
19779 if (can_create_pseudo_p ()
19780 && (CONSTANT_P (op1)
19781 || (SUBREG_P (op1)
19782 && CONSTANT_P (SUBREG_REG (op1))))
19783 && ((register_operand (op0, mode)
19784 && !standard_sse_constant_p (op1, mode))
19785 /* ix86_expand_vector_move_misalign() does not like constants. */
19786 || (SSE_REG_MODE_P (mode)
19787 && MEM_P (op0)
19788 && MEM_ALIGN (op0) < align)))
19790 if (SUBREG_P (op1))
19792 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19793 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19794 if (r)
19795 r = validize_mem (r);
19796 else
19797 r = force_reg (imode, SUBREG_REG (op1));
19798 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19800 else
19801 op1 = validize_mem (force_const_mem (mode, op1));
19804 /* We need to check memory alignment for SSE modes since an attribute
19805 can make operands unaligned. */
19806 if (can_create_pseudo_p ()
19807 && SSE_REG_MODE_P (mode)
19808 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19809 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19811 rtx tmp[2];
19813 /* ix86_expand_vector_move_misalign() does not like both
19814 arguments in memory. */
19815 if (!register_operand (op0, mode)
19816 && !register_operand (op1, mode))
19817 op1 = force_reg (mode, op1);
19819 tmp[0] = op0; tmp[1] = op1;
19820 ix86_expand_vector_move_misalign (mode, tmp);
19821 return;
19824 /* Make operand1 a register if it isn't already. */
19825 if (can_create_pseudo_p ()
19826 && !register_operand (op0, mode)
19827 && !register_operand (op1, mode))
19829 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19830 return;
19833 emit_insn (gen_rtx_SET (op0, op1));
19836 /* Split 32-byte AVX unaligned load and store if needed. */
19838 static void
19839 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19841 rtx m;
19842 rtx (*extract) (rtx, rtx, rtx);
19843 machine_mode mode;
19845 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19846 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19848 emit_insn (gen_rtx_SET (op0, op1));
19849 return;
19852 rtx orig_op0 = NULL_RTX;
19853 mode = GET_MODE (op0);
19854 switch (GET_MODE_CLASS (mode))
19856 case MODE_VECTOR_INT:
19857 case MODE_INT:
19858 if (mode != V32QImode)
19860 if (!MEM_P (op0))
19862 orig_op0 = op0;
19863 op0 = gen_reg_rtx (V32QImode);
19865 else
19866 op0 = gen_lowpart (V32QImode, op0);
19867 op1 = gen_lowpart (V32QImode, op1);
19868 mode = V32QImode;
19870 break;
19871 case MODE_VECTOR_FLOAT:
19872 break;
19873 default:
19874 gcc_unreachable ();
19877 switch (mode)
19879 default:
19880 gcc_unreachable ();
19881 case V32QImode:
19882 extract = gen_avx_vextractf128v32qi;
19883 mode = V16QImode;
19884 break;
19885 case V8SFmode:
19886 extract = gen_avx_vextractf128v8sf;
19887 mode = V4SFmode;
19888 break;
19889 case V4DFmode:
19890 extract = gen_avx_vextractf128v4df;
19891 mode = V2DFmode;
19892 break;
19895 if (MEM_P (op1))
19897 rtx r = gen_reg_rtx (mode);
19898 m = adjust_address (op1, mode, 0);
19899 emit_move_insn (r, m);
19900 m = adjust_address (op1, mode, 16);
19901 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19902 emit_move_insn (op0, r);
19904 else if (MEM_P (op0))
19906 m = adjust_address (op0, mode, 0);
19907 emit_insn (extract (m, op1, const0_rtx));
19908 m = adjust_address (op0, mode, 16);
19909 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19911 else
19912 gcc_unreachable ();
19914 if (orig_op0)
19915 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
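/* For illustration, assuming a misaligned 32-byte load that we chose to
   split: the low 16 bytes are loaded into a 128-bit register and the high
   16 bytes are attached with a VEC_CONCAT, which typically ends up as a
   vinsertf128; a split store instead uses two vextractf128 stores of
   16 bytes each.  */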
19918 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19919 straight to ix86_expand_vector_move. */
19920 /* Code generation for scalar reg-reg moves of single and double precision data:
19921 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19922 movaps reg, reg
19923 else
19924 movss reg, reg
19925 if (x86_sse_partial_reg_dependency == true)
19926 movapd reg, reg
19927 else
19928 movsd reg, reg
19930 Code generation for scalar loads of double precision data:
19931 if (x86_sse_split_regs == true)
19932 movlpd mem, reg (gas syntax)
19933 else
19934 movsd mem, reg
19936 Code generation for unaligned packed loads of single precision data
19937 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19938 if (x86_sse_unaligned_move_optimal)
19939 movups mem, reg
19941 if (x86_sse_partial_reg_dependency == true)
19943 xorps reg, reg
19944 movlps mem, reg
19945 movhps mem+8, reg
19947 else
19949 movlps mem, reg
19950 movhps mem+8, reg
19953 Code generation for unaligned packed loads of double precision data
19954 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19955 if (x86_sse_unaligned_move_optimal)
19956 movupd mem, reg
19958 if (x86_sse_split_regs == true)
19960 movlpd mem, reg
19961 movhpd mem+8, reg
19963 else
19965 movsd mem, reg
19966 movhpd mem+8, reg
19970 void
19971 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19973 rtx op0, op1, m;
19975 op0 = operands[0];
19976 op1 = operands[1];
19978 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19979 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19981 emit_insn (gen_rtx_SET (op0, op1));
19982 return;
19985 if (TARGET_AVX)
19987 if (GET_MODE_SIZE (mode) == 32)
19988 ix86_avx256_split_vector_move_misalign (op0, op1);
19989 else
19990 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19991 emit_insn (gen_rtx_SET (op0, op1));
19992 return;
19995 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19996 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19998 emit_insn (gen_rtx_SET (op0, op1));
19999 return;
20002 /* ??? If we have typed data, then it would appear that using
20003 movdqu is the only way to get unaligned data loaded with
20004 integer type. */
20005 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20007 emit_insn (gen_rtx_SET (op0, op1));
20008 return;
20011 if (MEM_P (op1))
20013 if (TARGET_SSE2 && mode == V2DFmode)
20015 rtx zero;
20017 /* When SSE registers are split into halves, we can avoid
20018 writing to the top half twice. */
20019 if (TARGET_SSE_SPLIT_REGS)
20021 emit_clobber (op0);
20022 zero = op0;
20024 else
20026 /* ??? Not sure about the best option for the Intel chips.
20027 The following would seem to satisfy; the register is
20028 entirely cleared, breaking the dependency chain. We
20029 then store to the upper half, with a dependency depth
20030 of one. A rumor has it that Intel recommends two movsd
20031 followed by an unpacklpd, but this is unconfirmed. And
20032 given that the dependency depth of the unpacklpd would
20033 still be one, I'm not sure why this would be better. */
20034 zero = CONST0_RTX (V2DFmode);
20037 m = adjust_address (op1, DFmode, 0);
20038 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20039 m = adjust_address (op1, DFmode, 8);
20040 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20042 else
20044 rtx t;
20046 if (mode != V4SFmode)
20047 t = gen_reg_rtx (V4SFmode);
20048 else
20049 t = op0;
20051 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20052 emit_move_insn (t, CONST0_RTX (V4SFmode));
20053 else
20054 emit_clobber (t);
20056 m = adjust_address (op1, V2SFmode, 0);
20057 emit_insn (gen_sse_loadlps (t, t, m));
20058 m = adjust_address (op1, V2SFmode, 8);
20059 emit_insn (gen_sse_loadhps (t, t, m));
20060 if (mode != V4SFmode)
20061 emit_move_insn (op0, gen_lowpart (mode, t));
20064 else if (MEM_P (op0))
20066 if (TARGET_SSE2 && mode == V2DFmode)
20068 m = adjust_address (op0, DFmode, 0);
20069 emit_insn (gen_sse2_storelpd (m, op1));
20070 m = adjust_address (op0, DFmode, 8);
20071 emit_insn (gen_sse2_storehpd (m, op1));
20073 else
20075 if (mode != V4SFmode)
20076 op1 = gen_lowpart (V4SFmode, op1);
20078 m = adjust_address (op0, V2SFmode, 0);
20079 emit_insn (gen_sse_storelps (m, op1));
20080 m = adjust_address (op0, V2SFmode, 8);
20081 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20084 else
20085 gcc_unreachable ();
20088 /* Helper function of ix86_fixup_binary_operands to canonicalize
20089 operand order. Returns true if the operands should be swapped. */
20091 static bool
20092 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20093 rtx operands[])
20095 rtx dst = operands[0];
20096 rtx src1 = operands[1];
20097 rtx src2 = operands[2];
20099 /* If the operation is not commutative, we can't do anything. */
20100 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
20101 return false;
20103 /* Highest priority is that src1 should match dst. */
20104 if (rtx_equal_p (dst, src1))
20105 return false;
20106 if (rtx_equal_p (dst, src2))
20107 return true;
20109 /* Next highest priority is that immediate constants come second. */
20110 if (immediate_operand (src2, mode))
20111 return false;
20112 if (immediate_operand (src1, mode))
20113 return true;
20115 /* Lowest priority is that memory references should come second. */
20116 if (MEM_P (src2))
20117 return false;
20118 if (MEM_P (src1))
20119 return true;
20121 return false;
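/* A small example of the swapping rule, for illustration: given a
   commutative (plus:SI (mem) (reg 90)) whose destination is (reg 90),
   the operands are swapped so that src1 matches the destination and the
   memory reference ends up second, which fits the machine's two-address
   form.  */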
20125 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20126 destination to use for the operation. If different from the true
20127 destination in operands[0], a copy operation will be required. */
20129 rtx
20130 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20131 rtx operands[])
20133 rtx dst = operands[0];
20134 rtx src1 = operands[1];
20135 rtx src2 = operands[2];
20137 /* Canonicalize operand order. */
20138 if (ix86_swap_binary_operands_p (code, mode, operands))
20140 /* It is invalid to swap operands of different modes. */
20141 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20143 std::swap (src1, src2);
20146 /* Both source operands cannot be in memory. */
20147 if (MEM_P (src1) && MEM_P (src2))
20149 /* Optimization: Only read from memory once. */
20150 if (rtx_equal_p (src1, src2))
20152 src2 = force_reg (mode, src2);
20153 src1 = src2;
20155 else if (rtx_equal_p (dst, src1))
20156 src2 = force_reg (mode, src2);
20157 else
20158 src1 = force_reg (mode, src1);
20161 /* If the destination is memory, and we do not have matching source
20162 operands, do things in registers. */
20163 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20164 dst = gen_reg_rtx (mode);
20166 /* Source 1 cannot be a constant. */
20167 if (CONSTANT_P (src1))
20168 src1 = force_reg (mode, src1);
20170 /* Source 1 cannot be a non-matching memory. */
20171 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20172 src1 = force_reg (mode, src1);
20174 /* Improve address combine. */
20175 if (code == PLUS
20176 && GET_MODE_CLASS (mode) == MODE_INT
20177 && MEM_P (src2))
20178 src2 = force_reg (mode, src2);
20180 operands[1] = src1;
20181 operands[2] = src2;
20182 return dst;
20185 /* Similarly, but assume that the destination has already been
20186 set up properly. */
20188 void
20189 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20190 machine_mode mode, rtx operands[])
20192 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20193 gcc_assert (dst == operands[0]);
20196 /* Attempt to expand a binary operator. Make the expansion closer to the
20197 actual machine than just general_operand, which would allow 3 separate
20198 memory references (one output, two inputs) in a single insn. */
20200 void
20201 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20202 rtx operands[])
20204 rtx src1, src2, dst, op, clob;
20206 dst = ix86_fixup_binary_operands (code, mode, operands);
20207 src1 = operands[1];
20208 src2 = operands[2];
20210 /* Emit the instruction. */
20212 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20214 if (reload_completed
20215 && code == PLUS
20216 && !rtx_equal_p (dst, src1))
20218 /* This is going to be an LEA; avoid splitting it later. */
20219 emit_insn (op);
20221 else
20223 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20224 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20227 /* Fix up the destination if needed. */
20228 if (dst != operands[0])
20229 emit_move_insn (operands[0], dst);
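/* Illustrative shape of what gets emitted in the common case (not taken
   from a real dump): for an SImode addition the insn is
     (parallel [(set (reg:SI dst) (plus:SI (reg:SI src1) (reg:SI src2)))
                (clobber (reg:CC FLAGS_REG))])
   whereas the clobber-free SET form is kept after reload for a
   non-destructive PLUS so that it can become an lea.  */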
20232 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20233 the given OPERANDS. */
20235 void
20236 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20237 rtx operands[])
20239 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20240 if (SUBREG_P (operands[1]))
20242 op1 = operands[1];
20243 op2 = operands[2];
20245 else if (SUBREG_P (operands[2]))
20247 op1 = operands[2];
20248 op2 = operands[1];
20250 /* Optimize (__m128i) d | (__m128i) e and similar code
20251 when d and e are float vectors into float vector logical
20252 insn. In C/C++ without using intrinsics there is no other way
20253 to express vector logical operation on float vectors than
20254 to cast them temporarily to integer vectors. */
20255 if (op1
20256 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20257 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20258 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20259 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20260 && SUBREG_BYTE (op1) == 0
20261 && (GET_CODE (op2) == CONST_VECTOR
20262 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20263 && SUBREG_BYTE (op2) == 0))
20264 && can_create_pseudo_p ())
20266 rtx dst;
20267 switch (GET_MODE (SUBREG_REG (op1)))
20269 case V4SFmode:
20270 case V8SFmode:
20271 case V16SFmode:
20272 case V2DFmode:
20273 case V4DFmode:
20274 case V8DFmode:
20275 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20276 if (GET_CODE (op2) == CONST_VECTOR)
20278 op2 = gen_lowpart (GET_MODE (dst), op2);
20279 op2 = force_reg (GET_MODE (dst), op2);
20281 else
20283 op1 = operands[1];
20284 op2 = SUBREG_REG (operands[2]);
20285 if (!vector_operand (op2, GET_MODE (dst)))
20286 op2 = force_reg (GET_MODE (dst), op2);
20288 op1 = SUBREG_REG (op1);
20289 if (!vector_operand (op1, GET_MODE (dst)))
20290 op1 = force_reg (GET_MODE (dst), op1);
20291 emit_insn (gen_rtx_SET (dst,
20292 gen_rtx_fmt_ee (code, GET_MODE (dst),
20293 op1, op2)));
20294 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20295 return;
20296 default:
20297 break;
20300 if (!vector_operand (operands[1], mode))
20301 operands[1] = force_reg (mode, operands[1]);
20302 if (!vector_operand (operands[2], mode))
20303 operands[2] = force_reg (mode, operands[2]);
20304 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20305 emit_insn (gen_rtx_SET (operands[0],
20306 gen_rtx_fmt_ee (code, mode, operands[1],
20307 operands[2])));
20310 /* Return TRUE or FALSE depending on whether the binary operator meets the
20311 appropriate constraints. */
20313 bool
20314 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20315 rtx operands[3])
20317 rtx dst = operands[0];
20318 rtx src1 = operands[1];
20319 rtx src2 = operands[2];
20321 /* Both source operands cannot be in memory. */
20322 if (MEM_P (src1) && MEM_P (src2))
20323 return false;
20325 /* Canonicalize operand order for commutative operators. */
20326 if (ix86_swap_binary_operands_p (code, mode, operands))
20327 std::swap (src1, src2);
20329 /* If the destination is memory, we must have a matching source operand. */
20330 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20331 return false;
20333 /* Source 1 cannot be a constant. */
20334 if (CONSTANT_P (src1))
20335 return false;
20337 /* Source 1 cannot be a non-matching memory. */
20338 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20339 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20340 return (code == AND
20341 && (mode == HImode
20342 || mode == SImode
20343 || (TARGET_64BIT && mode == DImode))
20344 && satisfies_constraint_L (src2));
20346 return true;
20349 /* Attempt to expand a unary operator. Make the expansion closer to the
20350 actual machine than just general_operand, which would allow 2 separate
20351 memory references (one output, one input) in a single insn. */
20353 void
20354 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20355 rtx operands[])
20357 bool matching_memory = false;
20358 rtx src, dst, op, clob;
20360 dst = operands[0];
20361 src = operands[1];
20363 /* If the destination is memory, and we do not have matching source
20364 operands, do things in registers. */
20365 if (MEM_P (dst))
20367 if (rtx_equal_p (dst, src))
20368 matching_memory = true;
20369 else
20370 dst = gen_reg_rtx (mode);
20373 /* When source operand is memory, destination must match. */
20374 if (MEM_P (src) && !matching_memory)
20375 src = force_reg (mode, src);
20377 /* Emit the instruction. */
20379 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20381 if (code == NOT)
20382 emit_insn (op);
20383 else
20385 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20386 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20389 /* Fix up the destination if needed. */
20390 if (dst != operands[0])
20391 emit_move_insn (operands[0], dst);
20394 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20395 divisor are within the range [0-255]. */
20397 void
20398 ix86_split_idivmod (machine_mode mode, rtx operands[],
20399 bool signed_p)
20401 rtx_code_label *end_label, *qimode_label;
20402 rtx div, mod;
20403 rtx_insn *insn;
20404 rtx scratch, tmp0, tmp1, tmp2;
20405 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20406 rtx (*gen_zero_extend) (rtx, rtx);
20407 rtx (*gen_test_ccno_1) (rtx, rtx);
20409 switch (mode)
20411 case SImode:
20412 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20413 gen_test_ccno_1 = gen_testsi_ccno_1;
20414 gen_zero_extend = gen_zero_extendqisi2;
20415 break;
20416 case DImode:
20417 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20418 gen_test_ccno_1 = gen_testdi_ccno_1;
20419 gen_zero_extend = gen_zero_extendqidi2;
20420 break;
20421 default:
20422 gcc_unreachable ();
20425 end_label = gen_label_rtx ();
20426 qimode_label = gen_label_rtx ();
20428 scratch = gen_reg_rtx (mode);
20430 /* Use 8-bit unsigned divmod if the dividend and divisor are within
20431 the range [0-255]. */
20432 emit_move_insn (scratch, operands[2]);
20433 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20434 scratch, 1, OPTAB_DIRECT);
20435 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20436 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20437 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20438 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20439 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20440 pc_rtx);
20441 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20442 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20443 JUMP_LABEL (insn) = qimode_label;
20445 /* Generate the original signed/unsigned divmod. */
20446 div = gen_divmod4_1 (operands[0], operands[1],
20447 operands[2], operands[3]);
20448 emit_insn (div);
20450 /* Branch to the end. */
20451 emit_jump_insn (gen_jump (end_label));
20452 emit_barrier ();
20454 /* Generate 8bit unsigned divide. */
20455 emit_label (qimode_label);
20456 /* Don't use operands[0] for result of 8bit divide since not all
20457 registers support QImode ZERO_EXTRACT. */
20458 tmp0 = lowpart_subreg (HImode, scratch, mode);
20459 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20460 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20461 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20463 if (signed_p)
20465 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
20466 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
20468 else
20470 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
20471 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
20474 /* Extract remainder from AH. */
20475 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
20476 if (REG_P (operands[1]))
20477 insn = emit_move_insn (operands[1], tmp1);
20478 else
20480 /* Need a new scratch register since the old one has result
20481 of 8bit divide. */
20482 scratch = gen_reg_rtx (mode);
20483 emit_move_insn (scratch, tmp1);
20484 insn = emit_move_insn (operands[1], scratch);
20486 set_unique_reg_note (insn, REG_EQUAL, mod);
20488 /* Zero extend quotient from AL. */
20489 tmp1 = gen_lowpart (QImode, tmp0);
20490 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20491 set_unique_reg_note (insn, REG_EQUAL, div);
20493 emit_label (end_label);
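/* A rough sketch of the control flow generated above, assuming SImode and
   AT&T syntax (illustration only; register choices are up to the RA):
     orl    divisor, scratch      # scratch = dividend | divisor
     testl  $-0x100, scratch      # do both operands fit in 8 bits?
     je     .Lqimode
     idivl/divl ...               # full-width signed/unsigned divide
     jmp    .Lend
   .Lqimode:
     divb   ...                   # 8-bit divide: AL = quotient, AH = remainder
   .Lend:  */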
20496 #define LEA_MAX_STALL (3)
20497 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20499 /* Increase the given DISTANCE in half-cycles according to
20500 dependencies between PREV and NEXT instructions.
20501 Add 1 half-cycle if there is no dependency and
20502 go to the next cycle if there is some dependency. */
20504 static unsigned int
20505 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20507 df_ref def, use;
20509 if (!prev || !next)
20510 return distance + (distance & 1) + 2;
20512 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20513 return distance + 1;
20515 FOR_EACH_INSN_USE (use, next)
20516 FOR_EACH_INSN_DEF (def, prev)
20517 if (!DF_REF_IS_ARTIFICIAL (def)
20518 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20519 return distance + (distance & 1) + 2;
20521 return distance + 1;
20524 /* Function checks if instruction INSN defines register number
20525 REGNO1 or REGNO2. */
20527 static bool
20528 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20529 rtx_insn *insn)
20531 df_ref def;
20533 FOR_EACH_INSN_DEF (def, insn)
20534 if (DF_REF_REG_DEF_P (def)
20535 && !DF_REF_IS_ARTIFICIAL (def)
20536 && (regno1 == DF_REF_REGNO (def)
20537 || regno2 == DF_REF_REGNO (def)))
20538 return true;
20540 return false;
20543 /* Function checks if instruction INSN uses register number
20544 REGNO as a part of address expression. */
20546 static bool
20547 insn_uses_reg_mem (unsigned int regno, rtx insn)
20549 df_ref use;
20551 FOR_EACH_INSN_USE (use, insn)
20552 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20553 return true;
20555 return false;
20558 /* Search backward for non-agu definition of register number REGNO1
20559 or register number REGNO2 in basic block starting from instruction
20560 START up to head of basic block or instruction INSN.
20562 Set *FOUND to true if a definition was found and to false
20563 otherwise.
20565 Distance in half-cycles between START and found instruction or head
20566 of BB is added to DISTANCE and returned. */
20568 static int
20569 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20570 rtx_insn *insn, int distance,
20571 rtx_insn *start, bool *found)
20573 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20574 rtx_insn *prev = start;
20575 rtx_insn *next = NULL;
20577 *found = false;
20579 while (prev
20580 && prev != insn
20581 && distance < LEA_SEARCH_THRESHOLD)
20583 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20585 distance = increase_distance (prev, next, distance);
20586 if (insn_defines_reg (regno1, regno2, prev))
20588 if (recog_memoized (prev) < 0
20589 || get_attr_type (prev) != TYPE_LEA)
20591 *found = true;
20592 return distance;
20596 next = prev;
20598 if (prev == BB_HEAD (bb))
20599 break;
20601 prev = PREV_INSN (prev);
20604 return distance;
20607 /* Search backward for non-agu definition of register number REGNO1
20608 or register number REGNO2 in INSN's basic block until
20609 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20610 2. Reach neighbor BBs boundary, or
20611 3. Reach agu definition.
20612 Returns the distance between the non-agu definition point and INSN.
20613 If no definition point, returns -1. */
20615 static int
20616 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20617 rtx_insn *insn)
20619 basic_block bb = BLOCK_FOR_INSN (insn);
20620 int distance = 0;
20621 bool found = false;
20623 if (insn != BB_HEAD (bb))
20624 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20625 distance, PREV_INSN (insn),
20626 &found);
20628 if (!found && distance < LEA_SEARCH_THRESHOLD)
20630 edge e;
20631 edge_iterator ei;
20632 bool simple_loop = false;
20634 FOR_EACH_EDGE (e, ei, bb->preds)
20635 if (e->src == bb)
20637 simple_loop = true;
20638 break;
20641 if (simple_loop)
20642 distance = distance_non_agu_define_in_bb (regno1, regno2,
20643 insn, distance,
20644 BB_END (bb), &found);
20645 else
20647 int shortest_dist = -1;
20648 bool found_in_bb = false;
20650 FOR_EACH_EDGE (e, ei, bb->preds)
20652 int bb_dist
20653 = distance_non_agu_define_in_bb (regno1, regno2,
20654 insn, distance,
20655 BB_END (e->src),
20656 &found_in_bb);
20657 if (found_in_bb)
20659 if (shortest_dist < 0)
20660 shortest_dist = bb_dist;
20661 else if (bb_dist > 0)
20662 shortest_dist = MIN (bb_dist, shortest_dist);
20664 found = true;
20668 distance = shortest_dist;
20672 /* get_attr_type may modify recog data. We want to make sure
20673 that recog data is valid for instruction INSN, on which
20674 distance_non_agu_define is called. INSN is unchanged here. */
20675 extract_insn_cached (insn);
20677 if (!found)
20678 return -1;
20680 return distance >> 1;
20683 /* Return the distance in half-cycles between INSN and the next
20684 insn that uses register number REGNO in a memory address, added
20685 to DISTANCE. Return -1 if REGNO is set.
20687 Set *FOUND to true if a register usage was found and to false
20688 otherwise.
20689 Set *REDEFINED to true if a register redefinition was found and
20690 to false otherwise. */
20692 static int
20693 distance_agu_use_in_bb (unsigned int regno,
20694 rtx_insn *insn, int distance, rtx_insn *start,
20695 bool *found, bool *redefined)
20697 basic_block bb = NULL;
20698 rtx_insn *next = start;
20699 rtx_insn *prev = NULL;
20701 *found = false;
20702 *redefined = false;
20704 if (start != NULL_RTX)
20706 bb = BLOCK_FOR_INSN (start);
20707 if (start != BB_HEAD (bb))
20708 /* If insn and start belong to the same bb, set prev to insn,
20709 so the call to increase_distance will increase the distance
20710 between insns by 1. */
20711 prev = insn;
20714 while (next
20715 && next != insn
20716 && distance < LEA_SEARCH_THRESHOLD)
20718 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20720 distance = increase_distance (prev, next, distance);
20721 if (insn_uses_reg_mem (regno, next))
20723 /* Return DISTANCE if OP0 is used in memory
20724 address in NEXT. */
20725 *found = true;
20726 return distance;
20729 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20731 /* Return -1 if OP0 is set in NEXT. */
20732 *redefined = true;
20733 return -1;
20736 prev = next;
20739 if (next == BB_END (bb))
20740 break;
20742 next = NEXT_INSN (next);
20745 return distance;
20748 /* Return the distance between INSN and the next insn that uses
20749 register number REGNO0 in a memory address. Return -1 if no such
20750 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20752 static int
20753 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20755 basic_block bb = BLOCK_FOR_INSN (insn);
20756 int distance = 0;
20757 bool found = false;
20758 bool redefined = false;
20760 if (insn != BB_END (bb))
20761 distance = distance_agu_use_in_bb (regno0, insn, distance,
20762 NEXT_INSN (insn),
20763 &found, &redefined);
20765 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20767 edge e;
20768 edge_iterator ei;
20769 bool simple_loop = false;
20771 FOR_EACH_EDGE (e, ei, bb->succs)
20772 if (e->dest == bb)
20774 simple_loop = true;
20775 break;
20778 if (simple_loop)
20779 distance = distance_agu_use_in_bb (regno0, insn,
20780 distance, BB_HEAD (bb),
20781 &found, &redefined);
20782 else
20784 int shortest_dist = -1;
20785 bool found_in_bb = false;
20786 bool redefined_in_bb = false;
20788 FOR_EACH_EDGE (e, ei, bb->succs)
20790 int bb_dist
20791 = distance_agu_use_in_bb (regno0, insn,
20792 distance, BB_HEAD (e->dest),
20793 &found_in_bb, &redefined_in_bb);
20794 if (found_in_bb)
20796 if (shortest_dist < 0)
20797 shortest_dist = bb_dist;
20798 else if (bb_dist > 0)
20799 shortest_dist = MIN (bb_dist, shortest_dist);
20801 found = true;
20805 distance = shortest_dist;
20809 if (!found || redefined)
20810 return -1;
20812 return distance >> 1;
20815 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20816 there is a dilemma of choosing between LEA and ADD.
20817 Negative value: ADD is preferred over LEA
20818 Zero: Neutral
20819 Positive value: LEA is preferred over ADD. */
20820 #define IX86_LEA_PRIORITY 0
20822 /* Return true if use of the lea INSN has a performance advantage
20823 over a sequence of instructions. The instruction sequence has
20824 SPLIT_COST cycles higher latency than the lea latency. */
20826 static bool
20827 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20828 unsigned int regno2, int split_cost, bool has_scale)
20830 int dist_define, dist_use;
20832 /* For Silvermont, if a 2-source or 3-source LEA is used for its
20833 non-destructive destination, or because the ability to use a
20834 SCALE is wanted, the use of LEA is justified. */
20835 if (TARGET_SILVERMONT || TARGET_INTEL)
20837 if (has_scale)
20838 return true;
20839 if (split_cost < 1)
20840 return false;
20841 if (regno0 == regno1 || regno0 == regno2)
20842 return false;
20843 return true;
20846 dist_define = distance_non_agu_define (regno1, regno2, insn);
20847 dist_use = distance_agu_use (regno0, insn);
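/* dist_define is the distance in cycles back to the closest non-LEA
   definition of an input register (REGNO1 or REGNO2); dist_use is the
   distance forward to the next use of the destination REGNO0 in a
   memory address.  */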
20849 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20851 /* If there is no non-AGU operand definition, no AGU
20852 operand usage and the split cost is 0, then both the lea
20853 and non-lea variants have the same priority. Currently
20854 we prefer lea for 64-bit code and non-lea for 32-bit
20855 code. */
20856 if (dist_use < 0 && split_cost == 0)
20857 return TARGET_64BIT || IX86_LEA_PRIORITY;
20858 else
20859 return true;
20862 /* The longer the definition distance, the more preferable lea is.
20863 Here we adjust it to take into account the splitting cost and
20864 the lea priority. */
20865 dist_define += split_cost + IX86_LEA_PRIORITY;
20867 /* If there is no use in a memory address then we just check
20868 that the split cost exceeds the AGU stall. */
20869 if (dist_use < 0)
20870 return dist_define > LEA_MAX_STALL;
20872 /* If this insn has both backward non-agu dependence and forward
20873 agu dependence, the one with the shorter distance takes effect. */
20874 return dist_define >= dist_use;
20877 /* Return true if it is legal to clobber flags by INSN and
20878 false otherwise. */
20880 static bool
20881 ix86_ok_to_clobber_flags (rtx_insn *insn)
20883 basic_block bb = BLOCK_FOR_INSN (insn);
20884 df_ref use;
20885 bitmap live;
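/* Scan forward from INSN to the end of its basic block: the flags may
   be clobbered if nothing reads FLAGS_REG before the next insn that
   sets it, and FLAGS_REG is not live on exit from the block.  */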
20887 while (insn)
20889 if (NONDEBUG_INSN_P (insn))
20891 FOR_EACH_INSN_USE (use, insn)
20892 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20893 return false;
20895 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20896 return true;
20899 if (insn == BB_END (bb))
20900 break;
20902 insn = NEXT_INSN (insn);
20905 live = df_get_live_out (bb);
20906 return !REGNO_REG_SET_P (live, FLAGS_REG);
20909 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20910 move and add to avoid AGU stalls. */
20912 bool
20913 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20915 unsigned int regno0, regno1, regno2;
20917 /* Check if we need to optimize. */
20918 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20919 return false;
20921 /* Check it is correct to split here. */
20922 if (!ix86_ok_to_clobber_flags (insn))
20923 return false;
20925 regno0 = true_regnum (operands[0]);
20926 regno1 = true_regnum (operands[1]);
20927 regno2 = true_regnum (operands[2]);
20929 /* We need to split only adds with a non-destructive
20930 destination operand. */
20931 if (regno0 == regno1 || regno0 == regno2)
20932 return false;
20933 else
20934 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20937 /* Return true if we should emit lea instruction instead of mov
20938 instruction. */
20940 bool
20941 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20943 unsigned int regno0, regno1;
20945 /* Check if we need to optimize. */
20946 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20947 return false;
20949 /* Use lea for reg to reg moves only. */
20950 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20951 return false;
20953 regno0 = true_regnum (operands[0]);
20954 regno1 = true_regnum (operands[1]);
20956 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20959 /* Return true if we need to split lea into a sequence of
20960 instructions to avoid AGU stalls. */
20962 bool
20963 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20965 unsigned int regno0, regno1, regno2;
20966 int split_cost;
20967 struct ix86_address parts;
20968 int ok;
20970 /* Check if we need to optimize. */
20971 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20972 return false;
20974 /* The "at least two components" test below might not catch simple
20975 move or zero extension insns if parts.base is non-NULL and parts.disp
20976 is const0_rtx as the only components in the address, e.g. if the
20977 register is %rbp or %r13. As this test is much cheaper and moves or
20978 zero extensions are the common case, do this check first. */
20979 if (REG_P (operands[1])
20980 || (SImode_address_operand (operands[1], VOIDmode)
20981 && REG_P (XEXP (operands[1], 0))))
20982 return false;
20984 /* Check if it is OK to split here. */
20985 if (!ix86_ok_to_clobber_flags (insn))
20986 return false;
20988 ok = ix86_decompose_address (operands[1], &parts);
20989 gcc_assert (ok);
20991 /* There should be at least two components in the address. */
20992 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20993 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20994 return false;
20996 /* We should not split into add if a non-legitimate PIC
20997 operand is used as the displacement. */
20998 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20999 return false;
21001 regno0 = true_regnum (operands[0]);
21002 regno1 = INVALID_REGNUM;
21003 regno2 = INVALID_REGNUM;
21005 if (parts.base)
21006 regno1 = true_regnum (parts.base);
21007 if (parts.index)
21008 regno2 = true_regnum (parts.index);
21010 split_cost = 0;
21012 /* Compute how many cycles we will add to the execution time
21013 if we split the lea into a sequence of instructions. */
21014 if (parts.base || parts.index)
21016 /* Have to use a mov instruction if the non-destructive
21017 destination form is used. */
21018 if (regno1 != regno0 && regno2 != regno0)
21019 split_cost += 1;
21021 /* Have to add index to base if both exist. */
21022 if (parts.base && parts.index)
21023 split_cost += 1;
21025 /* Have to use shift and adds if scale is 2 or greater. */
21026 if (parts.scale > 1)
21028 if (regno0 != regno1)
21029 split_cost += 1;
21030 else if (regno2 == regno0)
21031 split_cost += 4;
21032 else
21033 split_cost += parts.scale;
21036 /* Have to use an add instruction with an immediate if
21037 disp is non-zero. */
21038 if (parts.disp && parts.disp != const0_rtx)
21039 split_cost += 1;
21041 /* Subtract the price of lea. */
21042 split_cost -= 1;
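/* For example, splitting the equivalent of lea 0x4(%ebx,%ecx,2), %eax
   into roughly mov %ecx, %eax; shl $1, %eax; add %ebx, %eax;
   add $4, %eax accumulates a split_cost of 3: four replacement
   instructions minus the one lea they replace.  */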
21045 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21046 parts.scale > 1);
21049 /* Emit x86 binary operand CODE in mode MODE, where the first operand
21050 matches destination. RTX includes clobber of FLAGS_REG. */
21052 static void
21053 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21054 rtx dst, rtx src)
21056 rtx op, clob;
21058 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21059 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21061 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21064 /* Return true if, scanning backward from INSN, a definition of REGNO1 is found before one of REGNO2. */
21066 static bool
21067 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21069 rtx_insn *prev = insn;
21070 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21072 if (insn == start)
21073 return false;
21074 while (prev && prev != start)
21076 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21078 prev = PREV_INSN (prev);
21079 continue;
21081 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21082 return true;
21083 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21084 return false;
21085 prev = PREV_INSN (prev);
21088 /* None of the regs is defined in the bb. */
21089 return false;
21092 /* Split lea instructions into a sequence of instructions
21093 which are executed on the ALU to avoid AGU stalls.
21094 It is assumed that it is allowed to clobber the flags register
21095 at the lea position. */
21097 void
21098 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21100 unsigned int regno0, regno1, regno2;
21101 struct ix86_address parts;
21102 rtx target, tmp;
21103 int ok, adds;
21105 ok = ix86_decompose_address (operands[1], &parts);
21106 gcc_assert (ok);
21108 target = gen_lowpart (mode, operands[0]);
21110 regno0 = true_regnum (target);
21111 regno1 = INVALID_REGNUM;
21112 regno2 = INVALID_REGNUM;
21114 if (parts.base)
21116 parts.base = gen_lowpart (mode, parts.base);
21117 regno1 = true_regnum (parts.base);
21120 if (parts.index)
21122 parts.index = gen_lowpart (mode, parts.index);
21123 regno2 = true_regnum (parts.index);
21126 if (parts.disp)
21127 parts.disp = gen_lowpart (mode, parts.disp);
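/* A scaled lea is decomposed as follows: if the destination already
   holds the base (regno1 == regno0), the index is simply added
   parts.scale times; otherwise the index is moved into the destination
   and shifted left by log2 (scale) before the base and displacement
   are added.  */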
21129 if (parts.scale > 1)
21131 /* Case r1 = r1 + ... */
21132 if (regno1 == regno0)
21134 /* If we have a case r1 = r1 + C * r2 then we
21135 would have to use multiplication, which is very
21136 expensive. Assume the cost model is wrong if we
21137 get such a case here. */
21138 gcc_assert (regno2 != regno0);
21140 for (adds = parts.scale; adds > 0; adds--)
21141 ix86_emit_binop (PLUS, mode, target, parts.index);
21143 else
21145 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21146 if (regno0 != regno2)
21147 emit_insn (gen_rtx_SET (target, parts.index));
21149 /* Use shift for scaling. */
21150 ix86_emit_binop (ASHIFT, mode, target,
21151 GEN_INT (exact_log2 (parts.scale)));
21153 if (parts.base)
21154 ix86_emit_binop (PLUS, mode, target, parts.base);
21156 if (parts.disp && parts.disp != const0_rtx)
21157 ix86_emit_binop (PLUS, mode, target, parts.disp);
21160 else if (!parts.base && !parts.index)
21162 gcc_assert (parts.disp);
21163 emit_insn (gen_rtx_SET (target, parts.disp));
21165 else
21167 if (!parts.base)
21169 if (regno0 != regno2)
21170 emit_insn (gen_rtx_SET (target, parts.index));
21172 else if (!parts.index)
21174 if (regno0 != regno1)
21175 emit_insn (gen_rtx_SET (target, parts.base));
21177 else
21179 if (regno0 == regno1)
21180 tmp = parts.index;
21181 else if (regno0 == regno2)
21182 tmp = parts.base;
21183 else
21185 rtx tmp1;
21187 /* Find better operand for SET instruction, depending
21188 on which definition is farther from the insn. */
21189 if (find_nearest_reg_def (insn, regno1, regno2))
21190 tmp = parts.index, tmp1 = parts.base;
21191 else
21192 tmp = parts.base, tmp1 = parts.index;
21194 emit_insn (gen_rtx_SET (target, tmp));
21196 if (parts.disp && parts.disp != const0_rtx)
21197 ix86_emit_binop (PLUS, mode, target, parts.disp);
21199 ix86_emit_binop (PLUS, mode, target, tmp1);
21200 return;
21203 ix86_emit_binop (PLUS, mode, target, tmp);
21206 if (parts.disp && parts.disp != const0_rtx)
21207 ix86_emit_binop (PLUS, mode, target, parts.disp);
21211 /* Return true if it is ok to optimize an ADD operation to an LEA
21212 operation to avoid flag register consumption. For most processors,
21213 ADD is faster than LEA. For processors like BONNELL, if the
21214 destination register of the LEA holds an actual address which will be
21215 used soon, LEA is better; otherwise ADD is better. */
21217 bool
21218 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21220 unsigned int regno0 = true_regnum (operands[0]);
21221 unsigned int regno1 = true_regnum (operands[1]);
21222 unsigned int regno2 = true_regnum (operands[2]);
21224 /* If a = b + c with a != b and a != c, we must use the lea form. */
21225 if (regno0 != regno1 && regno0 != regno2)
21226 return true;
21228 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21229 return false;
21231 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21234 /* Return true if destination reg of SET_BODY is shift count of
21235 USE_BODY. */
21237 static bool
21238 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21240 rtx set_dest;
21241 rtx shift_rtx;
21242 int i;
21244 /* Retrieve destination of SET_BODY. */
21245 switch (GET_CODE (set_body))
21247 case SET:
21248 set_dest = SET_DEST (set_body);
21249 if (!set_dest || !REG_P (set_dest))
21250 return false;
21251 break;
21252 case PARALLEL:
21253 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21254 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21255 use_body))
21256 return true;
21257 /* FALLTHROUGH */
21258 default:
21259 return false;
21262 /* Retrieve shift count of USE_BODY. */
21263 switch (GET_CODE (use_body))
21265 case SET:
21266 shift_rtx = XEXP (use_body, 1);
21267 break;
21268 case PARALLEL:
21269 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21270 if (ix86_dep_by_shift_count_body (set_body,
21271 XVECEXP (use_body, 0, i)))
21272 return true;
21273 /* FALLTHROUGH */
21274 default:
21275 return false;
21278 if (shift_rtx
21279 && (GET_CODE (shift_rtx) == ASHIFT
21280 || GET_CODE (shift_rtx) == LSHIFTRT
21281 || GET_CODE (shift_rtx) == ASHIFTRT
21282 || GET_CODE (shift_rtx) == ROTATE
21283 || GET_CODE (shift_rtx) == ROTATERT))
21285 rtx shift_count = XEXP (shift_rtx, 1);
21287 /* Return true if shift count is dest of SET_BODY. */
21288 if (REG_P (shift_count))
21290 /* Add this check since this code can be invoked before register
21291 allocation by the pre-reload scheduler. */
21292 if (reload_completed
21293 && true_regnum (set_dest) == true_regnum (shift_count))
21294 return true;
21295 else if (REGNO (set_dest) == REGNO (shift_count))
21296 return true;
21300 return false;
21303 /* Return true if destination reg of SET_INSN is shift count of
21304 USE_INSN. */
21306 bool
21307 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21309 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21310 PATTERN (use_insn));
21313 /* Return TRUE or FALSE depending on whether the unary operator meets the
21314 appropriate constraints. */
21316 bool
21317 ix86_unary_operator_ok (enum rtx_code,
21318 machine_mode,
21319 rtx operands[2])
21321 /* If one of operands is memory, source and destination must match. */
21322 if ((MEM_P (operands[0])
21323 || MEM_P (operands[1]))
21324 && ! rtx_equal_p (operands[0], operands[1]))
21325 return false;
21326 return true;
21329 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21330 are ok, keeping in mind the possible movddup alternative. */
21332 bool
21333 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21335 if (MEM_P (operands[0]))
21336 return rtx_equal_p (operands[0], operands[1 + high]);
21337 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21338 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21339 return true;
21342 /* Post-reload splitter for converting an SF or DFmode value in an
21343 SSE register into an unsigned SImode. */
21345 void
21346 ix86_split_convert_uns_si_sse (rtx operands[])
21348 machine_mode vecmode;
21349 rtx value, large, zero_or_two31, input, two31, x;
21351 large = operands[1];
21352 zero_or_two31 = operands[2];
21353 input = operands[3];
21354 two31 = operands[4];
21355 vecmode = GET_MODE (large);
21356 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21358 /* Load up the value into the low element. We must ensure that the other
21359 elements are valid floats -- zero is the easiest such value. */
21360 if (MEM_P (input))
21362 if (vecmode == V4SFmode)
21363 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21364 else
21365 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21367 else
21369 input = gen_rtx_REG (vecmode, REGNO (input));
21370 emit_move_insn (value, CONST0_RTX (vecmode));
21371 if (vecmode == V4SFmode)
21372 emit_insn (gen_sse_movss (value, value, input));
21373 else
21374 emit_insn (gen_sse2_movsd (value, value, input));
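/* The conversion itself: compare VALUE against 0x1p31 to build a mask,
   subtract 0x1p31 where the mask is set, do the signed truncating
   conversion, then xor 0x80000000 (the mask shifted into the sign bit)
   back into the integer result.  */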
21377 emit_move_insn (large, two31);
21378 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21380 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21381 emit_insn (gen_rtx_SET (large, x));
21383 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21384 emit_insn (gen_rtx_SET (zero_or_two31, x));
21386 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21387 emit_insn (gen_rtx_SET (value, x));
21389 large = gen_rtx_REG (V4SImode, REGNO (large));
21390 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21392 x = gen_rtx_REG (V4SImode, REGNO (value));
21393 if (vecmode == V4SFmode)
21394 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21395 else
21396 emit_insn (gen_sse2_cvttpd2dq (x, value));
21397 value = x;
21399 emit_insn (gen_xorv4si3 (value, value, large));
21402 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21403 Expects the 64-bit DImode to be supplied in a pair of integral
21404 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21405 -mfpmath=sse, !optimize_size only. */
21407 void
21408 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21410 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21411 rtx int_xmm, fp_xmm;
21412 rtx biases, exponents;
21413 rtx x;
21415 int_xmm = gen_reg_rtx (V4SImode);
21416 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21417 emit_insn (gen_movdi_to_sse (int_xmm, input));
21418 else if (TARGET_SSE_SPLIT_REGS)
21420 emit_clobber (int_xmm);
21421 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21423 else
21425 x = gen_reg_rtx (V2DImode);
21426 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21427 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21430 x = gen_rtx_CONST_VECTOR (V4SImode,
21431 gen_rtvec (4, GEN_INT (0x43300000UL),
21432 GEN_INT (0x45300000UL),
21433 const0_rtx, const0_rtx));
21434 exponents = validize_mem (force_const_mem (V4SImode, x));
21436 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21437 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21439 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21440 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21441 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21442 (0x1.0p84 + double(fp_value_hi_xmm)).
21443 Note these exponents differ by 32. */
21445 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21447 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21448 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21449 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21450 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21451 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21452 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21453 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21454 biases = validize_mem (force_const_mem (V2DFmode, biases));
21455 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21457 /* Add the upper and lower DFmode values together. */
21458 if (TARGET_SSE3)
21459 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21460 else
21462 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21463 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21464 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21467 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21470 /* Not used, but eases macroization of patterns. */
21471 void
21472 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21474 gcc_unreachable ();
21477 /* Convert an unsigned SImode value into a DFmode. Only currently used
21478 for SSE, but applicable anywhere. */
21480 void
21481 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21483 REAL_VALUE_TYPE TWO31r;
21484 rtx x, fp;
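/* Compute (double) (input - 0x80000000) + 0x1p31: biasing the unsigned
   input by -2^31 makes it fit the signed SImode -> DFmode conversion,
   and adding 2^31 back afterwards is exact in DFmode.  */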
21486 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21487 NULL, 1, OPTAB_DIRECT);
21489 fp = gen_reg_rtx (DFmode);
21490 emit_insn (gen_floatsidf2 (fp, x));
21492 real_ldexp (&TWO31r, &dconst1, 31);
21493 x = const_double_from_real_value (TWO31r, DFmode);
21495 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21496 if (x != target)
21497 emit_move_insn (target, x);
21500 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21501 32-bit mode; otherwise we have a direct convert instruction. */
21503 void
21504 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21506 REAL_VALUE_TYPE TWO32r;
21507 rtx fp_lo, fp_hi, x;
21509 fp_lo = gen_reg_rtx (DFmode);
21510 fp_hi = gen_reg_rtx (DFmode);
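/* result = (double) (signed) high_word * 0x1p32
            + (double) (unsigned) low_word;
   the high word carries the sign, while the low word is converted
   as an unsigned value.  */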
21512 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21514 real_ldexp (&TWO32r, &dconst1, 32);
21515 x = const_double_from_real_value (TWO32r, DFmode);
21516 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21518 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21520 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21521 0, OPTAB_DIRECT);
21522 if (x != target)
21523 emit_move_insn (target, x);
21526 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21527 For x86_32, -mfpmath=sse, !optimize_size only. */
21528 void
21529 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21531 REAL_VALUE_TYPE ONE16r;
21532 rtx fp_hi, fp_lo, int_hi, int_lo, x;
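/* Split the input into 16-bit halves; each half converts exactly to
   SFmode (it fits in the 24-bit mantissa), so the only rounding happens
   in the final hi * 0x1p16 + lo combination.  */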
21534 real_ldexp (&ONE16r, &dconst1, 16);
21535 x = const_double_from_real_value (ONE16r, SFmode);
21536 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21537 NULL, 0, OPTAB_DIRECT);
21538 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21539 NULL, 0, OPTAB_DIRECT);
21540 fp_hi = gen_reg_rtx (SFmode);
21541 fp_lo = gen_reg_rtx (SFmode);
21542 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21543 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21544 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21545 0, OPTAB_DIRECT);
21546 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21547 0, OPTAB_DIRECT);
21548 if (!rtx_equal_p (target, fp_hi))
21549 emit_move_insn (target, fp_hi);
21552 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21553 a vector of unsigned ints VAL to a vector of floats TARGET. */
21555 void
21556 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21558 rtx tmp[8];
21559 REAL_VALUE_TYPE TWO16r;
21560 machine_mode intmode = GET_MODE (val);
21561 machine_mode fltmode = GET_MODE (target);
21562 rtx (*cvt) (rtx, rtx);
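/* Elementwise version of the scalar trick above: VAL is split into
   16-bit halves, each half is converted with the signed vector
   conversion, and the result is recombined as hi * 0x1p16 + lo.  */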
21564 if (intmode == V4SImode)
21565 cvt = gen_floatv4siv4sf2;
21566 else
21567 cvt = gen_floatv8siv8sf2;
21568 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21569 tmp[0] = force_reg (intmode, tmp[0]);
21570 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21571 OPTAB_DIRECT);
21572 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21573 NULL_RTX, 1, OPTAB_DIRECT);
21574 tmp[3] = gen_reg_rtx (fltmode);
21575 emit_insn (cvt (tmp[3], tmp[1]));
21576 tmp[4] = gen_reg_rtx (fltmode);
21577 emit_insn (cvt (tmp[4], tmp[2]));
21578 real_ldexp (&TWO16r, &dconst1, 16);
21579 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21580 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21581 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21582 OPTAB_DIRECT);
21583 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21584 OPTAB_DIRECT);
21585 if (tmp[7] != target)
21586 emit_move_insn (target, tmp[7]);
21589 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21590 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21591 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21592 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21595 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21597 REAL_VALUE_TYPE TWO31r;
21598 rtx two31r, tmp[4];
21599 machine_mode mode = GET_MODE (val);
21600 machine_mode scalarmode = GET_MODE_INNER (mode);
21601 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21602 rtx (*cmp) (rtx, rtx, rtx, rtx);
21603 int i;
21605 for (i = 0; i < 3; i++)
21606 tmp[i] = gen_reg_rtx (mode);
21607 real_ldexp (&TWO31r, &dconst1, 31);
21608 two31r = const_double_from_real_value (TWO31r, scalarmode);
21609 two31r = ix86_build_const_vector (mode, 1, two31r);
21610 two31r = force_reg (mode, two31r);
21611 switch (mode)
21613 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21614 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21615 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21616 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21617 default: gcc_unreachable ();
21619 tmp[3] = gen_rtx_LE (mode, two31r, val);
21620 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21621 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21622 0, OPTAB_DIRECT);
21623 if (intmode == V4SImode || TARGET_AVX2)
21624 *xorp = expand_simple_binop (intmode, ASHIFT,
21625 gen_lowpart (intmode, tmp[0]),
21626 GEN_INT (31), NULL_RTX, 0,
21627 OPTAB_DIRECT);
21628 else
21630 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21631 two31 = ix86_build_const_vector (intmode, 1, two31);
21632 *xorp = expand_simple_binop (intmode, AND,
21633 gen_lowpart (intmode, tmp[0]),
21634 two31, NULL_RTX, 0,
21635 OPTAB_DIRECT);
21637 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21638 0, OPTAB_DIRECT);
21641 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21642 then replicate the value for all elements of the vector
21643 register. */
21646 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21648 int i, n_elt;
21649 rtvec v;
21650 machine_mode scalar_mode;
21652 switch (mode)
21654 case V64QImode:
21655 case V32QImode:
21656 case V16QImode:
21657 case V32HImode:
21658 case V16HImode:
21659 case V8HImode:
21660 case V16SImode:
21661 case V8SImode:
21662 case V4SImode:
21663 case V8DImode:
21664 case V4DImode:
21665 case V2DImode:
21666 gcc_assert (vect);
21667 /* FALLTHRU */
21668 case V16SFmode:
21669 case V8SFmode:
21670 case V4SFmode:
21671 case V8DFmode:
21672 case V4DFmode:
21673 case V2DFmode:
21674 n_elt = GET_MODE_NUNITS (mode);
21675 v = rtvec_alloc (n_elt);
21676 scalar_mode = GET_MODE_INNER (mode);
21678 RTVEC_ELT (v, 0) = value;
21680 for (i = 1; i < n_elt; ++i)
21681 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21683 return gen_rtx_CONST_VECTOR (mode, v);
21685 default:
21686 gcc_unreachable ();
21690 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21691 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21692 for an SSE register. If VECT is true, then replicate the mask for
21693 all elements of the vector register. If INVERT is true, then create
21694 a mask excluding the sign bit. */
21697 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21699 machine_mode vec_mode, imode;
21700 wide_int w;
21701 rtx mask, v;
21703 switch (mode)
21705 case V16SImode:
21706 case V16SFmode:
21707 case V8SImode:
21708 case V4SImode:
21709 case V8SFmode:
21710 case V4SFmode:
21711 vec_mode = mode;
21712 imode = SImode;
21713 break;
21715 case V8DImode:
21716 case V4DImode:
21717 case V2DImode:
21718 case V8DFmode:
21719 case V4DFmode:
21720 case V2DFmode:
21721 vec_mode = mode;
21722 imode = DImode;
21723 break;
21725 case TImode:
21726 case TFmode:
21727 vec_mode = VOIDmode;
21728 imode = TImode;
21729 break;
21731 default:
21732 gcc_unreachable ();
21735 machine_mode inner_mode = GET_MODE_INNER (mode);
21736 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21737 GET_MODE_BITSIZE (inner_mode));
21738 if (invert)
21739 w = wi::bit_not (w);
21741 /* Force this value into the low part of a fp vector constant. */
21742 mask = immed_wide_int_const (w, imode);
21743 mask = gen_lowpart (inner_mode, mask);
21745 if (vec_mode == VOIDmode)
21746 return force_reg (inner_mode, mask);
21748 v = ix86_build_const_vector (vec_mode, vect, mask);
21749 return force_reg (vec_mode, v);
21752 /* Generate code for floating point ABS or NEG. */
21754 void
21755 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21756 rtx operands[])
21758 rtx mask, set, dst, src;
21759 bool use_sse = false;
21760 bool vector_mode = VECTOR_MODE_P (mode);
21761 machine_mode vmode = mode;
21763 if (vector_mode)
21764 use_sse = true;
21765 else if (mode == TFmode)
21766 use_sse = true;
21767 else if (TARGET_SSE_MATH)
21769 use_sse = SSE_FLOAT_MODE_P (mode);
21770 if (mode == SFmode)
21771 vmode = V4SFmode;
21772 else if (mode == DFmode)
21773 vmode = V2DFmode;
21776 /* NEG and ABS performed with SSE use bitwise mask operations.
21777 Create the appropriate mask now. */
21778 if (use_sse)
21779 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21780 else
21781 mask = NULL_RTX;
21783 dst = operands[0];
21784 src = operands[1];
21786 set = gen_rtx_fmt_e (code, mode, src);
21787 set = gen_rtx_SET (dst, set);
21789 if (mask)
21791 rtx use, clob;
21792 rtvec par;
21794 use = gen_rtx_USE (VOIDmode, mask);
21795 if (vector_mode)
21796 par = gen_rtvec (2, set, use);
21797 else
21799 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21800 par = gen_rtvec (3, set, use, clob);
21802 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21804 else
21805 emit_insn (set);
21808 /* Expand a copysign operation. Special case operand 0 being a constant. */
21810 void
21811 ix86_expand_copysign (rtx operands[])
21813 machine_mode mode, vmode;
21814 rtx dest, op0, op1, mask, nmask;
21816 dest = operands[0];
21817 op0 = operands[1];
21818 op1 = operands[2];
21820 mode = GET_MODE (dest);
21822 if (mode == SFmode)
21823 vmode = V4SFmode;
21824 else if (mode == DFmode)
21825 vmode = V2DFmode;
21826 else
21827 vmode = mode;
21829 if (CONST_DOUBLE_P (op0))
21831 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21833 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21834 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21836 if (mode == SFmode || mode == DFmode)
21838 if (op0 == CONST0_RTX (mode))
21839 op0 = CONST0_RTX (vmode);
21840 else
21842 rtx v = ix86_build_const_vector (vmode, false, op0);
21844 op0 = force_reg (vmode, v);
21847 else if (op0 != CONST0_RTX (mode))
21848 op0 = force_reg (mode, op0);
21850 mask = ix86_build_signbit_mask (vmode, 0, 0);
21852 if (mode == SFmode)
21853 copysign_insn = gen_copysignsf3_const;
21854 else if (mode == DFmode)
21855 copysign_insn = gen_copysigndf3_const;
21856 else
21857 copysign_insn = gen_copysigntf3_const;
21859 emit_insn (copysign_insn (dest, op0, op1, mask));
21861 else
21863 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21865 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21866 mask = ix86_build_signbit_mask (vmode, 0, 0);
21868 if (mode == SFmode)
21869 copysign_insn = gen_copysignsf3_var;
21870 else if (mode == DFmode)
21871 copysign_insn = gen_copysigndf3_var;
21872 else
21873 copysign_insn = gen_copysigntf3_var;
21875 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21879 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21880 be a constant, and so has already been expanded into a vector constant. */
21882 void
21883 ix86_split_copysign_const (rtx operands[])
21885 machine_mode mode, vmode;
21886 rtx dest, op0, mask, x;
21888 dest = operands[0];
21889 op0 = operands[1];
21890 mask = operands[3];
21892 mode = GET_MODE (dest);
21893 vmode = GET_MODE (mask);
21895 dest = lowpart_subreg (vmode, dest, mode);
21896 x = gen_rtx_AND (vmode, dest, mask);
21897 emit_insn (gen_rtx_SET (dest, x));
21899 if (op0 != CONST0_RTX (vmode))
21901 x = gen_rtx_IOR (vmode, dest, op0);
21902 emit_insn (gen_rtx_SET (dest, x));
21906 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21907 so we have to do two masks. */
21909 void
21910 ix86_split_copysign_var (rtx operands[])
21912 machine_mode mode, vmode;
21913 rtx dest, scratch, op0, op1, mask, nmask, x;
21915 dest = operands[0];
21916 scratch = operands[1];
21917 op0 = operands[2];
21918 op1 = operands[3];
21919 nmask = operands[4];
21920 mask = operands[5];
21922 mode = GET_MODE (dest);
21923 vmode = GET_MODE (mask);
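/* The net effect of all the alternatives below is
   dest = (op0 & nmask) | (op1 & mask), i.e. the magnitude of op0
   combined with the sign of op1; MASK selects the sign bit and NMASK
   is its complement.  */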
21925 if (rtx_equal_p (op0, op1))
21927 /* Shouldn't happen often (it's useless, obviously), but when it does
21928 we'd generate incorrect code if we continue below. */
21929 emit_move_insn (dest, op0);
21930 return;
21933 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21935 gcc_assert (REGNO (op1) == REGNO (scratch));
21937 x = gen_rtx_AND (vmode, scratch, mask);
21938 emit_insn (gen_rtx_SET (scratch, x));
21940 dest = mask;
21941 op0 = lowpart_subreg (vmode, op0, mode);
21942 x = gen_rtx_NOT (vmode, dest);
21943 x = gen_rtx_AND (vmode, x, op0);
21944 emit_insn (gen_rtx_SET (dest, x));
21946 else
21948 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21950 x = gen_rtx_AND (vmode, scratch, mask);
21952 else /* alternative 2,4 */
21954 gcc_assert (REGNO (mask) == REGNO (scratch));
21955 op1 = lowpart_subreg (vmode, op1, mode);
21956 x = gen_rtx_AND (vmode, scratch, op1);
21958 emit_insn (gen_rtx_SET (scratch, x));
21960 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21962 dest = lowpart_subreg (vmode, op0, mode);
21963 x = gen_rtx_AND (vmode, dest, nmask);
21965 else /* alternative 3,4 */
21967 gcc_assert (REGNO (nmask) == REGNO (dest));
21968 dest = nmask;
21969 op0 = lowpart_subreg (vmode, op0, mode);
21970 x = gen_rtx_AND (vmode, dest, op0);
21972 emit_insn (gen_rtx_SET (dest, x));
21975 x = gen_rtx_IOR (vmode, dest, scratch);
21976 emit_insn (gen_rtx_SET (dest, x));
21979 /* Return TRUE or FALSE depending on whether the first SET in INSN
21980 has source and destination with matching CC modes and whether the
21981 CC mode is at least as constrained as REQ_MODE. */
21983 bool
21984 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21986 rtx set;
21987 machine_mode set_mode;
21989 set = PATTERN (insn);
21990 if (GET_CODE (set) == PARALLEL)
21991 set = XVECEXP (set, 0, 0);
21992 gcc_assert (GET_CODE (set) == SET);
21993 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21995 set_mode = GET_MODE (SET_DEST (set));
21996 switch (set_mode)
21998 case CCNOmode:
21999 if (req_mode != CCNOmode
22000 && (req_mode != CCmode
22001 || XEXP (SET_SRC (set), 1) != const0_rtx))
22002 return false;
22003 break;
22004 case CCmode:
22005 if (req_mode == CCGCmode)
22006 return false;
22007 /* FALLTHRU */
22008 case CCGCmode:
22009 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22010 return false;
22011 /* FALLTHRU */
22012 case CCGOCmode:
22013 if (req_mode == CCZmode)
22014 return false;
22015 /* FALLTHRU */
22016 case CCZmode:
22017 break;
22019 case CCAmode:
22020 case CCCmode:
22021 case CCOmode:
22022 case CCPmode:
22023 case CCSmode:
22024 if (set_mode != req_mode)
22025 return false;
22026 break;
22028 default:
22029 gcc_unreachable ();
22032 return GET_MODE (SET_SRC (set)) == set_mode;
22035 /* Generate insn patterns to do an integer compare of OPERANDS. */
22037 static rtx
22038 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22040 machine_mode cmpmode;
22041 rtx tmp, flags;
22043 cmpmode = SELECT_CC_MODE (code, op0, op1);
22044 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22046 /* This is very simple, but making the interface the same as in the
22047 FP case makes the rest of the code easier. */
22048 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22049 emit_insn (gen_rtx_SET (flags, tmp));
22051 /* Return the test that should be put into the flags user, i.e.
22052 the bcc, scc, or cmov instruction. */
22053 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22056 /* Figure out whether to use ordered or unordered fp comparisons.
22057 Return the appropriate mode to use. */
22059 machine_mode
22060 ix86_fp_compare_mode (enum rtx_code)
22062 /* ??? In order to make all comparisons reversible, we do all comparisons
22063 non-trapping when compiling for IEEE. Once gcc is able to distinguish
22064 all forms of trapping and nontrapping comparisons, we can make inequality
22065 comparisons trapping again, since it results in better code when using
22066 FCOM based compares. */
22067 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
22070 machine_mode
22071 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22073 machine_mode mode = GET_MODE (op0);
22075 if (SCALAR_FLOAT_MODE_P (mode))
22077 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22078 return ix86_fp_compare_mode (code);
22081 switch (code)
22083 /* Only zero flag is needed. */
22084 case EQ: /* ZF=0 */
22085 case NE: /* ZF!=0 */
22086 return CCZmode;
22087 /* Codes needing carry flag. */
22088 case GEU: /* CF=0 */
22089 case LTU: /* CF=1 */
22090 /* Detect overflow checks. They need just the carry flag. */
22091 if (GET_CODE (op0) == PLUS
22092 && (rtx_equal_p (op1, XEXP (op0, 0))
22093 || rtx_equal_p (op1, XEXP (op0, 1))))
22094 return CCCmode;
22095 else
22096 return CCmode;
22097 case GTU: /* CF=0 & ZF=0 */
22098 case LEU: /* CF=1 | ZF=1 */
22099 return CCmode;
22100 /* Codes possibly doable only with sign flag when
22101 comparing against zero. */
22102 case GE: /* SF=OF or SF=0 */
22103 case LT: /* SF<>OF or SF=1 */
22104 if (op1 == const0_rtx)
22105 return CCGOCmode;
22106 else
22107 /* For other cases Carry flag is not required. */
22108 return CCGCmode;
22109 /* Codes doable only with the sign flag when comparing
22110 against zero, but we miss the jump instruction for it
22111 so we need to use relational tests against the overflow
22112 flag, which thus needs to be zero. */
22113 case GT: /* ZF=0 & SF=OF */
22114 case LE: /* ZF=1 | SF<>OF */
22115 if (op1 == const0_rtx)
22116 return CCNOmode;
22117 else
22118 return CCGCmode;
22119 /* The strcmp pattern does (use flags) and combine may ask us for the
22120 proper mode. */
22121 case USE:
22122 return CCmode;
22123 default:
22124 gcc_unreachable ();
22128 /* Return the fixed registers used for condition codes. */
22130 static bool
22131 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22133 *p1 = FLAGS_REG;
22134 *p2 = FPSR_REG;
22135 return true;
22138 /* If two condition code modes are compatible, return a condition code
22139 mode which is compatible with both. Otherwise, return
22140 VOIDmode. */
22142 static machine_mode
22143 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22145 if (m1 == m2)
22146 return m1;
22148 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22149 return VOIDmode;
22151 if ((m1 == CCGCmode && m2 == CCGOCmode)
22152 || (m1 == CCGOCmode && m2 == CCGCmode))
22153 return CCGCmode;
22155 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
22156 return m2;
22157 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
22158 return m1;
22160 switch (m1)
22162 default:
22163 gcc_unreachable ();
22165 case CCmode:
22166 case CCGCmode:
22167 case CCGOCmode:
22168 case CCNOmode:
22169 case CCAmode:
22170 case CCCmode:
22171 case CCOmode:
22172 case CCPmode:
22173 case CCSmode:
22174 case CCZmode:
22175 switch (m2)
22177 default:
22178 return VOIDmode;
22180 case CCmode:
22181 case CCGCmode:
22182 case CCGOCmode:
22183 case CCNOmode:
22184 case CCAmode:
22185 case CCCmode:
22186 case CCOmode:
22187 case CCPmode:
22188 case CCSmode:
22189 case CCZmode:
22190 return CCmode;
22193 case CCFPmode:
22194 case CCFPUmode:
22195 /* These are only compatible with themselves, which we already
22196 checked above. */
22197 return VOIDmode;
22202 /* Return a comparison we can do that is equivalent to
22203 swap_condition (code), apart possibly from orderedness.
22204 But never change orderedness if TARGET_IEEE_FP, returning
22205 UNKNOWN in that case if necessary. */
22207 static enum rtx_code
22208 ix86_fp_swap_condition (enum rtx_code code)
22210 switch (code)
22212 case GT: /* GTU - CF=0 & ZF=0 */
22213 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22214 case GE: /* GEU - CF=0 */
22215 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22216 case UNLT: /* LTU - CF=1 */
22217 return TARGET_IEEE_FP ? UNKNOWN : GT;
22218 case UNLE: /* LEU - CF=1 | ZF=1 */
22219 return TARGET_IEEE_FP ? UNKNOWN : GE;
22220 default:
22221 return swap_condition (code);
22225 /* Return the cost of comparison CODE using the best strategy for performance.
22226 All the following functions use the number of instructions as the cost metric.
22227 In the future this should be tweaked to compute bytes for optimize_size and
22228 take into account the performance of various instructions on various CPUs. */
22230 static int
22231 ix86_fp_comparison_cost (enum rtx_code code)
22233 int arith_cost;
22235 /* The cost of code using bit-twiddling on %ah. */
22236 switch (code)
22238 case UNLE:
22239 case UNLT:
22240 case LTGT:
22241 case GT:
22242 case GE:
22243 case UNORDERED:
22244 case ORDERED:
22245 case UNEQ:
22246 arith_cost = 4;
22247 break;
22248 case LT:
22249 case NE:
22250 case EQ:
22251 case UNGE:
22252 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22253 break;
22254 case LE:
22255 case UNGT:
22256 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22257 break;
22258 default:
22259 gcc_unreachable ();
22262 switch (ix86_fp_comparison_strategy (code))
22264 case IX86_FPCMP_COMI:
22265 return arith_cost > 4 ? 3 : 2;
22266 case IX86_FPCMP_SAHF:
22267 return arith_cost > 4 ? 4 : 3;
22268 default:
22269 return arith_cost;
22273 /* Return strategy to use for floating-point. We assume that fcomi is always
22274 preferable where available, since that is also true when looking at size
22275 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22277 enum ix86_fpcmp_strategy
22278 ix86_fp_comparison_strategy (enum rtx_code)
22280 /* Do fcomi/sahf based test when profitable. */
22282 if (TARGET_CMOVE)
22283 return IX86_FPCMP_COMI;
22285 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22286 return IX86_FPCMP_SAHF;
22288 return IX86_FPCMP_ARITH;
22291 /* Swap, force into registers, or otherwise massage the two operands
22292 to a fp comparison. The operands are updated in place; the new
22293 comparison code is returned. */
22295 static enum rtx_code
22296 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22298 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
22299 rtx op0 = *pop0, op1 = *pop1;
22300 machine_mode op_mode = GET_MODE (op0);
22301 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22303 /* All of the unordered compare instructions only work on registers.
22304 The same is true of the fcomi compare instructions. The XFmode
22305 compare instructions require registers except when comparing
22306 against zero or when converting operand 1 from fixed point to
22307 floating point. */
22309 if (!is_sse
22310 && (fpcmp_mode == CCFPUmode
22311 || (op_mode == XFmode
22312 && ! (standard_80387_constant_p (op0) == 1
22313 || standard_80387_constant_p (op1) == 1)
22314 && GET_CODE (op1) != FLOAT)
22315 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22317 op0 = force_reg (op_mode, op0);
22318 op1 = force_reg (op_mode, op1);
22320 else
22322 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22323 things around if they appear profitable, otherwise force op0
22324 into a register. */
22326 if (standard_80387_constant_p (op0) == 0
22327 || (MEM_P (op0)
22328 && ! (standard_80387_constant_p (op1) == 0
22329 || MEM_P (op1))))
22331 enum rtx_code new_code = ix86_fp_swap_condition (code);
22332 if (new_code != UNKNOWN)
22334 std::swap (op0, op1);
22335 code = new_code;
22339 if (!REG_P (op0))
22340 op0 = force_reg (op_mode, op0);
22342 if (CONSTANT_P (op1))
22344 int tmp = standard_80387_constant_p (op1);
22345 if (tmp == 0)
22346 op1 = validize_mem (force_const_mem (op_mode, op1));
22347 else if (tmp == 1)
22349 if (TARGET_CMOVE)
22350 op1 = force_reg (op_mode, op1);
22352 else
22353 op1 = force_reg (op_mode, op1);
22357 /* Try to rearrange the comparison to make it cheaper. */
22358 if (ix86_fp_comparison_cost (code)
22359 > ix86_fp_comparison_cost (swap_condition (code))
22360 && (REG_P (op1) || can_create_pseudo_p ()))
22362 std::swap (op0, op1);
22363 code = swap_condition (code);
22364 if (!REG_P (op0))
22365 op0 = force_reg (op_mode, op0);
22368 *pop0 = op0;
22369 *pop1 = op1;
22370 return code;
22373 /* Convert comparison codes we use to represent FP comparison to integer
22374 code that will result in proper branch. Return UNKNOWN if no such code
22375 is available. */
22377 enum rtx_code
22378 ix86_fp_compare_code_to_integer (enum rtx_code code)
22380 switch (code)
22382 case GT:
22383 return GTU;
22384 case GE:
22385 return GEU;
22386 case ORDERED:
22387 case UNORDERED:
22388 return code;
22389 case UNEQ:
22390 return EQ;
22391 case UNLT:
22392 return LTU;
22393 case UNLE:
22394 return LEU;
22395 case LTGT:
22396 return NE;
22397 default:
22398 return UNKNOWN;
22402 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22404 static rtx
22405 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22407 machine_mode fpcmp_mode, intcmp_mode;
22408 rtx tmp, tmp2;
22410 fpcmp_mode = ix86_fp_compare_mode (code);
22411 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22413 /* Do fcomi/sahf based test when profitable. */
22414 switch (ix86_fp_comparison_strategy (code))
22416 case IX86_FPCMP_COMI:
22417 intcmp_mode = fpcmp_mode;
22418 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22419 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22420 emit_insn (tmp);
22421 break;
22423 case IX86_FPCMP_SAHF:
22424 intcmp_mode = fpcmp_mode;
22425 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22426 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22428 if (!scratch)
22429 scratch = gen_reg_rtx (HImode);
22430 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22431 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22432 break;
22434 case IX86_FPCMP_ARITH:
22435 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22436 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22437 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22438 if (!scratch)
22439 scratch = gen_reg_rtx (HImode);
22440 emit_insn (gen_rtx_SET (scratch, tmp2));
22442 /* In the unordered case, we have to check C2 for NaNs, which
22443 doesn't happen to work out to anything nice combination-wise.
22444 So do some bit twiddling on the value we've got in AH to come
22445 up with an appropriate set of condition codes. */
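/* After fnstsw the relevant FPU status word flags end up in AH:
   C0 is bit 0 (0x01), C2 is bit 2 (0x04) and C3 is bit 6 (0x40),
   which is where the 0x45, 0x44, 0x40, 0x05, 0x04 and 0x01 masks
   used below come from.  */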
22447 intcmp_mode = CCNOmode;
22448 switch (code)
22450 case GT:
22451 case UNGT:
22452 if (code == GT || !TARGET_IEEE_FP)
22454 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22455 code = EQ;
22457 else
22459 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22460 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22461 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22462 intcmp_mode = CCmode;
22463 code = GEU;
22465 break;
22466 case LT:
22467 case UNLT:
22468 if (code == LT && TARGET_IEEE_FP)
22470 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22471 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22472 intcmp_mode = CCmode;
22473 code = EQ;
22475 else
22477 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
22478 code = NE;
22480 break;
22481 case GE:
22482 case UNGE:
22483 if (code == GE || !TARGET_IEEE_FP)
22485 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
22486 code = EQ;
22488 else
22490 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22491 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
22492 code = NE;
22494 break;
22495 case LE:
22496 case UNLE:
22497 if (code == LE && TARGET_IEEE_FP)
22499 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22500 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22501 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22502 intcmp_mode = CCmode;
22503 code = LTU;
22505 else
22507 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22508 code = NE;
22510 break;
22511 case EQ:
22512 case UNEQ:
22513 if (code == EQ && TARGET_IEEE_FP)
22515 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22516 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22517 intcmp_mode = CCmode;
22518 code = EQ;
22520 else
22522 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22523 code = NE;
22525 break;
22526 case NE:
22527 case LTGT:
22528 if (code == NE && TARGET_IEEE_FP)
22530 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22531 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
22532 GEN_INT (0x40)));
22533 code = NE;
22535 else
22537 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22538 code = EQ;
22540 break;
22542 case UNORDERED:
22543 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22544 code = NE;
22545 break;
22546 case ORDERED:
22547 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22548 code = EQ;
22549 break;
22551 default:
22552 gcc_unreachable ();
22554 break;
22556 default:
22557 gcc_unreachable ();
22560 /* Return the test that should be put into the flags user, i.e.
22561 the bcc, scc, or cmov instruction. */
22562 return gen_rtx_fmt_ee (code, VOIDmode,
22563 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22564 const0_rtx);
22567 static rtx
22568 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22570 rtx ret;
22572 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22573 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22575 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22577 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22578 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22580 else
22581 ret = ix86_expand_int_compare (code, op0, op1);
22583 return ret;
22586 void
22587 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22589 machine_mode mode = GET_MODE (op0);
22590 rtx tmp;
22592 /* Handle the special case of a vector comparison with a boolean result;
22593 transform it using the ptest instruction. */
22594 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22596 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22597 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22599 gcc_assert (code == EQ || code == NE);
22600 /* Generate XOR since we can't check that one operand is zero vector. */
22601 tmp = gen_reg_rtx (mode);
22602 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22603 tmp = gen_lowpart (p_mode, tmp);
22604 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22605 gen_rtx_UNSPEC (CCmode,
22606 gen_rtvec (2, tmp, tmp),
22607 UNSPEC_PTEST)));
22608 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22609 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22610 gen_rtx_LABEL_REF (VOIDmode, label),
22611 pc_rtx);
22612 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22613 return;
22616 switch (mode)
22618 case SFmode:
22619 case DFmode:
22620 case XFmode:
22621 case QImode:
22622 case HImode:
22623 case SImode:
22624 simple:
22625 tmp = ix86_expand_compare (code, op0, op1);
22626 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22627 gen_rtx_LABEL_REF (VOIDmode, label),
22628 pc_rtx);
22629 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22630 return;
22632 case DImode:
22633 if (TARGET_64BIT)
22634 goto simple;
22635 /* For a 32-bit target, a DImode comparison may be performed in
22636 SSE registers. To allow this we must avoid splitting into
22637 SImode, which is achieved by doing the xor in DImode and then
22638 comparing against zero (a form recognized by the STV pass).
22639 We don't use the xor form when optimizing
22640 for size. */
22641 if (!optimize_insn_for_size_p ()
22642 && TARGET_STV
22643 && (code == EQ || code == NE))
22645 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22646 op1 = const0_rtx;
22648 /* FALLTHRU */
22649 case TImode:
22650 /* Expand DImode branch into multiple compare+branch. */
22652 rtx lo[2], hi[2];
22653 rtx_code_label *label2;
22654 enum rtx_code code1, code2, code3;
22655 machine_mode submode;
22657 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22659 std::swap (op0, op1);
22660 code = swap_condition (code);
22663 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22664 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22666 submode = mode == DImode ? SImode : DImode;
22668 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22669 avoid two branches. This costs one extra insn, so disable when
22670 optimizing for size. */
22672 if ((code == EQ || code == NE)
22673 && (!optimize_insn_for_size_p ()
22674 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22676 rtx xor0, xor1;
22678 xor1 = hi[0];
22679 if (hi[1] != const0_rtx)
22680 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22681 NULL_RTX, 0, OPTAB_WIDEN);
22683 xor0 = lo[0];
22684 if (lo[1] != const0_rtx)
22685 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22686 NULL_RTX, 0, OPTAB_WIDEN);
22688 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22689 NULL_RTX, 0, OPTAB_WIDEN);
22691 ix86_expand_branch (code, tmp, const0_rtx, label);
22692 return;
22695 /* Otherwise, if we are doing a less-than or greater-or-equal-than
22696 comparison, op1 is a constant and its low word is zero, then we can
22697 just examine the high word. Similarly for a low word of -1 and
22698 less-or-equal-than or greater-than. */
22700 if (CONST_INT_P (hi[1]))
22701 switch (code)
22703 case LT: case LTU: case GE: case GEU:
22704 if (lo[1] == const0_rtx)
22706 ix86_expand_branch (code, hi[0], hi[1], label);
22707 return;
22709 break;
22710 case LE: case LEU: case GT: case GTU:
22711 if (lo[1] == constm1_rtx)
22713 ix86_expand_branch (code, hi[0], hi[1], label);
22714 return;
22716 break;
22717 default:
22718 break;
22721 /* Otherwise, we need two or three jumps. */
22723 label2 = gen_label_rtx ();
22725 code1 = code;
22726 code2 = swap_condition (code);
22727 code3 = unsigned_condition (code);
22729 switch (code)
22731 case LT: case GT: case LTU: case GTU:
22732 break;
22734 case LE: code1 = LT; code2 = GT; break;
22735 case GE: code1 = GT; code2 = LT; break;
22736 case LEU: code1 = LTU; code2 = GTU; break;
22737 case GEU: code1 = GTU; code2 = LTU; break;
22739 case EQ: code1 = UNKNOWN; code2 = NE; break;
22740 case NE: code2 = UNKNOWN; break;
22742 default:
22743 gcc_unreachable ();
22747 * a < b =>
22748 * if (hi(a) < hi(b)) goto true;
22749 * if (hi(a) > hi(b)) goto false;
22750 * if (lo(a) < lo(b)) goto true;
22751 * false:
22754 if (code1 != UNKNOWN)
22755 ix86_expand_branch (code1, hi[0], hi[1], label);
22756 if (code2 != UNKNOWN)
22757 ix86_expand_branch (code2, hi[0], hi[1], label2);
22759 ix86_expand_branch (code3, lo[0], lo[1], label);
22761 if (code2 != UNKNOWN)
22762 emit_label (label2);
22763 return;
22766 default:
22767 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22768 goto simple;
22772 /* Split branch based on floating point condition. */
22773 void
22774 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
22775 rtx target1, rtx target2, rtx tmp)
22777 rtx condition;
22778 rtx i;
22780 if (target2 != pc_rtx)
22782 std::swap (target1, target2);
22783 code = reverse_condition_maybe_unordered (code);
22786 condition = ix86_expand_fp_compare (code, op1, op2,
22787 tmp);
22789 i = emit_jump_insn (gen_rtx_SET
22790 (pc_rtx,
22791 gen_rtx_IF_THEN_ELSE (VOIDmode,
22792 condition, target1, target2)));
22793 if (split_branch_probability >= 0)
22794 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
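/* Store into the QImode register DEST the result (0 or 1) of the
   comparison OP0 CODE OP1.  */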
22797 void
22798 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22800 rtx ret;
22802 gcc_assert (GET_MODE (dest) == QImode);
22804 ret = ix86_expand_compare (code, op0, op1);
22805 PUT_MODE (ret, QImode);
22806 emit_insn (gen_rtx_SET (dest, ret));
22809 /* Expand a comparison setting or clearing the carry flag. Return true
22810 when successful and set *POP to the comparison rtx. */
22811 static bool
22812 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22814 machine_mode mode =
22815 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22817 /* Do not handle double-mode compares that go through special path. */
22818 if (mode == (TARGET_64BIT ? TImode : DImode))
22819 return false;
22821 if (SCALAR_FLOAT_MODE_P (mode))
22823 rtx compare_op;
22824 rtx_insn *compare_seq;
22826 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22828 /* Shortcut: the following common codes never translate
22829 into carry-flag compares. */
22830 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22831 || code == ORDERED || code == UNORDERED)
22832 return false;
22834 /* These comparisons require the zero flag; swap the operands so they no longer do. */
22835 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22836 && !TARGET_IEEE_FP)
22838 std::swap (op0, op1);
22839 code = swap_condition (code);
22842 /* Try to expand the comparison and verify that we end up with
22843 a carry-flag-based comparison. This fails only when we decide
22844 to expand the comparison using arithmetic, which is not a
22845 common scenario. */
22846 start_sequence ();
22847 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22848 compare_seq = get_insns ();
22849 end_sequence ();
22851 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
22852 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
22853 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22854 else
22855 code = GET_CODE (compare_op);
22857 if (code != LTU && code != GEU)
22858 return false;
22860 emit_insn (compare_seq);
22861 *pop = compare_op;
22862 return true;
22865 if (!INTEGRAL_MODE_P (mode))
22866 return false;
22868 switch (code)
22870 case LTU:
22871 case GEU:
22872 break;
22874 /* Convert a==0 into (unsigned)a<1. */
22875 case EQ:
22876 case NE:
22877 if (op1 != const0_rtx)
22878 return false;
22879 op1 = const1_rtx;
22880 code = (code == EQ ? LTU : GEU);
22881 break;
22883 /* Convert a>b into b<a or a>=b+1. */
22884 case GTU:
22885 case LEU:
22886 if (CONST_INT_P (op1))
22888 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22889 /* Bail out on overflow. We could still swap the operands, but that
22890 would force loading the constant into a register. */
22891 if (op1 == const0_rtx
22892 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22893 return false;
22894 code = (code == GTU ? GEU : LTU);
22896 else
22898 std::swap (op0, op1);
22899 code = (code == GTU ? LTU : GEU);
22901 break;
22903 /* Convert a>=0 into (unsigned)a<0x80000000. */
22904 case LT:
22905 case GE:
22906 if (mode == DImode || op1 != const0_rtx)
22907 return false;
22908 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22909 code = (code == LT ? GEU : LTU);
22910 break;
22911 case LE:
22912 case GT:
22913 if (mode == DImode || op1 != constm1_rtx)
22914 return false;
22915 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22916 code = (code == LE ? GEU : LTU);
22917 break;
22919 default:
22920 return false;
22922 /* Swapping operands may cause constant to appear as first operand. */
22923 if (!nonimmediate_operand (op0, VOIDmode))
22925 if (!can_create_pseudo_p ())
22926 return false;
22927 op0 = force_reg (mode, op0);
22929 *pop = ix86_expand_compare (code, op0, op1);
22930 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22931 return true;
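/* Expand an integer conditional move: operands[0] is the destination,
   operands[1] the comparison, and operands[2]/operands[3] the values for
   the true and false arms.  Return true if the expansion succeeded.  */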
22934 bool
22935 ix86_expand_int_movcc (rtx operands[])
22937 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22938 rtx_insn *compare_seq;
22939 rtx compare_op;
22940 machine_mode mode = GET_MODE (operands[0]);
22941 bool sign_bit_compare_p = false;
22942 rtx op0 = XEXP (operands[1], 0);
22943 rtx op1 = XEXP (operands[1], 1);
22945 if (GET_MODE (op0) == TImode
22946 || (GET_MODE (op0) == DImode
22947 && !TARGET_64BIT))
22948 return false;
22950 start_sequence ();
22951 compare_op = ix86_expand_compare (code, op0, op1);
22952 compare_seq = get_insns ();
22953 end_sequence ();
22955 compare_code = GET_CODE (compare_op);
22957 if ((op1 == const0_rtx && (code == GE || code == LT))
22958 || (op1 == constm1_rtx && (code == GT || code == LE)))
22959 sign_bit_compare_p = true;
22961 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22962 HImode insns, we'd be swallowed in word prefix ops. */
22964 if ((mode != HImode || TARGET_FAST_PREFIX)
22965 && (mode != (TARGET_64BIT ? TImode : DImode))
22966 && CONST_INT_P (operands[2])
22967 && CONST_INT_P (operands[3]))
22969 rtx out = operands[0];
22970 HOST_WIDE_INT ct = INTVAL (operands[2]);
22971 HOST_WIDE_INT cf = INTVAL (operands[3]);
22972 HOST_WIDE_INT diff;
22974 diff = ct - cf;
22975 /* Sign-bit compares are better done using shifts than by using
22976 sbb. */
22977 if (sign_bit_compare_p
22978 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22980 /* Detect overlap between destination and compare sources. */
22981 rtx tmp = out;
22983 if (!sign_bit_compare_p)
22985 rtx flags;
22986 bool fpcmp = false;
22988 compare_code = GET_CODE (compare_op);
22990 flags = XEXP (compare_op, 0);
22992 if (GET_MODE (flags) == CCFPmode
22993 || GET_MODE (flags) == CCFPUmode)
22995 fpcmp = true;
22996 compare_code
22997 = ix86_fp_compare_code_to_integer (compare_code);
23000 /* To simplify rest of code, restrict to the GEU case. */
23001 if (compare_code == LTU)
23003 std::swap (ct, cf);
23004 compare_code = reverse_condition (compare_code);
23005 code = reverse_condition (code);
23007 else
23009 if (fpcmp)
23010 PUT_CODE (compare_op,
23011 reverse_condition_maybe_unordered
23012 (GET_CODE (compare_op)));
23013 else
23014 PUT_CODE (compare_op,
23015 reverse_condition (GET_CODE (compare_op)));
23017 diff = ct - cf;
23019 if (reg_overlap_mentioned_p (out, op0)
23020 || reg_overlap_mentioned_p (out, op1))
23021 tmp = gen_reg_rtx (mode);
23023 if (mode == DImode)
23024 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23025 else
23026 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23027 flags, compare_op));
23029 else
23031 if (code == GT || code == GE)
23032 code = reverse_condition (code);
23033 else
23035 std::swap (ct, cf);
23036 diff = ct - cf;
23038 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23041 if (diff == 1)
23044 * cmpl op0,op1
23045 * sbbl dest,dest
23046 * [addl dest, ct]
23048 * Size 5 - 8.
23050 if (ct)
23051 tmp = expand_simple_binop (mode, PLUS,
23052 tmp, GEN_INT (ct),
23053 copy_rtx (tmp), 1, OPTAB_DIRECT);
23055 else if (cf == -1)
23058 * cmpl op0,op1
23059 * sbbl dest,dest
23060 * orl $ct, dest
23062 * Size 8.
23064 tmp = expand_simple_binop (mode, IOR,
23065 tmp, GEN_INT (ct),
23066 copy_rtx (tmp), 1, OPTAB_DIRECT);
23068 else if (diff == -1 && ct)
23071 * cmpl op0,op1
23072 * sbbl dest,dest
23073 * notl dest
23074 * [addl dest, cf]
23076 * Size 8 - 11.
23078 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23079 if (cf)
23080 tmp = expand_simple_binop (mode, PLUS,
23081 copy_rtx (tmp), GEN_INT (cf),
23082 copy_rtx (tmp), 1, OPTAB_DIRECT);
23084 else
23087 * cmpl op0,op1
23088 * sbbl dest,dest
23089 * [notl dest]
23090 * andl cf - ct, dest
23091 * [addl dest, ct]
23093 * Size 8 - 11.
23096 if (cf == 0)
23098 cf = ct;
23099 ct = 0;
23100 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23103 tmp = expand_simple_binop (mode, AND,
23104 copy_rtx (tmp),
23105 gen_int_mode (cf - ct, mode),
23106 copy_rtx (tmp), 1, OPTAB_DIRECT);
23107 if (ct)
23108 tmp = expand_simple_binop (mode, PLUS,
23109 copy_rtx (tmp), GEN_INT (ct),
23110 copy_rtx (tmp), 1, OPTAB_DIRECT);
23113 if (!rtx_equal_p (tmp, out))
23114 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23116 return true;
23119 if (diff < 0)
23121 machine_mode cmp_mode = GET_MODE (op0);
23122 enum rtx_code new_code;
23124 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23126 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23128 /* We may be reversing an unordered compare to a normal compare, which
23129 is not valid in general (we may convert a non-trapping condition
23130 to a trapping one); however, on i386 we currently emit all
23131 comparisons unordered. */
23132 new_code = reverse_condition_maybe_unordered (code);
23134 else
23135 new_code = ix86_reverse_condition (code, cmp_mode);
23136 if (new_code != UNKNOWN)
23138 std::swap (ct, cf);
23139 diff = -diff;
23140 code = new_code;
23144 compare_code = UNKNOWN;
23145 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23146 && CONST_INT_P (op1))
23148 if (op1 == const0_rtx
23149 && (code == LT || code == GE))
23150 compare_code = code;
23151 else if (op1 == constm1_rtx)
23153 if (code == LE)
23154 compare_code = LT;
23155 else if (code == GT)
23156 compare_code = GE;
23160 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23161 if (compare_code != UNKNOWN
23162 && GET_MODE (op0) == GET_MODE (out)
23163 && (cf == -1 || ct == -1))
23165 /* If lea code below could be used, only optimize
23166 if it results in a 2 insn sequence. */
23168 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23169 || diff == 3 || diff == 5 || diff == 9)
23170 || (compare_code == LT && ct == -1)
23171 || (compare_code == GE && cf == -1))
23174 * notl op1 (if necessary)
23175 * sarl $31, op1
23176 * orl cf, op1
23178 if (ct != -1)
23180 cf = ct;
23181 ct = -1;
23182 code = reverse_condition (code);
23185 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23187 out = expand_simple_binop (mode, IOR,
23188 out, GEN_INT (cf),
23189 out, 1, OPTAB_DIRECT);
23190 if (out != operands[0])
23191 emit_move_insn (operands[0], out);
23193 return true;
23198 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23199 || diff == 3 || diff == 5 || diff == 9)
23200 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23201 && (mode != DImode
23202 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23205 * xorl dest,dest
23206 * cmpl op1,op2
23207 * setcc dest
23208 * lea cf(dest*(ct-cf)),dest
23210 * Size 14.
23212 * This also catches the degenerate setcc-only case.
23215 rtx tmp;
23216 int nops;
23218 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23220 nops = 0;
23221 /* On x86_64 the lea instruction operates on Pmode, so we need
23222 to get the arithmetic done in the proper mode to match. */
23223 if (diff == 1)
23224 tmp = copy_rtx (out);
23225 else
23227 rtx out1;
23228 out1 = copy_rtx (out);
23229 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23230 nops++;
23231 if (diff & 1)
23233 tmp = gen_rtx_PLUS (mode, tmp, out1);
23234 nops++;
23237 if (cf != 0)
23239 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23240 nops++;
23242 if (!rtx_equal_p (tmp, out))
23244 if (nops == 1)
23245 out = force_operand (tmp, copy_rtx (out));
23246 else
23247 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23249 if (!rtx_equal_p (out, operands[0]))
23250 emit_move_insn (operands[0], copy_rtx (out));
23252 return true;
23256 * General case: Jumpful:
23257 * xorl dest,dest cmpl op1, op2
23258 * cmpl op1, op2 movl ct, dest
23259 * setcc dest jcc 1f
23260 * decl dest movl cf, dest
23261 * andl (cf-ct),dest 1:
23262 * addl ct,dest
23264 * Size 20. Size 14.
23266 * This is reasonably steep, but branch mispredict costs are
23267 * high on modern cpus, so consider failing only if optimizing
23268 * for space.
23271 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23272 && BRANCH_COST (optimize_insn_for_speed_p (),
23273 false) >= 2)
23275 if (cf == 0)
23277 machine_mode cmp_mode = GET_MODE (op0);
23278 enum rtx_code new_code;
23280 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23282 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23284 /* We may be reversing an unordered compare to a normal compare,
23285 which is not valid in general (we may convert a non-trapping
23286 condition to a trapping one); however, on i386 we currently
23287 emit all comparisons unordered. */
23288 new_code = reverse_condition_maybe_unordered (code);
23290 else
23292 new_code = ix86_reverse_condition (code, cmp_mode);
23293 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23294 compare_code = reverse_condition (compare_code);
23297 if (new_code != UNKNOWN)
23299 cf = ct;
23300 ct = 0;
23301 code = new_code;
23305 if (compare_code != UNKNOWN)
23307 /* notl op1 (if needed)
23308 sarl $31, op1
23309 andl (cf-ct), op1
23310 addl ct, op1
23312 For x < 0 (resp. x <= -1) there will be no notl,
23313 so if possible swap the constants to get rid of the
23314 complement.
23315 True/false will be -1/0 while code below (store flag
23316 followed by decrement) is 0/-1, so the constants need
23317 to be exchanged once more. */
23319 if (compare_code == GE || !cf)
23321 code = reverse_condition (code);
23322 compare_code = LT;
23324 else
23325 std::swap (ct, cf);
23327 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23329 else
23331 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23333 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23334 constm1_rtx,
23335 copy_rtx (out), 1, OPTAB_DIRECT);
23338 out = expand_simple_binop (mode, AND, copy_rtx (out),
23339 gen_int_mode (cf - ct, mode),
23340 copy_rtx (out), 1, OPTAB_DIRECT);
23341 if (ct)
23342 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23343 copy_rtx (out), 1, OPTAB_DIRECT);
23344 if (!rtx_equal_p (out, operands[0]))
23345 emit_move_insn (operands[0], copy_rtx (out));
23347 return true;
23351 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23353 /* Try a few more things with specific constants and a variable. */
23355 optab op;
23356 rtx var, orig_out, out, tmp;
23358 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23359 return false;
23361 /* If one of the two operands is an interesting constant, load a
23362 constant with the above and mask it in with a logical operation. */
23364 if (CONST_INT_P (operands[2]))
23366 var = operands[3];
23367 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23368 operands[3] = constm1_rtx, op = and_optab;
23369 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23370 operands[3] = const0_rtx, op = ior_optab;
23371 else
23372 return false;
23374 else if (CONST_INT_P (operands[3]))
23376 var = operands[2];
23377 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23378 operands[2] = constm1_rtx, op = and_optab;
23379 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23380 operands[2] = const0_rtx, op = ior_optab;
23381 else
23382 return false;
23384 else
23385 return false;
23387 orig_out = operands[0];
23388 tmp = gen_reg_rtx (mode);
23389 operands[0] = tmp;
23391 /* Recurse to get the constant loaded. */
23392 if (!ix86_expand_int_movcc (operands))
23393 return false;
23395 /* Mask in the interesting variable. */
23396 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23397 OPTAB_WIDEN);
23398 if (!rtx_equal_p (out, orig_out))
23399 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23401 return true;
23405 * For comparison with above,
23407 * movl cf,dest
23408 * movl ct,tmp
23409 * cmpl op1,op2
23410 * cmovcc tmp,dest
23412 * Size 15.
23415 if (! nonimmediate_operand (operands[2], mode))
23416 operands[2] = force_reg (mode, operands[2]);
23417 if (! nonimmediate_operand (operands[3], mode))
23418 operands[3] = force_reg (mode, operands[3]);
23420 if (! register_operand (operands[2], VOIDmode)
23421 && (mode == QImode
23422 || ! register_operand (operands[3], VOIDmode)))
23423 operands[2] = force_reg (mode, operands[2]);
23425 if (mode == QImode
23426 && ! register_operand (operands[3], VOIDmode))
23427 operands[3] = force_reg (mode, operands[3]);
23429 emit_insn (compare_seq);
23430 emit_insn (gen_rtx_SET (operands[0],
23431 gen_rtx_IF_THEN_ELSE (mode,
23432 compare_op, operands[2],
23433 operands[3])));
23434 return true;
23437 /* Swap, force into registers, or otherwise massage the two operands
23438 to an sse comparison with a mask result. Thus we differ a bit from
23439 ix86_prepare_fp_compare_args which expects to produce a flags result.
23441 The DEST operand exists to help determine whether to commute commutative
23442 operators. The POP0/POP1 operands are updated in place. The new
23443 comparison code is returned, or UNKNOWN if not implementable. */
23445 static enum rtx_code
23446 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23447 rtx *pop0, rtx *pop1)
23449 switch (code)
23451 case LTGT:
23452 case UNEQ:
23453 /* AVX supports all the needed comparisons. */
23454 if (TARGET_AVX)
23455 break;
23456 /* We have no LTGT as an operator. We could implement it with
23457 NE & ORDERED, but this requires an extra temporary. It's
23458 not clear that it's worth it. */
23459 return UNKNOWN;
23461 case LT:
23462 case LE:
23463 case UNGT:
23464 case UNGE:
23465 /* These are supported directly. */
23466 break;
23468 case EQ:
23469 case NE:
23470 case UNORDERED:
23471 case ORDERED:
23472 /* AVX has 3 operand comparisons, no need to swap anything. */
23473 if (TARGET_AVX)
23474 break;
23475 /* For commutative operators, try to canonicalize the destination
23476 operand to be first in the comparison - this helps reload to
23477 avoid extra moves. */
23478 if (!dest || !rtx_equal_p (dest, *pop1))
23479 break;
23480 /* FALLTHRU */
23482 case GE:
23483 case GT:
23484 case UNLE:
23485 case UNLT:
23486 /* These are not supported directly before AVX, and furthermore
23487 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23488 comparison operands to transform into something that is
23489 supported. */
23490 std::swap (*pop0, *pop1);
23491 code = swap_condition (code);
23492 break;
23494 default:
23495 gcc_unreachable ();
23498 return code;
23501 /* Detect conditional moves that exactly match min/max operational
23502 semantics. Note that this is IEEE safe, as long as we don't
23503 interchange the operands.
23505 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23506 and TRUE if the operation is successful and instructions are emitted. */
23508 static bool
23509 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23510 rtx cmp_op1, rtx if_true, rtx if_false)
23512 machine_mode mode;
23513 bool is_min;
23514 rtx tmp;
23516 if (code == LT)
23518 else if (code == UNGE)
23519 std::swap (if_true, if_false);
23520 else
23521 return false;
23523 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23524 is_min = true;
23525 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23526 is_min = false;
23527 else
23528 return false;
23530 mode = GET_MODE (dest);
23532 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23533 but MODE may be a vector mode and thus not appropriate. */
23534 if (!flag_finite_math_only || flag_signed_zeros)
23536 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23537 rtvec v;
23539 if_true = force_reg (mode, if_true);
23540 v = gen_rtvec (2, if_true, if_false);
23541 tmp = gen_rtx_UNSPEC (mode, v, u);
23543 else
23545 code = is_min ? SMIN : SMAX;
23546 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23549 emit_insn (gen_rtx_SET (dest, tmp));
23550 return true;
23553 /* Expand an sse vector comparison. Return the register with the result. */
23555 static rtx
23556 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23557 rtx op_true, rtx op_false)
23559 machine_mode mode = GET_MODE (dest);
23560 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23562 /* In the general case the result of the comparison can differ from the operands' type. */
23563 machine_mode cmp_mode;
23565 /* In AVX512F the result of comparison is an integer mask. */
23566 bool maskcmp = false;
23567 rtx x;
23569 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23571 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
23572 gcc_assert (cmp_mode != BLKmode);
23574 maskcmp = true;
23576 else
23577 cmp_mode = cmp_ops_mode;
23580 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23581 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23582 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23584 if (optimize
23585 || (maskcmp && cmp_mode != mode)
23586 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23587 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23588 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23590 /* Compare patterns for int modes are unspec in AVX512F only. */
23591 if (maskcmp && (code == GT || code == EQ))
23593 rtx (*gen)(rtx, rtx, rtx);
23595 switch (cmp_ops_mode)
23597 case V64QImode:
23598 gcc_assert (TARGET_AVX512BW);
23599 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23600 break;
23601 case V32HImode:
23602 gcc_assert (TARGET_AVX512BW);
23603 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23604 break;
23605 case V16SImode:
23606 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23607 break;
23608 case V8DImode:
23609 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23610 break;
23611 default:
23612 gen = NULL;
23615 if (gen)
23617 emit_insn (gen (dest, cmp_op0, cmp_op1));
23618 return dest;
23621 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23623 if (cmp_mode != mode && !maskcmp)
23625 x = force_reg (cmp_ops_mode, x);
23626 convert_move (dest, x, false);
23628 else
23629 emit_insn (gen_rtx_SET (dest, x));
23631 return dest;
23634 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23635 operations. This is used for both scalar and vector conditional moves. */
23637 void
23638 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23640 machine_mode mode = GET_MODE (dest);
23641 machine_mode cmpmode = GET_MODE (cmp);
23643 /* In AVX512F the result of comparison is an integer mask. */
23644 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23646 rtx t2, t3, x;
23648 /* If we have an integer mask and FP value then we need
23649 to cast mask to FP mode. */
23650 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23652 cmp = force_reg (cmpmode, cmp);
23653 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23656 if (vector_all_ones_operand (op_true, mode)
23657 && rtx_equal_p (op_false, CONST0_RTX (mode))
23658 && !maskcmp)
23660 emit_insn (gen_rtx_SET (dest, cmp));
23662 else if (op_false == CONST0_RTX (mode)
23663 && !maskcmp)
23665 op_true = force_reg (mode, op_true);
23666 x = gen_rtx_AND (mode, cmp, op_true);
23667 emit_insn (gen_rtx_SET (dest, x));
23669 else if (op_true == CONST0_RTX (mode)
23670 && !maskcmp)
23672 op_false = force_reg (mode, op_false);
23673 x = gen_rtx_NOT (mode, cmp);
23674 x = gen_rtx_AND (mode, x, op_false);
23675 emit_insn (gen_rtx_SET (dest, x));
23677 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23678 && !maskcmp)
23680 op_false = force_reg (mode, op_false);
23681 x = gen_rtx_IOR (mode, cmp, op_false);
23682 emit_insn (gen_rtx_SET (dest, x));
23684 else if (TARGET_XOP
23685 && !maskcmp)
23687 op_true = force_reg (mode, op_true);
23689 if (!nonimmediate_operand (op_false, mode))
23690 op_false = force_reg (mode, op_false);
23692 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23693 op_true,
23694 op_false)));
23696 else
23698 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23699 rtx d = dest;
23701 if (!nonimmediate_operand (op_true, mode))
23702 op_true = force_reg (mode, op_true);
23704 op_false = force_reg (mode, op_false);
23706 switch (mode)
23708 case V4SFmode:
23709 if (TARGET_SSE4_1)
23710 gen = gen_sse4_1_blendvps;
23711 break;
23712 case V2DFmode:
23713 if (TARGET_SSE4_1)
23714 gen = gen_sse4_1_blendvpd;
23715 break;
23716 case V16QImode:
23717 case V8HImode:
23718 case V4SImode:
23719 case V2DImode:
23720 if (TARGET_SSE4_1)
23722 gen = gen_sse4_1_pblendvb;
23723 if (mode != V16QImode)
23724 d = gen_reg_rtx (V16QImode);
23725 op_false = gen_lowpart (V16QImode, op_false);
23726 op_true = gen_lowpart (V16QImode, op_true);
23727 cmp = gen_lowpart (V16QImode, cmp);
23729 break;
23730 case V8SFmode:
23731 if (TARGET_AVX)
23732 gen = gen_avx_blendvps256;
23733 break;
23734 case V4DFmode:
23735 if (TARGET_AVX)
23736 gen = gen_avx_blendvpd256;
23737 break;
23738 case V32QImode:
23739 case V16HImode:
23740 case V8SImode:
23741 case V4DImode:
23742 if (TARGET_AVX2)
23744 gen = gen_avx2_pblendvb;
23745 if (mode != V32QImode)
23746 d = gen_reg_rtx (V32QImode);
23747 op_false = gen_lowpart (V32QImode, op_false);
23748 op_true = gen_lowpart (V32QImode, op_true);
23749 cmp = gen_lowpart (V32QImode, cmp);
23751 break;
23753 case V64QImode:
23754 gen = gen_avx512bw_blendmv64qi;
23755 break;
23756 case V32HImode:
23757 gen = gen_avx512bw_blendmv32hi;
23758 break;
23759 case V16SImode:
23760 gen = gen_avx512f_blendmv16si;
23761 break;
23762 case V8DImode:
23763 gen = gen_avx512f_blendmv8di;
23764 break;
23765 case V8DFmode:
23766 gen = gen_avx512f_blendmv8df;
23767 break;
23768 case V16SFmode:
23769 gen = gen_avx512f_blendmv16sf;
23770 break;
23772 default:
23773 break;
23776 if (gen != NULL)
23778 emit_insn (gen (d, op_false, op_true, cmp));
23779 if (d != dest)
23780 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23782 else
23784 op_true = force_reg (mode, op_true);
23786 t2 = gen_reg_rtx (mode);
23787 if (optimize)
23788 t3 = gen_reg_rtx (mode);
23789 else
23790 t3 = dest;
23792 x = gen_rtx_AND (mode, op_true, cmp);
23793 emit_insn (gen_rtx_SET (t2, x));
23795 x = gen_rtx_NOT (mode, cmp);
23796 x = gen_rtx_AND (mode, x, op_false);
23797 emit_insn (gen_rtx_SET (t3, x));
23799 x = gen_rtx_IOR (mode, t3, t2);
23800 emit_insn (gen_rtx_SET (dest, x));
23805 /* Expand a floating-point conditional move. Return true if successful. */
23807 bool
23808 ix86_expand_fp_movcc (rtx operands[])
23810 machine_mode mode = GET_MODE (operands[0]);
23811 enum rtx_code code = GET_CODE (operands[1]);
23812 rtx tmp, compare_op;
23813 rtx op0 = XEXP (operands[1], 0);
23814 rtx op1 = XEXP (operands[1], 1);
23816 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23818 machine_mode cmode;
23820 /* Since we have no cmove for SSE registers, don't force bad register
23821 allocation just to gain access to it. Deny movcc when the
23822 comparison mode doesn't match the move mode. */
23823 cmode = GET_MODE (op0);
23824 if (cmode == VOIDmode)
23825 cmode = GET_MODE (op1);
23826 if (cmode != mode)
23827 return false;
23829 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23830 if (code == UNKNOWN)
23831 return false;
23833 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23834 operands[2], operands[3]))
23835 return true;
23837 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23838 operands[2], operands[3]);
23839 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23840 return true;
23843 if (GET_MODE (op0) == TImode
23844 || (GET_MODE (op0) == DImode
23845 && !TARGET_64BIT))
23846 return false;
23848 /* The floating point conditional move instructions don't directly
23849 support conditions resulting from a signed integer comparison. */
23851 compare_op = ix86_expand_compare (code, op0, op1);
23852 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23854 tmp = gen_reg_rtx (QImode);
23855 ix86_expand_setcc (tmp, code, op0, op1);
23857 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23860 emit_insn (gen_rtx_SET (operands[0],
23861 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23862 operands[2], operands[3])));
23864 return true;
23867 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
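/* The values returned below are the predicate immediates used by the
   AVX-512 vpcmp/vpcmpu instructions: 0 = eq, 1 = lt, 2 = le, 4 = neq,
   5 = nlt (ge), 6 = nle (gt).  */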
23869 static int
23870 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23872 switch (code)
23874 case EQ:
23875 return 0;
23876 case LT:
23877 case LTU:
23878 return 1;
23879 case LE:
23880 case LEU:
23881 return 2;
23882 case NE:
23883 return 4;
23884 case GE:
23885 case GEU:
23886 return 5;
23887 case GT:
23888 case GTU:
23889 return 6;
23890 default:
23891 gcc_unreachable ();
23895 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
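/* The values returned below are vcmpps/vcmppd predicate immediates,
   e.g. 0x00 = EQ_OQ, 0x01 = LT_OS, 0x03 = UNORD_Q, 0x04 = NEQ_UQ,
   0x0d = GE_OS, 0x18 = EQ_US.  */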
23897 static int
23898 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23900 switch (code)
23902 case EQ:
23903 return 0x00;
23904 case NE:
23905 return 0x04;
23906 case GT:
23907 return 0x0e;
23908 case LE:
23909 return 0x02;
23910 case GE:
23911 return 0x0d;
23912 case LT:
23913 return 0x01;
23914 case UNLE:
23915 return 0x0a;
23916 case UNLT:
23917 return 0x09;
23918 case UNGE:
23919 return 0x05;
23920 case UNGT:
23921 return 0x06;
23922 case UNEQ:
23923 return 0x18;
23924 case LTGT:
23925 return 0x0c;
23926 case ORDERED:
23927 return 0x07;
23928 case UNORDERED:
23929 return 0x03;
23930 default:
23931 gcc_unreachable ();
23935 /* Return immediate value to be used in UNSPEC_PCMP
23936 for comparison CODE in MODE. */
23938 static int
23939 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23941 if (FLOAT_MODE_P (mode))
23942 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23943 return ix86_int_cmp_code_to_pcmp_immediate (code);
23946 /* Expand AVX-512 vector comparison. */
23948 bool
23949 ix86_expand_mask_vec_cmp (rtx operands[])
23951 machine_mode mask_mode = GET_MODE (operands[0]);
23952 machine_mode cmp_mode = GET_MODE (operands[2]);
23953 enum rtx_code code = GET_CODE (operands[1]);
23954 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23955 int unspec_code;
23956 rtx unspec;
23958 switch (code)
23960 case LEU:
23961 case GTU:
23962 case GEU:
23963 case LTU:
23964 unspec_code = UNSPEC_UNSIGNED_PCMP;
23965 break;
23967 default:
23968 unspec_code = UNSPEC_PCMP;
23971 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23972 operands[3], imm),
23973 unspec_code);
23974 emit_insn (gen_rtx_SET (operands[0], unspec));
23976 return true;
23979 /* Expand fp vector comparison. */
23981 bool
23982 ix86_expand_fp_vec_cmp (rtx operands[])
23984 enum rtx_code code = GET_CODE (operands[1]);
23985 rtx cmp;
23987 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23988 &operands[2], &operands[3]);
23989 if (code == UNKNOWN)
23991 rtx temp;
23992 switch (GET_CODE (operands[1]))
23994 case LTGT:
23995 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23996 operands[3], NULL, NULL);
23997 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23998 operands[3], NULL, NULL);
23999 code = AND;
24000 break;
24001 case UNEQ:
24002 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24003 operands[3], NULL, NULL);
24004 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24005 operands[3], NULL, NULL);
24006 code = IOR;
24007 break;
24008 default:
24009 gcc_unreachable ();
24011 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24012 OPTAB_DIRECT);
24014 else
24015 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24016 operands[1], operands[2]);
24018 if (operands[0] != cmp)
24019 emit_move_insn (operands[0], cmp);
24021 return true;
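/* Expand an integer vector comparison COP0 CODE COP1 for use by vec_cmp
   and vcond expansion.  The comparison is canonicalized to EQ/GT/GTU;
   *NEGATE is set when the emitted comparison computes the inverse of the
   requested one, and OP_TRUE/OP_FALSE are swapped accordingly when given.
   Return the register holding the comparison result, or NULL on failure.  */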
24024 static rtx
24025 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24026 rtx op_true, rtx op_false, bool *negate)
24028 machine_mode data_mode = GET_MODE (dest);
24029 machine_mode mode = GET_MODE (cop0);
24030 rtx x;
24032 *negate = false;
24034 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24035 if (TARGET_XOP
24036 && (mode == V16QImode || mode == V8HImode
24037 || mode == V4SImode || mode == V2DImode))
24039 else
24041 /* Canonicalize the comparison to EQ, GT, GTU. */
24042 switch (code)
24044 case EQ:
24045 case GT:
24046 case GTU:
24047 break;
24049 case NE:
24050 case LE:
24051 case LEU:
24052 code = reverse_condition (code);
24053 *negate = true;
24054 break;
24056 case GE:
24057 case GEU:
24058 code = reverse_condition (code);
24059 *negate = true;
24060 /* FALLTHRU */
24062 case LT:
24063 case LTU:
24064 std::swap (cop0, cop1);
24065 code = swap_condition (code);
24066 break;
24068 default:
24069 gcc_unreachable ();
24072 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24073 if (mode == V2DImode)
24075 switch (code)
24077 case EQ:
24078 /* SSE4.1 supports EQ. */
24079 if (!TARGET_SSE4_1)
24080 return NULL;
24081 break;
24083 case GT:
24084 case GTU:
24085 /* SSE4.2 supports GT/GTU. */
24086 if (!TARGET_SSE4_2)
24087 return NULL;
24088 break;
24090 default:
24091 gcc_unreachable ();
24095 /* Unsigned parallel compare is not supported by the hardware.
24096 Play some tricks to turn this into a signed comparison
24097 against 0. */
24098 if (code == GTU)
24100 cop0 = force_reg (mode, cop0);
24102 switch (mode)
24104 case V16SImode:
24105 case V8DImode:
24106 case V8SImode:
24107 case V4DImode:
24108 case V4SImode:
24109 case V2DImode:
24111 rtx t1, t2, mask;
24112 rtx (*gen_sub3) (rtx, rtx, rtx);
24114 switch (mode)
24116 case V16SImode: gen_sub3 = gen_subv16si3; break;
24117 case V8DImode: gen_sub3 = gen_subv8di3; break;
24118 case V8SImode: gen_sub3 = gen_subv8si3; break;
24119 case V4DImode: gen_sub3 = gen_subv4di3; break;
24120 case V4SImode: gen_sub3 = gen_subv4si3; break;
24121 case V2DImode: gen_sub3 = gen_subv2di3; break;
24122 default:
24123 gcc_unreachable ();
24125 /* Subtract (-(INT MAX) - 1) from both operands to make
24126 them signed. */
24127 mask = ix86_build_signbit_mask (mode, true, false);
24128 t1 = gen_reg_rtx (mode);
24129 emit_insn (gen_sub3 (t1, cop0, mask));
24131 t2 = gen_reg_rtx (mode);
24132 emit_insn (gen_sub3 (t2, cop1, mask));
24134 cop0 = t1;
24135 cop1 = t2;
24136 code = GT;
24138 break;
24140 case V64QImode:
24141 case V32HImode:
24142 case V32QImode:
24143 case V16HImode:
24144 case V16QImode:
24145 case V8HImode:
24146 /* Perform a parallel unsigned saturating subtraction. */
24147 x = gen_reg_rtx (mode);
24148 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24149 cop1)));
24151 cop0 = x;
24152 cop1 = CONST0_RTX (mode);
24153 code = EQ;
24154 *negate = !*negate;
24155 break;
24157 default:
24158 gcc_unreachable ();
24163 if (*negate)
24164 std::swap (op_true, op_false);
24166 /* Allow the comparison to be done in one mode, but the movcc to
24167 happen in another mode. */
24168 if (data_mode == mode)
24170 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24171 op_true, op_false);
24173 else
24175 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24176 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24177 op_true, op_false);
24178 if (GET_MODE (x) == mode)
24179 x = gen_lowpart (data_mode, x);
24182 return x;
24185 /* Expand integer vector comparison. */
24187 bool
24188 ix86_expand_int_vec_cmp (rtx operands[])
24190 rtx_code code = GET_CODE (operands[1]);
24191 bool negate = false;
24192 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24193 operands[3], NULL, NULL, &negate);
24195 if (!cmp)
24196 return false;
24198 if (negate)
24199 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24200 CONST0_RTX (GET_MODE (cmp)),
24201 NULL, NULL, &negate);
24203 gcc_assert (!negate);
24205 if (operands[0] != cmp)
24206 emit_move_insn (operands[0], cmp);
24208 return true;
24211 /* Expand a floating-point vector conditional move; a vcond operation
24212 rather than a movcc operation. */
24214 bool
24215 ix86_expand_fp_vcond (rtx operands[])
24217 enum rtx_code code = GET_CODE (operands[3]);
24218 rtx cmp;
24220 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24221 &operands[4], &operands[5]);
24222 if (code == UNKNOWN)
24224 rtx temp;
24225 switch (GET_CODE (operands[3]))
24227 case LTGT:
24228 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24229 operands[5], operands[0], operands[0]);
24230 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24231 operands[5], operands[1], operands[2]);
24232 code = AND;
24233 break;
24234 case UNEQ:
24235 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24236 operands[5], operands[0], operands[0]);
24237 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24238 operands[5], operands[1], operands[2]);
24239 code = IOR;
24240 break;
24241 default:
24242 gcc_unreachable ();
24244 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24245 OPTAB_DIRECT);
24246 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24247 return true;
24250 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24251 operands[5], operands[1], operands[2]))
24252 return true;
24254 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24255 operands[1], operands[2]);
24256 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24257 return true;
24260 /* Expand a signed/unsigned integral vector conditional move. */
24262 bool
24263 ix86_expand_int_vcond (rtx operands[])
24265 machine_mode data_mode = GET_MODE (operands[0]);
24266 machine_mode mode = GET_MODE (operands[4]);
24267 enum rtx_code code = GET_CODE (operands[3]);
24268 bool negate = false;
24269 rtx x, cop0, cop1;
24271 cop0 = operands[4];
24272 cop1 = operands[5];
24274 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24275 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24276 if ((code == LT || code == GE)
24277 && data_mode == mode
24278 && cop1 == CONST0_RTX (mode)
24279 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24280 && GET_MODE_UNIT_SIZE (data_mode) > 1
24281 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24282 && (GET_MODE_SIZE (data_mode) == 16
24283 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24285 rtx negop = operands[2 - (code == LT)];
24286 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24287 if (negop == CONST1_RTX (data_mode))
24289 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24290 operands[0], 1, OPTAB_DIRECT);
24291 if (res != operands[0])
24292 emit_move_insn (operands[0], res);
24293 return true;
24295 else if (GET_MODE_INNER (data_mode) != DImode
24296 && vector_all_ones_operand (negop, data_mode))
24298 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24299 operands[0], 0, OPTAB_DIRECT);
24300 if (res != operands[0])
24301 emit_move_insn (operands[0], res);
24302 return true;
24306 if (!nonimmediate_operand (cop1, mode))
24307 cop1 = force_reg (mode, cop1);
24308 if (!general_operand (operands[1], data_mode))
24309 operands[1] = force_reg (data_mode, operands[1]);
24310 if (!general_operand (operands[2], data_mode))
24311 operands[2] = force_reg (data_mode, operands[2]);
24313 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24314 operands[1], operands[2], &negate);
24316 if (!x)
24317 return false;
24319 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24320 operands[2-negate]);
24321 return true;
24324 /* AVX512F does support 64-byte integer vector operations,
24325 thus the longest vector we are faced with is V64QImode. */
24326 #define MAX_VECT_LEN 64
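/* Describes a vector permutation to be expanded: the target and input
   operands, the element selector PERM, the vector mode VMODE, the element
   count NELT, and whether the permutation reads a single input
   (ONE_OPERAND_P) or is only being tested for feasibility (TESTING_P).  */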
24328 struct expand_vec_perm_d
24330 rtx target, op0, op1;
24331 unsigned char perm[MAX_VECT_LEN];
24332 machine_mode vmode;
24333 unsigned char nelt;
24334 bool one_operand_p;
24335 bool testing_p;
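/* Try to expand a permutation using one of the AVX-512 vpermi2var
   (two-table variable permute) instructions.  Return true and emit the
   insn when the mode and the enabled ISA allow it, false otherwise.  */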
24338 static bool
24339 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24340 struct expand_vec_perm_d *d)
24342 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24343 expanders, so the arguments are either in D, or in OP0, OP1 etc. */
24344 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24345 machine_mode maskmode = mode;
24346 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24348 switch (mode)
24350 case V8HImode:
24351 if (TARGET_AVX512VL && TARGET_AVX512BW)
24352 gen = gen_avx512vl_vpermi2varv8hi3;
24353 break;
24354 case V16HImode:
24355 if (TARGET_AVX512VL && TARGET_AVX512BW)
24356 gen = gen_avx512vl_vpermi2varv16hi3;
24357 break;
24358 case V64QImode:
24359 if (TARGET_AVX512VBMI)
24360 gen = gen_avx512bw_vpermi2varv64qi3;
24361 break;
24362 case V32HImode:
24363 if (TARGET_AVX512BW)
24364 gen = gen_avx512bw_vpermi2varv32hi3;
24365 break;
24366 case V4SImode:
24367 if (TARGET_AVX512VL)
24368 gen = gen_avx512vl_vpermi2varv4si3;
24369 break;
24370 case V8SImode:
24371 if (TARGET_AVX512VL)
24372 gen = gen_avx512vl_vpermi2varv8si3;
24373 break;
24374 case V16SImode:
24375 if (TARGET_AVX512F)
24376 gen = gen_avx512f_vpermi2varv16si3;
24377 break;
24378 case V4SFmode:
24379 if (TARGET_AVX512VL)
24381 gen = gen_avx512vl_vpermi2varv4sf3;
24382 maskmode = V4SImode;
24384 break;
24385 case V8SFmode:
24386 if (TARGET_AVX512VL)
24388 gen = gen_avx512vl_vpermi2varv8sf3;
24389 maskmode = V8SImode;
24391 break;
24392 case V16SFmode:
24393 if (TARGET_AVX512F)
24395 gen = gen_avx512f_vpermi2varv16sf3;
24396 maskmode = V16SImode;
24398 break;
24399 case V2DImode:
24400 if (TARGET_AVX512VL)
24401 gen = gen_avx512vl_vpermi2varv2di3;
24402 break;
24403 case V4DImode:
24404 if (TARGET_AVX512VL)
24405 gen = gen_avx512vl_vpermi2varv4di3;
24406 break;
24407 case V8DImode:
24408 if (TARGET_AVX512F)
24409 gen = gen_avx512f_vpermi2varv8di3;
24410 break;
24411 case V2DFmode:
24412 if (TARGET_AVX512VL)
24414 gen = gen_avx512vl_vpermi2varv2df3;
24415 maskmode = V2DImode;
24417 break;
24418 case V4DFmode:
24419 if (TARGET_AVX512VL)
24421 gen = gen_avx512vl_vpermi2varv4df3;
24422 maskmode = V4DImode;
24424 break;
24425 case V8DFmode:
24426 if (TARGET_AVX512F)
24428 gen = gen_avx512f_vpermi2varv8df3;
24429 maskmode = V8DImode;
24431 break;
24432 default:
24433 break;
24436 if (gen == NULL)
24437 return false;
24439 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
24440 expanders, so the arguments are either in D, or in OP0, OP1 etc. */
24441 if (d)
24443 rtx vec[64];
24444 target = d->target;
24445 op0 = d->op0;
24446 op1 = d->op1;
24447 for (int i = 0; i < d->nelt; ++i)
24448 vec[i] = GEN_INT (d->perm[i]);
24449 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24452 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24453 return true;
24456 /* Expand a variable vector permutation. */
24458 void
24459 ix86_expand_vec_perm (rtx operands[])
24461 rtx target = operands[0];
24462 rtx op0 = operands[1];
24463 rtx op1 = operands[2];
24464 rtx mask = operands[3];
24465 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24466 machine_mode mode = GET_MODE (op0);
24467 machine_mode maskmode = GET_MODE (mask);
24468 int w, e, i;
24469 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24471 /* Number of elements in the vector. */
24472 w = GET_MODE_NUNITS (mode);
24473 e = GET_MODE_UNIT_SIZE (mode);
24474 gcc_assert (w <= 64);
24476 if (TARGET_AVX512F && one_operand_shuffle)
24478 rtx (*gen) (rtx, rtx, rtx) = NULL;
24479 switch (mode)
24481 case V16SImode:
24482 gen = gen_avx512f_permvarv16si;
24483 break;
24484 case V16SFmode:
24485 gen = gen_avx512f_permvarv16sf;
24486 break;
24487 case V8DImode:
24488 gen = gen_avx512f_permvarv8di;
24489 break;
24490 case V8DFmode:
24491 gen = gen_avx512f_permvarv8df;
24492 break;
24493 default:
24494 break;
24496 if (gen != NULL)
24498 emit_insn (gen (target, op0, mask));
24499 return;
24503 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24504 return;
24506 if (TARGET_AVX2)
24508 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24510 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24511 a constant shuffle operand. With a tiny bit of effort we can
24512 use VPERMD instead. A re-interpretation stall for V4DFmode is
24513 unfortunate but there's no avoiding it.
24514 Similarly, for V16HImode we don't have instructions for variable
24515 shuffling, while for V32QImode we can, after preparing suitable
24516 masks, use vpshufb; vpshufb; vpermq; vpor. */
24518 if (mode == V16HImode)
24520 maskmode = mode = V32QImode;
24521 w = 32;
24522 e = 1;
24524 else
24526 maskmode = mode = V8SImode;
24527 w = 8;
24528 e = 4;
24530 t1 = gen_reg_rtx (maskmode);
24532 /* Replicate the low bits of the V4DImode mask into V8SImode:
24533 mask = { A B C D }
24534 t1 = { A A B B C C D D }. */
24535 for (i = 0; i < w / 2; ++i)
24536 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24537 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24538 vt = force_reg (maskmode, vt);
24539 mask = gen_lowpart (maskmode, mask);
24540 if (maskmode == V8SImode)
24541 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24542 else
24543 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24545 /* Multiply the shuffle indices by two. */
24546 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24547 OPTAB_DIRECT);
24549 /* Add one to the odd shuffle indices:
24550 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24551 for (i = 0; i < w / 2; ++i)
24553 vec[i * 2] = const0_rtx;
24554 vec[i * 2 + 1] = const1_rtx;
24556 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24557 vt = validize_mem (force_const_mem (maskmode, vt));
24558 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24559 OPTAB_DIRECT);
24561 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24562 operands[3] = mask = t1;
24563 target = gen_reg_rtx (mode);
24564 op0 = gen_lowpart (mode, op0);
24565 op1 = gen_lowpart (mode, op1);
24568 switch (mode)
24570 case V8SImode:
24571 /* The VPERMD and VPERMPS instructions already properly ignore
24572 the high bits of the shuffle elements. No need for us to
24573 perform an AND ourselves. */
24574 if (one_operand_shuffle)
24576 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24577 if (target != operands[0])
24578 emit_move_insn (operands[0],
24579 gen_lowpart (GET_MODE (operands[0]), target));
24581 else
24583 t1 = gen_reg_rtx (V8SImode);
24584 t2 = gen_reg_rtx (V8SImode);
24585 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24586 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24587 goto merge_two;
24589 return;
24591 case V8SFmode:
24592 mask = gen_lowpart (V8SImode, mask);
24593 if (one_operand_shuffle)
24594 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24595 else
24597 t1 = gen_reg_rtx (V8SFmode);
24598 t2 = gen_reg_rtx (V8SFmode);
24599 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24600 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24601 goto merge_two;
24603 return;
24605 case V4SImode:
24606 /* By combining the two 128-bit input vectors into one 256-bit
24607 input vector, we can use VPERMD and VPERMPS for the full
24608 two-operand shuffle. */
24609 t1 = gen_reg_rtx (V8SImode);
24610 t2 = gen_reg_rtx (V8SImode);
24611 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24612 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24613 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24614 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24615 return;
24617 case V4SFmode:
24618 t1 = gen_reg_rtx (V8SFmode);
24619 t2 = gen_reg_rtx (V8SImode);
24620 mask = gen_lowpart (V4SImode, mask);
24621 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24622 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24623 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24624 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24625 return;
24627 case V32QImode:
24628 t1 = gen_reg_rtx (V32QImode);
24629 t2 = gen_reg_rtx (V32QImode);
24630 t3 = gen_reg_rtx (V32QImode);
24631 vt2 = GEN_INT (-128);
24632 for (i = 0; i < 32; i++)
24633 vec[i] = vt2;
24634 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24635 vt = force_reg (V32QImode, vt);
24636 for (i = 0; i < 32; i++)
24637 vec[i] = i < 16 ? vt2 : const0_rtx;
24638 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24639 vt2 = force_reg (V32QImode, vt2);
24640 /* From mask create two adjusted masks, which contain the same
24641 bits as mask in the low 7 bits of each vector element.
24642 The first mask will have the most significant bit clear
24643 if it requests element from the same 128-bit lane
24644 and MSB set if it requests element from the other 128-bit lane.
24645 The second mask will have the opposite values of the MSB,
24646 and additionally will have its 128-bit lanes swapped.
24647 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24648 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24649 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24650 stands for the other 12 bytes. */
24651 /* The bit that says whether an element comes from the same lane or the
24652 other lane is bit 4, so shift it up by 3 to the MSB position. */
24653 t5 = gen_reg_rtx (V4DImode);
24654 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24655 GEN_INT (3)));
24656 /* Clear MSB bits from the mask just in case it had them set. */
24657 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24658 /* After this t1 will have MSB set for elements from other lane. */
24659 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24660 /* Clear bits other than MSB. */
24661 emit_insn (gen_andv32qi3 (t1, t1, vt));
24662 /* Or in the lower bits from mask into t3. */
24663 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24664 /* And invert MSB bits in t1, so MSB is set for elements from the same
24665 lane. */
24666 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24667 /* Swap 128-bit lanes in t3. */
24668 t6 = gen_reg_rtx (V4DImode);
24669 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24670 const2_rtx, GEN_INT (3),
24671 const0_rtx, const1_rtx));
24672 /* And or in the lower bits from mask into t1. */
24673 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24674 if (one_operand_shuffle)
24676 /* Each of these shuffles will put 0s in places where
24677 element from the other 128-bit lane is needed, otherwise
24678 will shuffle in the requested value. */
24679 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24680 gen_lowpart (V32QImode, t6)));
24681 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24682 /* For t3 the 128-bit lanes are swapped again. */
24683 t7 = gen_reg_rtx (V4DImode);
24684 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24685 const2_rtx, GEN_INT (3),
24686 const0_rtx, const1_rtx));
24688 /* ORing both together yields the result. */
24688 emit_insn (gen_iorv32qi3 (target, t1,
24689 gen_lowpart (V32QImode, t7)));
24690 if (target != operands[0])
24691 emit_move_insn (operands[0],
24692 gen_lowpart (GET_MODE (operands[0]), target));
24693 return;
24696 t4 = gen_reg_rtx (V32QImode);
24697 /* Similar to the above one_operand_shuffle code,
24698 just repeated twice, once for each operand. The merge_two:
24699 code will merge the two results together. */
24700 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24701 gen_lowpart (V32QImode, t6)));
24702 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24703 gen_lowpart (V32QImode, t6)));
24704 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24705 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24706 t7 = gen_reg_rtx (V4DImode);
24707 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24708 const2_rtx, GEN_INT (3),
24709 const0_rtx, const1_rtx));
24710 t8 = gen_reg_rtx (V4DImode);
24711 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24712 const2_rtx, GEN_INT (3),
24713 const0_rtx, const1_rtx));
24714 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24715 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24716 t1 = t4;
24717 t2 = t3;
24718 goto merge_two;
24720 default:
24721 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24722 break;
24726 if (TARGET_XOP)
24728 /* The XOP VPPERM insn supports three inputs. By ignoring the
24729 one_operand_shuffle special case, we avoid creating another
24730 set of constant vectors in memory. */
24731 one_operand_shuffle = false;
24733 /* mask = mask & {2*w-1, ...} */
24734 vt = GEN_INT (2*w - 1);
24736 else
24738 /* mask = mask & {w-1, ...} */
24739 vt = GEN_INT (w - 1);
24742 for (i = 0; i < w; i++)
24743 vec[i] = vt;
24744 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24745 mask = expand_simple_binop (maskmode, AND, mask, vt,
24746 NULL_RTX, 0, OPTAB_DIRECT);
24748 /* For non-QImode operations, convert the word permutation control
24749 into a byte permutation control. */
24750 if (mode != V16QImode)
24752 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24753 GEN_INT (exact_log2 (e)),
24754 NULL_RTX, 0, OPTAB_DIRECT);
24756 /* Convert mask to vector of chars. */
24757 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24759 /* Replicate each of the input bytes into byte positions:
24760 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24761 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24762 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24763 for (i = 0; i < 16; ++i)
24764 vec[i] = GEN_INT (i/e * e);
24765 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24766 vt = validize_mem (force_const_mem (V16QImode, vt));
24767 if (TARGET_XOP)
24768 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24769 else
24770 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24772 /* Convert it into the byte positions by doing
24773 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24774 for (i = 0; i < 16; ++i)
24775 vec[i] = GEN_INT (i % e);
24776 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24777 vt = validize_mem (force_const_mem (V16QImode, vt));
24778 emit_insn (gen_addv16qi3 (mask, mask, vt));
24781 /* The actual shuffle operations all operate on V16QImode. */
24782 op0 = gen_lowpart (V16QImode, op0);
24783 op1 = gen_lowpart (V16QImode, op1);
24785 if (TARGET_XOP)
24787 if (GET_MODE (target) != V16QImode)
24788 target = gen_reg_rtx (V16QImode);
24789 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24790 if (target != operands[0])
24791 emit_move_insn (operands[0],
24792 gen_lowpart (GET_MODE (operands[0]), target));
24794 else if (one_operand_shuffle)
24796 if (GET_MODE (target) != V16QImode)
24797 target = gen_reg_rtx (V16QImode);
24798 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24799 if (target != operands[0])
24800 emit_move_insn (operands[0],
24801 gen_lowpart (GET_MODE (operands[0]), target));
24803 else
24805 rtx xops[6];
24806 bool ok;
24808 /* Shuffle the two input vectors independently. */
24809 t1 = gen_reg_rtx (V16QImode);
24810 t2 = gen_reg_rtx (V16QImode);
24811 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24812 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24814 merge_two:
24815 /* Then merge them together. The key is whether any given control
24816 element contained a bit set that indicates the second word. */
24817 mask = operands[3];
24818 vt = GEN_INT (w);
24819 if (maskmode == V2DImode && !TARGET_SSE4_1)
24821 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24822 more shuffle to convert the V2DI input mask into a V4SI
24823 input mask, at which point the masking done by
24824 expand_int_vcond will work as desired. */
24825 rtx t3 = gen_reg_rtx (V4SImode);
24826 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24827 const0_rtx, const0_rtx,
24828 const2_rtx, const2_rtx));
24829 mask = t3;
24830 maskmode = V4SImode;
24831 e = w = 4;
24834 for (i = 0; i < w; i++)
24835 vec[i] = vt;
24836 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24837 vt = force_reg (maskmode, vt);
24838 mask = expand_simple_binop (maskmode, AND, mask, vt,
24839 NULL_RTX, 0, OPTAB_DIRECT);
24841 if (GET_MODE (target) != mode)
24842 target = gen_reg_rtx (mode);
24843 xops[0] = target;
24844 xops[1] = gen_lowpart (mode, t2);
24845 xops[2] = gen_lowpart (mode, t1);
24846 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24847 xops[4] = mask;
24848 xops[5] = vt;
24849 ok = ix86_expand_int_vcond (xops);
24850 gcc_assert (ok);
24851 if (target != operands[0])
24852 emit_move_insn (operands[0],
24853 gen_lowpart (GET_MODE (operands[0]), target));
24857 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
24858 true if we should do zero extension, else sign extension. HIGH_P is
24859 true if we want the N/2 high elements, else the low elements. */
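/* Illustrative sketch of the SSE4.1 path below, for a V16QImode SRC with
   UNSIGNED_P and HIGH_P set (instruction names only hint at the expected
   assembly):

       tmp  = src >> 64;            shift the high 8 bytes down (psrldq)
       dest = zero_extend (tmp);    pmovzxbw: 8 QImode -> 8 HImode elements

   Without SSE4.1 the same result is obtained by interleaving SRC with a
   zero (or sign-mask) vector, e.g. punpckhbw against a zero register.  */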
24861 void
24862 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24864 machine_mode imode = GET_MODE (src);
24865 rtx tmp;
24867 if (TARGET_SSE4_1)
24869 rtx (*unpack)(rtx, rtx);
24870 rtx (*extract)(rtx, rtx) = NULL;
24871 machine_mode halfmode = BLKmode;
24873 switch (imode)
24875 case V64QImode:
24876 if (unsigned_p)
24877 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24878 else
24879 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24880 halfmode = V32QImode;
24881 extract
24882 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24883 break;
24884 case V32QImode:
24885 if (unsigned_p)
24886 unpack = gen_avx2_zero_extendv16qiv16hi2;
24887 else
24888 unpack = gen_avx2_sign_extendv16qiv16hi2;
24889 halfmode = V16QImode;
24890 extract
24891 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24892 break;
24893 case V32HImode:
24894 if (unsigned_p)
24895 unpack = gen_avx512f_zero_extendv16hiv16si2;
24896 else
24897 unpack = gen_avx512f_sign_extendv16hiv16si2;
24898 halfmode = V16HImode;
24899 extract
24900 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24901 break;
24902 case V16HImode:
24903 if (unsigned_p)
24904 unpack = gen_avx2_zero_extendv8hiv8si2;
24905 else
24906 unpack = gen_avx2_sign_extendv8hiv8si2;
24907 halfmode = V8HImode;
24908 extract
24909 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24910 break;
24911 case V16SImode:
24912 if (unsigned_p)
24913 unpack = gen_avx512f_zero_extendv8siv8di2;
24914 else
24915 unpack = gen_avx512f_sign_extendv8siv8di2;
24916 halfmode = V8SImode;
24917 extract
24918 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24919 break;
24920 case V8SImode:
24921 if (unsigned_p)
24922 unpack = gen_avx2_zero_extendv4siv4di2;
24923 else
24924 unpack = gen_avx2_sign_extendv4siv4di2;
24925 halfmode = V4SImode;
24926 extract
24927 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24928 break;
24929 case V16QImode:
24930 if (unsigned_p)
24931 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24932 else
24933 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24934 break;
24935 case V8HImode:
24936 if (unsigned_p)
24937 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24938 else
24939 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24940 break;
24941 case V4SImode:
24942 if (unsigned_p)
24943 unpack = gen_sse4_1_zero_extendv2siv2di2;
24944 else
24945 unpack = gen_sse4_1_sign_extendv2siv2di2;
24946 break;
24947 default:
24948 gcc_unreachable ();
24951 if (GET_MODE_SIZE (imode) >= 32)
24953 tmp = gen_reg_rtx (halfmode);
24954 emit_insn (extract (tmp, src));
24956 else if (high_p)
24958 /* Shift higher 8 bytes to lower 8 bytes. */
24959 tmp = gen_reg_rtx (V1TImode);
24960 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24961 GEN_INT (64)));
24962 tmp = gen_lowpart (imode, tmp);
24964 else
24965 tmp = src;
24967 emit_insn (unpack (dest, tmp));
24969 else
24971 rtx (*unpack)(rtx, rtx, rtx);
24973 switch (imode)
24975 case V16QImode:
24976 if (high_p)
24977 unpack = gen_vec_interleave_highv16qi;
24978 else
24979 unpack = gen_vec_interleave_lowv16qi;
24980 break;
24981 case V8HImode:
24982 if (high_p)
24983 unpack = gen_vec_interleave_highv8hi;
24984 else
24985 unpack = gen_vec_interleave_lowv8hi;
24986 break;
24987 case V4SImode:
24988 if (high_p)
24989 unpack = gen_vec_interleave_highv4si;
24990 else
24991 unpack = gen_vec_interleave_lowv4si;
24992 break;
24993 default:
24994 gcc_unreachable ();
24997 if (unsigned_p)
24998 tmp = force_reg (imode, CONST0_RTX (imode));
24999 else
25000 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25001 src, pc_rtx, pc_rtx);
25003 rtx tmp2 = gen_reg_rtx (imode);
25004 emit_insn (unpack (tmp2, src, tmp));
25005 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25009 /* Expand conditional increment or decrement using adc/sbb instructions.
25010 The default case using setcc followed by the conditional move can be
25011 done by generic code. */
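/* For illustration, the kind of source this expander targets is a
   conditional +/-1 such as

       unsigned int f (unsigned int x, unsigned int a, unsigned int b)
       {
         return x + (a < b);
       }

   where the comparison can leave its result in the carry flag, so the
   whole expression becomes roughly "cmp a, b; adc x, 0" instead of a
   setcc followed by an add or a conditional move.  Exact registers and
   operand order are chosen elsewhere; this is only a sketch.  */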
25012 bool
25013 ix86_expand_int_addcc (rtx operands[])
25015 enum rtx_code code = GET_CODE (operands[1]);
25016 rtx flags;
25017 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25018 rtx compare_op;
25019 rtx val = const0_rtx;
25020 bool fpcmp = false;
25021 machine_mode mode;
25022 rtx op0 = XEXP (operands[1], 0);
25023 rtx op1 = XEXP (operands[1], 1);
25025 if (operands[3] != const1_rtx
25026 && operands[3] != constm1_rtx)
25027 return false;
25028 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25029 return false;
25030 code = GET_CODE (compare_op);
25032 flags = XEXP (compare_op, 0);
25034 if (GET_MODE (flags) == CCFPmode
25035 || GET_MODE (flags) == CCFPUmode)
25037 fpcmp = true;
25038 code = ix86_fp_compare_code_to_integer (code);
25041 if (code != LTU)
25043 val = constm1_rtx;
25044 if (fpcmp)
25045 PUT_CODE (compare_op,
25046 reverse_condition_maybe_unordered
25047 (GET_CODE (compare_op)));
25048 else
25049 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25052 mode = GET_MODE (operands[0]);
25054 /* Construct either adc or sbb insn. */
25055 if ((code == LTU) == (operands[3] == constm1_rtx))
25057 switch (mode)
25059 case QImode:
25060 insn = gen_subqi3_carry;
25061 break;
25062 case HImode:
25063 insn = gen_subhi3_carry;
25064 break;
25065 case SImode:
25066 insn = gen_subsi3_carry;
25067 break;
25068 case DImode:
25069 insn = gen_subdi3_carry;
25070 break;
25071 default:
25072 gcc_unreachable ();
25075 else
25077 switch (mode)
25079 case QImode:
25080 insn = gen_addqi3_carry;
25081 break;
25082 case HImode:
25083 insn = gen_addhi3_carry;
25084 break;
25085 case SImode:
25086 insn = gen_addsi3_carry;
25087 break;
25088 case DImode:
25089 insn = gen_adddi3_carry;
25090 break;
25091 default:
25092 gcc_unreachable ();
25095 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25097 return true;
25101 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25102 but works for floating point parameters and non-offsettable memories.
25103 For pushes, it returns just stack offsets; the values will be saved
25104 in the right order. Maximally four parts are generated. */
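/* Worked example: on a 32-bit target the DFmode constant 1.0
   (IEEE-754 bit pattern 0x3FF0000000000000) is split into two SImode
   immediates,

       parts[0] = 0x00000000    low 32 bits
       parts[1] = 0x3FF00000    high 32 bits

   while an XFmode value yields three SImode parts and a TFmode value
   four.  */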
25106 static int
25107 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25109 int size;
25111 if (!TARGET_64BIT)
25112 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25113 else
25114 size = (GET_MODE_SIZE (mode) + 4) / 8;
25116 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25117 gcc_assert (size >= 2 && size <= 4);
25119 /* Optimize constant pool reference to immediates. This is used by fp
25120 moves, that force all constants to memory to allow combining. */
25121 if (MEM_P (operand) && MEM_READONLY_P (operand))
25123 rtx tmp = maybe_get_pool_constant (operand);
25124 if (tmp)
25125 operand = tmp;
25128 if (MEM_P (operand) && !offsettable_memref_p (operand))
25130 /* The only non-offsettable memories we handle are pushes. */
25131 int ok = push_operand (operand, VOIDmode);
25133 gcc_assert (ok);
25135 operand = copy_rtx (operand);
25136 PUT_MODE (operand, word_mode);
25137 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25138 return size;
25141 if (GET_CODE (operand) == CONST_VECTOR)
25143 machine_mode imode = int_mode_for_mode (mode);
25144 /* Caution: if we looked through a constant pool memory above,
25145 the operand may actually have a different mode now. That's
25146 ok, since we want to pun this all the way back to an integer. */
25147 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25148 gcc_assert (operand != NULL);
25149 mode = imode;
25152 if (!TARGET_64BIT)
25154 if (mode == DImode)
25155 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25156 else
25158 int i;
25160 if (REG_P (operand))
25162 gcc_assert (reload_completed);
25163 for (i = 0; i < size; i++)
25164 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25166 else if (offsettable_memref_p (operand))
25168 operand = adjust_address (operand, SImode, 0);
25169 parts[0] = operand;
25170 for (i = 1; i < size; i++)
25171 parts[i] = adjust_address (operand, SImode, 4 * i);
25173 else if (CONST_DOUBLE_P (operand))
25175 const REAL_VALUE_TYPE *r;
25176 long l[4];
25178 r = CONST_DOUBLE_REAL_VALUE (operand);
25179 switch (mode)
25181 case TFmode:
25182 real_to_target (l, r, mode);
25183 parts[3] = gen_int_mode (l[3], SImode);
25184 parts[2] = gen_int_mode (l[2], SImode);
25185 break;
25186 case XFmode:
25187 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25188 long double may not be 80-bit. */
25189 real_to_target (l, r, mode);
25190 parts[2] = gen_int_mode (l[2], SImode);
25191 break;
25192 case DFmode:
25193 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25194 break;
25195 default:
25196 gcc_unreachable ();
25198 parts[1] = gen_int_mode (l[1], SImode);
25199 parts[0] = gen_int_mode (l[0], SImode);
25201 else
25202 gcc_unreachable ();
25205 else
25207 if (mode == TImode)
25208 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25209 if (mode == XFmode || mode == TFmode)
25211 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25212 if (REG_P (operand))
25214 gcc_assert (reload_completed);
25215 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25216 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25218 else if (offsettable_memref_p (operand))
25220 operand = adjust_address (operand, DImode, 0);
25221 parts[0] = operand;
25222 parts[1] = adjust_address (operand, upper_mode, 8);
25224 else if (CONST_DOUBLE_P (operand))
25226 long l[4];
25228 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25230 /* real_to_target puts 32-bit pieces in each long. */
25231 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25232 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25233 << 32), DImode);
25235 if (upper_mode == SImode)
25236 parts[1] = gen_int_mode (l[2], SImode);
25237 else
25238 parts[1]
25239 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25240 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25241 << 32), DImode);
25243 else
25244 gcc_unreachable ();
25248 return size;
25251 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25252 All required insns are emitted here; the caller does not need to emit
25253 any normal moves afterwards. Operands 2-5 receive the destination
25254 parts in the correct order; operands 6-9 receive the source parts. */
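/* Sketch: on a 32-bit target a simple DImode move such as "load 8 bytes
   from MEM into EDX:EAX" is split into two SImode moves; the ordering
   code below makes sure that a destination register which also appears
   in the source address is written last, so the address is not
   clobbered before the other half has been loaded.  */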
25256 void
25257 ix86_split_long_move (rtx operands[])
25259 rtx part[2][4];
25260 int nparts, i, j;
25261 int push = 0;
25262 int collisions = 0;
25263 machine_mode mode = GET_MODE (operands[0]);
25264 bool collisionparts[4];
25266 /* The DFmode expanders may ask us to move a double.
25267 For a 64-bit target this is a single move. By hiding that
25268 fact here we simplify the i386.md splitters. */
25269 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25271 /* Optimize constant pool reference to immediates. This is used by
25272 fp moves, that force all constants to memory to allow combining. */
25274 if (MEM_P (operands[1])
25275 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25276 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25277 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25278 if (push_operand (operands[0], VOIDmode))
25280 operands[0] = copy_rtx (operands[0]);
25281 PUT_MODE (operands[0], word_mode);
25283 else
25284 operands[0] = gen_lowpart (DImode, operands[0]);
25285 operands[1] = gen_lowpart (DImode, operands[1]);
25286 emit_move_insn (operands[0], operands[1]);
25287 return;
25290 /* The only non-offsettable memory we handle is push. */
25291 if (push_operand (operands[0], VOIDmode))
25292 push = 1;
25293 else
25294 gcc_assert (!MEM_P (operands[0])
25295 || offsettable_memref_p (operands[0]));
25297 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25298 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25300 /* When emitting push, take care for source operands on the stack. */
25301 if (push && MEM_P (operands[1])
25302 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25304 rtx src_base = XEXP (part[1][nparts - 1], 0);
25306 /* Compensate for the stack decrement by 4. */
25307 if (!TARGET_64BIT && nparts == 3
25308 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25309 src_base = plus_constant (Pmode, src_base, 4);
25311 /* src_base refers to the stack pointer and is
25312 automatically decreased by the emitted push. */
25313 for (i = 0; i < nparts; i++)
25314 part[1][i] = change_address (part[1][i],
25315 GET_MODE (part[1][i]), src_base);
25318 /* We need to do copy in the right order in case an address register
25319 of the source overlaps the destination. */
25320 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25322 rtx tmp;
25324 for (i = 0; i < nparts; i++)
25326 collisionparts[i]
25327 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25328 if (collisionparts[i])
25329 collisions++;
25332 /* Collision in the middle part can be handled by reordering. */
25333 if (collisions == 1 && nparts == 3 && collisionparts [1])
25335 std::swap (part[0][1], part[0][2]);
25336 std::swap (part[1][1], part[1][2]);
25338 else if (collisions == 1
25339 && nparts == 4
25340 && (collisionparts [1] || collisionparts [2]))
25342 if (collisionparts [1])
25344 std::swap (part[0][1], part[0][2]);
25345 std::swap (part[1][1], part[1][2]);
25347 else
25349 std::swap (part[0][2], part[0][3]);
25350 std::swap (part[1][2], part[1][3]);
25354 /* If there are more collisions, we can't handle it by reordering.
25355 Do an lea to the last part and use only one colliding move. */
25356 else if (collisions > 1)
25358 rtx base, addr, tls_base = NULL_RTX;
25360 collisions = 1;
25362 base = part[0][nparts - 1];
25364 /* Handle the case when the last part isn't valid for lea.
25365 Happens in 64-bit mode storing the 12-byte XFmode. */
25366 if (GET_MODE (base) != Pmode)
25367 base = gen_rtx_REG (Pmode, REGNO (base));
25369 addr = XEXP (part[1][0], 0);
25370 if (TARGET_TLS_DIRECT_SEG_REFS)
25372 struct ix86_address parts;
25373 int ok = ix86_decompose_address (addr, &parts);
25374 gcc_assert (ok);
25375 if (parts.seg == DEFAULT_TLS_SEG_REG)
25377 /* It is not valid to use %gs: or %fs: in
25378 lea though, so we need to remove it from the
25379 address used for lea and add it to each individual
25380 memory load instead. */
25381 addr = copy_rtx (addr);
25382 rtx *x = &addr;
25383 while (GET_CODE (*x) == PLUS)
25385 for (i = 0; i < 2; i++)
25387 rtx u = XEXP (*x, i);
25388 if (GET_CODE (u) == ZERO_EXTEND)
25389 u = XEXP (u, 0);
25390 if (GET_CODE (u) == UNSPEC
25391 && XINT (u, 1) == UNSPEC_TP)
25393 tls_base = XEXP (*x, i);
25394 *x = XEXP (*x, 1 - i);
25395 break;
25398 if (tls_base)
25399 break;
25400 x = &XEXP (*x, 0);
25402 gcc_assert (tls_base);
25405 emit_insn (gen_rtx_SET (base, addr));
25406 if (tls_base)
25407 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
25408 part[1][0] = replace_equiv_address (part[1][0], base);
25409 for (i = 1; i < nparts; i++)
25411 if (tls_base)
25412 base = copy_rtx (base);
25413 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25414 part[1][i] = replace_equiv_address (part[1][i], tmp);
25419 if (push)
25421 if (!TARGET_64BIT)
25423 if (nparts == 3)
25425 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25426 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25427 stack_pointer_rtx, GEN_INT (-4)));
25428 emit_move_insn (part[0][2], part[1][2]);
25430 else if (nparts == 4)
25432 emit_move_insn (part[0][3], part[1][3]);
25433 emit_move_insn (part[0][2], part[1][2]);
25436 else
25438 /* In 64-bit mode we don't have a 32-bit push available. If this is
25439 a register, that is OK - we will just use the larger counterpart.
25440 We also retype memory - this comes from an attempt to avoid a REX
25441 prefix when moving the second half of a TFmode value. */
25442 if (GET_MODE (part[1][1]) == SImode)
25444 switch (GET_CODE (part[1][1]))
25446 case MEM:
25447 part[1][1] = adjust_address (part[1][1], DImode, 0);
25448 break;
25450 case REG:
25451 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25452 break;
25454 default:
25455 gcc_unreachable ();
25458 if (GET_MODE (part[1][0]) == SImode)
25459 part[1][0] = part[1][1];
25462 emit_move_insn (part[0][1], part[1][1]);
25463 emit_move_insn (part[0][0], part[1][0]);
25464 return;
25467 /* Choose correct order to not overwrite the source before it is copied. */
25468 if ((REG_P (part[0][0])
25469 && REG_P (part[1][1])
25470 && (REGNO (part[0][0]) == REGNO (part[1][1])
25471 || (nparts == 3
25472 && REGNO (part[0][0]) == REGNO (part[1][2]))
25473 || (nparts == 4
25474 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25475 || (collisions > 0
25476 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25478 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25480 operands[2 + i] = part[0][j];
25481 operands[6 + i] = part[1][j];
25484 else
25486 for (i = 0; i < nparts; i++)
25488 operands[2 + i] = part[0][i];
25489 operands[6 + i] = part[1][i];
25493 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25494 if (optimize_insn_for_size_p ())
25496 for (j = 0; j < nparts - 1; j++)
25497 if (CONST_INT_P (operands[6 + j])
25498 && operands[6 + j] != const0_rtx
25499 && REG_P (operands[2 + j]))
25500 for (i = j; i < nparts - 1; i++)
25501 if (CONST_INT_P (operands[7 + i])
25502 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25503 operands[7 + i] = operands[2 + j];
25506 for (i = 0; i < nparts; i++)
25507 emit_move_insn (operands[2 + i], operands[6 + i]);
25509 return;
25512 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25513 left shift by a constant, either using a single shift or
25514 a sequence of add instructions. */
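/* Sketch: for a shift count of 1, or whenever COUNT additions are no
   more expensive than one constant shift (and we are not optimizing for
   size), a shift such as

       x <<= 2;

   is emitted as the equivalent additions

       x += x;
       x += x;

   otherwise a single shift-by-immediate is used.  */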
25516 static void
25517 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25519 rtx (*insn)(rtx, rtx, rtx);
25521 if (count == 1
25522 || (count * ix86_cost->add <= ix86_cost->shift_const
25523 && !optimize_insn_for_size_p ()))
25525 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25526 while (count-- > 0)
25527 emit_insn (insn (operand, operand, operand));
25529 else
25531 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25532 emit_insn (insn (operand, operand, GEN_INT (count)));
25536 void
25537 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25539 rtx (*gen_ashl3)(rtx, rtx, rtx);
25540 rtx (*gen_shld)(rtx, rtx, rtx);
25541 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25543 rtx low[2], high[2];
25544 int count;
25546 if (CONST_INT_P (operands[2]))
25548 split_double_mode (mode, operands, 2, low, high);
25549 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25551 if (count >= half_width)
25553 emit_move_insn (high[0], low[1]);
25554 emit_move_insn (low[0], const0_rtx);
25556 if (count > half_width)
25557 ix86_expand_ashl_const (high[0], count - half_width, mode);
25559 else
25561 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25563 if (!rtx_equal_p (operands[0], operands[1]))
25564 emit_move_insn (operands[0], operands[1]);
25566 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25567 ix86_expand_ashl_const (low[0], count, mode);
25569 return;
25572 split_double_mode (mode, operands, 1, low, high);
25574 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25576 if (operands[1] == const1_rtx)
25578 /* Assuming we've chosen QImode-capable registers, 1 << N
25579 can be done with two 32/64-bit shifts, no branches, no cmoves. */
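/* Sketch of the emitted sequence for DImode 1 << n on a 32-bit target
   (the hardware shifts already mask the count by 31):

       low  = ((n & 32) == 0);    sete into the low byte of LOW
       high = ((n & 32) != 0);    setne into the low byte of HIGH
       low <<= n;  high <<= n;                                      */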
25580 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25582 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25584 ix86_expand_clear (low[0]);
25585 ix86_expand_clear (high[0]);
25586 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25588 d = gen_lowpart (QImode, low[0]);
25589 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25590 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25591 emit_insn (gen_rtx_SET (d, s));
25593 d = gen_lowpart (QImode, high[0]);
25594 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25595 s = gen_rtx_NE (QImode, flags, const0_rtx);
25596 emit_insn (gen_rtx_SET (d, s));
25599 /* Otherwise, we can get the same results by manually performing
25600 a bit extract operation on bit 5/6, and then performing the two
25601 shifts. The two methods of getting 0/1 into low/high are exactly
25602 the same size. Avoiding the shift in the bit extract case helps
25603 pentium4 a bit; no one else seems to care much either way. */
25604 else
25606 machine_mode half_mode;
25607 rtx (*gen_lshr3)(rtx, rtx, rtx);
25608 rtx (*gen_and3)(rtx, rtx, rtx);
25609 rtx (*gen_xor3)(rtx, rtx, rtx);
25610 HOST_WIDE_INT bits;
25611 rtx x;
25613 if (mode == DImode)
25615 half_mode = SImode;
25616 gen_lshr3 = gen_lshrsi3;
25617 gen_and3 = gen_andsi3;
25618 gen_xor3 = gen_xorsi3;
25619 bits = 5;
25621 else
25623 half_mode = DImode;
25624 gen_lshr3 = gen_lshrdi3;
25625 gen_and3 = gen_anddi3;
25626 gen_xor3 = gen_xordi3;
25627 bits = 6;
25630 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25631 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25632 else
25633 x = gen_lowpart (half_mode, operands[2]);
25634 emit_insn (gen_rtx_SET (high[0], x));
25636 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25637 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25638 emit_move_insn (low[0], high[0]);
25639 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25642 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25643 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25644 return;
25647 if (operands[1] == constm1_rtx)
25649 /* For -1 << N, we can avoid the shld instruction, because we
25650 know that we're shifting 0...31/63 ones into a -1. */
25651 emit_move_insn (low[0], constm1_rtx);
25652 if (optimize_insn_for_size_p ())
25653 emit_move_insn (high[0], low[0]);
25654 else
25655 emit_move_insn (high[0], constm1_rtx);
25657 else
25659 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25661 if (!rtx_equal_p (operands[0], operands[1]))
25662 emit_move_insn (operands[0], operands[1]);
25664 split_double_mode (mode, operands, 1, low, high);
25665 emit_insn (gen_shld (high[0], low[0], operands[2]));
25668 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25670 if (TARGET_CMOVE && scratch)
25672 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25673 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25675 ix86_expand_clear (scratch);
25676 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25678 else
25680 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25681 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25683 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25687 void
25688 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25690 rtx (*gen_ashr3)(rtx, rtx, rtx)
25691 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25692 rtx (*gen_shrd)(rtx, rtx, rtx);
25693 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25695 rtx low[2], high[2];
25696 int count;
25698 if (CONST_INT_P (operands[2]))
25700 split_double_mode (mode, operands, 2, low, high);
25701 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25703 if (count == GET_MODE_BITSIZE (mode) - 1)
25705 emit_move_insn (high[0], high[1]);
25706 emit_insn (gen_ashr3 (high[0], high[0],
25707 GEN_INT (half_width - 1)));
25708 emit_move_insn (low[0], high[0]);
25711 else if (count >= half_width)
25713 emit_move_insn (low[0], high[1]);
25714 emit_move_insn (high[0], low[0]);
25715 emit_insn (gen_ashr3 (high[0], high[0],
25716 GEN_INT (half_width - 1)));
25718 if (count > half_width)
25719 emit_insn (gen_ashr3 (low[0], low[0],
25720 GEN_INT (count - half_width)));
25722 else
25724 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25726 if (!rtx_equal_p (operands[0], operands[1]))
25727 emit_move_insn (operands[0], operands[1]);
25729 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25730 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25733 else
25735 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25737 if (!rtx_equal_p (operands[0], operands[1]))
25738 emit_move_insn (operands[0], operands[1]);
25740 split_double_mode (mode, operands, 1, low, high);
25742 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25743 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25745 if (TARGET_CMOVE && scratch)
25747 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25748 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25750 emit_move_insn (scratch, high[0]);
25751 emit_insn (gen_ashr3 (scratch, scratch,
25752 GEN_INT (half_width - 1)));
25753 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25754 scratch));
25756 else
25758 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25759 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25761 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25766 void
25767 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25769 rtx (*gen_lshr3)(rtx, rtx, rtx)
25770 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25771 rtx (*gen_shrd)(rtx, rtx, rtx);
25772 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25774 rtx low[2], high[2];
25775 int count;
25777 if (CONST_INT_P (operands[2]))
25779 split_double_mode (mode, operands, 2, low, high);
25780 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25782 if (count >= half_width)
25784 emit_move_insn (low[0], high[1]);
25785 ix86_expand_clear (high[0]);
25787 if (count > half_width)
25788 emit_insn (gen_lshr3 (low[0], low[0],
25789 GEN_INT (count - half_width)));
25791 else
25793 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25795 if (!rtx_equal_p (operands[0], operands[1]))
25796 emit_move_insn (operands[0], operands[1]);
25798 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25799 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25802 else
25804 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25806 if (!rtx_equal_p (operands[0], operands[1]))
25807 emit_move_insn (operands[0], operands[1]);
25809 split_double_mode (mode, operands, 1, low, high);
25811 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25812 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25814 if (TARGET_CMOVE && scratch)
25816 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25817 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25819 ix86_expand_clear (scratch);
25820 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25821 scratch));
25823 else
25825 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25826 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25828 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25833 /* Predict just emitted jump instruction to be taken with probability PROB. */
25834 static void
25835 predict_jump (int prob)
25837 rtx insn = get_last_insn ();
25838 gcc_assert (JUMP_P (insn));
25839 add_int_reg_note (insn, REG_BR_PROB, prob);
25842 /* Helper function for the string operations below. Test whether VARIABLE
25843 is aligned to VALUE bytes; if it is, jump to the returned label. */
25844 static rtx_code_label *
25845 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25847 rtx_code_label *label = gen_label_rtx ();
25848 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25849 if (GET_MODE (variable) == DImode)
25850 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25851 else
25852 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25853 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25854 1, label);
25855 if (epilogue)
25856 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25857 else
25858 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25859 return label;
25862 /* Adjust COUNTER by the VALUE. */
25863 static void
25864 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25866 rtx (*gen_add)(rtx, rtx, rtx)
25867 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25869 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25872 /* Zero extend possibly SImode EXP to Pmode register. */
25874 ix86_zero_extend_to_Pmode (rtx exp)
25876 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25879 /* Divide COUNTREG by SCALE. */
25880 static rtx
25881 scale_counter (rtx countreg, int scale)
25883 rtx sc;
25885 if (scale == 1)
25886 return countreg;
25887 if (CONST_INT_P (countreg))
25888 return GEN_INT (INTVAL (countreg) / scale);
25889 gcc_assert (REG_P (countreg));
25891 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25892 GEN_INT (exact_log2 (scale)),
25893 NULL, 1, OPTAB_DIRECT);
25894 return sc;
25897 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25898 DImode for constant loop counts. */
25900 static machine_mode
25901 counter_mode (rtx count_exp)
25903 if (GET_MODE (count_exp) != VOIDmode)
25904 return GET_MODE (count_exp);
25905 if (!CONST_INT_P (count_exp))
25906 return Pmode;
25907 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25908 return DImode;
25909 return SImode;
25912 /* Copy the address to a Pmode register. This is used for x32 to
25913 truncate DImode TLS address to a SImode register. */
25915 static rtx
25916 ix86_copy_addr_to_reg (rtx addr)
25918 rtx reg;
25919 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25921 reg = copy_addr_to_reg (addr);
25922 REG_POINTER (reg) = 1;
25923 return reg;
25925 else
25927 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25928 reg = copy_to_mode_reg (DImode, addr);
25929 REG_POINTER (reg) = 1;
25930 return gen_rtx_SUBREG (SImode, reg, 0);
25934 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
25935 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
25936 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
25937 loop to set memory with VALUE (supposed to be in MODE).
25939 The size is rounded down to a whole number of chunks moved at once.
25940 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
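/* Shape of the generated loop, assuming MODE == SImode and UNROLL == 4
   (16 bytes per iteration); this is only a sketch of the RTL emitted
   below:

       size = count & ~15;  iter = 0;
     top:
       t0 = src[iter];      t1 = src[iter + 4];
       t2 = src[iter + 8];  t3 = src[iter + 12];
       dst[iter] = t0;      dst[iter + 4] = t1;
       dst[iter + 8] = t2;  dst[iter + 12] = t3;
       iter += 16;
       if (iter < size) goto top;
       dst += iter;  src += iter;

   For ISSETMEM the loads are replaced by stores of VALUE.  */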
25943 static void
25944 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25945 rtx destptr, rtx srcptr, rtx value,
25946 rtx count, machine_mode mode, int unroll,
25947 int expected_size, bool issetmem)
25949 rtx_code_label *out_label, *top_label;
25950 rtx iter, tmp;
25951 machine_mode iter_mode = counter_mode (count);
25952 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25953 rtx piece_size = GEN_INT (piece_size_n);
25954 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25955 rtx size;
25956 int i;
25958 top_label = gen_label_rtx ();
25959 out_label = gen_label_rtx ();
25960 iter = gen_reg_rtx (iter_mode);
25962 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25963 NULL, 1, OPTAB_DIRECT);
25964 /* Those two should combine. */
25965 if (piece_size == const1_rtx)
25967 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25968 true, out_label);
25969 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25971 emit_move_insn (iter, const0_rtx);
25973 emit_label (top_label);
25975 tmp = convert_modes (Pmode, iter_mode, iter, true);
25977 /* This assert could be relaxed - in that case we'll need to compute
25978 the smallest power of two not smaller than PIECE_SIZE_N and pass it
25979 to offset_address. */
25980 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25981 destmem = offset_address (destmem, tmp, piece_size_n);
25982 destmem = adjust_address (destmem, mode, 0);
25984 if (!issetmem)
25986 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25987 srcmem = adjust_address (srcmem, mode, 0);
25989 /* When unrolling for chips that reorder memory reads and writes,
25990 we can save registers by using a single temporary.
25991 Also, using 4 temporaries is overkill in 32-bit mode. */
25992 if (!TARGET_64BIT && 0)
25994 for (i = 0; i < unroll; i++)
25996 if (i)
25998 destmem =
25999 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26000 srcmem =
26001 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26003 emit_move_insn (destmem, srcmem);
26006 else
26008 rtx tmpreg[4];
26009 gcc_assert (unroll <= 4);
26010 for (i = 0; i < unroll; i++)
26012 tmpreg[i] = gen_reg_rtx (mode);
26013 if (i)
26015 srcmem =
26016 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26018 emit_move_insn (tmpreg[i], srcmem);
26020 for (i = 0; i < unroll; i++)
26022 if (i)
26024 destmem =
26025 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26027 emit_move_insn (destmem, tmpreg[i]);
26031 else
26032 for (i = 0; i < unroll; i++)
26034 if (i)
26035 destmem =
26036 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26037 emit_move_insn (destmem, value);
26040 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26041 true, OPTAB_LIB_WIDEN);
26042 if (tmp != iter)
26043 emit_move_insn (iter, tmp);
26045 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26046 true, top_label);
26047 if (expected_size != -1)
26049 expected_size /= GET_MODE_SIZE (mode) * unroll;
26050 if (expected_size == 0)
26051 predict_jump (0);
26052 else if (expected_size > REG_BR_PROB_BASE)
26053 predict_jump (REG_BR_PROB_BASE - 1);
26054 else
26055 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26057 else
26058 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26059 iter = ix86_zero_extend_to_Pmode (iter);
26060 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26061 true, OPTAB_LIB_WIDEN);
26062 if (tmp != destptr)
26063 emit_move_insn (destptr, tmp);
26064 if (!issetmem)
26066 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26067 true, OPTAB_LIB_WIDEN);
26068 if (tmp != srcptr)
26069 emit_move_insn (srcptr, tmp);
26071 emit_label (out_label);
26074 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
26075 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26076 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26077 In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26078 ORIG_VALUE is the original value passed to memset to fill the memory with.
26079 Other arguments have the same meaning as for the previous function. */
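/* As a sketch, for a known byte count that is a multiple of 4 the code
   below widens QImode to SImode and the result is essentially

       mov  ecx, count / 4
       rep  movsd              ; or "rep stosd" when storing zero

   with esi/edi (rsi/rdi) holding the source and destination pointers.  */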
26081 static void
26082 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26083 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26084 rtx count,
26085 machine_mode mode, bool issetmem)
26087 rtx destexp;
26088 rtx srcexp;
26089 rtx countreg;
26090 HOST_WIDE_INT rounded_count;
26092 /* If possible, it is shorter to use rep movs.
26093 TODO: Maybe it is better to move this logic to decide_alg. */
26094 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26095 && (!issetmem || orig_value == const0_rtx))
26096 mode = SImode;
26098 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26099 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26101 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26102 GET_MODE_SIZE (mode)));
26103 if (mode != QImode)
26105 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26106 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26107 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26109 else
26110 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26111 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26113 rounded_count
26114 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26115 destmem = shallow_copy_rtx (destmem);
26116 set_mem_size (destmem, rounded_count);
26118 else if (MEM_SIZE_KNOWN_P (destmem))
26119 clear_mem_size (destmem);
26121 if (issetmem)
26123 value = force_reg (mode, gen_lowpart (mode, value));
26124 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26126 else
26128 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26129 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26130 if (mode != QImode)
26132 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26133 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26134 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26136 else
26137 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26138 if (CONST_INT_P (count))
26140 rounded_count
26141 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26142 srcmem = shallow_copy_rtx (srcmem);
26143 set_mem_size (srcmem, rounded_count);
26145 else
26147 if (MEM_SIZE_KNOWN_P (srcmem))
26148 clear_mem_size (srcmem);
26150 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26151 destexp, srcexp));
26155 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26156 DESTMEM.
26157 SRCMEM is passed by pointer and is updated on return.
26158 The return value is the updated DESTMEM. */
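/* Sketch: when SIZE_TO_MOVE is a power of two (as in the prologue and
   epilogue callers below), a 16-byte move on an SSE target is a single
   vector load plus store through a temporary register, with both
   pointers then advanced by 16; without vector support the same call
   falls back to word_mode pieces, e.g. two 8-byte moves on x86-64.  */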
26159 static rtx
26160 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26161 HOST_WIDE_INT size_to_move)
26163 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26164 enum insn_code code;
26165 machine_mode move_mode;
26166 int piece_size, i;
26168 /* Find the widest mode in which we could perform moves.
26169 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and
26170 halve it until a move of that size is supported. */
26171 piece_size = 1 << floor_log2 (size_to_move);
26172 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26173 code = optab_handler (mov_optab, move_mode);
26174 while (code == CODE_FOR_nothing && piece_size > 1)
26176 piece_size >>= 1;
26177 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26178 code = optab_handler (mov_optab, move_mode);
26181 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26182 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26183 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26185 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26186 move_mode = mode_for_vector (word_mode, nunits);
26187 code = optab_handler (mov_optab, move_mode);
26188 if (code == CODE_FOR_nothing)
26190 move_mode = word_mode;
26191 piece_size = GET_MODE_SIZE (move_mode);
26192 code = optab_handler (mov_optab, move_mode);
26195 gcc_assert (code != CODE_FOR_nothing);
26197 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26198 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26200 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26201 gcc_assert (size_to_move % piece_size == 0);
26202 adjust = GEN_INT (piece_size);
26203 for (i = 0; i < size_to_move; i += piece_size)
26205 /* We move from memory to memory, so we'll need to do it via
26206 a temporary register. */
26207 tempreg = gen_reg_rtx (move_mode);
26208 emit_insn (GEN_FCN (code) (tempreg, src));
26209 emit_insn (GEN_FCN (code) (dst, tempreg));
26211 emit_move_insn (destptr,
26212 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26213 emit_move_insn (srcptr,
26214 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26216 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26217 piece_size);
26218 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26219 piece_size);
26222 /* Update DST and SRC rtx. */
26223 *srcmem = src;
26224 return dst;
26227 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
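/* Worked example: with MAX_SIZE == 16 and a constant COUNT of 1037, the
   epilogue size is 1037 % 16 = 13 = 8 + 4 + 1, so the loop below emits
   one 8-byte, one 4-byte and one 1-byte move, walking the set bits of
   the residual count from the highest downwards.  */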
26228 static void
26229 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26230 rtx destptr, rtx srcptr, rtx count, int max_size)
26232 rtx src, dest;
26233 if (CONST_INT_P (count))
26235 HOST_WIDE_INT countval = INTVAL (count);
26236 HOST_WIDE_INT epilogue_size = countval % max_size;
26237 int i;
26239 /* For now MAX_SIZE should be a power of 2. This assert could be
26240 relaxed, but it'll require a bit more complicated epilogue
26241 expanding. */
26242 gcc_assert ((max_size & (max_size - 1)) == 0);
26243 for (i = max_size; i >= 1; i >>= 1)
26245 if (epilogue_size & i)
26246 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26248 return;
26250 if (max_size > 8)
26252 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26253 count, 1, OPTAB_DIRECT);
26254 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26255 count, QImode, 1, 4, false);
26256 return;
26259 /* When there are stringops, we can cheaply increase dest and src pointers.
26260 Otherwise we save code size by maintaining an offset (zero is readily
26261 available from the preceding rep operation) and using x86 addressing modes. */
26263 if (TARGET_SINGLE_STRINGOP)
26265 if (max_size > 4)
26267 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26268 src = change_address (srcmem, SImode, srcptr);
26269 dest = change_address (destmem, SImode, destptr);
26270 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26271 emit_label (label);
26272 LABEL_NUSES (label) = 1;
26274 if (max_size > 2)
26276 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26277 src = change_address (srcmem, HImode, srcptr);
26278 dest = change_address (destmem, HImode, destptr);
26279 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26280 emit_label (label);
26281 LABEL_NUSES (label) = 1;
26283 if (max_size > 1)
26285 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26286 src = change_address (srcmem, QImode, srcptr);
26287 dest = change_address (destmem, QImode, destptr);
26288 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26289 emit_label (label);
26290 LABEL_NUSES (label) = 1;
26293 else
26295 rtx offset = force_reg (Pmode, const0_rtx);
26296 rtx tmp;
26298 if (max_size > 4)
26300 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26301 src = change_address (srcmem, SImode, srcptr);
26302 dest = change_address (destmem, SImode, destptr);
26303 emit_move_insn (dest, src);
26304 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26305 true, OPTAB_LIB_WIDEN);
26306 if (tmp != offset)
26307 emit_move_insn (offset, tmp);
26308 emit_label (label);
26309 LABEL_NUSES (label) = 1;
26311 if (max_size > 2)
26313 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26314 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26315 src = change_address (srcmem, HImode, tmp);
26316 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26317 dest = change_address (destmem, HImode, tmp);
26318 emit_move_insn (dest, src);
26319 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26320 true, OPTAB_LIB_WIDEN);
26321 if (tmp != offset)
26322 emit_move_insn (offset, tmp);
26323 emit_label (label);
26324 LABEL_NUSES (label) = 1;
26326 if (max_size > 1)
26328 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26329 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26330 src = change_address (srcmem, QImode, tmp);
26331 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26332 dest = change_address (destmem, QImode, tmp);
26333 emit_move_insn (dest, src);
26334 emit_label (label);
26335 LABEL_NUSES (label) = 1;
26340 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26341 with value PROMOTED_VAL.
26342 DESTPTR is advanced as the stores are emitted.
26343 The return value is the updated DESTMEM. */
26344 static rtx
26345 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26346 HOST_WIDE_INT size_to_move)
26348 rtx dst = destmem, adjust;
26349 enum insn_code code;
26350 machine_mode move_mode;
26351 int piece_size, i;
26353 /* Choose the widest mode in which to perform the stores:
26354 start with the mode of PROMOTED_VAL and narrow it (taking a lowpart
26355 of the value) when SIZE_TO_MOVE is smaller than that mode. */
26356 move_mode = GET_MODE (promoted_val);
26357 if (move_mode == VOIDmode)
26358 move_mode = QImode;
26359 if (size_to_move < GET_MODE_SIZE (move_mode))
26361 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
26362 promoted_val = gen_lowpart (move_mode, promoted_val);
26364 piece_size = GET_MODE_SIZE (move_mode);
26365 code = optab_handler (mov_optab, move_mode);
26366 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26368 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26370 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26371 gcc_assert (size_to_move % piece_size == 0);
26372 adjust = GEN_INT (piece_size);
26373 for (i = 0; i < size_to_move; i += piece_size)
26375 if (piece_size <= GET_MODE_SIZE (word_mode))
26377 emit_insn (gen_strset (destptr, dst, promoted_val));
26378 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26379 piece_size);
26380 continue;
26383 emit_insn (GEN_FCN (code) (dst, promoted_val));
26385 emit_move_insn (destptr,
26386 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26388 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26389 piece_size);
26392 /* Update DST rtx. */
26393 return dst;
26395 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26396 static void
26397 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26398 rtx count, int max_size)
26400 count =
26401 expand_simple_binop (counter_mode (count), AND, count,
26402 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26403 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26404 gen_lowpart (QImode, value), count, QImode,
26405 1, max_size / 2, true);
26408 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26409 static void
26410 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26411 rtx count, int max_size)
26413 rtx dest;
26415 if (CONST_INT_P (count))
26417 HOST_WIDE_INT countval = INTVAL (count);
26418 HOST_WIDE_INT epilogue_size = countval % max_size;
26419 int i;
26421 /* For now MAX_SIZE should be a power of 2. This assert could be
26422 relaxed, but it'll require a bit more complicated epilogue
26423 expanding. */
26424 gcc_assert ((max_size & (max_size - 1)) == 0);
26425 for (i = max_size; i >= 1; i >>= 1)
26427 if (epilogue_size & i)
26429 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26430 destmem = emit_memset (destmem, destptr, vec_value, i);
26431 else
26432 destmem = emit_memset (destmem, destptr, value, i);
26435 return;
26437 if (max_size > 32)
26439 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26440 return;
26442 if (max_size > 16)
26444 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26445 if (TARGET_64BIT)
26447 dest = change_address (destmem, DImode, destptr);
26448 emit_insn (gen_strset (destptr, dest, value));
26449 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26450 emit_insn (gen_strset (destptr, dest, value));
26452 else
26454 dest = change_address (destmem, SImode, destptr);
26455 emit_insn (gen_strset (destptr, dest, value));
26456 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26457 emit_insn (gen_strset (destptr, dest, value));
26458 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26459 emit_insn (gen_strset (destptr, dest, value));
26460 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26461 emit_insn (gen_strset (destptr, dest, value));
26463 emit_label (label);
26464 LABEL_NUSES (label) = 1;
26466 if (max_size > 8)
26468 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26469 if (TARGET_64BIT)
26471 dest = change_address (destmem, DImode, destptr);
26472 emit_insn (gen_strset (destptr, dest, value));
26474 else
26476 dest = change_address (destmem, SImode, destptr);
26477 emit_insn (gen_strset (destptr, dest, value));
26478 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26479 emit_insn (gen_strset (destptr, dest, value));
26481 emit_label (label);
26482 LABEL_NUSES (label) = 1;
26484 if (max_size > 4)
26486 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26487 dest = change_address (destmem, SImode, destptr);
26488 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26489 emit_label (label);
26490 LABEL_NUSES (label) = 1;
26492 if (max_size > 2)
26494 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26495 dest = change_address (destmem, HImode, destptr);
26496 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26497 emit_label (label);
26498 LABEL_NUSES (label) = 1;
26500 if (max_size > 1)
26502 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26503 dest = change_address (destmem, QImode, destptr);
26504 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26505 emit_label (label);
26506 LABEL_NUSES (label) = 1;
26510 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26511 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26512 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26513 ignored.
26514 The return value is the updated DESTMEM. */
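/* Worked example: with ALIGN == 1 and DESIRED_ALIGNMENT == 16 the loop
   below emits a conditional 1-, 2-, 4- and 8-byte copy (or store), each
   taken only when the corresponding low bit of DESTPTR is set, so at
   most 15 bytes are handled before the destination is 16-byte aligned;
   COUNT is decreased by the same amount on each taken path.  */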
26515 static rtx
26516 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26517 rtx destptr, rtx srcptr, rtx value,
26518 rtx vec_value, rtx count, int align,
26519 int desired_alignment, bool issetmem)
26521 int i;
26522 for (i = 1; i < desired_alignment; i <<= 1)
26524 if (align <= i)
26526 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26527 if (issetmem)
26529 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26530 destmem = emit_memset (destmem, destptr, vec_value, i);
26531 else
26532 destmem = emit_memset (destmem, destptr, value, i);
26534 else
26535 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26536 ix86_adjust_counter (count, i);
26537 emit_label (label);
26538 LABEL_NUSES (label) = 1;
26539 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26542 return destmem;
26545 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26546 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26547 and jump to DONE_LABEL. */
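/* Worked example: with SIZE == 4 and a runtime count of 6 (COUNT & 4 is
   nonzero), the code below copies bytes 0..3 and then bytes
   count-4..count-1, i.e. bytes 2..5; the two 4-byte moves overlap in the
   middle, which is harmless here and avoids any further branching for
   counts in the 4..7 range.  */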
26548 static void
26549 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26550 rtx destptr, rtx srcptr,
26551 rtx value, rtx vec_value,
26552 rtx count, int size,
26553 rtx done_label, bool issetmem)
26555 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26556 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
26557 rtx modesize;
26558 int n;
26560 /* If we do not have a vector value to copy, we must reduce the size. */
26561 if (issetmem)
26563 if (!vec_value)
26565 if (GET_MODE (value) == VOIDmode && size > 8)
26566 mode = Pmode;
26567 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26568 mode = GET_MODE (value);
26570 else
26571 mode = GET_MODE (vec_value), value = vec_value;
26573 else
26575 /* Choose appropriate vector mode. */
26576 if (size >= 32)
26577 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26578 else if (size >= 16)
26579 mode = TARGET_SSE ? V16QImode : DImode;
26580 srcmem = change_address (srcmem, mode, srcptr);
26582 destmem = change_address (destmem, mode, destptr);
26583 modesize = GEN_INT (GET_MODE_SIZE (mode));
26584 gcc_assert (GET_MODE_SIZE (mode) <= size);
26585 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26587 if (issetmem)
26588 emit_move_insn (destmem, gen_lowpart (mode, value));
26589 else
26591 emit_move_insn (destmem, srcmem);
26592 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26594 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26597 destmem = offset_address (destmem, count, 1);
26598 destmem = offset_address (destmem, GEN_INT (-2 * size),
26599 GET_MODE_SIZE (mode));
26600 if (!issetmem)
26602 srcmem = offset_address (srcmem, count, 1);
26603 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26604 GET_MODE_SIZE (mode));
26606 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26608 if (issetmem)
26609 emit_move_insn (destmem, gen_lowpart (mode, value));
26610 else
26612 emit_move_insn (destmem, srcmem);
26613 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26615 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26617 emit_jump_insn (gen_jump (done_label));
26618 emit_barrier ();
26620 emit_label (label);
26621 LABEL_NUSES (label) = 1;
26624 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26625 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26626 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
26627 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26628 DONE_LABEL is a label after the whole copying sequence. The label is created
26629 on demand if *DONE_LABEL is NULL.
26630 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
26631 bounds after the initial copies.
26633 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26634 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26635 we will dispatch to a library call for large blocks.
26637 In pseudocode we do:
26639 if (COUNT < SIZE)
26641 Assume that SIZE is 4. Bigger sizes are handled analogously
26642 if (COUNT & 4)
26644 copy 4 bytes from SRCPTR to DESTPTR
26645 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26646 goto done_label
26648 if (!COUNT)
26649 goto done_label;
26650 copy 1 byte from SRCPTR to DESTPTR
26651 if (COUNT & 2)
26653 copy 2 bytes from SRCPTR to DESTPTR
26654 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26657 else
26659 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26660 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
26662 OLD_DESTPTR = DESTPTR;
26663 Align DESTPTR up to DESIRED_ALIGN
26664 SRCPTR += DESTPTR - OLD_DESTPTR
26665 COUNT -= DEST_PTR - OLD_DESTPTR
26666 if (DYNAMIC_CHECK)
26667 Round COUNT down to multiple of SIZE
26668 << optional caller supplied zero size guard is here >>
26669 << optional caller supplied dynamic check is here >>
26670 << caller supplied main copy loop is here >>
26672 done_label:
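/* Editorial note (illustration only, not part of the compiled sources): the
   small-block branch of the pseudocode above relies on overlapping moves.
   For SIZE == 4 a plain C equivalent, using memcpy for the individual moves,
   would be:

     #include <string.h>

     static void
     small_copy (char *dst, const char *src, size_t count)  // count < 8
     {
       if (count & 4)
         {
           // 4 <= count <= 7: two 4-byte moves that overlap in the middle.
           memcpy (dst, src, 4);
           memcpy (dst + count - 4, src + count - 4, 4);
           return;
         }
       if (count == 0)
         return;
       dst[0] = src[0];
       if (count & 2)
         {
           memcpy (dst, src, 2);
           memcpy (dst + count - 2, src + count - 2, 2);
         }
     }

   Every byte of the block is covered by at least one move; the RTL emitted
   above and below does the same thing with MODE-sized register moves instead
   of memcpy calls.  */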
26674 static void
26675 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26676 rtx *destptr, rtx *srcptr,
26677 machine_mode mode,
26678 rtx value, rtx vec_value,
26679 rtx *count,
26680 rtx_code_label **done_label,
26681 int size,
26682 int desired_align,
26683 int align,
26684 unsigned HOST_WIDE_INT *min_size,
26685 bool dynamic_check,
26686 bool issetmem)
26688 rtx_code_label *loop_label = NULL, *label;
26689 int n;
26690 rtx modesize;
26691 int prolog_size = 0;
26692 rtx mode_value;
26694 /* Choose the proper value to copy. */
26695 if (issetmem && VECTOR_MODE_P (mode))
26696 mode_value = vec_value;
26697 else
26698 mode_value = value;
26699 gcc_assert (GET_MODE_SIZE (mode) <= size);
26701 /* See if block is big or small, handle small blocks. */
26702 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26704 int size2 = size;
26705 loop_label = gen_label_rtx ();
26707 if (!*done_label)
26708 *done_label = gen_label_rtx ();
26710 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26711 1, loop_label);
26712 size2 >>= 1;
26714 /* Handle sizes > 3. */
26715 for (;size2 > 2; size2 >>= 1)
26716 expand_small_movmem_or_setmem (destmem, srcmem,
26717 *destptr, *srcptr,
26718 value, vec_value,
26719 *count,
26720 size2, *done_label, issetmem);
26721 /* Nothing to copy? Jump to DONE_LABEL if so */
26722 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26723 1, *done_label);
26725 /* Do a byte copy. */
26726 destmem = change_address (destmem, QImode, *destptr);
26727 if (issetmem)
26728 emit_move_insn (destmem, gen_lowpart (QImode, value));
26729 else
26731 srcmem = change_address (srcmem, QImode, *srcptr);
26732 emit_move_insn (destmem, srcmem);
26735 /* Handle sizes 2 and 3. */
26736 label = ix86_expand_aligntest (*count, 2, false);
26737 destmem = change_address (destmem, HImode, *destptr);
26738 destmem = offset_address (destmem, *count, 1);
26739 destmem = offset_address (destmem, GEN_INT (-2), 2);
26740 if (issetmem)
26741 emit_move_insn (destmem, gen_lowpart (HImode, value));
26742 else
26744 srcmem = change_address (srcmem, HImode, *srcptr);
26745 srcmem = offset_address (srcmem, *count, 1);
26746 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26747 emit_move_insn (destmem, srcmem);
26750 emit_label (label);
26751 LABEL_NUSES (label) = 1;
26752 emit_jump_insn (gen_jump (*done_label));
26753 emit_barrier ();
26755 else
26756 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26757 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26759 /* Start memcpy for COUNT >= SIZE. */
26760 if (loop_label)
26762 emit_label (loop_label);
26763 LABEL_NUSES (loop_label) = 1;
26766 /* Copy first desired_align bytes. */
26767 if (!issetmem)
26768 srcmem = change_address (srcmem, mode, *srcptr);
26769 destmem = change_address (destmem, mode, *destptr);
26770 modesize = GEN_INT (GET_MODE_SIZE (mode));
26771 for (n = 0; prolog_size < desired_align - align; n++)
26773 if (issetmem)
26774 emit_move_insn (destmem, mode_value);
26775 else
26777 emit_move_insn (destmem, srcmem);
26778 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26780 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26781 prolog_size += GET_MODE_SIZE (mode);
26785 /* Copy last SIZE bytes. */
26786 destmem = offset_address (destmem, *count, 1);
26787 destmem = offset_address (destmem,
26788 GEN_INT (-size - prolog_size),
26790 if (issetmem)
26791 emit_move_insn (destmem, mode_value);
26792 else
26794 srcmem = offset_address (srcmem, *count, 1);
26795 srcmem = offset_address (srcmem,
26796 GEN_INT (-size - prolog_size),
26798 emit_move_insn (destmem, srcmem);
26800 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26802 destmem = offset_address (destmem, modesize, 1);
26803 if (issetmem)
26804 emit_move_insn (destmem, mode_value);
26805 else
26807 srcmem = offset_address (srcmem, modesize, 1);
26808 emit_move_insn (destmem, srcmem);
26812 /* Align destination. */
26813 if (desired_align > 1 && desired_align > align)
26815 rtx saveddest = *destptr;
26817 gcc_assert (desired_align <= size);
26818 /* Align destptr up, place it to new register. */
26819 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26820 GEN_INT (prolog_size),
26821 NULL_RTX, 1, OPTAB_DIRECT);
26822 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26823 REG_POINTER (*destptr) = 1;
26824 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26825 GEN_INT (-desired_align),
26826 *destptr, 1, OPTAB_DIRECT);
26827 /* See how many bytes we skipped. */
26828 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26829 *destptr,
26830 saveddest, 1, OPTAB_DIRECT);
26831 /* Adjust srcptr and count. */
26832 if (!issetmem)
26833 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26834 saveddest, *srcptr, 1, OPTAB_DIRECT);
26835 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26836 saveddest, *count, 1, OPTAB_DIRECT);
26837 /* We copied at most size + prolog_size. */
26838 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26839 *min_size
26840 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26841 else
26842 *min_size = 0;
26844 /* Our loops always round down the block size, but for dispatch to the
26845 library we need the precise value. */
26846 if (dynamic_check)
26847 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26848 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26850 else
26852 gcc_assert (prolog_size == 0);
26853 /* Decrease count, so we won't end up copying last word twice. */
26854 if (!CONST_INT_P (*count))
26855 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26856 constm1_rtx, *count, 1, OPTAB_DIRECT);
26857 else
26858 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26859 (unsigned HOST_WIDE_INT)size));
26860 if (*min_size)
26861 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26866 /* This function is like the previous one, except here we know how many bytes
26867 need to be copied. That allows us to update alignment not only of DST, which
26868 is returned, but also of SRC, which is passed as a pointer for that
26869 reason. */
26870 static rtx
26871 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26872 rtx srcreg, rtx value, rtx vec_value,
26873 int desired_align, int align_bytes,
26874 bool issetmem)
26876 rtx src = NULL;
26877 rtx orig_dst = dst;
26878 rtx orig_src = NULL;
26879 int piece_size = 1;
26880 int copied_bytes = 0;
26882 if (!issetmem)
26884 gcc_assert (srcp != NULL);
26885 src = *srcp;
26886 orig_src = src;
26889 for (piece_size = 1;
26890 piece_size <= desired_align && copied_bytes < align_bytes;
26891 piece_size <<= 1)
26893 if (align_bytes & piece_size)
26895 if (issetmem)
26897 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26898 dst = emit_memset (dst, destreg, vec_value, piece_size);
26899 else
26900 dst = emit_memset (dst, destreg, value, piece_size);
26902 else
26903 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26904 copied_bytes += piece_size;
26907 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26908 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26909 if (MEM_SIZE_KNOWN_P (orig_dst))
26910 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26912 if (!issetmem)
26914 int src_align_bytes = get_mem_align_offset (src, desired_align
26915 * BITS_PER_UNIT);
26916 if (src_align_bytes >= 0)
26917 src_align_bytes = desired_align - src_align_bytes;
26918 if (src_align_bytes >= 0)
26920 unsigned int src_align;
26921 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26923 if ((src_align_bytes & (src_align - 1))
26924 == (align_bytes & (src_align - 1)))
26925 break;
26927 if (src_align > (unsigned int) desired_align)
26928 src_align = desired_align;
26929 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26930 set_mem_align (src, src_align * BITS_PER_UNIT);
26932 if (MEM_SIZE_KNOWN_P (orig_src))
26933 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26934 *srcp = src;
26937 return dst;
26940 /* Return true if ALG can be used in the current context.
26941 Assume we expand memset if MEMSET is true. */
26942 static bool
26943 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26945 if (alg == no_stringop)
26946 return false;
26947 if (alg == vector_loop)
26948 return TARGET_SSE || TARGET_AVX;
26949 /* Algorithms using the rep prefix want at least edi and ecx;
26950 additionally, memset wants eax and memcpy wants esi. Don't
26951 consider such algorithms if the user has appropriated those
26952 registers for their own purposes, or if we have a non-default
26953 address space, since some string insns cannot override the segment. */
26954 if (alg == rep_prefix_1_byte
26955 || alg == rep_prefix_4_byte
26956 || alg == rep_prefix_8_byte)
26958 if (have_as)
26959 return false;
26960 if (fixed_regs[CX_REG]
26961 || fixed_regs[DI_REG]
26962 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26963 return false;
26965 return true;
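/* Editorial note (illustration only): the registers checked above are the
   implicit operands of the x86 string instructions that the rep_prefix_*
   algorithms expand to, e.g. in AT&T syntax for the 64-bit case:

     rep movsb    # copy  %rcx bytes from (%rsi) to (%rdi)
     rep stosb    # store %al into %rcx bytes at (%rdi)

   so a memcpy-style expansion needs rcx, rsi and rdi to be available, a
   memset-style expansion needs rcx, rdi and rax, and the destination is
   always addressed through the ES segment, which (roughly) is why a
   non-default address space rules these algorithms out.  */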
26968 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26969 static enum stringop_alg
26970 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26971 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26972 bool memset, bool zero_memset, bool have_as,
26973 int *dynamic_check, bool *noalign, bool recur)
26975 const struct stringop_algs *algs;
26976 bool optimize_for_speed;
26977 int max = 0;
26978 const struct processor_costs *cost;
26979 int i;
26980 bool any_alg_usable_p = false;
26982 *noalign = false;
26983 *dynamic_check = -1;
26985 /* Even if the string operation call is cold, we still might spend a lot
26986 of time processing large blocks. */
26987 if (optimize_function_for_size_p (cfun)
26988 || (optimize_insn_for_size_p ()
26989 && (max_size < 256
26990 || (expected_size != -1 && expected_size < 256))))
26991 optimize_for_speed = false;
26992 else
26993 optimize_for_speed = true;
26995 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26996 if (memset)
26997 algs = &cost->memset[TARGET_64BIT != 0];
26998 else
26999 algs = &cost->memcpy[TARGET_64BIT != 0];
27001 /* See maximal size for user defined algorithm. */
27002 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27004 enum stringop_alg candidate = algs->size[i].alg;
27005 bool usable = alg_usable_p (candidate, memset, have_as);
27006 any_alg_usable_p |= usable;
27008 if (candidate != libcall && candidate && usable)
27009 max = algs->size[i].max;
27012 /* If the expected size is not known but the max size is small enough
27013 that the inline version is a win, set the expected size into
27014 the range. */
27015 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27016 && expected_size == -1)
27017 expected_size = min_size / 2 + max_size / 2;
27019 /* If user specified the algorithm, honor it if possible. */
27020 if (ix86_stringop_alg != no_stringop
27021 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27022 return ix86_stringop_alg;
27023 /* rep; movq or rep; movl is the smallest variant. */
27024 else if (!optimize_for_speed)
27026 *noalign = true;
27027 if (!count || (count & 3) || (memset && !zero_memset))
27028 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27029 ? rep_prefix_1_byte : loop_1_byte;
27030 else
27031 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27032 ? rep_prefix_4_byte : loop;
27034 /* Very tiny blocks are best handled via the loop; REP is expensive to
27035 set up. */
27036 else if (expected_size != -1 && expected_size < 4)
27037 return loop_1_byte;
27038 else if (expected_size != -1)
27040 enum stringop_alg alg = libcall;
27041 bool alg_noalign = false;
27042 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27044 /* We get here if the algorithms that were not libcall-based
27045 were rep-prefix based and we are unable to use rep prefixes
27046 based on global register usage. Break out of the loop and
27047 use the heuristic below. */
27048 if (algs->size[i].max == 0)
27049 break;
27050 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27052 enum stringop_alg candidate = algs->size[i].alg;
27054 if (candidate != libcall
27055 && alg_usable_p (candidate, memset, have_as))
27057 alg = candidate;
27058 alg_noalign = algs->size[i].noalign;
27060 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27061 last non-libcall inline algorithm. */
27062 if (TARGET_INLINE_ALL_STRINGOPS)
27064 /* When the current size is best copied by a libcall,
27065 but we are still forced to inline, run the heuristic below
27066 that will pick code for medium sized blocks. */
27067 if (alg != libcall)
27069 *noalign = alg_noalign;
27070 return alg;
27072 else if (!any_alg_usable_p)
27073 break;
27075 else if (alg_usable_p (candidate, memset, have_as))
27077 *noalign = algs->size[i].noalign;
27078 return candidate;
27083 /* When asked to inline the call anyway, try to pick a meaningful choice.
27084 We look for the maximal size of block that is faster to copy by hand and
27085 take blocks of at most that size, guessing that the average size will
27086 be roughly half of the block.
27088 If this turns out to be bad, we might simply specify the preferred
27089 choice in ix86_costs. */
27090 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27091 && (algs->unknown_size == libcall
27092 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27094 enum stringop_alg alg;
27095 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27097 /* If there aren't any usable algorithms or if recursing already,
27098 then recursing on smaller sizes or same size isn't going to
27099 find anything. Just return the simple byte-at-a-time copy loop. */
27100 if (!any_alg_usable_p || recur)
27102 /* Pick something reasonable. */
27103 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27104 *dynamic_check = 128;
27105 return loop_1_byte;
27107 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27108 zero_memset, have_as, dynamic_check, noalign, true);
27109 gcc_assert (*dynamic_check == -1);
27110 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27111 *dynamic_check = max;
27112 else
27113 gcc_assert (alg != libcall);
27114 return alg;
27116 return (alg_usable_p (algs->unknown_size, memset, have_as)
27117 ? algs->unknown_size : libcall);
27120 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27121 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27122 static int
27123 decide_alignment (int align,
27124 enum stringop_alg alg,
27125 int expected_size,
27126 machine_mode move_mode)
27128 int desired_align = 0;
27130 gcc_assert (alg != no_stringop);
27132 if (alg == libcall)
27133 return 0;
27134 if (move_mode == VOIDmode)
27135 return 0;
27137 desired_align = GET_MODE_SIZE (move_mode);
27138 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
27139 copying a whole cache line at once. */
27140 if (TARGET_PENTIUMPRO
27141 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27142 desired_align = 8;
27144 if (optimize_size)
27145 desired_align = 1;
27146 if (desired_align < align)
27147 desired_align = align;
27148 if (expected_size != -1 && expected_size < 4)
27149 desired_align = align;
27151 return desired_align;
27155 /* Helper function for memset. For the QImode value 0xXY produce
27156 0xXYXYXYXY of the width specified by MODE. This is essentially
27157 a * 0x01010101, but we can do slightly better than
27158 synth_mult by unwinding the sequence by hand on CPUs with
27159 slow multiply. */
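/* Editorial sketch (illustration only): at the C level the two strategies
   chosen between below are, for the 32-bit case (assuming <stdint.h>):

     uint32_t broadcast_mul (uint8_t b)    // fast multiply available
     {
       return (uint32_t) b * 0x01010101u;
     }

     uint32_t broadcast_shift (uint8_t b)  // slow multiply: shift/or chain
     {
       uint32_t v = b;
       v |= v << 8;
       v |= v << 16;
       return v;
     }

   Both turn 0xXY into 0xXYXYXYXY; the DImode variant needs one more 32-bit
   shift/or step (or a multiply by 0x0101010101010101).  */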
27160 static rtx
27161 promote_duplicated_reg (machine_mode mode, rtx val)
27163 machine_mode valmode = GET_MODE (val);
27164 rtx tmp;
27165 int nops = mode == DImode ? 3 : 2;
27167 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27168 if (val == const0_rtx)
27169 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27170 if (CONST_INT_P (val))
27172 HOST_WIDE_INT v = INTVAL (val) & 255;
27174 v |= v << 8;
27175 v |= v << 16;
27176 if (mode == DImode)
27177 v |= (v << 16) << 16;
27178 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27181 if (valmode == VOIDmode)
27182 valmode = QImode;
27183 if (valmode != QImode)
27184 val = gen_lowpart (QImode, val);
27185 if (mode == QImode)
27186 return val;
27187 if (!TARGET_PARTIAL_REG_STALL)
27188 nops--;
27189 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27190 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27191 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27192 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27194 rtx reg = convert_modes (mode, QImode, val, true);
27195 tmp = promote_duplicated_reg (mode, const1_rtx);
27196 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27197 OPTAB_DIRECT);
27199 else
27201 rtx reg = convert_modes (mode, QImode, val, true);
27203 if (!TARGET_PARTIAL_REG_STALL)
27204 if (mode == SImode)
27205 emit_insn (gen_insvsi_1 (reg, reg));
27206 else
27207 emit_insn (gen_insvdi_1 (reg, reg));
27208 else
27210 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27211 NULL, 1, OPTAB_DIRECT);
27212 reg =
27213 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27215 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27216 NULL, 1, OPTAB_DIRECT);
27217 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27218 if (mode == SImode)
27219 return reg;
27220 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27221 NULL, 1, OPTAB_DIRECT);
27222 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27223 return reg;
27227 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size that will
27228 be needed by the main loop copying SIZE_NEEDED chunks and by the prologue getting
27229 alignment from ALIGN to DESIRED_ALIGN. */
27230 static rtx
27231 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27232 int align)
27234 rtx promoted_val;
27236 if (TARGET_64BIT
27237 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27238 promoted_val = promote_duplicated_reg (DImode, val);
27239 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27240 promoted_val = promote_duplicated_reg (SImode, val);
27241 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27242 promoted_val = promote_duplicated_reg (HImode, val);
27243 else
27244 promoted_val = val;
27246 return promoted_val;
27249 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27250 operations when profitable. The code depends upon architecture, block size
27251 and alignment, but always has one of the following overall structures:
27253 Aligned move sequence:
27255 1) Prologue guard: Conditional that jumps to the epilogue for small
27256 blocks that can be handled by the epilogue alone. This is faster
27257 but also needed for correctness, since the prologue assumes the block
27258 is larger than the desired alignment.
27260 Optional dynamic check for size and libcall for large
27261 blocks is emitted here too, with -minline-stringops-dynamically.
27263 2) Prologue: copy first few bytes in order to get destination
27264 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27265 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27266 copied. We emit either a jump tree on power of two sized
27267 blocks, or a byte loop.
27269 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27270 with specified algorithm.
27272 4) Epilogue: code copying tail of the block that is too small to be
27273 handled by main body (or up to size guarded by prologue guard).
27275 Misaligned move sequence
27277 1) Misaligned move prologue/epilogue containing:
27278 a) Prologue handling small memory blocks and jumping to done_label
27279 (skipped if blocks are known to be large enough)
27280 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
27281 needed, done as one possibly misaligned move
27282 (skipped if alignment is not needed)
27283 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
27285 2) Zero size guard dispatching to done_label, if needed
27287 3) Dispatch to a library call, if needed
27289 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27290 with the specified algorithm. */
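/* Editorial sketch (illustration only): for a block of unknown size expanded
   with the aligned sequence, the generated code is roughly shaped like:

     if (n < epilogue_size_needed)            // 1) prologue guard
       goto epilogue;
     ... copy/store a few bytes ...           // 2) align dst to DESIRED_ALIGN
     ... main loop ...                        // 3) SIZE_NEEDED bytes per step
   epilogue:
     ... handle the remaining tail bytes ...  // 4)

   while the misaligned sequence folds steps 2) and 4) into a handful of
   possibly overlapping unaligned moves emitted before the main loop.  */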
27291 bool
27292 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27293 rtx align_exp, rtx expected_align_exp,
27294 rtx expected_size_exp, rtx min_size_exp,
27295 rtx max_size_exp, rtx probable_max_size_exp,
27296 bool issetmem)
27298 rtx destreg;
27299 rtx srcreg = NULL;
27300 rtx_code_label *label = NULL;
27301 rtx tmp;
27302 rtx_code_label *jump_around_label = NULL;
27303 HOST_WIDE_INT align = 1;
27304 unsigned HOST_WIDE_INT count = 0;
27305 HOST_WIDE_INT expected_size = -1;
27306 int size_needed = 0, epilogue_size_needed;
27307 int desired_align = 0, align_bytes = 0;
27308 enum stringop_alg alg;
27309 rtx promoted_val = NULL;
27310 rtx vec_promoted_val = NULL;
27311 bool force_loopy_epilogue = false;
27312 int dynamic_check;
27313 bool need_zero_guard = false;
27314 bool noalign;
27315 machine_mode move_mode = VOIDmode;
27316 int unroll_factor = 1;
27317 /* TODO: Once value ranges are available, fill in proper data. */
27318 unsigned HOST_WIDE_INT min_size = 0;
27319 unsigned HOST_WIDE_INT max_size = -1;
27320 unsigned HOST_WIDE_INT probable_max_size = -1;
27321 bool misaligned_prologue_used = false;
27322 bool have_as;
27324 if (CONST_INT_P (align_exp))
27325 align = INTVAL (align_exp);
27326 /* i386 can do misaligned access at a reasonably increased cost. */
27327 if (CONST_INT_P (expected_align_exp)
27328 && INTVAL (expected_align_exp) > align)
27329 align = INTVAL (expected_align_exp);
27330 /* ALIGN is the minimum of destination and source alignment, but we care here
27331 just about destination alignment. */
27332 else if (!issetmem
27333 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27334 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27336 if (CONST_INT_P (count_exp))
27338 min_size = max_size = probable_max_size = count = expected_size
27339 = INTVAL (count_exp);
27340 /* When COUNT is 0, there is nothing to do. */
27341 if (!count)
27342 return true;
27344 else
27346 if (min_size_exp)
27347 min_size = INTVAL (min_size_exp);
27348 if (max_size_exp)
27349 max_size = INTVAL (max_size_exp);
27350 if (probable_max_size_exp)
27351 probable_max_size = INTVAL (probable_max_size_exp);
27352 if (CONST_INT_P (expected_size_exp))
27353 expected_size = INTVAL (expected_size_exp);
27356 /* Make sure we don't need to care about overflow later on. */
27357 if (count > (HOST_WIDE_INT_1U << 30))
27358 return false;
27360 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27361 if (!issetmem)
27362 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27364 /* Step 0: Decide on preferred algorithm, desired alignment and
27365 size of chunks to be copied by main loop. */
27366 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27367 issetmem,
27368 issetmem && val_exp == const0_rtx, have_as,
27369 &dynamic_check, &noalign, false);
27370 if (alg == libcall)
27371 return false;
27372 gcc_assert (alg != no_stringop);
27374 /* For now the vector version of memset is generated only for memory zeroing, as
27375 creating the promoted vector value is very cheap in this case. */
27376 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27377 alg = unrolled_loop;
27379 if (!count)
27380 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27381 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27382 if (!issetmem)
27383 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27385 unroll_factor = 1;
27386 move_mode = word_mode;
27387 switch (alg)
27389 case libcall:
27390 case no_stringop:
27391 case last_alg:
27392 gcc_unreachable ();
27393 case loop_1_byte:
27394 need_zero_guard = true;
27395 move_mode = QImode;
27396 break;
27397 case loop:
27398 need_zero_guard = true;
27399 break;
27400 case unrolled_loop:
27401 need_zero_guard = true;
27402 unroll_factor = (TARGET_64BIT ? 4 : 2);
27403 break;
27404 case vector_loop:
27405 need_zero_guard = true;
27406 unroll_factor = 4;
27407 /* Find the widest supported mode. */
27408 move_mode = word_mode;
27409 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
27410 != CODE_FOR_nothing)
27411 move_mode = GET_MODE_WIDER_MODE (move_mode);
27413 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27414 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27415 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27417 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27418 move_mode = mode_for_vector (word_mode, nunits);
27419 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27420 move_mode = word_mode;
27422 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27423 break;
27424 case rep_prefix_8_byte:
27425 move_mode = DImode;
27426 break;
27427 case rep_prefix_4_byte:
27428 move_mode = SImode;
27429 break;
27430 case rep_prefix_1_byte:
27431 move_mode = QImode;
27432 break;
27434 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27435 epilogue_size_needed = size_needed;
27437 /* If we are going to make any library calls conditionally, make sure any
27438 pending stack adjustment happens before the first conditional branch;
27439 otherwise it will be emitted before the library call only and won't
27440 happen on the other branches. */
27441 if (dynamic_check != -1)
27442 do_pending_stack_adjust ();
27444 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27445 if (!TARGET_ALIGN_STRINGOPS || noalign)
27446 align = desired_align;
27448 /* Step 1: Prologue guard. */
27450 /* Alignment code needs count to be in register. */
27451 if (CONST_INT_P (count_exp) && desired_align > align)
27453 if (INTVAL (count_exp) > desired_align
27454 && INTVAL (count_exp) > size_needed)
27456 align_bytes
27457 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27458 if (align_bytes <= 0)
27459 align_bytes = 0;
27460 else
27461 align_bytes = desired_align - align_bytes;
27463 if (align_bytes == 0)
27464 count_exp = force_reg (counter_mode (count_exp), count_exp);
27466 gcc_assert (desired_align >= 1 && align >= 1);
27468 /* Misaligned move sequences handle both prologue and epilogue at once.
27469 Default code generation results in smaller code for large alignments
27470 and also avoids redundant work when sizes are known precisely. */
27471 misaligned_prologue_used
27472 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27473 && MAX (desired_align, epilogue_size_needed) <= 32
27474 && desired_align <= epilogue_size_needed
27475 && ((desired_align > align && !align_bytes)
27476 || (!count && epilogue_size_needed > 1)));
27478 /* Do the cheap promotion to allow better CSE across the
27479 main loop and epilogue (i.e. one load of the big constant in
27480 front of all code).
27481 For now the misaligned move sequences do not have a fast path
27482 without broadcasting. */
27483 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27485 if (alg == vector_loop)
27487 gcc_assert (val_exp == const0_rtx);
27488 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27489 promoted_val = promote_duplicated_reg_to_size (val_exp,
27490 GET_MODE_SIZE (word_mode),
27491 desired_align, align);
27493 else
27495 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27496 desired_align, align);
27499 /* Misaligned move sequences handle both prologues and epilogues at once.
27500 Default code generation results in smaller code for large alignments and
27501 also avoids redundant work when sizes are known precisely. */
27502 if (misaligned_prologue_used)
27504 /* The misaligned move prologue handles small blocks by itself. */
27505 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27506 (dst, src, &destreg, &srcreg,
27507 move_mode, promoted_val, vec_promoted_val,
27508 &count_exp,
27509 &jump_around_label,
27510 desired_align < align
27511 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27512 desired_align, align, &min_size, dynamic_check, issetmem);
27513 if (!issetmem)
27514 src = change_address (src, BLKmode, srcreg);
27515 dst = change_address (dst, BLKmode, destreg);
27516 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27517 epilogue_size_needed = 0;
27518 if (need_zero_guard
27519 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27521 /* It is possible that we copied enough so the main loop will not
27522 execute. */
27523 gcc_assert (size_needed > 1);
27524 if (jump_around_label == NULL_RTX)
27525 jump_around_label = gen_label_rtx ();
27526 emit_cmp_and_jump_insns (count_exp,
27527 GEN_INT (size_needed),
27528 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27529 if (expected_size == -1
27530 || expected_size < (desired_align - align) / 2 + size_needed)
27531 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27532 else
27533 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27536 /* Ensure that alignment prologue won't copy past end of block. */
27537 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27539 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27540 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27541 Make sure it is power of 2. */
27542 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27544 /* To improve performance of small blocks, we jump around the VAL
27545 promotion. This means that if the promoted VAL is not constant,
27546 we might not use it in the epilogue and have to use the byte
27547 loop variant. */
27548 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27549 force_loopy_epilogue = true;
27550 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27551 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27553 /* If main algorithm works on QImode, no epilogue is needed.
27554 For small sizes just don't align anything. */
27555 if (size_needed == 1)
27556 desired_align = align;
27557 else
27558 goto epilogue;
27560 else if (!count
27561 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27563 label = gen_label_rtx ();
27564 emit_cmp_and_jump_insns (count_exp,
27565 GEN_INT (epilogue_size_needed),
27566 LTU, 0, counter_mode (count_exp), 1, label);
27567 if (expected_size == -1 || expected_size < epilogue_size_needed)
27568 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27569 else
27570 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27574 /* Emit code to decide at runtime whether a library call or the inline
27575 expansion should be used. */
27576 if (dynamic_check != -1)
27578 if (!issetmem && CONST_INT_P (count_exp))
27580 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27582 emit_block_copy_via_libcall (dst, src, count_exp);
27583 count_exp = const0_rtx;
27584 goto epilogue;
27587 else
27589 rtx_code_label *hot_label = gen_label_rtx ();
27590 if (jump_around_label == NULL_RTX)
27591 jump_around_label = gen_label_rtx ();
27592 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27593 LEU, 0, counter_mode (count_exp),
27594 1, hot_label);
27595 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27596 if (issetmem)
27597 set_storage_via_libcall (dst, count_exp, val_exp);
27598 else
27599 emit_block_copy_via_libcall (dst, src, count_exp);
27600 emit_jump (jump_around_label);
27601 emit_label (hot_label);
27605 /* Step 2: Alignment prologue. */
27606 /* Do the expensive promotion once we branched off the small blocks. */
27607 if (issetmem && !promoted_val)
27608 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27609 desired_align, align);
27611 if (desired_align > align && !misaligned_prologue_used)
27613 if (align_bytes == 0)
27615 /* Except for the first move in the prologue, we no longer know
27616 the constant offset in the aliasing info. It doesn't seem worth
27617 the pain to maintain it for the first move, so throw away
27618 the info early. */
27619 dst = change_address (dst, BLKmode, destreg);
27620 if (!issetmem)
27621 src = change_address (src, BLKmode, srcreg);
27622 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27623 promoted_val, vec_promoted_val,
27624 count_exp, align, desired_align,
27625 issetmem);
27626 /* At most desired_align - align bytes are copied. */
27627 if (min_size < (unsigned)(desired_align - align))
27628 min_size = 0;
27629 else
27630 min_size -= desired_align - align;
27632 else
27634 /* If we know how many bytes need to be stored before dst is
27635 sufficiently aligned, maintain aliasing info accurately. */
27636 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27637 srcreg,
27638 promoted_val,
27639 vec_promoted_val,
27640 desired_align,
27641 align_bytes,
27642 issetmem);
27644 count_exp = plus_constant (counter_mode (count_exp),
27645 count_exp, -align_bytes);
27646 count -= align_bytes;
27647 min_size -= align_bytes;
27648 max_size -= align_bytes;
27650 if (need_zero_guard
27651 && min_size < (unsigned HOST_WIDE_INT) size_needed
27652 && (count < (unsigned HOST_WIDE_INT) size_needed
27653 || (align_bytes == 0
27654 && count < ((unsigned HOST_WIDE_INT) size_needed
27655 + desired_align - align))))
27657 /* It is possible that we copied enough so the main loop will not
27658 execute. */
27659 gcc_assert (size_needed > 1);
27660 if (label == NULL_RTX)
27661 label = gen_label_rtx ();
27662 emit_cmp_and_jump_insns (count_exp,
27663 GEN_INT (size_needed),
27664 LTU, 0, counter_mode (count_exp), 1, label);
27665 if (expected_size == -1
27666 || expected_size < (desired_align - align) / 2 + size_needed)
27667 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27668 else
27669 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27672 if (label && size_needed == 1)
27674 emit_label (label);
27675 LABEL_NUSES (label) = 1;
27676 label = NULL;
27677 epilogue_size_needed = 1;
27678 if (issetmem)
27679 promoted_val = val_exp;
27681 else if (label == NULL_RTX && !misaligned_prologue_used)
27682 epilogue_size_needed = size_needed;
27684 /* Step 3: Main loop. */
27686 switch (alg)
27688 case libcall:
27689 case no_stringop:
27690 case last_alg:
27691 gcc_unreachable ();
27692 case loop_1_byte:
27693 case loop:
27694 case unrolled_loop:
27695 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27696 count_exp, move_mode, unroll_factor,
27697 expected_size, issetmem);
27698 break;
27699 case vector_loop:
27700 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27701 vec_promoted_val, count_exp, move_mode,
27702 unroll_factor, expected_size, issetmem);
27703 break;
27704 case rep_prefix_8_byte:
27705 case rep_prefix_4_byte:
27706 case rep_prefix_1_byte:
27707 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27708 val_exp, count_exp, move_mode, issetmem);
27709 break;
27711 /* Properly adjust the offsets of src and dest memory for aliasing. */
27712 if (CONST_INT_P (count_exp))
27714 if (!issetmem)
27715 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27716 (count / size_needed) * size_needed);
27717 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27718 (count / size_needed) * size_needed);
27720 else
27722 if (!issetmem)
27723 src = change_address (src, BLKmode, srcreg);
27724 dst = change_address (dst, BLKmode, destreg);
27727 /* Step 4: Epilogue to copy the remaining bytes. */
27728 epilogue:
27729 if (label)
27731 /* When the main loop is done, COUNT_EXP might hold original count,
27732 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27733 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27734 bytes. Compensate if needed. */
27736 if (size_needed < epilogue_size_needed)
27738 tmp =
27739 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27740 GEN_INT (size_needed - 1), count_exp, 1,
27741 OPTAB_DIRECT);
27742 if (tmp != count_exp)
27743 emit_move_insn (count_exp, tmp);
27745 emit_label (label);
27746 LABEL_NUSES (label) = 1;
27749 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27751 if (force_loopy_epilogue)
27752 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27753 epilogue_size_needed);
27754 else
27756 if (issetmem)
27757 expand_setmem_epilogue (dst, destreg, promoted_val,
27758 vec_promoted_val, count_exp,
27759 epilogue_size_needed);
27760 else
27761 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27762 epilogue_size_needed);
27765 if (jump_around_label)
27766 emit_label (jump_around_label);
27767 return true;
27771 /* Expand the appropriate insns for doing strlen if not just doing
27772 repnz; scasb
27774 out = result, initialized with the start address
27775 align_rtx = alignment of the address.
27776 scratch = scratch register, initialized with the start address when
27777 not aligned, otherwise undefined
27779 This is just the body. It needs the initializations mentioned above and
27780 some address computing at the end. These things are done in i386.md. */
27782 static void
27783 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27785 int align;
27786 rtx tmp;
27787 rtx_code_label *align_2_label = NULL;
27788 rtx_code_label *align_3_label = NULL;
27789 rtx_code_label *align_4_label = gen_label_rtx ();
27790 rtx_code_label *end_0_label = gen_label_rtx ();
27791 rtx mem;
27792 rtx tmpreg = gen_reg_rtx (SImode);
27793 rtx scratch = gen_reg_rtx (SImode);
27794 rtx cmp;
27796 align = 0;
27797 if (CONST_INT_P (align_rtx))
27798 align = INTVAL (align_rtx);
27800 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27802 /* Is there a known alignment and is it less than 4? */
27803 if (align < 4)
27805 rtx scratch1 = gen_reg_rtx (Pmode);
27806 emit_move_insn (scratch1, out);
27807 /* Is there a known alignment and is it not 2? */
27808 if (align != 2)
27810 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27811 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27813 /* Leave just the 3 lower bits. */
27814 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27815 NULL_RTX, 0, OPTAB_WIDEN);
27817 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27818 Pmode, 1, align_4_label);
27819 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27820 Pmode, 1, align_2_label);
27821 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27822 Pmode, 1, align_3_label);
27824 else
27826 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27827 check if it is aligned to a 4-byte boundary. */
27829 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27830 NULL_RTX, 0, OPTAB_WIDEN);
27832 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27833 Pmode, 1, align_4_label);
27836 mem = change_address (src, QImode, out);
27838 /* Now compare the bytes. */
27840 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27841 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27842 QImode, 1, end_0_label);
27844 /* Increment the address. */
27845 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27847 /* Not needed with an alignment of 2 */
27848 if (align != 2)
27850 emit_label (align_2_label);
27852 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27853 end_0_label);
27855 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27857 emit_label (align_3_label);
27860 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27861 end_0_label);
27863 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27866 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27867 align this loop; it only makes programs larger and does not help to
27868 speed them up. */
27869 emit_label (align_4_label);
27871 mem = change_address (src, SImode, out);
27872 emit_move_insn (scratch, mem);
27873 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27875 /* This formula yields a nonzero result iff one of the bytes is zero.
27876 This saves three branches inside the loop and many cycles. */
27878 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27879 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27880 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27881 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27882 gen_int_mode (0x80808080, SImode)));
27883 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27884 align_4_label);
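/* Editorial note (illustration only): the insns above implement the classical
   "word has a zero byte" test; as plain C for one 32-bit word (assuming
   <stdint.h>) it reads:

     static int
     has_zero_byte (uint32_t w)
     {
       return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
     }

   Informally, w - 0x01010101 sets the top bit of every byte that was zero
   (via the borrow) or at least 0x81; ANDing with ~w discards the bytes whose
   top bit was already set in w, so only zero bytes survive the final mask.  */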
27886 if (TARGET_CMOVE)
27888 rtx reg = gen_reg_rtx (SImode);
27889 rtx reg2 = gen_reg_rtx (Pmode);
27890 emit_move_insn (reg, tmpreg);
27891 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27893 /* If zero is not in the first two bytes, move two bytes forward. */
27894 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27895 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27896 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27897 emit_insn (gen_rtx_SET (tmpreg,
27898 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27899 reg,
27900 tmpreg)));
27901 /* Emit lea manually to avoid clobbering of flags. */
27902 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27904 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27905 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27906 emit_insn (gen_rtx_SET (out,
27907 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27908 reg2,
27909 out)));
27911 else
27913 rtx_code_label *end_2_label = gen_label_rtx ();
27914 /* Is zero in the first two bytes? */
27916 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27917 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27918 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27919 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27920 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27921 pc_rtx);
27922 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27923 JUMP_LABEL (tmp) = end_2_label;
27925 /* Not in the first two. Move two bytes forward. */
27926 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27927 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27929 emit_label (end_2_label);
27933 /* Avoid branch in fixing the byte. */
27934 tmpreg = gen_lowpart (QImode, tmpreg);
27935 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27936 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27937 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27938 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27940 emit_label (end_0_label);
27943 /* Expand strlen. */
27945 bool
27946 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27948 rtx addr, scratch1, scratch2, scratch3, scratch4;
27950 /* The generic case of the strlen expander is long. Avoid expanding it
27951 unless TARGET_INLINE_ALL_STRINGOPS. */
27953 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27954 && !TARGET_INLINE_ALL_STRINGOPS
27955 && !optimize_insn_for_size_p ()
27956 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27957 return false;
27959 addr = force_reg (Pmode, XEXP (src, 0));
27960 scratch1 = gen_reg_rtx (Pmode);
27962 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27963 && !optimize_insn_for_size_p ())
27965 /* Well, it seems that some optimizer does not combine a call like
27966 foo(strlen(bar), strlen(bar));
27967 when the move and the subtraction are done here. It does calculate
27968 the length just once when these instructions are done inside
27969 output_strlen_unroll(). But since &bar[strlen(bar)] is
27970 often used, and this uses one fewer register for the lifetime of
27971 output_strlen_unroll(), this is better. */
27973 emit_move_insn (out, addr);
27975 ix86_expand_strlensi_unroll_1 (out, src, align);
27977 /* strlensi_unroll_1 returns the address of the zero at the end of
27978 the string, like memchr(), so compute the length by subtracting
27979 the start address. */
27980 emit_insn (ix86_gen_sub3 (out, out, addr));
27982 else
27984 rtx unspec;
27986 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27987 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27988 return false;
27989 /* Can't use this for non-default address spaces. */
27990 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27991 return false;
27993 scratch2 = gen_reg_rtx (Pmode);
27994 scratch3 = gen_reg_rtx (Pmode);
27995 scratch4 = force_reg (Pmode, constm1_rtx);
27997 emit_move_insn (scratch3, addr);
27998 eoschar = force_reg (QImode, eoschar);
28000 src = replace_equiv_address_nv (src, scratch3);
28002 /* If .md starts supporting :P, this can be done in .md. */
28003 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28004 scratch4), UNSPEC_SCAS);
28005 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28006 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28007 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28009 return true;
28012 /* For a given symbol (function), construct code to compute the address of its
28013 PLT entry in the large x86-64 PIC model. */
28014 static rtx
28015 construct_plt_address (rtx symbol)
28017 rtx tmp, unspec;
28019 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28020 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28021 gcc_assert (Pmode == DImode);
28023 tmp = gen_reg_rtx (Pmode);
28024 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28026 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28027 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28028 return tmp;
28032 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28033 rtx callarg2,
28034 rtx pop, bool sibcall)
28036 rtx vec[3];
28037 rtx use = NULL, call;
28038 unsigned int vec_len = 0;
28039 tree fndecl;
28041 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28043 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28044 if (fndecl
28045 && (lookup_attribute ("interrupt",
28046 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28047 error ("interrupt service routine can't be called directly");
28049 else
28050 fndecl = NULL_TREE;
28052 if (pop == const0_rtx)
28053 pop = NULL;
28054 gcc_assert (!TARGET_64BIT || !pop);
28056 if (TARGET_MACHO && !TARGET_64BIT)
28058 #if TARGET_MACHO
28059 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28060 fnaddr = machopic_indirect_call_target (fnaddr);
28061 #endif
28063 else
28065 /* Static functions and indirect calls don't need the pic register. Also,
28066 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28067 it an indirect call. */
28068 rtx addr = XEXP (fnaddr, 0);
28069 if (flag_pic
28070 && GET_CODE (addr) == SYMBOL_REF
28071 && !SYMBOL_REF_LOCAL_P (addr))
28073 if (flag_plt
28074 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28075 || !lookup_attribute ("noplt",
28076 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28078 if (!TARGET_64BIT
28079 || (ix86_cmodel == CM_LARGE_PIC
28080 && DEFAULT_ABI != MS_ABI))
28082 use_reg (&use, gen_rtx_REG (Pmode,
28083 REAL_PIC_OFFSET_TABLE_REGNUM));
28084 if (ix86_use_pseudo_pic_reg ())
28085 emit_move_insn (gen_rtx_REG (Pmode,
28086 REAL_PIC_OFFSET_TABLE_REGNUM),
28087 pic_offset_table_rtx);
28090 else if (!TARGET_PECOFF && !TARGET_MACHO)
28092 if (TARGET_64BIT)
28094 fnaddr = gen_rtx_UNSPEC (Pmode,
28095 gen_rtvec (1, addr),
28096 UNSPEC_GOTPCREL);
28097 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28099 else
28101 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28102 UNSPEC_GOT);
28103 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28104 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28105 fnaddr);
28107 fnaddr = gen_const_mem (Pmode, fnaddr);
28108 /* Pmode may not be the same as word_mode for x32, which
28109 doesn't support indirect branch via 32-bit memory slot.
28110 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28111 indirect branch via x32 GOT slot is OK. */
28112 if (GET_MODE (fnaddr) != word_mode)
28113 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28114 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28119 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28120 parameters passed in vector registers. */
28121 if (TARGET_64BIT
28122 && (INTVAL (callarg2) > 0
28123 || (INTVAL (callarg2) == 0
28124 && (TARGET_SSE || !flag_skip_rax_setup))))
28126 rtx al = gen_rtx_REG (QImode, AX_REG);
28127 emit_move_insn (al, callarg2);
28128 use_reg (&use, al);
28131 if (ix86_cmodel == CM_LARGE_PIC
28132 && !TARGET_PECOFF
28133 && MEM_P (fnaddr)
28134 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28135 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28136 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28137 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28138 branch via x32 GOT slot is OK. */
28139 else if (!(TARGET_X32
28140 && MEM_P (fnaddr)
28141 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28142 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28143 && (sibcall
28144 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28145 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28147 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28148 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28151 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28153 if (retval)
28155 /* We should add the bounds registers as destinations in case
28156 a pointer with bounds may be returned. */
28157 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28159 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28160 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28161 if (GET_CODE (retval) == PARALLEL)
28163 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28164 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28165 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28166 retval = chkp_join_splitted_slot (retval, par);
28168 else
28170 retval = gen_rtx_PARALLEL (VOIDmode,
28171 gen_rtvec (3, retval, b0, b1));
28172 chkp_put_regs_to_expr_list (retval);
28176 call = gen_rtx_SET (retval, call);
28178 vec[vec_len++] = call;
28180 if (pop)
28182 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28183 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28184 vec[vec_len++] = pop;
28187 if (cfun->machine->no_caller_saved_registers
28188 && (!fndecl
28189 || (!TREE_THIS_VOLATILE (fndecl)
28190 && !lookup_attribute ("no_caller_saved_registers",
28191 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28193 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28194 bool is_64bit_ms_abi = (TARGET_64BIT
28195 && ix86_function_abi (fndecl) == MS_ABI);
28196 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28198 /* If there are no caller-saved registers, add all registers
28199 that are clobbered by the call which returns. */
28200 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28201 if (!fixed_regs[i]
28202 && (ix86_call_used_regs[i] == 1
28203 || (ix86_call_used_regs[i] & c_mask))
28204 && !STACK_REGNO_P (i)
28205 && !MMX_REGNO_P (i))
28206 clobber_reg (&use,
28207 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28209 else if (TARGET_64BIT_MS_ABI
28210 && (!callarg2 || INTVAL (callarg2) != -2))
28212 int const cregs_size
28213 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
28214 int i;
28216 for (i = 0; i < cregs_size; i++)
28218 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28219 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28221 clobber_reg (&use, gen_rtx_REG (mode, regno));
28225 if (vec_len > 1)
28226 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28227 call = emit_call_insn (call);
28228 if (use)
28229 CALL_INSN_FUNCTION_USAGE (call) = use;
28231 return call;
28234 /* Return true if the function being called was marked with attribute
28235 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28236 to handle the non-PIC case in the backend because there is no easy
28237 interface for the front-end to force non-PLT calls to use the GOT.
28238 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28239 to call the function marked "noplt" indirectly. */
28241 static bool
28242 ix86_nopic_noplt_attribute_p (rtx call_op)
28244 if (flag_pic || ix86_cmodel == CM_LARGE
28245 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28246 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28247 || SYMBOL_REF_LOCAL_P (call_op))
28248 return false;
28250 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28252 if (!flag_plt
28253 || (symbol_decl != NULL_TREE
28254 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28255 return true;
28257 return false;
28260 /* Output the assembly for a call instruction. */
28262 const char *
28263 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28265 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28266 bool seh_nop_p = false;
28267 const char *xasm;
28269 if (SIBLING_CALL_P (insn))
28271 if (direct_p)
28273 if (ix86_nopic_noplt_attribute_p (call_op))
28275 if (TARGET_64BIT)
28276 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28277 else
28278 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28280 else
28281 xasm = "%!jmp\t%P0";
28283 /* SEH epilogue detection requires the indirect branch case
28284 to include REX.W. */
28285 else if (TARGET_SEH)
28286 xasm = "%!rex.W jmp\t%A0";
28287 else
28288 xasm = "%!jmp\t%A0";
28290 output_asm_insn (xasm, &call_op);
28291 return "";
28294 /* SEH unwinding can require an extra nop to be emitted in several
28295 circumstances. Determine if we have one of those. */
28296 if (TARGET_SEH)
28298 rtx_insn *i;
28300 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28302 /* If we get to another real insn, we don't need the nop. */
28303 if (INSN_P (i))
28304 break;
28306 /* If we get to the epilogue note, prevent a catch region from
28307 being adjacent to the standard epilogue sequence. If non-
28308 call-exceptions, we'll have done this during epilogue emission. */
28309 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28310 && !flag_non_call_exceptions
28311 && !can_throw_internal (insn))
28313 seh_nop_p = true;
28314 break;
28318 /* If we didn't find a real insn following the call, prevent the
28319 unwinder from looking into the next function. */
28320 if (i == NULL)
28321 seh_nop_p = true;
28324 if (direct_p)
28326 if (ix86_nopic_noplt_attribute_p (call_op))
28328 if (TARGET_64BIT)
28329 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28330 else
28331 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28333 else
28334 xasm = "%!call\t%P0";
28336 else
28337 xasm = "%!call\t%A0";
28339 output_asm_insn (xasm, &call_op);
28341 if (seh_nop_p)
28342 return "nop";
28344 return "";
28347 /* Clear stack slot assignments remembered from previous functions.
28348 This is called from INIT_EXPANDERS once before RTL is emitted for each
28349 function. */
28351 static struct machine_function *
28352 ix86_init_machine_status (void)
28354 struct machine_function *f;
28356 f = ggc_cleared_alloc<machine_function> ();
28357 f->use_fast_prologue_epilogue_nregs = -1;
28358 f->call_abi = ix86_abi;
28360 return f;
28363 /* Return a MEM corresponding to a stack slot with mode MODE.
28364 Allocate a new slot if necessary.
28366 The RTL for a function can have several slots available: N is
28367 which slot to use. */
28370 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28372 struct stack_local_entry *s;
28374 gcc_assert (n < MAX_386_STACK_LOCALS);
28376 for (s = ix86_stack_locals; s; s = s->next)
28377 if (s->mode == mode && s->n == n)
28378 return validize_mem (copy_rtx (s->rtl));
28380 s = ggc_alloc<stack_local_entry> ();
28381 s->n = n;
28382 s->mode = mode;
28383 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28385 s->next = ix86_stack_locals;
28386 ix86_stack_locals = s;
28387 return validize_mem (copy_rtx (s->rtl));
28390 static void
28391 ix86_instantiate_decls (void)
28393 struct stack_local_entry *s;
28395 for (s = ix86_stack_locals; s; s = s->next)
28396 if (s->rtl != NULL_RTX)
28397 instantiate_decl_rtl (s->rtl);
28400 /* Return the number used for encoding REG, in the range 0..7. */
28402 static int
28403 reg_encoded_number (rtx reg)
28405 unsigned regno = REGNO (reg);
28406 switch (regno)
28408 case AX_REG:
28409 return 0;
28410 case CX_REG:
28411 return 1;
28412 case DX_REG:
28413 return 2;
28414 case BX_REG:
28415 return 3;
28416 case SP_REG:
28417 return 4;
28418 case BP_REG:
28419 return 5;
28420 case SI_REG:
28421 return 6;
28422 case DI_REG:
28423 return 7;
28424 default:
28425 break;
28427 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28428 return regno - FIRST_STACK_REG;
28429 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28430 return regno - FIRST_SSE_REG;
28431 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28432 return regno - FIRST_MMX_REG;
28433 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28434 return regno - FIRST_REX_SSE_REG;
28435 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28436 return regno - FIRST_REX_INT_REG;
28437 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28438 return regno - FIRST_MASK_REG;
28439 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28440 return regno - FIRST_BND_REG;
28441 return -1;
28444 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28445 in its encoding if it could be relevant for ROP mitigation, otherwise
28446 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28447 used for calculating it into them. */
28449 static int
28450 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28451 int *popno0 = 0, int *popno1 = 0)
28453 if (asm_noperands (PATTERN (insn)) >= 0)
28454 return -1;
28455 int has_modrm = get_attr_modrm (insn);
28456 if (!has_modrm)
28457 return -1;
28458 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28459 rtx op0, op1;
28460 switch (cls)
28462 case MODRM_CLASS_OP02:
28463 gcc_assert (noperands >= 3);
28464 if (popno0)
28466 *popno0 = 0;
28467 *popno1 = 2;
28469 op0 = operands[0];
28470 op1 = operands[2];
28471 break;
28472 case MODRM_CLASS_OP01:
28473 gcc_assert (noperands >= 2);
28474 if (popno0)
28476 *popno0 = 0;
28477 *popno1 = 1;
28479 op0 = operands[0];
28480 op1 = operands[1];
28481 break;
28482 default:
28483 return -1;
28485 if (REG_P (op0) && REG_P (op1))
28487 int enc0 = reg_encoded_number (op0);
28488 int enc1 = reg_encoded_number (op1);
28489 return 0xc0 + (enc1 << 3) + enc0;
28491 return -1;
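/* Worked example (illustrative): for a register-register ALU insn such as
   "add %eax, %ecx", with op0 = %ecx (encoding 1) and op1 = %eax (encoding 0),
   the value computed above is 0xc0 + (0 << 3) + 1 = 0xc1.  */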
28494 /* Check whether x86 address PARTS is a pc-relative address. */
28496 static bool
28497 rip_relative_addr_p (struct ix86_address *parts)
28499 rtx base, index, disp;
28501 base = parts->base;
28502 index = parts->index;
28503 disp = parts->disp;
28505 if (disp && !base && !index)
28507 if (TARGET_64BIT)
28509 rtx symbol = disp;
28511 if (GET_CODE (disp) == CONST)
28512 symbol = XEXP (disp, 0);
28513 if (GET_CODE (symbol) == PLUS
28514 && CONST_INT_P (XEXP (symbol, 1)))
28515 symbol = XEXP (symbol, 0);
28517 if (GET_CODE (symbol) == LABEL_REF
28518 || (GET_CODE (symbol) == SYMBOL_REF
28519 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28520 || (GET_CODE (symbol) == UNSPEC
28521 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28522 || XINT (symbol, 1) == UNSPEC_PCREL
28523 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28524 return true;
28527 return false;
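/* Example (illustrative): in 64-bit mode an address consisting of just a
   non-TLS SYMBOL_REF "foo" (no base, no index) counts as pc-relative and is
   typically emitted as foo(%rip); an address with a base register such as
   8(%rdi) does not.  */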
28530 /* Calculate the length of the memory address in the instruction encoding.
28531 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28532 or other prefixes. We never generate addr32 prefix for LEA insn. */
28535 memory_address_length (rtx addr, bool lea)
28537 struct ix86_address parts;
28538 rtx base, index, disp;
28539 int len;
28540 int ok;
28542 if (GET_CODE (addr) == PRE_DEC
28543 || GET_CODE (addr) == POST_INC
28544 || GET_CODE (addr) == PRE_MODIFY
28545 || GET_CODE (addr) == POST_MODIFY)
28546 return 0;
28548 ok = ix86_decompose_address (addr, &parts);
28549 gcc_assert (ok);
28551 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28553 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
28554 if (TARGET_64BIT && !lea
28555 && (SImode_address_operand (addr, VOIDmode)
28556 || (parts.base && GET_MODE (parts.base) == SImode)
28557 || (parts.index && GET_MODE (parts.index) == SImode)))
28558 len++;
28560 base = parts.base;
28561 index = parts.index;
28562 disp = parts.disp;
28564 if (base && SUBREG_P (base))
28565 base = SUBREG_REG (base);
28566 if (index && SUBREG_P (index))
28567 index = SUBREG_REG (index);
28569 gcc_assert (base == NULL_RTX || REG_P (base));
28570 gcc_assert (index == NULL_RTX || REG_P (index));
28572 /* Rule of thumb:
28573 - esp as the base always wants an index,
28574 - ebp as the base always wants a displacement,
28575 - r12 as the base always wants an index,
28576 - r13 as the base always wants a displacement. */
28578 /* Register Indirect. */
28579 if (base && !index && !disp)
28581 /* esp (for its index) and ebp (for its displacement) need
28582 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28583 code. */
28584 if (base == arg_pointer_rtx
28585 || base == frame_pointer_rtx
28586 || REGNO (base) == SP_REG
28587 || REGNO (base) == BP_REG
28588 || REGNO (base) == R12_REG
28589 || REGNO (base) == R13_REG)
28590 len++;
28593 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28594 is not disp32, but disp32(%rip), so for disp32
28595 SIB byte is needed, unless print_operand_address
28596 optimizes it into disp32(%rip) or (%rip) is implied
28597 by UNSPEC. */
28598 else if (disp && !base && !index)
28600 len += 4;
28601 if (rip_relative_addr_p (&parts))
28602 len++;
28604 else
28606 /* Find the length of the displacement constant. */
28607 if (disp)
28609 if (base && satisfies_constraint_K (disp))
28610 len += 1;
28611 else
28612 len += 4;
28614 /* ebp always wants a displacement. Similarly r13. */
28615 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28616 len++;
28618 /* An index requires the two-byte modrm form.... */
28619 if (index
28620 /* ...like esp (or r12), which always wants an index. */
28621 || base == arg_pointer_rtx
28622 || base == frame_pointer_rtx
28623 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28624 len++;
28627 return len;
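/* Worked examples (illustrative; generic address space, 32-bit registers,
   excluding the one-byte modrm accounted for elsewhere):
     (%ecx)         -> 0
     (%ebp)         -> 1   (ebp as base forces a disp8)
     4(%ecx)        -> 1   (disp8 via constraint K)
     128(%ecx)      -> 4   (disp32)
     (%ecx,%eax,2)  -> 1   (index forces the SIB byte)
     4(%esp)        -> 2   (SIB byte plus disp8)  */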
28630 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28631 is set, expect that the insn has an 8-bit immediate alternative. */
28633 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28635 int len = 0;
28636 int i;
28637 extract_insn_cached (insn);
28638 for (i = recog_data.n_operands - 1; i >= 0; --i)
28639 if (CONSTANT_P (recog_data.operand[i]))
28641 enum attr_mode mode = get_attr_mode (insn);
28643 gcc_assert (!len);
28644 if (shortform && CONST_INT_P (recog_data.operand[i]))
28646 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28647 switch (mode)
28649 case MODE_QI:
28650 len = 1;
28651 continue;
28652 case MODE_HI:
28653 ival = trunc_int_for_mode (ival, HImode);
28654 break;
28655 case MODE_SI:
28656 ival = trunc_int_for_mode (ival, SImode);
28657 break;
28658 default:
28659 break;
28661 if (IN_RANGE (ival, -128, 127))
28663 len = 1;
28664 continue;
28667 switch (mode)
28669 case MODE_QI:
28670 len = 1;
28671 break;
28672 case MODE_HI:
28673 len = 2;
28674 break;
28675 case MODE_SI:
28676 len = 4;
28677 break;
28678 /* Immediates for DImode instructions are encoded
28679 as 32bit sign extended values. */
28680 case MODE_DI:
28681 len = 4;
28682 break;
28683 default:
28684 fatal_insn ("unknown insn mode", insn);
28687 return len;
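/* Worked examples (illustrative): an SImode insn with immediate 300
   contributes 4 bytes; with SHORTFORM set and immediate 3 it contributes
   only 1 byte, since 3 fits in the signed 8-bit range -128..127; a QImode
   immediate always contributes 1 byte.  */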
28690 /* Compute default value for "length_address" attribute. */
28692 ix86_attr_length_address_default (rtx_insn *insn)
28694 int i;
28696 if (get_attr_type (insn) == TYPE_LEA)
28698 rtx set = PATTERN (insn), addr;
28700 if (GET_CODE (set) == PARALLEL)
28701 set = XVECEXP (set, 0, 0);
28703 gcc_assert (GET_CODE (set) == SET);
28705 addr = SET_SRC (set);
28707 return memory_address_length (addr, true);
28710 extract_insn_cached (insn);
28711 for (i = recog_data.n_operands - 1; i >= 0; --i)
28713 rtx op = recog_data.operand[i];
28714 if (MEM_P (op))
28716 constrain_operands_cached (insn, reload_completed);
28717 if (which_alternative != -1)
28719 const char *constraints = recog_data.constraints[i];
28720 int alt = which_alternative;
28722 while (*constraints == '=' || *constraints == '+')
28723 constraints++;
28724 while (alt-- > 0)
28725 while (*constraints++ != ',')
28727 /* Skip ignored operands. */
28728 if (*constraints == 'X')
28729 continue;
28732 int len = memory_address_length (XEXP (op, 0), false);
28734 /* Account for segment prefix for non-default addr spaces. */
28735 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28736 len++;
28738 return len;
28741 return 0;
28744 /* Compute default value for "length_vex" attribute. It includes
28745 2 or 3 byte VEX prefix and 1 opcode byte. */
28748 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28749 bool has_vex_w)
28751 int i;
28753 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
28754 requires the 3-byte VEX prefix. */
28755 if (!has_0f_opcode || has_vex_w)
28756 return 3 + 1;
28758 /* We can always use 2 byte VEX prefix in 32bit. */
28759 if (!TARGET_64BIT)
28760 return 2 + 1;
28762 extract_insn_cached (insn);
28764 for (i = recog_data.n_operands - 1; i >= 0; --i)
28765 if (REG_P (recog_data.operand[i]))
28767 /* REX.W bit uses 3 byte VEX prefix. */
28768 if (GET_MODE (recog_data.operand[i]) == DImode
28769 && GENERAL_REG_P (recog_data.operand[i]))
28770 return 3 + 1;
28772 else
28774 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28775 if (MEM_P (recog_data.operand[i])
28776 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28777 return 3 + 1;
28780 return 2 + 1;
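/* Example (illustrative): the 2-byte VEX prefix is C5 xx and the 3-byte
   prefix is C4 xx xx, so the values returned above are 2 + 1 = 3 or
   3 + 1 = 4 bytes including the opcode byte.  Per the checks above, a
   DImode general register operand (needing REX.W) or a memory operand
   mentioning an extended register (needing REX.X or REX.B) forces the
   3-byte form.  */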
28783 /* Return the maximum number of instructions a cpu can issue. */
28785 static int
28786 ix86_issue_rate (void)
28788 switch (ix86_tune)
28790 case PROCESSOR_PENTIUM:
28791 case PROCESSOR_LAKEMONT:
28792 case PROCESSOR_BONNELL:
28793 case PROCESSOR_SILVERMONT:
28794 case PROCESSOR_KNL:
28795 case PROCESSOR_INTEL:
28796 case PROCESSOR_K6:
28797 case PROCESSOR_BTVER2:
28798 case PROCESSOR_PENTIUM4:
28799 case PROCESSOR_NOCONA:
28800 return 2;
28802 case PROCESSOR_PENTIUMPRO:
28803 case PROCESSOR_ATHLON:
28804 case PROCESSOR_K8:
28805 case PROCESSOR_AMDFAM10:
28806 case PROCESSOR_GENERIC:
28807 case PROCESSOR_BTVER1:
28808 return 3;
28810 case PROCESSOR_BDVER1:
28811 case PROCESSOR_BDVER2:
28812 case PROCESSOR_BDVER3:
28813 case PROCESSOR_BDVER4:
28814 case PROCESSOR_ZNVER1:
28815 case PROCESSOR_CORE2:
28816 case PROCESSOR_NEHALEM:
28817 case PROCESSOR_SANDYBRIDGE:
28818 case PROCESSOR_HASWELL:
28819 return 4;
28821 default:
28822 return 1;
28826 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
28827 by DEP_INSN and nothing else set by DEP_INSN. */
28829 static bool
28830 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
28832 rtx set, set2;
28834 /* Simplify the test for uninteresting insns. */
28835 if (insn_type != TYPE_SETCC
28836 && insn_type != TYPE_ICMOV
28837 && insn_type != TYPE_FCMOV
28838 && insn_type != TYPE_IBR)
28839 return false;
28841 if ((set = single_set (dep_insn)) != 0)
28843 set = SET_DEST (set);
28844 set2 = NULL_RTX;
28846 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
28847 && XVECLEN (PATTERN (dep_insn), 0) == 2
28848 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
28849 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
28851 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
28852 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
28854 else
28855 return false;
28857 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
28858 return false;
28860 /* This test is true if the dependent insn reads the flags but
28861 not any other potentially set register. */
28862 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
28863 return false;
28865 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
28866 return false;
28868 return true;
28871 /* Return true iff USE_INSN has a memory address with operands set by
28872 SET_INSN. */
28874 bool
28875 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
28877 int i;
28878 extract_insn_cached (use_insn);
28879 for (i = recog_data.n_operands - 1; i >= 0; --i)
28880 if (MEM_P (recog_data.operand[i]))
28882 rtx addr = XEXP (recog_data.operand[i], 0);
28883 return modified_in_p (addr, set_insn) != 0;
28885 return false;
28888 /* Helper function for exact_store_load_dependency.
28889 Return true if addr is found in insn. */
28890 static bool
28891 exact_dependency_1 (rtx addr, rtx insn)
28893 enum rtx_code code;
28894 const char *format_ptr;
28895 int i, j;
28897 code = GET_CODE (insn);
28898 switch (code)
28900 case MEM:
28901 if (rtx_equal_p (addr, insn))
28902 return true;
28903 break;
28904 case REG:
28905 CASE_CONST_ANY:
28906 case SYMBOL_REF:
28907 case CODE_LABEL:
28908 case PC:
28909 case CC0:
28910 case EXPR_LIST:
28911 return false;
28912 default:
28913 break;
28916 format_ptr = GET_RTX_FORMAT (code);
28917 for (i = 0; i < GET_RTX_LENGTH (code); i++)
28919 switch (*format_ptr++)
28921 case 'e':
28922 if (exact_dependency_1 (addr, XEXP (insn, i)))
28923 return true;
28924 break;
28925 case 'E':
28926 for (j = 0; j < XVECLEN (insn, i); j++)
28927 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
28928 return true;
28929 break;
28932 return false;
28935 /* Return true if there exists an exact dependency between STORE and LOAD, i.e.
28936 the same memory address is used in both. */
28937 static bool
28938 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
28940 rtx set1, set2;
28942 set1 = single_set (store);
28943 if (!set1)
28944 return false;
28945 if (!MEM_P (SET_DEST (set1)))
28946 return false;
28947 set2 = single_set (load);
28948 if (!set2)
28949 return false;
28950 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
28951 return true;
28952 return false;
28955 static int
28956 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
28957 unsigned int)
28959 enum attr_type insn_type, dep_insn_type;
28960 enum attr_memory memory;
28961 rtx set, set2;
28962 int dep_insn_code_number;
28964 /* Anti and output dependencies have zero cost on all CPUs. */
28965 if (dep_type != 0)
28966 return 0;
28968 dep_insn_code_number = recog_memoized (dep_insn);
28970 /* If we can't recognize the insns, we can't really do anything. */
28971 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
28972 return cost;
28974 insn_type = get_attr_type (insn);
28975 dep_insn_type = get_attr_type (dep_insn);
28977 switch (ix86_tune)
28979 case PROCESSOR_PENTIUM:
28980 case PROCESSOR_LAKEMONT:
28981 /* Address Generation Interlock adds a cycle of latency. */
28982 if (insn_type == TYPE_LEA)
28984 rtx addr = PATTERN (insn);
28986 if (GET_CODE (addr) == PARALLEL)
28987 addr = XVECEXP (addr, 0, 0);
28989 gcc_assert (GET_CODE (addr) == SET);
28991 addr = SET_SRC (addr);
28992 if (modified_in_p (addr, dep_insn))
28993 cost += 1;
28995 else if (ix86_agi_dependent (dep_insn, insn))
28996 cost += 1;
28998 /* ??? Compares pair with jump/setcc. */
28999 if (ix86_flags_dependent (insn, dep_insn, insn_type))
29000 cost = 0;
29002 /* Floating point stores require value to be ready one cycle earlier. */
29003 if (insn_type == TYPE_FMOV
29004 && get_attr_memory (insn) == MEMORY_STORE
29005 && !ix86_agi_dependent (dep_insn, insn))
29006 cost += 1;
29007 break;
29009 case PROCESSOR_PENTIUMPRO:
29010 /* INT->FP conversion is expensive. */
29011 if (get_attr_fp_int_src (dep_insn))
29012 cost += 5;
29014 /* There is one cycle extra latency between an FP op and a store. */
29015 if (insn_type == TYPE_FMOV
29016 && (set = single_set (dep_insn)) != NULL_RTX
29017 && (set2 = single_set (insn)) != NULL_RTX
29018 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
29019 && MEM_P (SET_DEST (set2)))
29020 cost += 1;
29022 memory = get_attr_memory (insn);
29024 /* Show ability of reorder buffer to hide latency of load by executing
29025 in parallel with previous instruction in case
29026 previous instruction is not needed to compute the address. */
29027 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29028 && !ix86_agi_dependent (dep_insn, insn))
29030 /* Claim moves take one cycle, as the core can issue one load
29031 at a time and the next load can start a cycle later. */
29032 if (dep_insn_type == TYPE_IMOV
29033 || dep_insn_type == TYPE_FMOV)
29034 cost = 1;
29035 else if (cost > 1)
29036 cost--;
29038 break;
29040 case PROCESSOR_K6:
29041 /* The esp dependency is resolved before
29042 the instruction is really finished. */
29043 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29044 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29045 return 1;
29047 /* INT->FP conversion is expensive. */
29048 if (get_attr_fp_int_src (dep_insn))
29049 cost += 5;
29051 memory = get_attr_memory (insn);
29053 /* Show ability of reorder buffer to hide latency of load by executing
29054 in parallel with previous instruction in case
29055 previous instruction is not needed to compute the address. */
29056 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29057 && !ix86_agi_dependent (dep_insn, insn))
29059 /* Claim moves take one cycle, as the core can issue one load
29060 at a time and the next load can start a cycle later. */
29061 if (dep_insn_type == TYPE_IMOV
29062 || dep_insn_type == TYPE_FMOV)
29063 cost = 1;
29064 else if (cost > 2)
29065 cost -= 2;
29066 else
29067 cost = 1;
29069 break;
29071 case PROCESSOR_AMDFAM10:
29072 case PROCESSOR_BDVER1:
29073 case PROCESSOR_BDVER2:
29074 case PROCESSOR_BDVER3:
29075 case PROCESSOR_BDVER4:
29076 case PROCESSOR_ZNVER1:
29077 case PROCESSOR_BTVER1:
29078 case PROCESSOR_BTVER2:
29079 case PROCESSOR_GENERIC:
29080 /* The stack engine allows push and pop instructions to execute in parallel. */
29081 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29082 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29083 return 0;
29084 /* FALLTHRU */
29086 case PROCESSOR_ATHLON:
29087 case PROCESSOR_K8:
29088 memory = get_attr_memory (insn);
29090 /* Show ability of reorder buffer to hide latency of load by executing
29091 in parallel with previous instruction in case
29092 previous instruction is not needed to compute the address. */
29093 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29094 && !ix86_agi_dependent (dep_insn, insn))
29096 enum attr_unit unit = get_attr_unit (insn);
29097 int loadcost = 3;
29099 /* Because of the difference between the length of integer and
29100 floating unit pipeline preparation stages, the memory operands
29101 for floating point are cheaper.
29103 ??? For Athlon the difference is most probably 2. */
29104 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
29105 loadcost = 3;
29106 else
29107 loadcost = TARGET_ATHLON ? 2 : 0;
29109 if (cost >= loadcost)
29110 cost -= loadcost;
29111 else
29112 cost = 0;
29114 break;
29116 case PROCESSOR_CORE2:
29117 case PROCESSOR_NEHALEM:
29118 case PROCESSOR_SANDYBRIDGE:
29119 case PROCESSOR_HASWELL:
29120 /* The stack engine allows push and pop instructions to execute in parallel. */
29121 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29122 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29123 return 0;
29125 memory = get_attr_memory (insn);
29127 /* Show ability of reorder buffer to hide latency of load by executing
29128 in parallel with previous instruction in case
29129 previous instruction is not needed to compute the address. */
29130 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29131 && !ix86_agi_dependent (dep_insn, insn))
29133 if (cost >= 4)
29134 cost -= 4;
29135 else
29136 cost = 0;
29138 break;
29140 case PROCESSOR_SILVERMONT:
29141 case PROCESSOR_KNL:
29142 case PROCESSOR_INTEL:
29143 if (!reload_completed)
29144 return cost;
29146 /* Increase cost of integer loads. */
29147 memory = get_attr_memory (dep_insn);
29148 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29150 enum attr_unit unit = get_attr_unit (dep_insn);
29151 if (unit == UNIT_INTEGER && cost == 1)
29153 if (memory == MEMORY_LOAD)
29154 cost = 3;
29155 else
29157 /* Increase cost of ld/st for short int types only
29158 because of store forwarding issue. */
29159 rtx set = single_set (dep_insn);
29160 if (set && (GET_MODE (SET_DEST (set)) == QImode
29161 || GET_MODE (SET_DEST (set)) == HImode))
29163 /* Increase cost of store/load insn if exact
29164 dependence exists and it is load insn. */
29165 enum attr_memory insn_memory = get_attr_memory (insn);
29166 if (insn_memory == MEMORY_LOAD
29167 && exact_store_load_dependency (dep_insn, insn))
29168 cost = 3;
29174 default:
29175 break;
29178 return cost;
29181 /* How many alternative schedules to try. This should be as wide as the
29182 scheduling freedom in the DFA, but no wider. Making this value too
29183 large results in extra work for the scheduler. */
29185 static int
29186 ia32_multipass_dfa_lookahead (void)
29188 switch (ix86_tune)
29190 case PROCESSOR_PENTIUM:
29191 case PROCESSOR_LAKEMONT:
29192 return 2;
29194 case PROCESSOR_PENTIUMPRO:
29195 case PROCESSOR_K6:
29196 return 1;
29198 case PROCESSOR_BDVER1:
29199 case PROCESSOR_BDVER2:
29200 case PROCESSOR_BDVER3:
29201 case PROCESSOR_BDVER4:
29202 /* We use lookahead value 4 for BD both before and after reload
29203 schedules. Plan is to have value 8 included for O3. */
29204 return 4;
29206 case PROCESSOR_CORE2:
29207 case PROCESSOR_NEHALEM:
29208 case PROCESSOR_SANDYBRIDGE:
29209 case PROCESSOR_HASWELL:
29210 case PROCESSOR_BONNELL:
29211 case PROCESSOR_SILVERMONT:
29212 case PROCESSOR_KNL:
29213 case PROCESSOR_INTEL:
29214 /* Generally, we want haifa-sched:max_issue() to look ahead as far
29215 as the number of instructions that can be executed in a cycle, i.e.,
29216 issue_rate. I wonder why tuning for many CPUs does not do this. */
29217 if (reload_completed)
29218 return ix86_issue_rate ();
29219 /* Don't use lookahead for pre-reload schedule to save compile time. */
29220 return 0;
29222 default:
29223 return 0;
29227 /* Return true if target platform supports macro-fusion. */
29229 static bool
29230 ix86_macro_fusion_p ()
29232 return TARGET_FUSE_CMP_AND_BRANCH;
29235 /* Check whether the current microarchitecture supports macro fusion
29236 for insn pair "CONDGEN + CONDJMP". Refer to
29237 "Intel Architectures Optimization Reference Manual". */
29239 static bool
29240 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
29242 rtx src, dest;
29243 enum rtx_code ccode;
29244 rtx compare_set = NULL_RTX, test_if, cond;
29245 rtx alu_set = NULL_RTX, addr = NULL_RTX;
29247 if (!any_condjump_p (condjmp))
29248 return false;
29250 if (get_attr_type (condgen) != TYPE_TEST
29251 && get_attr_type (condgen) != TYPE_ICMP
29252 && get_attr_type (condgen) != TYPE_INCDEC
29253 && get_attr_type (condgen) != TYPE_ALU)
29254 return false;
29256 compare_set = single_set (condgen);
29257 if (compare_set == NULL_RTX
29258 && !TARGET_FUSE_ALU_AND_BRANCH)
29259 return false;
29261 if (compare_set == NULL_RTX)
29263 int i;
29264 rtx pat = PATTERN (condgen);
29265 for (i = 0; i < XVECLEN (pat, 0); i++)
29266 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
29268 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
29269 if (GET_CODE (set_src) == COMPARE)
29270 compare_set = XVECEXP (pat, 0, i);
29271 else
29272 alu_set = XVECEXP (pat, 0, i);
29275 if (compare_set == NULL_RTX)
29276 return false;
29277 src = SET_SRC (compare_set);
29278 if (GET_CODE (src) != COMPARE)
29279 return false;
29281 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
29282 supported. */
29283 if ((MEM_P (XEXP (src, 0))
29284 && CONST_INT_P (XEXP (src, 1)))
29285 || (MEM_P (XEXP (src, 1))
29286 && CONST_INT_P (XEXP (src, 0))))
29287 return false;
29289 /* No fusion for RIP-relative address. */
29290 if (MEM_P (XEXP (src, 0)))
29291 addr = XEXP (XEXP (src, 0), 0);
29292 else if (MEM_P (XEXP (src, 1)))
29293 addr = XEXP (XEXP (src, 1), 0);
29295 if (addr) {
29296 ix86_address parts;
29297 int ok = ix86_decompose_address (addr, &parts);
29298 gcc_assert (ok);
29300 if (rip_relative_addr_p (&parts))
29301 return false;
29304 test_if = SET_SRC (pc_set (condjmp));
29305 cond = XEXP (test_if, 0);
29306 ccode = GET_CODE (cond);
29307 /* Check whether the conditional jump uses the Sign or Overflow flags. */
29308 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
29309 && (ccode == GE
29310 || ccode == GT
29311 || ccode == LE
29312 || ccode == LT))
29313 return false;
29315 /* Return true for TYPE_TEST and TYPE_ICMP. */
29316 if (get_attr_type (condgen) == TYPE_TEST
29317 || get_attr_type (condgen) == TYPE_ICMP)
29318 return true;
29320 /* The following handles macro-fusion for the alu + jmp case. */
29321 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
29322 return false;
29324 /* No fusion for alu op with memory destination operand. */
29325 dest = SET_DEST (alu_set);
29326 if (MEM_P (dest))
29327 return false;
29329 /* Macro-fusion for inc/dec + unsigned conditional jump is not
29330 supported. */
29331 if (get_attr_type (condgen) == TYPE_INCDEC
29332 && (ccode == GEU
29333 || ccode == GTU
29334 || ccode == LEU
29335 || ccode == LTU))
29336 return false;
29338 return true;
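/* Examples (illustrative): "cmp %eax, %ebx; jne .L1" is a fusible pair;
   "cmpl $1, (%rdi); jne .L1" is not (MEM-IMM compare);
   "dec %eax; jae .L1" is not (inc/dec with an unsigned condition); and on
   TARGET_FUSE_ALU_AND_BRANCH targets "add %ebx, %eax; jne .L1" may fuse,
   provided the add does not write to memory.  */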
29341 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
29342 execution. It is applied if
29343 (1) IMUL instruction is on the top of list;
29344 (2) There exists only one producer of an independent IMUL instruction in
29345 the ready list.
29346 Return index of IMUL producer if it was found and -1 otherwise. */
29347 static int
29348 do_reorder_for_imul (rtx_insn **ready, int n_ready)
29350 rtx_insn *insn;
29351 rtx set, insn1, insn2;
29352 sd_iterator_def sd_it;
29353 dep_t dep;
29354 int index = -1;
29355 int i;
29357 if (!TARGET_BONNELL)
29358 return index;
29360 /* Check that IMUL instruction is on the top of ready list. */
29361 insn = ready[n_ready - 1];
29362 set = single_set (insn);
29363 if (!set)
29364 return index;
29365 if (!(GET_CODE (SET_SRC (set)) == MULT
29366 && GET_MODE (SET_SRC (set)) == SImode))
29367 return index;
29369 /* Search for producer of independent IMUL instruction. */
29370 for (i = n_ready - 2; i >= 0; i--)
29372 insn = ready[i];
29373 if (!NONDEBUG_INSN_P (insn))
29374 continue;
29375 /* Skip IMUL instruction. */
29376 insn2 = PATTERN (insn);
29377 if (GET_CODE (insn2) == PARALLEL)
29378 insn2 = XVECEXP (insn2, 0, 0);
29379 if (GET_CODE (insn2) == SET
29380 && GET_CODE (SET_SRC (insn2)) == MULT
29381 && GET_MODE (SET_SRC (insn2)) == SImode)
29382 continue;
29384 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
29386 rtx con;
29387 con = DEP_CON (dep);
29388 if (!NONDEBUG_INSN_P (con))
29389 continue;
29390 insn1 = PATTERN (con);
29391 if (GET_CODE (insn1) == PARALLEL)
29392 insn1 = XVECEXP (insn1, 0, 0);
29394 if (GET_CODE (insn1) == SET
29395 && GET_CODE (SET_SRC (insn1)) == MULT
29396 && GET_MODE (SET_SRC (insn1)) == SImode)
29398 sd_iterator_def sd_it1;
29399 dep_t dep1;
29400 /* Check if there is no other dependee for IMUL. */
29401 index = i;
29402 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
29404 rtx pro;
29405 pro = DEP_PRO (dep1);
29406 if (!NONDEBUG_INSN_P (pro))
29407 continue;
29408 if (pro != insn)
29409 index = -1;
29411 if (index >= 0)
29412 break;
29415 if (index >= 0)
29416 break;
29418 return index;
29421 /* Try to find the best candidate on the top of ready list if two insns
29422 have the same priority - candidate is best if its dependees were
29423 scheduled earlier. Applied for Silvermont only.
29424 Return true if top 2 insns must be interchanged. */
29425 static bool
29426 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
29428 rtx_insn *top = ready[n_ready - 1];
29429 rtx_insn *next = ready[n_ready - 2];
29430 rtx set;
29431 sd_iterator_def sd_it;
29432 dep_t dep;
29433 int clock1 = -1;
29434 int clock2 = -1;
29435 #define INSN_TICK(INSN) (HID (INSN)->tick)
29437 if (!TARGET_SILVERMONT && !TARGET_INTEL)
29438 return false;
29440 if (!NONDEBUG_INSN_P (top))
29441 return false;
29442 if (!NONJUMP_INSN_P (top))
29443 return false;
29444 if (!NONDEBUG_INSN_P (next))
29445 return false;
29446 if (!NONJUMP_INSN_P (next))
29447 return false;
29448 set = single_set (top);
29449 if (!set)
29450 return false;
29451 set = single_set (next);
29452 if (!set)
29453 return false;
29455 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
29457 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
29458 return false;
29459 /* Determine the winner more precisely. */
29460 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
29462 rtx pro;
29463 pro = DEP_PRO (dep);
29464 if (!NONDEBUG_INSN_P (pro))
29465 continue;
29466 if (INSN_TICK (pro) > clock1)
29467 clock1 = INSN_TICK (pro);
29469 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
29471 rtx pro;
29472 pro = DEP_PRO (dep);
29473 if (!NONDEBUG_INSN_P (pro))
29474 continue;
29475 if (INSN_TICK (pro) > clock2)
29476 clock2 = INSN_TICK (pro);
29479 if (clock1 == clock2)
29481 /* Determine winner - load must win. */
29482 enum attr_memory memory1, memory2;
29483 memory1 = get_attr_memory (top);
29484 memory2 = get_attr_memory (next);
29485 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
29486 return true;
29488 return (bool) (clock2 < clock1);
29490 return false;
29491 #undef INSN_TICK
29494 /* Perform possible reordering of the ready list for Atom/Silvermont only.
29495 Return issue rate. */
29496 static int
29497 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
29498 int *pn_ready, int clock_var)
29500 int issue_rate = -1;
29501 int n_ready = *pn_ready;
29502 int i;
29503 rtx_insn *insn;
29504 int index = -1;
29506 /* Set up issue rate. */
29507 issue_rate = ix86_issue_rate ();
29509 /* Do reordering for BONNELL/SILVERMONT only. */
29510 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
29511 return issue_rate;
29513 /* Nothing to do if ready list contains only 1 instruction. */
29514 if (n_ready <= 1)
29515 return issue_rate;
29517 /* Do reordering for the post-reload scheduler only. */
29518 if (!reload_completed)
29519 return issue_rate;
29521 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
29523 if (sched_verbose > 1)
29524 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
29525 INSN_UID (ready[index]));
29527 /* Put IMUL producer (ready[index]) at the top of ready list. */
29528 insn = ready[index];
29529 for (i = index; i < n_ready - 1; i++)
29530 ready[i] = ready[i + 1];
29531 ready[n_ready - 1] = insn;
29532 return issue_rate;
29535 /* Skip selective scheduling since HID is not populated in it. */
29536 if (clock_var != 0
29537 && !sel_sched_p ()
29538 && swap_top_of_ready_list (ready, n_ready))
29540 if (sched_verbose > 1)
29541 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
29542 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
29543 /* Swap 2 top elements of ready list. */
29544 insn = ready[n_ready - 1];
29545 ready[n_ready - 1] = ready[n_ready - 2];
29546 ready[n_ready - 2] = insn;
29548 return issue_rate;
29551 static bool
29552 ix86_class_likely_spilled_p (reg_class_t);
29554 /* Return true if the lhs of INSN is a HW function argument register, and set
29555 is_spilled to true if it is a likely-spilled HW register. */
29556 static bool
29557 insn_is_function_arg (rtx insn, bool* is_spilled)
29559 rtx dst;
29561 if (!NONDEBUG_INSN_P (insn))
29562 return false;
29563 /* Call instructions are not movable, so ignore them. */
29564 if (CALL_P (insn))
29565 return false;
29566 insn = PATTERN (insn);
29567 if (GET_CODE (insn) == PARALLEL)
29568 insn = XVECEXP (insn, 0, 0);
29569 if (GET_CODE (insn) != SET)
29570 return false;
29571 dst = SET_DEST (insn);
29572 if (REG_P (dst) && HARD_REGISTER_P (dst)
29573 && ix86_function_arg_regno_p (REGNO (dst)))
29575 /* Is it likely spilled HW register? */
29576 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29577 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29578 *is_spilled = true;
29579 return true;
29581 return false;
29584 /* Add output dependencies for a chain of adjacent function arguments, but only
29585 if there is a move to a likely-spilled HW register. Return the first argument
29586 if at least one dependence was added, or NULL otherwise. */
29587 static rtx_insn *
29588 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29590 rtx_insn *insn;
29591 rtx_insn *last = call;
29592 rtx_insn *first_arg = NULL;
29593 bool is_spilled = false;
29595 head = PREV_INSN (head);
29597 /* Find the argument-passing instruction nearest to the call. */
29598 while (true)
29600 last = PREV_INSN (last);
29601 if (last == head)
29602 return NULL;
29603 if (!NONDEBUG_INSN_P (last))
29604 continue;
29605 if (insn_is_function_arg (last, &is_spilled))
29606 break;
29607 return NULL;
29610 first_arg = last;
29611 while (true)
29613 insn = PREV_INSN (last);
29614 if (!INSN_P (insn))
29615 break;
29616 if (insn == head)
29617 break;
29618 if (!NONDEBUG_INSN_P (insn))
29620 last = insn;
29621 continue;
29623 if (insn_is_function_arg (insn, &is_spilled))
29625 /* Add an output dependence between two function arguments if the chain
29626 of output arguments contains likely-spilled HW registers. */
29627 if (is_spilled)
29628 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29629 first_arg = last = insn;
29631 else
29632 break;
29634 if (!is_spilled)
29635 return NULL;
29636 return first_arg;
29639 /* Add output or anti dependency from insn to first_arg to restrict its code
29640 motion. */
29641 static void
29642 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29644 rtx set;
29645 rtx tmp;
29647 /* Add anti dependencies for bounds stores. */
29648 if (INSN_P (insn)
29649 && GET_CODE (PATTERN (insn)) == PARALLEL
29650 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29651 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29653 add_dependence (first_arg, insn, REG_DEP_ANTI);
29654 return;
29657 set = single_set (insn);
29658 if (!set)
29659 return;
29660 tmp = SET_DEST (set);
29661 if (REG_P (tmp))
29663 /* Add output dependency to the first function argument. */
29664 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29665 return;
29667 /* Add anti dependency. */
29668 add_dependence (first_arg, insn, REG_DEP_ANTI);
29671 /* Avoid cross-block motion of a function argument by adding a dependency
29672 from the first non-jump instruction in BB. */
29673 static void
29674 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29676 rtx_insn *insn = BB_END (bb);
29678 while (insn)
29680 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29682 rtx set = single_set (insn);
29683 if (set)
29685 avoid_func_arg_motion (arg, insn);
29686 return;
29689 if (insn == BB_HEAD (bb))
29690 return;
29691 insn = PREV_INSN (insn);
29695 /* Hook for pre-reload schedule - avoid motion of function arguments
29696 passed in likely spilled HW registers. */
29697 static void
29698 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29700 rtx_insn *insn;
29701 rtx_insn *first_arg = NULL;
29702 if (reload_completed)
29703 return;
29704 while (head != tail && DEBUG_INSN_P (head))
29705 head = NEXT_INSN (head);
29706 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29707 if (INSN_P (insn) && CALL_P (insn))
29709 first_arg = add_parameter_dependencies (insn, head);
29710 if (first_arg)
29712 /* Add a dependee for the first argument to predecessors, but only
29713 if the region contains more than one block. */
29714 basic_block bb = BLOCK_FOR_INSN (insn);
29715 int rgn = CONTAINING_RGN (bb->index);
29716 int nr_blks = RGN_NR_BLOCKS (rgn);
29717 /* Skip trivial regions and region head blocks that can have
29718 predecessors outside of region. */
29719 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29721 edge e;
29722 edge_iterator ei;
29724 /* Regions are SCCs with the exception of selective
29725 scheduling with pipelining of outer blocks enabled.
29726 So also check that immediate predecessors of a non-head
29727 block are in the same region. */
29728 FOR_EACH_EDGE (e, ei, bb->preds)
29730 /* Avoid creating loop-carried dependencies by using
29731 the topological ordering in the region. */
29732 if (rgn == CONTAINING_RGN (e->src->index)
29733 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29734 add_dependee_for_func_arg (first_arg, e->src);
29737 insn = first_arg;
29738 if (insn == head)
29739 break;
29742 else if (first_arg)
29743 avoid_func_arg_motion (first_arg, insn);
29746 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29747 HW registers to the maximum, to schedule them as soon as possible. These are
29748 moves from function argument registers at the top of the function entry
29749 and moves from function return value registers after call. */
29750 static int
29751 ix86_adjust_priority (rtx_insn *insn, int priority)
29753 rtx set;
29755 if (reload_completed)
29756 return priority;
29758 if (!NONDEBUG_INSN_P (insn))
29759 return priority;
29761 set = single_set (insn);
29762 if (set)
29764 rtx tmp = SET_SRC (set);
29765 if (REG_P (tmp)
29766 && HARD_REGISTER_P (tmp)
29767 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29768 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29769 return current_sched_info->sched_max_insns_priority;
29772 return priority;
29775 /* Model decoder of Core 2/i7.
29776 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
29777 track the instruction fetch block boundaries and make sure that long
29778 (9+ bytes) instructions are assigned to D0. */
29780 /* Maximum length of an insn that can be handled by
29781 a secondary decoder unit. '8' for Core 2/i7. */
29782 static int core2i7_secondary_decoder_max_insn_size;
29784 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
29785 '16' for Core 2/i7. */
29786 static int core2i7_ifetch_block_size;
29788 /* Maximum number of instructions decoder can handle per cycle.
29789 '6' for Core 2/i7. */
29790 static int core2i7_ifetch_block_max_insns;
29792 typedef struct ix86_first_cycle_multipass_data_ *
29793 ix86_first_cycle_multipass_data_t;
29794 typedef const struct ix86_first_cycle_multipass_data_ *
29795 const_ix86_first_cycle_multipass_data_t;
29797 /* A variable to store target state across calls to max_issue within
29798 one cycle. */
29799 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
29800 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
29802 /* Initialize DATA. */
29803 static void
29804 core2i7_first_cycle_multipass_init (void *_data)
29806 ix86_first_cycle_multipass_data_t data
29807 = (ix86_first_cycle_multipass_data_t) _data;
29809 data->ifetch_block_len = 0;
29810 data->ifetch_block_n_insns = 0;
29811 data->ready_try_change = NULL;
29812 data->ready_try_change_size = 0;
29815 /* Advancing the cycle; reset ifetch block counts. */
29816 static void
29817 core2i7_dfa_post_advance_cycle (void)
29819 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
29821 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29823 data->ifetch_block_len = 0;
29824 data->ifetch_block_n_insns = 0;
29827 static int min_insn_size (rtx_insn *);
29829 /* Filter out insns from ready_try that the core will not be able to issue
29830 on current cycle due to decoder. */
29831 static void
29832 core2i7_first_cycle_multipass_filter_ready_try
29833 (const_ix86_first_cycle_multipass_data_t data,
29834 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
29836 while (n_ready--)
29838 rtx_insn *insn;
29839 int insn_size;
29841 if (ready_try[n_ready])
29842 continue;
29844 insn = get_ready_element (n_ready);
29845 insn_size = min_insn_size (insn);
29847 if (/* If this is a too long an insn for a secondary decoder ... */
29848 (!first_cycle_insn_p
29849 && insn_size > core2i7_secondary_decoder_max_insn_size)
29850 /* ... or it would not fit into the ifetch block ... */
29851 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
29852 /* ... or the decoder is full already ... */
29853 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
29854 /* ... mask the insn out. */
29856 ready_try[n_ready] = 1;
29858 if (data->ready_try_change)
29859 bitmap_set_bit (data->ready_try_change, n_ready);
29864 /* Prepare for a new round of multipass lookahead scheduling. */
29865 static void
29866 core2i7_first_cycle_multipass_begin (void *_data,
29867 signed char *ready_try, int n_ready,
29868 bool first_cycle_insn_p)
29870 ix86_first_cycle_multipass_data_t data
29871 = (ix86_first_cycle_multipass_data_t) _data;
29872 const_ix86_first_cycle_multipass_data_t prev_data
29873 = ix86_first_cycle_multipass_data;
29875 /* Restore the state from the end of the previous round. */
29876 data->ifetch_block_len = prev_data->ifetch_block_len;
29877 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
29879 /* Filter instructions that cannot be issued on current cycle due to
29880 decoder restrictions. */
29881 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29882 first_cycle_insn_p);
29885 /* INSN is being issued in current solution. Account for its impact on
29886 the decoder model. */
29887 static void
29888 core2i7_first_cycle_multipass_issue (void *_data,
29889 signed char *ready_try, int n_ready,
29890 rtx_insn *insn, const void *_prev_data)
29892 ix86_first_cycle_multipass_data_t data
29893 = (ix86_first_cycle_multipass_data_t) _data;
29894 const_ix86_first_cycle_multipass_data_t prev_data
29895 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
29897 int insn_size = min_insn_size (insn);
29899 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
29900 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
29901 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
29902 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29904 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
29905 if (!data->ready_try_change)
29907 data->ready_try_change = sbitmap_alloc (n_ready);
29908 data->ready_try_change_size = n_ready;
29910 else if (data->ready_try_change_size < n_ready)
29912 data->ready_try_change = sbitmap_resize (data->ready_try_change,
29913 n_ready, 0);
29914 data->ready_try_change_size = n_ready;
29916 bitmap_clear (data->ready_try_change);
29918 /* Filter out insns from ready_try that the core will not be able to issue
29919 on current cycle due to decoder. */
29920 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29921 false);
29924 /* Revert the effect on ready_try. */
29925 static void
29926 core2i7_first_cycle_multipass_backtrack (const void *_data,
29927 signed char *ready_try,
29928 int n_ready ATTRIBUTE_UNUSED)
29930 const_ix86_first_cycle_multipass_data_t data
29931 = (const_ix86_first_cycle_multipass_data_t) _data;
29932 unsigned int i = 0;
29933 sbitmap_iterator sbi;
29935 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
29936 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
29938 ready_try[i] = 0;
29942 /* Save the result of multipass lookahead scheduling for the next round. */
29943 static void
29944 core2i7_first_cycle_multipass_end (const void *_data)
29946 const_ix86_first_cycle_multipass_data_t data
29947 = (const_ix86_first_cycle_multipass_data_t) _data;
29948 ix86_first_cycle_multipass_data_t next_data
29949 = ix86_first_cycle_multipass_data;
29951 if (data != NULL)
29953 next_data->ifetch_block_len = data->ifetch_block_len;
29954 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
29958 /* Deallocate target data. */
29959 static void
29960 core2i7_first_cycle_multipass_fini (void *_data)
29962 ix86_first_cycle_multipass_data_t data
29963 = (ix86_first_cycle_multipass_data_t) _data;
29965 if (data->ready_try_change)
29967 sbitmap_free (data->ready_try_change);
29968 data->ready_try_change = NULL;
29969 data->ready_try_change_size = 0;
29973 /* Prepare for scheduling pass. */
29974 static void
29975 ix86_sched_init_global (FILE *, int, int)
29977 /* Install scheduling hooks for current CPU. Some of these hooks are used
29978 in time-critical parts of the scheduler, so we only set them up when
29979 they are actually used. */
29980 switch (ix86_tune)
29982 case PROCESSOR_CORE2:
29983 case PROCESSOR_NEHALEM:
29984 case PROCESSOR_SANDYBRIDGE:
29985 case PROCESSOR_HASWELL:
29986 /* Do not perform multipass scheduling for pre-reload schedule
29987 to save compile time. */
29988 if (reload_completed)
29990 targetm.sched.dfa_post_advance_cycle
29991 = core2i7_dfa_post_advance_cycle;
29992 targetm.sched.first_cycle_multipass_init
29993 = core2i7_first_cycle_multipass_init;
29994 targetm.sched.first_cycle_multipass_begin
29995 = core2i7_first_cycle_multipass_begin;
29996 targetm.sched.first_cycle_multipass_issue
29997 = core2i7_first_cycle_multipass_issue;
29998 targetm.sched.first_cycle_multipass_backtrack
29999 = core2i7_first_cycle_multipass_backtrack;
30000 targetm.sched.first_cycle_multipass_end
30001 = core2i7_first_cycle_multipass_end;
30002 targetm.sched.first_cycle_multipass_fini
30003 = core2i7_first_cycle_multipass_fini;
30005 /* Set decoder parameters. */
30006 core2i7_secondary_decoder_max_insn_size = 8;
30007 core2i7_ifetch_block_size = 16;
30008 core2i7_ifetch_block_max_insns = 6;
30009 break;
30011 /* Fall through. */
30012 default:
30013 targetm.sched.dfa_post_advance_cycle = NULL;
30014 targetm.sched.first_cycle_multipass_init = NULL;
30015 targetm.sched.first_cycle_multipass_begin = NULL;
30016 targetm.sched.first_cycle_multipass_issue = NULL;
30017 targetm.sched.first_cycle_multipass_backtrack = NULL;
30018 targetm.sched.first_cycle_multipass_end = NULL;
30019 targetm.sched.first_cycle_multipass_fini = NULL;
30020 break;
30025 /* Compute the alignment given to a constant that is being placed in memory.
30026 EXP is the constant and ALIGN is the alignment that the object would
30027 ordinarily have.
30028 The value of this function is used instead of that alignment to align
30029 the object. */
30032 ix86_constant_alignment (tree exp, int align)
30034 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
30035 || TREE_CODE (exp) == INTEGER_CST)
30037 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
30038 return 64;
30039 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
30040 return 128;
30042 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
30043 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
30044 return BITS_PER_WORD;
30046 return align;
30049 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30050 the data type, and ALIGN is the alignment that the object would
30051 ordinarily have. */
30053 static int
30054 iamcu_alignment (tree type, int align)
30056 enum machine_mode mode;
30058 if (align < 32 || TYPE_USER_ALIGN (type))
30059 return align;
30061 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30062 bytes. */
30063 mode = TYPE_MODE (strip_array_types (type));
30064 switch (GET_MODE_CLASS (mode))
30066 case MODE_INT:
30067 case MODE_COMPLEX_INT:
30068 case MODE_COMPLEX_FLOAT:
30069 case MODE_FLOAT:
30070 case MODE_DECIMAL_FLOAT:
30071 return 32;
30072 default:
30073 return align;
30077 /* Compute the alignment for a static variable.
30078 TYPE is the data type, and ALIGN is the alignment that
30079 the object would ordinarily have. The value of this function is used
30080 instead of that alignment to align the object. */
30083 ix86_data_alignment (tree type, int align, bool opt)
30085 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30086 for symbols from other compilation units or symbols that don't need
30087 to bind locally. In order to preserve some ABI compatibility with
30088 those compilers, ensure we don't decrease alignment from what we
30089 used to assume. */
30091 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30093 /* A data structure equal to or greater than the size of a cache line
30094 (64 bytes in the Pentium 4 and other recent Intel processors, including
30095 processors based on Intel Core microarchitecture) should be aligned
30096 so that its base address is a multiple of a cache line size. */
30098 int max_align
30099 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30101 if (max_align < BITS_PER_WORD)
30102 max_align = BITS_PER_WORD;
30104 switch (ix86_align_data_type)
30106 case ix86_align_data_type_abi: opt = false; break;
30107 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30108 case ix86_align_data_type_cacheline: break;
30111 if (TARGET_IAMCU)
30112 align = iamcu_alignment (type, align);
30114 if (opt
30115 && AGGREGATE_TYPE_P (type)
30116 && TYPE_SIZE (type)
30117 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30119 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
30120 && align < max_align_compat)
30121 align = max_align_compat;
30122 if (wi::geu_p (TYPE_SIZE (type), max_align)
30123 && align < max_align)
30124 align = max_align;
30127 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30128 to a 16-byte boundary. */
30129 if (TARGET_64BIT)
30131 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30132 && TYPE_SIZE (type)
30133 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30134 && wi::geu_p (TYPE_SIZE (type), 128)
30135 && align < 128)
30136 return 128;
30139 if (!opt)
30140 return align;
30142 if (TREE_CODE (type) == ARRAY_TYPE)
30144 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30145 return 64;
30146 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30147 return 128;
30149 else if (TREE_CODE (type) == COMPLEX_TYPE)
30152 if (TYPE_MODE (type) == DCmode && align < 64)
30153 return 64;
30154 if ((TYPE_MODE (type) == XCmode
30155 || TYPE_MODE (type) == TCmode) && align < 128)
30156 return 128;
30158 else if ((TREE_CODE (type) == RECORD_TYPE
30159 || TREE_CODE (type) == UNION_TYPE
30160 || TREE_CODE (type) == QUAL_UNION_TYPE)
30161 && TYPE_FIELDS (type))
30163 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30164 return 64;
30165 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30166 return 128;
30168 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30169 || TREE_CODE (type) == INTEGER_TYPE)
30171 if (TYPE_MODE (type) == DFmode && align < 64)
30172 return 64;
30173 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30174 return 128;
30177 return align;
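/* Worked example (illustrative, assuming a 64-byte prefetch block): when
   optimizing, a static aggregate of 32 bytes or more is raised to 256-bit
   alignment (the GCC 4.8 compatibility bound described above), and one of
   64 bytes or more to 512 bits, i.e. a full cache line; in 64-bit mode any
   array of at least 16 bytes is raised to at least 128 bits by the ABI
   rule.  */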
30180 /* Compute the alignment for a local variable or a stack slot. EXP is
30181 the data type or decl itself, MODE is the widest mode available and
30182 ALIGN is the alignment that the object would ordinarily have. The
30183 value of this macro is used instead of that alignment to align the
30184 object. */
30186 unsigned int
30187 ix86_local_alignment (tree exp, machine_mode mode,
30188 unsigned int align)
30190 tree type, decl;
30192 if (exp && DECL_P (exp))
30194 type = TREE_TYPE (exp);
30195 decl = exp;
30197 else
30199 type = exp;
30200 decl = NULL;
30203 /* Don't do dynamic stack realignment for long long objects with
30204 -mpreferred-stack-boundary=2. */
30205 if (!TARGET_64BIT
30206 && align == 64
30207 && ix86_preferred_stack_boundary < 64
30208 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30209 && (!type || !TYPE_USER_ALIGN (type))
30210 && (!decl || !DECL_USER_ALIGN (decl)))
30211 align = 32;
30213 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30214 register in MODE. We will return the largest alignment of XF
30215 and DF. */
30216 if (!type)
30218 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30219 align = GET_MODE_ALIGNMENT (DFmode);
30220 return align;
30223 /* Don't increase alignment for Intel MCU psABI. */
30224 if (TARGET_IAMCU)
30225 return align;
30227 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
30228 to a 16-byte boundary. The exact wording is:
30230 An array uses the same alignment as its elements, except that a local or
30231 global array variable of length at least 16 bytes or
30232 a C99 variable-length array variable always has alignment of at least 16 bytes.
30234 This was added to allow use of aligned SSE instructions on arrays. This
30235 rule is meant for static storage (where the compiler cannot do the analysis
30236 by itself). We follow it for automatic variables only when convenient.
30237 We fully control everything in the function being compiled, and functions from
30238 other units cannot rely on the alignment.
30240 Exclude the va_list type. It is the common case of a local array where
30241 we cannot benefit from the alignment.
30243 TODO: Probably one should optimize for size only when var is not escaping. */
30244 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30245 && TARGET_SSE)
30247 if (AGGREGATE_TYPE_P (type)
30248 && (va_list_type_node == NULL_TREE
30249 || (TYPE_MAIN_VARIANT (type)
30250 != TYPE_MAIN_VARIANT (va_list_type_node)))
30251 && TYPE_SIZE (type)
30252 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30253 && wi::geu_p (TYPE_SIZE (type), 16)
30254 && align < 128)
30255 return 128;
30257 if (TREE_CODE (type) == ARRAY_TYPE)
30259 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30260 return 64;
30261 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30262 return 128;
30264 else if (TREE_CODE (type) == COMPLEX_TYPE)
30266 if (TYPE_MODE (type) == DCmode && align < 64)
30267 return 64;
30268 if ((TYPE_MODE (type) == XCmode
30269 || TYPE_MODE (type) == TCmode) && align < 128)
30270 return 128;
30272 else if ((TREE_CODE (type) == RECORD_TYPE
30273 || TREE_CODE (type) == UNION_TYPE
30274 || TREE_CODE (type) == QUAL_UNION_TYPE)
30275 && TYPE_FIELDS (type))
30277 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30278 return 64;
30279 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30280 return 128;
30282 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30283 || TREE_CODE (type) == INTEGER_TYPE)
30286 if (TYPE_MODE (type) == DFmode && align < 64)
30287 return 64;
30288 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30289 return 128;
30291 return align;
30294 /* Compute the minimum required alignment for dynamic stack realignment
30295 purposes for a local variable, parameter or a stack slot. EXP is
30296 the data type or decl itself, MODE is its mode and ALIGN is the
30297 alignment that the object would ordinarily have. */
30299 unsigned int
30300 ix86_minimum_alignment (tree exp, machine_mode mode,
30301 unsigned int align)
30303 tree type, decl;
30305 if (exp && DECL_P (exp))
30307 type = TREE_TYPE (exp);
30308 decl = exp;
30310 else
30312 type = exp;
30313 decl = NULL;
30316 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30317 return align;
30319 /* Don't do dynamic stack realignment for long long objects with
30320 -mpreferred-stack-boundary=2. */
30321 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30322 && (!type || !TYPE_USER_ALIGN (type))
30323 && (!decl || !DECL_USER_ALIGN (decl)))
30325 gcc_checking_assert (!TARGET_STV);
30326 return 32;
30329 return align;
30332 /* Find a location for the static chain incoming to a nested function.
30333 This is a register, unless all free registers are used by arguments. */
30335 static rtx
30336 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30338 unsigned regno;
30340 /* While this function won't be called by the middle-end when a static
30341 chain isn't needed, it's also used throughout the backend so it's
30342 easiest to keep this check centralized. */
30343 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
30344 return NULL;
30346 if (TARGET_64BIT)
30348 /* We always use R10 in 64-bit mode. */
30349 regno = R10_REG;
30351 else
30353 const_tree fntype, fndecl;
30354 unsigned int ccvt;
30356 /* By default in 32-bit mode we use ECX to pass the static chain. */
30357 regno = CX_REG;
30359 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30361 fntype = TREE_TYPE (fndecl_or_type);
30362 fndecl = fndecl_or_type;
30364 else
30366 fntype = fndecl_or_type;
30367 fndecl = NULL;
30370 ccvt = ix86_get_callcvt (fntype);
30371 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30373 /* Fastcall functions use ecx/edx for arguments, which leaves
30374 us with EAX for the static chain.
30375 Thiscall functions use ecx for arguments, which also
30376 leaves us with EAX for the static chain. */
30377 regno = AX_REG;
30379 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30381 /* Thiscall functions use ecx for arguments, which leaves
30382 us with EAX and EDX for the static chain.
30383 For ABI compatibility we use EAX. */
30384 regno = AX_REG;
30386 else if (ix86_function_regparm (fntype, fndecl) == 3)
30388 /* For regparm 3, we have no free call-clobbered registers in
30389 which to store the static chain. In order to implement this,
30390 we have the trampoline push the static chain to the stack.
30391 However, we can't push a value below the return address when
30392 we call the nested function directly, so we have to use an
30393 alternate entry point. For this we use ESI, and have the
30394 alternate entry point push ESI, so that things appear the
30395 same once we're executing the nested function. */
30396 if (incoming_p)
30398 if (fndecl == current_function_decl)
30399 ix86_static_chain_on_stack = true;
30400 return gen_frame_mem (SImode,
30401 plus_constant (Pmode,
30402 arg_pointer_rtx, -8));
30404 regno = SI_REG;
30408 return gen_rtx_REG (Pmode, regno);
30411 /* Emit RTL insns to initialize the variable parts of a trampoline.
30412 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30413 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30414 to be passed to the target function. */
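/* Rough sketch of the bytes emitted below, decoded from the opcode
   constants used (shown for the full movabs forms; the shorter movl
   encodings are used when the values fit in 32 bits or when
   ptr_mode == SImode):

     64-bit:   49 bb <fnaddr>     movabs $fnaddr, %r11
               49 ba <chain>      movabs $chain,  %r10
               49 ff e3 90        jmp *%r11; nop (pads the final
                                  write to a full 32-bit store)

     32-bit:   b8/b9/68 <chain>   mov $chain, %eax or %ecx, or push
               e9 <rel32>         jmp to the target, PC-relative

   The 32-bit displacement is computed relative to the end of the jmp,
   as the code below explains.  */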
30416 static void
30417 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30419 rtx mem, fnaddr;
30420 int opcode;
30421 int offset = 0;
30423 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30425 if (TARGET_64BIT)
30427 int size;
30429 /* Load the function address to r11. Try to load address using
30430 the shorter movl instead of movabs. We may want to support
30431 movq for kernel mode, but kernel does not use trampolines at
30432      the moment.  FNADDR is a 32-bit address and may not be in
30433 DImode when ptr_mode == SImode. Always use movl in this
30434 case. */
30435 if (ptr_mode == SImode
30436 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30438 fnaddr = copy_addr_to_reg (fnaddr);
30440 mem = adjust_address (m_tramp, HImode, offset);
30441 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30443 mem = adjust_address (m_tramp, SImode, offset + 2);
30444 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30445 offset += 6;
30447 else
30449 mem = adjust_address (m_tramp, HImode, offset);
30450 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30452 mem = adjust_address (m_tramp, DImode, offset + 2);
30453 emit_move_insn (mem, fnaddr);
30454 offset += 10;
30457 /* Load static chain using movabs to r10. Use the shorter movl
30458 instead of movabs when ptr_mode == SImode. */
30459 if (ptr_mode == SImode)
30461 opcode = 0xba41;
30462 size = 6;
30464 else
30466 opcode = 0xba49;
30467 size = 10;
30470 mem = adjust_address (m_tramp, HImode, offset);
30471 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30473 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30474 emit_move_insn (mem, chain_value);
30475 offset += size;
30477 /* Jump to r11; the last (unused) byte is a nop, only there to
30478 pad the write out to a single 32-bit store. */
30479 mem = adjust_address (m_tramp, SImode, offset);
30480 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30481 offset += 4;
30483 else
30485 rtx disp, chain;
30487 /* Depending on the static chain location, either load a register
30488 with a constant, or push the constant to the stack. All of the
30489 instructions are the same size. */
30490 chain = ix86_static_chain (fndecl, true);
30491 if (REG_P (chain))
30493 switch (REGNO (chain))
30495 case AX_REG:
30496 opcode = 0xb8; break;
30497 case CX_REG:
30498 opcode = 0xb9; break;
30499 default:
30500 gcc_unreachable ();
30503 else
30504 opcode = 0x68;
30506 mem = adjust_address (m_tramp, QImode, offset);
30507 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30509 mem = adjust_address (m_tramp, SImode, offset + 1);
30510 emit_move_insn (mem, chain_value);
30511 offset += 5;
30513 mem = adjust_address (m_tramp, QImode, offset);
30514 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30516 mem = adjust_address (m_tramp, SImode, offset + 1);
30518 /* Compute offset from the end of the jmp to the target function.
30519 In the case in which the trampoline stores the static chain on
30520 the stack, we need to skip the first insn which pushes the
30521 (call-saved) register static chain; this push is 1 byte. */
30522 offset += 5;
30523 disp = expand_binop (SImode, sub_optab, fnaddr,
30524 plus_constant (Pmode, XEXP (m_tramp, 0),
30525 offset - (MEM_P (chain) ? 1 : 0)),
30526 NULL_RTX, 1, OPTAB_DIRECT);
30527 emit_move_insn (mem, disp);
30530 gcc_assert (offset <= TRAMPOLINE_SIZE);
30532 #ifdef HAVE_ENABLE_EXECUTE_STACK
30533 #ifdef CHECK_EXECUTE_STACK_ENABLED
30534 if (CHECK_EXECUTE_STACK_ENABLED)
30535 #endif
30536 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30537 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
30538 #endif
30541 /* The following file contains several enumerations and data structures
30542 built from the definitions in i386-builtin-types.def. */
30544 #include "i386-builtin-types.inc"
30546 /* Table for the ix86 builtin non-function types. */
30547 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30549 /* Retrieve an element from the above table, building some of
30550 the types lazily. */
30552 static tree
30553 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30555 unsigned int index;
30556 tree type, itype;
30558 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30560 type = ix86_builtin_type_tab[(int) tcode];
30561 if (type != NULL)
30562 return type;
30564 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30565 if (tcode <= IX86_BT_LAST_VECT)
30567 machine_mode mode;
30569 index = tcode - IX86_BT_LAST_PRIM - 1;
30570 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30571 mode = ix86_builtin_type_vect_mode[index];
30573 type = build_vector_type_for_mode (itype, mode);
30575 else
30577 int quals;
30579 index = tcode - IX86_BT_LAST_VECT - 1;
30580 if (tcode <= IX86_BT_LAST_PTR)
30581 quals = TYPE_UNQUALIFIED;
30582 else
30583 quals = TYPE_QUAL_CONST;
30585 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30586 if (quals != TYPE_UNQUALIFIED)
30587 itype = build_qualified_type (itype, quals);
30589 type = build_pointer_type (itype);
30592 ix86_builtin_type_tab[(int) tcode] = type;
30593 return type;
30596 /* Table for the ix86 builtin function types. */
30597 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30599 /* Retrieve an element from the above table, building some of
30600 the types lazily. */
30602 static tree
30603 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30605 tree type;
30607 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30609 type = ix86_builtin_func_type_tab[(int) tcode];
30610 if (type != NULL)
30611 return type;
30613 if (tcode <= IX86_BT_LAST_FUNC)
30615 unsigned start = ix86_builtin_func_start[(int) tcode];
30616 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30617 tree rtype, atype, args = void_list_node;
30618 unsigned i;
30620 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30621 for (i = after - 1; i > start; --i)
30623 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30624 args = tree_cons (NULL, atype, args);
30627 type = build_function_type (rtype, args);
30629 else
30631 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30632 enum ix86_builtin_func_type icode;
30634 icode = ix86_builtin_func_alias_base[index];
30635 type = ix86_get_builtin_func_type (icode);
30638 ix86_builtin_func_type_tab[(int) tcode] = type;
30639 return type;
30643 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30644 bdesc_* arrays below should come first, then builtins for each bdesc_*
30645 array in ascending order, so that we can use direct array accesses. */
30646 enum ix86_builtins
30648 IX86_BUILTIN_MASKMOVQ,
30649 IX86_BUILTIN_LDMXCSR,
30650 IX86_BUILTIN_STMXCSR,
30651 IX86_BUILTIN_MASKMOVDQU,
30652 IX86_BUILTIN_PSLLDQ128,
30653 IX86_BUILTIN_CLFLUSH,
30654 IX86_BUILTIN_MONITOR,
30655 IX86_BUILTIN_MWAIT,
30656 IX86_BUILTIN_CLZERO,
30657 IX86_BUILTIN_VEC_INIT_V2SI,
30658 IX86_BUILTIN_VEC_INIT_V4HI,
30659 IX86_BUILTIN_VEC_INIT_V8QI,
30660 IX86_BUILTIN_VEC_EXT_V2DF,
30661 IX86_BUILTIN_VEC_EXT_V2DI,
30662 IX86_BUILTIN_VEC_EXT_V4SF,
30663 IX86_BUILTIN_VEC_EXT_V4SI,
30664 IX86_BUILTIN_VEC_EXT_V8HI,
30665 IX86_BUILTIN_VEC_EXT_V2SI,
30666 IX86_BUILTIN_VEC_EXT_V4HI,
30667 IX86_BUILTIN_VEC_EXT_V16QI,
30668 IX86_BUILTIN_VEC_SET_V2DI,
30669 IX86_BUILTIN_VEC_SET_V4SF,
30670 IX86_BUILTIN_VEC_SET_V4SI,
30671 IX86_BUILTIN_VEC_SET_V8HI,
30672 IX86_BUILTIN_VEC_SET_V4HI,
30673 IX86_BUILTIN_VEC_SET_V16QI,
30674 IX86_BUILTIN_GATHERSIV2DF,
30675 IX86_BUILTIN_GATHERSIV4DF,
30676 IX86_BUILTIN_GATHERDIV2DF,
30677 IX86_BUILTIN_GATHERDIV4DF,
30678 IX86_BUILTIN_GATHERSIV4SF,
30679 IX86_BUILTIN_GATHERSIV8SF,
30680 IX86_BUILTIN_GATHERDIV4SF,
30681 IX86_BUILTIN_GATHERDIV8SF,
30682 IX86_BUILTIN_GATHERSIV2DI,
30683 IX86_BUILTIN_GATHERSIV4DI,
30684 IX86_BUILTIN_GATHERDIV2DI,
30685 IX86_BUILTIN_GATHERDIV4DI,
30686 IX86_BUILTIN_GATHERSIV4SI,
30687 IX86_BUILTIN_GATHERSIV8SI,
30688 IX86_BUILTIN_GATHERDIV4SI,
30689 IX86_BUILTIN_GATHERDIV8SI,
30690 IX86_BUILTIN_VFMSUBSD3_MASK3,
30691 IX86_BUILTIN_VFMSUBSS3_MASK3,
30692 IX86_BUILTIN_GATHER3SIV8SF,
30693 IX86_BUILTIN_GATHER3SIV4SF,
30694 IX86_BUILTIN_GATHER3SIV4DF,
30695 IX86_BUILTIN_GATHER3SIV2DF,
30696 IX86_BUILTIN_GATHER3DIV8SF,
30697 IX86_BUILTIN_GATHER3DIV4SF,
30698 IX86_BUILTIN_GATHER3DIV4DF,
30699 IX86_BUILTIN_GATHER3DIV2DF,
30700 IX86_BUILTIN_GATHER3SIV8SI,
30701 IX86_BUILTIN_GATHER3SIV4SI,
30702 IX86_BUILTIN_GATHER3SIV4DI,
30703 IX86_BUILTIN_GATHER3SIV2DI,
30704 IX86_BUILTIN_GATHER3DIV8SI,
30705 IX86_BUILTIN_GATHER3DIV4SI,
30706 IX86_BUILTIN_GATHER3DIV4DI,
30707 IX86_BUILTIN_GATHER3DIV2DI,
30708 IX86_BUILTIN_SCATTERSIV8SF,
30709 IX86_BUILTIN_SCATTERSIV4SF,
30710 IX86_BUILTIN_SCATTERSIV4DF,
30711 IX86_BUILTIN_SCATTERSIV2DF,
30712 IX86_BUILTIN_SCATTERDIV8SF,
30713 IX86_BUILTIN_SCATTERDIV4SF,
30714 IX86_BUILTIN_SCATTERDIV4DF,
30715 IX86_BUILTIN_SCATTERDIV2DF,
30716 IX86_BUILTIN_SCATTERSIV8SI,
30717 IX86_BUILTIN_SCATTERSIV4SI,
30718 IX86_BUILTIN_SCATTERSIV4DI,
30719 IX86_BUILTIN_SCATTERSIV2DI,
30720 IX86_BUILTIN_SCATTERDIV8SI,
30721 IX86_BUILTIN_SCATTERDIV4SI,
30722 IX86_BUILTIN_SCATTERDIV4DI,
30723 IX86_BUILTIN_SCATTERDIV2DI,
30724 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30725 where all operands are 32-byte or 64-byte wide respectively. */
30726 IX86_BUILTIN_GATHERALTSIV4DF,
30727 IX86_BUILTIN_GATHERALTDIV8SF,
30728 IX86_BUILTIN_GATHERALTSIV4DI,
30729 IX86_BUILTIN_GATHERALTDIV8SI,
30730 IX86_BUILTIN_GATHER3ALTDIV16SF,
30731 IX86_BUILTIN_GATHER3ALTDIV16SI,
30732 IX86_BUILTIN_GATHER3ALTSIV4DF,
30733 IX86_BUILTIN_GATHER3ALTDIV8SF,
30734 IX86_BUILTIN_GATHER3ALTSIV4DI,
30735 IX86_BUILTIN_GATHER3ALTDIV8SI,
30736 IX86_BUILTIN_GATHER3ALTSIV8DF,
30737 IX86_BUILTIN_GATHER3ALTSIV8DI,
30738 IX86_BUILTIN_GATHER3DIV16SF,
30739 IX86_BUILTIN_GATHER3DIV16SI,
30740 IX86_BUILTIN_GATHER3DIV8DF,
30741 IX86_BUILTIN_GATHER3DIV8DI,
30742 IX86_BUILTIN_GATHER3SIV16SF,
30743 IX86_BUILTIN_GATHER3SIV16SI,
30744 IX86_BUILTIN_GATHER3SIV8DF,
30745 IX86_BUILTIN_GATHER3SIV8DI,
30746 IX86_BUILTIN_SCATTERALTSIV8DF,
30747 IX86_BUILTIN_SCATTERALTDIV16SF,
30748 IX86_BUILTIN_SCATTERALTSIV8DI,
30749 IX86_BUILTIN_SCATTERALTDIV16SI,
30750 IX86_BUILTIN_SCATTERDIV16SF,
30751 IX86_BUILTIN_SCATTERDIV16SI,
30752 IX86_BUILTIN_SCATTERDIV8DF,
30753 IX86_BUILTIN_SCATTERDIV8DI,
30754 IX86_BUILTIN_SCATTERSIV16SF,
30755 IX86_BUILTIN_SCATTERSIV16SI,
30756 IX86_BUILTIN_SCATTERSIV8DF,
30757 IX86_BUILTIN_SCATTERSIV8DI,
30758 IX86_BUILTIN_GATHERPFQPD,
30759 IX86_BUILTIN_GATHERPFDPS,
30760 IX86_BUILTIN_GATHERPFDPD,
30761 IX86_BUILTIN_GATHERPFQPS,
30762 IX86_BUILTIN_SCATTERPFDPD,
30763 IX86_BUILTIN_SCATTERPFDPS,
30764 IX86_BUILTIN_SCATTERPFQPD,
30765 IX86_BUILTIN_SCATTERPFQPS,
30766 IX86_BUILTIN_CLWB,
30767 IX86_BUILTIN_CLFLUSHOPT,
30768 IX86_BUILTIN_INFQ,
30769 IX86_BUILTIN_HUGE_VALQ,
30770 IX86_BUILTIN_NANQ,
30771 IX86_BUILTIN_NANSQ,
30772 IX86_BUILTIN_XABORT,
30773 IX86_BUILTIN_ADDCARRYX32,
30774 IX86_BUILTIN_ADDCARRYX64,
30775 IX86_BUILTIN_SBB32,
30776 IX86_BUILTIN_SBB64,
30777 IX86_BUILTIN_RDRAND16_STEP,
30778 IX86_BUILTIN_RDRAND32_STEP,
30779 IX86_BUILTIN_RDRAND64_STEP,
30780 IX86_BUILTIN_RDSEED16_STEP,
30781 IX86_BUILTIN_RDSEED32_STEP,
30782 IX86_BUILTIN_RDSEED64_STEP,
30783 IX86_BUILTIN_MONITORX,
30784 IX86_BUILTIN_MWAITX,
30785 IX86_BUILTIN_CFSTRING,
30786 IX86_BUILTIN_CPU_INIT,
30787 IX86_BUILTIN_CPU_IS,
30788 IX86_BUILTIN_CPU_SUPPORTS,
30789 IX86_BUILTIN_READ_FLAGS,
30790 IX86_BUILTIN_WRITE_FLAGS,
30792 /* All the remaining builtins are tracked in bdesc_* arrays in
30793 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30794 this point. */
30795 #define BDESC(mask, icode, name, code, comparison, flag) \
30796 code,
30797 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30798 code, \
30799 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30800 #define BDESC_END(kind, next_kind)
30802 #include "i386-builtin.def"
30804 #undef BDESC
30805 #undef BDESC_FIRST
30806 #undef BDESC_END
30808 IX86_BUILTIN_MAX,
30810 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30812 /* Now just the aliases for bdesc_* start/end. */
30813 #define BDESC(mask, icode, name, code, comparison, flag)
30814 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30815 #define BDESC_END(kind, next_kind) \
30816 IX86_BUILTIN__BDESC_##kind##_LAST \
30817 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30819 #include "i386-builtin.def"
30821 #undef BDESC
30822 #undef BDESC_FIRST
30823 #undef BDESC_END
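/* As a rough illustration, for the ARGS kind the first inclusion of
   i386-builtin.def above contributes the enumerators themselves plus
     IX86_BUILTIN__BDESC_ARGS_FIRST = <first ARGS enumerator>,
   while the second inclusion contributes only the end markers, e.g.
     IX86_BUILTIN__BDESC_ARGS_LAST = IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST - 1,
   so each bdesc_* array corresponds to a contiguous enumerator range.  */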
30825 /* Just to make sure there is no comma after the last enumerator. */
30826 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30829 /* Table for the ix86 builtin decls. */
30830 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30832 /* Table of all of the builtin functions that are possible with different ISA's
30833 but are waiting to be built until a function is declared to use that
30834 ISA. */
30835 struct builtin_isa {
30836 const char *name; /* function name */
30837 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30838 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30839 bool const_p; /* true if the declaration is constant */
30840 bool leaf_p; /* true if the declaration has leaf attribute */
30841 bool nothrow_p; /* true if the declaration has nothrow attribute */
30842   bool set_and_not_built_p;	    /* true if recorded but the decl is not yet built */
30845 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30847 /* Bits that can still enable any inclusion of a builtin. */
30848 static HOST_WIDE_INT deferred_isa_values = 0;
30850 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30851 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30852 function decl in the ix86_builtins array. Returns the function decl or
30853   NULL_TREE if the builtin was not added.
30855 If the front end has a special hook for builtin functions, delay adding
30856 builtin functions that aren't in the current ISA until the ISA is changed
30857   with function-specific optimization.  Doing so can save about 300K for the
30858 default compiler. When the builtin is expanded, check at that time whether
30859 it is valid.
30861 If the front end doesn't have a special hook, record all builtins, even if
30862   they aren't in the current ISA, in case the user uses function-specific
30863   options for a different ISA, so that we don't get scope
30864 errors if a builtin is added in the middle of a function scope. */
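/* A typical call looks like the many uses further below, for instance
   (the name and enumerator here are made up purely for illustration):

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
		  V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_EXAMPLE);

   If -msse2 is not enabled and the front end has a special
   builtin_function hook, the decl is only recorded in
   ix86_builtins_isa and is built later by ix86_add_new_builtins once
   the ISA becomes available.  */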
30866 static inline tree
30867 def_builtin (HOST_WIDE_INT mask, const char *name,
30868 enum ix86_builtin_func_type tcode,
30869 enum ix86_builtins code)
30871 tree decl = NULL_TREE;
30873 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30875 ix86_builtins_isa[(int) code].isa = mask;
30877 /* OPTION_MASK_ISA_AVX512VL has a special meaning.  Unlike the generic case,
30878    where any set bit means the built-in is enabled, this bit must be *and-ed*
30879 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
30880 means that *both* cpuid bits must be set for the built-in to be available.
30881 Handle this here. */
30882 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30883 mask &= ~OPTION_MASK_ISA_AVX512VL;
30885 mask &= ~OPTION_MASK_ISA_64BIT;
30886 if (mask == 0
30887 || (mask & ix86_isa_flags) != 0
30888 || (lang_hooks.builtin_function
30889 == lang_hooks.builtin_function_ext_scope))
30892 tree type = ix86_get_builtin_func_type (tcode);
30893 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30894 NULL, NULL_TREE);
30895 ix86_builtins[(int) code] = decl;
30896 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30898 else
30900   /* Only a MASK whose builtin has set_and_not_built_p == true can
30901      potentially enable that builtin later. */
30902 deferred_isa_values |= mask;
30903 ix86_builtins[(int) code] = NULL_TREE;
30904 ix86_builtins_isa[(int) code].tcode = tcode;
30905 ix86_builtins_isa[(int) code].name = name;
30906 ix86_builtins_isa[(int) code].leaf_p = false;
30907 ix86_builtins_isa[(int) code].nothrow_p = false;
30908 ix86_builtins_isa[(int) code].const_p = false;
30909 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30913 return decl;
30916 /* Like def_builtin, but also marks the function decl "const". */
30918 static inline tree
30919 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30920 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30922 tree decl = def_builtin (mask, name, tcode, code);
30923 if (decl)
30924 TREE_READONLY (decl) = 1;
30925 else
30926 ix86_builtins_isa[(int) code].const_p = true;
30928 return decl;
30931 /* Add any new builtin functions for a given ISA that may not have been
30932 declared. This saves a bit of space compared to adding all of the
30933 declarations to the tree, even if we didn't use them. */
30935 static void
30936 ix86_add_new_builtins (HOST_WIDE_INT isa)
30938 if ((isa & deferred_isa_values) == 0)
30939 return;
30941   /* The bits in ISA are handled now; remove them from the deferred isa values. */
30942 deferred_isa_values &= ~isa;
30944 int i;
30945 tree saved_current_target_pragma = current_target_pragma;
30946 current_target_pragma = NULL_TREE;
30948 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30950 if ((ix86_builtins_isa[i].isa & isa) != 0
30951 && ix86_builtins_isa[i].set_and_not_built_p)
30953 tree decl, type;
30955 /* Don't define the builtin again. */
30956 ix86_builtins_isa[i].set_and_not_built_p = false;
30958 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30959 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30960 type, i, BUILT_IN_MD, NULL,
30961 NULL_TREE);
30963 ix86_builtins[i] = decl;
30964 if (ix86_builtins_isa[i].const_p)
30965 TREE_READONLY (decl) = 1;
30966 if (ix86_builtins_isa[i].leaf_p)
30967 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30968 NULL_TREE);
30969 if (ix86_builtins_isa[i].nothrow_p)
30970 TREE_NOTHROW (decl) = 1;
30974 current_target_pragma = saved_current_target_pragma;
30977 /* Bits for builtin_description.flag. */
30979 /* Set when we don't support the comparison natively, and should
30980 swap_comparison in order to support it. */
30981 #define BUILTIN_DESC_SWAP_OPERANDS 1
30983 struct builtin_description
30985 const HOST_WIDE_INT mask;
30986 const enum insn_code icode;
30987 const char *const name;
30988 const enum ix86_builtins code;
30989 const enum rtx_code comparison;
30990 const int flag;
30993 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30994 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30995 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30996 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30997 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30998 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30999 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
31000 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
31001 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
31002 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
31003 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
31004 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
31005 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
31006 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
31007 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
31008 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
31009 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
31010 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
31011 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
31012 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
31013 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
31014 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
31015 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
31016 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
31017 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
31018 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
31019 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
31020 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
31021 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
31022 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31023 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31024 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31025 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31026 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31027 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31028 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31029 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31030 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31031 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31032 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31033 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31034 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31035 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31036 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31037 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31038 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31039 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31040 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31041 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31042 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31043 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31044 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31046 #define BDESC(mask, icode, name, code, comparison, flag) \
31047 { mask, icode, name, code, comparison, flag },
31048 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31049 static const struct builtin_description bdesc_##kind[] = \
31051 BDESC (mask, icode, name, code, comparison, flag)
31052 #define BDESC_END(kind, next_kind) \
31055 #include "i386-builtin.def"
31057 #undef BDESC
31058 #undef BDESC_FIRST
31059 #undef BDESC_END
31061 /* TM vector builtins. */
31063 /* Reuse the existing x86-specific `struct builtin_description' cause
31064 we're lazy. Add casts to make them fit. */
31065 static const struct builtin_description bdesc_tm[] =
31067 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31068 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31069 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31070 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31071 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31072 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31073 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31075 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31076 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31077 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31078 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31079 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31080 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31081 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31083 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31084 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31085 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31086 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31087 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31088 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31089 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31091 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31092 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31093 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31096 /* Initialize the transactional memory vector load/store builtins. */
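/* Note that each entry in bdesc_tm above is registered under its
   __builtin__ITM_* name and, via the library-name argument passed to
   add_builtin_function below (the name with the "__builtin_" prefix
   stripped), is also reachable as the corresponding _ITM_* libitm
   entry point.  */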
31098 static void
31099 ix86_init_tm_builtins (void)
31101 enum ix86_builtin_func_type ftype;
31102 const struct builtin_description *d;
31103 size_t i;
31104 tree decl;
31105 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31106 tree attrs_log, attrs_type_log;
31108 if (!flag_tm)
31109 return;
31111 /* If there are no builtins defined, we must be compiling in a
31112 language without trans-mem support. */
31113 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31114 return;
31116 /* Use whatever attributes a normal TM load has. */
31117 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31118 attrs_load = DECL_ATTRIBUTES (decl);
31119 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31120 /* Use whatever attributes a normal TM store has. */
31121 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31122 attrs_store = DECL_ATTRIBUTES (decl);
31123 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31124 /* Use whatever attributes a normal TM log has. */
31125 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31126 attrs_log = DECL_ATTRIBUTES (decl);
31127 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31129 for (i = 0, d = bdesc_tm;
31130 i < ARRAY_SIZE (bdesc_tm);
31131 i++, d++)
31133 if ((d->mask & ix86_isa_flags) != 0
31134 || (lang_hooks.builtin_function
31135 == lang_hooks.builtin_function_ext_scope))
31137 tree type, attrs, attrs_type;
31138 enum built_in_function code = (enum built_in_function) d->code;
31140 ftype = (enum ix86_builtin_func_type) d->flag;
31141 type = ix86_get_builtin_func_type (ftype);
31143 if (BUILTIN_TM_LOAD_P (code))
31145 attrs = attrs_load;
31146 attrs_type = attrs_type_load;
31148 else if (BUILTIN_TM_STORE_P (code))
31150 attrs = attrs_store;
31151 attrs_type = attrs_type_store;
31153 else
31155 attrs = attrs_log;
31156 attrs_type = attrs_type_log;
31158 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31159 /* The builtin without the prefix for
31160 calling it directly. */
31161 d->name + strlen ("__builtin_"),
31162 attrs);
31163 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31164 set the TYPE_ATTRIBUTES. */
31165 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31167 set_builtin_decl (code, decl, false);
31172 /* Macros for verification of enum ix86_builtins order. */
31173 #define BDESC_VERIFY(x, y, z) \
31174 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31175 #define BDESC_VERIFYS(x, y, z) \
31176 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31178 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31179 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31180 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31181 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31182 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31183 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31184 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31185 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31186 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31187 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31188 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31189 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31190 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31191 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31192 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31193 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31194 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31195 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31197 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31198 in the current target ISA to allow the user to compile particular modules
31199   with target-specific options that differ from the command-line
31200   options. */
31201 static void
31202 ix86_init_mmx_sse_builtins (void)
31204 const struct builtin_description * d;
31205 enum ix86_builtin_func_type ftype;
31206 size_t i;
31208 /* Add all special builtins with variable number of operands. */
31209 for (i = 0, d = bdesc_special_args;
31210 i < ARRAY_SIZE (bdesc_special_args);
31211 i++, d++)
31213 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31214 if (d->name == 0)
31215 continue;
31217 ftype = (enum ix86_builtin_func_type) d->flag;
31218 def_builtin (d->mask, d->name, ftype, d->code);
31220 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31221 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31222 ARRAY_SIZE (bdesc_special_args) - 1);
31224 /* Add all builtins with variable number of operands. */
31225 for (i = 0, d = bdesc_args;
31226 i < ARRAY_SIZE (bdesc_args);
31227 i++, d++)
31229 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31230 if (d->name == 0)
31231 continue;
31233 ftype = (enum ix86_builtin_func_type) d->flag;
31234 def_builtin_const (d->mask, d->name, ftype, d->code);
31236 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31237 IX86_BUILTIN__BDESC_ARGS_FIRST,
31238 ARRAY_SIZE (bdesc_args) - 1);
31240 /* Add all builtins with rounding. */
31241 for (i = 0, d = bdesc_round_args;
31242 i < ARRAY_SIZE (bdesc_round_args);
31243 i++, d++)
31245 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31246 if (d->name == 0)
31247 continue;
31249 ftype = (enum ix86_builtin_func_type) d->flag;
31250 def_builtin_const (d->mask, d->name, ftype, d->code);
31252 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31253 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31254 ARRAY_SIZE (bdesc_round_args) - 1);
31256 /* pcmpestr[im] insns. */
31257 for (i = 0, d = bdesc_pcmpestr;
31258 i < ARRAY_SIZE (bdesc_pcmpestr);
31259 i++, d++)
31261 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31262 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31263 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31264 else
31265 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31266 def_builtin_const (d->mask, d->name, ftype, d->code);
31268 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31269 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31270 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31272 /* pcmpistr[im] insns. */
31273 for (i = 0, d = bdesc_pcmpistr;
31274 i < ARRAY_SIZE (bdesc_pcmpistr);
31275 i++, d++)
31277 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31278 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31279 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31280 else
31281 ftype = INT_FTYPE_V16QI_V16QI_INT;
31282 def_builtin_const (d->mask, d->name, ftype, d->code);
31284 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31285 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31286 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31288 /* comi/ucomi insns. */
31289 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31291 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31292 if (d->mask == OPTION_MASK_ISA_SSE2)
31293 ftype = INT_FTYPE_V2DF_V2DF;
31294 else
31295 ftype = INT_FTYPE_V4SF_V4SF;
31296 def_builtin_const (d->mask, d->name, ftype, d->code);
31298 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31299 IX86_BUILTIN__BDESC_COMI_FIRST,
31300 ARRAY_SIZE (bdesc_comi) - 1);
31302 /* SSE */
31303 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31304 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31305 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31306 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31308 /* SSE or 3DNow!A */
31309 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31310 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31311 IX86_BUILTIN_MASKMOVQ);
31313 /* SSE2 */
31314 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31315 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31317 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31318 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31319 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31320 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31322 /* SSE3. */
31323 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31324 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31325 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31326 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31328 /* AES */
31329 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31330 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31331 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31332 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31333 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31334 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31335 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31336 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31337 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31338 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31339 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31340 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31342 /* PCLMUL */
31343 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31344 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31346 /* RDRND */
31347 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31348 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31349 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31350 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31351 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31352 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31353 IX86_BUILTIN_RDRAND64_STEP);
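/* Given the INT_FTYPE_PUNSIGNED signature above, the 32-bit variant is
   used roughly as follows (the consuming code is hypothetical):

     unsigned int r;
     if (__builtin_ia32_rdrand32_step (&r))
       consume (r);   // nonzero return: the CPU produced a value in r

   A zero return means the hardware random number generator was not
   ready and the call should be retried.  */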
31355 /* AVX2 */
31356 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31357 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31358 IX86_BUILTIN_GATHERSIV2DF);
31360 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31361 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31362 IX86_BUILTIN_GATHERSIV4DF);
31364 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31365 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31366 IX86_BUILTIN_GATHERDIV2DF);
31368 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31369 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31370 IX86_BUILTIN_GATHERDIV4DF);
31372 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31373 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31374 IX86_BUILTIN_GATHERSIV4SF);
31376 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31377 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31378 IX86_BUILTIN_GATHERSIV8SF);
31380 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31381 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31382 IX86_BUILTIN_GATHERDIV4SF);
31384 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31385 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31386 IX86_BUILTIN_GATHERDIV8SF);
31388 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31389 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31390 IX86_BUILTIN_GATHERSIV2DI);
31392 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31393 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31394 IX86_BUILTIN_GATHERSIV4DI);
31396 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31397 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31398 IX86_BUILTIN_GATHERDIV2DI);
31400 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31401 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31402 IX86_BUILTIN_GATHERDIV4DI);
31404 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31405 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31406 IX86_BUILTIN_GATHERSIV4SI);
31408 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31409 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31410 IX86_BUILTIN_GATHERSIV8SI);
31412 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31413 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31414 IX86_BUILTIN_GATHERDIV4SI);
31416 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31417 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31418 IX86_BUILTIN_GATHERDIV8SI);
31420 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31421 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31422 IX86_BUILTIN_GATHERALTSIV4DF);
31424 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31425 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31426 IX86_BUILTIN_GATHERALTDIV8SF);
31428 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31429 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31430 IX86_BUILTIN_GATHERALTSIV4DI);
31432 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31433 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31434 IX86_BUILTIN_GATHERALTDIV8SI);
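/* Reading the FTYPE names used for the gathers above:
   V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, for example, describes a
   builtin returning V2DF whose operands are (src, const double *base,
   V4SI index, V2DF mask, int scale), i.e. the same operand order the
   masked-gather intrinsics expose.  */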
31436 /* AVX512F */
31437 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31438 V16SF_FTYPE_V16SF_PCFLOAT_V16SI_HI_INT,
31439 IX86_BUILTIN_GATHER3SIV16SF);
31441 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31442 V8DF_FTYPE_V8DF_PCDOUBLE_V8SI_QI_INT,
31443 IX86_BUILTIN_GATHER3SIV8DF);
31445 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31446 V8SF_FTYPE_V8SF_PCFLOAT_V8DI_QI_INT,
31447 IX86_BUILTIN_GATHER3DIV16SF);
31449 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31450 V8DF_FTYPE_V8DF_PCDOUBLE_V8DI_QI_INT,
31451 IX86_BUILTIN_GATHER3DIV8DF);
31453 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31454 V16SI_FTYPE_V16SI_PCINT_V16SI_HI_INT,
31455 IX86_BUILTIN_GATHER3SIV16SI);
31457 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31458 V8DI_FTYPE_V8DI_PCINT64_V8SI_QI_INT,
31459 IX86_BUILTIN_GATHER3SIV8DI);
31461 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31462 V8SI_FTYPE_V8SI_PCINT_V8DI_QI_INT,
31463 IX86_BUILTIN_GATHER3DIV16SI);
31465 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31466 V8DI_FTYPE_V8DI_PCINT64_V8DI_QI_INT,
31467 IX86_BUILTIN_GATHER3DIV8DI);
31469 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31470 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31471 IX86_BUILTIN_GATHER3ALTSIV8DF);
31473 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31474 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31475 IX86_BUILTIN_GATHER3ALTDIV16SF);
31477 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31478 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31479 IX86_BUILTIN_GATHER3ALTSIV8DI);
31481 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31482 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31483 IX86_BUILTIN_GATHER3ALTDIV16SI);
31485 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31486 VOID_FTYPE_PFLOAT_HI_V16SI_V16SF_INT,
31487 IX86_BUILTIN_SCATTERSIV16SF);
31489 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31490 VOID_FTYPE_PDOUBLE_QI_V8SI_V8DF_INT,
31491 IX86_BUILTIN_SCATTERSIV8DF);
31493 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31494 VOID_FTYPE_PFLOAT_QI_V8DI_V8SF_INT,
31495 IX86_BUILTIN_SCATTERDIV16SF);
31497 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31498 VOID_FTYPE_PDOUBLE_QI_V8DI_V8DF_INT,
31499 IX86_BUILTIN_SCATTERDIV8DF);
31501 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31502 VOID_FTYPE_PINT_HI_V16SI_V16SI_INT,
31503 IX86_BUILTIN_SCATTERSIV16SI);
31505 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31506 VOID_FTYPE_PLONGLONG_QI_V8SI_V8DI_INT,
31507 IX86_BUILTIN_SCATTERSIV8DI);
31509 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31510 VOID_FTYPE_PINT_QI_V8DI_V8SI_INT,
31511 IX86_BUILTIN_SCATTERDIV16SI);
31513 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31514 VOID_FTYPE_PLONGLONG_QI_V8DI_V8DI_INT,
31515 IX86_BUILTIN_SCATTERDIV8DI);
31517 /* AVX512VL */
31518 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31519 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_QI_INT,
31520 IX86_BUILTIN_GATHER3SIV2DF);
31522 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31523 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_QI_INT,
31524 IX86_BUILTIN_GATHER3SIV4DF);
31526 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31527 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_QI_INT,
31528 IX86_BUILTIN_GATHER3DIV2DF);
31530 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31531 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_QI_INT,
31532 IX86_BUILTIN_GATHER3DIV4DF);
31534 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31535 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_QI_INT,
31536 IX86_BUILTIN_GATHER3SIV4SF);
31538 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31539 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_QI_INT,
31540 IX86_BUILTIN_GATHER3SIV8SF);
31542 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31543 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_QI_INT,
31544 IX86_BUILTIN_GATHER3DIV4SF);
31546 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31547 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_QI_INT,
31548 IX86_BUILTIN_GATHER3DIV8SF);
31550 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31551 V2DI_FTYPE_V2DI_PCINT64_V4SI_QI_INT,
31552 IX86_BUILTIN_GATHER3SIV2DI);
31554 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31555 V4DI_FTYPE_V4DI_PCINT64_V4SI_QI_INT,
31556 IX86_BUILTIN_GATHER3SIV4DI);
31558 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31559 V2DI_FTYPE_V2DI_PCINT64_V2DI_QI_INT,
31560 IX86_BUILTIN_GATHER3DIV2DI);
31562 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31563 V4DI_FTYPE_V4DI_PCINT64_V4DI_QI_INT,
31564 IX86_BUILTIN_GATHER3DIV4DI);
31566 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31567 V4SI_FTYPE_V4SI_PCINT_V4SI_QI_INT,
31568 IX86_BUILTIN_GATHER3SIV4SI);
31570 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31571 V8SI_FTYPE_V8SI_PCINT_V8SI_QI_INT,
31572 IX86_BUILTIN_GATHER3SIV8SI);
31574 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31575 V4SI_FTYPE_V4SI_PCINT_V2DI_QI_INT,
31576 IX86_BUILTIN_GATHER3DIV4SI);
31578 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31579 V4SI_FTYPE_V4SI_PCINT_V4DI_QI_INT,
31580 IX86_BUILTIN_GATHER3DIV8SI);
31582 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31583 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31584 IX86_BUILTIN_GATHER3ALTSIV4DF);
31586 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31587 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31588 IX86_BUILTIN_GATHER3ALTDIV8SF);
31590 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31591 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31592 IX86_BUILTIN_GATHER3ALTSIV4DI);
31594 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31595 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31596 IX86_BUILTIN_GATHER3ALTDIV8SI);
31598 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31599 VOID_FTYPE_PFLOAT_QI_V8SI_V8SF_INT,
31600 IX86_BUILTIN_SCATTERSIV8SF);
31602 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31603 VOID_FTYPE_PFLOAT_QI_V4SI_V4SF_INT,
31604 IX86_BUILTIN_SCATTERSIV4SF);
31606 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31607 VOID_FTYPE_PDOUBLE_QI_V4SI_V4DF_INT,
31608 IX86_BUILTIN_SCATTERSIV4DF);
31610 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31611 VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT,
31612 IX86_BUILTIN_SCATTERSIV2DF);
31614 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31615 VOID_FTYPE_PFLOAT_QI_V4DI_V4SF_INT,
31616 IX86_BUILTIN_SCATTERDIV8SF);
31618 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31619 VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT,
31620 IX86_BUILTIN_SCATTERDIV4SF);
31622 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31623 VOID_FTYPE_PDOUBLE_QI_V4DI_V4DF_INT,
31624 IX86_BUILTIN_SCATTERDIV4DF);
31626 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31627 VOID_FTYPE_PDOUBLE_QI_V2DI_V2DF_INT,
31628 IX86_BUILTIN_SCATTERDIV2DF);
31630 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31631 VOID_FTYPE_PINT_QI_V8SI_V8SI_INT,
31632 IX86_BUILTIN_SCATTERSIV8SI);
31634 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31635 VOID_FTYPE_PINT_QI_V4SI_V4SI_INT,
31636 IX86_BUILTIN_SCATTERSIV4SI);
31638 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31639 VOID_FTYPE_PLONGLONG_QI_V4SI_V4DI_INT,
31640 IX86_BUILTIN_SCATTERSIV4DI);
31642 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31643 VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT,
31644 IX86_BUILTIN_SCATTERSIV2DI);
31646 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31647 VOID_FTYPE_PINT_QI_V4DI_V4SI_INT,
31648 IX86_BUILTIN_SCATTERDIV8SI);
31650 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31651 VOID_FTYPE_PINT_QI_V2DI_V4SI_INT,
31652 IX86_BUILTIN_SCATTERDIV4SI);
31654 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31655 VOID_FTYPE_PLONGLONG_QI_V4DI_V4DI_INT,
31656 IX86_BUILTIN_SCATTERDIV4DI);
31658 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31659 VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
31660 IX86_BUILTIN_SCATTERDIV2DI);
31661 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31662 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31663 IX86_BUILTIN_SCATTERALTSIV8DF);
31665 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31666 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31667 IX86_BUILTIN_SCATTERALTDIV16SF);
31669 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31670 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31671 IX86_BUILTIN_SCATTERALTSIV8DI);
31673 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31674 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31675 IX86_BUILTIN_SCATTERALTDIV16SI);
31677 /* AVX512PF */
31678 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31679 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31680 IX86_BUILTIN_GATHERPFDPD);
31681 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31682 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31683 IX86_BUILTIN_GATHERPFDPS);
31684 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31685 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31686 IX86_BUILTIN_GATHERPFQPD);
31687 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31688 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31689 IX86_BUILTIN_GATHERPFQPS);
31690 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31691 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31692 IX86_BUILTIN_SCATTERPFDPD);
31693 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31694 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31695 IX86_BUILTIN_SCATTERPFDPS);
31696 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31697 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31698 IX86_BUILTIN_SCATTERPFQPD);
31699 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31700 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31701 IX86_BUILTIN_SCATTERPFQPS);
31703 /* SHA */
31704 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31705 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31706 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31707 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31708 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31709 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31710 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31711 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31712 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31713 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31714 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31715 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31716 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31717 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31719 /* RTM. */
31720 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31721 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31723 /* MMX access to the vec_init patterns. */
31724 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31725 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31727 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31728 V4HI_FTYPE_HI_HI_HI_HI,
31729 IX86_BUILTIN_VEC_INIT_V4HI);
31731 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31732 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31733 IX86_BUILTIN_VEC_INIT_V8QI);
31735 /* Access to the vec_extract patterns. */
31736 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31737 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31738 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31739 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31740 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31741 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31742 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31743 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31744 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31745 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31747 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31748 "__builtin_ia32_vec_ext_v4hi",
31749 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31751 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31752 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31754 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31755 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31757 /* Access to the vec_set patterns. */
31758 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31759 "__builtin_ia32_vec_set_v2di",
31760 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31762 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31763 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31765 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31766 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31768 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31769 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31771 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31772 "__builtin_ia32_vec_set_v4hi",
31773 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31775 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31776 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31778 /* RDSEED */
31779 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31780 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31781 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31782 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31783 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31784 "__builtin_ia32_rdseed_di_step",
31785 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31787 /* ADCX */
31788 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31789 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31790 def_builtin (OPTION_MASK_ISA_64BIT,
31791 "__builtin_ia32_addcarryx_u64",
31792 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31793 IX86_BUILTIN_ADDCARRYX64);
31795 /* SBB */
31796 def_builtin (0, "__builtin_ia32_sbb_u32",
31797 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31798 def_builtin (OPTION_MASK_ISA_64BIT,
31799 "__builtin_ia32_sbb_u64",
31800 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31801 IX86_BUILTIN_SBB64);
31803 /* Read/write FLAGS. */
31804 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u32",
31805 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31806 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31807 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31808 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u32",
31809 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31810 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31811 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31813 /* CLFLUSHOPT. */
31814 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31815 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31817 /* CLWB. */
31818 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31819 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31821 /* MONITORX and MWAITX. */
31822 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31823 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31824 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31825 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31827 /* CLZERO. */
31828 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31829 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31831   /* Add FMA4 multi-arg instructions.  */
31832 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31834 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31835 if (d->name == 0)
31836 continue;
31838 ftype = (enum ix86_builtin_func_type) d->flag;
31839 def_builtin_const (d->mask, d->name, ftype, d->code);
31841 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31842 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31843 ARRAY_SIZE (bdesc_multi_arg) - 1);
31846 static void
31847 ix86_init_mpx_builtins ()
31849 const struct builtin_description * d;
31850 enum ix86_builtin_func_type ftype;
31851 tree decl;
31852 size_t i;
31854 for (i = 0, d = bdesc_mpx;
31855 i < ARRAY_SIZE (bdesc_mpx);
31856 i++, d++)
31858 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
31859 if (d->name == 0)
31860 continue;
31862 ftype = (enum ix86_builtin_func_type) d->flag;
31863 decl = def_builtin (d->mask, d->name, ftype, d->code);
31865	  /* Without the leaf and nothrow flags, abnormal edges
31866	     may follow calls to MPX builtins when setjmp is
31867	     present in the function.  Since we may have many MPX
31868	     builtin calls, this causes lots of useless edges and
31869	     enormous PHI nodes.  To avoid this we mark MPX
31870	     builtins as leaf and nothrow. */
31871 if (decl)
31873 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31874 NULL_TREE);
31875 TREE_NOTHROW (decl) = 1;
31877 else
31879 ix86_builtins_isa[(int)d->code].leaf_p = true;
31880 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31883 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
31884 IX86_BUILTIN__BDESC_MPX_FIRST,
31885 ARRAY_SIZE (bdesc_mpx) - 1);
31887 for (i = 0, d = bdesc_mpx_const;
31888 i < ARRAY_SIZE (bdesc_mpx_const);
31889 i++, d++)
31891 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
31892 if (d->name == 0)
31893 continue;
31895 ftype = (enum ix86_builtin_func_type) d->flag;
31896 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
31898 if (decl)
31900 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31901 NULL_TREE);
31902 TREE_NOTHROW (decl) = 1;
31904 else
31906 ix86_builtins_isa[(int)d->code].leaf_p = true;
31907 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31910 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
31911 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31912 ARRAY_SIZE (bdesc_mpx_const) - 1);
31914 #undef BDESC_VERIFY
31915 #undef BDESC_VERIFYS
31917 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31918 to return a pointer to VERSION_DECL if the outcome of the expression
31919 formed by PREDICATE_CHAIN is true. This function will be called during
31920 version dispatch to decide which function version to execute. It returns
31921 the basic block at the end, to which more conditions can be added. */
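/* Illustrative sketch, assuming two predicate entries P1 and P2 on
   PREDICATE_CHAIN: the block built below roughly corresponds to the GIMPLE

     c1 = P1 (arg1);
     c2 = P2 (arg2);
     c  = MIN_EXPR <c1, c2>;
     if (c > 0)
       return (void *) &VERSION_DECL;

   with control falling through to the next condition otherwise.  */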
31923 static basic_block
31924 add_condition_to_bb (tree function_decl, tree version_decl,
31925 tree predicate_chain, basic_block new_bb)
31927 gimple *return_stmt;
31928 tree convert_expr, result_var;
31929 gimple *convert_stmt;
31930 gimple *call_cond_stmt;
31931 gimple *if_else_stmt;
31933 basic_block bb1, bb2, bb3;
31934 edge e12, e23;
31936 tree cond_var, and_expr_var = NULL_TREE;
31937 gimple_seq gseq;
31939 tree predicate_decl, predicate_arg;
31941 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31943 gcc_assert (new_bb != NULL);
31944 gseq = bb_seq (new_bb);
31947 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31948 build_fold_addr_expr (version_decl));
31949 result_var = create_tmp_var (ptr_type_node);
31950 convert_stmt = gimple_build_assign (result_var, convert_expr);
31951 return_stmt = gimple_build_return (result_var);
31953 if (predicate_chain == NULL_TREE)
31955 gimple_seq_add_stmt (&gseq, convert_stmt);
31956 gimple_seq_add_stmt (&gseq, return_stmt);
31957 set_bb_seq (new_bb, gseq);
31958 gimple_set_bb (convert_stmt, new_bb);
31959 gimple_set_bb (return_stmt, new_bb);
31960 pop_cfun ();
31961 return new_bb;
31964 while (predicate_chain != NULL)
31966 cond_var = create_tmp_var (integer_type_node);
31967 predicate_decl = TREE_PURPOSE (predicate_chain);
31968 predicate_arg = TREE_VALUE (predicate_chain);
31969 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31970 gimple_call_set_lhs (call_cond_stmt, cond_var);
31972 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31973 gimple_set_bb (call_cond_stmt, new_bb);
31974 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31976 predicate_chain = TREE_CHAIN (predicate_chain);
31978 if (and_expr_var == NULL)
31979 and_expr_var = cond_var;
31980 else
31982 gimple *assign_stmt;
31983 /* Use MIN_EXPR to check whether any integer is zero:
31984 and_expr_var = min_expr <cond_var, and_expr_var>. */
31985 assign_stmt = gimple_build_assign (and_expr_var,
31986 build2 (MIN_EXPR, integer_type_node,
31987 cond_var, and_expr_var));
31989 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31990 gimple_set_bb (assign_stmt, new_bb);
31991 gimple_seq_add_stmt (&gseq, assign_stmt);
31995 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31996 integer_zero_node,
31997 NULL_TREE, NULL_TREE);
31998 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31999 gimple_set_bb (if_else_stmt, new_bb);
32000 gimple_seq_add_stmt (&gseq, if_else_stmt);
32002 gimple_seq_add_stmt (&gseq, convert_stmt);
32003 gimple_seq_add_stmt (&gseq, return_stmt);
32004 set_bb_seq (new_bb, gseq);
32006 bb1 = new_bb;
32007 e12 = split_block (bb1, if_else_stmt);
32008 bb2 = e12->dest;
32009 e12->flags &= ~EDGE_FALLTHRU;
32010 e12->flags |= EDGE_TRUE_VALUE;
32012 e23 = split_block (bb2, return_stmt);
32014 gimple_set_bb (convert_stmt, bb2);
32015 gimple_set_bb (return_stmt, bb2);
32017 bb3 = e23->dest;
32018 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
32020 remove_edge (e23);
32021 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32023 pop_cfun ();
32025 return bb3;
32028 /* This parses the attribute arguments to target in DECL and determines
32029 the right builtin to use to match the platform specification.
32030 It returns the priority value for this version decl. If PREDICATE_LIST
32031 is not NULL, it stores the list of cpu features that need to be checked
32032 before dispatching this function. */
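/* Illustrative example, assuming a user declaration such as
   __attribute__ ((target ("arch=haswell"))): the arch= handling below
   selects the dispatcher string "haswell" with priority P_PROC_AVX2, while
   a plain feature version such as target ("sse4.2") gets P_SSE4_2 from the
   feature_list table.  */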
32034 static unsigned int
32035 get_builtin_code_for_version (tree decl, tree *predicate_list)
32037 tree attrs;
32038 struct cl_target_option cur_target;
32039 tree target_node;
32040 struct cl_target_option *new_target;
32041 const char *arg_str = NULL;
32042 const char *attrs_str = NULL;
32043 char *tok_str = NULL;
32044 char *token;
32046 /* Priority of i386 features, greater value is higher priority. This is
32047 used to decide the order in which function dispatch must happen. For
32048 instance, a version specialized for SSE4.2 should be checked for dispatch
32049 before a version for SSE3, as SSE4.2 implies SSE3. */
32050 enum feature_priority
32052 P_ZERO = 0,
32053 P_MMX,
32054 P_SSE,
32055 P_SSE2,
32056 P_SSE3,
32057 P_SSSE3,
32058 P_PROC_SSSE3,
32059 P_SSE4_A,
32060 P_PROC_SSE4_A,
32061 P_SSE4_1,
32062 P_SSE4_2,
32063 P_PROC_SSE4_2,
32064 P_POPCNT,
32065 P_AES,
32066 P_PCLMUL,
32067 P_AVX,
32068 P_PROC_AVX,
32069 P_BMI,
32070 P_PROC_BMI,
32071 P_FMA4,
32072 P_XOP,
32073 P_PROC_XOP,
32074 P_FMA,
32075 P_PROC_FMA,
32076 P_BMI2,
32077 P_AVX2,
32078 P_PROC_AVX2,
32079 P_AVX512F,
32080 P_PROC_AVX512F
32083 enum feature_priority priority = P_ZERO;
32085 /* These are the target attribute strings for which a dispatcher is
32086 available, from fold_builtin_cpu. */
32088 static struct _feature_list
32090 const char *const name;
32091 const enum feature_priority priority;
32093 const feature_list[] =
32095 {"mmx", P_MMX},
32096 {"sse", P_SSE},
32097 {"sse2", P_SSE2},
32098 {"sse3", P_SSE3},
32099 {"sse4a", P_SSE4_A},
32100 {"ssse3", P_SSSE3},
32101 {"sse4.1", P_SSE4_1},
32102 {"sse4.2", P_SSE4_2},
32103 {"popcnt", P_POPCNT},
32104 {"aes", P_AES},
32105 {"pclmul", P_PCLMUL},
32106 {"avx", P_AVX},
32107 {"bmi", P_BMI},
32108 {"fma4", P_FMA4},
32109 {"xop", P_XOP},
32110 {"fma", P_FMA},
32111 {"bmi2", P_BMI2},
32112 {"avx2", P_AVX2},
32113 {"avx512f", P_AVX512F}
32117 static unsigned int NUM_FEATURES
32118 = sizeof (feature_list) / sizeof (struct _feature_list);
32120 unsigned int i;
32122 tree predicate_chain = NULL_TREE;
32123 tree predicate_decl, predicate_arg;
32125 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32126 gcc_assert (attrs != NULL);
32128 attrs = TREE_VALUE (TREE_VALUE (attrs));
32130 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32131 attrs_str = TREE_STRING_POINTER (attrs);
32133 /* Return priority zero for default function. */
32134 if (strcmp (attrs_str, "default") == 0)
32135 return 0;
32137 /* Handle arch= if specified. For priority, set it to be 1 more than
32138 the best instruction set the processor can handle. For instance, if
32139 there is a version for atom and a version for ssse3 (the highest ISA
32140 priority for atom), the atom version must be checked for dispatch
32141 before the ssse3 version. */
32142 if (strstr (attrs_str, "arch=") != NULL)
32144 cl_target_option_save (&cur_target, &global_options);
32145 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32146 &global_options_set);
32148 gcc_assert (target_node);
32149 new_target = TREE_TARGET_OPTION (target_node);
32150 gcc_assert (new_target);
32152 if (new_target->arch_specified && new_target->arch > 0)
32154 switch (new_target->arch)
32156 case PROCESSOR_CORE2:
32157 arg_str = "core2";
32158 priority = P_PROC_SSSE3;
32159 break;
32160 case PROCESSOR_NEHALEM:
32161 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32162 arg_str = "westmere";
32163 else
32164 /* We translate "arch=corei7" and "arch=nehalem" to
32165 "corei7" so that it will be mapped to M_INTEL_COREI7
32166 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32167 arg_str = "corei7";
32168 priority = P_PROC_SSE4_2;
32169 break;
32170 case PROCESSOR_SANDYBRIDGE:
32171 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32172 arg_str = "ivybridge";
32173 else
32174 arg_str = "sandybridge";
32175 priority = P_PROC_AVX;
32176 break;
32177 case PROCESSOR_HASWELL:
32178 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32179 arg_str = "skylake-avx512";
32180 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32181 arg_str = "skylake";
32182 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32183 arg_str = "broadwell";
32184 else
32185 arg_str = "haswell";
32186 priority = P_PROC_AVX2;
32187 break;
32188 case PROCESSOR_BONNELL:
32189 arg_str = "bonnell";
32190 priority = P_PROC_SSSE3;
32191 break;
32192 case PROCESSOR_KNL:
32193 arg_str = "knl";
32194 priority = P_PROC_AVX512F;
32195 break;
32196 case PROCESSOR_SILVERMONT:
32197 arg_str = "silvermont";
32198 priority = P_PROC_SSE4_2;
32199 break;
32200 case PROCESSOR_AMDFAM10:
32201 arg_str = "amdfam10h";
32202 priority = P_PROC_SSE4_A;
32203 break;
32204 case PROCESSOR_BTVER1:
32205 arg_str = "btver1";
32206 priority = P_PROC_SSE4_A;
32207 break;
32208 case PROCESSOR_BTVER2:
32209 arg_str = "btver2";
32210 priority = P_PROC_BMI;
32211 break;
32212 case PROCESSOR_BDVER1:
32213 arg_str = "bdver1";
32214 priority = P_PROC_XOP;
32215 break;
32216 case PROCESSOR_BDVER2:
32217 arg_str = "bdver2";
32218 priority = P_PROC_FMA;
32219 break;
32220 case PROCESSOR_BDVER3:
32221 arg_str = "bdver3";
32222 priority = P_PROC_FMA;
32223 break;
32224 case PROCESSOR_BDVER4:
32225 arg_str = "bdver4";
32226 priority = P_PROC_AVX2;
32227 break;
32228 case PROCESSOR_ZNVER1:
32229 arg_str = "znver1";
32230 priority = P_PROC_AVX2;
32231 break;
32235 cl_target_option_restore (&global_options, &cur_target);
32237 if (predicate_list && arg_str == NULL)
32239 error_at (DECL_SOURCE_LOCATION (decl),
32240 "No dispatcher found for the versioning attributes");
32241 return 0;
32244 if (predicate_list)
32246 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32247 /* For a C string literal the length includes the trailing NULL. */
32248 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32249 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32250 predicate_chain);
32254 /* Process feature name. */
32255 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32256 strcpy (tok_str, attrs_str);
32257 token = strtok (tok_str, ",");
32258 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32260 while (token != NULL)
32262 /* Do not process "arch=" */
32263 if (strncmp (token, "arch=", 5) == 0)
32265 token = strtok (NULL, ",");
32266 continue;
32268 for (i = 0; i < NUM_FEATURES; ++i)
32270 if (strcmp (token, feature_list[i].name) == 0)
32272 if (predicate_list)
32274 predicate_arg = build_string_literal (
32275 strlen (feature_list[i].name) + 1,
32276 feature_list[i].name);
32277 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32278 predicate_chain);
32280 /* Find the maximum priority feature. */
32281 if (feature_list[i].priority > priority)
32282 priority = feature_list[i].priority;
32284 break;
32287 if (predicate_list && i == NUM_FEATURES)
32289 error_at (DECL_SOURCE_LOCATION (decl),
32290 "No dispatcher found for %s", token);
32291 return 0;
32293 token = strtok (NULL, ",");
32295 free (tok_str);
32297 if (predicate_list && predicate_chain == NULL_TREE)
32299 error_at (DECL_SOURCE_LOCATION (decl),
32300 "No dispatcher found for the versioning attributes : %s",
32301 attrs_str);
32302 return 0;
32304 else if (predicate_list)
32306 predicate_chain = nreverse (predicate_chain);
32307 *predicate_list = predicate_chain;
32310 return priority;
32313 /* This compares the priority of target features in function DECL1
32314 and DECL2. It returns positive value if DECL1 is higher priority,
32315 negative value if DECL2 is higher priority and 0 if they are the
32316 same. */
32318 static int
32319 ix86_compare_version_priority (tree decl1, tree decl2)
32321 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32322 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32324 return (int)priority1 - (int)priority2;
32327 /* V1 and V2 point to function versions with different priorities
32328 based on the target ISA. This function compares their priorities. */
32330 static int
32331 feature_compare (const void *v1, const void *v2)
32333 typedef struct _function_version_info
32335 tree version_decl;
32336 tree predicate_chain;
32337 unsigned int dispatch_priority;
32338 } function_version_info;
32340 const function_version_info c1 = *(const function_version_info *)v1;
32341 const function_version_info c2 = *(const function_version_info *)v2;
32342 return (c2.dispatch_priority - c1.dispatch_priority);
32345 /* This function generates the dispatch function for
32346 multi-versioned functions. DISPATCH_DECL is the function which will
32347 contain the dispatch logic. FNDECLS are the function choices for
32348 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32349 in DISPATCH_DECL in which the dispatch code is generated. */
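/* Conceptual sketch, where foo is a hypothetical versioned function: after
   sorting by descending dispatch priority, the generated resolver reads
   roughly

     if (predicates of version 1 hold)  return &foo.suffix1;
     if (predicates of version 2 hold)  return &foo.suffix2;
     return &foo;    <- the default version, dispatched last

   The individual checks are emitted by add_condition_to_bb above.  */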
32351 static int
32352 dispatch_function_versions (tree dispatch_decl,
32353 void *fndecls_p,
32354 basic_block *empty_bb)
32356 tree default_decl;
32357 gimple *ifunc_cpu_init_stmt;
32358 gimple_seq gseq;
32359 int ix;
32360 tree ele;
32361 vec<tree> *fndecls;
32362 unsigned int num_versions = 0;
32363 unsigned int actual_versions = 0;
32364 unsigned int i;
32366 struct _function_version_info
32368 tree version_decl;
32369 tree predicate_chain;
32370 unsigned int dispatch_priority;
32371 }*function_version_info;
32373 gcc_assert (dispatch_decl != NULL
32374 && fndecls_p != NULL
32375 && empty_bb != NULL);
32377 /* fndecls_p is actually a vector. */
32378 fndecls = static_cast<vec<tree> *> (fndecls_p);
32380 /* At least one more version other than the default. */
32381 num_versions = fndecls->length ();
32382 gcc_assert (num_versions >= 2);
32384 function_version_info = (struct _function_version_info *)
32385 XNEWVEC (struct _function_version_info, (num_versions - 1));
32387 /* The first version in the vector is the default decl. */
32388 default_decl = (*fndecls)[0];
32390 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32392 gseq = bb_seq (*empty_bb);
32393 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32394 constructors, so explicitly call __builtin_cpu_init here. */
32395 ifunc_cpu_init_stmt = gimple_build_call_vec (
32396 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32397 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32398 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32399 set_bb_seq (*empty_bb, gseq);
32401 pop_cfun ();
32404 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32406 tree version_decl = ele;
32407 tree predicate_chain = NULL_TREE;
32408 unsigned int priority;
32409 /* Get attribute string, parse it and find the right predicate decl.
32410 The predicate function could be a lengthy combination of many
32411 features, like arch-type and various isa-variants. */
32412 priority = get_builtin_code_for_version (version_decl,
32413 &predicate_chain);
32415 if (predicate_chain == NULL_TREE)
32416 continue;
32418 function_version_info [actual_versions].version_decl = version_decl;
32419 function_version_info [actual_versions].predicate_chain
32420 = predicate_chain;
32421 function_version_info [actual_versions].dispatch_priority = priority;
32422 actual_versions++;
32425 /* Sort the versions according to descending order of dispatch priority. The
32426 priority is based on the ISA. This is not a perfect solution. There
32427 could still be ambiguity. If more than one function version is suitable
32428 to execute, which one should be dispatched? In the future, allow the user
32429 to specify a dispatch priority next to the version. */
32430 qsort (function_version_info, actual_versions,
32431 sizeof (struct _function_version_info), feature_compare);
32433 for (i = 0; i < actual_versions; ++i)
32434 *empty_bb = add_condition_to_bb (dispatch_decl,
32435 function_version_info[i].version_decl,
32436 function_version_info[i].predicate_chain,
32437 *empty_bb);
32439 /* Dispatch the default version at the end. */
32440 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32441 NULL, *empty_bb);
32443 free (function_version_info);
32444 return 0;
32447 /* Comparator function to be used in the qsort routine to sort the
32448 attribute specification strings of "target". */
32450 static int
32451 attr_strcmp (const void *v1, const void *v2)
32453 const char *c1 = *(char *const*)v1;
32454 const char *c2 = *(char *const*)v2;
32455 return strcmp (c1, c2);
32458 /* ARGLIST is the argument to target attribute. This function tokenizes
32459 the comma separated arguments, sorts them and returns a string which
32460 is a unique identifier for the comma separated arguments. It also
32461 replaces non-identifier characters "=,-" with "_". */
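/* Worked example: for target ("avx2,arch=haswell") the concatenated string
   first becomes "avx2,arch_haswell" after the character replacement, the
   tokens then sort to "arch_haswell", "avx2", and the function returns
   "arch_haswell_avx2".  */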
32463 static char *
32464 sorted_attr_string (tree arglist)
32466 tree arg;
32467 size_t str_len_sum = 0;
32468 char **args = NULL;
32469 char *attr_str, *ret_str;
32470 char *attr = NULL;
32471 unsigned int argnum = 1;
32472 unsigned int i;
32474 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32476 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32477 size_t len = strlen (str);
32478 str_len_sum += len + 1;
32479 if (arg != arglist)
32480 argnum++;
32481 for (i = 0; i < strlen (str); i++)
32482 if (str[i] == ',')
32483 argnum++;
32486 attr_str = XNEWVEC (char, str_len_sum);
32487 str_len_sum = 0;
32488 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32490 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32491 size_t len = strlen (str);
32492 memcpy (attr_str + str_len_sum, str, len);
32493 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
32494 str_len_sum += len + 1;
32497 /* Replace "=,-" with "_". */
32498 for (i = 0; i < strlen (attr_str); i++)
32499 if (attr_str[i] == '=' || attr_str[i]== '-')
32500 attr_str[i] = '_';
32502 if (argnum == 1)
32503 return attr_str;
32505 args = XNEWVEC (char *, argnum);
32507 i = 0;
32508 attr = strtok (attr_str, ",");
32509 while (attr != NULL)
32511 args[i] = attr;
32512 i++;
32513 attr = strtok (NULL, ",");
32516 qsort (args, argnum, sizeof (char *), attr_strcmp);
32518 ret_str = XNEWVEC (char, str_len_sum);
32519 str_len_sum = 0;
32520 for (i = 0; i < argnum; i++)
32522 size_t len = strlen (args[i]);
32523 memcpy (ret_str + str_len_sum, args[i], len);
32524 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
32525 str_len_sum += len + 1;
32528 XDELETEVEC (args);
32529 XDELETEVEC (attr_str);
32530 return ret_str;
32533 /* This function changes the assembler name for functions that are
32534 versions. If DECL is a function version and has a "target"
32535 attribute, it appends the attribute string to its assembler name. */
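/* Illustrative example: a version of foo declared with
   __attribute__ ((target ("avx2"))) gets the assembler name "foo.avx2";
   the version marked target ("default") keeps its original name.  */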
32537 static tree
32538 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32540 tree version_attr;
32541 const char *orig_name, *version_string;
32542 char *attr_str, *assembler_name;
32544 if (DECL_DECLARED_INLINE_P (decl)
32545 && lookup_attribute ("gnu_inline",
32546 DECL_ATTRIBUTES (decl)))
32547 error_at (DECL_SOURCE_LOCATION (decl),
32548 "Function versions cannot be marked as gnu_inline,"
32549 " bodies have to be generated");
32551 if (DECL_VIRTUAL_P (decl)
32552 || DECL_VINDEX (decl))
32553 sorry ("Virtual function multiversioning not supported");
32555 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32557 /* target attribute string cannot be NULL. */
32558 gcc_assert (version_attr != NULL_TREE);
32560 orig_name = IDENTIFIER_POINTER (id);
32561 version_string
32562 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32564 if (strcmp (version_string, "default") == 0)
32565 return id;
32567 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32568 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32570 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32572 /* Allow assembler name to be modified if already set. */
32573 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32574 SET_DECL_RTL (decl, NULL);
32576 tree ret = get_identifier (assembler_name);
32577 XDELETEVEC (attr_str);
32578 XDELETEVEC (assembler_name);
32579 return ret;
32582 /* This function returns true if FN1 and FN2 are versions of the same function,
32583 that is, the target strings of the function decls are different. This assumes
32584 that FN1 and FN2 have the same signature. */
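/* Illustrative example: int foo (void) __attribute__ ((target ("avx2")))
   and int foo (void) __attribute__ ((target ("sse4.2"))) are versions of
   the same function; two declarations whose sorted target strings compare
   equal are not.  */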
32586 static bool
32587 ix86_function_versions (tree fn1, tree fn2)
32589 tree attr1, attr2;
32590 char *target1, *target2;
32591 bool result;
32593 if (TREE_CODE (fn1) != FUNCTION_DECL
32594 || TREE_CODE (fn2) != FUNCTION_DECL)
32595 return false;
32597 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
32598 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
32600 /* At least one function decl should have the target attribute specified. */
32601 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
32602 return false;
32604 /* Diagnose missing target attribute if one of the decls is already
32605 multi-versioned. */
32606 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
32608 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
32610 if (attr2 != NULL_TREE)
32612 std::swap (fn1, fn2);
32613 attr1 = attr2;
32615 error_at (DECL_SOURCE_LOCATION (fn2),
32616 "missing %<target%> attribute for multi-versioned %D",
32617 fn2);
32618 inform (DECL_SOURCE_LOCATION (fn1),
32619 "previous declaration of %D", fn1);
32620 /* Prevent diagnosing of the same error multiple times. */
32621 DECL_ATTRIBUTES (fn2)
32622 = tree_cons (get_identifier ("target"),
32623 copy_node (TREE_VALUE (attr1)),
32624 DECL_ATTRIBUTES (fn2));
32626 return false;
32629 target1 = sorted_attr_string (TREE_VALUE (attr1));
32630 target2 = sorted_attr_string (TREE_VALUE (attr2));
32632 /* The sorted target strings must be different for fn1 and fn2
32633 to be versions. */
32634 if (strcmp (target1, target2) == 0)
32635 result = false;
32636 else
32637 result = true;
32639 XDELETEVEC (target1);
32640 XDELETEVEC (target2);
32642 return result;
32645 static tree
32646 ix86_mangle_decl_assembler_name (tree decl, tree id)
32648 /* For function version, add the target suffix to the assembler name. */
32649 if (TREE_CODE (decl) == FUNCTION_DECL
32650 && DECL_FUNCTION_VERSIONED (decl))
32651 id = ix86_mangle_function_version_assembler_name (decl, id);
32652 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32653 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32654 #endif
32656 return id;
32659 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
32660 is true, append the full path name of the source file. */
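/* Illustrative example: for a decl whose assembler name is "foo" and
   SUFFIX "resolver", this yields "foo.resolver", or "foo.<unique>.resolver"
   when MAKE_UNIQUE is set, where <unique> stands for the identifier
   returned by get_file_function_name.  */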
32662 static char *
32663 make_name (tree decl, const char *suffix, bool make_unique)
32665 char *global_var_name;
32666 int name_len;
32667 const char *name;
32668 const char *unique_name = NULL;
32670 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
32672 /* Get a unique name that can be used globally without any chances
32673 of collision at link time. */
32674 if (make_unique)
32675 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
32677 name_len = strlen (name) + strlen (suffix) + 2;
32679 if (make_unique)
32680 name_len += strlen (unique_name) + 1;
32681 global_var_name = XNEWVEC (char, name_len);
32683 /* Use '.' to concatenate names as it is demangler friendly. */
32684 if (make_unique)
32685 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
32686 suffix);
32687 else
32688 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
32690 return global_var_name;
32693 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32695 /* Make a dispatcher declaration for the multi-versioned function DECL.
32696 Calls to the DECL function will be replaced with calls to the dispatcher
32697 by the front-end. Return the decl created. */
32699 static tree
32700 make_dispatcher_decl (const tree decl)
32702 tree func_decl;
32703 char *func_name;
32704 tree fn_type, func_type;
32705 bool is_uniq = false;
32707 if (TREE_PUBLIC (decl) == 0)
32708 is_uniq = true;
32710 func_name = make_name (decl, "ifunc", is_uniq);
32712 fn_type = TREE_TYPE (decl);
32713 func_type = build_function_type (TREE_TYPE (fn_type),
32714 TYPE_ARG_TYPES (fn_type));
32716 func_decl = build_fn_decl (func_name, func_type);
32717 XDELETEVEC (func_name);
32718 TREE_USED (func_decl) = 1;
32719 DECL_CONTEXT (func_decl) = NULL_TREE;
32720 DECL_INITIAL (func_decl) = error_mark_node;
32721 DECL_ARTIFICIAL (func_decl) = 1;
32722 /* Mark this func as external; the resolver will flip it again if
32723 it gets generated. */
32724 DECL_EXTERNAL (func_decl) = 1;
32725 /* This will be of type IFUNC; IFUNCs have to be externally visible. */
32726 TREE_PUBLIC (func_decl) = 1;
32728 return func_decl;
32731 #endif
32733 /* Returns true if DECL is multi-versioned and is the default version,
32734 that is, the version tagged with target ("default"). */
32736 static bool
32737 is_function_default_version (const tree decl)
32739 if (TREE_CODE (decl) != FUNCTION_DECL
32740 || !DECL_FUNCTION_VERSIONED (decl))
32741 return false;
32742 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32743 gcc_assert (attr);
32744 attr = TREE_VALUE (TREE_VALUE (attr));
32745 return (TREE_CODE (attr) == STRING_CST
32746 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
32749 /* Make a dispatcher declaration for the multi-versioned function DECL.
32750 Calls to the DECL function will be replaced with calls to the dispatcher
32751 by the front-end. Returns the decl of the dispatcher function. */
32753 static tree
32754 ix86_get_function_versions_dispatcher (void *decl)
32756 tree fn = (tree) decl;
32757 struct cgraph_node *node = NULL;
32758 struct cgraph_node *default_node = NULL;
32759 struct cgraph_function_version_info *node_v = NULL;
32760 struct cgraph_function_version_info *first_v = NULL;
32762 tree dispatch_decl = NULL;
32764 struct cgraph_function_version_info *default_version_info = NULL;
32766 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32768 node = cgraph_node::get (fn);
32769 gcc_assert (node != NULL);
32771 node_v = node->function_version ();
32772 gcc_assert (node_v != NULL);
32774 if (node_v->dispatcher_resolver != NULL)
32775 return node_v->dispatcher_resolver;
32777 /* Find the default version and make it the first node. */
32778 first_v = node_v;
32779 /* Go to the beginning of the chain. */
32780 while (first_v->prev != NULL)
32781 first_v = first_v->prev;
32782 default_version_info = first_v;
32783 while (default_version_info != NULL)
32785 if (is_function_default_version
32786 (default_version_info->this_node->decl))
32787 break;
32788 default_version_info = default_version_info->next;
32791 /* If there is no default node, just return NULL. */
32792 if (default_version_info == NULL)
32793 return NULL;
32795 /* Make default info the first node. */
32796 if (first_v != default_version_info)
32798 default_version_info->prev->next = default_version_info->next;
32799 if (default_version_info->next)
32800 default_version_info->next->prev = default_version_info->prev;
32801 first_v->prev = default_version_info;
32802 default_version_info->next = first_v;
32803 default_version_info->prev = NULL;
32806 default_node = default_version_info->this_node;
32808 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32809 if (targetm.has_ifunc_p ())
32811 struct cgraph_function_version_info *it_v = NULL;
32812 struct cgraph_node *dispatcher_node = NULL;
32813 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32815 /* Right now, the dispatching is done via ifunc. */
32816 dispatch_decl = make_dispatcher_decl (default_node->decl);
32818 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32819 gcc_assert (dispatcher_node != NULL);
32820 dispatcher_node->dispatcher_function = 1;
32821 dispatcher_version_info
32822 = dispatcher_node->insert_new_function_version ();
32823 dispatcher_version_info->next = default_version_info;
32824 dispatcher_node->definition = 1;
32826 /* Set the dispatcher for all the versions. */
32827 it_v = default_version_info;
32828 while (it_v != NULL)
32830 it_v->dispatcher_resolver = dispatch_decl;
32831 it_v = it_v->next;
32834 else
32835 #endif
32837 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32838 "multiversioning needs ifunc which is not supported "
32839 "on this target");
32842 return dispatch_decl;
32845 /* Make the resolver function decl to dispatch the versions of
32846 a multi-versioned function, DEFAULT_DECL. Create an
32847 empty basic block in the resolver and store the pointer in
32848 EMPTY_BB. Return the decl of the resolver function. */
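/* Note, as a rough sketch: the resolver created here becomes the target of
   the ifunc attribute attached to DISPATCH_DECL further down, i.e. roughly
   __attribute__ ((ifunc ("<default asm name>.resolver"))), so the dynamic
   loader runs it once to pick the version.  */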
32850 static tree
32851 make_resolver_func (const tree default_decl,
32852 const tree dispatch_decl,
32853 basic_block *empty_bb)
32855 char *resolver_name;
32856 tree decl, type, decl_name, t;
32857 bool is_uniq = false;
32859 /* IFUNCs have to be globally visible. So, if the default_decl is
32860 not, then the name of the IFUNC should be made unique. */
32861 if (TREE_PUBLIC (default_decl) == 0)
32862 is_uniq = true;
32864 /* Append the filename to the resolver function if the versions are
32865 not externally visible. This is because the resolver function has
32866 to be externally visible for the loader to find it. So, appending
32867 the filename will prevent conflicts with a resolver function from
32868 another module which is based on the same version name. */
32869 resolver_name = make_name (default_decl, "resolver", is_uniq);
32871 /* The resolver function should return a (void *). */
32872 type = build_function_type_list (ptr_type_node, NULL_TREE);
32874 decl = build_fn_decl (resolver_name, type);
32875 decl_name = get_identifier (resolver_name);
32876 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32878 DECL_NAME (decl) = decl_name;
32879 TREE_USED (decl) = 1;
32880 DECL_ARTIFICIAL (decl) = 1;
32881 DECL_IGNORED_P (decl) = 0;
32882 /* IFUNC resolvers have to be externally visible. */
32883 TREE_PUBLIC (decl) = 1;
32884 DECL_UNINLINABLE (decl) = 1;
32886 /* Resolver is not external; its body is generated. */
32887 DECL_EXTERNAL (decl) = 0;
32888 DECL_EXTERNAL (dispatch_decl) = 0;
32890 DECL_CONTEXT (decl) = NULL_TREE;
32891 DECL_INITIAL (decl) = make_node (BLOCK);
32892 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32894 if (DECL_COMDAT_GROUP (default_decl)
32895 || TREE_PUBLIC (default_decl))
32897 /* In this case, each translation unit with a call to this
32898 versioned function will put out a resolver. Ensure it
32899 is comdat to keep just one copy. */
32900 DECL_COMDAT (decl) = 1;
32901 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32903 /* Build result decl and add to function_decl. */
32904 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32905 DECL_ARTIFICIAL (t) = 1;
32906 DECL_IGNORED_P (t) = 1;
32907 DECL_RESULT (decl) = t;
32909 gimplify_function_tree (decl);
32910 push_cfun (DECL_STRUCT_FUNCTION (decl));
32911 *empty_bb = init_lowered_empty_function (decl, false, 0);
32913 cgraph_node::add_new_function (decl, true);
32914 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32916 pop_cfun ();
32918 gcc_assert (dispatch_decl != NULL);
32919 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
32920 DECL_ATTRIBUTES (dispatch_decl)
32921 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
32923 /* Create the alias for dispatch to resolver here. */
32924 /*cgraph_create_function_alias (dispatch_decl, decl);*/
32925 cgraph_node::create_same_body_alias (dispatch_decl, decl);
32926 XDELETEVEC (resolver_name);
32927 return decl;
32930 /* Generate the dispatching code body to dispatch multi-versioned function
32931 DECL. The target hook is called to process the "target" attributes and
32932 provide the code to dispatch the right function at run-time. NODE points
32933 to the dispatcher decl whose body will be created. */
32935 static tree
32936 ix86_generate_version_dispatcher_body (void *node_p)
32938 tree resolver_decl;
32939 basic_block empty_bb;
32940 tree default_ver_decl;
32941 struct cgraph_node *versn;
32942 struct cgraph_node *node;
32944 struct cgraph_function_version_info *node_version_info = NULL;
32945 struct cgraph_function_version_info *versn_info = NULL;
32947 node = (cgraph_node *)node_p;
32949 node_version_info = node->function_version ();
32950 gcc_assert (node->dispatcher_function
32951 && node_version_info != NULL);
32953 if (node_version_info->dispatcher_resolver)
32954 return node_version_info->dispatcher_resolver;
32956 /* The first version in the chain corresponds to the default version. */
32957 default_ver_decl = node_version_info->next->this_node->decl;
32959 /* node is going to be an alias, so remove the finalized bit. */
32960 node->definition = false;
32962 resolver_decl = make_resolver_func (default_ver_decl,
32963 node->decl, &empty_bb);
32965 node_version_info->dispatcher_resolver = resolver_decl;
32967 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32969 auto_vec<tree, 2> fn_ver_vec;
32971 for (versn_info = node_version_info->next; versn_info;
32972 versn_info = versn_info->next)
32974 versn = versn_info->this_node;
32975 /* Check for virtual functions here again, as by this time it should
32976 have been determined if this function needs a vtable index or
32977 not. This happens for methods in derived classes that override
32978 virtual methods in base classes but are not explicitly marked as
32979 virtual. */
32980 if (DECL_VINDEX (versn->decl))
32981 sorry ("Virtual function multiversioning not supported");
32983 fn_ver_vec.safe_push (versn->decl);
32986 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32987 cgraph_edge::rebuild_edges ();
32988 pop_cfun ();
32989 return resolver_decl;
32991 /* This builds the processor_model struct type defined in
32992 libgcc/config/i386/cpuinfo.c */
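/* Sketch of the layout mirrored below (see libgcc/config/i386/cpuinfo.c):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */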
32994 static tree
32995 build_processor_model_struct (void)
32997 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32998 "__cpu_features"};
32999 tree field = NULL_TREE, field_chain = NULL_TREE;
33000 int i;
33001 tree type = make_node (RECORD_TYPE);
33003 /* The first 3 fields are unsigned int. */
33004 for (i = 0; i < 3; ++i)
33006 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33007 get_identifier (field_name[i]), unsigned_type_node);
33008 if (field_chain != NULL_TREE)
33009 DECL_CHAIN (field) = field_chain;
33010 field_chain = field;
33013 /* The last field is an array of unsigned integers of size one. */
33014 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
33015 get_identifier (field_name[3]),
33016 build_array_type (unsigned_type_node,
33017 build_index_type (size_one_node)));
33018 if (field_chain != NULL_TREE)
33019 DECL_CHAIN (field) = field_chain;
33020 field_chain = field;
33022 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33023 return type;
33028 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
33028 static tree
33029 make_var_decl (tree type, const char *name)
33031 tree new_decl;
33033 new_decl = build_decl (UNKNOWN_LOCATION,
33034 VAR_DECL,
33035 get_identifier(name),
33036 type);
33038 DECL_EXTERNAL (new_decl) = 1;
33039 TREE_STATIC (new_decl) = 1;
33040 TREE_PUBLIC (new_decl) = 1;
33041 DECL_INITIAL (new_decl) = 0;
33042 DECL_ARTIFICIAL (new_decl) = 0;
33043 DECL_PRESERVE_P (new_decl) = 1;
33045 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33046 assemble_variable (new_decl, 0, 0, 0);
33048 return new_decl;
33051 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33052 into an integer defined in libgcc/config/i386/cpuinfo.c */
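/* Illustrative examples of the folding done here:

     __builtin_cpu_is ("amd")        ->  __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("avx2") ->  __cpu_model.__cpu_features[0]
                                           & (1 << F_AVX2)

   with both results converted to int.  */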
33054 static tree
33055 fold_builtin_cpu (tree fndecl, tree *args)
33057 unsigned int i;
33058 enum ix86_builtins fn_code = (enum ix86_builtins)
33059 DECL_FUNCTION_CODE (fndecl);
33060 tree param_string_cst = NULL;
33062 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33063 enum processor_features
33065 F_CMOV = 0,
33066 F_MMX,
33067 F_POPCNT,
33068 F_SSE,
33069 F_SSE2,
33070 F_SSE3,
33071 F_SSSE3,
33072 F_SSE4_1,
33073 F_SSE4_2,
33074 F_AVX,
33075 F_AVX2,
33076 F_SSE4_A,
33077 F_FMA4,
33078 F_XOP,
33079 F_FMA,
33080 F_AVX512F,
33081 F_BMI,
33082 F_BMI2,
33083 F_AES,
33084 F_PCLMUL,
33085 F_AVX512VL,
33086 F_AVX512BW,
33087 F_AVX512DQ,
33088 F_AVX512CD,
33089 F_AVX512ER,
33090 F_AVX512PF,
33091 F_AVX512VBMI,
33092 F_AVX512IFMA,
33093 F_MAX
33096 /* These are the values for vendor types and cpu types and subtypes
33097 in cpuinfo.c. The corresponding start value must be subtracted
33098 from cpu types and subtypes before they are compared. */
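/* Illustrative example: "nehalem" maps to M_INTEL_COREI7_NEHALEM below, and
   fold_builtin_cpu compares __cpu_model.__cpu_subtype against
   M_INTEL_COREI7_NEHALEM - M_CPU_SUBTYPE_START.  */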
33099 enum processor_model
33101 M_INTEL = 1,
33102 M_AMD,
33103 M_CPU_TYPE_START,
33104 M_INTEL_BONNELL,
33105 M_INTEL_CORE2,
33106 M_INTEL_COREI7,
33107 M_AMDFAM10H,
33108 M_AMDFAM15H,
33109 M_INTEL_SILVERMONT,
33110 M_INTEL_KNL,
33111 M_AMD_BTVER1,
33112 M_AMD_BTVER2,
33113 M_CPU_SUBTYPE_START,
33114 M_INTEL_COREI7_NEHALEM,
33115 M_INTEL_COREI7_WESTMERE,
33116 M_INTEL_COREI7_SANDYBRIDGE,
33117 M_AMDFAM10H_BARCELONA,
33118 M_AMDFAM10H_SHANGHAI,
33119 M_AMDFAM10H_ISTANBUL,
33120 M_AMDFAM15H_BDVER1,
33121 M_AMDFAM15H_BDVER2,
33122 M_AMDFAM15H_BDVER3,
33123 M_AMDFAM15H_BDVER4,
33124 M_AMDFAM17H_ZNVER1,
33125 M_INTEL_COREI7_IVYBRIDGE,
33126 M_INTEL_COREI7_HASWELL,
33127 M_INTEL_COREI7_BROADWELL,
33128 M_INTEL_COREI7_SKYLAKE,
33129 M_INTEL_COREI7_SKYLAKE_AVX512
33132 static struct _arch_names_table
33134 const char *const name;
33135 const enum processor_model model;
33137 const arch_names_table[] =
33139 {"amd", M_AMD},
33140 {"intel", M_INTEL},
33141 {"atom", M_INTEL_BONNELL},
33142 {"slm", M_INTEL_SILVERMONT},
33143 {"core2", M_INTEL_CORE2},
33144 {"corei7", M_INTEL_COREI7},
33145 {"nehalem", M_INTEL_COREI7_NEHALEM},
33146 {"westmere", M_INTEL_COREI7_WESTMERE},
33147 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33148 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33149 {"haswell", M_INTEL_COREI7_HASWELL},
33150 {"broadwell", M_INTEL_COREI7_BROADWELL},
33151 {"skylake", M_INTEL_COREI7_SKYLAKE},
33152 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33153 {"bonnell", M_INTEL_BONNELL},
33154 {"silvermont", M_INTEL_SILVERMONT},
33155 {"knl", M_INTEL_KNL},
33156 {"amdfam10h", M_AMDFAM10H},
33157 {"barcelona", M_AMDFAM10H_BARCELONA},
33158 {"shanghai", M_AMDFAM10H_SHANGHAI},
33159 {"istanbul", M_AMDFAM10H_ISTANBUL},
33160 {"btver1", M_AMD_BTVER1},
33161 {"amdfam15h", M_AMDFAM15H},
33162 {"bdver1", M_AMDFAM15H_BDVER1},
33163 {"bdver2", M_AMDFAM15H_BDVER2},
33164 {"bdver3", M_AMDFAM15H_BDVER3},
33165 {"bdver4", M_AMDFAM15H_BDVER4},
33166 {"btver2", M_AMD_BTVER2},
33167 {"znver1", M_AMDFAM17H_ZNVER1},
33170 static struct _isa_names_table
33172 const char *const name;
33173 const enum processor_features feature;
33175 const isa_names_table[] =
33177 {"cmov", F_CMOV},
33178 {"mmx", F_MMX},
33179 {"popcnt", F_POPCNT},
33180 {"sse", F_SSE},
33181 {"sse2", F_SSE2},
33182 {"sse3", F_SSE3},
33183 {"ssse3", F_SSSE3},
33184 {"sse4a", F_SSE4_A},
33185 {"sse4.1", F_SSE4_1},
33186 {"sse4.2", F_SSE4_2},
33187 {"avx", F_AVX},
33188 {"fma4", F_FMA4},
33189 {"xop", F_XOP},
33190 {"fma", F_FMA},
33191 {"avx2", F_AVX2},
33192 {"avx512f", F_AVX512F},
33193 {"bmi", F_BMI},
33194 {"bmi2", F_BMI2},
33195 {"aes", F_AES},
33196 {"pclmul", F_PCLMUL},
33197 {"avx512vl",F_AVX512VL},
33198 {"avx512bw",F_AVX512BW},
33199 {"avx512dq",F_AVX512DQ},
33200 {"avx512cd",F_AVX512CD},
33201 {"avx512er",F_AVX512ER},
33202 {"avx512pf",F_AVX512PF},
33203 {"avx512vbmi",F_AVX512VBMI},
33204 {"avx512ifma",F_AVX512IFMA},
33207 tree __processor_model_type = build_processor_model_struct ();
33208 tree __cpu_model_var = make_var_decl (__processor_model_type,
33209 "__cpu_model");
33212 varpool_node::add (__cpu_model_var);
33214 gcc_assert ((args != NULL) && (*args != NULL));
33216 param_string_cst = *args;
33217 while (param_string_cst
33218 && TREE_CODE (param_string_cst) != STRING_CST)
33220 /* *args must be an expr that can contain other EXPRs leading to a
33221 STRING_CST. */
33222 if (!EXPR_P (param_string_cst))
33224 error ("Parameter to builtin must be a string constant or literal");
33225 return integer_zero_node;
33227 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33230 gcc_assert (param_string_cst);
33232 if (fn_code == IX86_BUILTIN_CPU_IS)
33234 tree ref;
33235 tree field;
33236 tree final;
33238 unsigned int field_val = 0;
33239 unsigned int NUM_ARCH_NAMES
33240 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33242 for (i = 0; i < NUM_ARCH_NAMES; i++)
33243 if (strcmp (arch_names_table[i].name,
33244 TREE_STRING_POINTER (param_string_cst)) == 0)
33245 break;
33247 if (i == NUM_ARCH_NAMES)
33249 error ("Parameter to builtin not valid: %s",
33250 TREE_STRING_POINTER (param_string_cst));
33251 return integer_zero_node;
33254 field = TYPE_FIELDS (__processor_model_type);
33255 field_val = arch_names_table[i].model;
33257 /* CPU types are stored in the next field. */
33258 if (field_val > M_CPU_TYPE_START
33259 && field_val < M_CPU_SUBTYPE_START)
33261 field = DECL_CHAIN (field);
33262 field_val -= M_CPU_TYPE_START;
33265 /* CPU subtypes are stored in the next field. */
33266 if (field_val > M_CPU_SUBTYPE_START)
33268 field = DECL_CHAIN (DECL_CHAIN (field));
33269 field_val -= M_CPU_SUBTYPE_START;
33272 /* Get the appropriate field in __cpu_model. */
33273 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33274 field, NULL_TREE);
33276 /* Check the value. */
33277 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33278 build_int_cstu (unsigned_type_node, field_val));
33279 return build1 (CONVERT_EXPR, integer_type_node, final);
33281 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33283 tree ref;
33284 tree array_elt;
33285 tree field;
33286 tree final;
33288 unsigned int field_val = 0;
33289 unsigned int NUM_ISA_NAMES
33290 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33292 for (i = 0; i < NUM_ISA_NAMES; i++)
33293 if (strcmp (isa_names_table[i].name,
33294 TREE_STRING_POINTER (param_string_cst)) == 0)
33295 break;
33297 if (i == NUM_ISA_NAMES)
33299 error ("Parameter to builtin not valid: %s",
33300 TREE_STRING_POINTER (param_string_cst));
33301 return integer_zero_node;
33304 field = TYPE_FIELDS (__processor_model_type);
33305 /* Get the last field, which is __cpu_features. */
33306 while (DECL_CHAIN (field))
33307 field = DECL_CHAIN (field);
33309 /* Get the appropriate field: __cpu_model.__cpu_features */
33310 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33311 field, NULL_TREE);
33313 /* Access the 0th element of __cpu_features array. */
33314 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33315 integer_zero_node, NULL_TREE, NULL_TREE);
33317 field_val = (1 << isa_names_table[i].feature);
33318 /* Return __cpu_model.__cpu_features[0] & field_val */
33319 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33320 build_int_cstu (unsigned_type_node, field_val));
33321 return build1 (CONVERT_EXPR, integer_type_node, final);
33323 gcc_unreachable ();
33326 static tree
33327 ix86_fold_builtin (tree fndecl, int n_args,
33328 tree *args, bool ignore ATTRIBUTE_UNUSED)
33330 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33332 enum ix86_builtins fn_code = (enum ix86_builtins)
33333 DECL_FUNCTION_CODE (fndecl);
33334 switch (fn_code)
33336 case IX86_BUILTIN_CPU_IS:
33337 case IX86_BUILTIN_CPU_SUPPORTS:
33338 gcc_assert (n_args == 1);
33339 return fold_builtin_cpu (fndecl, args);
33341 case IX86_BUILTIN_NANQ:
33342 case IX86_BUILTIN_NANSQ:
33344 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33345 const char *str = c_getstr (*args);
33346 int quiet = fn_code == IX86_BUILTIN_NANQ;
33347 REAL_VALUE_TYPE real;
33349 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33350 return build_real (type, real);
33351 return NULL_TREE;
33354 case IX86_BUILTIN_INFQ:
33355 case IX86_BUILTIN_HUGE_VALQ:
33357 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33358 REAL_VALUE_TYPE inf;
33359 real_inf (&inf);
33360 return build_real (type, inf);
33363 case IX86_BUILTIN_TZCNT16:
33364 case IX86_BUILTIN_TZCNT32:
33365 case IX86_BUILTIN_TZCNT64:
33366 gcc_assert (n_args == 1);
33367 if (TREE_CODE (args[0]) == INTEGER_CST)
33369 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33370 tree arg = args[0];
33371 if (fn_code == IX86_BUILTIN_TZCNT16)
33372 arg = fold_convert (short_unsigned_type_node, arg);
33373 if (integer_zerop (arg))
33374 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33375 else
33376 return fold_const_call (CFN_CTZ, type, arg);
33378 break;
33380 case IX86_BUILTIN_LZCNT16:
33381 case IX86_BUILTIN_LZCNT32:
33382 case IX86_BUILTIN_LZCNT64:
33383 gcc_assert (n_args == 1);
33384 if (TREE_CODE (args[0]) == INTEGER_CST)
33386 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33387 tree arg = args[0];
33388 if (fn_code == IX86_BUILTIN_LZCNT16)
33389 arg = fold_convert (short_unsigned_type_node, arg);
33390 if (integer_zerop (arg))
33391 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33392 else
33393 return fold_const_call (CFN_CLZ, type, arg);
33395 break;
33397 case IX86_BUILTIN_BEXTR32:
33398 case IX86_BUILTIN_BEXTR64:
33399 case IX86_BUILTIN_BEXTRI32:
33400 case IX86_BUILTIN_BEXTRI64:
33401 gcc_assert (n_args == 2);
33402 if (tree_fits_uhwi_p (args[1]))
33404 unsigned HOST_WIDE_INT res = 0;
33405 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33406 unsigned int start = tree_to_uhwi (args[1]);
33407 unsigned int len = (start & 0xff00) >> 8;
33408 start &= 0xff;
33409 if (start >= prec || len == 0)
33410 res = 0;
33411 else if (!tree_fits_uhwi_p (args[0]))
33412 break;
33413 else
33414 res = tree_to_uhwi (args[0]) >> start;
33415 if (len > prec)
33416 len = prec;
33417 if (len < HOST_BITS_PER_WIDE_INT)
33418 res &= (HOST_WIDE_INT_1U << len) - 1;
33419 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33421 break;
33423 case IX86_BUILTIN_BZHI32:
33424 case IX86_BUILTIN_BZHI64:
33425 gcc_assert (n_args == 2);
33426 if (tree_fits_uhwi_p (args[1]))
33428 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33429 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33430 return args[0];
33431 if (!tree_fits_uhwi_p (args[0]))
33432 break;
33433 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33434 res &= ~(HOST_WIDE_INT_M1U << idx);
33435 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33437 break;
33439 case IX86_BUILTIN_PDEP32:
33440 case IX86_BUILTIN_PDEP64:
33441 gcc_assert (n_args == 2);
33442 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33444 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33445 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33446 unsigned HOST_WIDE_INT res = 0;
33447 unsigned HOST_WIDE_INT m, k = 1;
33448 for (m = 1; m; m <<= 1)
33449 if ((mask & m) != 0)
33451 if ((src & k) != 0)
33452 res |= m;
33453 k <<= 1;
33455 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33457 break;
33459 case IX86_BUILTIN_PEXT32:
33460 case IX86_BUILTIN_PEXT64:
33461 gcc_assert (n_args == 2);
33462 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33464 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33465 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33466 unsigned HOST_WIDE_INT res = 0;
33467 unsigned HOST_WIDE_INT m, k = 1;
33468 for (m = 1; m; m <<= 1)
33469 if ((mask & m) != 0)
33471 if ((src & m) != 0)
33472 res |= k;
33473 k <<= 1;
33475 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33477 break;
33479 default:
33480 break;
33484 #ifdef SUBTARGET_FOLD_BUILTIN
33485 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33486 #endif
33488 return NULL_TREE;
33491 /* Fold an MD builtin in GIMPLE (use ix86_fold_builtin for folding
33492 into a constant). */
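/* Illustrative example: in the IX86_BUILTIN_TZCNT32 case below, a call
   whose operand is provably non-zero is rewritten into
   tmp = __builtin_ctz (x) followed by a cast of tmp to the original result
   type; the PDEP/PEXT cases collapse a call with an all-ones mask into a
   plain copy of the first argument.  */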
33494 bool
33495 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33497 gimple *stmt = gsi_stmt (*gsi);
33498 tree fndecl = gimple_call_fndecl (stmt);
33499 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33500 int n_args = gimple_call_num_args (stmt);
33501 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33502 tree decl = NULL_TREE;
33503 tree arg0, arg1;
33505 switch (fn_code)
33507 case IX86_BUILTIN_TZCNT32:
33508 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33509 goto fold_tzcnt_lzcnt;
33511 case IX86_BUILTIN_TZCNT64:
33512 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33513 goto fold_tzcnt_lzcnt;
33515 case IX86_BUILTIN_LZCNT32:
33516 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33517 goto fold_tzcnt_lzcnt;
33519 case IX86_BUILTIN_LZCNT64:
33520 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33521 goto fold_tzcnt_lzcnt;
33523 fold_tzcnt_lzcnt:
33524 gcc_assert (n_args == 1);
33525 arg0 = gimple_call_arg (stmt, 0);
33526 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33528 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33529 /* If arg0 is provably non-zero, optimize into the generic
33530 __builtin_c[tl]z{,ll} functions, which the middle-end handles
33531 better. */
33532 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33533 return false;
33535 location_t loc = gimple_location (stmt);
33536 gimple *g = gimple_build_call (decl, 1, arg0);
33537 gimple_set_location (g, loc);
33538 tree lhs = make_ssa_name (integer_type_node);
33539 gimple_call_set_lhs (g, lhs);
33540 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33541 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33542 gimple_set_location (g, loc);
33543 gsi_replace (gsi, g, false);
33544 return true;
33546 break;
33548 case IX86_BUILTIN_BZHI32:
33549 case IX86_BUILTIN_BZHI64:
33550 gcc_assert (n_args == 2);
33551 arg1 = gimple_call_arg (stmt, 1);
33552 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33554 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33555 arg0 = gimple_call_arg (stmt, 0);
33556 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33557 break;
33558 location_t loc = gimple_location (stmt);
33559 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33560 gimple_set_location (g, loc);
33561 gsi_replace (gsi, g, false);
33562 return true;
33564 break;
33566 case IX86_BUILTIN_PDEP32:
33567 case IX86_BUILTIN_PDEP64:
33568 case IX86_BUILTIN_PEXT32:
33569 case IX86_BUILTIN_PEXT64:
33570 gcc_assert (n_args == 2);
33571 arg1 = gimple_call_arg (stmt, 1);
33572 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33574 location_t loc = gimple_location (stmt);
33575 arg0 = gimple_call_arg (stmt, 0);
33576 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33577 gimple_set_location (g, loc);
33578 gsi_replace (gsi, g, false);
33579 return true;
33581 break;
33583 default:
33584 break;
33587 return false;
33590 /* Make builtins to detect cpu type and features supported. NAME is
33591 the builtin name, CODE is the builtin code, and FTYPE is the function
33592 type of the builtin. */
33594 static void
33595 make_cpu_type_builtin (const char* name, int code,
33596 enum ix86_builtin_func_type ftype, bool is_const)
33598 tree decl;
33599 tree type;
33601 type = ix86_get_builtin_func_type (ftype);
33602 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33603 NULL, NULL_TREE);
33604 gcc_assert (decl != NULL_TREE);
33605 ix86_builtins[(int) code] = decl;
33606 TREE_READONLY (decl) = is_const;
33609 /* Make builtins to get CPU type and features supported. The created
33610 builtins are:
33612 __builtin_cpu_init (), to detect cpu type and features,
33613 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33614 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
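/* Usage sketch from user code, where use_avx2_path is a hypothetical
   function:

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx2"))
       use_avx2_path ();  */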
33617 static void
33618 ix86_init_platform_type_builtins (void)
33620 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33621 INT_FTYPE_VOID, false);
33622 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33623 INT_FTYPE_PCCHAR, true);
33624 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33625 INT_FTYPE_PCCHAR, true);
33628 /* Internal method for ix86_init_builtins. */
33630 static void
33631 ix86_init_builtins_va_builtins_abi (void)
33633 tree ms_va_ref, sysv_va_ref;
33634 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33635 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33636 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33637 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33639 if (!TARGET_64BIT)
33640 return;
33641 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33642 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33643 ms_va_ref = build_reference_type (ms_va_list_type_node);
33644 sysv_va_ref =
33645 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33647 fnvoid_va_end_ms =
33648 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33649 fnvoid_va_start_ms =
33650 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33651 fnvoid_va_end_sysv =
33652 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33653 fnvoid_va_start_sysv =
33654 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33655 NULL_TREE);
33656 fnvoid_va_copy_ms =
33657 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33658 NULL_TREE);
33659 fnvoid_va_copy_sysv =
33660 build_function_type_list (void_type_node, sysv_va_ref,
33661 sysv_va_ref, NULL_TREE);
33663 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33664 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33665 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33666 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33667 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33668 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33669 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33670 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33671 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33672 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33673 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33674 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33677 static void
33678 ix86_init_builtin_types (void)
33680 tree float80_type_node, const_string_type_node;
33682 /* The __float80 type. */
33683 float80_type_node = long_double_type_node;
33684 if (TYPE_MODE (float80_type_node) != XFmode)
33686 if (float64x_type_node != NULL_TREE
33687 && TYPE_MODE (float64x_type_node) == XFmode)
33688 float80_type_node = float64x_type_node;
33689 else
33691 /* The __float80 type. */
33692 float80_type_node = make_node (REAL_TYPE);
33694 TYPE_PRECISION (float80_type_node) = 80;
33695 layout_type (float80_type_node);
33698 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33700 /* The __float128 type. The node has already been created as
33701 _Float128, so we only need to register the __float128 name for
33702 it. */
33703 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33705 const_string_type_node
33706 = build_pointer_type (build_qualified_type
33707 (char_type_node, TYPE_QUAL_CONST));
33709 /* This macro is built by i386-builtin-types.awk. */
33710 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33713 static void
33714 ix86_init_builtins (void)
33716 tree ftype, decl;
33718 ix86_init_builtin_types ();
33720 /* Builtins to get CPU type and features. */
33721 ix86_init_platform_type_builtins ();
33723 /* TFmode support builtins. */
33724 def_builtin_const (0, "__builtin_infq",
33725 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33726 def_builtin_const (0, "__builtin_huge_valq",
33727 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33729 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33730 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33731 BUILT_IN_MD, "nanq", NULL_TREE);
33732 TREE_READONLY (decl) = 1;
33733 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33735 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33736 BUILT_IN_MD, "nansq", NULL_TREE);
33737 TREE_READONLY (decl) = 1;
33738 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33740 /* We will expand them to normal calls if SSE isn't available, since
33741 they are used by libgcc. */
33742 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33743 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33744 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33745 TREE_READONLY (decl) = 1;
33746 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33748 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33749 decl = add_builtin_function ("__builtin_copysignq", ftype,
33750 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33751 "__copysigntf3", NULL_TREE);
33752 TREE_READONLY (decl) = 1;
33753 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33755 ix86_init_tm_builtins ();
33756 ix86_init_mmx_sse_builtins ();
33757 ix86_init_mpx_builtins ();
33759 if (TARGET_LP64)
33760 ix86_init_builtins_va_builtins_abi ();
33762 #ifdef SUBTARGET_INIT_BUILTINS
33763 SUBTARGET_INIT_BUILTINS;
33764 #endif
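/* Illustrative only: the TFmode builtins registered above as seen from
   user code (variable names are hypothetical):

     __float128 a = __builtin_infq ();
     __float128 b = __builtin_fabsq (-2.0q);            -- 2.0
     __float128 c = __builtin_copysignq (3.0q, -1.0q);  -- -3.0

   Without SSE these expand to calls to the libgcc routines named in the
   registrations above (__fabstf2, __copysigntf3).  */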
33767 /* Return the ix86 builtin for CODE. */
33769 static tree
33770 ix86_builtin_decl (unsigned code, bool)
33772 if (code >= IX86_BUILTIN_MAX)
33773 return error_mark_node;
33775 return ix86_builtins[code];
33778 /* Errors in the source file can cause expand_expr to return const0_rtx
33779 where we expect a vector. To avoid crashing, use one of the vector
33780 clear instructions. */
33781 static rtx
33782 safe_vector_operand (rtx x, machine_mode mode)
33784 if (x == const0_rtx)
33785 x = CONST0_RTX (mode);
33786 return x;
33789 /* Fix up modeless constants to fit the required mode. */
33790 static rtx
33791 fixup_modeless_constant (rtx x, machine_mode mode)
33793 if (GET_MODE (x) == VOIDmode)
33794 x = convert_to_mode (mode, x, 1);
33795 return x;
33798 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33800 static rtx
33801 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33803 rtx pat;
33804 tree arg0 = CALL_EXPR_ARG (exp, 0);
33805 tree arg1 = CALL_EXPR_ARG (exp, 1);
33806 rtx op0 = expand_normal (arg0);
33807 rtx op1 = expand_normal (arg1);
33808 machine_mode tmode = insn_data[icode].operand[0].mode;
33809 machine_mode mode0 = insn_data[icode].operand[1].mode;
33810 machine_mode mode1 = insn_data[icode].operand[2].mode;
33812 if (VECTOR_MODE_P (mode0))
33813 op0 = safe_vector_operand (op0, mode0);
33814 if (VECTOR_MODE_P (mode1))
33815 op1 = safe_vector_operand (op1, mode1);
33817 if (optimize || !target
33818 || GET_MODE (target) != tmode
33819 || !insn_data[icode].operand[0].predicate (target, tmode))
33820 target = gen_reg_rtx (tmode);
33822 if (GET_MODE (op1) == SImode && mode1 == TImode)
33824 rtx x = gen_reg_rtx (V4SImode);
33825 emit_insn (gen_sse2_loadd (x, op1));
33826 op1 = gen_lowpart (TImode, x);
33829 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33830 op0 = copy_to_mode_reg (mode0, op0);
33831 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33832 op1 = copy_to_mode_reg (mode1, op1);
33834 pat = GEN_FCN (icode) (target, op0, op1);
33835 if (! pat)
33836 return 0;
33838 emit_insn (pat);
33840 return target;
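/* User-level sketch (illustrative): a typical two-operand builtin that
   reaches the expander above, e.g. the SSE2 integer add builtin
   __builtin_ia32_paddd128 (V4SI_FTYPE_V4SI_V4SI, comparison UNKNOWN):

     typedef int __v4si __attribute__ ((__vector_size__ (16)));
     __v4si add4 (__v4si a, __v4si b)
     {
       return __builtin_ia32_paddd128 (a, b);
     }

   Both call arguments are forced into operands acceptable to the insn
   pattern and a single insn is emitted into TARGET.  */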
33843 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33845 static rtx
33846 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33847 enum ix86_builtin_func_type m_type,
33848 enum rtx_code sub_code)
33850 rtx pat;
33851 int i;
33852 int nargs;
33853 bool comparison_p = false;
33854 bool tf_p = false;
33855 bool last_arg_constant = false;
33856 int num_memory = 0;
33857 struct {
33858 rtx op;
33859 machine_mode mode;
33860 } args[4];
33862 machine_mode tmode = insn_data[icode].operand[0].mode;
33864 switch (m_type)
33866 case MULTI_ARG_4_DF2_DI_I:
33867 case MULTI_ARG_4_DF2_DI_I1:
33868 case MULTI_ARG_4_SF2_SI_I:
33869 case MULTI_ARG_4_SF2_SI_I1:
33870 nargs = 4;
33871 last_arg_constant = true;
33872 break;
33874 case MULTI_ARG_3_SF:
33875 case MULTI_ARG_3_DF:
33876 case MULTI_ARG_3_SF2:
33877 case MULTI_ARG_3_DF2:
33878 case MULTI_ARG_3_DI:
33879 case MULTI_ARG_3_SI:
33880 case MULTI_ARG_3_SI_DI:
33881 case MULTI_ARG_3_HI:
33882 case MULTI_ARG_3_HI_SI:
33883 case MULTI_ARG_3_QI:
33884 case MULTI_ARG_3_DI2:
33885 case MULTI_ARG_3_SI2:
33886 case MULTI_ARG_3_HI2:
33887 case MULTI_ARG_3_QI2:
33888 nargs = 3;
33889 break;
33891 case MULTI_ARG_2_SF:
33892 case MULTI_ARG_2_DF:
33893 case MULTI_ARG_2_DI:
33894 case MULTI_ARG_2_SI:
33895 case MULTI_ARG_2_HI:
33896 case MULTI_ARG_2_QI:
33897 nargs = 2;
33898 break;
33900 case MULTI_ARG_2_DI_IMM:
33901 case MULTI_ARG_2_SI_IMM:
33902 case MULTI_ARG_2_HI_IMM:
33903 case MULTI_ARG_2_QI_IMM:
33904 nargs = 2;
33905 last_arg_constant = true;
33906 break;
33908 case MULTI_ARG_1_SF:
33909 case MULTI_ARG_1_DF:
33910 case MULTI_ARG_1_SF2:
33911 case MULTI_ARG_1_DF2:
33912 case MULTI_ARG_1_DI:
33913 case MULTI_ARG_1_SI:
33914 case MULTI_ARG_1_HI:
33915 case MULTI_ARG_1_QI:
33916 case MULTI_ARG_1_SI_DI:
33917 case MULTI_ARG_1_HI_DI:
33918 case MULTI_ARG_1_HI_SI:
33919 case MULTI_ARG_1_QI_DI:
33920 case MULTI_ARG_1_QI_SI:
33921 case MULTI_ARG_1_QI_HI:
33922 nargs = 1;
33923 break;
33925 case MULTI_ARG_2_DI_CMP:
33926 case MULTI_ARG_2_SI_CMP:
33927 case MULTI_ARG_2_HI_CMP:
33928 case MULTI_ARG_2_QI_CMP:
33929 nargs = 2;
33930 comparison_p = true;
33931 break;
33933 case MULTI_ARG_2_SF_TF:
33934 case MULTI_ARG_2_DF_TF:
33935 case MULTI_ARG_2_DI_TF:
33936 case MULTI_ARG_2_SI_TF:
33937 case MULTI_ARG_2_HI_TF:
33938 case MULTI_ARG_2_QI_TF:
33939 nargs = 2;
33940 tf_p = true;
33941 break;
33943 default:
33944 gcc_unreachable ();
33947 if (optimize || !target
33948 || GET_MODE (target) != tmode
33949 || !insn_data[icode].operand[0].predicate (target, tmode))
33950 target = gen_reg_rtx (tmode);
33952 gcc_assert (nargs <= 4);
33954 for (i = 0; i < nargs; i++)
33956 tree arg = CALL_EXPR_ARG (exp, i);
33957 rtx op = expand_normal (arg);
33958 int adjust = (comparison_p) ? 1 : 0;
33959 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33961 if (last_arg_constant && i == nargs - 1)
33963 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33965 enum insn_code new_icode = icode;
33966 switch (icode)
33968 case CODE_FOR_xop_vpermil2v2df3:
33969 case CODE_FOR_xop_vpermil2v4sf3:
33970 case CODE_FOR_xop_vpermil2v4df3:
33971 case CODE_FOR_xop_vpermil2v8sf3:
33972 error ("the last argument must be a 2-bit immediate");
33973 return gen_reg_rtx (tmode);
33974 case CODE_FOR_xop_rotlv2di3:
33975 new_icode = CODE_FOR_rotlv2di3;
33976 goto xop_rotl;
33977 case CODE_FOR_xop_rotlv4si3:
33978 new_icode = CODE_FOR_rotlv4si3;
33979 goto xop_rotl;
33980 case CODE_FOR_xop_rotlv8hi3:
33981 new_icode = CODE_FOR_rotlv8hi3;
33982 goto xop_rotl;
33983 case CODE_FOR_xop_rotlv16qi3:
33984 new_icode = CODE_FOR_rotlv16qi3;
33985 xop_rotl:
33986 if (CONST_INT_P (op))
33988 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33989 op = GEN_INT (INTVAL (op) & mask);
33990 gcc_checking_assert
33991 (insn_data[icode].operand[i + 1].predicate (op, mode));
33993 else
33995 gcc_checking_assert
33996 (nargs == 2
33997 && insn_data[new_icode].operand[0].mode == tmode
33998 && insn_data[new_icode].operand[1].mode == tmode
33999 && insn_data[new_icode].operand[2].mode == mode
34000 && insn_data[new_icode].operand[0].predicate
34001 == insn_data[icode].operand[0].predicate
34002 && insn_data[new_icode].operand[1].predicate
34003 == insn_data[icode].operand[1].predicate);
34004 icode = new_icode;
34005 goto non_constant;
34007 break;
34008 default:
34009 gcc_unreachable ();
34013 else
34015 non_constant:
34016 if (VECTOR_MODE_P (mode))
34017 op = safe_vector_operand (op, mode);
34019 /* If we aren't optimizing, only allow one memory operand to be
34020 generated. */
34021 if (memory_operand (op, mode))
34022 num_memory++;
34024 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34026 if (optimize
34027 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34028 || num_memory > 1)
34029 op = force_reg (mode, op);
34032 args[i].op = op;
34033 args[i].mode = mode;
34036 switch (nargs)
34038 case 1:
34039 pat = GEN_FCN (icode) (target, args[0].op);
34040 break;
34042 case 2:
34043 if (tf_p)
34044 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34045 GEN_INT ((int)sub_code));
34046 else if (! comparison_p)
34047 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34048 else
34050 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34051 args[0].op,
34052 args[1].op);
34054 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34056 break;
34058 case 3:
34059 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34060 break;
34062 case 4:
34063 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34064 break;
34066 default:
34067 gcc_unreachable ();
34070 if (! pat)
34071 return 0;
34073 emit_insn (pat);
34074 return target;
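/* Worked example of the constant-rotate masking above (illustrative):
   for an XOP rotate whose target mode is V4SImode,
   GET_MODE_UNIT_BITSIZE (tmode) is 32, so mask is 31 and a source count
   of 33 is reduced to GEN_INT (1), i.e. constant rotation counts are
   taken modulo the element width before the insn predicate is checked.  */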
34077 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34078 insns with vec_merge. */
34080 static rtx
34081 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34082 rtx target)
34084 rtx pat;
34085 tree arg0 = CALL_EXPR_ARG (exp, 0);
34086 rtx op1, op0 = expand_normal (arg0);
34087 machine_mode tmode = insn_data[icode].operand[0].mode;
34088 machine_mode mode0 = insn_data[icode].operand[1].mode;
34090 if (optimize || !target
34091 || GET_MODE (target) != tmode
34092 || !insn_data[icode].operand[0].predicate (target, tmode))
34093 target = gen_reg_rtx (tmode);
34095 if (VECTOR_MODE_P (mode0))
34096 op0 = safe_vector_operand (op0, mode0);
34098 if ((optimize && !register_operand (op0, mode0))
34099 || !insn_data[icode].operand[1].predicate (op0, mode0))
34100 op0 = copy_to_mode_reg (mode0, op0);
34102 op1 = op0;
34103 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34104 op1 = copy_to_mode_reg (mode0, op1);
34106 pat = GEN_FCN (icode) (target, op0, op1);
34107 if (! pat)
34108 return 0;
34109 emit_insn (pat);
34110 return target;
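/* Illustrative: the scalar vec_merge unops handled above back builtins
   such as __builtin_ia32_sqrtss (assumed here; type and function name
   are for illustration):

     typedef float __v4sf __attribute__ ((__vector_size__ (16)));
     __v4sf sqrt_low (__v4sf a)
     {
       return __builtin_ia32_sqrtss (a);  -- element 0 gets the sqrt,
                                             elements 1..3 pass through
     }

   OP1 is set to OP0 above because the insn pattern merges the scalar
   result back into the remaining elements of the same input.  */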
34113 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34115 static rtx
34116 ix86_expand_sse_compare (const struct builtin_description *d,
34117 tree exp, rtx target, bool swap)
34119 rtx pat;
34120 tree arg0 = CALL_EXPR_ARG (exp, 0);
34121 tree arg1 = CALL_EXPR_ARG (exp, 1);
34122 rtx op0 = expand_normal (arg0);
34123 rtx op1 = expand_normal (arg1);
34124 rtx op2;
34125 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34126 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34127 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34128 enum rtx_code comparison = d->comparison;
34130 if (VECTOR_MODE_P (mode0))
34131 op0 = safe_vector_operand (op0, mode0);
34132 if (VECTOR_MODE_P (mode1))
34133 op1 = safe_vector_operand (op1, mode1);
34135 /* Swap operands if we have a comparison that isn't available in
34136 hardware. */
34137 if (swap)
34138 std::swap (op0, op1);
34140 if (optimize || !target
34141 || GET_MODE (target) != tmode
34142 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34143 target = gen_reg_rtx (tmode);
34145 if ((optimize && !register_operand (op0, mode0))
34146 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34147 op0 = copy_to_mode_reg (mode0, op0);
34148 if ((optimize && !register_operand (op1, mode1))
34149 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34150 op1 = copy_to_mode_reg (mode1, op1);
34152 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34153 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34154 if (! pat)
34155 return 0;
34156 emit_insn (pat);
34157 return target;
34160 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
34162 static rtx
34163 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34164 rtx target)
34166 rtx pat;
34167 tree arg0 = CALL_EXPR_ARG (exp, 0);
34168 tree arg1 = CALL_EXPR_ARG (exp, 1);
34169 rtx op0 = expand_normal (arg0);
34170 rtx op1 = expand_normal (arg1);
34171 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34172 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34173 enum rtx_code comparison = d->comparison;
34175 if (VECTOR_MODE_P (mode0))
34176 op0 = safe_vector_operand (op0, mode0);
34177 if (VECTOR_MODE_P (mode1))
34178 op1 = safe_vector_operand (op1, mode1);
34180 /* Swap operands if we have a comparison that isn't available in
34181 hardware. */
34182 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34183 std::swap (op0, op1);
34185 target = gen_reg_rtx (SImode);
34186 emit_move_insn (target, const0_rtx);
34187 target = gen_rtx_SUBREG (QImode, target, 0);
34189 if ((optimize && !register_operand (op0, mode0))
34190 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34191 op0 = copy_to_mode_reg (mode0, op0);
34192 if ((optimize && !register_operand (op1, mode1))
34193 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34194 op1 = copy_to_mode_reg (mode1, op1);
34196 pat = GEN_FCN (d->icode) (op0, op1);
34197 if (! pat)
34198 return 0;
34199 emit_insn (pat);
34200 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34201 gen_rtx_fmt_ee (comparison, QImode,
34202 SET_DEST (pat),
34203 const0_rtx)));
34205 return SUBREG_REG (target);
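/* Sketch of the RTL produced by the sequence above (modes and the
   comparison code are illustrative):

     (set (reg:SI tmp) (const_int 0))
     ... comi insn setting (reg:CCFP flags) ...
     (set (strict_low_part (subreg:QI (reg:SI tmp) 0))
          (eq:QI (reg:CCFP flags) (const_int 0)))

   Zeroing the SImode register first and writing only its low QImode part
   yields a properly zero-extended 0/1 result, which is why
   SUBREG_REG (target), the SImode register, is returned.  */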
34208 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34210 static rtx
34211 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34212 rtx target)
34214 rtx pat;
34215 tree arg0 = CALL_EXPR_ARG (exp, 0);
34216 rtx op1, op0 = expand_normal (arg0);
34217 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34218 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34220 if (optimize || target == 0
34221 || GET_MODE (target) != tmode
34222 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34223 target = gen_reg_rtx (tmode);
34225 if (VECTOR_MODE_P (mode0))
34226 op0 = safe_vector_operand (op0, mode0);
34228 if ((optimize && !register_operand (op0, mode0))
34229 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34230 op0 = copy_to_mode_reg (mode0, op0);
34232 op1 = GEN_INT (d->comparison);
34234 pat = GEN_FCN (d->icode) (target, op0, op1);
34235 if (! pat)
34236 return 0;
34237 emit_insn (pat);
34238 return target;
34241 static rtx
34242 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34243 tree exp, rtx target)
34245 rtx pat;
34246 tree arg0 = CALL_EXPR_ARG (exp, 0);
34247 tree arg1 = CALL_EXPR_ARG (exp, 1);
34248 rtx op0 = expand_normal (arg0);
34249 rtx op1 = expand_normal (arg1);
34250 rtx op2;
34251 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34252 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34253 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34255 if (optimize || target == 0
34256 || GET_MODE (target) != tmode
34257 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34258 target = gen_reg_rtx (tmode);
34260 op0 = safe_vector_operand (op0, mode0);
34261 op1 = safe_vector_operand (op1, mode1);
34263 if ((optimize && !register_operand (op0, mode0))
34264 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34265 op0 = copy_to_mode_reg (mode0, op0);
34266 if ((optimize && !register_operand (op1, mode1))
34267 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34268 op1 = copy_to_mode_reg (mode1, op1);
34270 op2 = GEN_INT (d->comparison);
34272 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34273 if (! pat)
34274 return 0;
34275 emit_insn (pat);
34276 return target;
34279 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34281 static rtx
34282 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34283 rtx target)
34285 rtx pat;
34286 tree arg0 = CALL_EXPR_ARG (exp, 0);
34287 tree arg1 = CALL_EXPR_ARG (exp, 1);
34288 rtx op0 = expand_normal (arg0);
34289 rtx op1 = expand_normal (arg1);
34290 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34291 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34292 enum rtx_code comparison = d->comparison;
34294 if (VECTOR_MODE_P (mode0))
34295 op0 = safe_vector_operand (op0, mode0);
34296 if (VECTOR_MODE_P (mode1))
34297 op1 = safe_vector_operand (op1, mode1);
34299 target = gen_reg_rtx (SImode);
34300 emit_move_insn (target, const0_rtx);
34301 target = gen_rtx_SUBREG (QImode, target, 0);
34303 if ((optimize && !register_operand (op0, mode0))
34304 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34305 op0 = copy_to_mode_reg (mode0, op0);
34306 if ((optimize && !register_operand (op1, mode1))
34307 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34308 op1 = copy_to_mode_reg (mode1, op1);
34310 pat = GEN_FCN (d->icode) (op0, op1);
34311 if (! pat)
34312 return 0;
34313 emit_insn (pat);
34314 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34315 gen_rtx_fmt_ee (comparison, QImode,
34316 SET_DEST (pat),
34317 const0_rtx)));
34319 return SUBREG_REG (target);
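/* Illustrative user-level view of a ptest builtin reaching the expander
   above (SSE4.1; the function name is hypothetical):

     #include <smmintrin.h>
     int all_masked_bits_zero (__m128i a, __m128i mask)
     {
       return _mm_testz_si128 (a, mask);
     }

   The expander emits the ptest insn and then materializes the requested
   flag as a 0/1 value via the same STRICT_LOW_PART sequence used for the
   comi builtins.  */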
34322 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34324 static rtx
34325 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34326 tree exp, rtx target)
34328 rtx pat;
34329 tree arg0 = CALL_EXPR_ARG (exp, 0);
34330 tree arg1 = CALL_EXPR_ARG (exp, 1);
34331 tree arg2 = CALL_EXPR_ARG (exp, 2);
34332 tree arg3 = CALL_EXPR_ARG (exp, 3);
34333 tree arg4 = CALL_EXPR_ARG (exp, 4);
34334 rtx scratch0, scratch1;
34335 rtx op0 = expand_normal (arg0);
34336 rtx op1 = expand_normal (arg1);
34337 rtx op2 = expand_normal (arg2);
34338 rtx op3 = expand_normal (arg3);
34339 rtx op4 = expand_normal (arg4);
34340 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34342 tmode0 = insn_data[d->icode].operand[0].mode;
34343 tmode1 = insn_data[d->icode].operand[1].mode;
34344 modev2 = insn_data[d->icode].operand[2].mode;
34345 modei3 = insn_data[d->icode].operand[3].mode;
34346 modev4 = insn_data[d->icode].operand[4].mode;
34347 modei5 = insn_data[d->icode].operand[5].mode;
34348 modeimm = insn_data[d->icode].operand[6].mode;
34350 if (VECTOR_MODE_P (modev2))
34351 op0 = safe_vector_operand (op0, modev2);
34352 if (VECTOR_MODE_P (modev4))
34353 op2 = safe_vector_operand (op2, modev4);
34355 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34356 op0 = copy_to_mode_reg (modev2, op0);
34357 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34358 op1 = copy_to_mode_reg (modei3, op1);
34359 if ((optimize && !register_operand (op2, modev4))
34360 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34361 op2 = copy_to_mode_reg (modev4, op2);
34362 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34363 op3 = copy_to_mode_reg (modei5, op3);
34365 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34367 error ("the fifth argument must be an 8-bit immediate");
34368 return const0_rtx;
34371 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34373 if (optimize || !target
34374 || GET_MODE (target) != tmode0
34375 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34376 target = gen_reg_rtx (tmode0);
34378 scratch1 = gen_reg_rtx (tmode1);
34380 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34382 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34384 if (optimize || !target
34385 || GET_MODE (target) != tmode1
34386 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34387 target = gen_reg_rtx (tmode1);
34389 scratch0 = gen_reg_rtx (tmode0);
34391 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34393 else
34395 gcc_assert (d->flag);
34397 scratch0 = gen_reg_rtx (tmode0);
34398 scratch1 = gen_reg_rtx (tmode1);
34400 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34403 if (! pat)
34404 return 0;
34406 emit_insn (pat);
34408 if (d->flag)
34410 target = gen_reg_rtx (SImode);
34411 emit_move_insn (target, const0_rtx);
34412 target = gen_rtx_SUBREG (QImode, target, 0);
34414 emit_insn
34415 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34416 gen_rtx_fmt_ee (EQ, QImode,
34417 gen_rtx_REG ((machine_mode) d->flag,
34418 FLAGS_REG),
34419 const0_rtx)));
34420 return SUBREG_REG (target);
34422 else
34423 return target;
34427 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34429 static rtx
34430 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34431 tree exp, rtx target)
34433 rtx pat;
34434 tree arg0 = CALL_EXPR_ARG (exp, 0);
34435 tree arg1 = CALL_EXPR_ARG (exp, 1);
34436 tree arg2 = CALL_EXPR_ARG (exp, 2);
34437 rtx scratch0, scratch1;
34438 rtx op0 = expand_normal (arg0);
34439 rtx op1 = expand_normal (arg1);
34440 rtx op2 = expand_normal (arg2);
34441 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34443 tmode0 = insn_data[d->icode].operand[0].mode;
34444 tmode1 = insn_data[d->icode].operand[1].mode;
34445 modev2 = insn_data[d->icode].operand[2].mode;
34446 modev3 = insn_data[d->icode].operand[3].mode;
34447 modeimm = insn_data[d->icode].operand[4].mode;
34449 if (VECTOR_MODE_P (modev2))
34450 op0 = safe_vector_operand (op0, modev2);
34451 if (VECTOR_MODE_P (modev3))
34452 op1 = safe_vector_operand (op1, modev3);
34454 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34455 op0 = copy_to_mode_reg (modev2, op0);
34456 if ((optimize && !register_operand (op1, modev3))
34457 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34458 op1 = copy_to_mode_reg (modev3, op1);
34460 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34462 error ("the third argument must be an 8-bit immediate");
34463 return const0_rtx;
34466 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34468 if (optimize || !target
34469 || GET_MODE (target) != tmode0
34470 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34471 target = gen_reg_rtx (tmode0);
34473 scratch1 = gen_reg_rtx (tmode1);
34475 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34477 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34479 if (optimize || !target
34480 || GET_MODE (target) != tmode1
34481 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34482 target = gen_reg_rtx (tmode1);
34484 scratch0 = gen_reg_rtx (tmode0);
34486 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34488 else
34490 gcc_assert (d->flag);
34492 scratch0 = gen_reg_rtx (tmode0);
34493 scratch1 = gen_reg_rtx (tmode1);
34495 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34498 if (! pat)
34499 return 0;
34501 emit_insn (pat);
34503 if (d->flag)
34505 target = gen_reg_rtx (SImode);
34506 emit_move_insn (target, const0_rtx);
34507 target = gen_rtx_SUBREG (QImode, target, 0);
34509 emit_insn
34510 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34511 gen_rtx_fmt_ee (EQ, QImode,
34512 gen_rtx_REG ((machine_mode) d->flag,
34513 FLAGS_REG),
34514 const0_rtx)));
34515 return SUBREG_REG (target);
34517 else
34518 return target;
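/* Illustrative: a pcmpistr builtin as reached through the SSE4.2
   intrinsics (the function name is hypothetical):

     #include <nmmintrin.h>
     int first_equal_byte (__m128i needle, __m128i hay)
     {
       return _mm_cmpistri (needle, hay,
                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
     }

   The control operand must fold to an 8-bit immediate at compile time;
   otherwise the expander reports the error above.  */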
34521 /* Subroutine of ix86_expand_builtin to take care of insns with
34522 variable number of operands. */
34524 static rtx
34525 ix86_expand_args_builtin (const struct builtin_description *d,
34526 tree exp, rtx target)
34528 rtx pat, real_target;
34529 unsigned int i, nargs;
34530 unsigned int nargs_constant = 0;
34531 unsigned int mask_pos = 0;
34532 int num_memory = 0;
34533 struct
34535 rtx op;
34536 machine_mode mode;
34537 } args[6];
34538 bool last_arg_count = false;
34539 enum insn_code icode = d->icode;
34540 const struct insn_data_d *insn_p = &insn_data[icode];
34541 machine_mode tmode = insn_p->operand[0].mode;
34542 machine_mode rmode = VOIDmode;
34543 bool swap = false;
34544 enum rtx_code comparison = d->comparison;
34546 switch ((enum ix86_builtin_func_type) d->flag)
34548 case V2DF_FTYPE_V2DF_ROUND:
34549 case V4DF_FTYPE_V4DF_ROUND:
34550 case V8DF_FTYPE_V8DF_ROUND:
34551 case V4SF_FTYPE_V4SF_ROUND:
34552 case V8SF_FTYPE_V8SF_ROUND:
34553 case V16SF_FTYPE_V16SF_ROUND:
34554 case V4SI_FTYPE_V4SF_ROUND:
34555 case V8SI_FTYPE_V8SF_ROUND:
34556 case V16SI_FTYPE_V16SF_ROUND:
34557 return ix86_expand_sse_round (d, exp, target);
34558 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34559 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34560 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34561 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34562 case INT_FTYPE_V8SF_V8SF_PTEST:
34563 case INT_FTYPE_V4DI_V4DI_PTEST:
34564 case INT_FTYPE_V4DF_V4DF_PTEST:
34565 case INT_FTYPE_V4SF_V4SF_PTEST:
34566 case INT_FTYPE_V2DI_V2DI_PTEST:
34567 case INT_FTYPE_V2DF_V2DF_PTEST:
34568 return ix86_expand_sse_ptest (d, exp, target);
34569 case FLOAT128_FTYPE_FLOAT128:
34570 case FLOAT_FTYPE_FLOAT:
34571 case INT_FTYPE_INT:
34572 case UINT_FTYPE_UINT:
34573 case UINT16_FTYPE_UINT16:
34574 case UINT64_FTYPE_INT:
34575 case UINT64_FTYPE_UINT64:
34576 case INT64_FTYPE_INT64:
34577 case INT64_FTYPE_V4SF:
34578 case INT64_FTYPE_V2DF:
34579 case INT_FTYPE_V16QI:
34580 case INT_FTYPE_V8QI:
34581 case INT_FTYPE_V8SF:
34582 case INT_FTYPE_V4DF:
34583 case INT_FTYPE_V4SF:
34584 case INT_FTYPE_V2DF:
34585 case INT_FTYPE_V32QI:
34586 case V16QI_FTYPE_V16QI:
34587 case V8SI_FTYPE_V8SF:
34588 case V8SI_FTYPE_V4SI:
34589 case V8HI_FTYPE_V8HI:
34590 case V8HI_FTYPE_V16QI:
34591 case V8QI_FTYPE_V8QI:
34592 case V8SF_FTYPE_V8SF:
34593 case V8SF_FTYPE_V8SI:
34594 case V8SF_FTYPE_V4SF:
34595 case V8SF_FTYPE_V8HI:
34596 case V4SI_FTYPE_V4SI:
34597 case V4SI_FTYPE_V16QI:
34598 case V4SI_FTYPE_V4SF:
34599 case V4SI_FTYPE_V8SI:
34600 case V4SI_FTYPE_V8HI:
34601 case V4SI_FTYPE_V4DF:
34602 case V4SI_FTYPE_V2DF:
34603 case V4HI_FTYPE_V4HI:
34604 case V4DF_FTYPE_V4DF:
34605 case V4DF_FTYPE_V4SI:
34606 case V4DF_FTYPE_V4SF:
34607 case V4DF_FTYPE_V2DF:
34608 case V4SF_FTYPE_V4SF:
34609 case V4SF_FTYPE_V4SI:
34610 case V4SF_FTYPE_V8SF:
34611 case V4SF_FTYPE_V4DF:
34612 case V4SF_FTYPE_V8HI:
34613 case V4SF_FTYPE_V2DF:
34614 case V2DI_FTYPE_V2DI:
34615 case V2DI_FTYPE_V16QI:
34616 case V2DI_FTYPE_V8HI:
34617 case V2DI_FTYPE_V4SI:
34618 case V2DF_FTYPE_V2DF:
34619 case V2DF_FTYPE_V4SI:
34620 case V2DF_FTYPE_V4DF:
34621 case V2DF_FTYPE_V4SF:
34622 case V2DF_FTYPE_V2SI:
34623 case V2SI_FTYPE_V2SI:
34624 case V2SI_FTYPE_V4SF:
34625 case V2SI_FTYPE_V2SF:
34626 case V2SI_FTYPE_V2DF:
34627 case V2SF_FTYPE_V2SF:
34628 case V2SF_FTYPE_V2SI:
34629 case V32QI_FTYPE_V32QI:
34630 case V32QI_FTYPE_V16QI:
34631 case V16HI_FTYPE_V16HI:
34632 case V16HI_FTYPE_V8HI:
34633 case V8SI_FTYPE_V8SI:
34634 case V16HI_FTYPE_V16QI:
34635 case V8SI_FTYPE_V16QI:
34636 case V4DI_FTYPE_V16QI:
34637 case V8SI_FTYPE_V8HI:
34638 case V4DI_FTYPE_V8HI:
34639 case V4DI_FTYPE_V4SI:
34640 case V4DI_FTYPE_V2DI:
34641 case UHI_FTYPE_UHI:
34642 case UHI_FTYPE_V16QI:
34643 case USI_FTYPE_V32QI:
34644 case UDI_FTYPE_V64QI:
34645 case V16QI_FTYPE_UHI:
34646 case V32QI_FTYPE_USI:
34647 case V64QI_FTYPE_UDI:
34648 case V8HI_FTYPE_UQI:
34649 case V16HI_FTYPE_UHI:
34650 case V32HI_FTYPE_USI:
34651 case V4SI_FTYPE_UQI:
34652 case V8SI_FTYPE_UQI:
34653 case V4SI_FTYPE_UHI:
34654 case V8SI_FTYPE_UHI:
34655 case UQI_FTYPE_V8HI:
34656 case UHI_FTYPE_V16HI:
34657 case USI_FTYPE_V32HI:
34658 case UQI_FTYPE_V4SI:
34659 case UQI_FTYPE_V8SI:
34660 case UHI_FTYPE_V16SI:
34661 case UQI_FTYPE_V2DI:
34662 case UQI_FTYPE_V4DI:
34663 case UQI_FTYPE_V8DI:
34664 case V16SI_FTYPE_UHI:
34665 case V2DI_FTYPE_UQI:
34666 case V4DI_FTYPE_UQI:
34667 case V16SI_FTYPE_INT:
34668 case V16SF_FTYPE_V8SF:
34669 case V16SI_FTYPE_V8SI:
34670 case V16SF_FTYPE_V4SF:
34671 case V16SI_FTYPE_V4SI:
34672 case V16SI_FTYPE_V16SF:
34673 case V16SF_FTYPE_V16SF:
34674 case V8DI_FTYPE_UQI:
34675 case V8DF_FTYPE_V4DF:
34676 case V8DF_FTYPE_V2DF:
34677 case V8DF_FTYPE_V8DF:
34678 nargs = 1;
34679 break;
34680 case V4SF_FTYPE_V4SF_VEC_MERGE:
34681 case V2DF_FTYPE_V2DF_VEC_MERGE:
34682 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34683 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34684 case V16QI_FTYPE_V16QI_V16QI:
34685 case V16QI_FTYPE_V8HI_V8HI:
34686 case V16SF_FTYPE_V16SF_V16SF:
34687 case V8QI_FTYPE_V8QI_V8QI:
34688 case V8QI_FTYPE_V4HI_V4HI:
34689 case V8HI_FTYPE_V8HI_V8HI:
34690 case V8HI_FTYPE_V16QI_V16QI:
34691 case V8HI_FTYPE_V4SI_V4SI:
34692 case V8SF_FTYPE_V8SF_V8SF:
34693 case V8SF_FTYPE_V8SF_V8SI:
34694 case V8DF_FTYPE_V8DF_V8DF:
34695 case V4SI_FTYPE_V4SI_V4SI:
34696 case V4SI_FTYPE_V8HI_V8HI:
34697 case V4SI_FTYPE_V2DF_V2DF:
34698 case V4HI_FTYPE_V4HI_V4HI:
34699 case V4HI_FTYPE_V8QI_V8QI:
34700 case V4HI_FTYPE_V2SI_V2SI:
34701 case V4DF_FTYPE_V4DF_V4DF:
34702 case V4DF_FTYPE_V4DF_V4DI:
34703 case V4SF_FTYPE_V4SF_V4SF:
34704 case V4SF_FTYPE_V4SF_V4SI:
34705 case V4SF_FTYPE_V4SF_V2SI:
34706 case V4SF_FTYPE_V4SF_V2DF:
34707 case V4SF_FTYPE_V4SF_UINT:
34708 case V4SF_FTYPE_V4SF_DI:
34709 case V4SF_FTYPE_V4SF_SI:
34710 case V2DI_FTYPE_V2DI_V2DI:
34711 case V2DI_FTYPE_V16QI_V16QI:
34712 case V2DI_FTYPE_V4SI_V4SI:
34713 case V2DI_FTYPE_V2DI_V16QI:
34714 case V2SI_FTYPE_V2SI_V2SI:
34715 case V2SI_FTYPE_V4HI_V4HI:
34716 case V2SI_FTYPE_V2SF_V2SF:
34717 case V2DF_FTYPE_V2DF_V2DF:
34718 case V2DF_FTYPE_V2DF_V4SF:
34719 case V2DF_FTYPE_V2DF_V2DI:
34720 case V2DF_FTYPE_V2DF_DI:
34721 case V2DF_FTYPE_V2DF_SI:
34722 case V2DF_FTYPE_V2DF_UINT:
34723 case V2SF_FTYPE_V2SF_V2SF:
34724 case V1DI_FTYPE_V1DI_V1DI:
34725 case V1DI_FTYPE_V8QI_V8QI:
34726 case V1DI_FTYPE_V2SI_V2SI:
34727 case V32QI_FTYPE_V16HI_V16HI:
34728 case V16HI_FTYPE_V8SI_V8SI:
34729 case V32QI_FTYPE_V32QI_V32QI:
34730 case V16HI_FTYPE_V32QI_V32QI:
34731 case V16HI_FTYPE_V16HI_V16HI:
34732 case V8SI_FTYPE_V4DF_V4DF:
34733 case V8SI_FTYPE_V8SI_V8SI:
34734 case V8SI_FTYPE_V16HI_V16HI:
34735 case V4DI_FTYPE_V4DI_V4DI:
34736 case V4DI_FTYPE_V8SI_V8SI:
34737 case V8DI_FTYPE_V64QI_V64QI:
34738 if (comparison == UNKNOWN)
34739 return ix86_expand_binop_builtin (icode, exp, target);
34740 nargs = 2;
34741 break;
34742 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34743 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34744 gcc_assert (comparison != UNKNOWN);
34745 nargs = 2;
34746 swap = true;
34747 break;
34748 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34749 case V16HI_FTYPE_V16HI_SI_COUNT:
34750 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34751 case V8SI_FTYPE_V8SI_SI_COUNT:
34752 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34753 case V4DI_FTYPE_V4DI_INT_COUNT:
34754 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34755 case V8HI_FTYPE_V8HI_SI_COUNT:
34756 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34757 case V4SI_FTYPE_V4SI_SI_COUNT:
34758 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34759 case V4HI_FTYPE_V4HI_SI_COUNT:
34760 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34761 case V2DI_FTYPE_V2DI_SI_COUNT:
34762 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34763 case V2SI_FTYPE_V2SI_SI_COUNT:
34764 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34765 case V1DI_FTYPE_V1DI_SI_COUNT:
34766 nargs = 2;
34767 last_arg_count = true;
34768 break;
34769 case UINT64_FTYPE_UINT64_UINT64:
34770 case UINT_FTYPE_UINT_UINT:
34771 case UINT_FTYPE_UINT_USHORT:
34772 case UINT_FTYPE_UINT_UCHAR:
34773 case UINT16_FTYPE_UINT16_INT:
34774 case UINT8_FTYPE_UINT8_INT:
34775 case UHI_FTYPE_UHI_UHI:
34776 case USI_FTYPE_USI_USI:
34777 case UDI_FTYPE_UDI_UDI:
34778 case V16SI_FTYPE_V8DF_V8DF:
34779 nargs = 2;
34780 break;
34781 case V2DI_FTYPE_V2DI_INT_CONVERT:
34782 nargs = 2;
34783 rmode = V1TImode;
34784 nargs_constant = 1;
34785 break;
34786 case V4DI_FTYPE_V4DI_INT_CONVERT:
34787 nargs = 2;
34788 rmode = V2TImode;
34789 nargs_constant = 1;
34790 break;
34791 case V8DI_FTYPE_V8DI_INT_CONVERT:
34792 nargs = 2;
34793 rmode = V4TImode;
34794 nargs_constant = 1;
34795 break;
34796 case V8HI_FTYPE_V8HI_INT:
34797 case V8HI_FTYPE_V8SF_INT:
34798 case V16HI_FTYPE_V16SF_INT:
34799 case V8HI_FTYPE_V4SF_INT:
34800 case V8SF_FTYPE_V8SF_INT:
34801 case V4SF_FTYPE_V16SF_INT:
34802 case V16SF_FTYPE_V16SF_INT:
34803 case V4SI_FTYPE_V4SI_INT:
34804 case V4SI_FTYPE_V8SI_INT:
34805 case V4HI_FTYPE_V4HI_INT:
34806 case V4DF_FTYPE_V4DF_INT:
34807 case V4DF_FTYPE_V8DF_INT:
34808 case V4SF_FTYPE_V4SF_INT:
34809 case V4SF_FTYPE_V8SF_INT:
34810 case V2DI_FTYPE_V2DI_INT:
34811 case V2DF_FTYPE_V2DF_INT:
34812 case V2DF_FTYPE_V4DF_INT:
34813 case V16HI_FTYPE_V16HI_INT:
34814 case V8SI_FTYPE_V8SI_INT:
34815 case V16SI_FTYPE_V16SI_INT:
34816 case V4SI_FTYPE_V16SI_INT:
34817 case V4DI_FTYPE_V4DI_INT:
34818 case V2DI_FTYPE_V4DI_INT:
34819 case V4DI_FTYPE_V8DI_INT:
34820 case QI_FTYPE_V4SF_INT:
34821 case QI_FTYPE_V2DF_INT:
34822 nargs = 2;
34823 nargs_constant = 1;
34824 break;
34825 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34826 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34827 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34828 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34829 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34830 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34831 case UHI_FTYPE_V16SI_V16SI_UHI:
34832 case UQI_FTYPE_V8DI_V8DI_UQI:
34833 case V16HI_FTYPE_V16SI_V16HI_UHI:
34834 case V16QI_FTYPE_V16SI_V16QI_UHI:
34835 case V16QI_FTYPE_V8DI_V16QI_UQI:
34836 case V16SF_FTYPE_V16SF_V16SF_UHI:
34837 case V16SF_FTYPE_V4SF_V16SF_UHI:
34838 case V16SI_FTYPE_SI_V16SI_UHI:
34839 case V16SI_FTYPE_V16HI_V16SI_UHI:
34840 case V16SI_FTYPE_V16QI_V16SI_UHI:
34841 case V8SF_FTYPE_V4SF_V8SF_UQI:
34842 case V4DF_FTYPE_V2DF_V4DF_UQI:
34843 case V8SI_FTYPE_V4SI_V8SI_UQI:
34844 case V8SI_FTYPE_SI_V8SI_UQI:
34845 case V4SI_FTYPE_V4SI_V4SI_UQI:
34846 case V4SI_FTYPE_SI_V4SI_UQI:
34847 case V4DI_FTYPE_V2DI_V4DI_UQI:
34848 case V4DI_FTYPE_DI_V4DI_UQI:
34849 case V2DI_FTYPE_V2DI_V2DI_UQI:
34850 case V2DI_FTYPE_DI_V2DI_UQI:
34851 case V64QI_FTYPE_V64QI_V64QI_UDI:
34852 case V64QI_FTYPE_V16QI_V64QI_UDI:
34853 case V64QI_FTYPE_QI_V64QI_UDI:
34854 case V32QI_FTYPE_V32QI_V32QI_USI:
34855 case V32QI_FTYPE_V16QI_V32QI_USI:
34856 case V32QI_FTYPE_QI_V32QI_USI:
34857 case V16QI_FTYPE_V16QI_V16QI_UHI:
34858 case V16QI_FTYPE_QI_V16QI_UHI:
34859 case V32HI_FTYPE_V8HI_V32HI_USI:
34860 case V32HI_FTYPE_HI_V32HI_USI:
34861 case V16HI_FTYPE_V8HI_V16HI_UHI:
34862 case V16HI_FTYPE_HI_V16HI_UHI:
34863 case V8HI_FTYPE_V8HI_V8HI_UQI:
34864 case V8HI_FTYPE_HI_V8HI_UQI:
34865 case V8SF_FTYPE_V8HI_V8SF_UQI:
34866 case V4SF_FTYPE_V8HI_V4SF_UQI:
34867 case V8SI_FTYPE_V8SF_V8SI_UQI:
34868 case V4SI_FTYPE_V4SF_V4SI_UQI:
34869 case V4DI_FTYPE_V4SF_V4DI_UQI:
34870 case V2DI_FTYPE_V4SF_V2DI_UQI:
34871 case V4SF_FTYPE_V4DI_V4SF_UQI:
34872 case V4SF_FTYPE_V2DI_V4SF_UQI:
34873 case V4DF_FTYPE_V4DI_V4DF_UQI:
34874 case V2DF_FTYPE_V2DI_V2DF_UQI:
34875 case V16QI_FTYPE_V8HI_V16QI_UQI:
34876 case V16QI_FTYPE_V16HI_V16QI_UHI:
34877 case V16QI_FTYPE_V4SI_V16QI_UQI:
34878 case V16QI_FTYPE_V8SI_V16QI_UQI:
34879 case V8HI_FTYPE_V4SI_V8HI_UQI:
34880 case V8HI_FTYPE_V8SI_V8HI_UQI:
34881 case V16QI_FTYPE_V2DI_V16QI_UQI:
34882 case V16QI_FTYPE_V4DI_V16QI_UQI:
34883 case V8HI_FTYPE_V2DI_V8HI_UQI:
34884 case V8HI_FTYPE_V4DI_V8HI_UQI:
34885 case V4SI_FTYPE_V2DI_V4SI_UQI:
34886 case V4SI_FTYPE_V4DI_V4SI_UQI:
34887 case V32QI_FTYPE_V32HI_V32QI_USI:
34888 case UHI_FTYPE_V16QI_V16QI_UHI:
34889 case USI_FTYPE_V32QI_V32QI_USI:
34890 case UDI_FTYPE_V64QI_V64QI_UDI:
34891 case UQI_FTYPE_V8HI_V8HI_UQI:
34892 case UHI_FTYPE_V16HI_V16HI_UHI:
34893 case USI_FTYPE_V32HI_V32HI_USI:
34894 case UQI_FTYPE_V4SI_V4SI_UQI:
34895 case UQI_FTYPE_V8SI_V8SI_UQI:
34896 case UQI_FTYPE_V2DI_V2DI_UQI:
34897 case UQI_FTYPE_V4DI_V4DI_UQI:
34898 case V4SF_FTYPE_V2DF_V4SF_UQI:
34899 case V4SF_FTYPE_V4DF_V4SF_UQI:
34900 case V16SI_FTYPE_V16SI_V16SI_UHI:
34901 case V16SI_FTYPE_V4SI_V16SI_UHI:
34902 case V2DI_FTYPE_V4SI_V2DI_UQI:
34903 case V2DI_FTYPE_V8HI_V2DI_UQI:
34904 case V2DI_FTYPE_V16QI_V2DI_UQI:
34905 case V4DI_FTYPE_V4DI_V4DI_UQI:
34906 case V4DI_FTYPE_V4SI_V4DI_UQI:
34907 case V4DI_FTYPE_V8HI_V4DI_UQI:
34908 case V4DI_FTYPE_V16QI_V4DI_UQI:
34909 case V4DI_FTYPE_V4DF_V4DI_UQI:
34910 case V2DI_FTYPE_V2DF_V2DI_UQI:
34911 case V4SI_FTYPE_V4DF_V4SI_UQI:
34912 case V4SI_FTYPE_V2DF_V4SI_UQI:
34913 case V4SI_FTYPE_V8HI_V4SI_UQI:
34914 case V4SI_FTYPE_V16QI_V4SI_UQI:
34915 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34916 case V8DF_FTYPE_V2DF_V8DF_UQI:
34917 case V8DF_FTYPE_V4DF_V8DF_UQI:
34918 case V8DF_FTYPE_V8DF_V8DF_UQI:
34919 case V8SF_FTYPE_V8SF_V8SF_UQI:
34920 case V8SF_FTYPE_V8SI_V8SF_UQI:
34921 case V4DF_FTYPE_V4DF_V4DF_UQI:
34922 case V4SF_FTYPE_V4SF_V4SF_UQI:
34923 case V2DF_FTYPE_V2DF_V2DF_UQI:
34924 case V2DF_FTYPE_V4SF_V2DF_UQI:
34925 case V2DF_FTYPE_V4SI_V2DF_UQI:
34926 case V4SF_FTYPE_V4SI_V4SF_UQI:
34927 case V4DF_FTYPE_V4SF_V4DF_UQI:
34928 case V4DF_FTYPE_V4SI_V4DF_UQI:
34929 case V8SI_FTYPE_V8SI_V8SI_UQI:
34930 case V8SI_FTYPE_V8HI_V8SI_UQI:
34931 case V8SI_FTYPE_V16QI_V8SI_UQI:
34932 case V8DF_FTYPE_V8SI_V8DF_UQI:
34933 case V8DI_FTYPE_DI_V8DI_UQI:
34934 case V16SF_FTYPE_V8SF_V16SF_UHI:
34935 case V16SI_FTYPE_V8SI_V16SI_UHI:
34936 case V16HI_FTYPE_V16HI_V16HI_UHI:
34937 case V8HI_FTYPE_V16QI_V8HI_UQI:
34938 case V16HI_FTYPE_V16QI_V16HI_UHI:
34939 case V32HI_FTYPE_V32HI_V32HI_USI:
34940 case V32HI_FTYPE_V32QI_V32HI_USI:
34941 case V8DI_FTYPE_V16QI_V8DI_UQI:
34942 case V8DI_FTYPE_V2DI_V8DI_UQI:
34943 case V8DI_FTYPE_V4DI_V8DI_UQI:
34944 case V8DI_FTYPE_V8DI_V8DI_UQI:
34945 case V8DI_FTYPE_V8HI_V8DI_UQI:
34946 case V8DI_FTYPE_V8SI_V8DI_UQI:
34947 case V8HI_FTYPE_V8DI_V8HI_UQI:
34948 case V8SI_FTYPE_V8DI_V8SI_UQI:
34949 case V4SI_FTYPE_V4SI_V4SI_V4SI:
34950 nargs = 3;
34951 break;
34952 case V32QI_FTYPE_V32QI_V32QI_INT:
34953 case V16HI_FTYPE_V16HI_V16HI_INT:
34954 case V16QI_FTYPE_V16QI_V16QI_INT:
34955 case V4DI_FTYPE_V4DI_V4DI_INT:
34956 case V8HI_FTYPE_V8HI_V8HI_INT:
34957 case V8SI_FTYPE_V8SI_V8SI_INT:
34958 case V8SI_FTYPE_V8SI_V4SI_INT:
34959 case V8SF_FTYPE_V8SF_V8SF_INT:
34960 case V8SF_FTYPE_V8SF_V4SF_INT:
34961 case V4SI_FTYPE_V4SI_V4SI_INT:
34962 case V4DF_FTYPE_V4DF_V4DF_INT:
34963 case V16SF_FTYPE_V16SF_V16SF_INT:
34964 case V16SF_FTYPE_V16SF_V4SF_INT:
34965 case V16SI_FTYPE_V16SI_V4SI_INT:
34966 case V4DF_FTYPE_V4DF_V2DF_INT:
34967 case V4SF_FTYPE_V4SF_V4SF_INT:
34968 case V2DI_FTYPE_V2DI_V2DI_INT:
34969 case V4DI_FTYPE_V4DI_V2DI_INT:
34970 case V2DF_FTYPE_V2DF_V2DF_INT:
34971 case UQI_FTYPE_V8DI_V8UDI_INT:
34972 case UQI_FTYPE_V8DF_V8DF_INT:
34973 case UQI_FTYPE_V2DF_V2DF_INT:
34974 case UQI_FTYPE_V4SF_V4SF_INT:
34975 case UHI_FTYPE_V16SI_V16SI_INT:
34976 case UHI_FTYPE_V16SF_V16SF_INT:
34977 nargs = 3;
34978 nargs_constant = 1;
34979 break;
34980 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
34981 nargs = 3;
34982 rmode = V4DImode;
34983 nargs_constant = 1;
34984 break;
34985 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
34986 nargs = 3;
34987 rmode = V2DImode;
34988 nargs_constant = 1;
34989 break;
34990 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
34991 nargs = 3;
34992 rmode = DImode;
34993 nargs_constant = 1;
34994 break;
34995 case V2DI_FTYPE_V2DI_UINT_UINT:
34996 nargs = 3;
34997 nargs_constant = 2;
34998 break;
34999 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35000 nargs = 3;
35001 rmode = V8DImode;
35002 nargs_constant = 1;
35003 break;
35004 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35005 nargs = 5;
35006 rmode = V8DImode;
35007 mask_pos = 2;
35008 nargs_constant = 1;
35009 break;
35010 case QI_FTYPE_V8DF_INT_UQI:
35011 case QI_FTYPE_V4DF_INT_UQI:
35012 case QI_FTYPE_V2DF_INT_UQI:
35013 case HI_FTYPE_V16SF_INT_UHI:
35014 case QI_FTYPE_V8SF_INT_UQI:
35015 case QI_FTYPE_V4SF_INT_UQI:
35016 nargs = 3;
35017 mask_pos = 1;
35018 nargs_constant = 1;
35019 break;
35020 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35021 nargs = 5;
35022 rmode = V4DImode;
35023 mask_pos = 2;
35024 nargs_constant = 1;
35025 break;
35026 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35027 nargs = 5;
35028 rmode = V2DImode;
35029 mask_pos = 2;
35030 nargs_constant = 1;
35031 break;
35032 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35033 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35034 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35035 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35036 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35037 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35038 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35039 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35040 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35041 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35042 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35043 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35044 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35045 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35046 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35047 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35048 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35049 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35050 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35051 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35052 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35053 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35054 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35055 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35056 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35057 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35058 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35059 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35060 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35061 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35062 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35063 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35064 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35065 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35066 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35067 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35068 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35069 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35070 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35071 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35072 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35073 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35074 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35075 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35076 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35077 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35078 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35079 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35080 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35081 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35082 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35083 nargs = 4;
35084 break;
35085 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35086 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35087 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35088 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35089 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35090 nargs = 4;
35091 nargs_constant = 1;
35092 break;
35093 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35094 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35095 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35096 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35097 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35098 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35099 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35100 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35101 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35102 case USI_FTYPE_V32QI_V32QI_INT_USI:
35103 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35104 case USI_FTYPE_V32HI_V32HI_INT_USI:
35105 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35106 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35107 nargs = 4;
35108 mask_pos = 1;
35109 nargs_constant = 1;
35110 break;
35111 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35112 nargs = 4;
35113 nargs_constant = 2;
35114 break;
35115 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35116 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35117 nargs = 4;
35118 break;
35119 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35120 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35121 mask_pos = 1;
35122 nargs = 4;
35123 nargs_constant = 1;
35124 break;
35125 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35126 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35127 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35128 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35129 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35130 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35131 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35132 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35133 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35134 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35135 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35136 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35137 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35138 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35139 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35140 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35141 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35142 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35143 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35144 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35145 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35146 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35147 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35148 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35149 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35150 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35151 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35152 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35153 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35154 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35155 nargs = 4;
35156 mask_pos = 2;
35157 nargs_constant = 1;
35158 break;
35159 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35160 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35161 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35162 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35163 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35164 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35165 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35166 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35167 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35168 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35169 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35170 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35171 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35172 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35173 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35174 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35175 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35176 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35177 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35178 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35179 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35180 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35181 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35182 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35183 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35184 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35185 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35186 nargs = 5;
35187 mask_pos = 2;
35188 nargs_constant = 1;
35189 break;
35190 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35191 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35192 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35193 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35194 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35195 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35196 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35197 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35198 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35199 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35200 nargs = 5;
35201 mask_pos = 1;
35202 nargs_constant = 1;
35203 break;
35205 default:
35206 gcc_unreachable ();
35209 gcc_assert (nargs <= ARRAY_SIZE (args));
35211 if (comparison != UNKNOWN)
35213 gcc_assert (nargs == 2);
35214 return ix86_expand_sse_compare (d, exp, target, swap);
35217 if (rmode == VOIDmode || rmode == tmode)
35219 if (optimize
35220 || target == 0
35221 || GET_MODE (target) != tmode
35222 || !insn_p->operand[0].predicate (target, tmode))
35223 target = gen_reg_rtx (tmode);
35224 real_target = target;
35226 else
35228 real_target = gen_reg_rtx (tmode);
35229 target = lowpart_subreg (rmode, real_target, tmode);
35232 for (i = 0; i < nargs; i++)
35234 tree arg = CALL_EXPR_ARG (exp, i);
35235 rtx op = expand_normal (arg);
35236 machine_mode mode = insn_p->operand[i + 1].mode;
35237 bool match = insn_p->operand[i + 1].predicate (op, mode);
35239 if (last_arg_count && (i + 1) == nargs)
35241 /* SIMD shift insns take either an 8-bit immediate or a
35242 register as the shift count, but the builtin functions take an int
35243 as the count. If the count doesn't match, put it in a register. */
35244 if (!match)
35246 op = lowpart_subreg (SImode, op, GET_MODE (op));
35247 if (!insn_p->operand[i + 1].predicate (op, mode))
35248 op = copy_to_reg (op);
35251 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35252 (!mask_pos && (nargs - i) <= nargs_constant))
35254 if (!match)
35255 switch (icode)
35257 case CODE_FOR_avx_vinsertf128v4di:
35258 case CODE_FOR_avx_vextractf128v4di:
35259 error ("the last argument must be a 1-bit immediate");
35260 return const0_rtx;
35262 case CODE_FOR_avx512f_cmpv8di3_mask:
35263 case CODE_FOR_avx512f_cmpv16si3_mask:
35264 case CODE_FOR_avx512f_ucmpv8di3_mask:
35265 case CODE_FOR_avx512f_ucmpv16si3_mask:
35266 case CODE_FOR_avx512vl_cmpv4di3_mask:
35267 case CODE_FOR_avx512vl_cmpv8si3_mask:
35268 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35269 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35270 case CODE_FOR_avx512vl_cmpv2di3_mask:
35271 case CODE_FOR_avx512vl_cmpv4si3_mask:
35272 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35273 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35274 error ("the last argument must be a 3-bit immediate");
35275 return const0_rtx;
35277 case CODE_FOR_sse4_1_roundsd:
35278 case CODE_FOR_sse4_1_roundss:
35280 case CODE_FOR_sse4_1_roundpd:
35281 case CODE_FOR_sse4_1_roundps:
35282 case CODE_FOR_avx_roundpd256:
35283 case CODE_FOR_avx_roundps256:
35285 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35286 case CODE_FOR_sse4_1_roundps_sfix:
35287 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35288 case CODE_FOR_avx_roundps_sfix256:
35290 case CODE_FOR_sse4_1_blendps:
35291 case CODE_FOR_avx_blendpd256:
35292 case CODE_FOR_avx_vpermilv4df:
35293 case CODE_FOR_avx_vpermilv4df_mask:
35294 case CODE_FOR_avx512f_getmantv8df_mask:
35295 case CODE_FOR_avx512f_getmantv16sf_mask:
35296 case CODE_FOR_avx512vl_getmantv8sf_mask:
35297 case CODE_FOR_avx512vl_getmantv4df_mask:
35298 case CODE_FOR_avx512vl_getmantv4sf_mask:
35299 case CODE_FOR_avx512vl_getmantv2df_mask:
35300 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35301 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35302 case CODE_FOR_avx512dq_rangepv4df_mask:
35303 case CODE_FOR_avx512dq_rangepv8sf_mask:
35304 case CODE_FOR_avx512dq_rangepv2df_mask:
35305 case CODE_FOR_avx512dq_rangepv4sf_mask:
35306 case CODE_FOR_avx_shufpd256_mask:
35307 error ("the last argument must be a 4-bit immediate");
35308 return const0_rtx;
35310 case CODE_FOR_sha1rnds4:
35311 case CODE_FOR_sse4_1_blendpd:
35312 case CODE_FOR_avx_vpermilv2df:
35313 case CODE_FOR_avx_vpermilv2df_mask:
35314 case CODE_FOR_xop_vpermil2v2df3:
35315 case CODE_FOR_xop_vpermil2v4sf3:
35316 case CODE_FOR_xop_vpermil2v4df3:
35317 case CODE_FOR_xop_vpermil2v8sf3:
35318 case CODE_FOR_avx512f_vinsertf32x4_mask:
35319 case CODE_FOR_avx512f_vinserti32x4_mask:
35320 case CODE_FOR_avx512f_vextractf32x4_mask:
35321 case CODE_FOR_avx512f_vextracti32x4_mask:
35322 case CODE_FOR_sse2_shufpd:
35323 case CODE_FOR_sse2_shufpd_mask:
35324 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35325 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35326 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35327 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35328 error ("the last argument must be a 2-bit immediate");
35329 return const0_rtx;
35331 case CODE_FOR_avx_vextractf128v4df:
35332 case CODE_FOR_avx_vextractf128v8sf:
35333 case CODE_FOR_avx_vextractf128v8si:
35334 case CODE_FOR_avx_vinsertf128v4df:
35335 case CODE_FOR_avx_vinsertf128v8sf:
35336 case CODE_FOR_avx_vinsertf128v8si:
35337 case CODE_FOR_avx512f_vinsertf64x4_mask:
35338 case CODE_FOR_avx512f_vinserti64x4_mask:
35339 case CODE_FOR_avx512f_vextractf64x4_mask:
35340 case CODE_FOR_avx512f_vextracti64x4_mask:
35341 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35342 case CODE_FOR_avx512dq_vinserti32x8_mask:
35343 case CODE_FOR_avx512vl_vinsertv4df:
35344 case CODE_FOR_avx512vl_vinsertv4di:
35345 case CODE_FOR_avx512vl_vinsertv8sf:
35346 case CODE_FOR_avx512vl_vinsertv8si:
35347 error ("the last argument must be a 1-bit immediate");
35348 return const0_rtx;
35350 case CODE_FOR_avx_vmcmpv2df3:
35351 case CODE_FOR_avx_vmcmpv4sf3:
35352 case CODE_FOR_avx_cmpv2df3:
35353 case CODE_FOR_avx_cmpv4sf3:
35354 case CODE_FOR_avx_cmpv4df3:
35355 case CODE_FOR_avx_cmpv8sf3:
35356 case CODE_FOR_avx512f_cmpv8df3_mask:
35357 case CODE_FOR_avx512f_cmpv16sf3_mask:
35358 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35359 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35360 error ("the last argument must be a 5-bit immediate");
35361 return const0_rtx;
35363 default:
35364 switch (nargs_constant)
35366 case 2:
35367 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35368 (!mask_pos && (nargs - i) == nargs_constant))
35370 error ("the next to last argument must be an 8-bit immediate");
35371 break;
35373 /* FALLTHRU */
35374 case 1:
35375 error ("the last argument must be an 8-bit immediate");
35376 break;
35377 default:
35378 gcc_unreachable ();
35380 return const0_rtx;
35383 else
35385 if (VECTOR_MODE_P (mode))
35386 op = safe_vector_operand (op, mode);
35388 /* If we aren't optimizing, only allow one memory operand to
35389 be generated. */
35390 if (memory_operand (op, mode))
35391 num_memory++;
35393 op = fixup_modeless_constant (op, mode);
35395 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35397 if (optimize || !match || num_memory > 1)
35398 op = copy_to_mode_reg (mode, op);
35400 else
35402 op = copy_to_reg (op);
35403 op = lowpart_subreg (mode, op, GET_MODE (op));
35407 args[i].op = op;
35408 args[i].mode = mode;
35411 switch (nargs)
35413 case 1:
35414 pat = GEN_FCN (icode) (real_target, args[0].op);
35415 break;
35416 case 2:
35417 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35418 break;
35419 case 3:
35420 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35421 args[2].op);
35422 break;
35423 case 4:
35424 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35425 args[2].op, args[3].op);
35426 break;
35427 case 5:
35428 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35429 args[2].op, args[3].op, args[4].op);
35430 break;
35431 case 6:
35432 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35433 args[2].op, args[3].op, args[4].op,
35434 args[5].op);
35435 break;
35436 default:
35437 gcc_unreachable ();
35440 if (! pat)
35441 return 0;
35443 emit_insn (pat);
35444 return target;
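/* Worked example of the immediate-position test above (illustrative):
   for V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI we have nargs == 5,
   mask_pos == 2 and nargs_constant == 1, so the condition
   nargs - i - mask_pos == nargs_constant holds only for i == 2,
   i.e. exactly the INT argument is required to be an immediate; the
   trailing mask operand is handled like any other register operand.  */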
35447 /* Transform a pattern of the following layout:
35448 (parallel [
35449 set (A B)
35450 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)])
35452 into:
35453 (set (A B))
35456 or transform (parallel [ A B
35458 ... (unspec [C] UNSPEC_EMBEDDED_ROUNDING) ... ])
35461 into:
35462 (parallel [ A B ... ]) */
35464 static rtx
35465 ix86_erase_embedded_rounding (rtx pat)
35467 if (GET_CODE (pat) == INSN)
35468 pat = PATTERN (pat);
35470 gcc_assert (GET_CODE (pat) == PARALLEL);
35472 if (XVECLEN (pat, 0) == 2)
35474 rtx p0 = XVECEXP (pat, 0, 0);
35475 rtx p1 = XVECEXP (pat, 0, 1);
35477 gcc_assert (GET_CODE (p0) == SET
35478 && GET_CODE (p1) == UNSPEC
35479 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
35481 return p0;
35483 else
35485 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
35486 int i = 0;
35487 int j = 0;
35489 for (; i < XVECLEN (pat, 0); ++i)
35491 rtx elem = XVECEXP (pat, 0, i);
35492 if (GET_CODE (elem) != UNSPEC
35493 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
35494 res [j++] = elem;
35497 /* No more than 1 occurrence was removed. */
35498 gcc_assert (j >= XVECLEN (pat, 0) - 1);
35500 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
35504 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35505 with rounding. */
35506 static rtx
35507 ix86_expand_sse_comi_round (const struct builtin_description *d,
35508 tree exp, rtx target)
35510 rtx pat, set_dst;
35511 tree arg0 = CALL_EXPR_ARG (exp, 0);
35512 tree arg1 = CALL_EXPR_ARG (exp, 1);
35513 tree arg2 = CALL_EXPR_ARG (exp, 2);
35514 tree arg3 = CALL_EXPR_ARG (exp, 3);
35515 rtx op0 = expand_normal (arg0);
35516 rtx op1 = expand_normal (arg1);
35517 rtx op2 = expand_normal (arg2);
35518 rtx op3 = expand_normal (arg3);
35519 enum insn_code icode = d->icode;
35520 const struct insn_data_d *insn_p = &insn_data[icode];
35521 machine_mode mode0 = insn_p->operand[0].mode;
35522 machine_mode mode1 = insn_p->operand[1].mode;
35523 enum rtx_code comparison = UNEQ;
35524 bool need_ucomi = false;
35526 /* See avxintrin.h for values. */
35527 enum rtx_code comi_comparisons[32] =
35529 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35530 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35531 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35533 bool need_ucomi_values[32] =
35535 true, false, false, true, true, false, false, true,
35536 true, false, false, true, true, false, false, true,
35537 false, true, true, false, false, true, true, false,
35538 false, true, true, false, false, true, true, false
35541 if (!CONST_INT_P (op2))
35543 error ("the third argument must be a comparison constant");
35544 return const0_rtx;
35546 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35548 error ("incorrect comparison mode");
35549 return const0_rtx;
35552 if (!insn_p->operand[2].predicate (op3, SImode))
35554 error ("incorrect rounding operand");
35555 return const0_rtx;
35558 comparison = comi_comparisons[INTVAL (op2)];
35559 need_ucomi = need_ucomi_values[INTVAL (op2)];
35561 if (VECTOR_MODE_P (mode0))
35562 op0 = safe_vector_operand (op0, mode0);
35563 if (VECTOR_MODE_P (mode1))
35564 op1 = safe_vector_operand (op1, mode1);
35566 target = gen_reg_rtx (SImode);
35567 emit_move_insn (target, const0_rtx);
35568 target = gen_rtx_SUBREG (QImode, target, 0);
35570 if ((optimize && !register_operand (op0, mode0))
35571 || !insn_p->operand[0].predicate (op0, mode0))
35572 op0 = copy_to_mode_reg (mode0, op0);
35573 if ((optimize && !register_operand (op1, mode1))
35574 || !insn_p->operand[1].predicate (op1, mode1))
35575 op1 = copy_to_mode_reg (mode1, op1);
35577 if (need_ucomi)
35578 icode = icode == CODE_FOR_sse_comi_round
35579 ? CODE_FOR_sse_ucomi_round
35580 : CODE_FOR_sse2_ucomi_round;
35582 pat = GEN_FCN (icode) (op0, op1, op3);
35583 if (! pat)
35584 return 0;
35586 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35587 if (INTVAL (op3) == NO_ROUND)
35589 pat = ix86_erase_embedded_rounding (pat);
35590 if (! pat)
35591 return 0;
35593 set_dst = SET_DEST (pat);
35595 else
35597 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
35598 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
35601 emit_insn (pat);
35602 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35603 gen_rtx_fmt_ee (comparison, QImode,
35604 set_dst,
35605 const0_rtx)));
35607 return SUBREG_REG (target);
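/* For illustration (assuming the AVX-512F intrinsic _mm_comi_round_ss
   from avx512fintrin.h, which expands through this routine):

     int gt = _mm_comi_round_ss (a, b, _CMP_GT_OS, _MM_FROUND_NO_EXC);

   Here arg2 (_CMP_GT_OS == 14) selects GT from comi_comparisons[] and,
   per need_ucomi_values[], the signaling comi form rather than ucomi;
   arg3 supplies the SAE/rounding operand, and when it is NO_ROUND the
   embedded-rounding unspec is erased again via
   ix86_erase_embedded_rounding.  */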
35610 static rtx
35611 ix86_expand_round_builtin (const struct builtin_description *d,
35612 tree exp, rtx target)
35614 rtx pat;
35615 unsigned int i, nargs;
35616 struct
35618 rtx op;
35619 machine_mode mode;
35620 } args[6];
35621 enum insn_code icode = d->icode;
35622 const struct insn_data_d *insn_p = &insn_data[icode];
35623 machine_mode tmode = insn_p->operand[0].mode;
35624 unsigned int nargs_constant = 0;
35625 unsigned int redundant_embed_rnd = 0;
35627 switch ((enum ix86_builtin_func_type) d->flag)
35629 case UINT64_FTYPE_V2DF_INT:
35630 case UINT64_FTYPE_V4SF_INT:
35631 case UINT_FTYPE_V2DF_INT:
35632 case UINT_FTYPE_V4SF_INT:
35633 case INT64_FTYPE_V2DF_INT:
35634 case INT64_FTYPE_V4SF_INT:
35635 case INT_FTYPE_V2DF_INT:
35636 case INT_FTYPE_V4SF_INT:
35637 nargs = 2;
35638 break;
35639 case V4SF_FTYPE_V4SF_UINT_INT:
35640 case V4SF_FTYPE_V4SF_UINT64_INT:
35641 case V2DF_FTYPE_V2DF_UINT64_INT:
35642 case V4SF_FTYPE_V4SF_INT_INT:
35643 case V4SF_FTYPE_V4SF_INT64_INT:
35644 case V2DF_FTYPE_V2DF_INT64_INT:
35645 case V4SF_FTYPE_V4SF_V4SF_INT:
35646 case V2DF_FTYPE_V2DF_V2DF_INT:
35647 case V4SF_FTYPE_V4SF_V2DF_INT:
35648 case V2DF_FTYPE_V2DF_V4SF_INT:
35649 nargs = 3;
35650 break;
35651 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35652 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35653 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35654 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35655 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35656 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35657 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35658 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35659 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35660 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35661 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35662 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35663 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35664 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35665 nargs = 4;
35666 break;
35667 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35668 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35669 nargs_constant = 2;
35670 nargs = 4;
35671 break;
35672 case INT_FTYPE_V4SF_V4SF_INT_INT:
35673 case INT_FTYPE_V2DF_V2DF_INT_INT:
35674 return ix86_expand_sse_comi_round (d, exp, target);
35675 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35676 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35677 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35678 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35679 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35680 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35681 nargs = 5;
35682 break;
35683 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35684 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35685 nargs_constant = 4;
35686 nargs = 5;
35687 break;
35688 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35689 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35690 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35691 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35692 nargs_constant = 3;
35693 nargs = 5;
35694 break;
35695 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35696 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35697 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35698 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35699 nargs = 6;
35700 nargs_constant = 4;
35701 break;
35702 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35703 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35704 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35705 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35706 nargs = 6;
35707 nargs_constant = 3;
35708 break;
35709 default:
35710 gcc_unreachable ();
35712 gcc_assert (nargs <= ARRAY_SIZE (args));
35714 if (optimize
35715 || target == 0
35716 || GET_MODE (target) != tmode
35717 || !insn_p->operand[0].predicate (target, tmode))
35718 target = gen_reg_rtx (tmode);
35720 for (i = 0; i < nargs; i++)
35722 tree arg = CALL_EXPR_ARG (exp, i);
35723 rtx op = expand_normal (arg);
35724 machine_mode mode = insn_p->operand[i + 1].mode;
35725 bool match = insn_p->operand[i + 1].predicate (op, mode);
35727 if (i == nargs - nargs_constant)
35729 if (!match)
35731 switch (icode)
35733 case CODE_FOR_avx512f_getmantv8df_mask_round:
35734 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35735 case CODE_FOR_avx512f_vgetmantv2df_round:
35736 case CODE_FOR_avx512f_vgetmantv4sf_round:
35737 error ("the immediate argument must be a 4-bit immediate");
35738 return const0_rtx;
35739 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35740 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35741 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35742 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35743 error ("the immediate argument must be a 5-bit immediate");
35744 return const0_rtx;
35745 default:
35746 error ("the immediate argument must be an 8-bit immediate");
35747 return const0_rtx;
35751 else if (i == nargs-1)
35753 if (!insn_p->operand[nargs].predicate (op, SImode))
35755 error ("incorrect rounding operand");
35756 return const0_rtx;
35759 /* If there is no rounding, use the normal version of the pattern.  */
35760 if (INTVAL (op) == NO_ROUND)
35761 redundant_embed_rnd = 1;
35763 else
35765 if (VECTOR_MODE_P (mode))
35766 op = safe_vector_operand (op, mode);
35768 op = fixup_modeless_constant (op, mode);
35770 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35772 if (optimize || !match)
35773 op = copy_to_mode_reg (mode, op);
35775 else
35777 op = copy_to_reg (op);
35778 op = lowpart_subreg (mode, op, GET_MODE (op));
35782 args[i].op = op;
35783 args[i].mode = mode;
35786 switch (nargs)
35788 case 1:
35789 pat = GEN_FCN (icode) (target, args[0].op);
35790 break;
35791 case 2:
35792 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35793 break;
35794 case 3:
35795 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35796 args[2].op);
35797 break;
35798 case 4:
35799 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35800 args[2].op, args[3].op);
35801 break;
35802 case 5:
35803 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35804 args[2].op, args[3].op, args[4].op);
35805 break;
35806 case 6:
35807 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35808 args[2].op, args[3].op, args[4].op,
35809 args[5].op);
35810 break;
35811 default:
35812 gcc_unreachable ();
35815 if (!pat)
35816 return 0;
35818 if (redundant_embed_rnd)
35819 pat = ix86_erase_embedded_rounding (pat);
35821 emit_insn (pat);
35822 return target;
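/* For illustration: a signature such as V4SF_FTYPE_V4SF_V4SF_INT_INT
   gives nargs == 4 and nargs_constant == 2, so the two trailing
   arguments (the immediate and the rounding operand) are checked as
   constants; if the rounding operand is NO_ROUND, redundant_embed_rnd
   is set and the embedded-rounding unspec is stripped from the pattern
   before it is emitted.  */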
35825 /* Subroutine of ix86_expand_builtin to take care of special insns
35826 with variable number of operands. */
35828 static rtx
35829 ix86_expand_special_args_builtin (const struct builtin_description *d,
35830 tree exp, rtx target)
35832 tree arg;
35833 rtx pat, op;
35834 unsigned int i, nargs, arg_adjust, memory;
35835 bool aligned_mem = false;
35836 struct
35838 rtx op;
35839 machine_mode mode;
35840 } args[3];
35841 enum insn_code icode = d->icode;
35842 bool last_arg_constant = false;
35843 const struct insn_data_d *insn_p = &insn_data[icode];
35844 machine_mode tmode = insn_p->operand[0].mode;
35845 enum { load, store } klass;
35847 switch ((enum ix86_builtin_func_type) d->flag)
35849 case VOID_FTYPE_VOID:
35850 emit_insn (GEN_FCN (icode) (target));
35851 return 0;
35852 case VOID_FTYPE_UINT64:
35853 case VOID_FTYPE_UNSIGNED:
35854 nargs = 0;
35855 klass = store;
35856 memory = 0;
35857 break;
35859 case INT_FTYPE_VOID:
35860 case USHORT_FTYPE_VOID:
35861 case UINT64_FTYPE_VOID:
35862 case UNSIGNED_FTYPE_VOID:
35863 nargs = 0;
35864 klass = load;
35865 memory = 0;
35866 break;
35867 case UINT64_FTYPE_PUNSIGNED:
35868 case V2DI_FTYPE_PV2DI:
35869 case V4DI_FTYPE_PV4DI:
35870 case V32QI_FTYPE_PCCHAR:
35871 case V16QI_FTYPE_PCCHAR:
35872 case V8SF_FTYPE_PCV4SF:
35873 case V8SF_FTYPE_PCFLOAT:
35874 case V4SF_FTYPE_PCFLOAT:
35875 case V4DF_FTYPE_PCV2DF:
35876 case V4DF_FTYPE_PCDOUBLE:
35877 case V2DF_FTYPE_PCDOUBLE:
35878 case VOID_FTYPE_PVOID:
35879 case V8DI_FTYPE_PV8DI:
35880 nargs = 1;
35881 klass = load;
35882 memory = 0;
35883 switch (icode)
35885 case CODE_FOR_sse4_1_movntdqa:
35886 case CODE_FOR_avx2_movntdqa:
35887 case CODE_FOR_avx512f_movntdqa:
35888 aligned_mem = true;
35889 break;
35890 default:
35891 break;
35893 break;
35894 case VOID_FTYPE_PV2SF_V4SF:
35895 case VOID_FTYPE_PV8DI_V8DI:
35896 case VOID_FTYPE_PV4DI_V4DI:
35897 case VOID_FTYPE_PV2DI_V2DI:
35898 case VOID_FTYPE_PCHAR_V32QI:
35899 case VOID_FTYPE_PCHAR_V16QI:
35900 case VOID_FTYPE_PFLOAT_V16SF:
35901 case VOID_FTYPE_PFLOAT_V8SF:
35902 case VOID_FTYPE_PFLOAT_V4SF:
35903 case VOID_FTYPE_PDOUBLE_V8DF:
35904 case VOID_FTYPE_PDOUBLE_V4DF:
35905 case VOID_FTYPE_PDOUBLE_V2DF:
35906 case VOID_FTYPE_PLONGLONG_LONGLONG:
35907 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35908 case VOID_FTYPE_PINT_INT:
35909 nargs = 1;
35910 klass = store;
35911 /* Reserve memory operand for target. */
35912 memory = ARRAY_SIZE (args);
35913 switch (icode)
35915 /* These builtins and instructions require the memory
35916 to be properly aligned. */
35917 case CODE_FOR_avx_movntv4di:
35918 case CODE_FOR_sse2_movntv2di:
35919 case CODE_FOR_avx_movntv8sf:
35920 case CODE_FOR_sse_movntv4sf:
35921 case CODE_FOR_sse4a_vmmovntv4sf:
35922 case CODE_FOR_avx_movntv4df:
35923 case CODE_FOR_sse2_movntv2df:
35924 case CODE_FOR_sse4a_vmmovntv2df:
35925 case CODE_FOR_sse2_movntidi:
35926 case CODE_FOR_sse_movntq:
35927 case CODE_FOR_sse2_movntisi:
35928 case CODE_FOR_avx512f_movntv16sf:
35929 case CODE_FOR_avx512f_movntv8df:
35930 case CODE_FOR_avx512f_movntv8di:
35931 aligned_mem = true;
35932 break;
35933 default:
35934 break;
35936 break;
35937 case V4SF_FTYPE_V4SF_PCV2SF:
35938 case V2DF_FTYPE_V2DF_PCDOUBLE:
35939 nargs = 2;
35940 klass = load;
35941 memory = 1;
35942 break;
35943 case V8SF_FTYPE_PCV8SF_V8SI:
35944 case V4DF_FTYPE_PCV4DF_V4DI:
35945 case V4SF_FTYPE_PCV4SF_V4SI:
35946 case V2DF_FTYPE_PCV2DF_V2DI:
35947 case V8SI_FTYPE_PCV8SI_V8SI:
35948 case V4DI_FTYPE_PCV4DI_V4DI:
35949 case V4SI_FTYPE_PCV4SI_V4SI:
35950 case V2DI_FTYPE_PCV2DI_V2DI:
35951 nargs = 2;
35952 klass = load;
35953 memory = 0;
35954 break;
35955 case VOID_FTYPE_PV8DF_V8DF_UQI:
35956 case VOID_FTYPE_PV4DF_V4DF_UQI:
35957 case VOID_FTYPE_PV2DF_V2DF_UQI:
35958 case VOID_FTYPE_PV16SF_V16SF_UHI:
35959 case VOID_FTYPE_PV8SF_V8SF_UQI:
35960 case VOID_FTYPE_PV4SF_V4SF_UQI:
35961 case VOID_FTYPE_PV8DI_V8DI_UQI:
35962 case VOID_FTYPE_PV4DI_V4DI_UQI:
35963 case VOID_FTYPE_PV2DI_V2DI_UQI:
35964 case VOID_FTYPE_PV16SI_V16SI_UHI:
35965 case VOID_FTYPE_PV8SI_V8SI_UQI:
35966 case VOID_FTYPE_PV4SI_V4SI_UQI:
35967 switch (icode)
35969 /* These builtins and instructions require the memory
35970 to be properly aligned. */
35971 case CODE_FOR_avx512f_storev16sf_mask:
35972 case CODE_FOR_avx512f_storev16si_mask:
35973 case CODE_FOR_avx512f_storev8df_mask:
35974 case CODE_FOR_avx512f_storev8di_mask:
35975 case CODE_FOR_avx512vl_storev8sf_mask:
35976 case CODE_FOR_avx512vl_storev8si_mask:
35977 case CODE_FOR_avx512vl_storev4df_mask:
35978 case CODE_FOR_avx512vl_storev4di_mask:
35979 case CODE_FOR_avx512vl_storev4sf_mask:
35980 case CODE_FOR_avx512vl_storev4si_mask:
35981 case CODE_FOR_avx512vl_storev2df_mask:
35982 case CODE_FOR_avx512vl_storev2di_mask:
35983 aligned_mem = true;
35984 break;
35985 default:
35986 break;
35988 /* FALLTHRU */
35989 case VOID_FTYPE_PV8SF_V8SI_V8SF:
35990 case VOID_FTYPE_PV4DF_V4DI_V4DF:
35991 case VOID_FTYPE_PV4SF_V4SI_V4SF:
35992 case VOID_FTYPE_PV2DF_V2DI_V2DF:
35993 case VOID_FTYPE_PV8SI_V8SI_V8SI:
35994 case VOID_FTYPE_PV4DI_V4DI_V4DI:
35995 case VOID_FTYPE_PV4SI_V4SI_V4SI:
35996 case VOID_FTYPE_PV2DI_V2DI_V2DI:
35997 case VOID_FTYPE_PV8SI_V8DI_UQI:
35998 case VOID_FTYPE_PV8HI_V8DI_UQI:
35999 case VOID_FTYPE_PV16HI_V16SI_UHI:
36000 case VOID_FTYPE_PV16QI_V8DI_UQI:
36001 case VOID_FTYPE_PV16QI_V16SI_UHI:
36002 case VOID_FTYPE_PV4SI_V4DI_UQI:
36003 case VOID_FTYPE_PV4SI_V2DI_UQI:
36004 case VOID_FTYPE_PV8HI_V4DI_UQI:
36005 case VOID_FTYPE_PV8HI_V2DI_UQI:
36006 case VOID_FTYPE_PV8HI_V8SI_UQI:
36007 case VOID_FTYPE_PV8HI_V4SI_UQI:
36008 case VOID_FTYPE_PV16QI_V4DI_UQI:
36009 case VOID_FTYPE_PV16QI_V2DI_UQI:
36010 case VOID_FTYPE_PV16QI_V8SI_UQI:
36011 case VOID_FTYPE_PV16QI_V4SI_UQI:
36012 case VOID_FTYPE_PCHAR_V64QI_UDI:
36013 case VOID_FTYPE_PCHAR_V32QI_USI:
36014 case VOID_FTYPE_PCHAR_V16QI_UHI:
36015 case VOID_FTYPE_PSHORT_V32HI_USI:
36016 case VOID_FTYPE_PSHORT_V16HI_UHI:
36017 case VOID_FTYPE_PSHORT_V8HI_UQI:
36018 case VOID_FTYPE_PINT_V16SI_UHI:
36019 case VOID_FTYPE_PINT_V8SI_UQI:
36020 case VOID_FTYPE_PINT_V4SI_UQI:
36021 case VOID_FTYPE_PINT64_V8DI_UQI:
36022 case VOID_FTYPE_PINT64_V4DI_UQI:
36023 case VOID_FTYPE_PINT64_V2DI_UQI:
36024 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36025 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36026 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36027 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36028 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36029 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36030 nargs = 2;
36031 klass = store;
36032 /* Reserve memory operand for target. */
36033 memory = ARRAY_SIZE (args);
36034 break;
36035 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36036 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36037 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36038 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36039 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36040 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36041 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36042 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36043 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36044 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36045 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36046 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36047 switch (icode)
36049 /* These builtins and instructions require the memory
36050 to be properly aligned. */
36051 case CODE_FOR_avx512f_loadv16sf_mask:
36052 case CODE_FOR_avx512f_loadv16si_mask:
36053 case CODE_FOR_avx512f_loadv8df_mask:
36054 case CODE_FOR_avx512f_loadv8di_mask:
36055 case CODE_FOR_avx512vl_loadv8sf_mask:
36056 case CODE_FOR_avx512vl_loadv8si_mask:
36057 case CODE_FOR_avx512vl_loadv4df_mask:
36058 case CODE_FOR_avx512vl_loadv4di_mask:
36059 case CODE_FOR_avx512vl_loadv4sf_mask:
36060 case CODE_FOR_avx512vl_loadv4si_mask:
36061 case CODE_FOR_avx512vl_loadv2df_mask:
36062 case CODE_FOR_avx512vl_loadv2di_mask:
36063 case CODE_FOR_avx512bw_loadv64qi_mask:
36064 case CODE_FOR_avx512vl_loadv32qi_mask:
36065 case CODE_FOR_avx512vl_loadv16qi_mask:
36066 case CODE_FOR_avx512bw_loadv32hi_mask:
36067 case CODE_FOR_avx512vl_loadv16hi_mask:
36068 case CODE_FOR_avx512vl_loadv8hi_mask:
36069 aligned_mem = true;
36070 break;
36071 default:
36072 break;
36074 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36075 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36076 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36077 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36078 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36079 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36080 case V16SI_FTYPE_PCINT_V16SI_UHI:
36081 case V8SI_FTYPE_PCINT_V8SI_UQI:
36082 case V4SI_FTYPE_PCINT_V4SI_UQI:
36083 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36084 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36085 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36086 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36087 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36088 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36089 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36090 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36091 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36092 nargs = 3;
36093 klass = load;
36094 memory = 0;
36095 break;
36096 case VOID_FTYPE_UINT_UINT_UINT:
36097 case VOID_FTYPE_UINT64_UINT_UINT:
36098 case UCHAR_FTYPE_UINT_UINT_UINT:
36099 case UCHAR_FTYPE_UINT64_UINT_UINT:
36100 nargs = 3;
36101 klass = load;
36102 memory = ARRAY_SIZE (args);
36103 last_arg_constant = true;
36104 break;
36105 default:
36106 gcc_unreachable ();
36109 gcc_assert (nargs <= ARRAY_SIZE (args));
36111 if (klass == store)
36113 arg = CALL_EXPR_ARG (exp, 0);
36114 op = expand_normal (arg);
36115 gcc_assert (target == 0);
36116 if (memory)
36118 op = ix86_zero_extend_to_Pmode (op);
36119 target = gen_rtx_MEM (tmode, op);
36120 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36121 on it. Try to improve it using get_pointer_alignment,
36122 and if the special builtin is one that requires strict
36123 mode alignment, also from its GET_MODE_ALIGNMENT.
36124 Failure to do so could lead to ix86_legitimate_combined_insn
36125 rejecting all changes to such insns. */
36126 unsigned int align = get_pointer_alignment (arg);
36127 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36128 align = GET_MODE_ALIGNMENT (tmode);
36129 if (MEM_ALIGN (target) < align)
36130 set_mem_align (target, align);
36132 else
36133 target = force_reg (tmode, op);
36134 arg_adjust = 1;
36136 else
36138 arg_adjust = 0;
36139 if (optimize
36140 || target == 0
36141 || !register_operand (target, tmode)
36142 || GET_MODE (target) != tmode)
36143 target = gen_reg_rtx (tmode);
36146 for (i = 0; i < nargs; i++)
36148 machine_mode mode = insn_p->operand[i + 1].mode;
36149 bool match;
36151 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36152 op = expand_normal (arg);
36153 match = insn_p->operand[i + 1].predicate (op, mode);
36155 if (last_arg_constant && (i + 1) == nargs)
36157 if (!match)
36159 if (icode == CODE_FOR_lwp_lwpvalsi3
36160 || icode == CODE_FOR_lwp_lwpinssi3
36161 || icode == CODE_FOR_lwp_lwpvaldi3
36162 || icode == CODE_FOR_lwp_lwpinsdi3)
36163 error ("the last argument must be a 32-bit immediate");
36164 else
36165 error ("the last argument must be an 8-bit immediate");
36166 return const0_rtx;
36169 else
36171 if (i == memory)
36173 /* This must be the memory operand. */
36174 op = ix86_zero_extend_to_Pmode (op);
36175 op = gen_rtx_MEM (mode, op);
36176 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36177 on it. Try to improve it using get_pointer_alignment,
36178 and if the special builtin is one that requires strict
36179 mode alignment, also from its GET_MODE_ALIGNMENT.
36180 Failure to do so could lead to ix86_legitimate_combined_insn
36181 rejecting all changes to such insns. */
36182 unsigned int align = get_pointer_alignment (arg);
36183 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36184 align = GET_MODE_ALIGNMENT (mode);
36185 if (MEM_ALIGN (op) < align)
36186 set_mem_align (op, align);
36188 else
36190 /* This must be a register.  */
36191 if (VECTOR_MODE_P (mode))
36192 op = safe_vector_operand (op, mode);
36194 op = fixup_modeless_constant (op, mode);
36196 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36197 op = copy_to_mode_reg (mode, op);
36198 else
36200 op = copy_to_reg (op);
36201 op = lowpart_subreg (mode, op, GET_MODE (op));
36206 args[i].op = op;
36207 args[i].mode = mode;
36210 switch (nargs)
36212 case 0:
36213 pat = GEN_FCN (icode) (target);
36214 break;
36215 case 1:
36216 pat = GEN_FCN (icode) (target, args[0].op);
36217 break;
36218 case 2:
36219 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36220 break;
36221 case 3:
36222 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36223 break;
36224 default:
36225 gcc_unreachable ();
36228 if (! pat)
36229 return 0;
36230 emit_insn (pat);
36231 return klass == store ? 0 : target;
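/* For illustration (assuming the builtin name __builtin_ia32_movntps,
   used by _mm_stream_ps): its VOID_FTYPE_PFLOAT_V4SF signature is
   handled above as a one-argument store (klass == store, memory ==
   ARRAY_SIZE (args)), and because CODE_FOR_sse_movntv4sf is on the
   aligned_mem list the generated MEM has its alignment raised to
   GET_MODE_ALIGNMENT of the vector mode.  */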
36234 /* Return the integer constant in ARG. Constrain it to be in the range
36235 of the subparts of VEC_TYPE; issue an error if not. */
36237 static int
36238 get_element_number (tree vec_type, tree arg)
36240 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36242 if (!tree_fits_uhwi_p (arg)
36243 || (elt = tree_to_uhwi (arg), elt > max))
36245 error ("selector must be an integer constant in the range 0..%wi", max);
36246 return 0;
36249 return elt;
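/* For illustration: for a V4SF vector type TYPE_VECTOR_SUBPARTS is 4,
   so max == 3 and selectors 0..3 are accepted; a selector of 4, or a
   non-constant selector, produces the error above and 0 is returned.  */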
36252 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36253 ix86_expand_vector_init. We DO have language-level syntax for this, in
36254 the form of (type){ init-list }. Except that since we can't place emms
36255 instructions from inside the compiler, we can't allow the use of MMX
36256 registers unless the user explicitly asks for it. So we do *not* define
36257 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36258 we have builtins invoked by mmintrin.h that give us license to emit
36259 these sorts of instructions. */
36261 static rtx
36262 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36264 machine_mode tmode = TYPE_MODE (type);
36265 machine_mode inner_mode = GET_MODE_INNER (tmode);
36266 int i, n_elt = GET_MODE_NUNITS (tmode);
36267 rtvec v = rtvec_alloc (n_elt);
36269 gcc_assert (VECTOR_MODE_P (tmode));
36270 gcc_assert (call_expr_nargs (exp) == n_elt);
36272 for (i = 0; i < n_elt; ++i)
36274 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36275 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36278 if (!target || !register_operand (target, tmode))
36279 target = gen_reg_rtx (tmode);
36281 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36282 return target;
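/* For illustration (assuming mmintrin.h implements _mm_set_pi32 via
   __builtin_ia32_vec_init_v2si):

     __m64 v = _mm_set_pi32 (1, 2);

   arrives here with a V2SI vector type and two call arguments, which
   are wrapped in a PARALLEL and handed to ix86_expand_vector_init.  */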
36285 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36286 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36287 had a language-level syntax for referencing vector elements. */
36289 static rtx
36290 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36292 machine_mode tmode, mode0;
36293 tree arg0, arg1;
36294 int elt;
36295 rtx op0;
36297 arg0 = CALL_EXPR_ARG (exp, 0);
36298 arg1 = CALL_EXPR_ARG (exp, 1);
36300 op0 = expand_normal (arg0);
36301 elt = get_element_number (TREE_TYPE (arg0), arg1);
36303 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36304 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36305 gcc_assert (VECTOR_MODE_P (mode0));
36307 op0 = force_reg (mode0, op0);
36309 if (optimize || !target || !register_operand (target, tmode))
36310 target = gen_reg_rtx (tmode);
36312 ix86_expand_vector_extract (true, target, op0, elt);
36314 return target;
36317 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36318 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36319 a language-level syntax for referencing vector elements. */
36321 static rtx
36322 ix86_expand_vec_set_builtin (tree exp)
36324 machine_mode tmode, mode1;
36325 tree arg0, arg1, arg2;
36326 int elt;
36327 rtx op0, op1, target;
36329 arg0 = CALL_EXPR_ARG (exp, 0);
36330 arg1 = CALL_EXPR_ARG (exp, 1);
36331 arg2 = CALL_EXPR_ARG (exp, 2);
36333 tmode = TYPE_MODE (TREE_TYPE (arg0));
36334 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36335 gcc_assert (VECTOR_MODE_P (tmode));
36337 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36338 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36339 elt = get_element_number (TREE_TYPE (arg0), arg2);
36341 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36342 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36344 op0 = force_reg (tmode, op0);
36345 op1 = force_reg (mode1, op1);
36347 /* OP0 is the source of these builtin functions and shouldn't be
36348 modified. Create a copy, use it and return it as target. */
36349 target = gen_reg_rtx (tmode);
36350 emit_move_insn (target, op0);
36351 ix86_expand_vector_set (true, target, op1, elt);
36353 return target;
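/* For illustration (assuming the builtin name __builtin_ia32_vec_set_v4hi
   used by _mm_insert_pi16): a call such as

     __m64 w = _mm_insert_pi16 (v, 42, 2);

   leaves V untouched; the copy created above is modified in element 2
   and returned as W.  */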
36356 /* Emit conditional move of SRC to DST with condition
36357 OP1 CODE OP2. */
36358 static void
36359 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36361 rtx t;
36363 if (TARGET_CMOVE)
36365 t = ix86_expand_compare (code, op1, op2);
36366 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36367 src, dst)));
36369 else
36371 rtx_code_label *nomove = gen_label_rtx ();
36372 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36373 const0_rtx, GET_MODE (op1), 1, nomove);
36374 emit_move_insn (dst, src);
36375 emit_label (nomove);
36379 /* Choose the unsigned max of DST and SRC and put it in DST.  */
36380 static void
36381 ix86_emit_move_max (rtx dst, rtx src)
36383 ix86_emit_cmove (dst, src, LTU, dst, src);
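/* For illustration: the LTU condition makes this an unsigned maximum,
   i.e. DST = (DST < SRC unsigned) ? SRC : DST, so with DST == 0x10 and
   SRC == 0xfffffff0 the result is 0xfffffff0.  The BNDNARROW and BNDINT
   expansions below rely on this for both LB and the one's-complement
   UB.  */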
36386 /* Expand an expression EXP that calls a built-in function,
36387 with result going to TARGET if that's convenient
36388 (and in mode MODE if that's convenient).
36389 SUBTARGET may be used as the target for computing one of EXP's operands.
36390 IGNORE is nonzero if the value is to be ignored. */
36392 static rtx
36393 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36394 machine_mode mode, int ignore)
36396 size_t i;
36397 enum insn_code icode;
36398 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36399 tree arg0, arg1, arg2, arg3, arg4;
36400 rtx op0, op1, op2, op3, op4, pat, insn;
36401 machine_mode mode0, mode1, mode2, mode3, mode4;
36402 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36404 /* For CPU builtins that can be folded, fold first and expand the fold. */
36405 switch (fcode)
36407 case IX86_BUILTIN_CPU_INIT:
36409 /* Make it call __cpu_indicator_init in libgcc. */
36410 tree call_expr, fndecl, type;
36411 type = build_function_type_list (integer_type_node, NULL_TREE);
36412 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36413 call_expr = build_call_expr (fndecl, 0);
36414 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36416 case IX86_BUILTIN_CPU_IS:
36417 case IX86_BUILTIN_CPU_SUPPORTS:
36419 tree arg0 = CALL_EXPR_ARG (exp, 0);
36420 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36421 gcc_assert (fold_expr != NULL_TREE);
36422 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
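/* For illustration: these cases back the documented CPU builtins, e.g.

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       use_avx2_path ();		(use_avx2_path is a stand-in)

   where __builtin_cpu_init becomes a call to __cpu_indicator_init in
   libgcc and the __builtin_cpu_supports test is folded by
   fold_builtin_cpu before being expanded.  */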
36426 /* Determine whether the builtin function is available under the current ISA.
36427 Originally the builtin was not created if it wasn't applicable to the
36428 current ISA based on the command-line switches.  With function-specific
36429 options, we need to check in the context of the function making the call
36430 whether it is supported. */
36431 if (ix86_builtins_isa[fcode].isa
36432 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
36434 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, 0,
36435 NULL, NULL, (enum fpmath_unit) 0,
36436 false);
36437 if (!opts)
36438 error ("%qE needs unknown isa option", fndecl);
36439 else
36441 gcc_assert (opts != NULL);
36442 error ("%qE needs isa option %s", fndecl, opts);
36443 free (opts);
36445 return expand_call (exp, target, ignore);
36448 switch (fcode)
36450 case IX86_BUILTIN_BNDMK:
36451 if (!target
36452 || GET_MODE (target) != BNDmode
36453 || !register_operand (target, BNDmode))
36454 target = gen_reg_rtx (BNDmode);
36456 arg0 = CALL_EXPR_ARG (exp, 0);
36457 arg1 = CALL_EXPR_ARG (exp, 1);
36459 op0 = expand_normal (arg0);
36460 op1 = expand_normal (arg1);
36462 if (!register_operand (op0, Pmode))
36463 op0 = ix86_zero_extend_to_Pmode (op0);
36464 if (!register_operand (op1, Pmode))
36465 op1 = ix86_zero_extend_to_Pmode (op1);
36467 /* Builtin arg1 is the size of the block, but instruction op1 should
36468 be (size - 1).  */
36469 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36470 NULL_RTX, 1, OPTAB_DIRECT);
36472 emit_insn (BNDmode == BND64mode
36473 ? gen_bnd64_mk (target, op0, op1)
36474 : gen_bnd32_mk (target, op0, op1));
36475 return target;
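/* For illustration: the builtin receives the object size, so for a
   4096-byte block op1 becomes 4095 and bndmk creates bounds covering
   [op0, op0 + 4095].  */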
36477 case IX86_BUILTIN_BNDSTX:
36478 arg0 = CALL_EXPR_ARG (exp, 0);
36479 arg1 = CALL_EXPR_ARG (exp, 1);
36480 arg2 = CALL_EXPR_ARG (exp, 2);
36482 op0 = expand_normal (arg0);
36483 op1 = expand_normal (arg1);
36484 op2 = expand_normal (arg2);
36486 if (!register_operand (op0, Pmode))
36487 op0 = ix86_zero_extend_to_Pmode (op0);
36488 if (!register_operand (op1, BNDmode))
36489 op1 = copy_to_mode_reg (BNDmode, op1);
36490 if (!register_operand (op2, Pmode))
36491 op2 = ix86_zero_extend_to_Pmode (op2);
36493 emit_insn (BNDmode == BND64mode
36494 ? gen_bnd64_stx (op2, op0, op1)
36495 : gen_bnd32_stx (op2, op0, op1));
36496 return 0;
36498 case IX86_BUILTIN_BNDLDX:
36499 if (!target
36500 || GET_MODE (target) != BNDmode
36501 || !register_operand (target, BNDmode))
36502 target = gen_reg_rtx (BNDmode);
36504 arg0 = CALL_EXPR_ARG (exp, 0);
36505 arg1 = CALL_EXPR_ARG (exp, 1);
36507 op0 = expand_normal (arg0);
36508 op1 = expand_normal (arg1);
36510 if (!register_operand (op0, Pmode))
36511 op0 = ix86_zero_extend_to_Pmode (op0);
36512 if (!register_operand (op1, Pmode))
36513 op1 = ix86_zero_extend_to_Pmode (op1);
36515 emit_insn (BNDmode == BND64mode
36516 ? gen_bnd64_ldx (target, op0, op1)
36517 : gen_bnd32_ldx (target, op0, op1));
36518 return target;
36520 case IX86_BUILTIN_BNDCL:
36521 arg0 = CALL_EXPR_ARG (exp, 0);
36522 arg1 = CALL_EXPR_ARG (exp, 1);
36524 op0 = expand_normal (arg0);
36525 op1 = expand_normal (arg1);
36527 if (!register_operand (op0, Pmode))
36528 op0 = ix86_zero_extend_to_Pmode (op0);
36529 if (!register_operand (op1, BNDmode))
36530 op1 = copy_to_mode_reg (BNDmode, op1);
36532 emit_insn (BNDmode == BND64mode
36533 ? gen_bnd64_cl (op1, op0)
36534 : gen_bnd32_cl (op1, op0));
36535 return 0;
36537 case IX86_BUILTIN_BNDCU:
36538 arg0 = CALL_EXPR_ARG (exp, 0);
36539 arg1 = CALL_EXPR_ARG (exp, 1);
36541 op0 = expand_normal (arg0);
36542 op1 = expand_normal (arg1);
36544 if (!register_operand (op0, Pmode))
36545 op0 = ix86_zero_extend_to_Pmode (op0);
36546 if (!register_operand (op1, BNDmode))
36547 op1 = copy_to_mode_reg (BNDmode, op1);
36549 emit_insn (BNDmode == BND64mode
36550 ? gen_bnd64_cu (op1, op0)
36551 : gen_bnd32_cu (op1, op0));
36552 return 0;
36554 case IX86_BUILTIN_BNDRET:
36555 arg0 = CALL_EXPR_ARG (exp, 0);
36556 gcc_assert (TREE_CODE (arg0) == SSA_NAME);
36557 target = chkp_get_rtl_bounds (arg0);
36559 /* If no bounds were specified for the returned value,
36560 then use INIT bounds.  This usually happens when
36561 some built-in function is expanded.  */
36562 if (!target)
36564 rtx t1 = gen_reg_rtx (Pmode);
36565 rtx t2 = gen_reg_rtx (Pmode);
36566 target = gen_reg_rtx (BNDmode);
36567 emit_move_insn (t1, const0_rtx);
36568 emit_move_insn (t2, constm1_rtx);
36569 emit_insn (BNDmode == BND64mode
36570 ? gen_bnd64_mk (target, t1, t2)
36571 : gen_bnd32_mk (target, t1, t2));
36574 gcc_assert (target && REG_P (target));
36575 return target;
36577 case IX86_BUILTIN_BNDNARROW:
36579 rtx m1, m1h1, m1h2, lb, ub, t1;
36581 /* Return value and lb. */
36582 arg0 = CALL_EXPR_ARG (exp, 0);
36583 /* Bounds. */
36584 arg1 = CALL_EXPR_ARG (exp, 1);
36585 /* Size. */
36586 arg2 = CALL_EXPR_ARG (exp, 2);
36588 lb = expand_normal (arg0);
36589 op1 = expand_normal (arg1);
36590 op2 = expand_normal (arg2);
36592 /* Size was passed, but we need to use (size - 1), as for bndmk.  */
36593 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36594 NULL_RTX, 1, OPTAB_DIRECT);
36597 /* Add LB to size and invert to get UB.  */
36597 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36598 op2, 1, OPTAB_DIRECT);
36599 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36601 if (!register_operand (lb, Pmode))
36602 lb = ix86_zero_extend_to_Pmode (lb);
36603 if (!register_operand (ub, Pmode))
36604 ub = ix86_zero_extend_to_Pmode (ub);
36606 /* We need to move bounds to memory before any computations. */
36607 if (MEM_P (op1))
36608 m1 = op1;
36609 else
36611 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36612 emit_move_insn (m1, op1);
36615 /* Generate mem expression to be used for access to LB and UB. */
36616 m1h1 = adjust_address (m1, Pmode, 0);
36617 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36619 t1 = gen_reg_rtx (Pmode);
36621 /* Compute LB. */
36622 emit_move_insn (t1, m1h1);
36623 ix86_emit_move_max (t1, lb);
36624 emit_move_insn (m1h1, t1);
36626 /* Compute UB. UB is stored in 1's complement form. Therefore
36627 we also use max here. */
36628 emit_move_insn (t1, m1h2);
36629 ix86_emit_move_max (t1, ub);
36630 emit_move_insn (m1h2, t1);
36632 op2 = gen_reg_rtx (BNDmode);
36633 emit_move_insn (op2, m1);
36635 return chkp_join_splitted_slot (lb, op2);
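/* For illustration: narrowing bounds stored as lb == 0x1000,
   ub == ~0x10ff down to a 16-byte field at 0x1040 computes
   op2 == 0x104f and ub == ~0x104f, then takes the unsigned max of each
   half against the original bounds; because UB is kept in one's
   complement, taking the max can only shrink the resulting range.  */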
36638 case IX86_BUILTIN_BNDINT:
36640 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36642 if (!target
36643 || GET_MODE (target) != BNDmode
36644 || !register_operand (target, BNDmode))
36645 target = gen_reg_rtx (BNDmode);
36647 arg0 = CALL_EXPR_ARG (exp, 0);
36648 arg1 = CALL_EXPR_ARG (exp, 1);
36650 op0 = expand_normal (arg0);
36651 op1 = expand_normal (arg1);
36653 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36654 rh1 = adjust_address (res, Pmode, 0);
36655 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36657 /* Put first bounds to temporaries. */
36658 lb1 = gen_reg_rtx (Pmode);
36659 ub1 = gen_reg_rtx (Pmode);
36660 if (MEM_P (op0))
36662 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36663 emit_move_insn (ub1, adjust_address (op0, Pmode,
36664 GET_MODE_SIZE (Pmode)));
36666 else
36668 emit_move_insn (res, op0);
36669 emit_move_insn (lb1, rh1);
36670 emit_move_insn (ub1, rh2);
36673 /* Put second bounds to temporaries. */
36674 lb2 = gen_reg_rtx (Pmode);
36675 ub2 = gen_reg_rtx (Pmode);
36676 if (MEM_P (op1))
36678 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36679 emit_move_insn (ub2, adjust_address (op1, Pmode,
36680 GET_MODE_SIZE (Pmode)));
36682 else
36684 emit_move_insn (res, op1);
36685 emit_move_insn (lb2, rh1);
36686 emit_move_insn (ub2, rh2);
36689 /* Compute LB. */
36690 ix86_emit_move_max (lb1, lb2);
36691 emit_move_insn (rh1, lb1);
36693 /* Compute UB. UB is stored in 1's complement form. Therefore
36694 we also use max here. */
36695 ix86_emit_move_max (ub1, ub2);
36696 emit_move_insn (rh2, ub1);
36698 emit_move_insn (target, res);
36700 return target;
36703 case IX86_BUILTIN_SIZEOF:
36705 tree name;
36706 rtx symbol;
36708 if (!target
36709 || GET_MODE (target) != Pmode
36710 || !register_operand (target, Pmode))
36711 target = gen_reg_rtx (Pmode);
36713 arg0 = CALL_EXPR_ARG (exp, 0);
36714 gcc_assert (VAR_P (arg0));
36716 name = DECL_ASSEMBLER_NAME (arg0);
36717 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36719 emit_insn (Pmode == SImode
36720 ? gen_move_size_reloc_si (target, symbol)
36721 : gen_move_size_reloc_di (target, symbol));
36723 return target;
36726 case IX86_BUILTIN_BNDLOWER:
36728 rtx mem, hmem;
36730 if (!target
36731 || GET_MODE (target) != Pmode
36732 || !register_operand (target, Pmode))
36733 target = gen_reg_rtx (Pmode);
36735 arg0 = CALL_EXPR_ARG (exp, 0);
36736 op0 = expand_normal (arg0);
36738 /* We need to move bounds to memory first. */
36739 if (MEM_P (op0))
36740 mem = op0;
36741 else
36743 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36744 emit_move_insn (mem, op0);
36747 /* Generate mem expression to access LB and load it. */
36748 hmem = adjust_address (mem, Pmode, 0);
36749 emit_move_insn (target, hmem);
36751 return target;
36754 case IX86_BUILTIN_BNDUPPER:
36756 rtx mem, hmem, res;
36758 if (!target
36759 || GET_MODE (target) != Pmode
36760 || !register_operand (target, Pmode))
36761 target = gen_reg_rtx (Pmode);
36763 arg0 = CALL_EXPR_ARG (exp, 0);
36764 op0 = expand_normal (arg0);
36766 /* We need to move bounds to memory first. */
36767 if (MEM_P (op0))
36768 mem = op0;
36769 else
36771 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36772 emit_move_insn (mem, op0);
36775 /* Generate mem expression to access UB. */
36776 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36778 /* We need to invert all bits of UB.  */
36779 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36781 if (res != target)
36782 emit_move_insn (target, res);
36784 return target;
36787 case IX86_BUILTIN_MASKMOVQ:
36788 case IX86_BUILTIN_MASKMOVDQU:
36789 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36790 ? CODE_FOR_mmx_maskmovq
36791 : CODE_FOR_sse2_maskmovdqu);
36792 /* Note the arg order is different from the operand order. */
36793 arg1 = CALL_EXPR_ARG (exp, 0);
36794 arg2 = CALL_EXPR_ARG (exp, 1);
36795 arg0 = CALL_EXPR_ARG (exp, 2);
36796 op0 = expand_normal (arg0);
36797 op1 = expand_normal (arg1);
36798 op2 = expand_normal (arg2);
36799 mode0 = insn_data[icode].operand[0].mode;
36800 mode1 = insn_data[icode].operand[1].mode;
36801 mode2 = insn_data[icode].operand[2].mode;
36803 op0 = ix86_zero_extend_to_Pmode (op0);
36804 op0 = gen_rtx_MEM (mode1, op0);
36806 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36807 op0 = copy_to_mode_reg (mode0, op0);
36808 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36809 op1 = copy_to_mode_reg (mode1, op1);
36810 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36811 op2 = copy_to_mode_reg (mode2, op2);
36812 pat = GEN_FCN (icode) (op0, op1, op2);
36813 if (! pat)
36814 return 0;
36815 emit_insn (pat);
36816 return 0;
36818 case IX86_BUILTIN_LDMXCSR:
36819 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36820 target = assign_386_stack_local (SImode, SLOT_TEMP);
36821 emit_move_insn (target, op0);
36822 emit_insn (gen_sse_ldmxcsr (target));
36823 return 0;
36825 case IX86_BUILTIN_STMXCSR:
36826 target = assign_386_stack_local (SImode, SLOT_TEMP);
36827 emit_insn (gen_sse_stmxcsr (target));
36828 return copy_to_mode_reg (SImode, target);
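/* For illustration: these two cases back _mm_getcsr and _mm_setcsr from
   xmmintrin.h, e.g.

     unsigned int csr = _mm_getcsr ();
     _mm_setcsr (csr | 0x8040);		(set the FTZ and DAZ bits)

   Both go through a SImode stack slot because ldmxcsr and stmxcsr only
   accept memory operands.  */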
36830 case IX86_BUILTIN_CLFLUSH:
36831 arg0 = CALL_EXPR_ARG (exp, 0);
36832 op0 = expand_normal (arg0);
36833 icode = CODE_FOR_sse2_clflush;
36834 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36835 op0 = ix86_zero_extend_to_Pmode (op0);
36837 emit_insn (gen_sse2_clflush (op0));
36838 return 0;
36840 case IX86_BUILTIN_CLWB:
36841 arg0 = CALL_EXPR_ARG (exp, 0);
36842 op0 = expand_normal (arg0);
36843 icode = CODE_FOR_clwb;
36844 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36845 op0 = ix86_zero_extend_to_Pmode (op0);
36847 emit_insn (gen_clwb (op0));
36848 return 0;
36850 case IX86_BUILTIN_CLFLUSHOPT:
36851 arg0 = CALL_EXPR_ARG (exp, 0);
36852 op0 = expand_normal (arg0);
36853 icode = CODE_FOR_clflushopt;
36854 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36855 op0 = ix86_zero_extend_to_Pmode (op0);
36857 emit_insn (gen_clflushopt (op0));
36858 return 0;
36860 case IX86_BUILTIN_MONITOR:
36861 case IX86_BUILTIN_MONITORX:
36862 arg0 = CALL_EXPR_ARG (exp, 0);
36863 arg1 = CALL_EXPR_ARG (exp, 1);
36864 arg2 = CALL_EXPR_ARG (exp, 2);
36865 op0 = expand_normal (arg0);
36866 op1 = expand_normal (arg1);
36867 op2 = expand_normal (arg2);
36868 if (!REG_P (op0))
36869 op0 = ix86_zero_extend_to_Pmode (op0);
36870 if (!REG_P (op1))
36871 op1 = copy_to_mode_reg (SImode, op1);
36872 if (!REG_P (op2))
36873 op2 = copy_to_mode_reg (SImode, op2);
36875 emit_insn (fcode == IX86_BUILTIN_MONITOR
36876 ? ix86_gen_monitor (op0, op1, op2)
36877 : ix86_gen_monitorx (op0, op1, op2));
36878 return 0;
36880 case IX86_BUILTIN_MWAIT:
36881 arg0 = CALL_EXPR_ARG (exp, 0);
36882 arg1 = CALL_EXPR_ARG (exp, 1);
36883 op0 = expand_normal (arg0);
36884 op1 = expand_normal (arg1);
36885 if (!REG_P (op0))
36886 op0 = copy_to_mode_reg (SImode, op0);
36887 if (!REG_P (op1))
36888 op1 = copy_to_mode_reg (SImode, op1);
36889 emit_insn (gen_sse3_mwait (op0, op1));
36890 return 0;
36892 case IX86_BUILTIN_MWAITX:
36893 arg0 = CALL_EXPR_ARG (exp, 0);
36894 arg1 = CALL_EXPR_ARG (exp, 1);
36895 arg2 = CALL_EXPR_ARG (exp, 2);
36896 op0 = expand_normal (arg0);
36897 op1 = expand_normal (arg1);
36898 op2 = expand_normal (arg2);
36899 if (!REG_P (op0))
36900 op0 = copy_to_mode_reg (SImode, op0);
36901 if (!REG_P (op1))
36902 op1 = copy_to_mode_reg (SImode, op1);
36903 if (!REG_P (op2))
36904 op2 = copy_to_mode_reg (SImode, op2);
36905 emit_insn (gen_mwaitx (op0, op1, op2));
36906 return 0;
36908 case IX86_BUILTIN_CLZERO:
36909 arg0 = CALL_EXPR_ARG (exp, 0);
36910 op0 = expand_normal (arg0);
36911 if (!REG_P (op0))
36912 op0 = ix86_zero_extend_to_Pmode (op0);
36913 emit_insn (ix86_gen_clzero (op0));
36914 return 0;
36916 case IX86_BUILTIN_VEC_INIT_V2SI:
36917 case IX86_BUILTIN_VEC_INIT_V4HI:
36918 case IX86_BUILTIN_VEC_INIT_V8QI:
36919 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
36921 case IX86_BUILTIN_VEC_EXT_V2DF:
36922 case IX86_BUILTIN_VEC_EXT_V2DI:
36923 case IX86_BUILTIN_VEC_EXT_V4SF:
36924 case IX86_BUILTIN_VEC_EXT_V4SI:
36925 case IX86_BUILTIN_VEC_EXT_V8HI:
36926 case IX86_BUILTIN_VEC_EXT_V2SI:
36927 case IX86_BUILTIN_VEC_EXT_V4HI:
36928 case IX86_BUILTIN_VEC_EXT_V16QI:
36929 return ix86_expand_vec_ext_builtin (exp, target);
36931 case IX86_BUILTIN_VEC_SET_V2DI:
36932 case IX86_BUILTIN_VEC_SET_V4SF:
36933 case IX86_BUILTIN_VEC_SET_V4SI:
36934 case IX86_BUILTIN_VEC_SET_V8HI:
36935 case IX86_BUILTIN_VEC_SET_V4HI:
36936 case IX86_BUILTIN_VEC_SET_V16QI:
36937 return ix86_expand_vec_set_builtin (exp);
36939 case IX86_BUILTIN_NANQ:
36940 case IX86_BUILTIN_NANSQ:
36941 return expand_call (exp, target, ignore);
36943 case IX86_BUILTIN_RDPMC:
36944 case IX86_BUILTIN_RDTSC:
36945 case IX86_BUILTIN_RDTSCP:
36947 op0 = gen_reg_rtx (DImode);
36948 op1 = gen_reg_rtx (DImode);
36950 if (fcode == IX86_BUILTIN_RDPMC)
36952 arg0 = CALL_EXPR_ARG (exp, 0);
36953 op2 = expand_normal (arg0);
36954 if (!register_operand (op2, SImode))
36955 op2 = copy_to_mode_reg (SImode, op2);
36957 insn = (TARGET_64BIT
36958 ? gen_rdpmc_rex64 (op0, op1, op2)
36959 : gen_rdpmc (op0, op2));
36960 emit_insn (insn);
36962 else if (fcode == IX86_BUILTIN_RDTSC)
36964 insn = (TARGET_64BIT
36965 ? gen_rdtsc_rex64 (op0, op1)
36966 : gen_rdtsc (op0));
36967 emit_insn (insn);
36969 else
36971 op2 = gen_reg_rtx (SImode);
36973 insn = (TARGET_64BIT
36974 ? gen_rdtscp_rex64 (op0, op1, op2)
36975 : gen_rdtscp (op0, op2));
36976 emit_insn (insn);
36978 arg0 = CALL_EXPR_ARG (exp, 0);
36979 op4 = expand_normal (arg0);
36980 if (!address_operand (op4, VOIDmode))
36982 op4 = convert_memory_address (Pmode, op4);
36983 op4 = copy_addr_to_reg (op4);
36985 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
36988 if (target == 0)
36990 /* mode is VOIDmode if __builtin_rd* has been called
36991 without a lhs.  */
36992 if (mode == VOIDmode)
36993 return target;
36994 target = gen_reg_rtx (mode);
36997 if (TARGET_64BIT)
36999 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37000 op1, 1, OPTAB_DIRECT);
37001 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37002 op0, 1, OPTAB_DIRECT);
37005 emit_move_insn (target, op0);
37006 return target;
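/* For illustration: __builtin_ia32_rdtsc (used by __rdtsc from
   ia32intrin.h) arrives here with no arguments; on 64-bit targets the
   two 32-bit halves come back in separate DImode registers and are
   recombined above as op0 | (op1 << 32) before being copied to the
   result.  */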
37008 case IX86_BUILTIN_FXSAVE:
37009 case IX86_BUILTIN_FXRSTOR:
37010 case IX86_BUILTIN_FXSAVE64:
37011 case IX86_BUILTIN_FXRSTOR64:
37012 case IX86_BUILTIN_FNSTENV:
37013 case IX86_BUILTIN_FLDENV:
37014 mode0 = BLKmode;
37015 switch (fcode)
37017 case IX86_BUILTIN_FXSAVE:
37018 icode = CODE_FOR_fxsave;
37019 break;
37020 case IX86_BUILTIN_FXRSTOR:
37021 icode = CODE_FOR_fxrstor;
37022 break;
37023 case IX86_BUILTIN_FXSAVE64:
37024 icode = CODE_FOR_fxsave64;
37025 break;
37026 case IX86_BUILTIN_FXRSTOR64:
37027 icode = CODE_FOR_fxrstor64;
37028 break;
37029 case IX86_BUILTIN_FNSTENV:
37030 icode = CODE_FOR_fnstenv;
37031 break;
37032 case IX86_BUILTIN_FLDENV:
37033 icode = CODE_FOR_fldenv;
37034 break;
37035 default:
37036 gcc_unreachable ();
37039 arg0 = CALL_EXPR_ARG (exp, 0);
37040 op0 = expand_normal (arg0);
37042 if (!address_operand (op0, VOIDmode))
37044 op0 = convert_memory_address (Pmode, op0);
37045 op0 = copy_addr_to_reg (op0);
37047 op0 = gen_rtx_MEM (mode0, op0);
37049 pat = GEN_FCN (icode) (op0);
37050 if (pat)
37051 emit_insn (pat);
37052 return 0;
37054 case IX86_BUILTIN_XSAVE:
37055 case IX86_BUILTIN_XRSTOR:
37056 case IX86_BUILTIN_XSAVE64:
37057 case IX86_BUILTIN_XRSTOR64:
37058 case IX86_BUILTIN_XSAVEOPT:
37059 case IX86_BUILTIN_XSAVEOPT64:
37060 case IX86_BUILTIN_XSAVES:
37061 case IX86_BUILTIN_XRSTORS:
37062 case IX86_BUILTIN_XSAVES64:
37063 case IX86_BUILTIN_XRSTORS64:
37064 case IX86_BUILTIN_XSAVEC:
37065 case IX86_BUILTIN_XSAVEC64:
37066 arg0 = CALL_EXPR_ARG (exp, 0);
37067 arg1 = CALL_EXPR_ARG (exp, 1);
37068 op0 = expand_normal (arg0);
37069 op1 = expand_normal (arg1);
37071 if (!address_operand (op0, VOIDmode))
37073 op0 = convert_memory_address (Pmode, op0);
37074 op0 = copy_addr_to_reg (op0);
37076 op0 = gen_rtx_MEM (BLKmode, op0);
37078 op1 = force_reg (DImode, op1);
37080 if (TARGET_64BIT)
37082 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37083 NULL, 1, OPTAB_DIRECT);
37084 switch (fcode)
37086 case IX86_BUILTIN_XSAVE:
37087 icode = CODE_FOR_xsave_rex64;
37088 break;
37089 case IX86_BUILTIN_XRSTOR:
37090 icode = CODE_FOR_xrstor_rex64;
37091 break;
37092 case IX86_BUILTIN_XSAVE64:
37093 icode = CODE_FOR_xsave64;
37094 break;
37095 case IX86_BUILTIN_XRSTOR64:
37096 icode = CODE_FOR_xrstor64;
37097 break;
37098 case IX86_BUILTIN_XSAVEOPT:
37099 icode = CODE_FOR_xsaveopt_rex64;
37100 break;
37101 case IX86_BUILTIN_XSAVEOPT64:
37102 icode = CODE_FOR_xsaveopt64;
37103 break;
37104 case IX86_BUILTIN_XSAVES:
37105 icode = CODE_FOR_xsaves_rex64;
37106 break;
37107 case IX86_BUILTIN_XRSTORS:
37108 icode = CODE_FOR_xrstors_rex64;
37109 break;
37110 case IX86_BUILTIN_XSAVES64:
37111 icode = CODE_FOR_xsaves64;
37112 break;
37113 case IX86_BUILTIN_XRSTORS64:
37114 icode = CODE_FOR_xrstors64;
37115 break;
37116 case IX86_BUILTIN_XSAVEC:
37117 icode = CODE_FOR_xsavec_rex64;
37118 break;
37119 case IX86_BUILTIN_XSAVEC64:
37120 icode = CODE_FOR_xsavec64;
37121 break;
37122 default:
37123 gcc_unreachable ();
37126 op2 = gen_lowpart (SImode, op2);
37127 op1 = gen_lowpart (SImode, op1);
37128 pat = GEN_FCN (icode) (op0, op1, op2);
37130 else
37132 switch (fcode)
37134 case IX86_BUILTIN_XSAVE:
37135 icode = CODE_FOR_xsave;
37136 break;
37137 case IX86_BUILTIN_XRSTOR:
37138 icode = CODE_FOR_xrstor;
37139 break;
37140 case IX86_BUILTIN_XSAVEOPT:
37141 icode = CODE_FOR_xsaveopt;
37142 break;
37143 case IX86_BUILTIN_XSAVES:
37144 icode = CODE_FOR_xsaves;
37145 break;
37146 case IX86_BUILTIN_XRSTORS:
37147 icode = CODE_FOR_xrstors;
37148 break;
37149 case IX86_BUILTIN_XSAVEC:
37150 icode = CODE_FOR_xsavec;
37151 break;
37152 default:
37153 gcc_unreachable ();
37155 pat = GEN_FCN (icode) (op0, op1);
37158 if (pat)
37159 emit_insn (pat);
37160 return 0;
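/* For illustration: on 64-bit targets the DImode feature mask is split
   into SImode halves for the pattern, so a mask of 0x0000000700000003
   yields operands 3 (low) and 7 (high); on 32-bit targets the DImode
   mask is passed to the pattern unsplit.  */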
37162 case IX86_BUILTIN_LLWPCB:
37163 arg0 = CALL_EXPR_ARG (exp, 0);
37164 op0 = expand_normal (arg0);
37165 icode = CODE_FOR_lwp_llwpcb;
37166 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37167 op0 = ix86_zero_extend_to_Pmode (op0);
37168 emit_insn (gen_lwp_llwpcb (op0));
37169 return 0;
37171 case IX86_BUILTIN_SLWPCB:
37172 icode = CODE_FOR_lwp_slwpcb;
37173 if (!target
37174 || !insn_data[icode].operand[0].predicate (target, Pmode))
37175 target = gen_reg_rtx (Pmode);
37176 emit_insn (gen_lwp_slwpcb (target));
37177 return target;
37179 case IX86_BUILTIN_BEXTRI32:
37180 case IX86_BUILTIN_BEXTRI64:
37181 arg0 = CALL_EXPR_ARG (exp, 0);
37182 arg1 = CALL_EXPR_ARG (exp, 1);
37183 op0 = expand_normal (arg0);
37184 op1 = expand_normal (arg1);
37185 icode = (fcode == IX86_BUILTIN_BEXTRI32
37186 ? CODE_FOR_tbm_bextri_si
37187 : CODE_FOR_tbm_bextri_di);
37188 if (!CONST_INT_P (op1))
37190 error ("the last argument must be an immediate");
37191 return const0_rtx;
37193 else
37195 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37196 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37197 op1 = GEN_INT (length);
37198 op2 = GEN_INT (lsb_index);
37199 pat = GEN_FCN (icode) (target, op0, op1, op2);
37200 if (pat)
37201 emit_insn (pat);
37202 return target;
37205 case IX86_BUILTIN_RDRAND16_STEP:
37206 icode = CODE_FOR_rdrandhi_1;
37207 mode0 = HImode;
37208 goto rdrand_step;
37210 case IX86_BUILTIN_RDRAND32_STEP:
37211 icode = CODE_FOR_rdrandsi_1;
37212 mode0 = SImode;
37213 goto rdrand_step;
37215 case IX86_BUILTIN_RDRAND64_STEP:
37216 icode = CODE_FOR_rdranddi_1;
37217 mode0 = DImode;
37219 rdrand_step:
37220 op0 = gen_reg_rtx (mode0);
37221 emit_insn (GEN_FCN (icode) (op0));
37223 arg0 = CALL_EXPR_ARG (exp, 0);
37224 op1 = expand_normal (arg0);
37225 if (!address_operand (op1, VOIDmode))
37227 op1 = convert_memory_address (Pmode, op1);
37228 op1 = copy_addr_to_reg (op1);
37230 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37232 op1 = gen_reg_rtx (SImode);
37233 emit_move_insn (op1, CONST1_RTX (SImode));
37235 /* Emit SImode conditional move. */
37236 if (mode0 == HImode)
37238 op2 = gen_reg_rtx (SImode);
37239 emit_insn (gen_zero_extendhisi2 (op2, op0));
37241 else if (mode0 == SImode)
37242 op2 = op0;
37243 else
37244 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37246 if (target == 0
37247 || !register_operand (target, SImode))
37248 target = gen_reg_rtx (SImode);
37250 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37251 const0_rtx);
37252 emit_insn (gen_rtx_SET (target,
37253 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37254 return target;
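/* For illustration (assuming the immintrin.h wrapper _rdrand32_step):

     unsigned int r;
     int ok = _rdrand32_step (&r);

   The random value is stored through the pointer argument, and OK is
   the 0/1 success result produced by the conditional move emitted
   above from the carry flag left by rdrand.  */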
37256 case IX86_BUILTIN_RDSEED16_STEP:
37257 icode = CODE_FOR_rdseedhi_1;
37258 mode0 = HImode;
37259 goto rdseed_step;
37261 case IX86_BUILTIN_RDSEED32_STEP:
37262 icode = CODE_FOR_rdseedsi_1;
37263 mode0 = SImode;
37264 goto rdseed_step;
37266 case IX86_BUILTIN_RDSEED64_STEP:
37267 icode = CODE_FOR_rdseeddi_1;
37268 mode0 = DImode;
37270 rdseed_step:
37271 op0 = gen_reg_rtx (mode0);
37272 emit_insn (GEN_FCN (icode) (op0));
37274 arg0 = CALL_EXPR_ARG (exp, 0);
37275 op1 = expand_normal (arg0);
37276 if (!address_operand (op1, VOIDmode))
37278 op1 = convert_memory_address (Pmode, op1);
37279 op1 = copy_addr_to_reg (op1);
37281 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37283 op2 = gen_reg_rtx (QImode);
37285 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37286 const0_rtx);
37287 emit_insn (gen_rtx_SET (op2, pat));
37289 if (target == 0
37290 || !register_operand (target, SImode))
37291 target = gen_reg_rtx (SImode);
37293 emit_insn (gen_zero_extendqisi2 (target, op2));
37294 return target;
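/* For illustration (assuming the rdseedintrin.h wrapper _rdseed32_step):
   this works like the rdrand case above, except that the carry flag is
   captured directly with a QImode setcc (LTU on CCCmode) and then
   zero-extended into the SImode result instead of going through a
   conditional move.  */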
37296 case IX86_BUILTIN_SBB32:
37297 icode = CODE_FOR_subborrowsi;
37298 mode0 = SImode;
37299 goto handlecarry;
37301 case IX86_BUILTIN_SBB64:
37302 icode = CODE_FOR_subborrowdi;
37303 mode0 = DImode;
37304 goto handlecarry;
37306 case IX86_BUILTIN_ADDCARRYX32:
37307 icode = CODE_FOR_addcarrysi;
37308 mode0 = SImode;
37309 goto handlecarry;
37311 case IX86_BUILTIN_ADDCARRYX64:
37312 icode = CODE_FOR_addcarrydi;
37313 mode0 = DImode;
37315 handlecarry:
37316 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37317 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37318 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37319 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37321 op1 = expand_normal (arg0);
37322 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37324 op2 = expand_normal (arg1);
37325 if (!register_operand (op2, mode0))
37326 op2 = copy_to_mode_reg (mode0, op2);
37328 op3 = expand_normal (arg2);
37329 if (!register_operand (op3, mode0))
37330 op3 = copy_to_mode_reg (mode0, op3);
37332 op4 = expand_normal (arg3);
37333 if (!address_operand (op4, VOIDmode))
37335 op4 = convert_memory_address (Pmode, op4);
37336 op4 = copy_addr_to_reg (op4);
37339 /* Generate CF from input operand. */
37340 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37342 /* Generate instruction that consumes CF. */
37343 op0 = gen_reg_rtx (mode0);
37345 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37346 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
37347 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
37349 /* Return current CF value. */
37350 if (target == 0)
37351 target = gen_reg_rtx (QImode);
37353 PUT_MODE (pat, QImode);
37354 emit_insn (gen_rtx_SET (target, pat));
37356 /* Store the result. */
37357 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37359 return target;
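/* For illustration (assuming the adxintrin.h wrapper _addcarryx_u32):

     unsigned int sum;
     unsigned char c_out = _addcarryx_u32 (c_in, a, b, &sum);

   C_IN is first turned into CF by adding it to QImode -1 (any nonzero
   input carries); the add-with-carry pattern then consumes CF, the new
   CF becomes the QImode return value, and the sum is stored through the
   pointer.  */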
37361 case IX86_BUILTIN_READ_FLAGS:
37362 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37364 if (optimize
37365 || target == NULL_RTX
37366 || !nonimmediate_operand (target, word_mode)
37367 || GET_MODE (target) != word_mode)
37368 target = gen_reg_rtx (word_mode);
37370 emit_insn (gen_pop (target));
37371 return target;
37373 case IX86_BUILTIN_WRITE_FLAGS:
37375 arg0 = CALL_EXPR_ARG (exp, 0);
37376 op0 = expand_normal (arg0);
37377 if (!general_no_elim_operand (op0, word_mode))
37378 op0 = copy_to_mode_reg (word_mode, op0);
37380 emit_insn (gen_push (op0));
37381 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37382 return 0;
37384 case IX86_BUILTIN_KORTESTC16:
37385 icode = CODE_FOR_kortestchi;
37386 mode0 = HImode;
37387 mode1 = CCCmode;
37388 goto kortest;
37390 case IX86_BUILTIN_KORTESTZ16:
37391 icode = CODE_FOR_kortestzhi;
37392 mode0 = HImode;
37393 mode1 = CCZmode;
37395 kortest:
37396 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37397 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37398 op0 = expand_normal (arg0);
37399 op1 = expand_normal (arg1);
37401 op0 = copy_to_reg (op0);
37402 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37403 op1 = copy_to_reg (op1);
37404 op1 = lowpart_subreg (mode0, op1, GET_MODE (op1));
37406 target = gen_reg_rtx (QImode);
37407 emit_insn (gen_rtx_SET (target, const0_rtx));
37409 /* Emit kortest. */
37410 emit_insn (GEN_FCN (icode) (op0, op1));
37411 /* And use setcc to return result from flags. */
37412 ix86_expand_setcc (target, EQ,
37413 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
37414 return target;
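/* For illustration: kortest ORs the two mask registers and sets CF when
   the result is all ones and ZF when it is all zeros; the setcc above
   derives the 0/1 return value from the flag selected by mode1 (CCCmode
   for KORTESTC16, CCZmode for KORTESTZ16).  */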
37416 case IX86_BUILTIN_GATHERSIV2DF:
37417 icode = CODE_FOR_avx2_gathersiv2df;
37418 goto gather_gen;
37419 case IX86_BUILTIN_GATHERSIV4DF:
37420 icode = CODE_FOR_avx2_gathersiv4df;
37421 goto gather_gen;
37422 case IX86_BUILTIN_GATHERDIV2DF:
37423 icode = CODE_FOR_avx2_gatherdiv2df;
37424 goto gather_gen;
37425 case IX86_BUILTIN_GATHERDIV4DF:
37426 icode = CODE_FOR_avx2_gatherdiv4df;
37427 goto gather_gen;
37428 case IX86_BUILTIN_GATHERSIV4SF:
37429 icode = CODE_FOR_avx2_gathersiv4sf;
37430 goto gather_gen;
37431 case IX86_BUILTIN_GATHERSIV8SF:
37432 icode = CODE_FOR_avx2_gathersiv8sf;
37433 goto gather_gen;
37434 case IX86_BUILTIN_GATHERDIV4SF:
37435 icode = CODE_FOR_avx2_gatherdiv4sf;
37436 goto gather_gen;
37437 case IX86_BUILTIN_GATHERDIV8SF:
37438 icode = CODE_FOR_avx2_gatherdiv8sf;
37439 goto gather_gen;
37440 case IX86_BUILTIN_GATHERSIV2DI:
37441 icode = CODE_FOR_avx2_gathersiv2di;
37442 goto gather_gen;
37443 case IX86_BUILTIN_GATHERSIV4DI:
37444 icode = CODE_FOR_avx2_gathersiv4di;
37445 goto gather_gen;
37446 case IX86_BUILTIN_GATHERDIV2DI:
37447 icode = CODE_FOR_avx2_gatherdiv2di;
37448 goto gather_gen;
37449 case IX86_BUILTIN_GATHERDIV4DI:
37450 icode = CODE_FOR_avx2_gatherdiv4di;
37451 goto gather_gen;
37452 case IX86_BUILTIN_GATHERSIV4SI:
37453 icode = CODE_FOR_avx2_gathersiv4si;
37454 goto gather_gen;
37455 case IX86_BUILTIN_GATHERSIV8SI:
37456 icode = CODE_FOR_avx2_gathersiv8si;
37457 goto gather_gen;
37458 case IX86_BUILTIN_GATHERDIV4SI:
37459 icode = CODE_FOR_avx2_gatherdiv4si;
37460 goto gather_gen;
37461 case IX86_BUILTIN_GATHERDIV8SI:
37462 icode = CODE_FOR_avx2_gatherdiv8si;
37463 goto gather_gen;
37464 case IX86_BUILTIN_GATHERALTSIV4DF:
37465 icode = CODE_FOR_avx2_gathersiv4df;
37466 goto gather_gen;
37467 case IX86_BUILTIN_GATHERALTDIV8SF:
37468 icode = CODE_FOR_avx2_gatherdiv8sf;
37469 goto gather_gen;
37470 case IX86_BUILTIN_GATHERALTSIV4DI:
37471 icode = CODE_FOR_avx2_gathersiv4di;
37472 goto gather_gen;
37473 case IX86_BUILTIN_GATHERALTDIV8SI:
37474 icode = CODE_FOR_avx2_gatherdiv8si;
37475 goto gather_gen;
37476 case IX86_BUILTIN_GATHER3SIV16SF:
37477 icode = CODE_FOR_avx512f_gathersiv16sf;
37478 goto gather_gen;
37479 case IX86_BUILTIN_GATHER3SIV8DF:
37480 icode = CODE_FOR_avx512f_gathersiv8df;
37481 goto gather_gen;
37482 case IX86_BUILTIN_GATHER3DIV16SF:
37483 icode = CODE_FOR_avx512f_gatherdiv16sf;
37484 goto gather_gen;
37485 case IX86_BUILTIN_GATHER3DIV8DF:
37486 icode = CODE_FOR_avx512f_gatherdiv8df;
37487 goto gather_gen;
37488 case IX86_BUILTIN_GATHER3SIV16SI:
37489 icode = CODE_FOR_avx512f_gathersiv16si;
37490 goto gather_gen;
37491 case IX86_BUILTIN_GATHER3SIV8DI:
37492 icode = CODE_FOR_avx512f_gathersiv8di;
37493 goto gather_gen;
37494 case IX86_BUILTIN_GATHER3DIV16SI:
37495 icode = CODE_FOR_avx512f_gatherdiv16si;
37496 goto gather_gen;
37497 case IX86_BUILTIN_GATHER3DIV8DI:
37498 icode = CODE_FOR_avx512f_gatherdiv8di;
37499 goto gather_gen;
37500 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37501 icode = CODE_FOR_avx512f_gathersiv8df;
37502 goto gather_gen;
37503 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37504 icode = CODE_FOR_avx512f_gatherdiv16sf;
37505 goto gather_gen;
37506 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37507 icode = CODE_FOR_avx512f_gathersiv8di;
37508 goto gather_gen;
37509 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37510 icode = CODE_FOR_avx512f_gatherdiv16si;
37511 goto gather_gen;
37512 case IX86_BUILTIN_GATHER3SIV2DF:
37513 icode = CODE_FOR_avx512vl_gathersiv2df;
37514 goto gather_gen;
37515 case IX86_BUILTIN_GATHER3SIV4DF:
37516 icode = CODE_FOR_avx512vl_gathersiv4df;
37517 goto gather_gen;
37518 case IX86_BUILTIN_GATHER3DIV2DF:
37519 icode = CODE_FOR_avx512vl_gatherdiv2df;
37520 goto gather_gen;
37521 case IX86_BUILTIN_GATHER3DIV4DF:
37522 icode = CODE_FOR_avx512vl_gatherdiv4df;
37523 goto gather_gen;
37524 case IX86_BUILTIN_GATHER3SIV4SF:
37525 icode = CODE_FOR_avx512vl_gathersiv4sf;
37526 goto gather_gen;
37527 case IX86_BUILTIN_GATHER3SIV8SF:
37528 icode = CODE_FOR_avx512vl_gathersiv8sf;
37529 goto gather_gen;
37530 case IX86_BUILTIN_GATHER3DIV4SF:
37531 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37532 goto gather_gen;
37533 case IX86_BUILTIN_GATHER3DIV8SF:
37534 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37535 goto gather_gen;
37536 case IX86_BUILTIN_GATHER3SIV2DI:
37537 icode = CODE_FOR_avx512vl_gathersiv2di;
37538 goto gather_gen;
37539 case IX86_BUILTIN_GATHER3SIV4DI:
37540 icode = CODE_FOR_avx512vl_gathersiv4di;
37541 goto gather_gen;
37542 case IX86_BUILTIN_GATHER3DIV2DI:
37543 icode = CODE_FOR_avx512vl_gatherdiv2di;
37544 goto gather_gen;
37545 case IX86_BUILTIN_GATHER3DIV4DI:
37546 icode = CODE_FOR_avx512vl_gatherdiv4di;
37547 goto gather_gen;
37548 case IX86_BUILTIN_GATHER3SIV4SI:
37549 icode = CODE_FOR_avx512vl_gathersiv4si;
37550 goto gather_gen;
37551 case IX86_BUILTIN_GATHER3SIV8SI:
37552 icode = CODE_FOR_avx512vl_gathersiv8si;
37553 goto gather_gen;
37554 case IX86_BUILTIN_GATHER3DIV4SI:
37555 icode = CODE_FOR_avx512vl_gatherdiv4si;
37556 goto gather_gen;
37557 case IX86_BUILTIN_GATHER3DIV8SI:
37558 icode = CODE_FOR_avx512vl_gatherdiv8si;
37559 goto gather_gen;
37560 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37561 icode = CODE_FOR_avx512vl_gathersiv4df;
37562 goto gather_gen;
37563 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37564 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37565 goto gather_gen;
37566 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37567 icode = CODE_FOR_avx512vl_gathersiv4di;
37568 goto gather_gen;
37569 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37570 icode = CODE_FOR_avx512vl_gatherdiv8si;
37571 goto gather_gen;
37572 case IX86_BUILTIN_SCATTERSIV16SF:
37573 icode = CODE_FOR_avx512f_scattersiv16sf;
37574 goto scatter_gen;
37575 case IX86_BUILTIN_SCATTERSIV8DF:
37576 icode = CODE_FOR_avx512f_scattersiv8df;
37577 goto scatter_gen;
37578 case IX86_BUILTIN_SCATTERDIV16SF:
37579 icode = CODE_FOR_avx512f_scatterdiv16sf;
37580 goto scatter_gen;
37581 case IX86_BUILTIN_SCATTERDIV8DF:
37582 icode = CODE_FOR_avx512f_scatterdiv8df;
37583 goto scatter_gen;
37584 case IX86_BUILTIN_SCATTERSIV16SI:
37585 icode = CODE_FOR_avx512f_scattersiv16si;
37586 goto scatter_gen;
37587 case IX86_BUILTIN_SCATTERSIV8DI:
37588 icode = CODE_FOR_avx512f_scattersiv8di;
37589 goto scatter_gen;
37590 case IX86_BUILTIN_SCATTERDIV16SI:
37591 icode = CODE_FOR_avx512f_scatterdiv16si;
37592 goto scatter_gen;
37593 case IX86_BUILTIN_SCATTERDIV8DI:
37594 icode = CODE_FOR_avx512f_scatterdiv8di;
37595 goto scatter_gen;
37596 case IX86_BUILTIN_SCATTERSIV8SF:
37597 icode = CODE_FOR_avx512vl_scattersiv8sf;
37598 goto scatter_gen;
37599 case IX86_BUILTIN_SCATTERSIV4SF:
37600 icode = CODE_FOR_avx512vl_scattersiv4sf;
37601 goto scatter_gen;
37602 case IX86_BUILTIN_SCATTERSIV4DF:
37603 icode = CODE_FOR_avx512vl_scattersiv4df;
37604 goto scatter_gen;
37605 case IX86_BUILTIN_SCATTERSIV2DF:
37606 icode = CODE_FOR_avx512vl_scattersiv2df;
37607 goto scatter_gen;
37608 case IX86_BUILTIN_SCATTERDIV8SF:
37609 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37610 goto scatter_gen;
37611 case IX86_BUILTIN_SCATTERDIV4SF:
37612 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37613 goto scatter_gen;
37614 case IX86_BUILTIN_SCATTERDIV4DF:
37615 icode = CODE_FOR_avx512vl_scatterdiv4df;
37616 goto scatter_gen;
37617 case IX86_BUILTIN_SCATTERDIV2DF:
37618 icode = CODE_FOR_avx512vl_scatterdiv2df;
37619 goto scatter_gen;
37620 case IX86_BUILTIN_SCATTERSIV8SI:
37621 icode = CODE_FOR_avx512vl_scattersiv8si;
37622 goto scatter_gen;
37623 case IX86_BUILTIN_SCATTERSIV4SI:
37624 icode = CODE_FOR_avx512vl_scattersiv4si;
37625 goto scatter_gen;
37626 case IX86_BUILTIN_SCATTERSIV4DI:
37627 icode = CODE_FOR_avx512vl_scattersiv4di;
37628 goto scatter_gen;
37629 case IX86_BUILTIN_SCATTERSIV2DI:
37630 icode = CODE_FOR_avx512vl_scattersiv2di;
37631 goto scatter_gen;
37632 case IX86_BUILTIN_SCATTERDIV8SI:
37633 icode = CODE_FOR_avx512vl_scatterdiv8si;
37634 goto scatter_gen;
37635 case IX86_BUILTIN_SCATTERDIV4SI:
37636 icode = CODE_FOR_avx512vl_scatterdiv4si;
37637 goto scatter_gen;
37638 case IX86_BUILTIN_SCATTERDIV4DI:
37639 icode = CODE_FOR_avx512vl_scatterdiv4di;
37640 goto scatter_gen;
37641 case IX86_BUILTIN_SCATTERDIV2DI:
37642 icode = CODE_FOR_avx512vl_scatterdiv2di;
37643 goto scatter_gen;
37644 case IX86_BUILTIN_GATHERPFDPD:
37645 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37646 goto vec_prefetch_gen;
37647 case IX86_BUILTIN_SCATTERALTSIV8DF:
37648 icode = CODE_FOR_avx512f_scattersiv8df;
37649 goto scatter_gen;
37650 case IX86_BUILTIN_SCATTERALTDIV16SF:
37651 icode = CODE_FOR_avx512f_scatterdiv16sf;
37652 goto scatter_gen;
37653 case IX86_BUILTIN_SCATTERALTSIV8DI:
37654 icode = CODE_FOR_avx512f_scattersiv8di;
37655 goto scatter_gen;
37656 case IX86_BUILTIN_SCATTERALTDIV16SI:
37657 icode = CODE_FOR_avx512f_scatterdiv16si;
37658 goto scatter_gen;
37659 case IX86_BUILTIN_GATHERPFDPS:
37660 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37661 goto vec_prefetch_gen;
37662 case IX86_BUILTIN_GATHERPFQPD:
37663 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37664 goto vec_prefetch_gen;
37665 case IX86_BUILTIN_GATHERPFQPS:
37666 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37667 goto vec_prefetch_gen;
37668 case IX86_BUILTIN_SCATTERPFDPD:
37669 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37670 goto vec_prefetch_gen;
37671 case IX86_BUILTIN_SCATTERPFDPS:
37672 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37673 goto vec_prefetch_gen;
37674 case IX86_BUILTIN_SCATTERPFQPD:
37675 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37676 goto vec_prefetch_gen;
37677 case IX86_BUILTIN_SCATTERPFQPS:
37678 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37679 goto vec_prefetch_gen;
37681 gather_gen:
37682 rtx half;
37683 rtx (*gen) (rtx, rtx);
37685 arg0 = CALL_EXPR_ARG (exp, 0);
37686 arg1 = CALL_EXPR_ARG (exp, 1);
37687 arg2 = CALL_EXPR_ARG (exp, 2);
37688 arg3 = CALL_EXPR_ARG (exp, 3);
37689 arg4 = CALL_EXPR_ARG (exp, 4);
37690 op0 = expand_normal (arg0);
37691 op1 = expand_normal (arg1);
37692 op2 = expand_normal (arg2);
37693 op3 = expand_normal (arg3);
37694 op4 = expand_normal (arg4);
37695 /* Note the arg order is different from the operand order. */
37696 mode0 = insn_data[icode].operand[1].mode;
37697 mode2 = insn_data[icode].operand[3].mode;
37698 mode3 = insn_data[icode].operand[4].mode;
37699 mode4 = insn_data[icode].operand[5].mode;
37701 if (target == NULL_RTX
37702 || GET_MODE (target) != insn_data[icode].operand[0].mode
37703 || !insn_data[icode].operand[0].predicate (target,
37704 GET_MODE (target)))
37705 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37706 else
37707 subtarget = target;
37709 switch (fcode)
37711 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37712 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37713 half = gen_reg_rtx (V8SImode);
37714 if (!nonimmediate_operand (op2, V16SImode))
37715 op2 = copy_to_mode_reg (V16SImode, op2);
37716 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37717 op2 = half;
37718 break;
37719 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37720 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37721 case IX86_BUILTIN_GATHERALTSIV4DF:
37722 case IX86_BUILTIN_GATHERALTSIV4DI:
37723 half = gen_reg_rtx (V4SImode);
37724 if (!nonimmediate_operand (op2, V8SImode))
37725 op2 = copy_to_mode_reg (V8SImode, op2);
37726 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37727 op2 = half;
37728 break;
37729 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37730 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37731 half = gen_reg_rtx (mode0);
37732 if (mode0 == V8SFmode)
37733 gen = gen_vec_extract_lo_v16sf;
37734 else
37735 gen = gen_vec_extract_lo_v16si;
37736 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37737 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37738 emit_insn (gen (half, op0));
37739 op0 = half;
37740 if (GET_MODE (op3) != VOIDmode)
37742 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37743 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37744 emit_insn (gen (half, op3));
37745 op3 = half;
37747 break;
37748 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37749 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37750 case IX86_BUILTIN_GATHERALTDIV8SF:
37751 case IX86_BUILTIN_GATHERALTDIV8SI:
37752 half = gen_reg_rtx (mode0);
37753 if (mode0 == V4SFmode)
37754 gen = gen_vec_extract_lo_v8sf;
37755 else
37756 gen = gen_vec_extract_lo_v8si;
37757 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37758 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37759 emit_insn (gen (half, op0));
37760 op0 = half;
37761 if (GET_MODE (op3) != VOIDmode)
37763 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37764 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37765 emit_insn (gen (half, op3));
37766 op3 = half;
37768 break;
37769 default:
37770 break;
37773 /* Force memory operand only with base register here. But we
37774 don't want to do it on memory operand for other builtin
37775 functions. */
37776 op1 = ix86_zero_extend_to_Pmode (op1);
37778 if (!insn_data[icode].operand[1].predicate (op0, mode0))
37779 op0 = copy_to_mode_reg (mode0, op0);
37780 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
37781 op1 = copy_to_mode_reg (Pmode, op1);
37782 if (!insn_data[icode].operand[3].predicate (op2, mode2))
37783 op2 = copy_to_mode_reg (mode2, op2);
37785 op3 = fixup_modeless_constant (op3, mode3);
37787 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
37789 if (!insn_data[icode].operand[4].predicate (op3, mode3))
37790 op3 = copy_to_mode_reg (mode3, op3);
37792 else
37794 op3 = copy_to_reg (op3);
37795 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
37797 if (!insn_data[icode].operand[5].predicate (op4, mode4))
37799 error ("the last argument must be scale 1, 2, 4, 8");
37800 return const0_rtx;
37803 /* Optimize. If mask is known to have all high bits set,
37804 replace op0 with pc_rtx to signal that the instruction
37805 overwrites the whole destination and doesn't use its
37806 previous contents. */
37807 if (optimize)
37809 if (TREE_CODE (arg3) == INTEGER_CST)
37811 if (integer_all_onesp (arg3))
37812 op0 = pc_rtx;
37814 else if (TREE_CODE (arg3) == VECTOR_CST)
37816 unsigned int negative = 0;
37817 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
37819 tree cst = VECTOR_CST_ELT (arg3, i);
37820 if (TREE_CODE (cst) == INTEGER_CST
37821 && tree_int_cst_sign_bit (cst))
37822 negative++;
37823 else if (TREE_CODE (cst) == REAL_CST
37824 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
37825 negative++;
37827 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
37828 op0 = pc_rtx;
37830 else if (TREE_CODE (arg3) == SSA_NAME
37831 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
37833 /* Recognize also when mask is like:
37834 __v2df src = _mm_setzero_pd ();
37835 __v2df mask = _mm_cmpeq_pd (src, src);
37837 __v8sf src = _mm256_setzero_ps ();
37838 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
37839 as that is a cheaper way to load all ones into
37840 a register than having to load a constant from
37841 memory. */
37842 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
37843 if (is_gimple_call (def_stmt))
37845 tree fndecl = gimple_call_fndecl (def_stmt);
37846 if (fndecl
37847 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
37848 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
37850 case IX86_BUILTIN_CMPPD:
37851 case IX86_BUILTIN_CMPPS:
37852 case IX86_BUILTIN_CMPPD256:
37853 case IX86_BUILTIN_CMPPS256:
37854 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
37855 break;
37856 /* FALLTHRU */
37857 case IX86_BUILTIN_CMPEQPD:
37858 case IX86_BUILTIN_CMPEQPS:
37859 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
37860 && initializer_zerop (gimple_call_arg (def_stmt,
37861 1)))
37862 op0 = pc_rtx;
37863 break;
37864 default:
37865 break;
37871 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37872 if (! pat)
37873 return const0_rtx;
37874 emit_insn (pat);
37876 switch (fcode)
37878 case IX86_BUILTIN_GATHER3DIV16SF:
37879 if (target == NULL_RTX)
37880 target = gen_reg_rtx (V8SFmode);
37881 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37882 break;
37883 case IX86_BUILTIN_GATHER3DIV16SI:
37884 if (target == NULL_RTX)
37885 target = gen_reg_rtx (V8SImode);
37886 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37887 break;
37888 case IX86_BUILTIN_GATHER3DIV8SF:
37889 case IX86_BUILTIN_GATHERDIV8SF:
37890 if (target == NULL_RTX)
37891 target = gen_reg_rtx (V4SFmode);
37892 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37893 break;
37894 case IX86_BUILTIN_GATHER3DIV8SI:
37895 case IX86_BUILTIN_GATHERDIV8SI:
37896 if (target == NULL_RTX)
37897 target = gen_reg_rtx (V4SImode);
37898 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37899 break;
37900 default:
37901 target = subtarget;
37902 break;
37904 return target;
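/* Illustrative sketch, for exposition only: the masked gather expanded
   above behaves element-wise roughly like the scalar loop below, shown
   for a hypothetical V8SF gather with 32-bit indices and a float vector
   mask whose sign bits select the active elements.  The names here are
   assumptions for illustration, not back-end interfaces.  */
static void
gather_v8sf_sketch (float dst[8], const float prev[8], const char *base,
                    const int idx[8], const float mask[8], int scale)
{
  for (int i = 0; i < 8; i++)
    /* Load only where the mask sign bit is set; elsewhere keep the
       previous destination contents (unless the mask is known to be
       all-ones, in which case op0 is replaced by pc_rtx above).  */
    dst[i] = __builtin_signbit (mask[i])
             ? *(const float *) (base + (long) idx[i] * scale)
             : prev[i];
}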
37906 scatter_gen:
37907 arg0 = CALL_EXPR_ARG (exp, 0);
37908 arg1 = CALL_EXPR_ARG (exp, 1);
37909 arg2 = CALL_EXPR_ARG (exp, 2);
37910 arg3 = CALL_EXPR_ARG (exp, 3);
37911 arg4 = CALL_EXPR_ARG (exp, 4);
37912 op0 = expand_normal (arg0);
37913 op1 = expand_normal (arg1);
37914 op2 = expand_normal (arg2);
37915 op3 = expand_normal (arg3);
37916 op4 = expand_normal (arg4);
37917 mode1 = insn_data[icode].operand[1].mode;
37918 mode2 = insn_data[icode].operand[2].mode;
37919 mode3 = insn_data[icode].operand[3].mode;
37920 mode4 = insn_data[icode].operand[4].mode;
37922 /* Scatter instruction stores operand op3 to memory with
37923 indices from op2 and scale from op4 under writemask op1.
37924 If index operand op2 has more elements than source operand
37925 op3, one needs to use only its low half. And vice versa. */
37926 switch (fcode)
37928 case IX86_BUILTIN_SCATTERALTSIV8DF:
37929 case IX86_BUILTIN_SCATTERALTSIV8DI:
37930 half = gen_reg_rtx (V8SImode);
37931 if (!nonimmediate_operand (op2, V16SImode))
37932 op2 = copy_to_mode_reg (V16SImode, op2);
37933 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37934 op2 = half;
37935 break;
37936 case IX86_BUILTIN_SCATTERALTDIV16SF:
37937 case IX86_BUILTIN_SCATTERALTDIV16SI:
37938 half = gen_reg_rtx (mode3);
37939 if (mode3 == V8SFmode)
37940 gen = gen_vec_extract_lo_v16sf;
37941 else
37942 gen = gen_vec_extract_lo_v16si;
37943 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37944 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37945 emit_insn (gen (half, op3));
37946 op3 = half;
37947 break;
37948 default:
37949 break;
37952 /* Force memory operand only with base register here. But we
37953 don't want to do it on memory operand for other builtin
37954 functions. */
37955 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37957 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37958 op0 = copy_to_mode_reg (Pmode, op0);
37960 op1 = fixup_modeless_constant (op1, mode1);
37962 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37964 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37965 op1 = copy_to_mode_reg (mode1, op1);
37967 else
37969 op1 = copy_to_reg (op1);
37970 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37973 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37974 op2 = copy_to_mode_reg (mode2, op2);
37976 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37977 op3 = copy_to_mode_reg (mode3, op3);
37979 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37981 error ("the last argument must be scale 1, 2, 4, 8");
37982 return const0_rtx;
37985 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37986 if (! pat)
37987 return const0_rtx;
37989 emit_insn (pat);
37990 return 0;
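/* Illustrative sketch, for exposition only: element-wise semantics of
   the scatter expanded above, shown for a hypothetical V8DF scatter
   with 32-bit indices under an 8-bit writemask (op1).  The names are
   assumptions for illustration, not back-end interfaces.  */
static void
scatter_v8df_sketch (char *base, unsigned char writemask,
                     const int idx[8], const double src[8], int scale)
{
  for (int i = 0; i < 8; i++)
    if (writemask & (1u << i))
      *(double *) (base + (long) idx[i] * scale) = src[i];
}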
37992 vec_prefetch_gen:
37993 arg0 = CALL_EXPR_ARG (exp, 0);
37994 arg1 = CALL_EXPR_ARG (exp, 1);
37995 arg2 = CALL_EXPR_ARG (exp, 2);
37996 arg3 = CALL_EXPR_ARG (exp, 3);
37997 arg4 = CALL_EXPR_ARG (exp, 4);
37998 op0 = expand_normal (arg0);
37999 op1 = expand_normal (arg1);
38000 op2 = expand_normal (arg2);
38001 op3 = expand_normal (arg3);
38002 op4 = expand_normal (arg4);
38003 mode0 = insn_data[icode].operand[0].mode;
38004 mode1 = insn_data[icode].operand[1].mode;
38005 mode3 = insn_data[icode].operand[3].mode;
38006 mode4 = insn_data[icode].operand[4].mode;
38008 op0 = fixup_modeless_constant (op0, mode0);
38010 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38012 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38013 op0 = copy_to_mode_reg (mode0, op0);
38015 else
38017 op0 = copy_to_reg (op0);
38018 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38021 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38022 op1 = copy_to_mode_reg (mode1, op1);
38024 /* Force memory operand only with base register here. But we
38025 don't want to do it on memory operand for other builtin
38026 functions. */
38027 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38029 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38030 op2 = copy_to_mode_reg (Pmode, op2);
38032 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38034 error ("the forth argument must be scale 1, 2, 4, 8");
38035 return const0_rtx;
38038 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38040 error ("incorrect hint operand");
38041 return const0_rtx;
38044 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38045 if (! pat)
38046 return const0_rtx;
38048 emit_insn (pat);
38050 return 0;
38052 case IX86_BUILTIN_XABORT:
38053 icode = CODE_FOR_xabort;
38054 arg0 = CALL_EXPR_ARG (exp, 0);
38055 op0 = expand_normal (arg0);
38056 mode0 = insn_data[icode].operand[0].mode;
38057 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38059 error ("the xabort's argument must be an 8-bit immediate");
38060 return const0_rtx;
38062 emit_insn (gen_xabort (op0));
38063 return 0;
38065 default:
38066 break;
38069 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38070 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38072 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38073 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38074 target);
38077 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38078 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38080 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38081 switch (fcode)
38083 case IX86_BUILTIN_FABSQ:
38084 case IX86_BUILTIN_COPYSIGNQ:
38085 if (!TARGET_SSE)
38086 /* Emit a normal call if SSE isn't available. */
38087 return expand_call (exp, target, ignore);
38088 /* FALLTHRU */
38089 default:
38090 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38094 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38095 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38097 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38098 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38101 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38102 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38104 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38105 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38108 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38109 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38111 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38112 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38115 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38116 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38118 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38119 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38122 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38123 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38125 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38126 const struct builtin_description *d = bdesc_multi_arg + i;
38127 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38128 (enum ix86_builtin_func_type)
38129 d->flag, d->comparison);
38132 gcc_unreachable ();
38135 /* This returns the target-specific builtin with code CODE if
38136 current_function_decl has visibility on this builtin, which is checked
38137 using isa flags. Returns NULL_TREE otherwise. */
38139 static tree ix86_get_builtin (enum ix86_builtins code)
38141 struct cl_target_option *opts;
38142 tree target_tree = NULL_TREE;
38144 /* Determine the isa flags of current_function_decl. */
38146 if (current_function_decl)
38147 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38149 if (target_tree == NULL)
38150 target_tree = target_option_default_node;
38152 opts = TREE_TARGET_OPTION (target_tree);
38154 if (ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38155 return ix86_builtin_decl (code, true);
38156 else
38157 return NULL_TREE;
38160 /* Return function decl for target specific builtin
38161 for a given MPX builtin passed in FCODE. */
38162 static tree
38163 ix86_builtin_mpx_function (unsigned fcode)
38165 switch (fcode)
38167 case BUILT_IN_CHKP_BNDMK:
38168 return ix86_builtins[IX86_BUILTIN_BNDMK];
38170 case BUILT_IN_CHKP_BNDSTX:
38171 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38173 case BUILT_IN_CHKP_BNDLDX:
38174 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38176 case BUILT_IN_CHKP_BNDCL:
38177 return ix86_builtins[IX86_BUILTIN_BNDCL];
38179 case BUILT_IN_CHKP_BNDCU:
38180 return ix86_builtins[IX86_BUILTIN_BNDCU];
38182 case BUILT_IN_CHKP_BNDRET:
38183 return ix86_builtins[IX86_BUILTIN_BNDRET];
38185 case BUILT_IN_CHKP_INTERSECT:
38186 return ix86_builtins[IX86_BUILTIN_BNDINT];
38188 case BUILT_IN_CHKP_NARROW:
38189 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38191 case BUILT_IN_CHKP_SIZEOF:
38192 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38194 case BUILT_IN_CHKP_EXTRACT_LOWER:
38195 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38197 case BUILT_IN_CHKP_EXTRACT_UPPER:
38198 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38200 default:
38201 return NULL_TREE;
38204 gcc_unreachable ();
38207 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38209 Return an address to be used to load/store bounds for pointer
38210 passed in SLOT.
38212 SLOT_NO is an integer constant holding the number of a target
38213 dependent special slot to be used in case SLOT is not a memory.
38215 SPECIAL_BASE is a pointer to be used as a base of fake address
38216 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38217 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38219 static rtx
38220 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38222 rtx addr = NULL;
38224 /* NULL slot means we pass bounds for pointer not passed to the
38225 function at all. Register slot means we pass pointer in a
38226 register. In both these cases bounds are passed via Bounds
38227 Table. Since we do not have actual pointer stored in memory,
38228 we have to use fake addresses to access Bounds Table. We
38229 start with (special_base - sizeof (void*)) and decrease this
38230 address by pointer size to get addresses for other slots. */
38231 if (!slot || REG_P (slot))
38233 gcc_assert (CONST_INT_P (slot_no));
38234 addr = plus_constant (Pmode, special_base,
38235 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38237 /* If pointer is passed in a memory then its address is used to
38238 access Bounds Table. */
38239 else if (MEM_P (slot))
38241 addr = XEXP (slot, 0);
38242 if (!register_operand (addr, Pmode))
38243 addr = copy_addr_to_reg (addr);
38245 else
38246 gcc_unreachable ();
38248 return addr;
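/* Illustrative sketch, for exposition only: for a register or NULL
   SLOT, the fake Bounds Table address computed above is simply
   SPECIAL_BASE minus (SLOT_NO + 1) pointer-sized slots, so slot 0 maps
   to SPECIAL_BASE[-1], slot 1 to SPECIAL_BASE[-2], and so on.  The
   helper name below is an assumption for illustration only.  */
static void *
bt_fake_slot_address_sketch (char *special_base, long slot_no)
{
  return special_base - (slot_no + 1) * (long) sizeof (void *);
}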
38251 /* Expand pass uses this hook to load bounds for function parameter
38252 PTR passed in SLOT in case its bounds are not passed in a register.
38254 If SLOT is a memory, then bounds are loaded as for regular pointer
38255 loaded from memory. PTR may be NULL in case SLOT is a memory.
38256 In such case value of PTR (if required) may be loaded from SLOT.
38258 If SLOT is NULL or a register then SLOT_NO is an integer constant
38259 holding the number of the target dependent special slot which should be
38260 used to obtain bounds.
38262 Return loaded bounds. */
38264 static rtx
38265 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38267 rtx reg = gen_reg_rtx (BNDmode);
38268 rtx addr;
38270 /* Get address to be used to access Bounds Table. Special slots start
38271 at the location of return address of the current function. */
38272 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38274 /* Load pointer value from a memory if we don't have it. */
38275 if (!ptr)
38277 gcc_assert (MEM_P (slot));
38278 ptr = copy_addr_to_reg (slot);
38281 if (!register_operand (ptr, Pmode))
38282 ptr = ix86_zero_extend_to_Pmode (ptr);
38284 emit_insn (BNDmode == BND64mode
38285 ? gen_bnd64_ldx (reg, addr, ptr)
38286 : gen_bnd32_ldx (reg, addr, ptr));
38288 return reg;
38291 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38292 passed in SLOT in case BOUNDS are not passed in a register.
38294 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38295 stored in memory. PTR may be NULL in case SLOT is a memory.
38296 In such case value of PTR (if required) may be loaded from SLOT.
38298 If SLOT is NULL or a register then SLOT_NO is an integer constant
38299 holding the number of the target dependent special slot which should be
38300 used to store BOUNDS. */
38302 static void
38303 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38305 rtx addr;
38307 /* Get address to be used to access Bounds Table. Special slots start
38308 at the location of return address of a called function. */
38309 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38311 /* Load pointer value from a memory if we don't have it. */
38312 if (!ptr)
38314 gcc_assert (MEM_P (slot));
38315 ptr = copy_addr_to_reg (slot);
38318 if (!register_operand (ptr, Pmode))
38319 ptr = ix86_zero_extend_to_Pmode (ptr);
38321 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38322 if (!register_operand (bounds, BNDmode))
38323 bounds = copy_to_mode_reg (BNDmode, bounds);
38325 emit_insn (BNDmode == BND64mode
38326 ? gen_bnd64_stx (addr, ptr, bounds)
38327 : gen_bnd32_stx (addr, ptr, bounds));
38330 /* Load and return bounds returned by function in SLOT. */
38332 static rtx
38333 ix86_load_returned_bounds (rtx slot)
38335 rtx res;
38337 gcc_assert (REG_P (slot));
38338 res = gen_reg_rtx (BNDmode);
38339 emit_move_insn (res, slot);
38341 return res;
38344 /* Store BOUNDS returned by function into SLOT. */
38346 static void
38347 ix86_store_returned_bounds (rtx slot, rtx bounds)
38349 gcc_assert (REG_P (slot));
38350 emit_move_insn (slot, bounds);
38353 /* Returns a function decl for a vectorized version of the combined function
38354 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38355 if it is not available. */
38357 static tree
38358 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38359 tree type_in)
38361 machine_mode in_mode, out_mode;
38362 int in_n, out_n;
38364 if (TREE_CODE (type_out) != VECTOR_TYPE
38365 || TREE_CODE (type_in) != VECTOR_TYPE)
38366 return NULL_TREE;
38368 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38369 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38370 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38371 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38373 switch (fn)
38375 CASE_CFN_EXP2:
38376 if (out_mode == SFmode && in_mode == SFmode)
38378 if (out_n == 16 && in_n == 16)
38379 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38381 break;
38383 CASE_CFN_IFLOOR:
38384 CASE_CFN_LFLOOR:
38385 CASE_CFN_LLFLOOR:
38386 /* The round insn does not trap on denormals. */
38387 if (flag_trapping_math || !TARGET_ROUND)
38388 break;
38390 if (out_mode == SImode && in_mode == DFmode)
38392 if (out_n == 4 && in_n == 2)
38393 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38394 else if (out_n == 8 && in_n == 4)
38395 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38396 else if (out_n == 16 && in_n == 8)
38397 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38399 if (out_mode == SImode && in_mode == SFmode)
38401 if (out_n == 4 && in_n == 4)
38402 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38403 else if (out_n == 8 && in_n == 8)
38404 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38405 else if (out_n == 16 && in_n == 16)
38406 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38408 break;
38410 CASE_CFN_ICEIL:
38411 CASE_CFN_LCEIL:
38412 CASE_CFN_LLCEIL:
38413 /* The round insn does not trap on denormals. */
38414 if (flag_trapping_math || !TARGET_ROUND)
38415 break;
38417 if (out_mode == SImode && in_mode == DFmode)
38419 if (out_n == 4 && in_n == 2)
38420 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38421 else if (out_n == 8 && in_n == 4)
38422 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38423 else if (out_n == 16 && in_n == 8)
38424 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38426 if (out_mode == SImode && in_mode == SFmode)
38428 if (out_n == 4 && in_n == 4)
38429 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38430 else if (out_n == 8 && in_n == 8)
38431 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38432 else if (out_n == 16 && in_n == 16)
38433 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38435 break;
38437 CASE_CFN_IRINT:
38438 CASE_CFN_LRINT:
38439 CASE_CFN_LLRINT:
38440 if (out_mode == SImode && in_mode == DFmode)
38442 if (out_n == 4 && in_n == 2)
38443 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38444 else if (out_n == 8 && in_n == 4)
38445 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38446 else if (out_n == 16 && in_n == 8)
38447 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38449 if (out_mode == SImode && in_mode == SFmode)
38451 if (out_n == 4 && in_n == 4)
38452 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38453 else if (out_n == 8 && in_n == 8)
38454 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38455 else if (out_n == 16 && in_n == 16)
38456 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38458 break;
38460 CASE_CFN_IROUND:
38461 CASE_CFN_LROUND:
38462 CASE_CFN_LLROUND:
38463 /* The round insn does not trap on denormals. */
38464 if (flag_trapping_math || !TARGET_ROUND)
38465 break;
38467 if (out_mode == SImode && in_mode == DFmode)
38469 if (out_n == 4 && in_n == 2)
38470 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38471 else if (out_n == 8 && in_n == 4)
38472 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38473 else if (out_n == 16 && in_n == 8)
38474 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38476 if (out_mode == SImode && in_mode == SFmode)
38478 if (out_n == 4 && in_n == 4)
38479 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38480 else if (out_n == 8 && in_n == 8)
38481 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38482 else if (out_n == 16 && in_n == 16)
38483 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38485 break;
38487 CASE_CFN_FLOOR:
38488 /* The round insn does not trap on denormals. */
38489 if (flag_trapping_math || !TARGET_ROUND)
38490 break;
38492 if (out_mode == DFmode && in_mode == DFmode)
38494 if (out_n == 2 && in_n == 2)
38495 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38496 else if (out_n == 4 && in_n == 4)
38497 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38498 else if (out_n == 8 && in_n == 8)
38499 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38501 if (out_mode == SFmode && in_mode == SFmode)
38503 if (out_n == 4 && in_n == 4)
38504 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38505 else if (out_n == 8 && in_n == 8)
38506 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38507 else if (out_n == 16 && in_n == 16)
38508 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38510 break;
38512 CASE_CFN_CEIL:
38513 /* The round insn does not trap on denormals. */
38514 if (flag_trapping_math || !TARGET_ROUND)
38515 break;
38517 if (out_mode == DFmode && in_mode == DFmode)
38519 if (out_n == 2 && in_n == 2)
38520 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38521 else if (out_n == 4 && in_n == 4)
38522 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38523 else if (out_n == 8 && in_n == 8)
38524 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38526 if (out_mode == SFmode && in_mode == SFmode)
38528 if (out_n == 4 && in_n == 4)
38529 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38530 else if (out_n == 8 && in_n == 8)
38531 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38532 else if (out_n == 16 && in_n == 16)
38533 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38535 break;
38537 CASE_CFN_TRUNC:
38538 /* The round insn does not trap on denormals. */
38539 if (flag_trapping_math || !TARGET_ROUND)
38540 break;
38542 if (out_mode == DFmode && in_mode == DFmode)
38544 if (out_n == 2 && in_n == 2)
38545 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
38546 else if (out_n == 4 && in_n == 4)
38547 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
38548 else if (out_n == 8 && in_n == 8)
38549 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
38551 if (out_mode == SFmode && in_mode == SFmode)
38553 if (out_n == 4 && in_n == 4)
38554 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
38555 else if (out_n == 8 && in_n == 8)
38556 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
38557 else if (out_n == 16 && in_n == 16)
38558 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38560 break;
38562 CASE_CFN_RINT:
38563 /* The round insn does not trap on denormals. */
38564 if (flag_trapping_math || !TARGET_ROUND)
38565 break;
38567 if (out_mode == DFmode && in_mode == DFmode)
38569 if (out_n == 2 && in_n == 2)
38570 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38571 else if (out_n == 4 && in_n == 4)
38572 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38574 if (out_mode == SFmode && in_mode == SFmode)
38576 if (out_n == 4 && in_n == 4)
38577 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38578 else if (out_n == 8 && in_n == 8)
38579 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38581 break;
38583 CASE_CFN_FMA:
38584 if (out_mode == DFmode && in_mode == DFmode)
38586 if (out_n == 2 && in_n == 2)
38587 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38588 if (out_n == 4 && in_n == 4)
38589 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38591 if (out_mode == SFmode && in_mode == SFmode)
38593 if (out_n == 4 && in_n == 4)
38594 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38595 if (out_n == 8 && in_n == 8)
38596 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38598 break;
38600 default:
38601 break;
38604 /* Dispatch to a handler for a vectorization library. */
38605 if (ix86_veclib_handler)
38606 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38608 return NULL_TREE;
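/* Illustrative sketch, for exposition only: the mapping above lets the
   vectorizer replace scalar libm calls with the matching packed
   builtins.  With -fno-trapping-math and the rounding insns available,
   groups of iterations of a loop like the one below can be turned into
   IX86_BUILTIN_FLOORPD256 calls (or the 128/512-bit variants, depending
   on the chosen vector width).  The function name is an assumption for
   illustration only.  */
static void
floor_loop_sketch (double *out, const double *in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = __builtin_floor (in[i]);
}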
38611 /* Handler for an SVML-style interface to
38612 a library with vectorized intrinsics. */
38614 static tree
38615 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38617 char name[20];
38618 tree fntype, new_fndecl, args;
38619 unsigned arity;
38620 const char *bname;
38621 machine_mode el_mode, in_mode;
38622 int n, in_n;
38624 /* The SVML is suitable for unsafe math only. */
38625 if (!flag_unsafe_math_optimizations)
38626 return NULL_TREE;
38628 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38629 n = TYPE_VECTOR_SUBPARTS (type_out);
38630 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38631 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38632 if (el_mode != in_mode
38633 || n != in_n)
38634 return NULL_TREE;
38636 switch (fn)
38638 CASE_CFN_EXP:
38639 CASE_CFN_LOG:
38640 CASE_CFN_LOG10:
38641 CASE_CFN_POW:
38642 CASE_CFN_TANH:
38643 CASE_CFN_TAN:
38644 CASE_CFN_ATAN:
38645 CASE_CFN_ATAN2:
38646 CASE_CFN_ATANH:
38647 CASE_CFN_CBRT:
38648 CASE_CFN_SINH:
38649 CASE_CFN_SIN:
38650 CASE_CFN_ASINH:
38651 CASE_CFN_ASIN:
38652 CASE_CFN_COSH:
38653 CASE_CFN_COS:
38654 CASE_CFN_ACOSH:
38655 CASE_CFN_ACOS:
38656 if ((el_mode != DFmode || n != 2)
38657 && (el_mode != SFmode || n != 4))
38658 return NULL_TREE;
38659 break;
38661 default:
38662 return NULL_TREE;
38665 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38666 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38668 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38669 strcpy (name, "vmlsLn4");
38670 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38671 strcpy (name, "vmldLn2");
38672 else if (n == 4)
38674 sprintf (name, "vmls%s", bname+10);
38675 name[strlen (name)-1] = '4';
38677 else
38678 sprintf (name, "vmld%s2", bname+10);
38680 /* Convert to uppercase. */
38681 name[4] &= ~0x20;
38683 arity = 0;
38684 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38685 arity++;
38687 if (arity == 1)
38688 fntype = build_function_type_list (type_out, type_in, NULL);
38689 else
38690 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38692 /* Build a function declaration for the vectorized function. */
38693 new_fndecl = build_decl (BUILTINS_LOCATION,
38694 FUNCTION_DECL, get_identifier (name), fntype);
38695 TREE_PUBLIC (new_fndecl) = 1;
38696 DECL_EXTERNAL (new_fndecl) = 1;
38697 DECL_IS_NOVOPS (new_fndecl) = 1;
38698 TREE_READONLY (new_fndecl) = 1;
38700 return new_fndecl;
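/* Illustrative sketch, for exposition only: the SVML name mangling
   performed above, as a standalone helper.  BNAME is the scalar builtin
   name (e.g. "__builtin_sinf"); skipping its 10-character "__builtin_"
   prefix and uppercasing name[4] yields "vmlsSin4" for 4 x float and
   "vmldSin2" for 2 x double.  The log/logf special cases handled above
   are omitted here; sprintf and strlen are already available in this
   file via system.h.  The helper name is an assumption.  */
static void
svml_name_sketch (char name[20], const char *bname, int n)
{
  if (n == 4)
    {
      sprintf (name, "vmls%s", bname + 10);
      name[strlen (name) - 1] = '4';	/* replace the trailing 'f'  */
    }
  else
    sprintf (name, "vmld%s2", bname + 10);
  name[4] &= ~0x20;			/* uppercase the first letter  */
}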
38703 /* Handler for an ACML-style interface to
38704 a library with vectorized intrinsics. */
38706 static tree
38707 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38709 char name[20] = "__vr.._";
38710 tree fntype, new_fndecl, args;
38711 unsigned arity;
38712 const char *bname;
38713 machine_mode el_mode, in_mode;
38714 int n, in_n;
38716 /* The ACML is 64-bit only and suitable for unsafe math only, as
38717 it does not correctly support parts of IEEE with the required
38718 precision such as denormals. */
38719 if (!TARGET_64BIT
38720 || !flag_unsafe_math_optimizations)
38721 return NULL_TREE;
38723 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38724 n = TYPE_VECTOR_SUBPARTS (type_out);
38725 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38726 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38727 if (el_mode != in_mode
38728 || n != in_n)
38729 return NULL_TREE;
38731 switch (fn)
38733 CASE_CFN_SIN:
38734 CASE_CFN_COS:
38735 CASE_CFN_EXP:
38736 CASE_CFN_LOG:
38737 CASE_CFN_LOG2:
38738 CASE_CFN_LOG10:
38739 if (el_mode == DFmode && n == 2)
38741 name[4] = 'd';
38742 name[5] = '2';
38744 else if (el_mode == SFmode && n == 4)
38746 name[4] = 's';
38747 name[5] = '4';
38749 else
38750 return NULL_TREE;
38751 break;
38753 default:
38754 return NULL_TREE;
38757 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38758 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38759 sprintf (name + 7, "%s", bname+10);
38761 arity = 0;
38762 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38763 arity++;
38765 if (arity == 1)
38766 fntype = build_function_type_list (type_out, type_in, NULL);
38767 else
38768 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38770 /* Build a function declaration for the vectorized function. */
38771 new_fndecl = build_decl (BUILTINS_LOCATION,
38772 FUNCTION_DECL, get_identifier (name), fntype);
38773 TREE_PUBLIC (new_fndecl) = 1;
38774 DECL_EXTERNAL (new_fndecl) = 1;
38775 DECL_IS_NOVOPS (new_fndecl) = 1;
38776 TREE_READONLY (new_fndecl) = 1;
38778 return new_fndecl;
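/* Illustrative sketch, for exposition only: the ACML mangling above
   fills in the "__vr.._" template, mapping for example __builtin_sin
   to "__vrd2_sin" and __builtin_sinf to "__vrs4_sinf".  strcpy is
   already available in this file via system.h; the helper name is an
   assumption.  */
static void
acml_name_sketch (char name[20], const char *bname, int n)
{
  strcpy (name, "__vr.._");
  name[4] = (n == 2) ? 'd' : 's';	/* element type: double/single  */
  name[5] = (n == 2) ? '2' : '4';	/* vector width  */
  strcpy (name + 7, bname + 10);	/* append name sans "__builtin_"  */
}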
38781 /* Returns a decl of a function that implements gather load with
38782 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
38783 Return NULL_TREE if it is not available. */
38785 static tree
38786 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38787 const_tree index_type, int scale)
38789 bool si;
38790 enum ix86_builtins code;
38792 if (! TARGET_AVX2)
38793 return NULL_TREE;
38795 if ((TREE_CODE (index_type) != INTEGER_TYPE
38796 && !POINTER_TYPE_P (index_type))
38797 || (TYPE_MODE (index_type) != SImode
38798 && TYPE_MODE (index_type) != DImode))
38799 return NULL_TREE;
38801 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38802 return NULL_TREE;
38804 /* v*gather* insn sign extends index to pointer mode. */
38805 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38806 && TYPE_UNSIGNED (index_type))
38807 return NULL_TREE;
38809 if (scale <= 0
38810 || scale > 8
38811 || (scale & (scale - 1)) != 0)
38812 return NULL_TREE;
38814 si = TYPE_MODE (index_type) == SImode;
38815 switch (TYPE_MODE (mem_vectype))
38817 case V2DFmode:
38818 if (TARGET_AVX512VL)
38819 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38820 else
38821 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38822 break;
38823 case V4DFmode:
38824 if (TARGET_AVX512VL)
38825 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38826 else
38827 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38828 break;
38829 case V2DImode:
38830 if (TARGET_AVX512VL)
38831 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38832 else
38833 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38834 break;
38835 case V4DImode:
38836 if (TARGET_AVX512VL)
38837 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38838 else
38839 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38840 break;
38841 case V4SFmode:
38842 if (TARGET_AVX512VL)
38843 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38844 else
38845 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38846 break;
38847 case V8SFmode:
38848 if (TARGET_AVX512VL)
38849 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38850 else
38851 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38852 break;
38853 case V4SImode:
38854 if (TARGET_AVX512VL)
38855 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38856 else
38857 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38858 break;
38859 case V8SImode:
38860 if (TARGET_AVX512VL)
38861 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38862 else
38863 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38864 break;
38865 case V8DFmode:
38866 if (TARGET_AVX512F)
38867 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38868 else
38869 return NULL_TREE;
38870 break;
38871 case V8DImode:
38872 if (TARGET_AVX512F)
38873 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38874 else
38875 return NULL_TREE;
38876 break;
38877 case V16SFmode:
38878 if (TARGET_AVX512F)
38879 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38880 else
38881 return NULL_TREE;
38882 break;
38883 case V16SImode:
38884 if (TARGET_AVX512F)
38885 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38886 else
38887 return NULL_TREE;
38888 break;
38889 default:
38890 return NULL_TREE;
38893 return ix86_get_builtin (code);
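/* Illustrative sketch, for exposition only: the scale test used above
   (and in the scatter hook below) accepts exactly 1, 2, 4 and 8, i.e.
   a positive power of two no larger than 8.  The helper name is an
   assumption for illustration only.  */
static int
valid_gather_scale_sketch (int scale)
{
  return scale > 0 && scale <= 8 && (scale & (scale - 1)) == 0;
}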
38896 /* Returns a decl of a function that implements scatter store with
38897 register type VECTYPE and index type INDEX_TYPE and SCALE.
38898 Return NULL_TREE if it is not available. */
38900 static tree
38901 ix86_vectorize_builtin_scatter (const_tree vectype,
38902 const_tree index_type, int scale)
38904 bool si;
38905 enum ix86_builtins code;
38907 if (!TARGET_AVX512F)
38908 return NULL_TREE;
38910 if ((TREE_CODE (index_type) != INTEGER_TYPE
38911 && !POINTER_TYPE_P (index_type))
38912 || (TYPE_MODE (index_type) != SImode
38913 && TYPE_MODE (index_type) != DImode))
38914 return NULL_TREE;
38916 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38917 return NULL_TREE;
38919 /* v*scatter* insn sign extends index to pointer mode. */
38920 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38921 && TYPE_UNSIGNED (index_type))
38922 return NULL_TREE;
38924 /* Scale can be 1, 2, 4 or 8. */
38925 if (scale <= 0
38926 || scale > 8
38927 || (scale & (scale - 1)) != 0)
38928 return NULL_TREE;
38930 si = TYPE_MODE (index_type) == SImode;
38931 switch (TYPE_MODE (vectype))
38933 case V8DFmode:
38934 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38935 break;
38936 case V8DImode:
38937 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38938 break;
38939 case V16SFmode:
38940 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38941 break;
38942 case V16SImode:
38943 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38944 break;
38945 default:
38946 return NULL_TREE;
38949 return ix86_builtins[code];
38952 /* Return true if it is safe to use the rsqrt optabs to optimize
38953 1.0/sqrt. */
38955 static bool
38956 use_rsqrt_p ()
38958 return (TARGET_SSE_MATH
38959 && flag_finite_math_only
38960 && !flag_trapping_math
38961 && flag_unsafe_math_optimizations);
38964 /* Returns a code for a target-specific builtin that implements
38965 reciprocal of the function, or NULL_TREE if not available. */
38967 static tree
38968 ix86_builtin_reciprocal (tree fndecl)
38970 switch (DECL_FUNCTION_CODE (fndecl))
38972 /* Vectorized version of sqrt to rsqrt conversion. */
38973 case IX86_BUILTIN_SQRTPS_NR:
38974 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38976 case IX86_BUILTIN_SQRTPS_NR256:
38977 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38979 default:
38980 return NULL_TREE;
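/* Illustrative sketch, for exposition only: the "_NR" builtins selected
   above pair the hardware reciprocal square root estimate with one
   Newton-Raphson refinement step, roughly as below.  This is only valid
   under the finite, non-trapping, unsafe-math conditions tested in
   use_rsqrt_p, since the estimate is inexact and ignores IEEE corner
   cases.  The helper name is an assumption.  */
static float
rsqrt_nr_step_sketch (float a, float estimate)
{
  /* x1 = x0 * (1.5 - 0.5 * a * x0 * x0)  */
  return estimate * (1.5f - 0.5f * a * estimate * estimate);
}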
38984 /* Helper for avx_vpermilps256_operand et al. This is also used by
38985 the expansion functions to turn the parallel back into a mask.
38986 The return value is 0 for no match and the imm8+1 for a match. */
38989 avx_vpermilp_parallel (rtx par, machine_mode mode)
38991 unsigned i, nelt = GET_MODE_NUNITS (mode);
38992 unsigned mask = 0;
38993 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38995 if (XVECLEN (par, 0) != (int) nelt)
38996 return 0;
38998 /* Validate that all of the elements are constants, and not totally
38999 out of range. Copy the data into an integral array to make the
39000 subsequent checks easier. */
39001 for (i = 0; i < nelt; ++i)
39003 rtx er = XVECEXP (par, 0, i);
39004 unsigned HOST_WIDE_INT ei;
39006 if (!CONST_INT_P (er))
39007 return 0;
39008 ei = INTVAL (er);
39009 if (ei >= nelt)
39010 return 0;
39011 ipar[i] = ei;
39014 switch (mode)
39016 case V8DFmode:
39017 /* In the 512-bit DFmode case, we can only move elements within
39018 a 128-bit lane. First fill the second part of the mask,
39019 then fallthru. */
39020 for (i = 4; i < 6; ++i)
39022 if (ipar[i] < 4 || ipar[i] >= 6)
39023 return 0;
39024 mask |= (ipar[i] - 4) << i;
39026 for (i = 6; i < 8; ++i)
39028 if (ipar[i] < 6)
39029 return 0;
39030 mask |= (ipar[i] - 6) << i;
39032 /* FALLTHRU */
39034 case V4DFmode:
39035 /* In the 256-bit DFmode case, we can only move elements within
39036 a 128-bit lane. */
39037 for (i = 0; i < 2; ++i)
39039 if (ipar[i] >= 2)
39040 return 0;
39041 mask |= ipar[i] << i;
39043 for (i = 2; i < 4; ++i)
39045 if (ipar[i] < 2)
39046 return 0;
39047 mask |= (ipar[i] - 2) << i;
39049 break;
39051 case V16SFmode:
39052 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39053 must mirror the permutation in the lower 256 bits. */
39054 for (i = 0; i < 8; ++i)
39055 if (ipar[i] + 8 != ipar[i + 8])
39056 return 0;
39057 /* FALLTHRU */
39059 case V8SFmode:
39060 /* In the 256-bit SFmode case, we have full freedom of
39061 movement within the low 128-bit lane, but the high 128-bit
39062 lane must mirror the exact same pattern. */
39063 for (i = 0; i < 4; ++i)
39064 if (ipar[i] + 4 != ipar[i + 4])
39065 return 0;
39066 nelt = 4;
39067 /* FALLTHRU */
39069 case V2DFmode:
39070 case V4SFmode:
39071 /* In the 128-bit case, we've full freedom in the placement of
39072 the elements from the source operand. */
39073 for (i = 0; i < nelt; ++i)
39074 mask |= ipar[i] << (i * (nelt / 2));
39075 break;
39077 default:
39078 gcc_unreachable ();
39081 /* Make sure success has a non-zero value by adding one. */
39082 return mask + 1;
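/* Illustrative sketch, for exposition only: for the 128-bit V4SF case
   handled above, the reconstructed imm8 packs one 2-bit source selector
   per destination element, so the selector {3, 2, 1, 0} (reverse the
   vector) encodes as 0x1b.  The helper name is an assumption.  */
static unsigned
vpermilps_v4sf_imm8_sketch (const unsigned char sel[4])
{
  unsigned mask = 0;
  for (unsigned i = 0; i < 4; i++)
    mask |= (sel[i] & 3u) << (i * 2);
  return mask;
}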
39085 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39086 the expansion functions to turn the parallel back into a mask.
39087 The return value is 0 for no match and the imm8+1 for a match. */
39090 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39092 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39093 unsigned mask = 0;
39094 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39096 if (XVECLEN (par, 0) != (int) nelt)
39097 return 0;
39099 /* Validate that all of the elements are constants, and not totally
39100 out of range. Copy the data into an integral array to make the
39101 subsequent checks easier. */
39102 for (i = 0; i < nelt; ++i)
39104 rtx er = XVECEXP (par, 0, i);
39105 unsigned HOST_WIDE_INT ei;
39107 if (!CONST_INT_P (er))
39108 return 0;
39109 ei = INTVAL (er);
39110 if (ei >= 2 * nelt)
39111 return 0;
39112 ipar[i] = ei;
39115 /* Validate that the halves of the permute are halves. */
39116 for (i = 0; i < nelt2 - 1; ++i)
39117 if (ipar[i] + 1 != ipar[i + 1])
39118 return 0;
39119 for (i = nelt2; i < nelt - 1; ++i)
39120 if (ipar[i] + 1 != ipar[i + 1])
39121 return 0;
39123 /* Reconstruct the mask. */
39124 for (i = 0; i < 2; ++i)
39126 unsigned e = ipar[i * nelt2];
39127 if (e % nelt2)
39128 return 0;
39129 e /= nelt2;
39130 mask |= e << (i * 4);
39133 /* Make sure success has a non-zero value by adding one. */
39134 return mask + 1;
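/* Illustrative sketch, for exposition only: the imm8 reconstructed
   above holds one 128-bit source-lane selector per result half, the low
   half in bits 0-1 and the high half in bits 4-5 (lanes 0-1 come from
   the first operand, 2-3 from the second).  For V4DF, swapping the two
   halves of the first operand selects lanes {1, 0}, i.e. imm8 0x01.
   The helper name is an assumption.  */
static unsigned
vperm2f128_imm8_sketch (unsigned low_lane, unsigned high_lane)
{
  return (low_lane & 3u) | ((high_lane & 3u) << 4);
}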
39137 /* Return a register priority for hard reg REGNO. */
39138 static int
39139 ix86_register_priority (int hard_regno)
39141 /* ebp and r13 as the base always want a displacement, r12 as the
39142 base always wants an index. So discourage their usage in an
39143 address. */
39144 if (hard_regno == R12_REG || hard_regno == R13_REG)
39145 return 0;
39146 if (hard_regno == BP_REG)
39147 return 1;
39148 /* New x86-64 int registers result in bigger code size. Discourage
39149 them. */
39150 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39151 return 2;
39152 /* New x86-64 SSE registers result in bigger code size. Discourage
39153 them. */
39154 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39155 return 2;
39156 /* Usage of AX register results in smaller code. Prefer it. */
39157 if (hard_regno == AX_REG)
39158 return 4;
39159 return 3;
39162 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39164 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39165 QImode must go into class Q_REGS.
39166 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39167 movdf to do mem-to-mem moves through integer regs. */
39169 static reg_class_t
39170 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39172 machine_mode mode = GET_MODE (x);
39174 /* We're only allowed to return a subclass of CLASS. Many of the
39175 following checks fail for NO_REGS, so eliminate that early. */
39176 if (regclass == NO_REGS)
39177 return NO_REGS;
39179 /* All classes can load zeros. */
39180 if (x == CONST0_RTX (mode))
39181 return regclass;
39183 /* Force constants into memory if we are loading a (nonzero) constant into
39184 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39185 instructions to load from a constant. */
39186 if (CONSTANT_P (x)
39187 && (MAYBE_MMX_CLASS_P (regclass)
39188 || MAYBE_SSE_CLASS_P (regclass)
39189 || MAYBE_MASK_CLASS_P (regclass)))
39190 return NO_REGS;
39192 /* Floating-point constants need more complex checks. */
39193 if (CONST_DOUBLE_P (x))
39195 /* General regs can load everything. */
39196 if (INTEGER_CLASS_P (regclass))
39197 return regclass;
39199 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39200 zero above. We only want to wind up preferring 80387 registers if
39201 we plan on doing computation with them. */
39202 if (IS_STACK_MODE (mode)
39203 && standard_80387_constant_p (x) > 0)
39205 /* Limit class to FP regs. */
39206 if (FLOAT_CLASS_P (regclass))
39207 return FLOAT_REGS;
39208 else if (regclass == FP_TOP_SSE_REGS)
39209 return FP_TOP_REG;
39210 else if (regclass == FP_SECOND_SSE_REGS)
39211 return FP_SECOND_REG;
39214 return NO_REGS;
39217 /* Prefer SSE regs only, if we can use them for math. */
39218 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39219 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39221 /* Generally when we see PLUS here, it's the function invariant
39222 (plus soft-fp const_int), which can only be computed into general
39223 regs. */
39224 if (GET_CODE (x) == PLUS)
39225 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39227 /* QImode constants are easy to load, but non-constant QImode data
39228 must go into Q_REGS. */
39229 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39231 if (Q_CLASS_P (regclass))
39232 return regclass;
39233 else if (reg_class_subset_p (Q_REGS, regclass))
39234 return Q_REGS;
39235 else
39236 return NO_REGS;
39239 return regclass;
39242 /* Discourage putting floating-point values in SSE registers unless
39243 SSE math is being used, and likewise for the 387 registers. */
39244 static reg_class_t
39245 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39247 machine_mode mode = GET_MODE (x);
39249 /* Restrict the output reload class to the register bank that we are doing
39250 math on. If we would like not to return a subset of CLASS, reject this
39251 alternative: if reload cannot do this, it will still use its choice. */
39252 mode = GET_MODE (x);
39253 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39254 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39256 if (IS_STACK_MODE (mode))
39258 if (regclass == FP_TOP_SSE_REGS)
39259 return FP_TOP_REG;
39260 else if (regclass == FP_SECOND_SSE_REGS)
39261 return FP_SECOND_REG;
39262 else
39263 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39266 return regclass;
39269 static reg_class_t
39270 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39271 machine_mode mode, secondary_reload_info *sri)
39273 /* Double-word spills from general registers to non-offsettable memory
39274 references (zero-extended addresses) require special handling. */
39275 if (TARGET_64BIT
39276 && MEM_P (x)
39277 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39278 && INTEGER_CLASS_P (rclass)
39279 && !offsettable_memref_p (x))
39281 sri->icode = (in_p
39282 ? CODE_FOR_reload_noff_load
39283 : CODE_FOR_reload_noff_store);
39284 /* Add the cost of moving address to a temporary. */
39285 sri->extra_cost = 1;
39287 return NO_REGS;
39290 /* QImode spills from non-QI registers require an
39291 intermediate register on 32-bit targets. */
39292 if (mode == QImode
39293 && (MAYBE_MASK_CLASS_P (rclass)
39294 || (!TARGET_64BIT && !in_p
39295 && INTEGER_CLASS_P (rclass)
39296 && MAYBE_NON_Q_CLASS_P (rclass))))
39298 int regno;
39300 if (REG_P (x))
39301 regno = REGNO (x);
39302 else
39303 regno = -1;
39305 if (regno >= FIRST_PSEUDO_REGISTER || SUBREG_P (x))
39306 regno = true_regnum (x);
39308 /* Return Q_REGS if the operand is in memory. */
39309 if (regno == -1)
39310 return Q_REGS;
39313 /* This condition handles a corner case where an expression involving
39314 pointers gets vectorized. We're trying to use the address of a
39315 stack slot as a vector initializer.
39317 (set (reg:V2DI 74 [ vect_cst_.2 ])
39318 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39320 Eventually frame gets turned into sp+offset like this:
39322 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39323 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39324 (const_int 392 [0x188]))))
39326 That later gets turned into:
39328 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39329 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39330 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39332 We'll have the following reload recorded:
39334 Reload 0: reload_in (DI) =
39335 (plus:DI (reg/f:DI 7 sp)
39336 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39337 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39338 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39339 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39340 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39341 reload_reg_rtx: (reg:V2DI 22 xmm1)
39343 Which isn't going to work since SSE instructions can't handle scalar
39344 additions. Returning GENERAL_REGS forces the addition into integer
39345 register and reload can handle subsequent reloads without problems. */
39347 if (in_p && GET_CODE (x) == PLUS
39348 && SSE_CLASS_P (rclass)
39349 && SCALAR_INT_MODE_P (mode))
39350 return GENERAL_REGS;
39352 return NO_REGS;
39355 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39357 static bool
39358 ix86_class_likely_spilled_p (reg_class_t rclass)
39360 switch (rclass)
39362 case AREG:
39363 case DREG:
39364 case CREG:
39365 case BREG:
39366 case AD_REGS:
39367 case SIREG:
39368 case DIREG:
39369 case SSE_FIRST_REG:
39370 case FP_TOP_REG:
39371 case FP_SECOND_REG:
39372 case BND_REGS:
39373 return true;
39375 default:
39376 break;
39379 return false;
39382 /* If we are copying between general and FP registers, we need a memory
39383 location. The same is true for SSE and MMX registers.
39385 To optimize register_move_cost performance, allow inline variant.
39387 The macro can't work reliably when one of the CLASSES is a class containing
39388 registers from multiple units (SSE, MMX, integer). We avoid this by never
39389 combining those units in a single alternative in the machine description.
39390 Ensure that this constraint holds to avoid unexpected surprises.
39392 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
39393 enforce these sanity checks. */
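/* For example, an SFmode copy between %st(0) and %xmm0 has no direct
   instruction and must bounce through a stack slot, which is why a
   FLOAT_CLASS_P vs. SSE_CLASS_P pair is reported below as needing
   secondary memory.  */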
39395 static inline bool
39396 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39397 machine_mode mode, int strict)
39399 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39400 return false;
39401 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39402 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39403 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39404 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39405 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39406 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
39408 gcc_assert (!strict || lra_in_progress);
39409 return true;
39412 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39413 return true;
39415 /* Between mask and general registers, moves are no larger than word size. */
39416 if ((MAYBE_MASK_CLASS_P (class1) != MAYBE_MASK_CLASS_P (class2))
39417 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39418 return true;
39420 /* ??? This is a lie. We do have moves between mmx/general, and for
39421 mmx/sse2. But by saying we need secondary memory we discourage the
39422 register allocator from using the mmx registers unless needed. */
39423 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39424 return true;
39426 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39428 /* SSE1 doesn't have any direct moves from other classes. */
39429 if (!TARGET_SSE2)
39430 return true;
39432 /* If the target says that inter-unit moves are more expensive
39433 than moving through memory, then don't generate them. */
39434 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39435 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39436 return true;
39438 /* Between SSE and general registers, moves are no larger than word size. */
39439 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39440 return true;
39443 return false;
39446 bool
39447 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39448 machine_mode mode, int strict)
39450 return inline_secondary_memory_needed (class1, class2, mode, strict);
39453 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39455 On the 80386, this is the size of MODE in words,
39456 except in the FP regs, where a single reg is always enough. */
39458 static unsigned char
39459 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39461 if (MAYBE_INTEGER_CLASS_P (rclass))
39463 if (mode == XFmode)
39464 return (TARGET_64BIT ? 2 : 3);
39465 else if (mode == XCmode)
39466 return (TARGET_64BIT ? 4 : 6);
39467 else
39468 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39470 else
39472 if (COMPLEX_MODE_P (mode))
39473 return 2;
39474 else
39475 return 1;
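/* For instance, an XFmode value needs 3 word-sized integer registers on
   ia32 (12 bytes) but only 2 on x86-64 (16 bytes), while a single x87
   register always suffices, hence the 1 returned above for non-integer
   classes.  */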
39479 /* Return true if the registers in CLASS cannot represent the change from
39480 modes FROM to TO. */
39482 bool
39483 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
39484 enum reg_class regclass)
39486 if (from == to)
39487 return false;
39489 /* x87 registers can't do subreg at all, as all values are reformatted
39490 to extended precision. */
39491 if (MAYBE_FLOAT_CLASS_P (regclass))
39492 return true;
39494 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39496 /* Vector registers do not support QI or HImode loads. If we don't
39497 disallow a change to these modes, reload will assume it's ok to
39498 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39499 the vec_dupv4hi pattern. */
39500 if (GET_MODE_SIZE (from) < 4)
39501 return true;
39504 return false;
39507 /* Return the cost of moving data of mode M between a
39508 register and memory. A value of 2 is the default; this cost is
39509 relative to those in `REGISTER_MOVE_COST'.
39511 This function is used extensively by register_move_cost, which is used to
39512 build tables at startup, so it is kept inline.
39513 When IN is 2, return the maximum of the in and out move costs.
39515 If moving between registers and memory is more expensive than
39516 between two registers, you should define this macro to express the
39517 relative cost.
39519 Also model the increased cost of moving QImode registers in non-
39520 Q_REGS classes.  */
39522 static inline int
39523 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39524 int in)
39526 int cost;
39527 if (FLOAT_CLASS_P (regclass))
39529 int index;
39530 switch (mode)
39532 case SFmode:
39533 index = 0;
39534 break;
39535 case DFmode:
39536 index = 1;
39537 break;
39538 case XFmode:
39539 index = 2;
39540 break;
39541 default:
39542 return 100;
39544 if (in == 2)
39545 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39546 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39548 if (SSE_CLASS_P (regclass))
39550 int index;
39551 switch (GET_MODE_SIZE (mode))
39553 case 4:
39554 index = 0;
39555 break;
39556 case 8:
39557 index = 1;
39558 break;
39559 case 16:
39560 index = 2;
39561 break;
39562 default:
39563 return 100;
39565 if (in == 2)
39566 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39567 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39569 if (MMX_CLASS_P (regclass))
39571 int index;
39572 switch (GET_MODE_SIZE (mode))
39574 case 4:
39575 index = 0;
39576 break;
39577 case 8:
39578 index = 1;
39579 break;
39580 default:
39581 return 100;
39583 if (in)
39584 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39585 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39587 switch (GET_MODE_SIZE (mode))
39589 case 1:
39590 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39592 if (!in)
39593 return ix86_cost->int_store[0];
39594 if (TARGET_PARTIAL_REG_DEPENDENCY
39595 && optimize_function_for_speed_p (cfun))
39596 cost = ix86_cost->movzbl_load;
39597 else
39598 cost = ix86_cost->int_load[0];
39599 if (in == 2)
39600 return MAX (cost, ix86_cost->int_store[0]);
39601 return cost;
39603 else
39605 if (in == 2)
39606 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39607 if (in)
39608 return ix86_cost->movzbl_load;
39609 else
39610 return ix86_cost->int_store[0] + 4;
39612 break;
39613 case 2:
39614 if (in == 2)
39615 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39616 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39617 default:
39618 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
39619 if (mode == TFmode)
39620 mode = XFmode;
39621 if (in == 2)
39622 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39623 else if (in)
39624 cost = ix86_cost->int_load[2];
39625 else
39626 cost = ix86_cost->int_store[2];
39627 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
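/* E.g. a DImode load into GENERAL_REGS on ia32 takes the default branch
   above: int_load[2] * CEIL (8, 4), i.e. the cost of two 32-bit loads.  */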
39631 static int
39632 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39633 bool in)
39635 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39639 /* Return the cost of moving data from a register in class CLASS1 to
39640 one in class CLASS2.
39642 It is not required that the cost always equal 2 when FROM is the same as TO;
39643 on some machines it is expensive to move between registers if they are not
39644 general registers. */
39646 static int
39647 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39648 reg_class_t class2_i)
39650 enum reg_class class1 = (enum reg_class) class1_i;
39651 enum reg_class class2 = (enum reg_class) class2_i;
39653 /* In case we require secondary memory, compute the cost of the store followed
39654 by the load. In order to avoid bad register allocation choices, we need
39655 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39657 if (inline_secondary_memory_needed (class1, class2, mode, 0))
39659 int cost = 1;
39661 cost += inline_memory_move_cost (mode, class1, 2);
39662 cost += inline_memory_move_cost (mode, class2, 2);
39664 /* When copying from a general-purpose register we may emit multiple
39665 stores followed by a single load, causing a memory size mismatch stall.
39666 Count this as an arbitrarily high cost of 20. */
39667 if (targetm.class_max_nregs (class1, mode)
39668 > targetm.class_max_nregs (class2, mode))
39669 cost += 20;
39671 /* In the case of FP/MMX moves, the registers actually overlap, and we
39672 have to switch modes in order to treat them differently. */
39673 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39674 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39675 cost += 20;
39677 return cost;
39680 /* Moves between SSE/MMX and integer unit are expensive. */
39681 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39682 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39684 /* ??? By keeping the returned value relatively high, we limit the number
39685 of moves between integer and MMX/SSE registers for all targets.
39686 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
39687 where integer modes in MMX/SSE registers are not tieable
39688 because of missing QImode and HImode moves to, from or between
39689 MMX/SSE registers. */
39690 return MAX (8, ix86_cost->mmxsse_to_integer);
39692 if (MAYBE_FLOAT_CLASS_P (class1))
39693 return ix86_cost->fp_move;
39694 if (MAYBE_SSE_CLASS_P (class1))
39695 return ix86_cost->sse_move;
39696 if (MAYBE_MMX_CLASS_P (class1))
39697 return ix86_cost->mmx_move;
39698 return 2;
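/* For example, a TImode copy between GENERAL_REGS and SSE_REGS on x86-64
   takes the secondary-memory path above: the cost of a store plus a load,
   plus 20 more in the general-to-SSE direction, where two word-sized
   stores would feed a single 16-byte load.  */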
39701 /* Return TRUE if hard register REGNO can hold a value of machine-mode
39702 MODE. */
39704 bool
39705 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
39707 /* The flags register can hold only CCmode values, and only it can hold them. */
39708 if (CC_REGNO_P (regno))
39709 return GET_MODE_CLASS (mode) == MODE_CC;
39710 if (GET_MODE_CLASS (mode) == MODE_CC
39711 || GET_MODE_CLASS (mode) == MODE_RANDOM
39712 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39713 return false;
39714 if (STACK_REGNO_P (regno))
39715 return VALID_FP_MODE_P (mode);
39716 if (MASK_REGNO_P (regno))
39717 return (VALID_MASK_REG_MODE (mode)
39718 || (TARGET_AVX512BW
39719 && VALID_MASK_AVX512BW_MODE (mode)));
39720 if (BND_REGNO_P (regno))
39721 return VALID_BND_REG_MODE (mode);
39722 if (SSE_REGNO_P (regno))
39724 /* We implement the move patterns for all vector modes into and
39725 out of SSE registers, even when no operation instructions
39726 are available. */
39728 /* For AVX-512 we allow, regardless of regno:
39729 - XImode
39730 - any 512-bit wide vector mode
39731 - any scalar mode. */
39732 if (TARGET_AVX512F
39733 && (mode == XImode
39734 || VALID_AVX512F_REG_MODE (mode)
39735 || VALID_AVX512F_SCALAR_MODE (mode)))
39736 return true;
39738 /* TODO check for QI/HI scalars. */
39739 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
39740 if (TARGET_AVX512VL
39741 && (mode == OImode
39742 || mode == TImode
39743 || VALID_AVX256_REG_MODE (mode)
39744 || VALID_AVX512VL_128_REG_MODE (mode)))
39745 return true;
39747 /* xmm16-xmm31 are only available for AVX-512. */
39748 if (EXT_REX_SSE_REGNO_P (regno))
39749 return false;
39751 /* OImode and AVX modes are available only when AVX is enabled. */
39752 return ((TARGET_AVX
39753 && VALID_AVX256_REG_OR_OI_MODE (mode))
39754 || VALID_SSE_REG_MODE (mode)
39755 || VALID_SSE2_REG_MODE (mode)
39756 || VALID_MMX_REG_MODE (mode)
39757 || VALID_MMX_REG_MODE_3DNOW (mode));
39759 if (MMX_REGNO_P (regno))
39761 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39762 so if the register is available at all, then we can move data of
39763 the given mode into or out of it. */
39764 return (VALID_MMX_REG_MODE (mode)
39765 || VALID_MMX_REG_MODE_3DNOW (mode));
39768 if (mode == QImode)
39770 /* Take care with QImode values: they can live in non-QI regs,
39771 but then they do cause partial register stalls. */
39772 if (ANY_QI_REGNO_P (regno))
39773 return true;
39774 if (!TARGET_PARTIAL_REG_STALL)
39775 return true;
39776 /* LRA checks if the hard register is OK for the given mode.
39777 QImode values can live in non-QI regs, so we allow all
39778 registers here. */
39779 if (lra_in_progress)
39780 return true;
39781 return !can_create_pseudo_p ();
39783 /* We handle both integer and floats in the general purpose registers. */
39784 else if (VALID_INT_MODE_P (mode))
39785 return true;
39786 else if (VALID_FP_MODE_P (mode))
39787 return true;
39788 else if (VALID_DFP_MODE_P (mode))
39789 return true;
39790 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39791 on to use that value in smaller contexts, this can easily force a
39792 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39793 supporting DImode, allow it. */
39794 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39795 return true;
39797 return false;
39800 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39801 tieable integer mode. */
39803 static bool
39804 ix86_tieable_integer_mode_p (machine_mode mode)
39806 switch (mode)
39808 case HImode:
39809 case SImode:
39810 return true;
39812 case QImode:
39813 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39815 case DImode:
39816 return TARGET_64BIT;
39818 default:
39819 return false;
39823 /* Return true if MODE1 is accessible in a register that can hold MODE2
39824 without copying. That is, all register classes that can hold MODE2
39825 can also hold MODE1. */
39827 bool
39828 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39830 if (mode1 == mode2)
39831 return true;
39833 if (ix86_tieable_integer_mode_p (mode1)
39834 && ix86_tieable_integer_mode_p (mode2))
39835 return true;
39837 /* MODE2 being XFmode implies fp stack or general regs, which means we
39838 can tie any smaller floating point modes to it. Note that we do not
39839 tie this with TFmode. */
39840 if (mode2 == XFmode)
39841 return mode1 == SFmode || mode1 == DFmode;
39843 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39844 that we can tie it with SFmode. */
39845 if (mode2 == DFmode)
39846 return mode1 == SFmode;
39848 /* If MODE2 is only appropriate for an SSE register, then tie with
39849 any other mode acceptable to SSE registers. */
39850 if (GET_MODE_SIZE (mode2) == 32
39851 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39852 return (GET_MODE_SIZE (mode1) == 32
39853 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39854 if (GET_MODE_SIZE (mode2) == 16
39855 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39856 return (GET_MODE_SIZE (mode1) == 16
39857 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39859 /* If MODE2 is appropriate for an MMX register, then tie
39860 with any other mode acceptable to MMX registers. */
39861 if (GET_MODE_SIZE (mode2) == 8
39862 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39863 return (GET_MODE_SIZE (mode1) == 8
39864 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39866 return false;
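/* For instance, V4SFmode and V2DImode are tieable: both are 16 bytes and
   both are valid in %xmm0, so a subreg between them never forces a
   copy.  */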
39869 /* Return the cost of moving between two registers of mode MODE. */
39871 static int
39872 ix86_set_reg_reg_cost (machine_mode mode)
39874 unsigned int units = UNITS_PER_WORD;
39876 switch (GET_MODE_CLASS (mode))
39878 default:
39879 break;
39881 case MODE_CC:
39882 units = GET_MODE_SIZE (CCmode);
39883 break;
39885 case MODE_FLOAT:
39886 if ((TARGET_SSE && mode == TFmode)
39887 || (TARGET_80387 && mode == XFmode)
39888 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39889 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39890 units = GET_MODE_SIZE (mode);
39891 break;
39893 case MODE_COMPLEX_FLOAT:
39894 if ((TARGET_SSE && mode == TCmode)
39895 || (TARGET_80387 && mode == XCmode)
39896 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39897 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39898 units = GET_MODE_SIZE (mode);
39899 break;
39901 case MODE_VECTOR_INT:
39902 case MODE_VECTOR_FLOAT:
39903 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39904 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39905 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39906 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39907 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39908 units = GET_MODE_SIZE (mode);
39911 /* Return the cost of moving between two registers of mode MODE,
39912 assuming that the move will be in pieces of at most UNITS bytes. */
39913 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
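/* E.g. a DImode register-to-register set on ia32 uses the default
   units == UNITS_PER_WORD == 4 and therefore costs COSTS_N_INSNS (2),
   i.e. two 32-bit moves.  */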
39916 /* Compute a (partial) cost for rtx X. Return true if the complete
39917 cost has been computed, and false if subexpressions should be
39918 scanned. In either case, *TOTAL contains the cost result. */
39920 static bool
39921 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39922 int *total, bool speed)
39924 rtx mask;
39925 enum rtx_code code = GET_CODE (x);
39926 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39927 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39929 switch (code)
39931 case SET:
39932 if (register_operand (SET_DEST (x), VOIDmode)
39933 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39935 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39936 return true;
39938 return false;
39940 case CONST_INT:
39941 case CONST:
39942 case LABEL_REF:
39943 case SYMBOL_REF:
39944 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39945 *total = 3;
39946 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39947 *total = 2;
39948 else if (flag_pic && SYMBOLIC_CONST (x)
39949 && !(TARGET_64BIT
39950 && (GET_CODE (x) == LABEL_REF
39951 || (GET_CODE (x) == SYMBOL_REF
39952 && SYMBOL_REF_LOCAL_P (x))))
39953 /* Use 0 cost for CONST to improve its propagation. */
39954 && (TARGET_64BIT || GET_CODE (x) != CONST))
39955 *total = 1;
39956 else
39957 *total = 0;
39958 return true;
39960 case CONST_DOUBLE:
39961 if (IS_STACK_MODE (mode))
39962 switch (standard_80387_constant_p (x))
39964 case -1:
39965 case 0:
39966 break;
39967 case 1: /* 0.0 */
39968 *total = 1;
39969 return true;
39970 default: /* Other constants */
39971 *total = 2;
39972 return true;
39974 /* FALLTHRU */
39976 case CONST_VECTOR:
39977 switch (standard_sse_constant_p (x, mode))
39979 case 0:
39980 break;
39981 case 1: /* 0: xor eliminates false dependency */
39982 *total = 0;
39983 return true;
39984 default: /* -1: cmp contains false dependency */
39985 *total = 1;
39986 return true;
39988 /* FALLTHRU */
39990 case CONST_WIDE_INT:
39991 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39992 it'll probably end up. Add a penalty for size. */
39993 *total = (COSTS_N_INSNS (1)
39994 + (!TARGET_64BIT && flag_pic)
39995 + (GET_MODE_SIZE (mode) <= 4
39996 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39997 return true;
39999 case ZERO_EXTEND:
40000 /* Zero extension is often completely free on x86_64, so make
40001 it as cheap as possible. */
40002 if (TARGET_64BIT && mode == DImode
40003 && GET_MODE (XEXP (x, 0)) == SImode)
40004 *total = 1;
40005 else if (TARGET_ZERO_EXTEND_WITH_AND)
40006 *total = cost->add;
40007 else
40008 *total = cost->movzx;
40009 return false;
40011 case SIGN_EXTEND:
40012 *total = cost->movsx;
40013 return false;
40015 case ASHIFT:
40016 if (SCALAR_INT_MODE_P (mode)
40017 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40018 && CONST_INT_P (XEXP (x, 1)))
40020 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40021 if (value == 1)
40023 *total = cost->add;
40024 return false;
40026 if ((value == 2 || value == 3)
40027 && cost->lea <= cost->shift_const)
40029 *total = cost->lea;
40030 return false;
40033 /* FALLTHRU */
40035 case ROTATE:
40036 case ASHIFTRT:
40037 case LSHIFTRT:
40038 case ROTATERT:
40039 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40041 /* ??? Should be SSE vector operation cost. */
40042 /* At least for published AMD latencies, this really is the same
40043 as the latency for a simple fpu operation like fabs. */
40044 /* V*QImode is emulated with 1-11 insns. */
40045 if (mode == V16QImode || mode == V32QImode)
40047 int count = 11;
40048 if (TARGET_XOP && mode == V16QImode)
40050 /* For XOP we use vpshab, which requires a broadcast of the
40051 value to the variable shift insn. For constants this
40052 means a V16Q const in mem; even when we can perform the
40053 shift with one insn, set the cost to prefer paddb. */
40054 if (CONSTANT_P (XEXP (x, 1)))
40056 *total = (cost->fabs
40057 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
40058 + (speed ? 2 : COSTS_N_BYTES (16)));
40059 return true;
40061 count = 3;
40063 else if (TARGET_SSSE3)
40064 count = 7;
40065 *total = cost->fabs * count;
40067 else
40068 *total = cost->fabs;
40070 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40072 if (CONST_INT_P (XEXP (x, 1)))
40074 if (INTVAL (XEXP (x, 1)) > 32)
40075 *total = cost->shift_const + COSTS_N_INSNS (2);
40076 else
40077 *total = cost->shift_const * 2;
40079 else
40081 if (GET_CODE (XEXP (x, 1)) == AND)
40082 *total = cost->shift_var * 2;
40083 else
40084 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
40087 else
40089 if (CONST_INT_P (XEXP (x, 1)))
40090 *total = cost->shift_const;
40091 else if (SUBREG_P (XEXP (x, 1))
40092 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
40094 /* Return the cost after shift-and truncation. */
40095 *total = cost->shift_var;
40096 return true;
40098 else
40099 *total = cost->shift_var;
40101 return false;
40103 case FMA:
40105 rtx sub;
40107 gcc_assert (FLOAT_MODE_P (mode));
40108 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40110 /* ??? SSE scalar/vector cost should be used here. */
40111 /* ??? Bald assumption that fma has the same cost as fmul. */
40112 *total = cost->fmul;
40113 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40115 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
40116 sub = XEXP (x, 0);
40117 if (GET_CODE (sub) == NEG)
40118 sub = XEXP (sub, 0);
40119 *total += rtx_cost (sub, mode, FMA, 0, speed);
40121 sub = XEXP (x, 2);
40122 if (GET_CODE (sub) == NEG)
40123 sub = XEXP (sub, 0);
40124 *total += rtx_cost (sub, mode, FMA, 2, speed);
40125 return true;
40128 case MULT:
40129 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40131 /* ??? SSE scalar cost should be used here. */
40132 *total = cost->fmul;
40133 return false;
40135 else if (X87_FLOAT_MODE_P (mode))
40137 *total = cost->fmul;
40138 return false;
40140 else if (FLOAT_MODE_P (mode))
40142 /* ??? SSE vector cost should be used here. */
40143 *total = cost->fmul;
40144 return false;
40146 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40148 /* V*QImode is emulated with 7-13 insns. */
40149 if (mode == V16QImode || mode == V32QImode)
40151 int extra = 11;
40152 if (TARGET_XOP && mode == V16QImode)
40153 extra = 5;
40154 else if (TARGET_SSSE3)
40155 extra = 6;
40156 *total = cost->fmul * 2 + cost->fabs * extra;
40158 /* V*DImode is emulated with 5-8 insns. */
40159 else if (mode == V2DImode || mode == V4DImode)
40161 if (TARGET_XOP && mode == V2DImode)
40162 *total = cost->fmul * 2 + cost->fabs * 3;
40163 else
40164 *total = cost->fmul * 3 + cost->fabs * 5;
40166 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40167 insns, including two PMULUDQ. */
40168 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40169 *total = cost->fmul * 2 + cost->fabs * 5;
40170 else
40171 *total = cost->fmul;
40172 return false;
40174 else
40176 rtx op0 = XEXP (x, 0);
40177 rtx op1 = XEXP (x, 1);
40178 int nbits;
40179 if (CONST_INT_P (XEXP (x, 1)))
40181 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40182 for (nbits = 0; value != 0; value &= value - 1)
40183 nbits++;
40185 else
40186 /* This is arbitrary. */
40187 nbits = 7;
40189 /* Compute costs correctly for widening multiplication. */
40190 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40191 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40192 == GET_MODE_SIZE (mode))
40194 int is_mulwiden = 0;
40195 machine_mode inner_mode = GET_MODE (op0);
40197 if (GET_CODE (op0) == GET_CODE (op1))
40198 is_mulwiden = 1, op1 = XEXP (op1, 0);
40199 else if (CONST_INT_P (op1))
40201 if (GET_CODE (op0) == SIGN_EXTEND)
40202 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40203 == INTVAL (op1);
40204 else
40205 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40208 if (is_mulwiden)
40209 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40212 *total = (cost->mult_init[MODE_INDEX (mode)]
40213 + nbits * cost->mult_bit
40214 + rtx_cost (op0, mode, outer_code, opno, speed)
40215 + rtx_cost (op1, mode, outer_code, opno, speed));
40217 return true;
40220 case DIV:
40221 case UDIV:
40222 case MOD:
40223 case UMOD:
40224 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40225 /* ??? SSE cost should be used here. */
40226 *total = cost->fdiv;
40227 else if (X87_FLOAT_MODE_P (mode))
40228 *total = cost->fdiv;
40229 else if (FLOAT_MODE_P (mode))
40230 /* ??? SSE vector cost should be used here. */
40231 *total = cost->fdiv;
40232 else
40233 *total = cost->divide[MODE_INDEX (mode)];
40234 return false;
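/* The PLUS shapes recognized below are those a single lea can compute,
   e.g. (plus (plus (mult reg 4) reg) (const_int 12)), which becomes
   something like leal 12(%ebx,%ecx,4), %eax in AT&T syntax.  */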
40236 case PLUS:
40237 if (GET_MODE_CLASS (mode) == MODE_INT
40238 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40240 if (GET_CODE (XEXP (x, 0)) == PLUS
40241 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40242 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40243 && CONSTANT_P (XEXP (x, 1)))
40245 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40246 if (val == 2 || val == 4 || val == 8)
40248 *total = cost->lea;
40249 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40250 outer_code, opno, speed);
40251 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40252 outer_code, opno, speed);
40253 *total += rtx_cost (XEXP (x, 1), mode,
40254 outer_code, opno, speed);
40255 return true;
40258 else if (GET_CODE (XEXP (x, 0)) == MULT
40259 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40261 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40262 if (val == 2 || val == 4 || val == 8)
40264 *total = cost->lea;
40265 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40266 outer_code, opno, speed);
40267 *total += rtx_cost (XEXP (x, 1), mode,
40268 outer_code, opno, speed);
40269 return true;
40272 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40274 *total = cost->lea;
40275 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40276 outer_code, opno, speed);
40277 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40278 outer_code, opno, speed);
40279 *total += rtx_cost (XEXP (x, 1), mode,
40280 outer_code, opno, speed);
40281 return true;
40284 /* FALLTHRU */
40286 case MINUS:
40287 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40289 /* ??? SSE cost should be used here. */
40290 *total = cost->fadd;
40291 return false;
40293 else if (X87_FLOAT_MODE_P (mode))
40295 *total = cost->fadd;
40296 return false;
40298 else if (FLOAT_MODE_P (mode))
40300 /* ??? SSE vector cost should be used here. */
40301 *total = cost->fadd;
40302 return false;
40304 /* FALLTHRU */
40306 case AND:
40307 case IOR:
40308 case XOR:
40309 if (GET_MODE_CLASS (mode) == MODE_INT
40310 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40312 *total = (cost->add * 2
40313 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40314 << (GET_MODE (XEXP (x, 0)) != DImode))
40315 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40316 << (GET_MODE (XEXP (x, 1)) != DImode)));
40317 return true;
40319 /* FALLTHRU */
40321 case NEG:
40322 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40324 /* ??? SSE cost should be used here. */
40325 *total = cost->fchs;
40326 return false;
40328 else if (X87_FLOAT_MODE_P (mode))
40330 *total = cost->fchs;
40331 return false;
40333 else if (FLOAT_MODE_P (mode))
40335 /* ??? SSE vector cost should be used here. */
40336 *total = cost->fchs;
40337 return false;
40339 /* FALLTHRU */
40341 case NOT:
40342 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40344 /* ??? Should be SSE vector operation cost. */
40345 /* At least for published AMD latencies, this really is the same
40346 as the latency for a simple fpu operation like fabs. */
40347 *total = cost->fabs;
40349 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40350 *total = cost->add * 2;
40351 else
40352 *total = cost->add;
40353 return false;
40355 case COMPARE:
40356 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40357 && XEXP (XEXP (x, 0), 1) == const1_rtx
40358 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40359 && XEXP (x, 1) == const0_rtx)
40361 /* This kind of construct is implemented using test[bwl].
40362 Treat it as if we had an AND. */
40363 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40364 *total = (cost->add
40365 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40366 opno, speed)
40367 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40368 return true;
40371 /* The embedded comparison operand is completely free. */
40372 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40373 && XEXP (x, 1) == const0_rtx)
40374 *total = 0;
40376 return false;
40378 case FLOAT_EXTEND:
40379 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40380 *total = 0;
40381 return false;
40383 case ABS:
40384 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40385 /* ??? SSE cost should be used here. */
40386 *total = cost->fabs;
40387 else if (X87_FLOAT_MODE_P (mode))
40388 *total = cost->fabs;
40389 else if (FLOAT_MODE_P (mode))
40390 /* ??? SSE vector cost should be used here. */
40391 *total = cost->fabs;
40392 return false;
40394 case SQRT:
40395 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40396 /* ??? SSE cost should be used here. */
40397 *total = cost->fsqrt;
40398 else if (X87_FLOAT_MODE_P (mode))
40399 *total = cost->fsqrt;
40400 else if (FLOAT_MODE_P (mode))
40401 /* ??? SSE vector cost should be used here. */
40402 *total = cost->fsqrt;
40403 return false;
40405 case UNSPEC:
40406 if (XINT (x, 1) == UNSPEC_TP)
40407 *total = 0;
40408 return false;
40410 case VEC_SELECT:
40411 case VEC_CONCAT:
40412 case VEC_DUPLICATE:
40413 /* ??? Assume all of these vector manipulation patterns are
40414 recognizable, in which case they all pretty much have the
40415 same cost. */
40416 *total = cost->fabs;
40417 return true;
40418 case VEC_MERGE:
40419 mask = XEXP (x, 2);
40420 /* This is a masked instruction; assume the same cost
40421 as the non-masked variant. */
40422 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40423 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40424 else
40425 *total = cost->fabs;
40426 return true;
40428 default:
40429 return false;
40433 #if TARGET_MACHO
40435 static int current_machopic_label_num;
40437 /* Given a symbol name and its associated stub, write out the
40438 definition of the stub. */
40440 void
40441 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40443 unsigned int length;
40444 char *binder_name, *symbol_name, lazy_ptr_name[32];
40445 int label = ++current_machopic_label_num;
40447 /* For 64-bit we shouldn't get here. */
40448 gcc_assert (!TARGET_64BIT);
40450 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40451 symb = targetm.strip_name_encoding (symb);
40453 length = strlen (stub);
40454 binder_name = XALLOCAVEC (char, length + 32);
40455 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40457 length = strlen (symb);
40458 symbol_name = XALLOCAVEC (char, length + 32);
40459 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40461 sprintf (lazy_ptr_name, "L%d$lz", label);
40463 if (MACHOPIC_ATT_STUB)
40464 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40465 else if (MACHOPIC_PURE)
40466 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40467 else
40468 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40470 fprintf (file, "%s:\n", stub);
40471 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40473 if (MACHOPIC_ATT_STUB)
40475 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40477 else if (MACHOPIC_PURE)
40479 /* PIC stub. */
40480 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40481 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40482 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40483 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40484 label, lazy_ptr_name, label);
40485 fprintf (file, "\tjmp\t*%%ecx\n");
40487 else
40488 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40490 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40491 it needs no stub-binding-helper. */
40492 if (MACHOPIC_ATT_STUB)
40493 return;
40495 fprintf (file, "%s:\n", binder_name);
40497 if (MACHOPIC_PURE)
40499 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40500 fprintf (file, "\tpushl\t%%ecx\n");
40502 else
40503 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40505 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40507 /* N.B. Keep the correspondence of these
40508 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40509 old-pic/new-pic/non-pic stubs; altering this will break
40510 compatibility with existing dylibs. */
40511 if (MACHOPIC_PURE)
40513 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40514 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40516 else
40517 /* 16-byte -mdynamic-no-pic stub. */
40518 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40520 fprintf (file, "%s:\n", lazy_ptr_name);
40521 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40522 fprintf (file, ASM_LONG "%s\n", binder_name);
40524 #endif /* TARGET_MACHO */
40526 /* Order the registers for the register allocator. */
40528 void
40529 x86_order_regs_for_local_alloc (void)
40531 int pos = 0;
40532 int i;
40534 /* First allocate the local general purpose registers. */
40535 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40536 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40537 reg_alloc_order [pos++] = i;
40539 /* Global general purpose registers. */
40540 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40541 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40542 reg_alloc_order [pos++] = i;
40544 /* x87 registers come first in case we are doing FP math
40545 using them. */
40546 if (!TARGET_SSE_MATH)
40547 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40548 reg_alloc_order [pos++] = i;
40550 /* SSE registers. */
40551 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40552 reg_alloc_order [pos++] = i;
40553 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40554 reg_alloc_order [pos++] = i;
40556 /* Extended REX SSE registers. */
40557 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40558 reg_alloc_order [pos++] = i;
40560 /* Mask registers. */
40561 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40562 reg_alloc_order [pos++] = i;
40564 /* MPX bound registers. */
40565 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40566 reg_alloc_order [pos++] = i;
40568 /* x87 registers. */
40569 if (TARGET_SSE_MATH)
40570 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40571 reg_alloc_order [pos++] = i;
40573 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40574 reg_alloc_order [pos++] = i;
40576 /* Initialize the rest of the array, as we do not allocate some registers
40577 at all. */
40578 while (pos < FIRST_PSEUDO_REGISTER)
40579 reg_alloc_order [pos++] = 0;
40582 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40583 in struct attribute_spec.handler. */
40584 static tree
40585 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
40586 tree args,
40587 int,
40588 bool *no_add_attrs)
40590 if (TREE_CODE (*node) != FUNCTION_TYPE
40591 && TREE_CODE (*node) != METHOD_TYPE
40592 && TREE_CODE (*node) != FIELD_DECL
40593 && TREE_CODE (*node) != TYPE_DECL)
40595 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40596 name);
40597 *no_add_attrs = true;
40598 return NULL_TREE;
40600 if (TARGET_64BIT)
40602 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40603 name);
40604 *no_add_attrs = true;
40605 return NULL_TREE;
40607 if (is_attribute_p ("callee_pop_aggregate_return", name))
40609 tree cst;
40611 cst = TREE_VALUE (args);
40612 if (TREE_CODE (cst) != INTEGER_CST)
40614 warning (OPT_Wattributes,
40615 "%qE attribute requires an integer constant argument",
40616 name);
40617 *no_add_attrs = true;
40619 else if (compare_tree_int (cst, 0) != 0
40620 && compare_tree_int (cst, 1) != 0)
40622 warning (OPT_Wattributes,
40623 "argument to %qE attribute is neither zero, nor one",
40624 name);
40625 *no_add_attrs = true;
40628 return NULL_TREE;
40631 return NULL_TREE;
40634 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
40635 struct attribute_spec.handler. */
40636 static tree
40637 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40638 bool *no_add_attrs)
40640 if (TREE_CODE (*node) != FUNCTION_TYPE
40641 && TREE_CODE (*node) != METHOD_TYPE
40642 && TREE_CODE (*node) != FIELD_DECL
40643 && TREE_CODE (*node) != TYPE_DECL)
40645 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40646 name);
40647 *no_add_attrs = true;
40648 return NULL_TREE;
40651 /* Can combine regparm with all attributes but fastcall. */
40652 if (is_attribute_p ("ms_abi", name))
40654 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40656 error ("ms_abi and sysv_abi attributes are not compatible");
40659 return NULL_TREE;
40661 else if (is_attribute_p ("sysv_abi", name))
40663 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40665 error ("ms_abi and sysv_abi attributes are not compatible");
40668 return NULL_TREE;
40671 return NULL_TREE;
40674 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40675 struct attribute_spec.handler. */
40676 static tree
40677 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40678 bool *no_add_attrs)
40680 tree *type = NULL;
40681 if (DECL_P (*node))
40683 if (TREE_CODE (*node) == TYPE_DECL)
40684 type = &TREE_TYPE (*node);
40686 else
40687 type = node;
40689 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40691 warning (OPT_Wattributes, "%qE attribute ignored",
40692 name);
40693 *no_add_attrs = true;
40696 else if ((is_attribute_p ("ms_struct", name)
40697 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40698 || ((is_attribute_p ("gcc_struct", name)
40699 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40701 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40702 name);
40703 *no_add_attrs = true;
40706 return NULL_TREE;
40709 static tree
40710 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40711 bool *no_add_attrs)
40713 if (TREE_CODE (*node) != FUNCTION_DECL)
40715 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40716 name);
40717 *no_add_attrs = true;
40719 return NULL_TREE;
40722 static tree
40723 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40724 int, bool *)
40726 return NULL_TREE;
40729 static tree
40730 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40732 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
40733 but the function type contains the args and return type data. */
40734 tree func_type = *node;
40735 tree return_type = TREE_TYPE (func_type);
40737 int nargs = 0;
40738 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40739 while (current_arg_type
40740 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40742 if (nargs == 0)
40744 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40745 error ("interrupt service routine should have a pointer "
40746 "as the first argument");
40748 else if (nargs == 1)
40750 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40751 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40752 error ("interrupt service routine should have unsigned %s"
40753 "int as the second argument",
40754 TARGET_64BIT
40755 ? (TARGET_X32 ? "long long " : "long ")
40756 : "");
40758 nargs++;
40759 current_arg_type = TREE_CHAIN (current_arg_type);
40761 if (!nargs || nargs > 2)
40762 error ("interrupt service routine can only have a pointer argument "
40763 "and an optional integer argument");
40764 if (! VOID_TYPE_P (return_type))
40765 error ("interrupt service routine can't have non-void return value");
40767 return NULL_TREE;
40770 static bool
40771 ix86_ms_bitfield_layout_p (const_tree record_type)
40773 return ((TARGET_MS_BITFIELD_LAYOUT
40774 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40775 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40778 /* Returns an expression indicating where the this parameter is
40779 located on entry to the FUNCTION. */
40781 static rtx
40782 x86_this_parameter (tree function)
40784 tree type = TREE_TYPE (function);
40785 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40786 int nregs;
40788 if (TARGET_64BIT)
40790 const int *parm_regs;
40792 if (ix86_function_type_abi (type) == MS_ABI)
40793 parm_regs = x86_64_ms_abi_int_parameter_registers;
40794 else
40795 parm_regs = x86_64_int_parameter_registers;
40796 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40799 nregs = ix86_function_regparm (type, function);
40801 if (nregs > 0 && !stdarg_p (type))
40803 int regno;
40804 unsigned int ccvt = ix86_get_callcvt (type);
40806 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40807 regno = aggr ? DX_REG : CX_REG;
40808 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40810 regno = CX_REG;
40811 if (aggr)
40812 return gen_rtx_MEM (SImode,
40813 plus_constant (Pmode, stack_pointer_rtx, 4));
40815 else
40817 regno = AX_REG;
40818 if (aggr)
40820 regno = DX_REG;
40821 if (nregs == 1)
40822 return gen_rtx_MEM (SImode,
40823 plus_constant (Pmode,
40824 stack_pointer_rtx, 4));
40827 return gen_rtx_REG (SImode, regno);
40830 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40831 aggr ? 8 : 4));
40834 /* Determine whether x86_output_mi_thunk can succeed. */
40836 static bool
40837 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40838 const_tree function)
40840 /* 64-bit can handle anything. */
40841 if (TARGET_64BIT)
40842 return true;
40844 /* For 32-bit, everything's fine if we have one free register. */
40845 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40846 return true;
40848 /* Need a free register for vcall_offset. */
40849 if (vcall_offset)
40850 return false;
40852 /* Need a free register for GOT references. */
40853 if (flag_pic && !targetm.binds_local_p (function))
40854 return false;
40856 /* Otherwise ok. */
40857 return true;
40860 /* Output the assembler code for a thunk function. THUNK_DECL is the
40861 declaration for the thunk function itself, FUNCTION is the decl for
40862 the target function. DELTA is an immediate constant offset to be
40863 added to THIS. If VCALL_OFFSET is nonzero, the word at
40864 *(*this + vcall_offset) should be added to THIS. */
40866 static void
40867 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40868 HOST_WIDE_INT vcall_offset, tree function)
40870 rtx this_param = x86_this_parameter (function);
40871 rtx this_reg, tmp, fnaddr;
40872 unsigned int tmp_regno;
40873 rtx_insn *insn;
40875 if (TARGET_64BIT)
40876 tmp_regno = R10_REG;
40877 else
40879 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40880 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40881 tmp_regno = AX_REG;
40882 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40883 tmp_regno = DX_REG;
40884 else
40885 tmp_regno = CX_REG;
40888 emit_note (NOTE_INSN_PROLOGUE_END);
40890 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40891 pull it in now and let DELTA benefit. */
40892 if (REG_P (this_param))
40893 this_reg = this_param;
40894 else if (vcall_offset)
40896 /* Put the this parameter into %eax. */
40897 this_reg = gen_rtx_REG (Pmode, AX_REG);
40898 emit_move_insn (this_reg, this_param);
40900 else
40901 this_reg = NULL_RTX;
40903 /* Adjust the this parameter by a fixed constant. */
40904 if (delta)
40906 rtx delta_rtx = GEN_INT (delta);
40907 rtx delta_dst = this_reg ? this_reg : this_param;
40909 if (TARGET_64BIT)
40911 if (!x86_64_general_operand (delta_rtx, Pmode))
40913 tmp = gen_rtx_REG (Pmode, tmp_regno);
40914 emit_move_insn (tmp, delta_rtx);
40915 delta_rtx = tmp;
40919 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40922 /* Adjust the this parameter by a value stored in the vtable. */
40923 if (vcall_offset)
40925 rtx vcall_addr, vcall_mem, this_mem;
40927 tmp = gen_rtx_REG (Pmode, tmp_regno);
40929 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40930 if (Pmode != ptr_mode)
40931 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40932 emit_move_insn (tmp, this_mem);
40934 /* Adjust the this parameter. */
40935 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40936 if (TARGET_64BIT
40937 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40939 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40940 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40941 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40944 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40945 if (Pmode != ptr_mode)
40946 emit_insn (gen_addsi_1_zext (this_reg,
40947 gen_rtx_REG (ptr_mode,
40948 REGNO (this_reg)),
40949 vcall_mem));
40950 else
40951 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40954 /* If necessary, drop THIS back to its stack slot. */
40955 if (this_reg && this_reg != this_param)
40956 emit_move_insn (this_param, this_reg);
40958 fnaddr = XEXP (DECL_RTL (function), 0);
40959 if (TARGET_64BIT)
40961 if (!flag_pic || targetm.binds_local_p (function)
40962 || TARGET_PECOFF)
40964 else
40966 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40967 tmp = gen_rtx_CONST (Pmode, tmp);
40968 fnaddr = gen_const_mem (Pmode, tmp);
40971 else
40973 if (!flag_pic || targetm.binds_local_p (function))
40975 #if TARGET_MACHO
40976 else if (TARGET_MACHO)
40978 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40979 fnaddr = XEXP (fnaddr, 0);
40981 #endif /* TARGET_MACHO */
40982 else
40984 tmp = gen_rtx_REG (Pmode, CX_REG);
40985 output_set_got (tmp, NULL_RTX);
40987 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40988 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40989 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40990 fnaddr = gen_const_mem (Pmode, fnaddr);
40994 /* Our sibling call patterns do not allow memories, because we have no
40995 predicate that can distinguish between frame and non-frame memory.
40996 For our purposes here, we can get away with (ab)using a jump pattern,
40997 because we're going to do no optimization. */
40998 if (MEM_P (fnaddr))
41000 if (sibcall_insn_operand (fnaddr, word_mode))
41002 fnaddr = XEXP (DECL_RTL (function), 0);
41003 tmp = gen_rtx_MEM (QImode, fnaddr);
41004 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41005 tmp = emit_call_insn (tmp);
41006 SIBLING_CALL_P (tmp) = 1;
41008 else
41009 emit_jump_insn (gen_indirect_jump (fnaddr));
41011 else
41013 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41015 // CM_LARGE_PIC always uses a pseudo PIC register, which is
41016 // uninitialized here. Since FUNCTION is local and calling it
41017 // doesn't go through the PLT, we use the scratch register %r11 as
41018 // the PIC register and initialize it here.
41019 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41020 ix86_init_large_pic_reg (tmp_regno);
41021 fnaddr = legitimize_pic_address (fnaddr,
41022 gen_rtx_REG (Pmode, tmp_regno));
41025 if (!sibcall_insn_operand (fnaddr, word_mode))
41027 tmp = gen_rtx_REG (word_mode, tmp_regno);
41028 if (GET_MODE (fnaddr) != word_mode)
41029 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41030 emit_move_insn (tmp, fnaddr);
41031 fnaddr = tmp;
41034 tmp = gen_rtx_MEM (QImode, fnaddr);
41035 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41036 tmp = emit_call_insn (tmp);
41037 SIBLING_CALL_P (tmp) = 1;
41039 emit_barrier ();
41041 /* Emit just enough of rest_of_compilation to get the insns emitted.
41042 Note that use_thunk calls assemble_start_function et al. */
41043 insn = get_insns ();
41044 shorten_branches (insn);
41045 final_start_function (insn, file, 1);
41046 final (insn, file, 1);
41047 final_end_function ();
41050 static void
41051 x86_file_start (void)
41053 default_file_start ();
41054 if (TARGET_16BIT)
41055 fputs ("\t.code16gcc\n", asm_out_file);
41056 #if TARGET_MACHO
41057 darwin_file_start ();
41058 #endif
41059 if (X86_FILE_START_VERSION_DIRECTIVE)
41060 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41061 if (X86_FILE_START_FLTUSED)
41062 fputs ("\t.global\t__fltused\n", asm_out_file);
41063 if (ix86_asm_dialect == ASM_INTEL)
41064 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41068 x86_field_alignment (tree field, int computed)
41070 machine_mode mode;
41071 tree type = TREE_TYPE (field);
41073 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41074 return computed;
41075 if (TARGET_IAMCU)
41076 return iamcu_alignment (type, computed);
41077 mode = TYPE_MODE (strip_array_types (type));
41078 if (mode == DFmode || mode == DCmode
41079 || GET_MODE_CLASS (mode) == MODE_INT
41080 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41081 return MIN (32, computed);
41082 return computed;
41085 /* Print call to TARGET to FILE. */
41087 static void
41088 x86_print_call_or_nop (FILE *file, const char *target)
41090 if (flag_nop_mcount)
41091 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
41092 else
41093 fprintf (file, "1:\tcall\t%s\n", target);
41096 /* Output assembler code to FILE to increment profiler label # LABELNO
41097 for profiling a function entry. */
41098 void
41099 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41101 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41102 : MCOUNT_NAME);
41103 if (TARGET_64BIT)
41105 #ifndef NO_PROFILE_COUNTERS
41106 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41107 #endif
41109 if (!TARGET_PECOFF && flag_pic)
41110 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41111 else
41112 x86_print_call_or_nop (file, mcount_name);
41114 else if (flag_pic)
41116 #ifndef NO_PROFILE_COUNTERS
41117 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41118 LPREFIX, labelno);
41119 #endif
41120 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41122 else
41124 #ifndef NO_PROFILE_COUNTERS
41125 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41126 LPREFIX, labelno);
41127 #endif
41128 x86_print_call_or_nop (file, mcount_name);
41131 if (flag_record_mcount)
41133 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41134 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41135 fprintf (file, "\t.previous\n");
41139 /* We don't have exact information about the insn sizes, but we may assume
41140 quite safely that we are informed about all 1-byte insns and memory
41141 address sizes. This is enough to eliminate unnecessary padding in
41142 99% of cases. */
41144 static int
41145 min_insn_size (rtx_insn *insn)
41147 int l = 0, len;
41149 if (!INSN_P (insn) || !active_insn_p (insn))
41150 return 0;
41152 /* Discard alignments we've emitted, and jump instructions. */
41153 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41154 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41155 return 0;
41157 /* Important case: calls are always 5 bytes.
41158 It is common to have many calls in a row. */
41159 if (CALL_P (insn)
41160 && symbolic_reference_mentioned_p (PATTERN (insn))
41161 && !SIBLING_CALL_P (insn))
41162 return 5;
41163 len = get_attr_length (insn);
41164 if (len <= 1)
41165 return 1;
41167 /* For normal instructions we rely on get_attr_length being exact,
41168 with a few exceptions. */
41169 if (!JUMP_P (insn))
41171 enum attr_type type = get_attr_type (insn);
41173 switch (type)
41175 case TYPE_MULTI:
41176 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
41177 || asm_noperands (PATTERN (insn)) >= 0)
41178 return 0;
41179 break;
41180 case TYPE_OTHER:
41181 case TYPE_FCMP:
41182 break;
41183 default:
41184 /* Otherwise trust get_attr_length. */
41185 return len;
41188 l = get_attr_length_address (insn);
41189 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
41190 l = 4;
41192 if (l)
41193 return 1+l;
41194 else
41195 return 2;
41198 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41200 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
41201 16-byte window. */
41203 static void
41204 ix86_avoid_jump_mispredicts (void)
41206 rtx_insn *insn, *start = get_insns ();
41207 int nbytes = 0, njumps = 0;
41208 bool isjump = false;
41210 /* Look for all minimal intervals of instructions containing 4 jumps.
41211 The intervals are bounded by START and INSN. NBYTES is the total
41212 size of instructions in the interval including INSN and not including
41213 START. When NBYTES is smaller than 16 bytes, it is possible
41214 that the ends of START and INSN land in the same 16-byte page.
41216 The smallest offset in the page at which INSN can start is the case where
41217 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
41218 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
41220 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
41221 have to, control transfer to its label(s) can be performed through other
41222 means, and we also estimate the minimum length of all asm stmts as 0. */
41223 for (insn = start; insn; insn = NEXT_INSN (insn))
41225 int min_size;
41227 if (LABEL_P (insn))
41229 int align = label_to_alignment (insn);
41230 int max_skip = label_to_max_skip (insn);
41232 if (max_skip > 15)
41233 max_skip = 15;
41234 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41235 already in the current 16-byte page, because otherwise
41236 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41237 bytes to reach a 16-byte boundary. */
41238 if (align <= 0
41239 || (align <= 3 && max_skip != (1 << align) - 1))
41240 max_skip = 0;
41241 if (dump_file)
41242 fprintf (dump_file, "Label %i with max_skip %i\n",
41243 INSN_UID (insn), max_skip);
41244 if (max_skip)
41246 while (nbytes + max_skip >= 16)
41248 start = NEXT_INSN (start);
41249 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41250 || CALL_P (start))
41251 njumps--, isjump = true;
41252 else
41253 isjump = false;
41254 nbytes -= min_insn_size (start);
41257 continue;
41260 min_size = min_insn_size (insn);
41261 nbytes += min_size;
41262 if (dump_file)
41263 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41264 INSN_UID (insn), min_size);
41265 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41266 || CALL_P (insn))
41267 njumps++;
41268 else
41269 continue;
41271 while (njumps > 3)
41273 start = NEXT_INSN (start);
41274 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41275 || CALL_P (start))
41276 njumps--, isjump = true;
41277 else
41278 isjump = false;
41279 nbytes -= min_insn_size (start);
41281 gcc_assert (njumps >= 0);
41282 if (dump_file)
41283 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41284 INSN_UID (start), INSN_UID (insn), nbytes);
41286 if (njumps == 3 && isjump && nbytes < 16)
41288 int padsize = 15 - nbytes + min_insn_size (insn);
41290 if (dump_file)
41291 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41292 INSN_UID (insn), padsize);
41293 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41297 #endif
41299 /* AMD Athlon works faster
41300 when RET is not the destination of a conditional jump and is not directly
41301 preceded by another jump instruction. We avoid the penalty by inserting a
41302 NOP just before the RET instructions in such cases. */
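/* (In the code below the "NOP" is effectively realized by replacing the
   plain RET with the longer rep-prefixed return emitted by
   simple_return_internal_long, which serves the same purpose.)  */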
41303 static void
41304 ix86_pad_returns (void)
41306 edge e;
41307 edge_iterator ei;
41309 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41311 basic_block bb = e->src;
41312 rtx_insn *ret = BB_END (bb);
41313 rtx_insn *prev;
41314 bool replace = false;
41316 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41317 || optimize_bb_for_size_p (bb))
41318 continue;
41319 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41320 if (active_insn_p (prev) || LABEL_P (prev))
41321 break;
41322 if (prev && LABEL_P (prev))
41324 edge e;
41325 edge_iterator ei;
41327 FOR_EACH_EDGE (e, ei, bb->preds)
41328 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41329 && !(e->flags & EDGE_FALLTHRU))
41331 replace = true;
41332 break;
41335 if (!replace)
41337 prev = prev_active_insn (ret);
41338 if (prev
41339 && ((JUMP_P (prev) && any_condjump_p (prev))
41340 || CALL_P (prev)))
41341 replace = true;
41342 /* Empty functions get a branch mispredict even when
41343 the jump destination is not visible to us. */
41344 if (!prev && !optimize_function_for_size_p (cfun))
41345 replace = true;
41347 if (replace)
41349 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41350 delete_insn (ret);
41355 /* Count the minimum number of instructions in BB. Return 4 if the
41356 number of instructions >= 4. */
41358 static int
41359 ix86_count_insn_bb (basic_block bb)
41361 rtx_insn *insn;
41362 int insn_count = 0;
41364 /* Count number of instructions in this block. Return 4 if the number
41365 of instructions >= 4. */
41366 FOR_BB_INSNS (bb, insn)
41369 /* This only happens in exit blocks. */
41369 if (JUMP_P (insn)
41370 && ANY_RETURN_P (PATTERN (insn)))
41371 break;
41373 if (NONDEBUG_INSN_P (insn)
41374 && GET_CODE (PATTERN (insn)) != USE
41375 && GET_CODE (PATTERN (insn)) != CLOBBER)
41377 insn_count++;
41378 if (insn_count >= 4)
41379 return insn_count;
41383 return insn_count;
41387 /* Count the minimum number of instructions in a code path through BB.
41388 Return 4 if the number of instructions >= 4. */
41390 static int
41391 ix86_count_insn (basic_block bb)
41393 edge e;
41394 edge_iterator ei;
41395 int min_prev_count;
41397 /* Only bother counting instructions along paths with no
41398 more than 2 basic blocks between entry and exit. Given
41399 that BB has an edge to exit, determine if a predecessor
41400 of BB has an edge from entry. If so, compute the number
41401 of instructions in the predecessor block. If there
41402 happen to be multiple such blocks, compute the minimum. */
41403 min_prev_count = 4;
41404 FOR_EACH_EDGE (e, ei, bb->preds)
41406 edge prev_e;
41407 edge_iterator prev_ei;
41409 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41411 min_prev_count = 0;
41412 break;
41414 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41416 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41418 int count = ix86_count_insn_bb (e->src);
41419 if (count < min_prev_count)
41420 min_prev_count = count;
41421 break;
41426 if (min_prev_count < 4)
41427 min_prev_count += ix86_count_insn_bb (bb);
41429 return min_prev_count;
41432 /* Pad short function to 4 instructions. */
41434 static void
41435 ix86_pad_short_function (void)
41437 edge e;
41438 edge_iterator ei;
41440 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41442 rtx_insn *ret = BB_END (e->src);
41443 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41445 int insn_count = ix86_count_insn (e->src);
41447 /* Pad short function. */
41448 if (insn_count < 4)
41450 rtx_insn *insn = ret;
41452 /* Find epilogue. */
41453 while (insn
41454 && (!NOTE_P (insn)
41455 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41456 insn = PREV_INSN (insn);
41458 if (!insn)
41459 insn = ret;
41461 /* Two NOPs count as one instruction. */
41462 insn_count = 2 * (4 - insn_count);
41463 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41469 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41470 the epilogue, the Windows system unwinder will apply epilogue logic and
41471 produce incorrect offsets. This can be avoided by adding a nop between
41472 the last insn that can throw and the first insn of the epilogue. */
41474 static void
41475 ix86_seh_fixup_eh_fallthru (void)
41477 edge e;
41478 edge_iterator ei;
41480 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41482 rtx_insn *insn, *next;
41484 /* Find the beginning of the epilogue. */
41485 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41486 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41487 break;
41488 if (insn == NULL)
41489 continue;
41491 /* We only care about preceding insns that can throw. */
41492 insn = prev_active_insn (insn);
41493 if (insn == NULL || !can_throw_internal (insn))
41494 continue;
41496 /* Do not separate calls from their debug information. */
41497 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41498 if (NOTE_P (next)
41499 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41500 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41501 insn = next;
41502 else
41503 break;
41505 emit_insn_after (gen_nops (const1_rtx), insn);
41509 /* Given a register number BASE, the lowest of a group of registers, update
41510 regsets IN and OUT with the registers that should be avoided in input
41511 and output operands respectively when trying to avoid generating a modr/m
41512 byte for -fmitigate-rop. */
41514 static void
41515 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41517 SET_HARD_REG_BIT (out, base);
41518 SET_HARD_REG_BIT (out, base + 1);
41519 SET_HARD_REG_BIT (in, base + 2);
41520 SET_HARD_REG_BIT (in, base + 3);
41523 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
41524 that certain encodings of modr/m bytes do not occur. */
41525 static void
41526 ix86_mitigate_rop (void)
41528 HARD_REG_SET input_risky;
41529 HARD_REG_SET output_risky;
41530 HARD_REG_SET inout_risky;
41532 CLEAR_HARD_REG_SET (output_risky);
41533 CLEAR_HARD_REG_SET (input_risky);
41534 SET_HARD_REG_BIT (output_risky, AX_REG);
41535 SET_HARD_REG_BIT (output_risky, CX_REG);
41536 SET_HARD_REG_BIT (input_risky, BX_REG);
41537 SET_HARD_REG_BIT (input_risky, DX_REG);
41538 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41539 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41540 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41541 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41542 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41543 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41544 COPY_HARD_REG_SET (inout_risky, input_risky);
41545 IOR_HARD_REG_SET (inout_risky, output_risky);
41547 df_note_add_problem ();
41548 /* Fix up what stack-regs did. */
41549 df_insn_rescan_all ();
41550 df_analyze ();
41552 regrename_init (true);
41553 regrename_analyze (NULL);
41555 auto_vec<du_head_p> cands;
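/* First pass: for each insn whose modr/m byte is one we want to avoid,
   record the regrename chains of the two operands forming that byte as
   candidates to be renamed away from the risky registers below. */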
41557 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41559 if (!NONDEBUG_INSN_P (insn))
41560 continue;
41562 if (GET_CODE (PATTERN (insn)) == USE
41563 || GET_CODE (PATTERN (insn)) == CLOBBER)
41564 continue;
41566 extract_insn (insn);
41568 int opno0, opno1;
41569 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41570 recog_data.n_operands, &opno0,
41571 &opno1);
41573 if (!ix86_rop_should_change_byte_p (modrm))
41574 continue;
41576 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41578 /* This happens when regrename has to fail a block. */
41579 if (!info->op_info)
41580 continue;
41582 if (info->op_info[opno0].n_chains != 0)
41584 gcc_assert (info->op_info[opno0].n_chains == 1);
41585 du_head_p op0c;
41586 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41587 if (op0c->target_data_1 + op0c->target_data_2 == 0
41588 && !op0c->cannot_rename)
41589 cands.safe_push (op0c);
41591 op0c->target_data_1++;
41593 if (info->op_info[opno1].n_chains != 0)
41595 gcc_assert (info->op_info[opno1].n_chains == 1);
41596 du_head_p op1c;
41597 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41598 if (op1c->target_data_1 + op1c->target_data_2 == 0
41599 && !op1c->cannot_rename)
41600 cands.safe_push (op1c);
41602 op1c->target_data_2++;
41606 int i;
41607 du_head_p head;
41608 FOR_EACH_VEC_ELT (cands, i, head)
41610 int old_reg, best_reg;
41611 HARD_REG_SET unavailable;
41613 CLEAR_HARD_REG_SET (unavailable);
41614 if (head->target_data_1)
41615 IOR_HARD_REG_SET (unavailable, output_risky);
41616 if (head->target_data_2)
41617 IOR_HARD_REG_SET (unavailable, input_risky);
41619 int n_uses;
41620 reg_class superclass = regrename_find_superclass (head, &n_uses,
41621 &unavailable);
41622 old_reg = head->regno;
41623 best_reg = find_rename_reg (head, superclass, &unavailable,
41624 old_reg, false);
41625 bool ok = regrename_do_replace (head, best_reg);
41626 gcc_assert (ok);
41627 if (dump_file)
41628 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41629 reg_names[best_reg], reg_class_names[superclass]);
41633 regrename_finish ();
41635 df_analyze ();
41637 basic_block bb;
41638 regset_head live;
41640 INIT_REG_SET (&live);
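/* Second pass: renaming may not have removed every risky encoding.  Where a
   bad modr/m byte would still be emitted, copy the offending operand into a
   safe hard register (one not live at that point) just before the insn. */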
41642 FOR_EACH_BB_FN (bb, cfun)
41644 rtx_insn *insn;
41646 COPY_REG_SET (&live, DF_LR_OUT (bb));
41647 df_simulate_initialize_backwards (bb, &live);
41649 FOR_BB_INSNS_REVERSE (bb, insn)
41651 if (!NONDEBUG_INSN_P (insn))
41652 continue;
41654 df_simulate_one_insn_backwards (bb, insn, &live);
41656 if (GET_CODE (PATTERN (insn)) == USE
41657 || GET_CODE (PATTERN (insn)) == CLOBBER)
41658 continue;
41660 extract_insn (insn);
41661 constrain_operands_cached (insn, reload_completed);
41662 int opno0, opno1;
41663 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41664 recog_data.n_operands, &opno0,
41665 &opno1);
41666 if (modrm < 0
41667 || !ix86_rop_should_change_byte_p (modrm)
41668 || opno0 == opno1)
41669 continue;
41671 rtx oldreg = recog_data.operand[opno1];
41672 preprocess_constraints (insn);
41673 const operand_alternative *alt = which_op_alt ();
41675 int i;
41676 for (i = 0; i < recog_data.n_operands; i++)
41677 if (i != opno1
41678 && alt[i].earlyclobber
41679 && reg_overlap_mentioned_p (recog_data.operand[i],
41680 oldreg))
41681 break;
41683 if (i < recog_data.n_operands)
41684 continue;
41686 if (dump_file)
41687 fprintf (dump_file,
41688 "attempting to fix modrm byte in insn %d:"
41689 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41690 reg_class_names[alt[opno1].cl]);
41692 HARD_REG_SET unavailable;
41693 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41694 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41695 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41696 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41697 IOR_HARD_REG_SET (unavailable, output_risky);
41698 IOR_COMPL_HARD_REG_SET (unavailable,
41699 reg_class_contents[alt[opno1].cl]);
41701 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41702 if (!TEST_HARD_REG_BIT (unavailable, i))
41703 break;
41704 if (i == FIRST_PSEUDO_REGISTER)
41706 if (dump_file)
41707 fprintf (dump_file, ", none available\n");
41708 continue;
41710 if (dump_file)
41711 fprintf (dump_file, " -> %d\n", i);
41712 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41713 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41714 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41719 /* Implement machine specific optimizations. We implement padding of returns
41720 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
41721 static void
41722 ix86_reorg (void)
41724 /* We are freeing block_for_insn in the toplev to keep compatibility
41725 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41726 compute_bb_for_insn ();
41728 if (flag_mitigate_rop)
41729 ix86_mitigate_rop ();
41731 if (TARGET_SEH && current_function_has_exception_handlers ())
41732 ix86_seh_fixup_eh_fallthru ();
41734 if (optimize && optimize_function_for_speed_p (cfun))
41736 if (TARGET_PAD_SHORT_FUNCTION)
41737 ix86_pad_short_function ();
41738 else if (TARGET_PAD_RETURNS)
41739 ix86_pad_returns ();
41740 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41741 if (TARGET_FOUR_JUMP_LIMIT)
41742 ix86_avoid_jump_mispredicts ();
41743 #endif
41747 /* Return nonzero when a QImode register that must be represented via a REX
41748 prefix is used. */
41749 bool
41750 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41752 int i;
41753 extract_insn_cached (insn);
41754 for (i = 0; i < recog_data.n_operands; i++)
41755 if (GENERAL_REG_P (recog_data.operand[i])
41756 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41757 return true;
41758 return false;
41761 /* Return true when INSN mentions a register that must be encoded using a REX
41762 prefix. */
41763 bool
41764 x86_extended_reg_mentioned_p (rtx insn)
41766 subrtx_iterator::array_type array;
41767 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41769 const_rtx x = *iter;
41770 if (REG_P (x)
41771 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41772 return true;
41774 return false;
41777 /* If profitable, negate (without causing overflow) integer constant
41778 of mode MODE at location LOC. Return true in this case. */
41779 bool
41780 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41782 HOST_WIDE_INT val;
41784 if (!CONST_INT_P (*loc))
41785 return false;
41787 switch (mode)
41789 case DImode:
41790 /* DImode x86_64 constants must fit in 32 bits. */
41791 gcc_assert (x86_64_immediate_operand (*loc, mode));
41793 mode = SImode;
41794 break;
41796 case SImode:
41797 case HImode:
41798 case QImode:
41799 break;
41801 default:
41802 gcc_unreachable ();
41805 /* Avoid overflows. */
41806 if (mode_signbit_p (mode, *loc))
41807 return false;
41809 val = INTVAL (*loc);
41811 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41812 Exception: -128 encodes in fewer bytes than 128, so keep -128 and instead negate 128. */
41813 if ((val < 0 && val != -128)
41814 || val == 128)
41816 *loc = GEN_INT (-val);
41817 return true;
41820 return false;
41823 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41824 optabs would emit if we didn't have TFmode patterns. */
41826 void
41827 x86_emit_floatuns (rtx operands[2])
41829 rtx_code_label *neglab, *donelab;
41830 rtx i0, i1, f0, in, out;
41831 machine_mode mode, inmode;
41833 inmode = GET_MODE (operands[1]);
41834 gcc_assert (inmode == SImode || inmode == DImode);
41836 out = operands[0];
41837 in = force_reg (inmode, operands[1]);
41838 mode = GET_MODE (out);
41839 neglab = gen_label_rtx ();
41840 donelab = gen_label_rtx ();
41841 f0 = gen_reg_rtx (mode);
41843 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41845 expand_float (out, in, 0);
41847 emit_jump_insn (gen_jump (donelab));
41848 emit_barrier ();
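/* The input has its sign bit set, so a signed conversion would be wrong.
   Halve it with the dropped low bit ORed back in (round to odd), convert,
   then double the result; roughly:
     i0 = (in >> 1) | (in & 1);  f0 = (FP) i0;  out = f0 + f0;  */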
41850 emit_label (neglab);
41852 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41853 1, OPTAB_DIRECT);
41854 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41855 1, OPTAB_DIRECT);
41856 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41858 expand_float (f0, i0, 0);
41860 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41862 emit_label (donelab);
41865 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41866 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41867 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41868 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41870 /* Get a vector mode of the same size as the original but with elements
41871 twice as wide. This is only guaranteed to apply to integral vectors. */
41873 static inline machine_mode
41874 get_mode_wider_vector (machine_mode o)
41876 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41877 machine_mode n = GET_MODE_WIDER_MODE (o);
41878 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41879 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41880 return n;
41883 /* A subroutine of ix86_expand_vector_init_duplicate. Try to
41884 fill TARGET with VAL via VEC_DUPLICATE. */
41886 static bool
41887 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41889 bool ok;
41890 rtx_insn *insn;
41891 rtx dup;
41893 /* First attempt to recognize VAL as-is. */
41894 dup = gen_rtx_VEC_DUPLICATE (mode, val);
41895 insn = emit_insn (gen_rtx_SET (target, dup));
41896 if (recog_memoized (insn) < 0)
41898 rtx_insn *seq;
41899 /* If that fails, force VAL into a register. */
41901 start_sequence ();
41902 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
41903 seq = get_insns ();
41904 end_sequence ();
41905 if (seq)
41906 emit_insn_before (seq, insn);
41908 ok = recog_memoized (insn) >= 0;
41909 gcc_assert (ok);
41911 return true;
41914 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41915 with all elements equal to VAR. Return true if successful. */
41917 static bool
41918 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41919 rtx target, rtx val)
41921 bool ok;
41923 switch (mode)
41925 case V2SImode:
41926 case V2SFmode:
41927 if (!mmx_ok)
41928 return false;
41929 /* FALLTHRU */
41931 case V4DFmode:
41932 case V4DImode:
41933 case V8SFmode:
41934 case V8SImode:
41935 case V2DFmode:
41936 case V2DImode:
41937 case V4SFmode:
41938 case V4SImode:
41939 case V16SImode:
41940 case V8DImode:
41941 case V16SFmode:
41942 case V8DFmode:
41943 return ix86_vector_duplicate_value (mode, target, val);
41945 case V4HImode:
41946 if (!mmx_ok)
41947 return false;
41948 if (TARGET_SSE || TARGET_3DNOW_A)
41950 rtx x;
41952 val = gen_lowpart (SImode, val);
41953 x = gen_rtx_TRUNCATE (HImode, val);
41954 x = gen_rtx_VEC_DUPLICATE (mode, x);
41955 emit_insn (gen_rtx_SET (target, x));
41956 return true;
41958 goto widen;
41960 case V8QImode:
41961 if (!mmx_ok)
41962 return false;
41963 goto widen;
41965 case V8HImode:
41966 if (TARGET_AVX2)
41967 return ix86_vector_duplicate_value (mode, target, val);
41969 if (TARGET_SSE2)
41971 struct expand_vec_perm_d dperm;
41972 rtx tmp1, tmp2;
41974 permute:
41975 memset (&dperm, 0, sizeof (dperm));
41976 dperm.target = target;
41977 dperm.vmode = mode;
41978 dperm.nelt = GET_MODE_NUNITS (mode);
41979 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41980 dperm.one_operand_p = true;
41982 /* Extend to SImode using a paradoxical SUBREG. */
41983 tmp1 = gen_reg_rtx (SImode);
41984 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41986 /* Insert the SImode value as low element of a V4SImode vector. */
41987 tmp2 = gen_reg_rtx (V4SImode);
41988 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41989 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41991 ok = (expand_vec_perm_1 (&dperm)
41992 || expand_vec_perm_broadcast_1 (&dperm));
41993 gcc_assert (ok);
41994 return ok;
41996 goto widen;
41998 case V16QImode:
41999 if (TARGET_AVX2)
42000 return ix86_vector_duplicate_value (mode, target, val);
42002 if (TARGET_SSE2)
42003 goto permute;
42004 goto widen;
42006 widen:
42007 /* Replicate the value once into the next wider mode and recurse. */
42009 machine_mode smode, wsmode, wvmode;
42010 rtx x;
42012 smode = GET_MODE_INNER (mode);
42013 wvmode = get_mode_wider_vector (mode);
42014 wsmode = GET_MODE_INNER (wvmode);
42016 val = convert_modes (wsmode, smode, val, true);
42017 x = expand_simple_binop (wsmode, ASHIFT, val,
42018 GEN_INT (GET_MODE_BITSIZE (smode)),
42019 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42020 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42022 x = gen_reg_rtx (wvmode);
42023 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42024 gcc_assert (ok);
42025 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42026 return ok;
42029 case V16HImode:
42030 case V32QImode:
42031 if (TARGET_AVX2)
42032 return ix86_vector_duplicate_value (mode, target, val);
42033 else
42035 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42036 rtx x = gen_reg_rtx (hvmode);
42038 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42039 gcc_assert (ok);
42041 x = gen_rtx_VEC_CONCAT (mode, x, x);
42042 emit_insn (gen_rtx_SET (target, x));
42044 return true;
42046 case V64QImode:
42047 case V32HImode:
42048 if (TARGET_AVX512BW)
42049 return ix86_vector_duplicate_value (mode, target, val);
42050 else
42052 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42053 rtx x = gen_reg_rtx (hvmode);
42055 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42056 gcc_assert (ok);
42058 x = gen_rtx_VEC_CONCAT (mode, x, x);
42059 emit_insn (gen_rtx_SET (target, x));
42061 return true;
42063 default:
42064 return false;
42068 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42069 whose ONE_VAR element is VAR, and other elements are zero. Return true
42070 if successful. */
42072 static bool
42073 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42074 rtx target, rtx var, int one_var)
42076 machine_mode vsimode;
42077 rtx new_target;
42078 rtx x, tmp;
42079 bool use_vector_set = false;
42081 switch (mode)
42083 case V2DImode:
42084 /* For SSE4.1, we normally use vector set. But if the second
42085 element is zero and inter-unit moves are OK, we use movq
42086 instead. */
42087 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42088 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42089 && one_var == 0));
42090 break;
42091 case V16QImode:
42092 case V4SImode:
42093 case V4SFmode:
42094 use_vector_set = TARGET_SSE4_1;
42095 break;
42096 case V8HImode:
42097 use_vector_set = TARGET_SSE2;
42098 break;
42099 case V4HImode:
42100 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42101 break;
42102 case V32QImode:
42103 case V16HImode:
42104 case V8SImode:
42105 case V8SFmode:
42106 case V4DFmode:
42107 use_vector_set = TARGET_AVX;
42108 break;
42109 case V4DImode:
42110 /* Use ix86_expand_vector_set in 64bit mode only. */
42111 use_vector_set = TARGET_AVX && TARGET_64BIT;
42112 break;
42113 default:
42114 break;
42117 if (use_vector_set)
42119 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42120 var = force_reg (GET_MODE_INNER (mode), var);
42121 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42122 return true;
42125 switch (mode)
42127 case V2SFmode:
42128 case V2SImode:
42129 if (!mmx_ok)
42130 return false;
42131 /* FALLTHRU */
42133 case V2DFmode:
42134 case V2DImode:
42135 if (one_var != 0)
42136 return false;
42137 var = force_reg (GET_MODE_INNER (mode), var);
42138 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
42139 emit_insn (gen_rtx_SET (target, x));
42140 return true;
42142 case V4SFmode:
42143 case V4SImode:
42144 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
42145 new_target = gen_reg_rtx (mode);
42146 else
42147 new_target = target;
42148 var = force_reg (GET_MODE_INNER (mode), var);
42149 x = gen_rtx_VEC_DUPLICATE (mode, var);
42150 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
42151 emit_insn (gen_rtx_SET (new_target, x));
42152 if (one_var != 0)
42154 /* We need to shuffle the value to the correct position, so
42155 create a new pseudo to store the intermediate result. */
42157 /* With SSE2, we can use the integer shuffle insns. */
42158 if (mode != V4SFmode && TARGET_SSE2)
42160 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
42161 const1_rtx,
42162 GEN_INT (one_var == 1 ? 0 : 1),
42163 GEN_INT (one_var == 2 ? 0 : 1),
42164 GEN_INT (one_var == 3 ? 0 : 1)));
42165 if (target != new_target)
42166 emit_move_insn (target, new_target);
42167 return true;
42170 /* Otherwise convert the intermediate result to V4SFmode and
42171 use the SSE1 shuffle instructions. */
42172 if (mode != V4SFmode)
42174 tmp = gen_reg_rtx (V4SFmode);
42175 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
42177 else
42178 tmp = new_target;
42180 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
42181 const1_rtx,
42182 GEN_INT (one_var == 1 ? 0 : 1),
42183 GEN_INT (one_var == 2 ? 0+4 : 1+4),
42184 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
42186 if (mode != V4SFmode)
42187 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
42188 else if (tmp != target)
42189 emit_move_insn (target, tmp);
42191 else if (target != new_target)
42192 emit_move_insn (target, new_target);
42193 return true;
42195 case V8HImode:
42196 case V16QImode:
42197 vsimode = V4SImode;
42198 goto widen;
42199 case V4HImode:
42200 case V8QImode:
42201 if (!mmx_ok)
42202 return false;
42203 vsimode = V2SImode;
42204 goto widen;
42205 widen:
42206 if (one_var != 0)
42207 return false;
42209 /* Zero extend the variable element to SImode and recurse. */
42210 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
42212 x = gen_reg_rtx (vsimode);
42213 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
42214 var, one_var))
42215 gcc_unreachable ();
42217 emit_move_insn (target, gen_lowpart (mode, x));
42218 return true;
42220 default:
42221 return false;
42225 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42226 consisting of the values in VALS. It is known that all elements
42227 except ONE_VAR are constants. Return true if successful. */
42229 static bool
42230 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42231 rtx target, rtx vals, int one_var)
42233 rtx var = XVECEXP (vals, 0, one_var);
42234 machine_mode wmode;
42235 rtx const_vec, x;
42237 const_vec = copy_rtx (vals);
42238 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42239 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42241 switch (mode)
42243 case V2DFmode:
42244 case V2DImode:
42245 case V2SFmode:
42246 case V2SImode:
42247 /* For the two element vectors, it's just as easy to use
42248 the general case. */
42249 return false;
42251 case V4DImode:
42252 /* Use ix86_expand_vector_set in 64bit mode only. */
42253 if (!TARGET_64BIT)
42254 return false;
42255 /* FALLTHRU */
42256 case V4DFmode:
42257 case V8SFmode:
42258 case V8SImode:
42259 case V16HImode:
42260 case V32QImode:
42261 case V4SFmode:
42262 case V4SImode:
42263 case V8HImode:
42264 case V4HImode:
42265 break;
42267 case V16QImode:
42268 if (TARGET_SSE4_1)
42269 break;
42270 wmode = V8HImode;
42271 goto widen;
42272 case V8QImode:
42273 wmode = V4HImode;
42274 goto widen;
42275 widen:
42276 /* There's no way to set one QImode entry easily. Combine
42277 the variable value with its adjacent constant value, and
42278 promote to an HImode set. */
42279 x = XVECEXP (vals, 0, one_var ^ 1);
42280 if (one_var & 1)
42282 var = convert_modes (HImode, QImode, var, true);
42283 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42284 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42285 x = GEN_INT (INTVAL (x) & 0xff);
42287 else
42289 var = convert_modes (HImode, QImode, var, true);
42290 x = gen_int_mode (INTVAL (x) << 8, HImode);
42292 if (x != const0_rtx)
42293 var = expand_simple_binop (HImode, IOR, var, x, var,
42294 1, OPTAB_LIB_WIDEN);
42296 x = gen_reg_rtx (wmode);
42297 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42298 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42300 emit_move_insn (target, gen_lowpart (mode, x));
42301 return true;
42303 default:
42304 return false;
42307 emit_move_insn (target, const_vec);
42308 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42309 return true;
42312 /* A subroutine of ix86_expand_vector_init_general. Use vector
42313 concatenate to handle the most general case: all values variable,
42314 and none identical. */
42316 static void
42317 ix86_expand_vector_init_concat (machine_mode mode,
42318 rtx target, rtx *ops, int n)
42320 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42321 rtx first[16], second[8], third[4];
42322 rtvec v;
42323 int i, j;
42325 switch (n)
42327 case 2:
42328 switch (mode)
42330 case V16SImode:
42331 cmode = V8SImode;
42332 break;
42333 case V16SFmode:
42334 cmode = V8SFmode;
42335 break;
42336 case V8DImode:
42337 cmode = V4DImode;
42338 break;
42339 case V8DFmode:
42340 cmode = V4DFmode;
42341 break;
42342 case V8SImode:
42343 cmode = V4SImode;
42344 break;
42345 case V8SFmode:
42346 cmode = V4SFmode;
42347 break;
42348 case V4DImode:
42349 cmode = V2DImode;
42350 break;
42351 case V4DFmode:
42352 cmode = V2DFmode;
42353 break;
42354 case V4SImode:
42355 cmode = V2SImode;
42356 break;
42357 case V4SFmode:
42358 cmode = V2SFmode;
42359 break;
42360 case V2DImode:
42361 cmode = DImode;
42362 break;
42363 case V2SImode:
42364 cmode = SImode;
42365 break;
42366 case V2DFmode:
42367 cmode = DFmode;
42368 break;
42369 case V2SFmode:
42370 cmode = SFmode;
42371 break;
42372 default:
42373 gcc_unreachable ();
42376 if (!register_operand (ops[1], cmode))
42377 ops[1] = force_reg (cmode, ops[1]);
42378 if (!register_operand (ops[0], cmode))
42379 ops[0] = force_reg (cmode, ops[0]);
42380 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42381 ops[1])));
42382 break;
42384 case 4:
42385 switch (mode)
42387 case V4DImode:
42388 cmode = V2DImode;
42389 break;
42390 case V4DFmode:
42391 cmode = V2DFmode;
42392 break;
42393 case V4SImode:
42394 cmode = V2SImode;
42395 break;
42396 case V4SFmode:
42397 cmode = V2SFmode;
42398 break;
42399 default:
42400 gcc_unreachable ();
42402 goto half;
42404 case 8:
42405 switch (mode)
42407 case V8DImode:
42408 cmode = V2DImode;
42409 hmode = V4DImode;
42410 break;
42411 case V8DFmode:
42412 cmode = V2DFmode;
42413 hmode = V4DFmode;
42414 break;
42415 case V8SImode:
42416 cmode = V2SImode;
42417 hmode = V4SImode;
42418 break;
42419 case V8SFmode:
42420 cmode = V2SFmode;
42421 hmode = V4SFmode;
42422 break;
42423 default:
42424 gcc_unreachable ();
42426 goto half;
42428 case 16:
42429 switch (mode)
42431 case V16SImode:
42432 cmode = V2SImode;
42433 hmode = V4SImode;
42434 gmode = V8SImode;
42435 break;
42436 case V16SFmode:
42437 cmode = V2SFmode;
42438 hmode = V4SFmode;
42439 gmode = V8SFmode;
42440 break;
42441 default:
42442 gcc_unreachable ();
42444 goto half;
42446 half:
42447 /* FIXME: We process inputs backward to help RA. PR 36222. */
42448 i = n - 1;
42449 j = (n >> 1) - 1;
42450 for (; i > 0; i -= 2, j--)
42452 first[j] = gen_reg_rtx (cmode);
42453 v = gen_rtvec (2, ops[i - 1], ops[i]);
42454 ix86_expand_vector_init (false, first[j],
42455 gen_rtx_PARALLEL (cmode, v));
42458 n >>= 1;
42459 if (n > 4)
42461 gcc_assert (hmode != VOIDmode);
42462 gcc_assert (gmode != VOIDmode);
42463 for (i = j = 0; i < n; i += 2, j++)
42465 second[j] = gen_reg_rtx (hmode);
42466 ix86_expand_vector_init_concat (hmode, second [j],
42467 &first [i], 2);
42469 n >>= 1;
42470 for (i = j = 0; i < n; i += 2, j++)
42472 third[j] = gen_reg_rtx (gmode);
42473 ix86_expand_vector_init_concat (gmode, third[j],
42474 &second[i], 2);
42476 n >>= 1;
42477 ix86_expand_vector_init_concat (mode, target, third, n);
42479 else if (n > 2)
42481 gcc_assert (hmode != VOIDmode);
42482 for (i = j = 0; i < n; i += 2, j++)
42484 second[j] = gen_reg_rtx (hmode);
42485 ix86_expand_vector_init_concat (hmode, second [j],
42486 &first [i], 2);
42488 n >>= 1;
42489 ix86_expand_vector_init_concat (mode, target, second, n);
42491 else
42492 ix86_expand_vector_init_concat (mode, target, first, n);
42493 break;
42495 default:
42496 gcc_unreachable ();
42500 /* A subroutine of ix86_expand_vector_init_general. Use vector
42501 interleave to handle the most general case: all values variable,
42502 and none identical. */
42504 static void
42505 ix86_expand_vector_init_interleave (machine_mode mode,
42506 rtx target, rtx *ops, int n)
42508 machine_mode first_imode, second_imode, third_imode, inner_mode;
42509 int i, j;
42510 rtx op0, op1;
42511 rtx (*gen_load_even) (rtx, rtx, rtx);
42512 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42513 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42515 switch (mode)
42517 case V8HImode:
42518 gen_load_even = gen_vec_setv8hi;
42519 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42520 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42521 inner_mode = HImode;
42522 first_imode = V4SImode;
42523 second_imode = V2DImode;
42524 third_imode = VOIDmode;
42525 break;
42526 case V16QImode:
42527 gen_load_even = gen_vec_setv16qi;
42528 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42529 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42530 inner_mode = QImode;
42531 first_imode = V8HImode;
42532 second_imode = V4SImode;
42533 third_imode = V2DImode;
42534 break;
42535 default:
42536 gcc_unreachable ();
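/* Pack each pair of scalar inputs into the two low elements of a
   FIRST_IMODE vector; the interleaves below then merge those vectors. */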
42539 for (i = 0; i < n; i++)
42541 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42542 op0 = gen_reg_rtx (SImode);
42543 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42545 /* Insert the SImode value as low element of V4SImode vector. */
42546 op1 = gen_reg_rtx (V4SImode);
42547 op0 = gen_rtx_VEC_MERGE (V4SImode,
42548 gen_rtx_VEC_DUPLICATE (V4SImode,
42549 op0),
42550 CONST0_RTX (V4SImode),
42551 const1_rtx);
42552 emit_insn (gen_rtx_SET (op1, op0));
42554 /* Cast the V4SImode vector back to a vector in the original mode. */
42555 op0 = gen_reg_rtx (mode);
42556 emit_move_insn (op0, gen_lowpart (mode, op1));
42558 /* Load even elements into the second position. */
42559 emit_insn (gen_load_even (op0,
42560 force_reg (inner_mode,
42561 ops [i + i + 1]),
42562 const1_rtx));
42564 /* Cast vector to FIRST_IMODE vector. */
42565 ops[i] = gen_reg_rtx (first_imode);
42566 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42569 /* Interleave low FIRST_IMODE vectors. */
42570 for (i = j = 0; i < n; i += 2, j++)
42572 op0 = gen_reg_rtx (first_imode);
42573 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42575 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42576 ops[j] = gen_reg_rtx (second_imode);
42577 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42580 /* Interleave low SECOND_IMODE vectors. */
42581 switch (second_imode)
42583 case V4SImode:
42584 for (i = j = 0; i < n / 2; i += 2, j++)
42586 op0 = gen_reg_rtx (second_imode);
42587 emit_insn (gen_interleave_second_low (op0, ops[i],
42588 ops[i + 1]));
42590 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42591 vector. */
42592 ops[j] = gen_reg_rtx (third_imode);
42593 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42595 second_imode = V2DImode;
42596 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42597 /* FALLTHRU */
42599 case V2DImode:
42600 op0 = gen_reg_rtx (second_imode);
42601 emit_insn (gen_interleave_second_low (op0, ops[0],
42602 ops[1]));
42604 /* Cast the SECOND_IMODE vector back to a vector in the original
42605 mode. */
42606 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42607 break;
42609 default:
42610 gcc_unreachable ();
42614 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42615 all values variable, and none identical. */
42617 static void
42618 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42619 rtx target, rtx vals)
42621 rtx ops[64], op0, op1, op2, op3, op4, op5;
42622 machine_mode half_mode = VOIDmode;
42623 machine_mode quarter_mode = VOIDmode;
42624 int n, i;
42626 switch (mode)
42628 case V2SFmode:
42629 case V2SImode:
42630 if (!mmx_ok && !TARGET_SSE)
42631 break;
42632 /* FALLTHRU */
42634 case V16SImode:
42635 case V16SFmode:
42636 case V8DFmode:
42637 case V8DImode:
42638 case V8SFmode:
42639 case V8SImode:
42640 case V4DFmode:
42641 case V4DImode:
42642 case V4SFmode:
42643 case V4SImode:
42644 case V2DFmode:
42645 case V2DImode:
42646 n = GET_MODE_NUNITS (mode);
42647 for (i = 0; i < n; i++)
42648 ops[i] = XVECEXP (vals, 0, i);
42649 ix86_expand_vector_init_concat (mode, target, ops, n);
42650 return;
42652 case V32QImode:
42653 half_mode = V16QImode;
42654 goto half;
42656 case V16HImode:
42657 half_mode = V8HImode;
42658 goto half;
42660 half:
42661 n = GET_MODE_NUNITS (mode);
42662 for (i = 0; i < n; i++)
42663 ops[i] = XVECEXP (vals, 0, i);
42664 op0 = gen_reg_rtx (half_mode);
42665 op1 = gen_reg_rtx (half_mode);
42666 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42667 n >> 2);
42668 ix86_expand_vector_init_interleave (half_mode, op1,
42669 &ops [n >> 1], n >> 2);
42670 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42671 return;
42673 case V64QImode:
42674 quarter_mode = V16QImode;
42675 half_mode = V32QImode;
42676 goto quarter;
42678 case V32HImode:
42679 quarter_mode = V8HImode;
42680 half_mode = V16HImode;
42681 goto quarter;
42683 quarter:
42684 n = GET_MODE_NUNITS (mode);
42685 for (i = 0; i < n; i++)
42686 ops[i] = XVECEXP (vals, 0, i);
42687 op0 = gen_reg_rtx (quarter_mode);
42688 op1 = gen_reg_rtx (quarter_mode);
42689 op2 = gen_reg_rtx (quarter_mode);
42690 op3 = gen_reg_rtx (quarter_mode);
42691 op4 = gen_reg_rtx (half_mode);
42692 op5 = gen_reg_rtx (half_mode);
42693 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42694 n >> 3);
42695 ix86_expand_vector_init_interleave (quarter_mode, op1,
42696 &ops [n >> 2], n >> 3);
42697 ix86_expand_vector_init_interleave (quarter_mode, op2,
42698 &ops [n >> 1], n >> 3);
42699 ix86_expand_vector_init_interleave (quarter_mode, op3,
42700 &ops [(n >> 1) | (n >> 2)], n >> 3);
42701 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42702 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42703 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42704 return;
42706 case V16QImode:
42707 if (!TARGET_SSE4_1)
42708 break;
42709 /* FALLTHRU */
42711 case V8HImode:
42712 if (!TARGET_SSE2)
42713 break;
42715 /* Don't use ix86_expand_vector_init_interleave if we can't
42716 move from GPR to SSE register directly. */
42717 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42718 break;
42720 n = GET_MODE_NUNITS (mode);
42721 for (i = 0; i < n; i++)
42722 ops[i] = XVECEXP (vals, 0, i);
42723 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42724 return;
42726 case V4HImode:
42727 case V8QImode:
42728 break;
42730 default:
42731 gcc_unreachable ();
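/* Fallback for the remaining MMX/SSE small-element cases: build each
   word-sized chunk of the vector in an integer register by shifting and
   ORing the elements together, then assemble the words into the vector. */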
42735 int i, j, n_elts, n_words, n_elt_per_word;
42736 machine_mode inner_mode;
42737 rtx words[4], shift;
42739 inner_mode = GET_MODE_INNER (mode);
42740 n_elts = GET_MODE_NUNITS (mode);
42741 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42742 n_elt_per_word = n_elts / n_words;
42743 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42745 for (i = 0; i < n_words; ++i)
42747 rtx word = NULL_RTX;
42749 for (j = 0; j < n_elt_per_word; ++j)
42751 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42752 elt = convert_modes (word_mode, inner_mode, elt, true);
42754 if (j == 0)
42755 word = elt;
42756 else
42758 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42759 word, 1, OPTAB_LIB_WIDEN);
42760 word = expand_simple_binop (word_mode, IOR, word, elt,
42761 word, 1, OPTAB_LIB_WIDEN);
42765 words[i] = word;
42768 if (n_words == 1)
42769 emit_move_insn (target, gen_lowpart (mode, words[0]));
42770 else if (n_words == 2)
42772 rtx tmp = gen_reg_rtx (mode);
42773 emit_clobber (tmp);
42774 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42775 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42776 emit_move_insn (target, tmp);
42778 else if (n_words == 4)
42780 rtx tmp = gen_reg_rtx (V4SImode);
42781 gcc_assert (word_mode == SImode);
42782 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42783 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42784 emit_move_insn (target, gen_lowpart (mode, tmp));
42786 else
42787 gcc_unreachable ();
42791 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42792 instructions unless MMX_OK is true. */
42794 void
42795 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42797 machine_mode mode = GET_MODE (target);
42798 machine_mode inner_mode = GET_MODE_INNER (mode);
42799 int n_elts = GET_MODE_NUNITS (mode);
42800 int n_var = 0, one_var = -1;
42801 bool all_same = true, all_const_zero = true;
42802 int i;
42803 rtx x;
42805 for (i = 0; i < n_elts; ++i)
42807 x = XVECEXP (vals, 0, i);
42808 if (!(CONST_SCALAR_INT_P (x)
42809 || CONST_DOUBLE_P (x)
42810 || CONST_FIXED_P (x)))
42811 n_var++, one_var = i;
42812 else if (x != CONST0_RTX (inner_mode))
42813 all_const_zero = false;
42814 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42815 all_same = false;
42818 /* Constants are best loaded from the constant pool. */
42819 if (n_var == 0)
42821 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42822 return;
42825 /* If all values are identical, broadcast the value. */
42826 if (all_same
42827 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42828 XVECEXP (vals, 0, 0)))
42829 return;
42831 /* Values where only one field is non-constant are best loaded from
42832 the pool and overwritten via move later. */
42833 if (n_var == 1)
42835 if (all_const_zero
42836 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42837 XVECEXP (vals, 0, one_var),
42838 one_var))
42839 return;
42841 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42842 return;
42845 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42848 void
42849 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42851 machine_mode mode = GET_MODE (target);
42852 machine_mode inner_mode = GET_MODE_INNER (mode);
42853 machine_mode half_mode;
42854 bool use_vec_merge = false;
42855 rtx tmp;
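/* Pattern tables for the 256-bit modes handled by the `half' cases below:
   extract or re-insert a 128-bit half, indexed by mode (j) and by which
   half (i) contains the element. */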
42856 static rtx (*gen_extract[6][2]) (rtx, rtx)
42858 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42859 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42860 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42861 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42862 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42863 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42865 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42867 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42868 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42869 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42870 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42871 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42872 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42874 int i, j, n;
42875 machine_mode mmode = VOIDmode;
42876 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42878 switch (mode)
42880 case V2SFmode:
42881 case V2SImode:
42882 if (mmx_ok)
42884 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42885 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42886 if (elt == 0)
42887 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42888 else
42889 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42890 emit_insn (gen_rtx_SET (target, tmp));
42891 return;
42893 break;
42895 case V2DImode:
42896 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42897 if (use_vec_merge)
42898 break;
42900 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42901 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42902 if (elt == 0)
42903 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42904 else
42905 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42906 emit_insn (gen_rtx_SET (target, tmp));
42907 return;
42909 case V2DFmode:
42911 rtx op0, op1;
42913 /* For the two element vectors, we implement a VEC_CONCAT with
42914 the extraction of the other element. */
42916 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42917 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42919 if (elt == 0)
42920 op0 = val, op1 = tmp;
42921 else
42922 op0 = tmp, op1 = val;
42924 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42925 emit_insn (gen_rtx_SET (target, tmp));
42927 return;
42929 case V4SFmode:
42930 use_vec_merge = TARGET_SSE4_1;
42931 if (use_vec_merge)
42932 break;
42934 switch (elt)
42936 case 0:
42937 use_vec_merge = true;
42938 break;
42940 case 1:
42941 /* tmp = target = A B C D */
42942 tmp = copy_to_reg (target);
42943 /* target = A A B B */
42944 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42945 /* target = X A B B */
42946 ix86_expand_vector_set (false, target, val, 0);
42947 /* target = A X C D */
42948 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42949 const1_rtx, const0_rtx,
42950 GEN_INT (2+4), GEN_INT (3+4)));
42951 return;
42953 case 2:
42954 /* tmp = target = A B C D */
42955 tmp = copy_to_reg (target);
42956 /* tmp = X B C D */
42957 ix86_expand_vector_set (false, tmp, val, 0);
42958 /* target = A B X D */
42959 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42960 const0_rtx, const1_rtx,
42961 GEN_INT (0+4), GEN_INT (3+4)));
42962 return;
42964 case 3:
42965 /* tmp = target = A B C D */
42966 tmp = copy_to_reg (target);
42967 /* tmp = X B C D */
42968 ix86_expand_vector_set (false, tmp, val, 0);
42969 /* target = A B C X */
42970 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42971 const0_rtx, const1_rtx,
42972 GEN_INT (2+4), GEN_INT (0+4)));
42973 return;
42975 default:
42976 gcc_unreachable ();
42978 break;
42980 case V4SImode:
42981 use_vec_merge = TARGET_SSE4_1;
42982 if (use_vec_merge)
42983 break;
42985 /* Element 0 handled by vec_merge below. */
42986 if (elt == 0)
42988 use_vec_merge = true;
42989 break;
42992 if (TARGET_SSE2)
42994 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42995 store into element 0, then shuffle them back. */
42997 rtx order[4];
42999 order[0] = GEN_INT (elt);
43000 order[1] = const1_rtx;
43001 order[2] = const2_rtx;
43002 order[3] = GEN_INT (3);
43003 order[elt] = const0_rtx;
43005 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43006 order[1], order[2], order[3]));
43008 ix86_expand_vector_set (false, target, val, 0);
43010 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43011 order[1], order[2], order[3]));
43013 else
43015 /* For SSE1, we have to reuse the V4SF code. */
43016 rtx t = gen_reg_rtx (V4SFmode);
43017 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43018 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43019 emit_move_insn (target, gen_lowpart (mode, t));
43021 return;
43023 case V8HImode:
43024 use_vec_merge = TARGET_SSE2;
43025 break;
43026 case V4HImode:
43027 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43028 break;
43030 case V16QImode:
43031 use_vec_merge = TARGET_SSE4_1;
43032 break;
43034 case V8QImode:
43035 break;
43037 case V32QImode:
43038 half_mode = V16QImode;
43039 j = 0;
43040 n = 16;
43041 goto half;
43043 case V16HImode:
43044 half_mode = V8HImode;
43045 j = 1;
43046 n = 8;
43047 goto half;
43049 case V8SImode:
43050 half_mode = V4SImode;
43051 j = 2;
43052 n = 4;
43053 goto half;
43055 case V4DImode:
43056 half_mode = V2DImode;
43057 j = 3;
43058 n = 2;
43059 goto half;
43061 case V8SFmode:
43062 half_mode = V4SFmode;
43063 j = 4;
43064 n = 4;
43065 goto half;
43067 case V4DFmode:
43068 half_mode = V2DFmode;
43069 j = 5;
43070 n = 2;
43071 goto half;
43073 half:
43074 /* Compute offset. */
43075 i = elt / n;
43076 elt %= n;
43078 gcc_assert (i <= 1);
43080 /* Extract the half. */
43081 tmp = gen_reg_rtx (half_mode);
43082 emit_insn (gen_extract[j][i] (tmp, target));
43084 /* Put val in tmp at elt. */
43085 ix86_expand_vector_set (false, tmp, val, elt);
43087 /* Put it back. */
43088 emit_insn (gen_insert[j][i] (target, target, tmp));
43089 return;
43091 case V8DFmode:
43092 if (TARGET_AVX512F)
43094 mmode = QImode;
43095 gen_blendm = gen_avx512f_blendmv8df;
43097 break;
43099 case V8DImode:
43100 if (TARGET_AVX512F)
43102 mmode = QImode;
43103 gen_blendm = gen_avx512f_blendmv8di;
43105 break;
43107 case V16SFmode:
43108 if (TARGET_AVX512F)
43110 mmode = HImode;
43111 gen_blendm = gen_avx512f_blendmv16sf;
43113 break;
43115 case V16SImode:
43116 if (TARGET_AVX512F)
43118 mmode = HImode;
43119 gen_blendm = gen_avx512f_blendmv16si;
43121 break;
43123 case V32HImode:
43124 if (TARGET_AVX512F && TARGET_AVX512BW)
43126 mmode = SImode;
43127 gen_blendm = gen_avx512bw_blendmv32hi;
43129 break;
43131 case V64QImode:
43132 if (TARGET_AVX512F && TARGET_AVX512BW)
43134 mmode = DImode;
43135 gen_blendm = gen_avx512bw_blendmv64qi;
43137 break;
43139 default:
43140 break;
43143 if (mmode != VOIDmode)
43145 tmp = gen_reg_rtx (mode);
43146 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
43147 /* The avx512*_blendm<mode> expanders have different operand order
43148 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
43149 elements where the mask is set and the second input operand otherwise,
43150 while in {sse,avx}*_*blend* the first input operand is used for elements
43151 where the mask is clear and the second input operand otherwise. */
43152 emit_insn (gen_blendm (target, target, tmp,
43153 force_reg (mmode,
43154 gen_int_mode (1 << elt, mmode))));
43156 else if (use_vec_merge)
43158 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
43159 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
43160 emit_insn (gen_rtx_SET (target, tmp));
43162 else
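/* No single-insn way to do the insertion: spill the vector to a stack
   temporary, store the scalar into the right slot, and reload the vector. */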
43164 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43166 emit_move_insn (mem, target);
43168 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43169 emit_move_insn (tmp, val);
43171 emit_move_insn (target, mem);
43175 void
43176 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
43178 machine_mode mode = GET_MODE (vec);
43179 machine_mode inner_mode = GET_MODE_INNER (mode);
43180 bool use_vec_extr = false;
43181 rtx tmp;
43183 switch (mode)
43185 case V2SImode:
43186 case V2SFmode:
43187 if (!mmx_ok)
43188 break;
43189 /* FALLTHRU */
43191 case V2DFmode:
43192 case V2DImode:
43193 use_vec_extr = true;
43194 break;
43196 case V4SFmode:
43197 use_vec_extr = TARGET_SSE4_1;
43198 if (use_vec_extr)
43199 break;
43201 switch (elt)
43203 case 0:
43204 tmp = vec;
43205 break;
43207 case 1:
43208 case 3:
43209 tmp = gen_reg_rtx (mode);
43210 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
43211 GEN_INT (elt), GEN_INT (elt),
43212 GEN_INT (elt+4), GEN_INT (elt+4)));
43213 break;
43215 case 2:
43216 tmp = gen_reg_rtx (mode);
43217 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
43218 break;
43220 default:
43221 gcc_unreachable ();
43223 vec = tmp;
43224 use_vec_extr = true;
43225 elt = 0;
43226 break;
43228 case V4SImode:
43229 use_vec_extr = TARGET_SSE4_1;
43230 if (use_vec_extr)
43231 break;
43233 if (TARGET_SSE2)
43235 switch (elt)
43237 case 0:
43238 tmp = vec;
43239 break;
43241 case 1:
43242 case 3:
43243 tmp = gen_reg_rtx (mode);
43244 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43245 GEN_INT (elt), GEN_INT (elt),
43246 GEN_INT (elt), GEN_INT (elt)));
43247 break;
43249 case 2:
43250 tmp = gen_reg_rtx (mode);
43251 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43252 break;
43254 default:
43255 gcc_unreachable ();
43257 vec = tmp;
43258 use_vec_extr = true;
43259 elt = 0;
43261 else
43263 /* For SSE1, we have to reuse the V4SF code. */
43264 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43265 gen_lowpart (V4SFmode, vec), elt);
43266 return;
43268 break;
43270 case V8HImode:
43271 use_vec_extr = TARGET_SSE2;
43272 break;
43273 case V4HImode:
43274 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43275 break;
43277 case V16QImode:
43278 use_vec_extr = TARGET_SSE4_1;
43279 break;
43281 case V8SFmode:
43282 if (TARGET_AVX)
43284 tmp = gen_reg_rtx (V4SFmode);
43285 if (elt < 4)
43286 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43287 else
43288 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43289 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43290 return;
43292 break;
43294 case V4DFmode:
43295 if (TARGET_AVX)
43297 tmp = gen_reg_rtx (V2DFmode);
43298 if (elt < 2)
43299 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43300 else
43301 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43302 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43303 return;
43305 break;
43307 case V32QImode:
43308 if (TARGET_AVX)
43310 tmp = gen_reg_rtx (V16QImode);
43311 if (elt < 16)
43312 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43313 else
43314 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43315 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43316 return;
43318 break;
43320 case V16HImode:
43321 if (TARGET_AVX)
43323 tmp = gen_reg_rtx (V8HImode);
43324 if (elt < 8)
43325 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43326 else
43327 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43328 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43329 return;
43331 break;
43333 case V8SImode:
43334 if (TARGET_AVX)
43336 tmp = gen_reg_rtx (V4SImode);
43337 if (elt < 4)
43338 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43339 else
43340 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43341 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43342 return;
43344 break;
43346 case V4DImode:
43347 if (TARGET_AVX)
43349 tmp = gen_reg_rtx (V2DImode);
43350 if (elt < 2)
43351 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43352 else
43353 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43354 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43355 return;
43357 break;
43359 case V32HImode:
43360 if (TARGET_AVX512BW)
43362 tmp = gen_reg_rtx (V16HImode);
43363 if (elt < 16)
43364 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43365 else
43366 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43367 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43368 return;
43370 break;
43372 case V64QImode:
43373 if (TARGET_AVX512BW)
43375 tmp = gen_reg_rtx (V32QImode);
43376 if (elt < 32)
43377 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43378 else
43379 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43380 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43381 return;
43383 break;
43385 case V16SFmode:
43386 tmp = gen_reg_rtx (V8SFmode);
43387 if (elt < 8)
43388 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43389 else
43390 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43391 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43392 return;
43394 case V8DFmode:
43395 tmp = gen_reg_rtx (V4DFmode);
43396 if (elt < 4)
43397 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43398 else
43399 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43400 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43401 return;
43403 case V16SImode:
43404 tmp = gen_reg_rtx (V8SImode);
43405 if (elt < 8)
43406 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43407 else
43408 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43409 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43410 return;
43412 case V8DImode:
43413 tmp = gen_reg_rtx (V4DImode);
43414 if (elt < 4)
43415 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43416 else
43417 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43418 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43419 return;
43421 case V8QImode:
43422 /* ??? Could extract the appropriate HImode element and shift. */
43423 default:
43424 break;
43427 if (use_vec_extr)
43429 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43430 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43432 /* Let the rtl optimizers know about the zero extension performed. */
43433 if (inner_mode == QImode || inner_mode == HImode)
43435 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43436 target = gen_lowpart (SImode, target);
43439 emit_insn (gen_rtx_SET (target, tmp));
43441 else
43443 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43445 emit_move_insn (mem, vec);
43447 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43448 emit_move_insn (target, tmp);
43452 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43453 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43454 The upper bits of DEST are undefined, though they shouldn't cause
43455 exceptions (some bits from src or all zeros are ok). */
43457 static void
43458 emit_reduc_half (rtx dest, rtx src, int i)
43460 rtx tem, d = dest;
43461 switch (GET_MODE (src))
43463 case V4SFmode:
43464 if (i == 128)
43465 tem = gen_sse_movhlps (dest, src, src);
43466 else
43467 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43468 GEN_INT (1 + 4), GEN_INT (1 + 4));
43469 break;
43470 case V2DFmode:
43471 tem = gen_vec_interleave_highv2df (dest, src, src);
43472 break;
43473 case V16QImode:
43474 case V8HImode:
43475 case V4SImode:
43476 case V2DImode:
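/* For the 128-bit integer vectors, shift the whole vector right by I/2 bits
   through V1TImode so the upper half lands in the low half. */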
43477 d = gen_reg_rtx (V1TImode);
43478 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43479 GEN_INT (i / 2));
43480 break;
43481 case V8SFmode:
43482 if (i == 256)
43483 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43484 else
43485 tem = gen_avx_shufps256 (dest, src, src,
43486 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43487 break;
43488 case V4DFmode:
43489 if (i == 256)
43490 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43491 else
43492 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43493 break;
43494 case V32QImode:
43495 case V16HImode:
43496 case V8SImode:
43497 case V4DImode:
43498 if (i == 256)
43500 if (GET_MODE (dest) != V4DImode)
43501 d = gen_reg_rtx (V4DImode);
43502 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43503 gen_lowpart (V4DImode, src),
43504 const1_rtx);
43506 else
43508 d = gen_reg_rtx (V2TImode);
43509 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43510 GEN_INT (i / 2));
43512 break;
43513 case V64QImode:
43514 case V32HImode:
43515 case V16SImode:
43516 case V16SFmode:
43517 case V8DImode:
43518 case V8DFmode:
43519 if (i > 128)
43520 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43521 gen_lowpart (V16SImode, src),
43522 gen_lowpart (V16SImode, src),
43523 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43524 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43525 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43526 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43527 GEN_INT (0xC), GEN_INT (0xD),
43528 GEN_INT (0xE), GEN_INT (0xF),
43529 GEN_INT (0x10), GEN_INT (0x11),
43530 GEN_INT (0x12), GEN_INT (0x13),
43531 GEN_INT (0x14), GEN_INT (0x15),
43532 GEN_INT (0x16), GEN_INT (0x17));
43533 else
43534 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43535 gen_lowpart (V16SImode, src),
43536 GEN_INT (i == 128 ? 0x2 : 0x1),
43537 GEN_INT (0x3),
43538 GEN_INT (0x3),
43539 GEN_INT (0x3),
43540 GEN_INT (i == 128 ? 0x6 : 0x5),
43541 GEN_INT (0x7),
43542 GEN_INT (0x7),
43543 GEN_INT (0x7),
43544 GEN_INT (i == 128 ? 0xA : 0x9),
43545 GEN_INT (0xB),
43546 GEN_INT (0xB),
43547 GEN_INT (0xB),
43548 GEN_INT (i == 128 ? 0xE : 0xD),
43549 GEN_INT (0xF),
43550 GEN_INT (0xF),
43551 GEN_INT (0xF));
43552 break;
43553 default:
43554 gcc_unreachable ();
43556 emit_insn (tem);
43557 if (d != dest)
43558 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43561 /* Expand a vector reduction. FN is the binary pattern to reduce;
43562 DEST is the destination; IN is the input vector. */
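/* Rough sketch for a V4SImode reduction: the first emit_reduc_half call
   (I == 128) folds elements 2 and 3 onto 0 and 1 and FN combines them;
   the second call (I == 64) folds element 1 onto element 0, so the final
   FN leaves the reduction result in element 0 of DEST. */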
43564 void
43565 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43567 rtx half, dst, vec = in;
43568 machine_mode mode = GET_MODE (in);
43569 int i;
43571 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43572 if (TARGET_SSE4_1
43573 && mode == V8HImode
43574 && fn == gen_uminv8hi3)
43576 emit_insn (gen_sse4_1_phminposuw (dest, in));
43577 return;
43580 for (i = GET_MODE_BITSIZE (mode);
43581 i > GET_MODE_UNIT_BITSIZE (mode);
43582 i >>= 1)
43584 half = gen_reg_rtx (mode);
43585 emit_reduc_half (half, vec, i);
43586 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43587 dst = dest;
43588 else
43589 dst = gen_reg_rtx (mode);
43590 emit_insn (fn (dst, half, vec));
43591 vec = dst;
43595 /* Target hook for scalar_mode_supported_p. */
43596 static bool
43597 ix86_scalar_mode_supported_p (machine_mode mode)
43599 if (DECIMAL_FLOAT_MODE_P (mode))
43600 return default_decimal_float_supported_p ();
43601 else if (mode == TFmode)
43602 return true;
43603 else
43604 return default_scalar_mode_supported_p (mode);
43607 /* Implements target hook vector_mode_supported_p. */
43608 static bool
43609 ix86_vector_mode_supported_p (machine_mode mode)
43611 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43612 return true;
43613 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43614 return true;
43615 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43616 return true;
43617 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43618 return true;
43619 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43620 return true;
43621 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43622 return true;
43623 return false;
43626 /* Target hook for c_mode_for_suffix. */
43627 static machine_mode
43628 ix86_c_mode_for_suffix (char suffix)
43630 if (suffix == 'q')
43631 return TFmode;
43632 if (suffix == 'w')
43633 return XFmode;
43635 return VOIDmode;
43638 /* Worker function for TARGET_MD_ASM_ADJUST.
43640 We implement asm flag outputs, and maintain source compatibility
43641 with the old cc0-based compiler. */
43643 static rtx_insn *
43644 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43645 vec<const char *> &constraints,
43646 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43648 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43649 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43651 bool saw_asm_flag = false;
43653 start_sequence ();
43654 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43656 const char *con = constraints[i];
43657 if (strncmp (con, "=@cc", 4) != 0)
43658 continue;
43659 con += 4;
43660 if (strchr (con, ',') != NULL)
43662 error ("alternatives not allowed in asm flag output");
43663 continue;
43666 bool invert = false;
43667 if (con[0] == 'n')
43668 invert = true, con++;
43670 machine_mode mode = CCmode;
43671 rtx_code code = UNKNOWN;
43673 switch (con[0])
43675 case 'a':
43676 if (con[1] == 0)
43677 mode = CCAmode, code = EQ;
43678 else if (con[1] == 'e' && con[2] == 0)
43679 mode = CCCmode, code = NE;
43680 break;
43681 case 'b':
43682 if (con[1] == 0)
43683 mode = CCCmode, code = EQ;
43684 else if (con[1] == 'e' && con[2] == 0)
43685 mode = CCAmode, code = NE;
43686 break;
43687 case 'c':
43688 if (con[1] == 0)
43689 mode = CCCmode, code = EQ;
43690 break;
43691 case 'e':
43692 if (con[1] == 0)
43693 mode = CCZmode, code = EQ;
43694 break;
43695 case 'g':
43696 if (con[1] == 0)
43697 mode = CCGCmode, code = GT;
43698 else if (con[1] == 'e' && con[2] == 0)
43699 mode = CCGCmode, code = GE;
43700 break;
43701 case 'l':
43702 if (con[1] == 0)
43703 mode = CCGCmode, code = LT;
43704 else if (con[1] == 'e' && con[2] == 0)
43705 mode = CCGCmode, code = LE;
43706 break;
43707 case 'o':
43708 if (con[1] == 0)
43709 mode = CCOmode, code = EQ;
43710 break;
43711 case 'p':
43712 if (con[1] == 0)
43713 mode = CCPmode, code = EQ;
43714 break;
43715 case 's':
43716 if (con[1] == 0)
43717 mode = CCSmode, code = EQ;
43718 break;
43719 case 'z':
43720 if (con[1] == 0)
43721 mode = CCZmode, code = EQ;
43722 break;
43724 if (code == UNKNOWN)
43726 error ("unknown asm flag output %qs", constraints[i]);
43727 continue;
43729 if (invert)
43730 code = reverse_condition (code);
43732 rtx dest = outputs[i];
43733 if (!saw_asm_flag)
43735 /* This is the first asm flag output. Here we put the flags
43736 register in as the real output and adjust the condition to
43737 allow it. */
43738 constraints[i] = "=Bf";
43739 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43740 saw_asm_flag = true;
43742 else
43744 /* We don't need the flags register as output twice. */
43745 constraints[i] = "=X";
43746 outputs[i] = gen_rtx_SCRATCH (SImode);
43749 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43750 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43752 machine_mode dest_mode = GET_MODE (dest);
43753 if (!SCALAR_INT_MODE_P (dest_mode))
43755 error ("invalid type for asm flag output");
43756 continue;
43759 if (dest_mode == DImode && !TARGET_64BIT)
43760 dest_mode = SImode;
43762 if (dest_mode != QImode)
43764 rtx destqi = gen_reg_rtx (QImode);
43765 emit_insn (gen_rtx_SET (destqi, x));
43767 if (TARGET_ZERO_EXTEND_WITH_AND
43768 && optimize_function_for_speed_p (cfun))
43770 x = force_reg (dest_mode, const0_rtx);
43772 emit_insn (gen_movstrictqi
43773 (gen_lowpart (QImode, x), destqi));
43775 else
43776 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43779 if (dest_mode != GET_MODE (dest))
43781 rtx tmp = gen_reg_rtx (SImode);
43783 emit_insn (gen_rtx_SET (tmp, x));
43784 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43786 else
43787 emit_insn (gen_rtx_SET (dest, x));
43789 rtx_insn *seq = get_insns ();
43790 end_sequence ();
43792 if (saw_asm_flag)
43793 return seq;
43794 else
43796 /* If we had no asm flag outputs, clobber the flags. */
43797 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43798 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43799 return NULL;
43803 /* Implements the target hook targetm.asm.encode_section_info. */
43805 static void ATTRIBUTE_UNUSED
43806 ix86_encode_section_info (tree decl, rtx rtl, int first)
43808 default_encode_section_info (decl, rtl, first);
43810 if (ix86_in_large_data_p (decl))
43811 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43814 /* Worker function for REVERSE_CONDITION. */
43816 enum rtx_code
43817 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43819 return (mode != CCFPmode && mode != CCFPUmode
43820 ? reverse_condition (code)
43821 : reverse_condition_maybe_unordered (code));
43824 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43825 to OPERANDS[0]. */
43827 const char *
43828 output_387_reg_move (rtx insn, rtx *operands)
43830 if (REG_P (operands[0]))
43832 if (REG_P (operands[1])
43833 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43835 if (REGNO (operands[0]) == FIRST_STACK_REG)
43836 return output_387_ffreep (operands, 0);
43837 return "fstp\t%y0";
43839 if (STACK_TOP_P (operands[0]))
43840 return "fld%Z1\t%y1";
43841 return "fst\t%y0";
43843 else if (MEM_P (operands[0]))
43845 gcc_assert (REG_P (operands[1]));
43846 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43847 return "fstp%Z0\t%y0";
43848 else
43850 /* There is no non-popping store to memory for XFmode.
43851 So if we need one, follow the store with a load. */
43852 if (GET_MODE (operands[0]) == XFmode)
43853 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43854 else
43855 return "fst%Z0\t%y0";
43858 else
43859 gcc_unreachable ();
43862 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43863 FP status register is set. */
43865 void
43866 ix86_emit_fp_unordered_jump (rtx label)
43868 rtx reg = gen_reg_rtx (HImode);
43869 rtx temp;
43871 emit_insn (gen_x86_fnstsw_1 (reg));
43873 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43875 emit_insn (gen_x86_sahf_1 (reg));
43877 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43878 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43880 else
43882 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
43884 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43885 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43888 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43889 gen_rtx_LABEL_REF (VOIDmode, label),
43890 pc_rtx);
43891 temp = gen_rtx_SET (pc_rtx, temp);
43893 emit_jump_insn (temp);
43894 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43897 /* Output code to perform a log1p XFmode calculation. */
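/* fyl2xp1 is only specified for arguments with |x| below roughly
   1 - sqrt(2)/2 ~= 0.2929 (the constant tested below); for larger |x|
   the code falls back to computing fyl2x on 1 + x instead. */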
43899 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43901 rtx_code_label *label1 = gen_label_rtx ();
43902 rtx_code_label *label2 = gen_label_rtx ();
43904 rtx tmp = gen_reg_rtx (XFmode);
43905 rtx tmp2 = gen_reg_rtx (XFmode);
43906 rtx test;
43908 emit_insn (gen_absxf2 (tmp, op1));
43909 test = gen_rtx_GE (VOIDmode, tmp,
43910 const_double_from_real_value (
43911 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43912 XFmode));
43913 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43915 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43916 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43917 emit_jump (label2);
43919 emit_label (label1);
43920 emit_move_insn (tmp, CONST1_RTX (XFmode));
43921 emit_insn (gen_addxf3 (tmp, op1, tmp));
43922 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43923 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43925 emit_label (label2);
43928 /* Emit code for round calculation. */
43929 void ix86_emit_i387_round (rtx op0, rtx op1)
43931 machine_mode inmode = GET_MODE (op1);
43932 machine_mode outmode = GET_MODE (op0);
43933 rtx e1, e2, res, tmp, tmp1, half;
43934 rtx scratch = gen_reg_rtx (HImode);
43935 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43936 rtx_code_label *jump_label = gen_label_rtx ();
43937 rtx insn;
43938 rtx (*gen_abs) (rtx, rtx);
43939 rtx (*gen_neg) (rtx, rtx);
43941 switch (inmode)
43943 case SFmode:
43944 gen_abs = gen_abssf2;
43945 break;
43946 case DFmode:
43947 gen_abs = gen_absdf2;
43948 break;
43949 case XFmode:
43950 gen_abs = gen_absxf2;
43951 break;
43952 default:
43953 gcc_unreachable ();
43956 switch (outmode)
43958 case SFmode:
43959 gen_neg = gen_negsf2;
43960 break;
43961 case DFmode:
43962 gen_neg = gen_negdf2;
43963 break;
43964 case XFmode:
43965 gen_neg = gen_negxf2;
43966 break;
43967 case HImode:
43968 gen_neg = gen_neghi2;
43969 break;
43970 case SImode:
43971 gen_neg = gen_negsi2;
43972 break;
43973 case DImode:
43974 gen_neg = gen_negdi2;
43975 break;
43976 default:
43977 gcc_unreachable ();
43980 e1 = gen_reg_rtx (inmode);
43981 e2 = gen_reg_rtx (inmode);
43982 res = gen_reg_rtx (outmode);
43984 half = const_double_from_real_value (dconsthalf, inmode);
43986 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
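/* Worked example: round (-2.3) -- fxam records the negative sign, fabs
   gives 2.3, adding 0.5 gives 2.8, floor gives 2, and the final sign
   test negates the result to -2.  Halfway cases round away from zero,
   e.g. round (2.5) == 3. */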
43988 /* scratch = fxam(op1) */
43989 emit_insn (gen_rtx_SET (scratch,
43990 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43991 UNSPEC_FXAM)));
43992 /* e1 = fabs(op1) */
43993 emit_insn (gen_abs (e1, op1));
43995 /* e2 = e1 + 0.5 */
43996 half = force_reg (inmode, half);
43997 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43999 /* res = floor(e2) */
44000 if (inmode != XFmode)
44002 tmp1 = gen_reg_rtx (XFmode);
44004 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44006 else
44007 tmp1 = e2;
44009 switch (outmode)
44011 case SFmode:
44012 case DFmode:
44014 rtx tmp0 = gen_reg_rtx (XFmode);
44016 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44018 emit_insn (gen_rtx_SET (res,
44019 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44020 UNSPEC_TRUNC_NOOP)));
44022 break;
44023 case XFmode:
44024 emit_insn (gen_frndintxf2_floor (res, tmp1));
44025 break;
44026 case HImode:
44027 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44028 break;
44029 case SImode:
44030 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44031 break;
44032 case DImode:
44033 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44034 break;
44035 default:
44036 gcc_unreachable ();
44039 /* flags = signbit(a) */
44040 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
44042 /* if (flags) then res = -res */
44043 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44044 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44045 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44046 pc_rtx);
44047 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44048 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44049 JUMP_LABEL (insn) = jump_label;
44051 emit_insn (gen_neg (res, res));
44053 emit_label (jump_label);
44054 LABEL_NUSES (jump_label) = 1;
44056 emit_move_insn (op0, res);
44059 /* Output code to perform a Newton-Raphson approximation of a single precision
44060 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44062 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44064 rtx x0, x1, e0, e1;
44066 x0 = gen_reg_rtx (mode);
44067 e0 = gen_reg_rtx (mode);
44068 e1 = gen_reg_rtx (mode);
44069 x1 = gen_reg_rtx (mode);
44071 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
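/* This is one Newton-Raphson step on the hardware estimate x0 ~= 1/b:
   x1 = x0 * (2 - b*x0) = 2*x0 - b*x0*x0, roughly doubling the ~12 correct
   bits of the estimate before the final multiply by a. */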
44073 b = force_reg (mode, b);
44075 /* x0 = rcp(b) estimate */
44076 if (mode == V16SFmode || mode == V8DFmode)
44078 if (TARGET_AVX512ER)
44080 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44081 UNSPEC_RCP28)));
44082 /* res = a * x0 */
44083 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
44084 return;
44086 else
44087 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44088 UNSPEC_RCP14)));
44090 else
44091 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44092 UNSPEC_RCP)));
44094 /* e0 = x0 * b */
44095 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
44097 /* e0 = x0 * e0 */
44098 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
44100 /* e1 = x0 + x0 */
44101 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
44103 /* x1 = e1 - e0 */
44104 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
44106 /* res = a * x1 */
44107 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
44110 /* Output code to perform a Newton-Raphson approximation of a
44111 single precision floating point [reciprocal] square root. */
44113 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
44115 rtx x0, e0, e1, e2, e3, mthree, mhalf;
44116 REAL_VALUE_TYPE r;
44117 int unspec;
44119 x0 = gen_reg_rtx (mode);
44120 e0 = gen_reg_rtx (mode);
44121 e1 = gen_reg_rtx (mode);
44122 e2 = gen_reg_rtx (mode);
44123 e3 = gen_reg_rtx (mode);
44125 if (TARGET_AVX512ER && mode == V16SFmode)
44127 if (recip)
44128 /* res = rsqrt28(a) estimate */
44129 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44130 UNSPEC_RSQRT28)));
44131 else
44133 /* x0 = rsqrt28(a) estimate */
44134 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44135 UNSPEC_RSQRT28)));
44136 /* res = rcp28(x0) estimate */
44137 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
44138 UNSPEC_RCP28)));
44140 return;
44143 real_from_integer (&r, VOIDmode, -3, SIGNED);
44144 mthree = const_double_from_real_value (r, SFmode);
44146 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
44147 mhalf = const_double_from_real_value (r, SFmode);
44148 unspec = UNSPEC_RSQRT;
44150 if (VECTOR_MODE_P (mode))
44152 mthree = ix86_build_const_vector (mode, true, mthree);
44153 mhalf = ix86_build_const_vector (mode, true, mhalf);
44154 /* There is no 512-bit rsqrt. There is however rsqrt14. */
44155 if (GET_MODE_SIZE (mode) == 64)
44156 unspec = UNSPEC_RSQRT14;
44159 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
44160 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
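/* Both forms are one Newton-Raphson step on the estimate y0 = rsqrtss(a):
   y1 = y0 * (3 - a*y0*y0) / 2.  Multiplying the whole expression by a in
   the sqrt case turns the refined reciprocal square root into sqrt(a). */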
44162 a = force_reg (mode, a);
44164 /* x0 = rsqrt(a) estimate */
44165 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44166 unspec)));
44168 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
44169 if (!recip)
44171 rtx zero = force_reg (mode, CONST0_RTX(mode));
44172 rtx mask;
44174 /* Handle masked compare. */
44175 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
44177 mask = gen_reg_rtx (HImode);
44178 /* Imm value 0x4 corresponds to not-equal comparison. */
44179 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
44180 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
44182 else
44184 mask = gen_reg_rtx (mode);
44185 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
44186 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
44190 /* e0 = x0 * a */
44191 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
44192 /* e1 = e0 * x0 */
44193 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
44195 /* e2 = e1 - 3. */
44196 mthree = force_reg (mode, mthree);
44197 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
44199 mhalf = force_reg (mode, mhalf);
44200 if (recip)
44201 /* e3 = -.5 * x0 */
44202 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
44203 else
44204 /* e3 = -.5 * e0 */
44205 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
44206 /* ret = e2 * e3 */
44207 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
44210 #ifdef TARGET_SOLARIS
44211 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
44213 static void
44214 i386_solaris_elf_named_section (const char *name, unsigned int flags,
44215 tree decl)
44217 /* With Binutils 2.15, the "@unwind" marker must be specified on
44218 every occurrence of the ".eh_frame" section, not just the first
44219 one. */
44220 if (TARGET_64BIT
44221 && strcmp (name, ".eh_frame") == 0)
44223 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
44224 flags & SECTION_WRITE ? "aw" : "a");
44225 return;
44228 #ifndef USE_GAS
44229 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44231 solaris_elf_asm_comdat_section (name, flags, decl);
44232 return;
44234 #endif
44236 default_elf_asm_named_section (name, flags, decl);
44238 #endif /* TARGET_SOLARIS */
44240 /* Return the mangling of TYPE if it is an extended fundamental type. */
44242 static const char *
44243 ix86_mangle_type (const_tree type)
44245 type = TYPE_MAIN_VARIANT (type);
44247 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44248 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44249 return NULL;
44251 switch (TYPE_MODE (type))
44253 case TFmode:
44254 /* __float128 is "g". */
44255 return "g";
44256 case XFmode:
44257 /* "long double" or __float80 is "e". */
44258 return "e";
44259 default:
44260 return NULL;
44264 #ifdef TARGET_THREAD_SSP_OFFSET
44265 /* If using TLS guards, don't waste time creating and expanding
44266 __stack_chk_guard decl and MEM as we are going to ignore it. */
44267 static tree
44268 ix86_stack_protect_guard (void)
44270 if (TARGET_SSP_TLS_GUARD)
44271 return NULL_TREE;
44272 return default_stack_protect_guard ();
44274 #endif
44276 /* For 32-bit code we can save PIC register setup by using
44277 __stack_chk_fail_local hidden function instead of calling
44278 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
44279 register, so it is better to call __stack_chk_fail directly. */
44281 static tree ATTRIBUTE_UNUSED
44282 ix86_stack_protect_fail (void)
44284 return TARGET_64BIT
44285 ? default_external_stack_protect_fail ()
44286 : default_hidden_stack_protect_fail ();
44289 /* Select a format to encode pointers in exception handling data. CODE
44290 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44291 true if the symbol may be affected by dynamic relocations.
44293 ??? All x86 object file formats are capable of representing this.
44294 After all, the relocation needed is the same as for the call insn.
44295 Whether or not a particular assembler allows us to enter such, I
44296 guess we'll have to see. */
44297 int
44298 asm_preferred_eh_data_format (int code, int global)
44300 if (flag_pic)
44302 int type = DW_EH_PE_sdata8;
44303 if (!TARGET_64BIT
44304 || ix86_cmodel == CM_SMALL_PIC
44305 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44306 type = DW_EH_PE_sdata4;
44307 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44309 if (ix86_cmodel == CM_SMALL
44310 || (ix86_cmodel == CM_MEDIUM && code))
44311 return DW_EH_PE_udata4;
44312 return DW_EH_PE_absptr;
44315 /* Expand copysign from SIGN to the positive value ABS_VALUE
44316 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44317 the sign-bit. */
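/* In effect RESULT = ABS_VALUE | (SIGN & signbit-mask); e.g. transferring
   the sign of -3.5 onto 2.0 yields -2.0.  A caller-supplied MASK is the
   complement of the sign-bit mask, hence the NOT before the AND below. */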
44318 static void
44319 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44321 machine_mode mode = GET_MODE (sign);
44322 rtx sgn = gen_reg_rtx (mode);
44323 if (mask == NULL_RTX)
44325 machine_mode vmode;
44327 if (mode == SFmode)
44328 vmode = V4SFmode;
44329 else if (mode == DFmode)
44330 vmode = V2DFmode;
44331 else
44332 vmode = mode;
44334 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44335 if (!VECTOR_MODE_P (mode))
44337 /* We need to generate a scalar mode mask in this case. */
44338 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44339 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44340 mask = gen_reg_rtx (mode);
44341 emit_insn (gen_rtx_SET (mask, tmp));
44344 else
44345 mask = gen_rtx_NOT (mode, mask);
44346 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44347 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44350 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44351 mask for masking out the sign-bit is stored in *SMASK, if that is
44352 non-null. */
44353 static rtx
44354 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44356 machine_mode vmode, mode = GET_MODE (op0);
44357 rtx xa, mask;
44359 xa = gen_reg_rtx (mode);
44360 if (mode == SFmode)
44361 vmode = V4SFmode;
44362 else if (mode == DFmode)
44363 vmode = V2DFmode;
44364 else
44365 vmode = mode;
44366 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44367 if (!VECTOR_MODE_P (mode))
44369 /* We need to generate a scalar mode mask in this case. */
44370 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44371 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44372 mask = gen_reg_rtx (mode);
44373 emit_insn (gen_rtx_SET (mask, tmp));
44375 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44377 if (smask)
44378 *smask = mask;
44380 return xa;
44383 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44384 swapping the operands if SWAP_OPERANDS is true. The expanded
44385 code is a forward jump to a newly created label in case the
44386 comparison is true. The generated label rtx is returned. */
44387 static rtx_code_label *
44388 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44389 bool swap_operands)
44391 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
44392 rtx_code_label *label;
44393 rtx tmp;
44395 if (swap_operands)
44396 std::swap (op0, op1);
44398 label = gen_label_rtx ();
44399 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
44400 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
44401 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
44402 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44403 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44404 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44405 JUMP_LABEL (tmp) = label;
44407 return label;
44410 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44411 using comparison code CODE. Operands are swapped for the comparison if
44412 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44413 static rtx
44414 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44415 bool swap_operands)
44417 rtx (*insn)(rtx, rtx, rtx, rtx);
44418 machine_mode mode = GET_MODE (op0);
44419 rtx mask = gen_reg_rtx (mode);
44421 if (swap_operands)
44422 std::swap (op0, op1);
44424 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44426 emit_insn (insn (mask, op0, op1,
44427 gen_rtx_fmt_ee (code, mode, op0, op1)));
44428 return mask;
44431 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44432 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44433 static rtx
44434 ix86_gen_TWO52 (machine_mode mode)
44436 REAL_VALUE_TYPE TWO52r;
44437 rtx TWO52;
44439 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44440 TWO52 = const_double_from_real_value (TWO52r, mode);
44441 TWO52 = force_reg (mode, TWO52);
44443 return TWO52;
44446 /* Expand SSE sequence for computing lround from OP1 storing
44447 into OP0. */
44448 void
44449 ix86_expand_lround (rtx op0, rtx op1)
44451 /* C code for the stuff we're doing below:
44452 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44453 return (long)tmp;
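   Using nextafter (0.5, 0.0) instead of 0.5 keeps values just below 0.5
   from rounding up: for the largest double below 0.5, adding exactly 0.5
   rounds to 1.0 under round-to-nearest-even, while adding pred(0.5)
   yields a value strictly below 1.0 that truncates to 0.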
44455 machine_mode mode = GET_MODE (op1);
44456 const struct real_format *fmt;
44457 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44458 rtx adj;
44460 /* load nextafter (0.5, 0.0) */
44461 fmt = REAL_MODE_FORMAT (mode);
44462 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44463 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44465 /* adj = copysign (0.5, op1) */
44466 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44467 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44469 /* adj = op1 + adj */
44470 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44472 /* op0 = (imode)adj */
44473 expand_fix (op0, adj, 0);
44476 /* Expand SSE2 sequence for computing lround from OPERAND1 storing
44477 into OPERAND0. */
44478 void
44479 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44481 /* C code for the stuff we're doing below (for do_floor):
44482 xi = (long)op1;
44483 xi -= (double)xi > op1 ? 1 : 0;
44484 return xi;
44486 machine_mode fmode = GET_MODE (op1);
44487 machine_mode imode = GET_MODE (op0);
44488 rtx ireg, freg, tmp;
44489 rtx_code_label *label;
44491 /* reg = (long)op1 */
44492 ireg = gen_reg_rtx (imode);
44493 expand_fix (ireg, op1, 0);
44495 /* freg = (double)reg */
44496 freg = gen_reg_rtx (fmode);
44497 expand_float (freg, ireg, 0);
44499 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44500 label = ix86_expand_sse_compare_and_jump (UNLE,
44501 freg, op1, !do_floor);
44502 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44503 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44504 emit_move_insn (ireg, tmp);
44506 emit_label (label);
44507 LABEL_NUSES (label) = 1;
44509 emit_move_insn (op0, ireg);
44512 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
44513 result in OPERAND0. */
44514 void
44515 ix86_expand_rint (rtx operand0, rtx operand1)
44517 /* C code for the stuff we're doing below:
44518 xa = fabs (operand1);
44519 if (!isless (xa, 2**52))
44520 return operand1;
44521 xa = xa + 2**52 - 2**52;
44522 return copysign (xa, operand1);
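   The xa + 2**52 - 2**52 trick (2**23 for SFmode) works because for
   0 <= xa < 2**52 the sum has a unit in the last place of 1.0, so the
   addition itself rounds xa to an integer in the current rounding mode
   and the subtraction then recovers that integer exactly.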
44524 machine_mode mode = GET_MODE (operand0);
44525 rtx res, xa, TWO52, mask;
44526 rtx_code_label *label;
44528 res = gen_reg_rtx (mode);
44529 emit_move_insn (res, operand1);
44531 /* xa = abs (operand1) */
44532 xa = ix86_expand_sse_fabs (res, &mask);
44534 /* if (!isless (xa, TWO52)) goto label; */
44535 TWO52 = ix86_gen_TWO52 (mode);
44536 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44538 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44539 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44541 ix86_sse_copysign_to_positive (res, xa, res, mask);
44543 emit_label (label);
44544 LABEL_NUSES (label) = 1;
44546 emit_move_insn (operand0, res);
44549 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44550 into OPERAND0. */
44551 void
44552 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44554 /* C code for the stuff we expand below.
44555 double xa = fabs (x), x2;
44556 if (!isless (xa, TWO52))
44557 return x;
44558 xa = xa + TWO52 - TWO52;
44559 x2 = copysign (xa, x);
44560 Compensate. Floor:
44561 if (x2 > x)
44562 x2 -= 1;
44563 Compensate. Ceil:
44564 if (x2 < x)
44565 x2 -= -1;
44566 return x2;
44568 machine_mode mode = GET_MODE (operand0);
44569 rtx xa, TWO52, tmp, one, res, mask;
44570 rtx_code_label *label;
44572 TWO52 = ix86_gen_TWO52 (mode);
44574 /* Temporary for holding the result, initialized to the input
44575 operand to ease control flow. */
44576 res = gen_reg_rtx (mode);
44577 emit_move_insn (res, operand1);
44579 /* xa = abs (operand1) */
44580 xa = ix86_expand_sse_fabs (res, &mask);
44582 /* if (!isless (xa, TWO52)) goto label; */
44583 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44585 /* xa = xa + TWO52 - TWO52; */
44586 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44587 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44589 /* xa = copysign (xa, operand1) */
44590 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44592 /* generate 1.0 or -1.0 */
44593 one = force_reg (mode,
44594 const_double_from_real_value (do_floor
44595 ? dconst1 : dconstm1, mode));
44597 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44598 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44599 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44600 /* We always need to subtract here to preserve signed zero. */
44601 tmp = expand_simple_binop (mode, MINUS,
44602 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44603 emit_move_insn (res, tmp);
44605 emit_label (label);
44606 LABEL_NUSES (label) = 1;
44608 emit_move_insn (operand0, res);
44611 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44612 into OPERAND0. */
44613 void
44614 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44616 /* C code for the stuff we expand below.
44617 double xa = fabs (x), x2;
44618 if (!isless (xa, TWO52))
44619 return x;
44620 x2 = (double)(long)x;
44621 Compensate. Floor:
44622 if (x2 > x)
44623 x2 -= 1;
44624 Compensate. Ceil:
44625 if (x2 < x)
44626 x2 += 1;
44627 if (HONOR_SIGNED_ZEROS (mode))
44628 return copysign (x2, x);
44629 return x2;
44631 machine_mode mode = GET_MODE (operand0);
44632 rtx xa, xi, TWO52, tmp, one, res, mask;
44633 rtx_code_label *label;
44635 TWO52 = ix86_gen_TWO52 (mode);
44637 /* Temporary for holding the result, initialized to the input
44638 operand to ease control flow. */
44639 res = gen_reg_rtx (mode);
44640 emit_move_insn (res, operand1);
44642 /* xa = abs (operand1) */
44643 xa = ix86_expand_sse_fabs (res, &mask);
44645 /* if (!isless (xa, TWO52)) goto label; */
44646 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44648 /* xa = (double)(long)x */
44649 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44650 expand_fix (xi, res, 0);
44651 expand_float (xa, xi, 0);
44653 /* generate 1.0 */
44654 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44656 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44657 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44658 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44659 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44660 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44661 emit_move_insn (res, tmp);
44663 if (HONOR_SIGNED_ZEROS (mode))
44664 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44666 emit_label (label);
44667 LABEL_NUSES (label) = 1;
44669 emit_move_insn (operand0, res);
44672 /* Expand SSE sequence for computing round from OPERAND1 storing
44673 into OPERAND0. Sequence that works without relying on DImode truncation
44674 via cvttsd2siq that is only available on 64bit targets. */
44675 void
44676 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44678 /* C code for the stuff we expand below.
44679 double xa = fabs (x), xa2, x2;
44680 if (!isless (xa, TWO52))
44681 return x;
44682 Using the absolute value and copying back sign makes
44683 -0.0 -> -0.0 correct.
44684 xa2 = xa + TWO52 - TWO52;
44685 Compensate.
44686 dxa = xa2 - xa;
44687 if (dxa <= -0.5)
44688 xa2 += 1;
44689 else if (dxa > 0.5)
44690 xa2 -= 1;
44691 x2 = copysign (xa2, x);
44692 return x2;
44694 machine_mode mode = GET_MODE (operand0);
44695 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44696 rtx_code_label *label;
44698 TWO52 = ix86_gen_TWO52 (mode);
44700 /* Temporary for holding the result, initialized to the input
44701 operand to ease control flow. */
44702 res = gen_reg_rtx (mode);
44703 emit_move_insn (res, operand1);
44705 /* xa = abs (operand1) */
44706 xa = ix86_expand_sse_fabs (res, &mask);
44708 /* if (!isless (xa, TWO52)) goto label; */
44709 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44711 /* xa2 = xa + TWO52 - TWO52; */
44712 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44713 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44715 /* dxa = xa2 - xa; */
44716 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44718 /* generate 0.5, 1.0 and -0.5 */
44719 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44720 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44721 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44722 0, OPTAB_DIRECT);
44724 /* Compensate. */
44725 tmp = gen_reg_rtx (mode);
44726 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44727 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44728 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44729 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44730 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44731 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44732 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44733 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44735 /* res = copysign (xa2, operand1) */
44736 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44738 emit_label (label);
44739 LABEL_NUSES (label) = 1;
44741 emit_move_insn (operand0, res);
44744 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44745 into OPERAND0. */
44746 void
44747 ix86_expand_trunc (rtx operand0, rtx operand1)
44749 /* C code for SSE variant we expand below.
44750 double xa = fabs (x), x2;
44751 if (!isless (xa, TWO52))
44752 return x;
44753 x2 = (double)(long)x;
44754 if (HONOR_SIGNED_ZEROS (mode))
44755 return copysign (x2, x);
44756 return x2;
44758 machine_mode mode = GET_MODE (operand0);
44759 rtx xa, xi, TWO52, res, mask;
44760 rtx_code_label *label;
44762 TWO52 = ix86_gen_TWO52 (mode);
44764 /* Temporary for holding the result, initialized to the input
44765 operand to ease control flow. */
44766 res = gen_reg_rtx (mode);
44767 emit_move_insn (res, operand1);
44769 /* xa = abs (operand1) */
44770 xa = ix86_expand_sse_fabs (res, &mask);
44772 /* if (!isless (xa, TWO52)) goto label; */
44773 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44775 /* x = (double)(long)x */
44776 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44777 expand_fix (xi, res, 0);
44778 expand_float (res, xi, 0);
44780 if (HONOR_SIGNED_ZEROS (mode))
44781 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44783 emit_label (label);
44784 LABEL_NUSES (label) = 1;
44786 emit_move_insn (operand0, res);
44789 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44790 into OPERAND0. */
44791 void
44792 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44794 machine_mode mode = GET_MODE (operand0);
44795 rtx xa, mask, TWO52, one, res, smask, tmp;
44796 rtx_code_label *label;
44798 /* C code for SSE variant we expand below.
44799 double xa = fabs (x), x2;
44800 if (!isless (xa, TWO52))
44801 return x;
44802 xa2 = xa + TWO52 - TWO52;
44803 Compensate:
44804 if (xa2 > xa)
44805 xa2 -= 1.0;
44806 x2 = copysign (xa2, x);
44807 return x2;
44810 TWO52 = ix86_gen_TWO52 (mode);
44812 /* Temporary for holding the result, initialized to the input
44813 operand to ease control flow. */
44814 res = gen_reg_rtx (mode);
44815 emit_move_insn (res, operand1);
44817 /* xa = abs (operand1) */
44818 xa = ix86_expand_sse_fabs (res, &smask);
44820 /* if (!isless (xa, TWO52)) goto label; */
44821 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44823 /* res = xa + TWO52 - TWO52; */
44824 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44825 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44826 emit_move_insn (res, tmp);
44828 /* generate 1.0 */
44829 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44831 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44832 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44833 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44834 tmp = expand_simple_binop (mode, MINUS,
44835 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44836 emit_move_insn (res, tmp);
44838 /* res = copysign (res, operand1) */
44839 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44841 emit_label (label);
44842 LABEL_NUSES (label) = 1;
44844 emit_move_insn (operand0, res);
44847 /* Expand SSE sequence for computing round from OPERAND1 storing
44848 into OPERAND0. */
44849 void
44850 ix86_expand_round (rtx operand0, rtx operand1)
44852 /* C code for the stuff we're doing below:
44853 double xa = fabs (x);
44854 if (!isless (xa, TWO52))
44855 return x;
44856 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44857 return copysign (xa, x);
44859 machine_mode mode = GET_MODE (operand0);
44860 rtx res, TWO52, xa, xi, half, mask;
44861 rtx_code_label *label;
44862 const struct real_format *fmt;
44863 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44865 /* Temporary for holding the result, initialized to the input
44866 operand to ease control flow. */
44867 res = gen_reg_rtx (mode);
44868 emit_move_insn (res, operand1);
44870 TWO52 = ix86_gen_TWO52 (mode);
44871 xa = ix86_expand_sse_fabs (res, &mask);
44872 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44874 /* load nextafter (0.5, 0.0) */
44875 fmt = REAL_MODE_FORMAT (mode);
44876 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44877 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44879 /* xa = xa + 0.5 */
44880 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44881 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44883 /* xa = (double)(int64_t)xa */
44884 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44885 expand_fix (xi, xa, 0);
44886 expand_float (xa, xi, 0);
44888 /* res = copysign (xa, operand1) */
44889 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44891 emit_label (label);
44892 LABEL_NUSES (label) = 1;
44894 emit_move_insn (operand0, res);
44897 /* Expand SSE sequence for computing round
44898 from OP1 storing into OP0 using sse4 round insn. */
44899 void
44900 ix86_expand_round_sse4 (rtx op0, rtx op1)
44902 machine_mode mode = GET_MODE (op0);
44903 rtx e1, e2, res, half;
44904 const struct real_format *fmt;
44905 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44906 rtx (*gen_copysign) (rtx, rtx, rtx);
44907 rtx (*gen_round) (rtx, rtx, rtx);
44909 switch (mode)
44911 case SFmode:
44912 gen_copysign = gen_copysignsf3;
44913 gen_round = gen_sse4_1_roundsf2;
44914 break;
44915 case DFmode:
44916 gen_copysign = gen_copysigndf3;
44917 gen_round = gen_sse4_1_rounddf2;
44918 break;
44919 default:
44920 gcc_unreachable ();
44923 /* round (a) = trunc (a + copysign (0.5, a)) */
44925 /* load nextafter (0.5, 0.0) */
44926 fmt = REAL_MODE_FORMAT (mode);
44927 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44928 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44929 half = const_double_from_real_value (pred_half, mode);
44931 /* e1 = copysign (0.5, op1) */
44932 e1 = gen_reg_rtx (mode);
44933 emit_insn (gen_copysign (e1, half, op1));
44935 /* e2 = op1 + e1 */
44936 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44938 /* res = trunc (e2) */
44939 res = gen_reg_rtx (mode);
44940 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44942 emit_move_insn (op0, res);
44946 /* Table of valid machine attributes. */
44947 static const struct attribute_spec ix86_attribute_table[] =
44949 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44950 affects_type_identity } */
44951 /* Stdcall attribute says callee is responsible for popping arguments
44952 if they are not variable. */
44953 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44954 true },
44955 /* Fastcall attribute says callee is responsible for popping arguments
44956 if they are not variable. */
44957 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44958 true },
44959 /* Thiscall attribute says callee is responsible for popping arguments
44960 if they are not variable. */
44961 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44962 true },
44963 /* Cdecl attribute says the callee is a normal C declaration */
44964 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44965 true },
44966 /* Regparm attribute specifies how many integer arguments are to be
44967 passed in registers. */
44968 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44969 true },
44970 /* Sseregparm attribute says we are using x86_64 calling conventions
44971 for FP arguments. */
44972 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44973 true },
44974 /* The transactional memory builtins are implicitly regparm or fastcall
44975 depending on the ABI. Override the generic do-nothing attribute that
44976 these builtins were declared with. */
44977 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44978 true },
44979 /* force_align_arg_pointer says this function realigns the stack at entry. */
44980 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44981 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
44982 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44983 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
44984 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
44985 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44986 false },
44987 #endif
44988 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44989 false },
44990 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44991 false },
44992 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44993 SUBTARGET_ATTRIBUTE_TABLE,
44994 #endif
44995 /* ms_abi and sysv_abi calling convention function attributes. */
44996 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44997 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44998 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
44999 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
45000 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
45001 false },
45002 { "callee_pop_aggregate_return", 1, 1, false, true, true,
45003 ix86_handle_callee_pop_aggregate_return, true },
45004 { "interrupt", 0, 0, false, true, true,
45005 ix86_handle_interrupt_attribute, false },
45006 { "no_caller_saved_registers", 0, 0, false, true, true,
45007 ix86_handle_no_caller_saved_registers_attribute, false },
45009 /* End element. */
45010 { NULL, 0, 0, false, false, false, NULL, false }
45013 /* Implement targetm.vectorize.builtin_vectorization_cost. */
45014 static int
45015 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
45016 tree vectype, int)
45018 switch (type_of_cost)
45020 case scalar_stmt:
45021 return ix86_cost->scalar_stmt_cost;
45023 case scalar_load:
45024 return ix86_cost->scalar_load_cost;
45026 case scalar_store:
45027 return ix86_cost->scalar_store_cost;
45029 case vector_stmt:
45030 return ix86_cost->vec_stmt_cost;
45032 case vector_load:
45033 return ix86_cost->vec_align_load_cost;
45035 case vector_store:
45036 return ix86_cost->vec_store_cost;
45038 case vec_to_scalar:
45039 return ix86_cost->vec_to_scalar_cost;
45041 case scalar_to_vec:
45042 return ix86_cost->scalar_to_vec_cost;
45044 case unaligned_load:
45045 case unaligned_store:
45046 return ix86_cost->vec_unalign_load_cost;
45048 case cond_branch_taken:
45049 return ix86_cost->cond_taken_branch_cost;
45051 case cond_branch_not_taken:
45052 return ix86_cost->cond_not_taken_branch_cost;
45054 case vec_perm:
45055 case vec_promote_demote:
45056 return ix86_cost->vec_stmt_cost;
45058 case vec_construct:
45059 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
45061 default:
45062 gcc_unreachable ();
45066 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
45067 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
45068 insn every time. */
45070 static GTY(()) rtx_insn *vselect_insn;
45072 /* Initialize vselect_insn. */
45074 static void
45075 init_vselect_insn (void)
45077 unsigned i;
45078 rtx x;
45080 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
45081 for (i = 0; i < MAX_VECT_LEN; ++i)
45082 XVECEXP (x, 0, i) = const0_rtx;
45083 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
45084 const0_rtx), x);
45085 x = gen_rtx_SET (const0_rtx, x);
45086 start_sequence ();
45087 vselect_insn = emit_insn (x);
45088 end_sequence ();
45091 /* Construct (set target (vec_select op0 (parallel perm))) and
45092 return true if that's a valid instruction in the active ISA. */
45094 static bool
45095 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
45096 unsigned nelt, bool testing_p)
45098 unsigned int i;
45099 rtx x, save_vconcat;
45100 int icode;
45102 if (vselect_insn == NULL_RTX)
45103 init_vselect_insn ();
45105 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
45106 PUT_NUM_ELEM (XVEC (x, 0), nelt);
45107 for (i = 0; i < nelt; ++i)
45108 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
45109 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45110 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
45111 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
45112 SET_DEST (PATTERN (vselect_insn)) = target;
45113 icode = recog_memoized (vselect_insn);
45115 if (icode >= 0 && !testing_p)
45116 emit_insn (copy_rtx (PATTERN (vselect_insn)));
45118 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
45119 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
45120 INSN_CODE (vselect_insn) = -1;
45122 return icode >= 0;
45125 /* Similar, but generate a vec_concat from op0 and op1 as well. */
45127 static bool
45128 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
45129 const unsigned char *perm, unsigned nelt,
45130 bool testing_p)
45132 machine_mode v2mode;
45133 rtx x;
45134 bool ok;
45136 if (vselect_insn == NULL_RTX)
45137 init_vselect_insn ();
45139 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
45140 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45141 PUT_MODE (x, v2mode);
45142 XEXP (x, 0) = op0;
45143 XEXP (x, 1) = op1;
45144 ok = expand_vselect (target, x, perm, nelt, testing_p);
45145 XEXP (x, 0) = const0_rtx;
45146 XEXP (x, 1) = const0_rtx;
45147 return ok;
45150 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45151 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
45153 static bool
45154 expand_vec_perm_blend (struct expand_vec_perm_d *d)
45156 machine_mode mmode, vmode = d->vmode;
45157 unsigned i, mask, nelt = d->nelt;
45158 rtx target, op0, op1, maskop, x;
45159 rtx rperm[32], vperm;
45161 if (d->one_operand_p)
45162 return false;
45163 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
45164 && (TARGET_AVX512BW
45165 || GET_MODE_UNIT_SIZE (vmode) >= 4))
45167 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45169 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45171 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45173 else
45174 return false;
45176 /* This is a blend, not a permute. Elements must stay in their
45177 respective lanes. */
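/* E.g. a V4SFmode permutation { 0, 5, 2, 7 } takes elements 0 and 2 from
   op0 and elements 1 and 3 from op1, giving mask 0xa (binary 1010) below
   and a single blendps. */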
45178 for (i = 0; i < nelt; ++i)
45180 unsigned e = d->perm[i];
45181 if (!(e == i || e == i + nelt))
45182 return false;
45185 if (d->testing_p)
45186 return true;
45188 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45189 decision should be extracted elsewhere, so that we only try that
45190 sequence once all budget==3 options have been tried. */
45191 target = d->target;
45192 op0 = d->op0;
45193 op1 = d->op1;
45194 mask = 0;
45196 switch (vmode)
45198 case V8DFmode:
45199 case V16SFmode:
45200 case V4DFmode:
45201 case V8SFmode:
45202 case V2DFmode:
45203 case V4SFmode:
45204 case V8HImode:
45205 case V8SImode:
45206 case V32HImode:
45207 case V64QImode:
45208 case V16SImode:
45209 case V8DImode:
45210 for (i = 0; i < nelt; ++i)
45211 mask |= (d->perm[i] >= nelt) << i;
45212 break;
45214 case V2DImode:
45215 for (i = 0; i < 2; ++i)
45216 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45217 vmode = V8HImode;
45218 goto do_subreg;
45220 case V4SImode:
45221 for (i = 0; i < 4; ++i)
45222 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45223 vmode = V8HImode;
45224 goto do_subreg;
45226 case V16QImode:
45227 /* See if bytes move in pairs so we can use pblendw with
45228 an immediate argument, rather than pblendvb with a vector
45229 argument. */
45230 for (i = 0; i < 16; i += 2)
45231 if (d->perm[i] + 1 != d->perm[i + 1])
45233 use_pblendvb:
45234 for (i = 0; i < nelt; ++i)
45235 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45237 finish_pblendvb:
45238 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45239 vperm = force_reg (vmode, vperm);
45241 if (GET_MODE_SIZE (vmode) == 16)
45242 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45243 else
45244 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45245 if (target != d->target)
45246 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45247 return true;
45250 for (i = 0; i < 8; ++i)
45251 mask |= (d->perm[i * 2] >= 16) << i;
45252 vmode = V8HImode;
45253 /* FALLTHRU */
45255 do_subreg:
45256 target = gen_reg_rtx (vmode);
45257 op0 = gen_lowpart (vmode, op0);
45258 op1 = gen_lowpart (vmode, op1);
45259 break;
45261 case V32QImode:
45262 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45263 for (i = 0; i < 32; i += 2)
45264 if (d->perm[i] + 1 != d->perm[i + 1])
45265 goto use_pblendvb;
45266 /* See if bytes move in quadruplets. If yes, vpblendd
45267 with immediate can be used. */
45268 for (i = 0; i < 32; i += 4)
45269 if (d->perm[i] + 2 != d->perm[i + 2])
45270 break;
45271 if (i < 32)
45273 /* See if bytes move the same in both lanes. If yes,
45274 vpblendw with immediate can be used. */
45275 for (i = 0; i < 16; i += 2)
45276 if (d->perm[i] + 16 != d->perm[i + 16])
45277 goto use_pblendvb;
45279 /* Use vpblendw. */
45280 for (i = 0; i < 16; ++i)
45281 mask |= (d->perm[i * 2] >= 32) << i;
45282 vmode = V16HImode;
45283 goto do_subreg;
45286 /* Use vpblendd. */
45287 for (i = 0; i < 8; ++i)
45288 mask |= (d->perm[i * 4] >= 32) << i;
45289 vmode = V8SImode;
45290 goto do_subreg;
45292 case V16HImode:
45293 /* See if words move in pairs. If yes, vpblendd can be used. */
45294 for (i = 0; i < 16; i += 2)
45295 if (d->perm[i] + 1 != d->perm[i + 1])
45296 break;
45297 if (i < 16)
45299 /* See if words move the same in both lanes. If not,
45300 vpblendvb must be used. */
45301 for (i = 0; i < 8; i++)
45302 if (d->perm[i] + 8 != d->perm[i + 8])
45304 /* Use vpblendvb. */
45305 for (i = 0; i < 32; ++i)
45306 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45308 vmode = V32QImode;
45309 nelt = 32;
45310 target = gen_reg_rtx (vmode);
45311 op0 = gen_lowpart (vmode, op0);
45312 op1 = gen_lowpart (vmode, op1);
45313 goto finish_pblendvb;
45316 /* Use vpblendw. */
45317 for (i = 0; i < 16; ++i)
45318 mask |= (d->perm[i] >= 16) << i;
45319 break;
45322 /* Use vpblendd. */
45323 for (i = 0; i < 8; ++i)
45324 mask |= (d->perm[i * 2] >= 16) << i;
45325 vmode = V8SImode;
45326 goto do_subreg;
45328 case V4DImode:
45329 /* Use vpblendd. */
45330 for (i = 0; i < 4; ++i)
45331 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45332 vmode = V8SImode;
45333 goto do_subreg;
45335 default:
45336 gcc_unreachable ();
45339 switch (vmode)
45341 case V8DFmode:
45342 case V8DImode:
45343 mmode = QImode;
45344 break;
45345 case V16SFmode:
45346 case V16SImode:
45347 mmode = HImode;
45348 break;
45349 case V32HImode:
45350 mmode = SImode;
45351 break;
45352 case V64QImode:
45353 mmode = DImode;
45354 break;
45355 default:
45356 mmode = VOIDmode;
45359 if (mmode != VOIDmode)
45360 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45361 else
45362 maskop = GEN_INT (mask);
45364 /* This matches five different patterns with the different modes. */
45365 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45366 x = gen_rtx_SET (target, x);
45367 emit_insn (x);
45368 if (target != d->target)
45369 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45371 return true;
45374 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45375 in terms of the variable form of vpermilps.
45377 Note that we will have already failed the immediate input vpermilps,
45378 which requires that the high and low part shuffle be identical; the
45379 variable form doesn't require that. */
45381 static bool
45382 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45384 rtx rperm[8], vperm;
45385 unsigned i;
45387 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45388 return false;
45390 /* We can only permute within the 128-bit lane. */
45391 for (i = 0; i < 8; ++i)
45393 unsigned e = d->perm[i];
45394 if (i < 4 ? e >= 4 : e < 4)
45395 return false;
45398 if (d->testing_p)
45399 return true;
45401 for (i = 0; i < 8; ++i)
45403 unsigned e = d->perm[i];
45405 /* Within each 128-bit lane, the elements of op0 are numbered
45406 from 0 and the elements of op1 are numbered from 4. */
45407 if (e >= 8 + 4)
45408 e -= 8;
45409 else if (e >= 4)
45410 e -= 4;
45412 rperm[i] = GEN_INT (e);
45415 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45416 vperm = force_reg (V8SImode, vperm);
45417 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45419 return true;
45422 /* Return true if permutation D can be performed as VMODE permutation
45423 instead. */
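/* E.g. a V16QImode permutation that moves bytes in aligned groups of four,
   such as { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }, can instead be
   done as the V4SImode permutation { 1, 0, 3, 2 }. */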
45425 static bool
45426 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45428 unsigned int i, j, chunk;
45430 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45431 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45432 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45433 return false;
45435 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45436 return true;
45438 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45439 for (i = 0; i < d->nelt; i += chunk)
45440 if (d->perm[i] & (chunk - 1))
45441 return false;
45442 else
45443 for (j = 1; j < chunk; ++j)
45444 if (d->perm[i] + j != d->perm[i + j])
45445 return false;
45447 return true;
45450 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45451 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45453 static bool
45454 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45456 unsigned i, nelt, eltsz, mask;
45457 unsigned char perm[64];
45458 machine_mode vmode = V16QImode;
45459 rtx rperm[64], vperm, target, op0, op1;
45461 nelt = d->nelt;
45463 if (!d->one_operand_p)
45465 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45467 if (TARGET_AVX2
45468 && valid_perm_using_mode_p (V2TImode, d))
45470 if (d->testing_p)
45471 return true;
45473 /* Use vperm2i128 insn. The pattern uses
45474 V4DImode instead of V2TImode. */
45475 target = d->target;
45476 if (d->vmode != V4DImode)
45477 target = gen_reg_rtx (V4DImode);
45478 op0 = gen_lowpart (V4DImode, d->op0);
45479 op1 = gen_lowpart (V4DImode, d->op1);
45480 rperm[0]
45481 = GEN_INT ((d->perm[0] / (nelt / 2))
45482 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45483 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45484 if (target != d->target)
45485 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45486 return true;
45488 return false;
45491 else
45493 if (GET_MODE_SIZE (d->vmode) == 16)
45495 if (!TARGET_SSSE3)
45496 return false;
45498 else if (GET_MODE_SIZE (d->vmode) == 32)
45500 if (!TARGET_AVX2)
45501 return false;
45503 /* V4DImode should already be handled through
45504 expand_vselect by the vpermq instruction. */
45505 gcc_assert (d->vmode != V4DImode);
45507 vmode = V32QImode;
45508 if (d->vmode == V8SImode
45509 || d->vmode == V16HImode
45510 || d->vmode == V32QImode)
45512 /* First see if vpermq can be used for
45513 V8SImode/V16HImode/V32QImode. */
45514 if (valid_perm_using_mode_p (V4DImode, d))
45516 for (i = 0; i < 4; i++)
45517 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45518 if (d->testing_p)
45519 return true;
45520 target = gen_reg_rtx (V4DImode);
45521 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45522 perm, 4, false))
45524 emit_move_insn (d->target,
45525 gen_lowpart (d->vmode, target));
45526 return true;
45528 return false;
45531 /* Next see if vpermd can be used. */
45532 if (valid_perm_using_mode_p (V8SImode, d))
45533 vmode = V8SImode;
45535 /* Or if vpermps can be used. */
45536 else if (d->vmode == V8SFmode)
45537 vmode = V8SImode;
45539 if (vmode == V32QImode)
45541 /* vpshufb only works within lanes; it is not
45542 possible to shuffle bytes between the lanes. */
45543 for (i = 0; i < nelt; ++i)
45544 if ((d->perm[i] ^ i) & (nelt / 2))
45545 return false;
45548 else if (GET_MODE_SIZE (d->vmode) == 64)
45550 if (!TARGET_AVX512BW)
45551 return false;
45553 /* If vpermq didn't work, vpshufb won't work either. */
45554 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45555 return false;
45557 vmode = V64QImode;
45558 if (d->vmode == V16SImode
45559 || d->vmode == V32HImode
45560 || d->vmode == V64QImode)
45562 /* First see if vpermq can be used for
45563 V16SImode/V32HImode/V64QImode. */
45564 if (valid_perm_using_mode_p (V8DImode, d))
45566 for (i = 0; i < 8; i++)
45567 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45568 if (d->testing_p)
45569 return true;
45570 target = gen_reg_rtx (V8DImode);
45571 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45572 perm, 8, false))
45574 emit_move_insn (d->target,
45575 gen_lowpart (d->vmode, target));
45576 return true;
45578 return false;
45581 /* Next see if vpermd can be used. */
45582 if (valid_perm_using_mode_p (V16SImode, d))
45583 vmode = V16SImode;
45585 /* Or if vpermps can be used. */
45586 else if (d->vmode == V16SFmode)
45587 vmode = V16SImode;
45588 if (vmode == V64QImode)
45590 /* vpshufb only works within lanes; it is not
45591 possible to shuffle bytes between the lanes. */
45592 for (i = 0; i < nelt; ++i)
45593 if ((d->perm[i] ^ i) & (nelt / 4))
45594 return false;
45597 else
45598 return false;
45601 if (d->testing_p)
45602 return true;
45604 if (vmode == V8SImode)
45605 for (i = 0; i < 8; ++i)
45606 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45607 else if (vmode == V16SImode)
45608 for (i = 0; i < 16; ++i)
45609 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45610 else
45612 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45613 if (!d->one_operand_p)
45614 mask = 2 * nelt - 1;
45615 else if (vmode == V16QImode)
45616 mask = nelt - 1;
45617 else if (vmode == V64QImode)
45618 mask = nelt / 4 - 1;
45619 else
45620 mask = nelt / 2 - 1;
45622 for (i = 0; i < nelt; ++i)
45624 unsigned j, e = d->perm[i] & mask;
45625 for (j = 0; j < eltsz; ++j)
45626 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45630 vperm = gen_rtx_CONST_VECTOR (vmode,
45631 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45632 vperm = force_reg (vmode, vperm);
45634 target = d->target;
45635 if (d->vmode != vmode)
45636 target = gen_reg_rtx (vmode);
45637 op0 = gen_lowpart (vmode, d->op0);
45638 if (d->one_operand_p)
45640 if (vmode == V16QImode)
45641 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45642 else if (vmode == V32QImode)
45643 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45644 else if (vmode == V64QImode)
45645 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45646 else if (vmode == V8SFmode)
45647 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45648 else if (vmode == V8SImode)
45649 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45650 else if (vmode == V16SFmode)
45651 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45652 else if (vmode == V16SImode)
45653 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45654 else
45655 gcc_unreachable ();
45657 else
45659 op1 = gen_lowpart (vmode, d->op1);
45660 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45662 if (target != d->target)
45663 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45665 return true;
45668 /* For V*[QHS]Imode permutations, check whether the same permutation
45669 can instead be performed in a 2x, 4x or 8x wider inner mode. */
45671 static bool
45672 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45673 struct expand_vec_perm_d *nd)
45675 int i;
45676 enum machine_mode mode = VOIDmode;
45678 switch (d->vmode)
45680 case V16QImode: mode = V8HImode; break;
45681 case V32QImode: mode = V16HImode; break;
45682 case V64QImode: mode = V32HImode; break;
45683 case V8HImode: mode = V4SImode; break;
45684 case V16HImode: mode = V8SImode; break;
45685 case V32HImode: mode = V16SImode; break;
45686 case V4SImode: mode = V2DImode; break;
45687 case V8SImode: mode = V4DImode; break;
45688 case V16SImode: mode = V8DImode; break;
45689 default: return false;
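/* The wider mode is usable only if every even/odd pair of indices selects
   an aligned pair of adjacent source elements. */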
45691 for (i = 0; i < d->nelt; i += 2)
45692 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45693 return false;
45694 nd->vmode = mode;
45695 nd->nelt = d->nelt / 2;
45696 for (i = 0; i < nd->nelt; i++)
45697 nd->perm[i] = d->perm[2 * i] / 2;
45698 if (GET_MODE_INNER (mode) != DImode)
45699 canonicalize_vector_int_perm (nd, nd);
45700 if (nd != d)
45702 nd->one_operand_p = d->one_operand_p;
45703 nd->testing_p = d->testing_p;
45704 if (d->op0 == d->op1)
45705 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45706 else
45708 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45709 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45711 if (d->testing_p)
45712 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45713 else
45714 nd->target = gen_reg_rtx (nd->vmode);
45716 return true;
45719 /* Try to expand one-operand permutation with constant mask. */
45721 static bool
45722 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45724 machine_mode mode = GET_MODE (d->op0);
45725 machine_mode maskmode = mode;
45726 rtx (*gen) (rtx, rtx, rtx) = NULL;
45727 rtx target, op0, mask;
45728 rtx vec[64];
45730 if (!rtx_equal_p (d->op0, d->op1))
45731 return false;
45733 if (!TARGET_AVX512F)
45734 return false;
45736 switch (mode)
45738 case V16SImode:
45739 gen = gen_avx512f_permvarv16si;
45740 break;
45741 case V16SFmode:
45742 gen = gen_avx512f_permvarv16sf;
45743 maskmode = V16SImode;
45744 break;
45745 case V8DImode:
45746 gen = gen_avx512f_permvarv8di;
45747 break;
45748 case V8DFmode:
45749 gen = gen_avx512f_permvarv8df;
45750 maskmode = V8DImode;
45751 break;
45752 default:
45753 return false;
45756 target = d->target;
45757 op0 = d->op0;
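/* Build the constant index vector and emit a single variable permute
   (vpermd/vpermps/vpermq/vpermpd) with it. */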
45758 for (int i = 0; i < d->nelt; ++i)
45759 vec[i] = GEN_INT (d->perm[i]);
45760 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45761 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45762 return true;
45765 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45766 in a single instruction. */
45768 static bool
45769 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45771 unsigned i, nelt = d->nelt;
45772 struct expand_vec_perm_d nd;
45774 /* Check plain VEC_SELECT first, because AVX has instructions that could
45775 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45776 input where SEL+CONCAT may not. */
45777 if (d->one_operand_p)
45779 int mask = nelt - 1;
45780 bool identity_perm = true;
45781 bool broadcast_perm = true;
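/* Scan the permutation to detect an identity (plain copy) or a broadcast
   of element 0. */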
45783 for (i = 0; i < nelt; i++)
45785 nd.perm[i] = d->perm[i] & mask;
45786 if (nd.perm[i] != i)
45787 identity_perm = false;
45788 if (nd.perm[i])
45789 broadcast_perm = false;
45792 if (identity_perm)
45794 if (!d->testing_p)
45795 emit_move_insn (d->target, d->op0);
45796 return true;
45798 else if (broadcast_perm && TARGET_AVX2)
45800 /* Use vpbroadcast{b,w,d}. */
45801 rtx (*gen) (rtx, rtx) = NULL;
45802 switch (d->vmode)
45804 case V64QImode:
45805 if (TARGET_AVX512BW)
45806 gen = gen_avx512bw_vec_dupv64qi_1;
45807 break;
45808 case V32QImode:
45809 gen = gen_avx2_pbroadcastv32qi_1;
45810 break;
45811 case V32HImode:
45812 if (TARGET_AVX512BW)
45813 gen = gen_avx512bw_vec_dupv32hi_1;
45814 break;
45815 case V16HImode:
45816 gen = gen_avx2_pbroadcastv16hi_1;
45817 break;
45818 case V16SImode:
45819 if (TARGET_AVX512F)
45820 gen = gen_avx512f_vec_dupv16si_1;
45821 break;
45822 case V8SImode:
45823 gen = gen_avx2_pbroadcastv8si_1;
45824 break;
45825 case V16QImode:
45826 gen = gen_avx2_pbroadcastv16qi;
45827 break;
45828 case V8HImode:
45829 gen = gen_avx2_pbroadcastv8hi;
45830 break;
45831 case V16SFmode:
45832 if (TARGET_AVX512F)
45833 gen = gen_avx512f_vec_dupv16sf_1;
45834 break;
45835 case V8SFmode:
45836 gen = gen_avx2_vec_dupv8sf_1;
45837 break;
45838 case V8DFmode:
45839 if (TARGET_AVX512F)
45840 gen = gen_avx512f_vec_dupv8df_1;
45841 break;
45842 case V8DImode:
45843 if (TARGET_AVX512F)
45844 gen = gen_avx512f_vec_dupv8di_1;
45845 break;
45846 /* For other modes, prefer the other shuffles this function creates. */
45847 default: break;
45849 if (gen != NULL)
45851 if (!d->testing_p)
45852 emit_insn (gen (d->target, d->op0));
45853 return true;
45857 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45858 return true;
45860 /* There are plenty of patterns in sse.md that are written for
45861 SEL+CONCAT and are not replicated for a single op. Perhaps
45862 that should be changed, to avoid the nastiness here. */
45864 /* Recognize interleave style patterns, which means incrementing
45865 every other permutation operand. */
45866 for (i = 0; i < nelt; i += 2)
45868 nd.perm[i] = d->perm[i] & mask;
45869 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45871 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45872 d->testing_p))
45873 return true;
45875 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45876 if (nelt >= 4)
45878 for (i = 0; i < nelt; i += 4)
45880 nd.perm[i + 0] = d->perm[i + 0] & mask;
45881 nd.perm[i + 1] = d->perm[i + 1] & mask;
45882 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45883 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45886 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45887 d->testing_p))
45888 return true;
45892 /* Finally, try the fully general two operand permute. */
45893 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45894 d->testing_p))
45895 return true;
45897 /* Recognize interleave style patterns with reversed operands. */
45898 if (!d->one_operand_p)
45900 for (i = 0; i < nelt; ++i)
45902 unsigned e = d->perm[i];
45903 if (e >= nelt)
45904 e -= nelt;
45905 else
45906 e += nelt;
45907 nd.perm[i] = e;
45910 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45911 d->testing_p))
45912 return true;
45915 /* Try the SSE4.1 blend variable merge instructions. */
45916 if (expand_vec_perm_blend (d))
45917 return true;
45919 /* Try one of the AVX vpermil variable permutations. */
45920 if (expand_vec_perm_vpermil (d))
45921 return true;
45923 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45924 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45925 if (expand_vec_perm_pshufb (d))
45926 return true;
45928 /* Try the AVX2 vpalignr instruction. */
45929 if (expand_vec_perm_palignr (d, true))
45930 return true;
45932 /* Try the AVX512F vperm{s,d} instructions. */
45933 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45934 return true;
45936 /* Try the AVX512F vpermi2 instructions. */
45937 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45938 return true;
45940 /* See if we can get the same permutation in a different vector integer
45941 mode. */
45942 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45944 if (!d->testing_p)
45945 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45946 return true;
45948 return false;
45951 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45952 in terms of a pair of pshuflw + pshufhw instructions. */
45954 static bool
45955 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45957 unsigned char perm2[MAX_VECT_LEN];
45958 unsigned i;
45959 bool ok;
45961 if (d->vmode != V8HImode || !d->one_operand_p)
45962 return false;
45964 /* The two permutations only operate in 64-bit lanes. */
45965 for (i = 0; i < 4; ++i)
45966 if (d->perm[i] >= 4)
45967 return false;
45968 for (i = 4; i < 8; ++i)
45969 if (d->perm[i] < 4)
45970 return false;
45972 if (d->testing_p)
45973 return true;
45975 /* Emit the pshuflw. */
45976 memcpy (perm2, d->perm, 4);
45977 for (i = 4; i < 8; ++i)
45978 perm2[i] = i;
45979 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45980 gcc_assert (ok);
45982 /* Emit the pshufhw. */
45983 memcpy (perm2 + 4, d->perm + 4, 4);
45984 for (i = 0; i < 4; ++i)
45985 perm2[i] = i;
45986 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45987 gcc_assert (ok);
45989 return true;
45992 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45993 the permutation using the SSSE3 palignr instruction. This succeeds
45994 when all of the elements in PERM fit within one vector and we merely
45995 need to shift them down so that a single vector permutation has a
45996 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
45997 the vpalignr instruction itself can perform the requested permutation. */
45999 static bool
46000 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
46002 unsigned i, nelt = d->nelt;
46003 unsigned min, max, minswap, maxswap;
46004 bool in_order, ok, swap = false;
46005 rtx shift, target;
46006 struct expand_vec_perm_d dcopy;
46008 /* Even with AVX, palignr only operates on 128-bit vectors;
46009 with AVX2 it operates on both 128-bit lanes. */
46010 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46011 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
46012 return false;
46014 min = 2 * nelt;
46015 max = 0;
46016 minswap = 2 * nelt;
46017 maxswap = 0;
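/* Compute the range of source indices used, both as given and with the two
   operands swapped; palignr is usable only if all indices fall within one
   vector's width starting at the minimum. */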
46018 for (i = 0; i < nelt; ++i)
46020 unsigned e = d->perm[i];
46021 unsigned eswap = d->perm[i] ^ nelt;
46022 if (GET_MODE_SIZE (d->vmode) == 32)
46024 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
46025 eswap = e ^ (nelt / 2);
46027 if (e < min)
46028 min = e;
46029 if (e > max)
46030 max = e;
46031 if (eswap < minswap)
46032 minswap = eswap;
46033 if (eswap > maxswap)
46034 maxswap = eswap;
46036 if (min == 0
46037 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
46039 if (d->one_operand_p
46040 || minswap == 0
46041 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
46042 ? nelt / 2 : nelt))
46043 return false;
46044 swap = true;
46045 min = minswap;
46046 max = maxswap;
46049 /* Given that we have SSSE3, we know we'll be able to implement the
46050 single operand permutation after the palignr with pshufb for
46051 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
46052 first. */
46053 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
46054 return true;
46056 dcopy = *d;
46057 if (swap)
46059 dcopy.op0 = d->op1;
46060 dcopy.op1 = d->op0;
46061 for (i = 0; i < nelt; ++i)
46062 dcopy.perm[i] ^= nelt;
46065 in_order = true;
46066 for (i = 0; i < nelt; ++i)
46068 unsigned e = dcopy.perm[i];
46069 if (GET_MODE_SIZE (d->vmode) == 32
46070 && e >= nelt
46071 && (e & (nelt / 2 - 1)) < min)
46072 e = e - min - (nelt / 2);
46073 else
46074 e = e - min;
46075 if (e != i)
46076 in_order = false;
46077 dcopy.perm[i] = e;
46079 dcopy.one_operand_p = true;
46081 if (single_insn_only_p && !in_order)
46082 return false;
46084 /* For AVX2, test whether we can permute the result in one instruction. */
46085 if (d->testing_p)
46087 if (in_order)
46088 return true;
46089 dcopy.op1 = dcopy.op0;
46090 return expand_vec_perm_1 (&dcopy);
46093 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
46094 if (GET_MODE_SIZE (d->vmode) == 16)
46096 target = gen_reg_rtx (TImode);
46097 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
46098 gen_lowpart (TImode, dcopy.op0), shift));
46100 else
46102 target = gen_reg_rtx (V2TImode);
46103 emit_insn (gen_avx2_palignrv2ti (target,
46104 gen_lowpart (V2TImode, dcopy.op1),
46105 gen_lowpart (V2TImode, dcopy.op0),
46106 shift));
46109 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
46111 /* Test for the degenerate case where the alignment by itself
46112 produces the desired permutation. */
46113 if (in_order)
46115 emit_move_insn (d->target, dcopy.op0);
46116 return true;
46119 ok = expand_vec_perm_1 (&dcopy);
46120 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
46122 return ok;
46125 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46126 the permutation using the SSE4_1 pblendv instruction. Potentially
46127 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
46129 static bool
46130 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
46132 unsigned i, which, nelt = d->nelt;
46133 struct expand_vec_perm_d dcopy, dcopy1;
46134 machine_mode vmode = d->vmode;
46135 bool ok;
46137 /* Use the same checks as in expand_vec_perm_blend. */
46138 if (d->one_operand_p)
46139 return false;
46140 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46142 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46144 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46146 else
46147 return false;
46149 /* Figure out which permutation elements are not in their
46150 respective lanes. */
46151 for (i = 0, which = 0; i < nelt; ++i)
46153 unsigned e = d->perm[i];
46154 if (e != i)
46155 which |= (e < nelt ? 1 : 2);
46157 /* We can pblend the part where elements are not in their
46158 respective lanes only when these elements all come from one
46159 half of the permutation.
46160 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not in their respective
46161 lanes but both are >= 8;
46162 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not in their
46163 respective lanes and 8 >= 8 but 2 is not. */
46164 if (which != 1 && which != 2)
46165 return false;
46166 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
46167 return true;
46169 /* First apply a one-operand permutation to the part whose
46170 elements are not in their respective lanes. */
46171 dcopy = *d;
46172 if (which == 2)
46173 dcopy.op0 = dcopy.op1 = d->op1;
46174 else
46175 dcopy.op0 = dcopy.op1 = d->op0;
46176 if (!d->testing_p)
46177 dcopy.target = gen_reg_rtx (vmode);
46178 dcopy.one_operand_p = true;
46180 for (i = 0; i < nelt; ++i)
46181 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46183 ok = expand_vec_perm_1 (&dcopy);
46184 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46185 return false;
46186 else
46187 gcc_assert (ok);
46188 if (d->testing_p)
46189 return true;
46191 /* Next we put permuted elements into their positions. */
46192 dcopy1 = *d;
46193 if (which == 2)
46194 dcopy1.op1 = dcopy.target;
46195 else
46196 dcopy1.op0 = dcopy.target;
46198 for (i = 0; i < nelt; ++i)
46199 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46201 ok = expand_vec_perm_blend (&dcopy1);
46202 gcc_assert (ok);
46204 return true;
46207 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46209 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46210 a two vector permutation into a single vector permutation by using
46211 an interleave operation to merge the vectors. */
46213 static bool
46214 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46216 struct expand_vec_perm_d dremap, dfinal;
46217 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46218 unsigned HOST_WIDE_INT contents;
46219 unsigned char remap[2 * MAX_VECT_LEN];
46220 rtx_insn *seq;
46221 bool ok, same_halves = false;
46223 if (GET_MODE_SIZE (d->vmode) == 16)
46225 if (d->one_operand_p)
46226 return false;
46228 else if (GET_MODE_SIZE (d->vmode) == 32)
46230 if (!TARGET_AVX)
46231 return false;
46232 /* For 32-byte modes allow even d->one_operand_p.
46233 The lack of cross-lane shuffling in some instructions
46234 might prevent a single insn shuffle. */
46235 dfinal = *d;
46236 dfinal.testing_p = true;
46237 /* If expand_vec_perm_interleave3 can expand this into
46238 a 3 insn sequence, give up and let it be expanded as
46239 a 3 insn sequence.  While that is one insn longer,
46240 it doesn't need a memory operand, and in the common
46241 case where both the interleave low and interleave high
46242 permutations with the same operands are adjacent it
46243 needs only 4 insns for both after CSE. */
46244 if (expand_vec_perm_interleave3 (&dfinal))
46245 return false;
46247 else
46248 return false;
46250 /* Examine from whence the elements come. */
46251 contents = 0;
46252 for (i = 0; i < nelt; ++i)
46253 contents |= HOST_WIDE_INT_1U << d->perm[i];
46255 memset (remap, 0xff, sizeof (remap));
46256 dremap = *d;
46258 if (GET_MODE_SIZE (d->vmode) == 16)
46260 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46262 /* Split the two input vectors into 4 halves. */
46263 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46264 h2 = h1 << nelt2;
46265 h3 = h2 << nelt2;
46266 h4 = h3 << nelt2;
46268 /* If the elements are all from the low halves, use interleave low; similarly
46269 for interleave high.  If the elements are from mis-matched halves, we
46270 can use shufps for V4SF/V4SI or do a DImode shuffle. */
46271 if ((contents & (h1 | h3)) == contents)
46273 /* punpckl* */
46274 for (i = 0; i < nelt2; ++i)
46276 remap[i] = i * 2;
46277 remap[i + nelt] = i * 2 + 1;
46278 dremap.perm[i * 2] = i;
46279 dremap.perm[i * 2 + 1] = i + nelt;
46281 if (!TARGET_SSE2 && d->vmode == V4SImode)
46282 dremap.vmode = V4SFmode;
46284 else if ((contents & (h2 | h4)) == contents)
46286 /* punpckh* */
46287 for (i = 0; i < nelt2; ++i)
46289 remap[i + nelt2] = i * 2;
46290 remap[i + nelt + nelt2] = i * 2 + 1;
46291 dremap.perm[i * 2] = i + nelt2;
46292 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46294 if (!TARGET_SSE2 && d->vmode == V4SImode)
46295 dremap.vmode = V4SFmode;
46297 else if ((contents & (h1 | h4)) == contents)
46299 /* shufps */
46300 for (i = 0; i < nelt2; ++i)
46302 remap[i] = i;
46303 remap[i + nelt + nelt2] = i + nelt2;
46304 dremap.perm[i] = i;
46305 dremap.perm[i + nelt2] = i + nelt + nelt2;
46307 if (nelt != 4)
46309 /* shufpd */
46310 dremap.vmode = V2DImode;
46311 dremap.nelt = 2;
46312 dremap.perm[0] = 0;
46313 dremap.perm[1] = 3;
46316 else if ((contents & (h2 | h3)) == contents)
46318 /* shufps */
46319 for (i = 0; i < nelt2; ++i)
46321 remap[i + nelt2] = i;
46322 remap[i + nelt] = i + nelt2;
46323 dremap.perm[i] = i + nelt2;
46324 dremap.perm[i + nelt2] = i + nelt;
46326 if (nelt != 4)
46328 /* shufpd */
46329 dremap.vmode = V2DImode;
46330 dremap.nelt = 2;
46331 dremap.perm[0] = 1;
46332 dremap.perm[1] = 2;
46335 else
46336 return false;
46338 else
46340 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46341 unsigned HOST_WIDE_INT q[8];
46342 unsigned int nonzero_halves[4];
46344 /* Split the two input vectors into 8 quarters. */
46345 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46346 for (i = 1; i < 8; ++i)
46347 q[i] = q[0] << (nelt4 * i);
46348 for (i = 0; i < 4; ++i)
46349 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46351 nonzero_halves[nzcnt] = i;
46352 ++nzcnt;
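/* After this loop, nonzero_halves lists the 128-bit halves (numbered 0-3
   across the two operands) that actually supply elements, and nzcnt counts
   them. */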
46355 if (nzcnt == 1)
46357 gcc_assert (d->one_operand_p);
46358 nonzero_halves[1] = nonzero_halves[0];
46359 same_halves = true;
46361 else if (d->one_operand_p)
46363 gcc_assert (nonzero_halves[0] == 0);
46364 gcc_assert (nonzero_halves[1] == 1);
46367 if (nzcnt <= 2)
46369 if (d->perm[0] / nelt2 == nonzero_halves[1])
46371 /* Attempt to increase the likelihood that dfinal
46372 shuffle will be intra-lane. */
46373 std::swap (nonzero_halves[0], nonzero_halves[1]);
46376 /* vperm2f128 or vperm2i128. */
46377 for (i = 0; i < nelt2; ++i)
46379 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46380 remap[i + nonzero_halves[0] * nelt2] = i;
46381 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46382 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46385 if (d->vmode != V8SFmode
46386 && d->vmode != V4DFmode
46387 && d->vmode != V8SImode)
46389 dremap.vmode = V8SImode;
46390 dremap.nelt = 8;
46391 for (i = 0; i < 4; ++i)
46393 dremap.perm[i] = i + nonzero_halves[0] * 4;
46394 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46398 else if (d->one_operand_p)
46399 return false;
46400 else if (TARGET_AVX2
46401 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46403 /* vpunpckl* */
46404 for (i = 0; i < nelt4; ++i)
46406 remap[i] = i * 2;
46407 remap[i + nelt] = i * 2 + 1;
46408 remap[i + nelt2] = i * 2 + nelt2;
46409 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46410 dremap.perm[i * 2] = i;
46411 dremap.perm[i * 2 + 1] = i + nelt;
46412 dremap.perm[i * 2 + nelt2] = i + nelt2;
46413 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46416 else if (TARGET_AVX2
46417 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46419 /* vpunpckh* */
46420 for (i = 0; i < nelt4; ++i)
46422 remap[i + nelt4] = i * 2;
46423 remap[i + nelt + nelt4] = i * 2 + 1;
46424 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46425 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46426 dremap.perm[i * 2] = i + nelt4;
46427 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46428 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46429 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46432 else
46433 return false;
46436 /* Use the remapping array set up above to move the elements from their
46437 swizzled locations into their final destinations. */
46438 dfinal = *d;
46439 for (i = 0; i < nelt; ++i)
46441 unsigned e = remap[d->perm[i]];
46442 gcc_assert (e < nelt);
46443 /* If same_halves is true, both halves of the remapped vector are the
46444 same. Avoid cross-lane accesses if possible. */
46445 if (same_halves && i >= nelt2)
46447 gcc_assert (e < nelt2);
46448 dfinal.perm[i] = e + nelt2;
46450 else
46451 dfinal.perm[i] = e;
46453 if (!d->testing_p)
46455 dremap.target = gen_reg_rtx (dremap.vmode);
46456 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46458 dfinal.op1 = dfinal.op0;
46459 dfinal.one_operand_p = true;
46461 /* Test if the final remap can be done with a single insn. For V4SFmode or
46462 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46463 start_sequence ();
46464 ok = expand_vec_perm_1 (&dfinal);
46465 seq = get_insns ();
46466 end_sequence ();
46468 if (!ok)
46469 return false;
46471 if (d->testing_p)
46472 return true;
46474 if (dremap.vmode != dfinal.vmode)
46476 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46477 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46480 ok = expand_vec_perm_1 (&dremap);
46481 gcc_assert (ok);
46483 emit_insn (seq);
46484 return true;
46487 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46488 a single vector cross-lane permutation into vpermq followed
46489 by any of the single insn permutations. */
46491 static bool
46492 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46494 struct expand_vec_perm_d dremap, dfinal;
46495 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46496 unsigned contents[2];
46497 bool ok;
46499 if (!(TARGET_AVX2
46500 && (d->vmode == V32QImode || d->vmode == V16HImode)
46501 && d->one_operand_p))
46502 return false;
46504 contents[0] = 0;
46505 contents[1] = 0;
46506 for (i = 0; i < nelt2; ++i)
46508 contents[0] |= 1u << (d->perm[i] / nelt4);
46509 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
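/* contents[0] and contents[1] record which 64-bit quarters of the input
   feed the low and high halves of the result; each half may draw from at
   most two quarters for vpermq to gather them. */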
46512 for (i = 0; i < 2; ++i)
46514 unsigned int cnt = 0;
46515 for (j = 0; j < 4; ++j)
46516 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46517 return false;
46520 if (d->testing_p)
46521 return true;
46523 dremap = *d;
46524 dremap.vmode = V4DImode;
46525 dremap.nelt = 4;
46526 dremap.target = gen_reg_rtx (V4DImode);
46527 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46528 dremap.op1 = dremap.op0;
46529 dremap.one_operand_p = true;
46530 for (i = 0; i < 2; ++i)
46532 unsigned int cnt = 0;
46533 for (j = 0; j < 4; ++j)
46534 if ((contents[i] & (1u << j)) != 0)
46535 dremap.perm[2 * i + cnt++] = j;
46536 for (; cnt < 2; ++cnt)
46537 dremap.perm[2 * i + cnt] = 0;
46540 dfinal = *d;
46541 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46542 dfinal.op1 = dfinal.op0;
46543 dfinal.one_operand_p = true;
46544 for (i = 0, j = 0; i < nelt; ++i)
46546 if (i == nelt2)
46547 j = 2;
46548 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46549 if ((d->perm[i] / nelt4) == dremap.perm[j])
46551 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46552 dfinal.perm[i] |= nelt4;
46553 else
46554 gcc_unreachable ();
46557 ok = expand_vec_perm_1 (&dremap);
46558 gcc_assert (ok);
46560 ok = expand_vec_perm_1 (&dfinal);
46561 gcc_assert (ok);
46563 return true;
46566 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46567 a vector permutation using two instructions, vperm2f128 resp.
46568 vperm2i128 followed by any single in-lane permutation. */
46570 static bool
46571 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46573 struct expand_vec_perm_d dfirst, dsecond;
46574 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46575 bool ok;
46577 if (!TARGET_AVX
46578 || GET_MODE_SIZE (d->vmode) != 32
46579 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46580 return false;
46582 dsecond = *d;
46583 dsecond.one_operand_p = false;
46584 dsecond.testing_p = true;
46586 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46587 immediate. For perm < 16 the second permutation uses
46588 d->op0 as first operand, for perm >= 16 it uses d->op1
46589 as first operand. The second operand is the result of
46590 vperm2[fi]128. */
46591 for (perm = 0; perm < 32; perm++)
46593 /* Ignore permutations which do not move anything cross-lane. */
46594 if (perm < 16)
46596 /* The second shuffle for e.g. V4DFmode has
46597 0123 and ABCD operands.
46598 Ignore AB23, as 23 is already in the second lane
46599 of the first operand. */
46600 if ((perm & 0xc) == (1 << 2)) continue;
46601 /* And 01CD, as 01 is in the first lane of the first
46602 operand. */
46603 if ((perm & 3) == 0) continue;
46604 /* And 4567, as then the vperm2[fi]128 doesn't change
46605 anything on the original 4567 second operand. */
46606 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46608 else
46610 /* The second shuffle for e.g. V4DFmode has
46611 4567 and ABCD operands.
46612 Ignore AB67, as 67 is already in the second lane
46613 of the first operand. */
46614 if ((perm & 0xc) == (3 << 2)) continue;
46615 /* And 45CD, as 45 is in the first lane of the first
46616 operand. */
46617 if ((perm & 3) == 2) continue;
46618 /* And 0123, as then the vperm2[fi]128 doesn't change
46619 anything on the original 0123 first operand. */
46620 if ((perm & 0xf) == (1 << 2)) continue;
46623 for (i = 0; i < nelt; i++)
46625 j = d->perm[i] / nelt2;
46626 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46627 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46628 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46629 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46630 else
46631 break;
46634 if (i == nelt)
46636 start_sequence ();
46637 ok = expand_vec_perm_1 (&dsecond);
46638 end_sequence ();
46640 else
46641 ok = false;
46643 if (ok)
46645 if (d->testing_p)
46646 return true;
46648 /* Found a usable second shuffle. dfirst will be
46649 vperm2f128 on d->op0 and d->op1. */
46650 dsecond.testing_p = false;
46651 dfirst = *d;
46652 dfirst.target = gen_reg_rtx (d->vmode);
46653 for (i = 0; i < nelt; i++)
46654 dfirst.perm[i] = (i & (nelt2 - 1))
46655 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46657 canonicalize_perm (&dfirst);
46658 ok = expand_vec_perm_1 (&dfirst);
46659 gcc_assert (ok);
46661 /* And dsecond is some single insn shuffle, taking
46662 d->op0 and result of vperm2f128 (if perm < 16) or
46663 d->op1 and result of vperm2f128 (otherwise). */
46664 if (perm >= 16)
46665 dsecond.op0 = dsecond.op1;
46666 dsecond.op1 = dfirst.target;
46668 ok = expand_vec_perm_1 (&dsecond);
46669 gcc_assert (ok);
46671 return true;
46674 /* For one operand, the only useful vperm2f128 permutation is 0x01
46675 aka lanes swap. */
46676 if (d->one_operand_p)
46677 return false;
46680 return false;
46683 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46684 a two vector permutation using 2 intra-lane interleave insns
46685 and cross-lane shuffle for 32-byte vectors. */
46687 static bool
46688 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46690 unsigned i, nelt;
46691 rtx (*gen) (rtx, rtx, rtx);
46693 if (d->one_operand_p)
46694 return false;
46695 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46697 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46699 else
46700 return false;
46702 nelt = d->nelt;
46703 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46704 return false;
46705 for (i = 0; i < nelt; i += 2)
46706 if (d->perm[i] != d->perm[0] + i / 2
46707 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46708 return false;
46710 if (d->testing_p)
46711 return true;
46713 switch (d->vmode)
46715 case V32QImode:
46716 if (d->perm[0])
46717 gen = gen_vec_interleave_highv32qi;
46718 else
46719 gen = gen_vec_interleave_lowv32qi;
46720 break;
46721 case V16HImode:
46722 if (d->perm[0])
46723 gen = gen_vec_interleave_highv16hi;
46724 else
46725 gen = gen_vec_interleave_lowv16hi;
46726 break;
46727 case V8SImode:
46728 if (d->perm[0])
46729 gen = gen_vec_interleave_highv8si;
46730 else
46731 gen = gen_vec_interleave_lowv8si;
46732 break;
46733 case V4DImode:
46734 if (d->perm[0])
46735 gen = gen_vec_interleave_highv4di;
46736 else
46737 gen = gen_vec_interleave_lowv4di;
46738 break;
46739 case V8SFmode:
46740 if (d->perm[0])
46741 gen = gen_vec_interleave_highv8sf;
46742 else
46743 gen = gen_vec_interleave_lowv8sf;
46744 break;
46745 case V4DFmode:
46746 if (d->perm[0])
46747 gen = gen_vec_interleave_highv4df;
46748 else
46749 gen = gen_vec_interleave_lowv4df;
46750 break;
46751 default:
46752 gcc_unreachable ();
46755 emit_insn (gen (d->target, d->op0, d->op1));
46756 return true;
46759 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46760 a single vector permutation using a single intra-lane vector
46761 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46762 the non-swapped and swapped vectors together. */
46764 static bool
46765 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46767 struct expand_vec_perm_d dfirst, dsecond;
46768 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46769 rtx_insn *seq;
46770 bool ok;
46771 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46773 if (!TARGET_AVX
46774 || TARGET_AVX2
46775 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46776 || !d->one_operand_p)
46777 return false;
46779 dfirst = *d;
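/* 0xff marks result slots not yet assigned by the intra-lane shuffle built
   below. */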
46780 for (i = 0; i < nelt; i++)
46781 dfirst.perm[i] = 0xff;
46782 for (i = 0, msk = 0; i < nelt; i++)
46784 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46785 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46786 return false;
46787 dfirst.perm[j] = d->perm[i];
46788 if (j != i)
46789 msk |= (1 << i);
46791 for (i = 0; i < nelt; i++)
46792 if (dfirst.perm[i] == 0xff)
46793 dfirst.perm[i] = i;
46795 if (!d->testing_p)
46796 dfirst.target = gen_reg_rtx (dfirst.vmode);
46798 start_sequence ();
46799 ok = expand_vec_perm_1 (&dfirst);
46800 seq = get_insns ();
46801 end_sequence ();
46803 if (!ok)
46804 return false;
46806 if (d->testing_p)
46807 return true;
46809 emit_insn (seq);
46811 dsecond = *d;
46812 dsecond.op0 = dfirst.target;
46813 dsecond.op1 = dfirst.target;
46814 dsecond.one_operand_p = true;
46815 dsecond.target = gen_reg_rtx (dsecond.vmode);
46816 for (i = 0; i < nelt; i++)
46817 dsecond.perm[i] = i ^ nelt2;
46819 ok = expand_vec_perm_1 (&dsecond);
46820 gcc_assert (ok);
46822 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46823 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46824 return true;
46827 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46828 permutation using two vperm2f128, followed by a vshufpd insn blending
46829 the two vectors together. */
46831 static bool
46832 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46834 struct expand_vec_perm_d dfirst, dsecond, dthird;
46835 bool ok;
46837 if (!TARGET_AVX || (d->vmode != V4DFmode))
46838 return false;
46840 if (d->testing_p)
46841 return true;
46843 dfirst = *d;
46844 dsecond = *d;
46845 dthird = *d;
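/* dfirst and dsecond each gather, per 128-bit lane, the aligned pair of
   doubles containing a requested element; dthird then picks the wanted
   double from each pair, blending the two intermediate vectors. */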
46847 dfirst.perm[0] = (d->perm[0] & ~1);
46848 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46849 dfirst.perm[2] = (d->perm[2] & ~1);
46850 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46851 dsecond.perm[0] = (d->perm[1] & ~1);
46852 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46853 dsecond.perm[2] = (d->perm[3] & ~1);
46854 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46855 dthird.perm[0] = (d->perm[0] % 2);
46856 dthird.perm[1] = (d->perm[1] % 2) + 4;
46857 dthird.perm[2] = (d->perm[2] % 2) + 2;
46858 dthird.perm[3] = (d->perm[3] % 2) + 6;
46860 dfirst.target = gen_reg_rtx (dfirst.vmode);
46861 dsecond.target = gen_reg_rtx (dsecond.vmode);
46862 dthird.op0 = dfirst.target;
46863 dthird.op1 = dsecond.target;
46864 dthird.one_operand_p = false;
46866 canonicalize_perm (&dfirst);
46867 canonicalize_perm (&dsecond);
46869 ok = expand_vec_perm_1 (&dfirst)
46870 && expand_vec_perm_1 (&dsecond)
46871 && expand_vec_perm_1 (&dthird);
46873 gcc_assert (ok);
46875 return true;
46878 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46879 permutation with two pshufb insns and an ior. We should have already
46880 failed all two instruction sequences. */
46882 static bool
46883 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46885 rtx rperm[2][16], vperm, l, h, op, m128;
46886 unsigned int i, nelt, eltsz;
46888 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46889 return false;
46890 gcc_assert (!d->one_operand_p);
46892 if (d->testing_p)
46893 return true;
46895 nelt = d->nelt;
46896 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46898 /* Generate two permutation masks. If the required element is within
46899 the given vector it is shuffled into the proper lane. If the required
46900 element is in the other vector, force a zero into the lane by setting
46901 bit 7 in the permutation mask. */
46902 m128 = GEN_INT (-128);
46903 for (i = 0; i < nelt; ++i)
46905 unsigned j, e = d->perm[i];
46906 unsigned which = (e >= nelt);
46907 if (e >= nelt)
46908 e -= nelt;
46910 for (j = 0; j < eltsz; ++j)
46912 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46913 rperm[1-which][i*eltsz + j] = m128;
46917 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46918 vperm = force_reg (V16QImode, vperm);
46920 l = gen_reg_rtx (V16QImode);
46921 op = gen_lowpart (V16QImode, d->op0);
46922 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46924 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46925 vperm = force_reg (V16QImode, vperm);
46927 h = gen_reg_rtx (V16QImode);
46928 op = gen_lowpart (V16QImode, d->op1);
46929 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46931 op = d->target;
46932 if (d->vmode != V16QImode)
46933 op = gen_reg_rtx (V16QImode);
46934 emit_insn (gen_iorv16qi3 (op, l, h));
46935 if (op != d->target)
46936 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46938 return true;
46941 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46942 with two vpshufb insns, vpermq and vpor. We should have already failed
46943 all two or three instruction sequences. */
46945 static bool
46946 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46948 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46949 unsigned int i, nelt, eltsz;
46951 if (!TARGET_AVX2
46952 || !d->one_operand_p
46953 || (d->vmode != V32QImode && d->vmode != V16HImode))
46954 return false;
46956 if (d->testing_p)
46957 return true;
46959 nelt = d->nelt;
46960 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46962 /* Generate two permutation masks.  If the required element is within
46963 the same lane, it is shuffled in.  If the required element is from the
46964 other lane, force a zero by setting bit 7 in the permutation mask.
46965 The other mask has non-negative elements where an element is
46966 requested from the other lane, but those elements are also moved to
46967 the other lane, so that the result of vpshufb can have the two
46968 V2TImode halves swapped. */
46969 m128 = GEN_INT (-128);
46970 for (i = 0; i < nelt; ++i)
46972 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46973 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
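/* WHICH is 0 when the element stays within its 128-bit lane, otherwise the
   byte offset (16) of the other lane; such elements are routed through the
   lane swap performed below. */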
46975 for (j = 0; j < eltsz; ++j)
46977 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46978 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46982 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46983 vperm = force_reg (V32QImode, vperm);
46985 h = gen_reg_rtx (V32QImode);
46986 op = gen_lowpart (V32QImode, d->op0);
46987 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46989 /* Swap the 128-bit lanes of h into hp. */
46990 hp = gen_reg_rtx (V4DImode);
46991 op = gen_lowpart (V4DImode, h);
46992 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46993 const1_rtx));
46995 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46996 vperm = force_reg (V32QImode, vperm);
46998 l = gen_reg_rtx (V32QImode);
46999 op = gen_lowpart (V32QImode, d->op0);
47000 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47002 op = d->target;
47003 if (d->vmode != V32QImode)
47004 op = gen_reg_rtx (V32QImode);
47005 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
47006 if (op != d->target)
47007 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47009 return true;
47012 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47013 and extract-odd permutations of two V32QImode or V16HImode operands
47014 with two vpshufb insns, vpor and vpermq. We should have already
47015 failed all two or three instruction sequences. */
47017 static bool
47018 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
47020 rtx rperm[2][32], vperm, l, h, ior, op, m128;
47021 unsigned int i, nelt, eltsz;
47023 if (!TARGET_AVX2
47024 || d->one_operand_p
47025 || (d->vmode != V32QImode && d->vmode != V16HImode))
47026 return false;
47028 for (i = 0; i < d->nelt; ++i)
47029 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
47030 return false;
47032 if (d->testing_p)
47033 return true;
47035 nelt = d->nelt;
47036 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47038 /* Generate two permutation masks. In the first permutation mask
47039 the first quarter will contain indexes for the first half
47040 of the op0, the second quarter will contain bit 7 set, third quarter
47041 will contain indexes for the second half of the op0 and the
47042 last quarter bit 7 set. In the second permutation mask
47043 the first quarter will contain bit 7 set, the second quarter
47044 indexes for the first half of the op1, the third quarter bit 7 set
47045 and last quarter indexes for the second half of the op1.
47046 I.e. the first mask e.g. for V32QImode extract even will be:
47047 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
47048 (all values masked with 0xf except for -128) and second mask
47049 for extract even will be
47050 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
47051 m128 = GEN_INT (-128);
47052 for (i = 0; i < nelt; ++i)
47054 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47055 unsigned which = d->perm[i] >= nelt;
47056 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
47058 for (j = 0; j < eltsz; ++j)
47060 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
47061 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
47065 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47066 vperm = force_reg (V32QImode, vperm);
47068 l = gen_reg_rtx (V32QImode);
47069 op = gen_lowpart (V32QImode, d->op0);
47070 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47072 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47073 vperm = force_reg (V32QImode, vperm);
47075 h = gen_reg_rtx (V32QImode);
47076 op = gen_lowpart (V32QImode, d->op1);
47077 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47079 ior = gen_reg_rtx (V32QImode);
47080 emit_insn (gen_iorv32qi3 (ior, l, h));
47082 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
47083 op = gen_reg_rtx (V4DImode);
47084 ior = gen_lowpart (V4DImode, ior);
47085 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
47086 const1_rtx, GEN_INT (3)));
47087 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47089 return true;
47092 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47093 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
47094 with two "and" and "pack" or two "shift" and "pack" insns. We should
47095 have already failed all two instruction sequences. */
47097 static bool
47098 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
47100 rtx op, dop0, dop1, t, rperm[16];
47101 unsigned i, odd, c, s, nelt = d->nelt;
47102 bool end_perm = false;
47103 machine_mode half_mode;
47104 rtx (*gen_and) (rtx, rtx, rtx);
47105 rtx (*gen_pack) (rtx, rtx, rtx);
47106 rtx (*gen_shift) (rtx, rtx, rtx);
47108 if (d->one_operand_p)
47109 return false;
47111 switch (d->vmode)
47113 case V8HImode:
47114 /* Required for "pack". */
47115 if (!TARGET_SSE4_1)
47116 return false;
47117 c = 0xffff;
47118 s = 16;
47119 half_mode = V4SImode;
47120 gen_and = gen_andv4si3;
47121 gen_pack = gen_sse4_1_packusdw;
47122 gen_shift = gen_lshrv4si3;
47123 break;
47124 case V16QImode:
47125 /* No check as all instructions are SSE2. */
47126 c = 0xff;
47127 s = 8;
47128 half_mode = V8HImode;
47129 gen_and = gen_andv8hi3;
47130 gen_pack = gen_sse2_packuswb;
47131 gen_shift = gen_lshrv8hi3;
47132 break;
47133 case V16HImode:
47134 if (!TARGET_AVX2)
47135 return false;
47136 c = 0xffff;
47137 s = 16;
47138 half_mode = V8SImode;
47139 gen_and = gen_andv8si3;
47140 gen_pack = gen_avx2_packusdw;
47141 gen_shift = gen_lshrv8si3;
47142 end_perm = true;
47143 break;
47144 case V32QImode:
47145 if (!TARGET_AVX2)
47146 return false;
47147 c = 0xff;
47148 s = 8;
47149 half_mode = V16HImode;
47150 gen_and = gen_andv16hi3;
47151 gen_pack = gen_avx2_packuswb;
47152 gen_shift = gen_lshrv16hi3;
47153 end_perm = true;
47154 break;
47155 default:
47156 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
47157 general shuffles. */
47158 return false;
47161 /* Check that permutation is even or odd. */
47162 odd = d->perm[0];
47163 if (odd > 1)
47164 return false;
47166 for (i = 1; i < nelt; ++i)
47167 if (d->perm[i] != 2 * i + odd)
47168 return false;
47170 if (d->testing_p)
47171 return true;
47173 dop0 = gen_reg_rtx (half_mode);
47174 dop1 = gen_reg_rtx (half_mode);
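/* For the even permutation mask each wide element down to its low half;
   for the odd permutation shift the high half down.  The pack insn then
   narrows both intermediate vectors into the result. */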
47175 if (odd == 0)
47177 for (i = 0; i < nelt / 2; i++)
47178 rperm[i] = GEN_INT (c);
47179 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
47180 t = force_reg (half_mode, t);
47181 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47182 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47184 else
47186 emit_insn (gen_shift (dop0,
47187 gen_lowpart (half_mode, d->op0),
47188 GEN_INT (s)));
47189 emit_insn (gen_shift (dop1,
47190 gen_lowpart (half_mode, d->op1),
47191 GEN_INT (s)));
47193 /* In the AVX2 256-bit case we need to permute the pack result. */
47194 if (TARGET_AVX2 && end_perm)
47196 op = gen_reg_rtx (d->vmode);
47197 t = gen_reg_rtx (V4DImode);
47198 emit_insn (gen_pack (op, dop0, dop1));
47199 emit_insn (gen_avx2_permv4di_1 (t,
47200 gen_lowpart (V4DImode, op),
47201 const0_rtx,
47202 const2_rtx,
47203 const1_rtx,
47204 GEN_INT (3)));
47205 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47207 else
47208 emit_insn (gen_pack (d->target, dop0, dop1));
47210 return true;
47213 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47214 and extract-odd permutations of two V64QI operands
47215 with two "shifts", two "truncs" and one "concat" insn for "odd",
47216 and two "truncs" and one "concat" insn for "even".
47217 We should have already failed all two instruction sequences. */
47219 static bool
47220 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47222 rtx t1, t2, t3, t4;
47223 unsigned i, odd, nelt = d->nelt;
47225 if (!TARGET_AVX512BW
47226 || d->one_operand_p
47227 || d->vmode != V64QImode)
47228 return false;
47230 /* Check that permutation is even or odd. */
47231 odd = d->perm[0];
47232 if (odd > 1)
47233 return false;
47235 for (i = 1; i < nelt; ++i)
47236 if (d->perm[i] != 2 * i + odd)
47237 return false;
47239 if (d->testing_p)
47240 return true;
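/* For the odd permutation shift each 16-bit element right by 8 so the odd
   bytes land in the low byte; truncation then keeps those bytes and the
   two truncated halves are concatenated. */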
47243 if (odd)
47245 t1 = gen_reg_rtx (V32HImode);
47246 t2 = gen_reg_rtx (V32HImode);
47247 emit_insn (gen_lshrv32hi3 (t1,
47248 gen_lowpart (V32HImode, d->op0),
47249 GEN_INT (8)));
47250 emit_insn (gen_lshrv32hi3 (t2,
47251 gen_lowpart (V32HImode, d->op1),
47252 GEN_INT (8)));
47254 else
47256 t1 = gen_lowpart (V32HImode, d->op0);
47257 t2 = gen_lowpart (V32HImode, d->op1);
47260 t3 = gen_reg_rtx (V32QImode);
47261 t4 = gen_reg_rtx (V32QImode);
47262 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47263 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47264 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47266 return true;
47269 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47270 and extract-odd permutations. */
47272 static bool
47273 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47275 rtx t1, t2, t3, t4, t5;
47277 switch (d->vmode)
47279 case V4DFmode:
47280 if (d->testing_p)
47281 break;
47282 t1 = gen_reg_rtx (V4DFmode);
47283 t2 = gen_reg_rtx (V4DFmode);
47285 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47286 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47287 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47289 /* Now an unpck[lh]pd will produce the result required. */
47290 if (odd)
47291 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47292 else
47293 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47294 emit_insn (t3);
47295 break;
47297 case V8SFmode:
47299 int mask = odd ? 0xdd : 0x88;
47301 if (d->testing_p)
47302 break;
47303 t1 = gen_reg_rtx (V8SFmode);
47304 t2 = gen_reg_rtx (V8SFmode);
47305 t3 = gen_reg_rtx (V8SFmode);
47307 /* Shuffle within the 128-bit lanes to produce:
47308 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47309 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47310 GEN_INT (mask)));
47312 /* Shuffle the lanes around to produce:
47313 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47314 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47315 GEN_INT (0x3)));
47317 /* Shuffle within the 128-bit lanes to produce:
47318 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47319 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47321 /* Shuffle within the 128-bit lanes to produce:
47322 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47323 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47325 /* Shuffle the lanes around to produce:
47326 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47327 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47328 GEN_INT (0x20)));
47330 break;
47332 case V2DFmode:
47333 case V4SFmode:
47334 case V2DImode:
47335 case V4SImode:
47336 /* These are always directly implementable by expand_vec_perm_1. */
47337 gcc_unreachable ();
47339 case V8HImode:
47340 if (TARGET_SSE4_1)
47341 return expand_vec_perm_even_odd_pack (d);
47342 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47343 return expand_vec_perm_pshufb2 (d);
47344 else
47346 if (d->testing_p)
47347 break;
47348 /* We need 2*log2(N)-1 operations to achieve odd/even
47349 with interleave. */
47350 t1 = gen_reg_rtx (V8HImode);
47351 t2 = gen_reg_rtx (V8HImode);
47352 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47353 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47354 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47355 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47356 if (odd)
47357 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47358 else
47359 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47360 emit_insn (t3);
47362 break;
47364 case V16QImode:
47365 return expand_vec_perm_even_odd_pack (d);
47367 case V16HImode:
47368 case V32QImode:
47369 return expand_vec_perm_even_odd_pack (d);
47371 case V64QImode:
47372 return expand_vec_perm_even_odd_trunc (d);
47374 case V4DImode:
47375 if (!TARGET_AVX2)
47377 struct expand_vec_perm_d d_copy = *d;
47378 d_copy.vmode = V4DFmode;
47379 if (d->testing_p)
47380 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47381 else
47382 d_copy.target = gen_reg_rtx (V4DFmode);
47383 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47384 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47385 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47387 if (!d->testing_p)
47388 emit_move_insn (d->target,
47389 gen_lowpart (V4DImode, d_copy.target));
47390 return true;
47392 return false;
47395 if (d->testing_p)
47396 break;
47398 t1 = gen_reg_rtx (V4DImode);
47399 t2 = gen_reg_rtx (V4DImode);
47401 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47402 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47403 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47405 /* Now a vpunpck[lh]qdq will produce the result required. */
47406 if (odd)
47407 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47408 else
47409 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47410 emit_insn (t3);
47411 break;
47413 case V8SImode:
47414 if (!TARGET_AVX2)
47416 struct expand_vec_perm_d d_copy = *d;
47417 d_copy.vmode = V8SFmode;
47418 if (d->testing_p)
47419 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47420 else
47421 d_copy.target = gen_reg_rtx (V8SFmode);
47422 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47423 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47424 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47426 if (!d->testing_p)
47427 emit_move_insn (d->target,
47428 gen_lowpart (V8SImode, d_copy.target));
47429 return true;
47431 return false;
47434 if (d->testing_p)
47435 break;
47437 t1 = gen_reg_rtx (V8SImode);
47438 t2 = gen_reg_rtx (V8SImode);
47439 t3 = gen_reg_rtx (V4DImode);
47440 t4 = gen_reg_rtx (V4DImode);
47441 t5 = gen_reg_rtx (V4DImode);
47443 /* Shuffle the lanes around into
47444 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47445 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47446 gen_lowpart (V4DImode, d->op1),
47447 GEN_INT (0x20)));
47448 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47449 gen_lowpart (V4DImode, d->op1),
47450 GEN_INT (0x31)));
47452 /* Swap the 2nd and 3rd position in each lane into
47453 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47454 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47455 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47456 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47457 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47459 /* Now a vpunpck[lh]qdq will produce
47460 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47461 if (odd)
47462 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47463 gen_lowpart (V4DImode, t2));
47464 else
47465 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47466 gen_lowpart (V4DImode, t2));
47467 emit_insn (t3);
47468 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47469 break;
47471 default:
47472 gcc_unreachable ();
47475 return true;
47478 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47479 extract-even and extract-odd permutations. */
47481 static bool
47482 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47484 unsigned i, odd, nelt = d->nelt;
47486 odd = d->perm[0];
47487 if (odd != 0 && odd != 1)
47488 return false;
47490 for (i = 1; i < nelt; ++i)
47491 if (d->perm[i] != 2 * i + odd)
47492 return false;
47494 return expand_vec_perm_even_odd_1 (d, odd);
47497 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47498 permutations. We assume that expand_vec_perm_1 has already failed. */
47500 static bool
47501 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47503 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47504 machine_mode vmode = d->vmode;
47505 unsigned char perm2[4];
47506 rtx op0 = d->op0, dest;
47507 bool ok;
47509 switch (vmode)
47511 case V4DFmode:
47512 case V8SFmode:
47513 /* These are special-cased in sse.md so that we can optionally
47514 use the vbroadcast instruction. They expand to two insns
47515 if the input happens to be in a register. */
47516 gcc_unreachable ();
47518 case V2DFmode:
47519 case V2DImode:
47520 case V4SFmode:
47521 case V4SImode:
47522 /* These are always implementable using standard shuffle patterns. */
47523 gcc_unreachable ();
47525 case V8HImode:
47526 case V16QImode:
47527 /* These can be implemented via interleave. We save one insn by
47528 stopping once we have promoted to V4SImode and then use pshufd. */
47529 if (d->testing_p)
47530 return true;
47533 rtx dest;
47534 rtx (*gen) (rtx, rtx, rtx)
47535 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47536 : gen_vec_interleave_lowv8hi;
47538 if (elt >= nelt2)
47540 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47541 : gen_vec_interleave_highv8hi;
47542 elt -= nelt2;
47544 nelt2 /= 2;
47546 dest = gen_reg_rtx (vmode);
47547 emit_insn (gen (dest, op0, op0));
47548 vmode = get_mode_wider_vector (vmode);
47549 op0 = gen_lowpart (vmode, dest);
47551 while (vmode != V4SImode);
47553 memset (perm2, elt, 4);
47554 dest = gen_reg_rtx (V4SImode);
47555 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47556 gcc_assert (ok);
47557 if (!d->testing_p)
47558 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47559 return true;
47561 case V64QImode:
47562 case V32QImode:
47563 case V16HImode:
47564 case V8SImode:
47565 case V4DImode:
47566 /* For AVX2 broadcasts of the first element vpbroadcast* or
47567 vpermq should be used by expand_vec_perm_1. */
47568 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47569 return false;
47571 default:
47572 gcc_unreachable ();
47576 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47577 broadcast permutations. */
47579 static bool
47580 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47582 unsigned i, elt, nelt = d->nelt;
47584 if (!d->one_operand_p)
47585 return false;
47587 elt = d->perm[0];
47588 for (i = 1; i < nelt; ++i)
47589 if (d->perm[i] != elt)
47590 return false;
47592 return expand_vec_perm_broadcast_1 (d);
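/* Illustrative sketch, not part of i386.c: a scalar model of the
   interleave-doubling idea expand_vec_perm_broadcast_1 uses for
   V16QImode/V8HImode.  Each self-interleave of the half that holds the
   wanted element doubles the number of adjacent copies; the real expander
   stops once it reaches V4SImode and finishes with pshufd, while this
   model simply runs all four rounds.  Names are assumptions for
   illustration.  */
#if 0
#include <stdint.h>
#include <string.h>

static void
model_broadcast_byte (uint8_t vec[16], unsigned elt)
{
  unsigned round, i;

  for (round = 0; round < 4; ++round)
    {
      uint8_t tmp[16];
      /* Self-interleave the half that currently holds the element.  */
      const uint8_t *half = elt >= 8 ? vec + 8 : vec;
      if (elt >= 8)
        elt -= 8;
      for (i = 0; i < 8; ++i)
        {
          tmp[2 * i] = half[i];
          tmp[2 * i + 1] = half[i];
        }
      memcpy (vec, tmp, 16);
      /* The wanted byte now also occupies slots elt and elt + 1.  */
      elt *= 2;
    }
  /* All 16 bytes of VEC now equal the original vec[elt].  */
}
#endif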
47595 /* Implement arbitrary permutations of two V64QImode operands
47596 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
47597 static bool
47598 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
47600 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47601 return false;
47603 if (d->testing_p)
47604 return true;
47606 struct expand_vec_perm_d ds[2];
47607 rtx rperm[128], vperm, target0, target1;
47608 unsigned int i, nelt;
47609 machine_mode vmode;
47611 nelt = d->nelt;
47612 vmode = V64QImode;
47614 for (i = 0; i < 2; i++)
47616 ds[i] = *d;
47617 ds[i].vmode = V32HImode;
47618 ds[i].nelt = 32;
47619 ds[i].target = gen_reg_rtx (V32HImode);
47620 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47621 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47624 /* Prepare permutations such that the first one takes care of
47625 putting the even bytes into the right positions or one higher
47626 positions (ds[0]) and the second one takes care of
47627 putting the odd bytes into the right positions or one below
47628 (ds[1]). */
47630 for (i = 0; i < nelt; i++)
47632 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47633 if (i & 1)
47635 rperm[i] = constm1_rtx;
47636 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47638 else
47640 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47641 rperm[i + 64] = constm1_rtx;
47645 bool ok = expand_vec_perm_1 (&ds[0]);
47646 gcc_assert (ok);
47647 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47649 ok = expand_vec_perm_1 (&ds[1]);
47650 gcc_assert (ok);
47651 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47653 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47654 vperm = force_reg (vmode, vperm);
47655 target0 = gen_reg_rtx (V64QImode);
47656 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47658 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47659 vperm = force_reg (vmode, vperm);
47660 target1 = gen_reg_rtx (V64QImode);
47661 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47663 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47664 return true;
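/* Illustrative sketch, not part of i386.c: the core idea of
   expand_vec_perm_vpermi2_vpshub2 above in scalar form.  A byte
   permutation is done as a word (16-bit) permutation by index / 2,
   followed by selecting the low or high byte by index & 1.  The real
   expander needs two word permutes and per-lane vpshufb masks; this model
   ignores lanes and assumes little-endian byte order.  Names are
   assumptions for illustration.  */
#if 0
#include <stdint.h>

static void
model_byte_perm_via_words (uint8_t *out, const uint16_t *src_words,
                           const unsigned char *perm, unsigned nelt)
{
  unsigned i;

  for (i = 0; i < nelt; ++i)
    {
      uint16_t w = src_words[perm[i] / 2];        /* word permute */
      out[i] = (perm[i] & 1) ? (uint8_t) (w >> 8) /* high byte */
                             : (uint8_t) w;       /* low byte */
    }
}
#endif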
47667 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
47668 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47669 all the shorter instruction sequences. */
47671 static bool
47672 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47674 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47675 unsigned int i, nelt, eltsz;
47676 bool used[4];
47678 if (!TARGET_AVX2
47679 || d->one_operand_p
47680 || (d->vmode != V32QImode && d->vmode != V16HImode))
47681 return false;
47683 if (d->testing_p)
47684 return true;
47686 nelt = d->nelt;
47687 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47689 /* Generate 4 permutation masks. If the required element is within
47690 the same lane, it is shuffled in. If the required element is from
47691 the other lane, force a zero by setting bit 7 in the permutation mask.
47692 In the other mask, the mask has non-negative elements if the element
47693 is requested from the other lane, but it is also moved to the other lane,
47694 so that the result of vpshufb can have the two V2TImode halves
47695 swapped. */
47696 m128 = GEN_INT (-128);
47697 for (i = 0; i < 32; ++i)
47699 rperm[0][i] = m128;
47700 rperm[1][i] = m128;
47701 rperm[2][i] = m128;
47702 rperm[3][i] = m128;
47704 used[0] = false;
47705 used[1] = false;
47706 used[2] = false;
47707 used[3] = false;
47708 for (i = 0; i < nelt; ++i)
47710 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47711 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47712 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47714 for (j = 0; j < eltsz; ++j)
47715 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47716 used[which] = true;
47719 for (i = 0; i < 2; ++i)
47721 if (!used[2 * i + 1])
47723 h[i] = NULL_RTX;
47724 continue;
47726 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47727 gen_rtvec_v (32, rperm[2 * i + 1]));
47728 vperm = force_reg (V32QImode, vperm);
47729 h[i] = gen_reg_rtx (V32QImode);
47730 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47731 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47734 /* Swap the 128-bit lanes of h[X]. */
47735 for (i = 0; i < 2; ++i)
47737 if (h[i] == NULL_RTX)
47738 continue;
47739 op = gen_reg_rtx (V4DImode);
47740 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47741 const2_rtx, GEN_INT (3), const0_rtx,
47742 const1_rtx));
47743 h[i] = gen_lowpart (V32QImode, op);
47746 for (i = 0; i < 2; ++i)
47748 if (!used[2 * i])
47750 l[i] = NULL_RTX;
47751 continue;
47753 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47754 vperm = force_reg (V32QImode, vperm);
47755 l[i] = gen_reg_rtx (V32QImode);
47756 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47757 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47760 for (i = 0; i < 2; ++i)
47762 if (h[i] && l[i])
47764 op = gen_reg_rtx (V32QImode);
47765 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47766 l[i] = op;
47768 else if (h[i])
47769 l[i] = h[i];
47772 gcc_assert (l[0] && l[1]);
47773 op = d->target;
47774 if (d->vmode != V32QImode)
47775 op = gen_reg_rtx (V32QImode);
47776 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47777 if (op != d->target)
47778 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47779 return true;
47782 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47783 With all of the interface bits taken care of, perform the expansion
47784 in D and return true on success. */
47786 static bool
47787 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47789 /* Try a single instruction expansion. */
47790 if (expand_vec_perm_1 (d))
47791 return true;
47793 /* Try sequences of two instructions. */
47795 if (expand_vec_perm_pshuflw_pshufhw (d))
47796 return true;
47798 if (expand_vec_perm_palignr (d, false))
47799 return true;
47801 if (expand_vec_perm_interleave2 (d))
47802 return true;
47804 if (expand_vec_perm_broadcast (d))
47805 return true;
47807 if (expand_vec_perm_vpermq_perm_1 (d))
47808 return true;
47810 if (expand_vec_perm_vperm2f128 (d))
47811 return true;
47813 if (expand_vec_perm_pblendv (d))
47814 return true;
47816 /* Try sequences of three instructions. */
47818 if (expand_vec_perm_even_odd_pack (d))
47819 return true;
47821 if (expand_vec_perm_2vperm2f128_vshuf (d))
47822 return true;
47824 if (expand_vec_perm_pshufb2 (d))
47825 return true;
47827 if (expand_vec_perm_interleave3 (d))
47828 return true;
47830 if (expand_vec_perm_vperm2f128_vblend (d))
47831 return true;
47833 /* Try sequences of four instructions. */
47835 if (expand_vec_perm_even_odd_trunc (d))
47836 return true;
47837 if (expand_vec_perm_vpshufb2_vpermq (d))
47838 return true;
47840 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47841 return true;
47843 if (expand_vec_perm_vpermi2_vpshub2 (d))
47844 return true;
47846 /* ??? Look for narrow permutations whose element orderings would
47847 allow the promotion to a wider mode. */
47849 /* ??? Look for sequences of interleave or a wider permute that place
47850 the data into the correct lanes for a half-vector shuffle like
47851 pshuf[lh]w or vpermilps. */
47853 /* ??? Look for sequences of interleave that produce the desired results.
47854 The combinatorics of punpck[lh] get pretty ugly... */
47856 if (expand_vec_perm_even_odd (d))
47857 return true;
47859 /* Even longer sequences. */
47860 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47861 return true;
47863 /* See if we can get the same permutation in different vector integer
47864 mode. */
47865 struct expand_vec_perm_d nd;
47866 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47868 if (!d->testing_p)
47869 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47870 return true;
47873 return false;
47876 /* If a permutation only uses one operand, make it clear. Returns true
47877 if the permutation references both operands. */
47879 static bool
47880 canonicalize_perm (struct expand_vec_perm_d *d)
47882 int i, which, nelt = d->nelt;
47884 for (i = which = 0; i < nelt; ++i)
47885 which |= (d->perm[i] < nelt ? 1 : 2);
47887 d->one_operand_p = true;
47888 switch (which)
47890 default:
47891 gcc_unreachable();
47893 case 3:
47894 if (!rtx_equal_p (d->op0, d->op1))
47896 d->one_operand_p = false;
47897 break;
47899 /* The elements of PERM do not suggest that only the first operand
47900 is used, but both operands are identical. Allow easier matching
47901 of the permutation by folding the permutation into the single
47902 input vector. */
47903 /* FALLTHRU */
47905 case 2:
47906 for (i = 0; i < nelt; ++i)
47907 d->perm[i] &= nelt - 1;
47908 d->op0 = d->op1;
47909 break;
47911 case 1:
47912 d->op1 = d->op0;
47913 break;
47916 return (which == 3);
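/* Illustrative sketch, not part of i386.c: canonicalize_perm above in
   stand-alone form.  WHICH records which operands the selector mentions;
   whenever the permutation can be served by a single input (only the
   second operand is referenced, or both operands are the same value), the
   indices are folded down to operand 0.  Names are assumptions for
   illustration.  */
#if 0
#include <stdbool.h>

static bool
model_canonicalize_perm (unsigned char *perm, unsigned nelt,
                         bool ops_identical, bool *one_operand_p)
{
  unsigned i, which = 0;

  for (i = 0; i < nelt; ++i)
    which |= perm[i] < nelt ? 1 : 2;

  *one_operand_p = !(which == 3 && !ops_identical);
  if (*one_operand_p && which != 1)
    for (i = 0; i < nelt; ++i)
      perm[i] &= nelt - 1;	/* fold everything onto operand 0 */

  /* Like canonicalize_perm, report whether the selector mentioned both.  */
  return which == 3;
}
#endif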
47919 bool
47920 ix86_expand_vec_perm_const (rtx operands[4])
47922 struct expand_vec_perm_d d;
47923 unsigned char perm[MAX_VECT_LEN];
47924 int i, nelt;
47925 bool two_args;
47926 rtx sel;
47928 d.target = operands[0];
47929 d.op0 = operands[1];
47930 d.op1 = operands[2];
47931 sel = operands[3];
47933 d.vmode = GET_MODE (d.target);
47934 gcc_assert (VECTOR_MODE_P (d.vmode));
47935 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47936 d.testing_p = false;
47938 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47939 gcc_assert (XVECLEN (sel, 0) == nelt);
47940 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47942 for (i = 0; i < nelt; ++i)
47944 rtx e = XVECEXP (sel, 0, i);
47945 int ei = INTVAL (e) & (2 * nelt - 1);
47946 d.perm[i] = ei;
47947 perm[i] = ei;
47950 two_args = canonicalize_perm (&d);
47952 if (ix86_expand_vec_perm_const_1 (&d))
47953 return true;
47955 /* If the selector says both arguments are needed, but the operands are the
47956 same, the above tried to expand with one_operand_p and flattened selector.
47957 If that didn't work, retry without one_operand_p; we succeeded with that
47958 during testing. */
47959 if (two_args && d.one_operand_p)
47961 d.one_operand_p = false;
47962 memcpy (d.perm, perm, sizeof (perm));
47963 return ix86_expand_vec_perm_const_1 (&d);
47966 return false;
47969 /* Implement targetm.vectorize.vec_perm_const_ok. */
47971 static bool
47972 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
47973 const unsigned char *sel)
47975 struct expand_vec_perm_d d;
47976 unsigned int i, nelt, which;
47977 bool ret;
47979 d.vmode = vmode;
47980 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47981 d.testing_p = true;
47983 /* Given sufficient ISA support we can just return true here
47984 for selected vector modes. */
47985 switch (d.vmode)
47987 case V16SFmode:
47988 case V16SImode:
47989 case V8DImode:
47990 case V8DFmode:
47991 if (TARGET_AVX512F)
47992 /* All implementable with a single vpermi2 insn. */
47993 return true;
47994 break;
47995 case V32HImode:
47996 if (TARGET_AVX512BW)
47997 /* All implementable with a single vpermi2 insn. */
47998 return true;
47999 break;
48000 case V64QImode:
48001 if (TARGET_AVX512BW)
48002 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
48003 return true;
48004 break;
48005 case V8SImode:
48006 case V8SFmode:
48007 case V4DFmode:
48008 case V4DImode:
48009 if (TARGET_AVX512VL)
48010 /* All implementable with a single vpermi2 insn. */
48011 return true;
48012 break;
48013 case V16HImode:
48014 if (TARGET_AVX2)
48015 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48016 return true;
48017 break;
48018 case V32QImode:
48019 if (TARGET_AVX2)
48020 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48021 return true;
48022 break;
48023 case V4SImode:
48024 case V4SFmode:
48025 case V8HImode:
48026 case V16QImode:
48027 /* All implementable with a single vpperm insn. */
48028 if (TARGET_XOP)
48029 return true;
48030 /* All implementable with 2 pshufb + 1 ior. */
48031 if (TARGET_SSSE3)
48032 return true;
48033 break;
48034 case V2DImode:
48035 case V2DFmode:
48036 /* All implementable with shufpd or unpck[lh]pd. */
48037 return true;
48038 default:
48039 return false;
48042 /* Extract the values from the vector CST into the permutation
48043 array in D. */
48044 memcpy (d.perm, sel, nelt);
48045 for (i = which = 0; i < nelt; ++i)
48047 unsigned char e = d.perm[i];
48048 gcc_assert (e < 2 * nelt);
48049 which |= (e < nelt ? 1 : 2);
48052 /* For all elements from second vector, fold the elements to first. */
48053 if (which == 2)
48054 for (i = 0; i < nelt; ++i)
48055 d.perm[i] -= nelt;
48057 /* Check whether the mask can be applied to the vector type. */
48058 d.one_operand_p = (which != 3);
48060 /* Implementable with shufps or pshufd. */
48061 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
48062 return true;
48064 /* Otherwise we have to go through the motions and see if we can
48065 figure out how to generate the requested permutation. */
48066 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
48067 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
48068 if (!d.one_operand_p)
48069 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
48071 start_sequence ();
48072 ret = ix86_expand_vec_perm_const_1 (&d);
48073 end_sequence ();
48075 return ret;
48078 void
48079 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
48081 struct expand_vec_perm_d d;
48082 unsigned i, nelt;
48084 d.target = targ;
48085 d.op0 = op0;
48086 d.op1 = op1;
48087 d.vmode = GET_MODE (targ);
48088 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48089 d.one_operand_p = false;
48090 d.testing_p = false;
48092 for (i = 0; i < nelt; ++i)
48093 d.perm[i] = i * 2 + odd;
48095 /* We'll either be able to implement the permutation directly... */
48096 if (expand_vec_perm_1 (&d))
48097 return;
48099 /* ... or we use the special-case patterns. */
48100 expand_vec_perm_even_odd_1 (&d, odd);
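/* Illustrative sketch, not part of i386.c: what
   ix86_expand_vec_extract_even_odd computes, as a scalar reference over
   byte arrays.  The selector built above is perm[i] = 2 * i + odd over the
   concatenation of OP0 and OP1.  Names are assumptions for illustration. */
#if 0
#include <stdint.h>

static void
model_extract_even_odd (uint8_t *out, const uint8_t *op0, const uint8_t *op1,
                        unsigned nelt, unsigned odd)
{
  unsigned i;

  for (i = 0; i < nelt; ++i)
    {
      unsigned src = 2 * i + odd;	/* d.perm[i] above */
      out[i] = src < nelt ? op0[src] : op1[src - nelt];
    }
}
#endif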
48103 static void
48104 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
48106 struct expand_vec_perm_d d;
48107 unsigned i, nelt, base;
48108 bool ok;
48110 d.target = targ;
48111 d.op0 = op0;
48112 d.op1 = op1;
48113 d.vmode = GET_MODE (targ);
48114 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48115 d.one_operand_p = false;
48116 d.testing_p = false;
48118 base = high_p ? nelt / 2 : 0;
48119 for (i = 0; i < nelt / 2; ++i)
48121 d.perm[i * 2] = i + base;
48122 d.perm[i * 2 + 1] = i + base + nelt;
48125 /* Note that for AVX this isn't one instruction. */
48126 ok = ix86_expand_vec_perm_const_1 (&d);
48127 gcc_assert (ok);
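/* Illustrative sketch, not part of i386.c: the selector
   ix86_expand_vec_interleave builds, in stand-alone form.  For nelt == 4
   it produces { 0, 4, 1, 5 } for the low half and { 2, 6, 3, 7 } for the
   high half, i.e. out[2*i] comes from OP0 and out[2*i+1] from OP1.  The
   function name is an assumption for illustration.  */
#if 0
#include <stdbool.h>

static void
model_interleave_selector (unsigned char *perm, unsigned nelt, bool high_p)
{
  unsigned i, base = high_p ? nelt / 2 : 0;

  for (i = 0; i < nelt / 2; ++i)
    {
      perm[2 * i] = base + i;			/* element of op0 */
      perm[2 * i + 1] = base + i + nelt;	/* matching element of op1 */
    }
}
#endif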
48131 /* Expand a vector operation CODE for a V*QImode in terms of the
48132 same operation on V*HImode. */
48134 void
48135 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
48137 machine_mode qimode = GET_MODE (dest);
48138 machine_mode himode;
48139 rtx (*gen_il) (rtx, rtx, rtx);
48140 rtx (*gen_ih) (rtx, rtx, rtx);
48141 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
48142 struct expand_vec_perm_d d;
48143 bool ok, full_interleave;
48144 bool uns_p = false;
48145 int i;
48147 switch (qimode)
48149 case V16QImode:
48150 himode = V8HImode;
48151 gen_il = gen_vec_interleave_lowv16qi;
48152 gen_ih = gen_vec_interleave_highv16qi;
48153 break;
48154 case V32QImode:
48155 himode = V16HImode;
48156 gen_il = gen_avx2_interleave_lowv32qi;
48157 gen_ih = gen_avx2_interleave_highv32qi;
48158 break;
48159 case V64QImode:
48160 himode = V32HImode;
48161 gen_il = gen_avx512bw_interleave_lowv64qi;
48162 gen_ih = gen_avx512bw_interleave_highv64qi;
48163 break;
48164 default:
48165 gcc_unreachable ();
48168 op2_l = op2_h = op2;
48169 switch (code)
48171 case MULT:
48172 /* Unpack data such that we've got a source byte in each low byte of
48173 each word. We don't care what goes into the high byte of each word.
48174 Rather than trying to get zero in there, it is most convenient to let
48175 it be a copy of the low byte. */
48176 op2_l = gen_reg_rtx (qimode);
48177 op2_h = gen_reg_rtx (qimode);
48178 emit_insn (gen_il (op2_l, op2, op2));
48179 emit_insn (gen_ih (op2_h, op2, op2));
48180 /* FALLTHRU */
48182 op1_l = gen_reg_rtx (qimode);
48183 op1_h = gen_reg_rtx (qimode);
48184 emit_insn (gen_il (op1_l, op1, op1));
48185 emit_insn (gen_ih (op1_h, op1, op1));
48186 full_interleave = qimode == V16QImode;
48187 break;
48189 case ASHIFT:
48190 case LSHIFTRT:
48191 uns_p = true;
48192 /* FALLTHRU */
48193 case ASHIFTRT:
48194 op1_l = gen_reg_rtx (himode);
48195 op1_h = gen_reg_rtx (himode);
48196 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48197 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48198 full_interleave = true;
48199 break;
48200 default:
48201 gcc_unreachable ();
48204 /* Perform the operation. */
48205 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48206 1, OPTAB_DIRECT);
48207 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48208 1, OPTAB_DIRECT);
48209 gcc_assert (res_l && res_h);
48211 /* Merge the data back into the right place. */
48212 d.target = dest;
48213 d.op0 = gen_lowpart (qimode, res_l);
48214 d.op1 = gen_lowpart (qimode, res_h);
48215 d.vmode = qimode;
48216 d.nelt = GET_MODE_NUNITS (qimode);
48217 d.one_operand_p = false;
48218 d.testing_p = false;
48220 if (full_interleave)
48222 /* For SSE2, we used a full interleave, so the desired
48223 results are in the even elements. */
48224 for (i = 0; i < d.nelt; ++i)
48225 d.perm[i] = i * 2;
48227 else
48229 /* For AVX, the interleave used above was not cross-lane. So the
48230 extraction is evens but with the second and third quarter swapped.
48231 Happily, that is even one insn shorter than even extraction.
48232 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48233 always first from the first and then from the second source operand,
48234 the index bits above the low 4 bits remain the same.
48235 Thus, for d.nelt == 32 we want permutation
48236 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48237 and for d.nelt == 64 we want permutation
48238 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48239 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48240 for (i = 0; i < d.nelt; ++i)
48241 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48244 ok = ix86_expand_vec_perm_const_1 (&d);
48245 gcc_assert (ok);
48247 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48248 gen_rtx_fmt_ee (code, qimode, op1, op2));
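/* Illustrative sketch, not part of i386.c: the effect of
   ix86_expand_vecop_qihi for MULT, as a scalar reference.  Each byte is
   widened to a 16-bit word, the operation is done on words, and only the
   low byte of each word is kept, which is what the interleave /
   extract-even dance above implements with vectors.  Names are assumptions
   for illustration.  */
#if 0
#include <stdint.h>

static void
model_mul_qi_via_hi (uint8_t *dst, const uint8_t *a, const uint8_t *b,
                     unsigned nelt)
{
  unsigned i;

  for (i = 0; i < nelt; ++i)
    {
      uint16_t wa = a[i], wb = b[i];	/* unpack byte to word */
      dst[i] = (uint8_t) (wa * wb);	/* operate on words, keep low byte */
    }
}
#endif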
48251 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48252 if op is CONST_VECTOR with all odd elements equal to their
48253 preceding element. */
48255 static bool
48256 const_vector_equal_evenodd_p (rtx op)
48258 machine_mode mode = GET_MODE (op);
48259 int i, nunits = GET_MODE_NUNITS (mode);
48260 if (GET_CODE (op) != CONST_VECTOR
48261 || nunits != CONST_VECTOR_NUNITS (op))
48262 return false;
48263 for (i = 0; i < nunits; i += 2)
48264 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48265 return false;
48266 return true;
48269 void
48270 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48271 bool uns_p, bool odd_p)
48273 machine_mode mode = GET_MODE (op1);
48274 machine_mode wmode = GET_MODE (dest);
48275 rtx x;
48276 rtx orig_op1 = op1, orig_op2 = op2;
48278 if (!nonimmediate_operand (op1, mode))
48279 op1 = force_reg (mode, op1);
48280 if (!nonimmediate_operand (op2, mode))
48281 op2 = force_reg (mode, op2);
48283 /* We only play even/odd games with vectors of SImode. */
48284 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48286 /* If we're looking for the odd results, shift those members down to
48287 the even slots. For some cpus this is faster than a PSHUFD. */
48288 if (odd_p)
48290 /* For XOP use vpmacsdqh, but only for smult, as it is only
48291 signed. */
48292 if (TARGET_XOP && mode == V4SImode && !uns_p)
48294 x = force_reg (wmode, CONST0_RTX (wmode));
48295 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48296 return;
48299 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48300 if (!const_vector_equal_evenodd_p (orig_op1))
48301 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48302 x, NULL, 1, OPTAB_DIRECT);
48303 if (!const_vector_equal_evenodd_p (orig_op2))
48304 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48305 x, NULL, 1, OPTAB_DIRECT);
48306 op1 = gen_lowpart (mode, op1);
48307 op2 = gen_lowpart (mode, op2);
48310 if (mode == V16SImode)
48312 if (uns_p)
48313 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48314 else
48315 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48317 else if (mode == V8SImode)
48319 if (uns_p)
48320 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48321 else
48322 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48324 else if (uns_p)
48325 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48326 else if (TARGET_SSE4_1)
48327 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48328 else
48330 rtx s1, s2, t0, t1, t2;
48332 /* The easiest way to implement this without PMULDQ is to go through
48333 the motions as if we were performing a full 64-bit multiply, except
48334 that we need to do less shuffling of the elements. */
48336 /* Compute the sign-extension, aka highparts, of the two operands. */
48337 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48338 op1, pc_rtx, pc_rtx);
48339 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48340 op2, pc_rtx, pc_rtx);
48342 /* Multiply LO(A) * HI(B), and vice-versa. */
48343 t1 = gen_reg_rtx (wmode);
48344 t2 = gen_reg_rtx (wmode);
48345 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48346 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48348 /* Multiply LO(A) * LO(B). */
48349 t0 = gen_reg_rtx (wmode);
48350 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48352 /* Combine and shift the highparts into place. */
48353 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48354 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48355 1, OPTAB_DIRECT);
48357 /* Combine high and low parts. */
48358 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48359 return;
48361 emit_insn (x);
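/* Illustrative sketch, not part of i386.c: the arithmetic identity behind
   the non-PMULDQ fallback above.  With sa = (a < 0) and sb = (b < 0),
     (int64) a * (int64) b
       == ua * ub - ((sa * ub + sb * ua) << 32)      (mod 2^64),
   which the expander realizes with all-ones sign masks (s1, s2) and
   unsigned even multiplies.  Below is a scalar verification of that
   identity; the function name is an assumption for illustration.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
check_signed_widen_mul (int32_t a, int32_t b)
{
  uint64_t ua = (uint32_t) a, ub = (uint32_t) b;
  uint64_t hi_fix = ((a < 0 ? ub : 0) + (b < 0 ? ua : 0)) << 32;

  assert (ua * ub - hi_fix == (uint64_t) ((int64_t) a * (int64_t) b));
}
#endif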
48364 void
48365 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48366 bool uns_p, bool high_p)
48368 machine_mode wmode = GET_MODE (dest);
48369 machine_mode mode = GET_MODE (op1);
48370 rtx t1, t2, t3, t4, mask;
48372 switch (mode)
48374 case V4SImode:
48375 t1 = gen_reg_rtx (mode);
48376 t2 = gen_reg_rtx (mode);
48377 if (TARGET_XOP && !uns_p)
48379 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48380 shuffle the elements once so that all elements are in the right
48381 place for immediate use: { A C B D }. */
48382 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48383 const1_rtx, GEN_INT (3)));
48384 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48385 const1_rtx, GEN_INT (3)));
48387 else
48389 /* Put the elements into place for the multiply. */
48390 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48391 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48392 high_p = false;
48394 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48395 break;
48397 case V8SImode:
48398 /* Shuffle the elements between the lanes. After this we
48399 have { A B E F | C D G H } for each operand. */
48400 t1 = gen_reg_rtx (V4DImode);
48401 t2 = gen_reg_rtx (V4DImode);
48402 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48403 const0_rtx, const2_rtx,
48404 const1_rtx, GEN_INT (3)));
48405 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48406 const0_rtx, const2_rtx,
48407 const1_rtx, GEN_INT (3)));
48409 /* Shuffle the elements within the lanes. After this we
48410 have { A A B B | C C D D } or { E E F F | G G H H }. */
48411 t3 = gen_reg_rtx (V8SImode);
48412 t4 = gen_reg_rtx (V8SImode);
48413 mask = GEN_INT (high_p
48414 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48415 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48416 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48417 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48419 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48420 break;
48422 case V8HImode:
48423 case V16HImode:
48424 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48425 uns_p, OPTAB_DIRECT);
48426 t2 = expand_binop (mode,
48427 uns_p ? umul_highpart_optab : smul_highpart_optab,
48428 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48429 gcc_assert (t1 && t2);
48431 t3 = gen_reg_rtx (mode);
48432 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48433 emit_move_insn (dest, gen_lowpart (wmode, t3));
48434 break;
48436 case V16QImode:
48437 case V32QImode:
48438 case V32HImode:
48439 case V16SImode:
48440 case V64QImode:
48441 t1 = gen_reg_rtx (wmode);
48442 t2 = gen_reg_rtx (wmode);
48443 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48444 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48446 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48447 break;
48449 default:
48450 gcc_unreachable ();
48454 void
48455 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48457 rtx res_1, res_2, res_3, res_4;
48459 res_1 = gen_reg_rtx (V4SImode);
48460 res_2 = gen_reg_rtx (V4SImode);
48461 res_3 = gen_reg_rtx (V2DImode);
48462 res_4 = gen_reg_rtx (V2DImode);
48463 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48464 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48466 /* Move the results in element 2 down to element 1; we don't care
48467 what goes in elements 2 and 3. Then we can merge the parts
48468 back together with an interleave.
48470 Note that two other sequences were tried:
48471 (1) Use interleaves at the start instead of psrldq, which allows
48472 us to use a single shufps to merge things back at the end.
48473 (2) Use shufps here to combine the two vectors, then pshufd to
48474 put the elements in the correct order.
48475 In both cases the cost of the reformatting stall was too high
48476 and the overall sequence slower. */
48478 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48479 const0_rtx, const2_rtx,
48480 const0_rtx, const0_rtx));
48481 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48482 const0_rtx, const2_rtx,
48483 const0_rtx, const0_rtx));
48484 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48486 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48489 void
48490 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48492 machine_mode mode = GET_MODE (op0);
48493 rtx t1, t2, t3, t4, t5, t6;
48495 if (TARGET_AVX512DQ && mode == V8DImode)
48496 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48497 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48498 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48499 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48500 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48501 else if (TARGET_XOP && mode == V2DImode)
48503 /* op1: A,B,C,D, op2: E,F,G,H */
48504 op1 = gen_lowpart (V4SImode, op1);
48505 op2 = gen_lowpart (V4SImode, op2);
48507 t1 = gen_reg_rtx (V4SImode);
48508 t2 = gen_reg_rtx (V4SImode);
48509 t3 = gen_reg_rtx (V2DImode);
48510 t4 = gen_reg_rtx (V2DImode);
48512 /* t1: B,A,D,C */
48513 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48514 GEN_INT (1),
48515 GEN_INT (0),
48516 GEN_INT (3),
48517 GEN_INT (2)));
48519 /* t2: (B*E),(A*F),(D*G),(C*H) */
48520 emit_insn (gen_mulv4si3 (t2, t1, op2));
48522 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48523 emit_insn (gen_xop_phadddq (t3, t2));
48525 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48526 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48528 /* Multiply lower parts and add all */
48529 t5 = gen_reg_rtx (V2DImode);
48530 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48531 gen_lowpart (V4SImode, op1),
48532 gen_lowpart (V4SImode, op2)));
48533 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48536 else
48538 machine_mode nmode;
48539 rtx (*umul) (rtx, rtx, rtx);
48541 if (mode == V2DImode)
48543 umul = gen_vec_widen_umult_even_v4si;
48544 nmode = V4SImode;
48546 else if (mode == V4DImode)
48548 umul = gen_vec_widen_umult_even_v8si;
48549 nmode = V8SImode;
48551 else if (mode == V8DImode)
48553 umul = gen_vec_widen_umult_even_v16si;
48554 nmode = V16SImode;
48556 else
48557 gcc_unreachable ();
48560 /* Multiply low parts. */
48561 t1 = gen_reg_rtx (mode);
48562 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48564 /* Shift input vectors right 32 bits so we can multiply high parts. */
48565 t6 = GEN_INT (32);
48566 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48567 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48569 /* Multiply high parts by low parts. */
48570 t4 = gen_reg_rtx (mode);
48571 t5 = gen_reg_rtx (mode);
48572 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48573 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48575 /* Combine and shift the highparts back. */
48576 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48577 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48579 /* Combine high and low parts. */
48580 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48583 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48584 gen_rtx_MULT (mode, op1, op2));
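/* Illustrative sketch, not part of i386.c: the identity the generic
   DImode-element path above relies on.  Writing a = ahi * 2^32 + alo and
   b = bhi * 2^32 + blo,
     a * b == alo * blo + ((ahi * blo + alo * bhi) << 32)   (mod 2^64);
   the ahi * bhi term vanishes modulo 2^64.  Below is a scalar
   verification; the function name is an assumption for illustration.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
check_mul64_from_halves (uint64_t a, uint64_t b)
{
  uint64_t alo = (uint32_t) a, ahi = a >> 32;
  uint64_t blo = (uint32_t) b, bhi = b >> 32;

  assert (alo * blo + ((ahi * blo + alo * bhi) << 32) == a * b);
}
#endif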
48587 /* Return 1 if control transfer instruction INSN
48588 should be encoded with a bnd prefix.
48589 If insn is NULL then return 1 when control
48590 transfer instructions should be prefixed with
48591 bnd by default for current function. */
48593 bool
48594 ix86_bnd_prefixed_insn_p (rtx insn)
48596 /* For call insns check special flag. */
48597 if (insn && CALL_P (insn))
48599 rtx call = get_call_rtx_from (insn);
48600 if (call)
48601 return CALL_EXPR_WITH_BOUNDS_P (call);
48604 /* All other insns are prefixed only if function is instrumented. */
48605 return chkp_function_instrumented_p (current_function_decl);
48608 /* Calculate integer abs() using only SSE2 instructions. */
48610 void
48611 ix86_expand_sse2_abs (rtx target, rtx input)
48613 machine_mode mode = GET_MODE (target);
48614 rtx tmp0, tmp1, x;
48616 switch (mode)
48618 /* For 32-bit signed integer X, the best way to calculate the absolute
48619 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
48620 case V4SImode:
48621 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48622 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48623 NULL, 0, OPTAB_DIRECT);
48624 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48625 NULL, 0, OPTAB_DIRECT);
48626 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48627 target, 0, OPTAB_DIRECT);
48628 break;
48630 /* For 16-bit signed integer X, the best way to calculate the absolute
48631 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48632 case V8HImode:
48633 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48635 x = expand_simple_binop (mode, SMAX, tmp0, input,
48636 target, 0, OPTAB_DIRECT);
48637 break;
48639 /* For 8-bit signed integer X, the best way to calculate the absolute
48640 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48641 as SSE2 provides the PMINUB insn. */
48642 case V16QImode:
48643 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48645 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48646 target, 0, OPTAB_DIRECT);
48647 break;
48649 default:
48650 gcc_unreachable ();
48653 if (x != target)
48654 emit_move_insn (target, x);
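/* Illustrative sketch, not part of i386.c: scalar checks of the three
   absolute-value identities used above.  Assumes two's complement and an
   arithmetic right shift on signed values, as GCC guarantees for x86; the
   INT*_MIN inputs are skipped because abs () overflows for them.  The
   function name is an assumption for illustration.  */
#if 0
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>

static void
check_abs_identities (int32_t x, int16_t h, int8_t q)
{
  /* V4SImode: ((x >> 31) ^ x) - (x >> 31).  */
  if (x != INT32_MIN)
    {
      int32_t s = x >> 31;
      assert ((s ^ x) - s == abs (x));
    }

  /* V8HImode: max (h, -h), via PMAXSW.  */
  if (h != INT16_MIN)
    assert ((h > -h ? h : -h) == abs (h));

  /* V16QImode: min ((unsigned char) q, (unsigned char) -q), via PMINUB.  */
  if (q != INT8_MIN)
    {
      uint8_t uq = (uint8_t) q, unq = (uint8_t) -q;
      assert ((uq < unq ? uq : unq) == abs (q));
    }
}
#endif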
48657 /* Expand an extract from a vector register through pextr insn.
48658 Return true if successful. */
48660 bool
48661 ix86_expand_pextr (rtx *operands)
48663 rtx dst = operands[0];
48664 rtx src = operands[1];
48666 unsigned int size = INTVAL (operands[2]);
48667 unsigned int pos = INTVAL (operands[3]);
48669 if (SUBREG_P (dst))
48671 /* Reject non-lowpart subregs. */
48672 if (SUBREG_BYTE (dst) > 0)
48673 return false;
48674 dst = SUBREG_REG (dst);
48677 if (SUBREG_P (src))
48679 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48680 src = SUBREG_REG (src);
48683 switch (GET_MODE (src))
48685 case V16QImode:
48686 case V8HImode:
48687 case V4SImode:
48688 case V2DImode:
48689 case V1TImode:
48690 case TImode:
48692 machine_mode srcmode, dstmode;
48693 rtx d, pat;
48695 dstmode = mode_for_size (size, MODE_INT, 0);
48697 switch (dstmode)
48699 case QImode:
48700 if (!TARGET_SSE4_1)
48701 return false;
48702 srcmode = V16QImode;
48703 break;
48705 case HImode:
48706 if (!TARGET_SSE2)
48707 return false;
48708 srcmode = V8HImode;
48709 break;
48711 case SImode:
48712 if (!TARGET_SSE4_1)
48713 return false;
48714 srcmode = V4SImode;
48715 break;
48717 case DImode:
48718 gcc_assert (TARGET_64BIT);
48719 if (!TARGET_SSE4_1)
48720 return false;
48721 srcmode = V2DImode;
48722 break;
48724 default:
48725 return false;
48728 /* Reject extractions from misaligned positions. */
48729 if (pos & (size-1))
48730 return false;
48732 if (GET_MODE (dst) == dstmode)
48733 d = dst;
48734 else
48735 d = gen_reg_rtx (dstmode);
48737 /* Construct insn pattern. */
48738 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48739 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48741 /* Let the rtl optimizers know about the zero extension performed. */
48742 if (dstmode == QImode || dstmode == HImode)
48744 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48745 d = gen_lowpart (SImode, d);
48748 emit_insn (gen_rtx_SET (d, pat));
48750 if (d != dst)
48751 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48752 return true;
48755 default:
48756 return false;
48760 /* Expand an insert into a vector register through pinsr insn.
48761 Return true if successful. */
48763 bool
48764 ix86_expand_pinsr (rtx *operands)
48766 rtx dst = operands[0];
48767 rtx src = operands[3];
48769 unsigned int size = INTVAL (operands[1]);
48770 unsigned int pos = INTVAL (operands[2]);
48772 if (SUBREG_P (dst))
48774 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48775 dst = SUBREG_REG (dst);
48778 switch (GET_MODE (dst))
48780 case V16QImode:
48781 case V8HImode:
48782 case V4SImode:
48783 case V2DImode:
48784 case V1TImode:
48785 case TImode:
48787 machine_mode srcmode, dstmode;
48788 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48789 rtx d;
48791 srcmode = mode_for_size (size, MODE_INT, 0);
48793 switch (srcmode)
48795 case QImode:
48796 if (!TARGET_SSE4_1)
48797 return false;
48798 dstmode = V16QImode;
48799 pinsr = gen_sse4_1_pinsrb;
48800 break;
48802 case HImode:
48803 if (!TARGET_SSE2)
48804 return false;
48805 dstmode = V8HImode;
48806 pinsr = gen_sse2_pinsrw;
48807 break;
48809 case SImode:
48810 if (!TARGET_SSE4_1)
48811 return false;
48812 dstmode = V4SImode;
48813 pinsr = gen_sse4_1_pinsrd;
48814 break;
48816 case DImode:
48817 gcc_assert (TARGET_64BIT);
48818 if (!TARGET_SSE4_1)
48819 return false;
48820 dstmode = V2DImode;
48821 pinsr = gen_sse4_1_pinsrq;
48822 break;
48824 default:
48825 return false;
48828 /* Reject insertions to misaligned positions. */
48829 if (pos & (size-1))
48830 return false;
48832 if (SUBREG_P (src))
48834 unsigned int srcpos = SUBREG_BYTE (src);
48836 if (srcpos > 0)
48838 rtx extr_ops[4];
48840 extr_ops[0] = gen_reg_rtx (srcmode);
48841 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48842 extr_ops[2] = GEN_INT (size);
48843 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48845 if (!ix86_expand_pextr (extr_ops))
48846 return false;
48848 src = extr_ops[0];
48850 else
48851 src = gen_lowpart (srcmode, SUBREG_REG (src));
48854 if (GET_MODE (dst) == dstmode)
48855 d = dst;
48856 else
48857 d = gen_reg_rtx (dstmode);
48859 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48860 gen_lowpart (srcmode, src),
48861 GEN_INT (1 << (pos / size))));
48862 if (d != dst)
48863 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48864 return true;
48867 default:
48868 return false;
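/* Illustrative sketch, not part of i386.c: pextr/pinsr modeled on a 64-bit
   integer treated as a vector of 16-bit lanes.  The lane index is
   POS / SIZE, and POS must be a multiple of SIZE, just as the expanders
   above reject misaligned positions.  Names are assumptions for
   illustration.  */
#if 0
#include <stdint.h>

static uint16_t
model_pextrw (uint64_t vec, unsigned pos)	/* POS in bits, 16-aligned */
{
  return (uint16_t) (vec >> pos);
}

static uint64_t
model_pinsrw (uint64_t vec, uint16_t val, unsigned pos)
{
  uint64_t mask = (uint64_t) 0xffff << pos;

  return (vec & ~mask) | ((uint64_t) val << pos);
}
#endif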
48872 /* This function returns the calling abi specific va_list type node.
48873 It returns the FNDECL specific va_list type. */
48875 static tree
48876 ix86_fn_abi_va_list (tree fndecl)
48878 if (!TARGET_64BIT)
48879 return va_list_type_node;
48880 gcc_assert (fndecl != NULL_TREE);
48882 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48883 return ms_va_list_type_node;
48884 else
48885 return sysv_va_list_type_node;
48888 /* Returns the canonical va_list type specified by TYPE. If there
48889 is no valid TYPE provided, it returns NULL_TREE. */
48891 static tree
48892 ix86_canonical_va_list_type (tree type)
48894 if (TARGET_64BIT)
48896 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48897 return ms_va_list_type_node;
48899 if ((TREE_CODE (type) == ARRAY_TYPE
48900 && integer_zerop (array_type_nelts (type)))
48901 || POINTER_TYPE_P (type))
48903 tree elem_type = TREE_TYPE (type);
48904 if (TREE_CODE (elem_type) == RECORD_TYPE
48905 && lookup_attribute ("sysv_abi va_list",
48906 TYPE_ATTRIBUTES (elem_type)))
48907 return sysv_va_list_type_node;
48910 return NULL_TREE;
48913 return std_canonical_va_list_type (type);
48916 /* Iterate through the target-specific builtin types for va_list.
48917 IDX denotes the iterator, *PTREE is set to the result type of
48918 the va_list builtin, and *PNAME to its internal type.
48919 Returns zero if there is no element for this index, otherwise
48920 IDX should be increased upon the next call.
48921 Note, do not iterate a base builtin's name like __builtin_va_list.
48922 Used from c_common_nodes_and_builtins. */
48924 static int
48925 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48927 if (TARGET_64BIT)
48929 switch (idx)
48931 default:
48932 break;
48934 case 0:
48935 *ptree = ms_va_list_type_node;
48936 *pname = "__builtin_ms_va_list";
48937 return 1;
48939 case 1:
48940 *ptree = sysv_va_list_type_node;
48941 *pname = "__builtin_sysv_va_list";
48942 return 1;
48946 return 0;
48949 #undef TARGET_SCHED_DISPATCH
48950 #define TARGET_SCHED_DISPATCH has_dispatch
48951 #undef TARGET_SCHED_DISPATCH_DO
48952 #define TARGET_SCHED_DISPATCH_DO do_dispatch
48953 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48954 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48955 #undef TARGET_SCHED_REORDER
48956 #define TARGET_SCHED_REORDER ix86_sched_reorder
48957 #undef TARGET_SCHED_ADJUST_PRIORITY
48958 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48959 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48960 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48961 ix86_dependencies_evaluation_hook
48963 /* The size of the dispatch window is the total number of bytes of
48964 object code allowed in a window. */
48965 #define DISPATCH_WINDOW_SIZE 16
48967 /* Number of dispatch windows considered for scheduling. */
48968 #define MAX_DISPATCH_WINDOWS 3
48970 /* Maximum number of instructions in a window. */
48971 #define MAX_INSN 4
48973 /* Maximum number of immediate operands in a window. */
48974 #define MAX_IMM 4
48976 /* Maximum number of immediate bits allowed in a window. */
48977 #define MAX_IMM_SIZE 128
48979 /* Maximum number of 32 bit immediates allowed in a window. */
48980 #define MAX_IMM_32 4
48982 /* Maximum number of 64 bit immediates allowed in a window. */
48983 #define MAX_IMM_64 2
48985 /* Maximum total of loads or prefetches allowed in a window. */
48986 #define MAX_LOAD 2
48988 /* Maximum total of stores allowed in a window. */
48989 #define MAX_STORE 1
48991 #undef BIG
48992 #define BIG 100
48995 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
48996 enum dispatch_group {
48997 disp_no_group = 0,
48998 disp_load,
48999 disp_store,
49000 disp_load_store,
49001 disp_prefetch,
49002 disp_imm,
49003 disp_imm_32,
49004 disp_imm_64,
49005 disp_branch,
49006 disp_cmp,
49007 disp_jcc,
49008 disp_last
49011 /* Number of allowable groups in a dispatch window. It is an array
49012 indexed by the dispatch_group enum. 100 is used as a big number,
49013 because the number of these kinds of operations does not have any
49014 effect in a dispatch window, but we need them for other reasons in
49015 the table. */
49016 static unsigned int num_allowable_groups[disp_last] = {
49017 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
49020 char group_name[disp_last + 1][16] = {
49021 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
49022 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
49023 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
49026 /* Instruction path. */
49027 enum insn_path {
49028 no_path = 0,
49029 path_single, /* Single micro op. */
49030 path_double, /* Double micro op. */
49031 path_multi, /* Instructions with more than 2 micro ops. */
49032 last_path
49035 /* sched_insn_info defines a window to the instructions scheduled in
49036 the basic block. It contains a pointer to the insn_info table and
49037 the instruction scheduled.
49039 Windows are allocated for each basic block and are linked
49040 together. */
49041 typedef struct sched_insn_info_s {
49042 rtx insn;
49043 enum dispatch_group group;
49044 enum insn_path path;
49045 int byte_len;
49046 int imm_bytes;
49047 } sched_insn_info;
49049 /* Linked list of dispatch windows. This is a two way list of
49050 dispatch windows of a basic block. It contains information about
49051 the number of uops in the window and the total number of
49052 instructions and of bytes in the object code for this dispatch
49053 window. */
49054 typedef struct dispatch_windows_s {
49055 int num_insn; /* Number of insn in the window. */
49056 int num_uops; /* Number of uops in the window. */
49057 int window_size; /* Number of bytes in the window. */
49058 int window_num; /* Window number, 0 or 1. */
49059 int num_imm; /* Number of immediates in an insn. */
49060 int num_imm_32; /* Number of 32 bit immediates in an insn. */
49061 int num_imm_64; /* Number of 64 bit immediates in an insn. */
49062 int imm_size; /* Total immediates in the window. */
49063 int num_loads; /* Total memory loads in the window. */
49064 int num_stores; /* Total memory stores in the window. */
49065 int violation; /* Violation exists in window. */
49066 sched_insn_info *window; /* Pointer to the window. */
49067 struct dispatch_windows_s *next;
49068 struct dispatch_windows_s *prev;
49069 } dispatch_windows;
49071 /* Immediate values used in an insn. */
49072 typedef struct imm_info_s
49074 int imm;
49075 int imm32;
49076 int imm64;
49077 } imm_info;
49079 static dispatch_windows *dispatch_window_list;
49080 static dispatch_windows *dispatch_window_list1;
49082 /* Get dispatch group of insn. */
49084 static enum dispatch_group
49085 get_mem_group (rtx_insn *insn)
49087 enum attr_memory memory;
49089 if (INSN_CODE (insn) < 0)
49090 return disp_no_group;
49091 memory = get_attr_memory (insn);
49092 if (memory == MEMORY_STORE)
49093 return disp_store;
49095 if (memory == MEMORY_LOAD)
49096 return disp_load;
49098 if (memory == MEMORY_BOTH)
49099 return disp_load_store;
49101 return disp_no_group;
49104 /* Return true if insn is a compare instruction. */
49106 static bool
49107 is_cmp (rtx_insn *insn)
49109 enum attr_type type;
49111 type = get_attr_type (insn);
49112 return (type == TYPE_TEST
49113 || type == TYPE_ICMP
49114 || type == TYPE_FCMP
49115 || GET_CODE (PATTERN (insn)) == COMPARE);
49118 /* Return true if a dispatch violation encountered. */
49120 static bool
49121 dispatch_violation (void)
49123 if (dispatch_window_list->next)
49124 return dispatch_window_list->next->violation;
49125 return dispatch_window_list->violation;
49128 /* Return true if insn is a branch instruction. */
49130 static bool
49131 is_branch (rtx_insn *insn)
49133 return (CALL_P (insn) || JUMP_P (insn));
49136 /* Return true if insn is a prefetch instruction. */
49138 static bool
49139 is_prefetch (rtx_insn *insn)
49141 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
49144 /* This function initializes a dispatch window and the list container holding a
49145 pointer to the window. */
49147 static void
49148 init_window (int window_num)
49150 int i;
49151 dispatch_windows *new_list;
49153 if (window_num == 0)
49154 new_list = dispatch_window_list;
49155 else
49156 new_list = dispatch_window_list1;
49158 new_list->num_insn = 0;
49159 new_list->num_uops = 0;
49160 new_list->window_size = 0;
49161 new_list->next = NULL;
49162 new_list->prev = NULL;
49163 new_list->window_num = window_num;
49164 new_list->num_imm = 0;
49165 new_list->num_imm_32 = 0;
49166 new_list->num_imm_64 = 0;
49167 new_list->imm_size = 0;
49168 new_list->num_loads = 0;
49169 new_list->num_stores = 0;
49170 new_list->violation = false;
49172 for (i = 0; i < MAX_INSN; i++)
49174 new_list->window[i].insn = NULL;
49175 new_list->window[i].group = disp_no_group;
49176 new_list->window[i].path = no_path;
49177 new_list->window[i].byte_len = 0;
49178 new_list->window[i].imm_bytes = 0;
49180 return;
49183 /* This function allocates and initializes a dispatch window and the
49184 list container holding a pointer to the window. */
49186 static dispatch_windows *
49187 allocate_window (void)
49189 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
49190 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
49192 return new_list;
49195 /* This routine initializes the dispatch scheduling information. It
49196 initiates building dispatch scheduler tables and constructs the
49197 first dispatch window. */
49199 static void
49200 init_dispatch_sched (void)
49202 /* Allocate a dispatch list and a window. */
49203 dispatch_window_list = allocate_window ();
49204 dispatch_window_list1 = allocate_window ();
49205 init_window (0);
49206 init_window (1);
49209 /* This function returns true if a branch is detected. End of a basic block
49210 does not have to be a branch, but here we assume only branches end a
49211 window. */
49213 static bool
49214 is_end_basic_block (enum dispatch_group group)
49216 return group == disp_branch;
49219 /* This function is called when the end of a window processing is reached. */
49221 static void
49222 process_end_window (void)
49224 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
49225 if (dispatch_window_list->next)
49227 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
49228 gcc_assert (dispatch_window_list->window_size
49229 + dispatch_window_list1->window_size <= 48);
49230 init_window (1);
49232 init_window (0);
49235 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
49236 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
49237 for 48 bytes of instructions. Note that these windows are not dispatch
49238 windows whose sizes are DISPATCH_WINDOW_SIZE. */
49240 static dispatch_windows *
49241 allocate_next_window (int window_num)
49243 if (window_num == 0)
49245 if (dispatch_window_list->next)
49246 init_window (1);
49247 init_window (0);
49248 return dispatch_window_list;
49251 dispatch_window_list->next = dispatch_window_list1;
49252 dispatch_window_list1->prev = dispatch_window_list;
49254 return dispatch_window_list1;
49257 /* Compute number of immediate operands of an instruction. */
49259 static void
49260 find_constant (rtx in_rtx, imm_info *imm_values)
49262 if (INSN_P (in_rtx))
49263 in_rtx = PATTERN (in_rtx);
49264 subrtx_iterator::array_type array;
49265 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
49266 if (const_rtx x = *iter)
49267 switch (GET_CODE (x))
49269 case CONST:
49270 case SYMBOL_REF:
49271 case CONST_INT:
49272 (imm_values->imm)++;
49273 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
49274 (imm_values->imm32)++;
49275 else
49276 (imm_values->imm64)++;
49277 break;
49279 case CONST_DOUBLE:
49280 case CONST_WIDE_INT:
49281 (imm_values->imm)++;
49282 (imm_values->imm64)++;
49283 break;
49285 case CODE_LABEL:
49286 if (LABEL_KIND (x) == LABEL_NORMAL)
49288 (imm_values->imm)++;
49289 (imm_values->imm32)++;
49291 break;
49293 default:
49294 break;
49298 /* Return total size of immediate operands of an instruction along with number
49299 of corresponding immediate-operands. It initializes its parameters to zero
49300 before calling FIND_CONSTANT.
49301 INSN is the input instruction. IMM is the total of immediates.
49302 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
49303 bit immediates. */
49305 static int
49306 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
49308 imm_info imm_values = {0, 0, 0};
49310 find_constant (insn, &imm_values);
49311 *imm = imm_values.imm;
49312 *imm32 = imm_values.imm32;
49313 *imm64 = imm_values.imm64;
49314 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
49317 /* This function indicates whether an instruction has an immediate
49318 operand. */
49320 static bool
49321 has_immediate (rtx_insn *insn)
49323 int num_imm_operand;
49324 int num_imm32_operand;
49325 int num_imm64_operand;
49327 if (insn)
49328 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49329 &num_imm64_operand);
49330 return false;
49333 /* Return single or double path for instructions. */
49335 static enum insn_path
49336 get_insn_path (rtx_insn *insn)
49338 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
49340 if ((int)path == 0)
49341 return path_single;
49343 if ((int)path == 1)
49344 return path_double;
49346 return path_multi;
49349 /* Return insn dispatch group. */
49351 static enum dispatch_group
49352 get_insn_group (rtx_insn *insn)
49354 enum dispatch_group group = get_mem_group (insn);
49355 if (group)
49356 return group;
49358 if (is_branch (insn))
49359 return disp_branch;
49361 if (is_cmp (insn))
49362 return disp_cmp;
49364 if (has_immediate (insn))
49365 return disp_imm;
49367 if (is_prefetch (insn))
49368 return disp_prefetch;
49370 return disp_no_group;
49373 /* Count number of GROUP restricted instructions in a dispatch
49374 window WINDOW_LIST. */
49376 static int
49377 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
49379 enum dispatch_group group = get_insn_group (insn);
49380 int imm_size;
49381 int num_imm_operand;
49382 int num_imm32_operand;
49383 int num_imm64_operand;
49385 if (group == disp_no_group)
49386 return 0;
49388 if (group == disp_imm)
49390 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49391 &num_imm64_operand);
49392 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
49393 || num_imm_operand + window_list->num_imm > MAX_IMM
49394 || (num_imm32_operand > 0
49395 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
49396 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
49397 || (num_imm64_operand > 0
49398 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
49399 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
49400 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
49401 && num_imm64_operand > 0
49402 && ((window_list->num_imm_64 > 0
49403 && window_list->num_insn >= 2)
49404 || window_list->num_insn >= 3)))
49405 return BIG;
49407 return 1;
49410 if ((group == disp_load_store
49411 && (window_list->num_loads >= MAX_LOAD
49412 || window_list->num_stores >= MAX_STORE))
49413 || ((group == disp_load
49414 || group == disp_prefetch)
49415 && window_list->num_loads >= MAX_LOAD)
49416 || (group == disp_store
49417 && window_list->num_stores >= MAX_STORE))
49418 return BIG;
49420 return 1;
49423 /* This function returns true if insn satisfies dispatch rules on the
49424 last window scheduled. */
49426 static bool
49427 fits_dispatch_window (rtx_insn *insn)
49429 dispatch_windows *window_list = dispatch_window_list;
49430 dispatch_windows *window_list_next = dispatch_window_list->next;
49431 unsigned int num_restrict;
49432 enum dispatch_group group = get_insn_group (insn);
49433 enum insn_path path = get_insn_path (insn);
49434 int sum;
49436 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
49437 instructions should be given the lowest priority in the
49438 scheduling process in Haifa scheduler to make sure they will be
49439 scheduled in the same dispatch window as the reference to them. */
49440 if (group == disp_jcc || group == disp_cmp)
49441 return false;
49443 /* Check nonrestricted. */
49444 if (group == disp_no_group || group == disp_branch)
49445 return true;
49447 /* Get last dispatch window. */
49448 if (window_list_next)
49449 window_list = window_list_next;
49451 if (window_list->window_num == 1)
49453 sum = window_list->prev->window_size + window_list->window_size;
49455 if (sum == 32
49456 || (min_insn_size (insn) + sum) >= 48)
49457 /* Window 1 is full. Go for next window. */
49458 return true;
49461 num_restrict = count_num_restricted (insn, window_list);
49463 if (num_restrict > num_allowable_groups[group])
49464 return false;
49466 /* See if it fits in the first window. */
49467 if (window_list->window_num == 0)
49469 /* The first window should have only single and double path
49470 uops. */
49471 if (path == path_double
49472 && (window_list->num_uops + 2) > MAX_INSN)
49473 return false;
49474 else if (path != path_single)
49475 return false;
49477 return true;
49480 /* Add an instruction INSN with NUM_UOPS micro-operations to the
49481 dispatch window WINDOW_LIST. */
49483 static void
49484 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
49486 int byte_len = min_insn_size (insn);
49487 int num_insn = window_list->num_insn;
49488 int imm_size;
49489 sched_insn_info *window = window_list->window;
49490 enum dispatch_group group = get_insn_group (insn);
49491 enum insn_path path = get_insn_path (insn);
49492 int num_imm_operand;
49493 int num_imm32_operand;
49494 int num_imm64_operand;
49496 if (!window_list->violation && group != disp_cmp
49497 && !fits_dispatch_window (insn))
49498 window_list->violation = true;
49500 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49501 &num_imm64_operand);
49503 /* Initialize window with new instruction. */
49504 window[num_insn].insn = insn;
49505 window[num_insn].byte_len = byte_len;
49506 window[num_insn].group = group;
49507 window[num_insn].path = path;
49508 window[num_insn].imm_bytes = imm_size;
49510 window_list->window_size += byte_len;
49511 window_list->num_insn = num_insn + 1;
49512 window_list->num_uops = window_list->num_uops + num_uops;
49513 window_list->imm_size += imm_size;
49514 window_list->num_imm += num_imm_operand;
49515 window_list->num_imm_32 += num_imm32_operand;
49516 window_list->num_imm_64 += num_imm64_operand;
49518 if (group == disp_store)
49519 window_list->num_stores += 1;
49520 else if (group == disp_load
49521 || group == disp_prefetch)
49522 window_list->num_loads += 1;
49523 else if (group == disp_load_store)
49525 window_list->num_stores += 1;
49526 window_list->num_loads += 1;
49530 /* Adds a scheduled instruction, INSN, to the current dispatch window.
49531 If the total bytes of instructions or the number of instructions in
49532 the window exceed the allowable limits, it allocates a new window. */
49534 static void
49535 add_to_dispatch_window (rtx_insn *insn)
49537 int byte_len;
49538 dispatch_windows *window_list;
49539 dispatch_windows *next_list;
49540 dispatch_windows *window0_list;
49541 enum insn_path path;
49542 enum dispatch_group insn_group;
49543 bool insn_fits;
49544 int num_insn;
49545 int num_uops;
49546 int window_num;
49547 int insn_num_uops;
49548 int sum;
49550 if (INSN_CODE (insn) < 0)
49551 return;
49553 byte_len = min_insn_size (insn);
49554 window_list = dispatch_window_list;
49555 next_list = window_list->next;
49556 path = get_insn_path (insn);
49557 insn_group = get_insn_group (insn);
49559 /* Get the last dispatch window. */
49560 if (next_list)
49561 window_list = dispatch_window_list->next;
49563 if (path == path_single)
49564 insn_num_uops = 1;
49565 else if (path == path_double)
49566 insn_num_uops = 2;
49567 else
49568 insn_num_uops = (int) path;
49570 /* If the current window is full, get a new window.
49571 Window number zero is full if MAX_INSN uops are scheduled in it.
49572 Window number one is full if window zero's bytes plus window
49573 one's bytes total 32, or if adding the new instruction's bytes
49574 would push the total to 48 or more, or if it already has MAX_INSN
49575 instructions in it. */
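/* Worked example (illustrative): if window 0 and window 1 together
   already hold 32 bytes, window 1 is full; likewise, if they hold
   44 bytes and the next instruction is 6 bytes long, 44 + 6 >= 48
   also forces a new window pair, matching the checks below.  */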
49576 num_insn = window_list->num_insn;
49577 num_uops = window_list->num_uops;
49578 window_num = window_list->window_num;
49579 insn_fits = fits_dispatch_window (insn);
49581 if (num_insn >= MAX_INSN
49582 || num_uops + insn_num_uops > MAX_INSN
49583 || !(insn_fits))
49585 window_num = ~window_num & 1;
49586 window_list = allocate_next_window (window_num);
49589 if (window_num == 0)
49591 add_insn_window (insn, window_list, insn_num_uops);
49592 if (window_list->num_insn >= MAX_INSN
49593 && insn_group == disp_branch)
49595 process_end_window ();
49596 return;
49599 else if (window_num == 1)
49601 window0_list = window_list->prev;
49602 sum = window0_list->window_size + window_list->window_size;
49603 if (sum == 32
49604 || (byte_len + sum) >= 48)
49606 process_end_window ();
49607 window_list = dispatch_window_list;
49610 add_insn_window (insn, window_list, insn_num_uops);
49612 else
49613 gcc_unreachable ();
49615 if (is_end_basic_block (insn_group))
49618 /* End of basic block is reached; do end-basic-block processing. */
49618 process_end_window ();
49619 return;
49623 /* Print the dispatch window, WINDOW_NUM, to FILE. */
49625 DEBUG_FUNCTION static void
49626 debug_dispatch_window_file (FILE *file, int window_num)
49628 dispatch_windows *list;
49629 int i;
49631 if (window_num == 0)
49632 list = dispatch_window_list;
49633 else
49634 list = dispatch_window_list1;
49636 fprintf (file, "Window #%d:\n", list->window_num);
49637 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
49638 list->num_insn, list->num_uops, list->window_size);
49639 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49640 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
49642 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
49643 list->num_stores);
49644 fprintf (file, " insn info:\n");
49646 for (i = 0; i < MAX_INSN; i++)
49648 if (!list->window[i].insn)
49649 break;
49650 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
49651 i, group_name[list->window[i].group],
49652 i, (void *)list->window[i].insn,
49653 i, list->window[i].path,
49654 i, list->window[i].byte_len,
49655 i, list->window[i].imm_bytes);
49659 /* Print to stdout a dispatch window. */
49661 DEBUG_FUNCTION void
49662 debug_dispatch_window (int window_num)
49664 debug_dispatch_window_file (stdout, window_num);
49667 /* Print INSN dispatch information to FILE. */
49669 DEBUG_FUNCTION static void
49670 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
49672 int byte_len;
49673 enum insn_path path;
49674 enum dispatch_group group;
49675 int imm_size;
49676 int num_imm_operand;
49677 int num_imm32_operand;
49678 int num_imm64_operand;
49680 if (INSN_CODE (insn) < 0)
49681 return;
49683 byte_len = min_insn_size (insn);
49684 path = get_insn_path (insn);
49685 group = get_insn_group (insn);
49686 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49687 &num_imm64_operand);
49689 fprintf (file, " insn info:\n");
49690 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
49691 group_name[group], path, byte_len);
49692 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49693 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
49696 /* Print to STDOUT the status of the ready list with respect to
49697 dispatch windows. */
49699 DEBUG_FUNCTION void
49700 debug_ready_dispatch (void)
49702 int i;
49703 int no_ready = number_in_ready ();
49705 fprintf (stdout, "Number of ready: %d\n", no_ready);
49707 for (i = 0; i < no_ready; i++)
49708 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
49711 /* This routine is the driver of the dispatch scheduler. */
49713 static void
49714 do_dispatch (rtx_insn *insn, int mode)
49716 if (mode == DISPATCH_INIT)
49717 init_dispatch_sched ();
49718 else if (mode == ADD_TO_DISPATCH_WINDOW)
49719 add_to_dispatch_window (insn);
49722 /* Return TRUE if Dispatch Scheduling is supported. */
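/* Illustrative note (assumption about the option mapping):
   flag_dispatch_scheduler is set by -mdispatch-scheduler, so e.g.
   "gcc -O2 -march=bdver2 -mdispatch-scheduler" makes this hook return
   true for the IS_DISPATCH_ON action on a Bulldozer target.  */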
49724 static bool
49725 has_dispatch (rtx_insn *insn, int action)
49727 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
49728 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
49729 switch (action)
49731 default:
49732 return false;
49734 case IS_DISPATCH_ON:
49735 return true;
49737 case IS_CMP:
49738 return is_cmp (insn);
49740 case DISPATCH_VIOLATION:
49741 return dispatch_violation ();
49743 case FITS_DISPATCH_WINDOW:
49744 return fits_dispatch_window (insn);
49747 return false;
49750 /* Implementation of the reassociation_width target hook, used by
49751 the reassoc phase to identify the parallelism level in a
49752 reassociated tree. The statement's tree_code is passed in OPC.
49753 The arguments' type is passed in MODE.
49755 Currently parallel reassociation is enabled for Atom
49756 processors only and we set reassociation width to be 2
49757 because Atom may issue up to 2 instructions per cycle.
49759 Return value should be fixed if parallel reassociation is
49760 enabled for other processors. */
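/* Illustrative reading of the widths returned below: FP modes get
   width 4 on a 64-bit Haswell tuning (when TARGET_REASSOC_FP_TO_PARALLEL
   is set), width 2 on other tunings with that flag, and width 1
   otherwise; width 4 means a chain such as a + b + c + d may be split
   by the reassoc pass into up to four independent partial sums.  */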
49762 static int
49763 ix86_reassociation_width (unsigned int, machine_mode mode)
49765 /* Vector part. */
49766 if (VECTOR_MODE_P (mode))
49768 if (TARGET_VECTOR_PARALLEL_EXECUTION)
49769 return 2;
49770 else
49771 return 1;
49774 /* Scalar part. */
49775 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
49776 return 2;
49777 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
49778 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
49779 else
49780 return 1;
49783 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
49784 place emms and femms instructions. */
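/* Illustrative mapping implied by the switch below: with plain -msse2,
   SImode prefers V4SImode (128 bit); with -mavx2 and without
   -mprefer-avx128 it prefers V8SImode (256 bit); with -mavx512f it
   prefers V16SImode (512 bit).  */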
49786 static machine_mode
49787 ix86_preferred_simd_mode (machine_mode mode)
49789 if (!TARGET_SSE)
49790 return word_mode;
49792 switch (mode)
49794 case QImode:
49795 return TARGET_AVX512BW ? V64QImode :
49796 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
49797 case HImode:
49798 return TARGET_AVX512BW ? V32HImode :
49799 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
49800 case SImode:
49801 return TARGET_AVX512F ? V16SImode :
49802 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
49803 case DImode:
49804 return TARGET_AVX512F ? V8DImode :
49805 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
49807 case SFmode:
49808 if (TARGET_AVX512F)
49809 return V16SFmode;
49810 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49811 return V8SFmode;
49812 else
49813 return V4SFmode;
49815 case DFmode:
49816 if (TARGET_AVX512F)
49817 return V8DFmode;
49818 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49819 return V4DFmode;
49820 else if (TARGET_SSE2)
49821 return V2DFmode;
49822 /* FALLTHRU */
49824 default:
49825 return word_mode;
49829 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
49830 vectors. If AVX512F is enabled then try vectorizing with 512bit,
49831 256bit and 128bit vectors. */
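/* Illustrative reading of the return value: under -mavx512f the mask
   is 64 | 32 | 16, i.e. the vectorizer tries 512-, 256- and 128-bit
   vector sizes; a return value of 0 means only the preferred SIMD
   mode is tried.  */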
49833 static unsigned int
49834 ix86_autovectorize_vector_sizes (void)
49836 return TARGET_AVX512F ? 64 | 32 | 16 :
49837 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
49840 /* Implementation of targetm.vectorize.get_mask_mode. */
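/* Worked example (illustrative): for an AVX-512 loop using V16SFmode
   (vector_size 64, nunits 16, elem_size 4) this returns the 16-bit
   scalar mask mode HImode; for a 128-bit V4SFmode loop without
   AVX512VL it instead returns the vector mask mode V4SImode.  */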
49842 static machine_mode
49843 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
49845 unsigned elem_size = vector_size / nunits;
49847 /* Scalar mask case. */
49848 if ((TARGET_AVX512F && vector_size == 64)
49849 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
49851 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
49852 return smallest_mode_for_size (nunits, MODE_INT);
49855 machine_mode elem_mode
49856 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
49858 gcc_assert (elem_size * nunits == vector_size);
49860 return mode_for_vector (elem_mode, nunits);
49865 /* Return class of registers which could be used for pseudo of MODE
49866 and of class RCLASS for spilling instead of memory. Return NO_REGS
49867 if it is not possible or non-profitable. */
49869 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
49871 static reg_class_t
49872 ix86_spill_class (reg_class_t rclass, machine_mode mode)
49874 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
49875 && TARGET_SSE2
49876 && TARGET_INTER_UNIT_MOVES_TO_VEC
49877 && TARGET_INTER_UNIT_MOVES_FROM_VEC
49878 && (mode == SImode || (TARGET_64BIT && mode == DImode))
49879 && INTEGER_CLASS_P (rclass))
49880 return ALL_SSE_REGS;
49881 return NO_REGS;
49884 /* Implement targetm.vectorize.init_cost. */
49886 static void *
49887 ix86_init_cost (struct loop *)
49889 unsigned *cost = XNEWVEC (unsigned, 3);
49890 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49891 return cost;
49894 /* Implement targetm.vectorize.add_stmt_cost. */
49896 static unsigned
49897 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49898 struct _stmt_vec_info *stmt_info, int misalign,
49899 enum vect_cost_model_location where)
49901 unsigned *cost = (unsigned *) data;
49902 unsigned retval = 0;
49904 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49905 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49907 /* Penalize DFmode vector operations for Bonnell. */
49908 if (TARGET_BONNELL && kind == vector_stmt
49909 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49910 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49912 /* Statements in an inner loop relative to the loop being
49913 vectorized are weighted more heavily. The value here is
49914 arbitrary and could potentially be improved with analysis. */
49915 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49916 count *= 50; /* FIXME. */
49918 retval = (unsigned) (count * stmt_cost);
49920 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
49921 for Silvermont, as it has an out-of-order integer pipeline and can execute
49922 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
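/* For example (illustrative numbers): count = 2 and stmt_cost = 4 give
   retval = 8, which the scaling below turns into (8 * 17) / 10 = 13
   for statements with an integer result.  */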
49923 if ((TARGET_SILVERMONT || TARGET_INTEL)
49924 && stmt_info && stmt_info->stmt)
49926 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49927 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49928 retval = (retval * 17) / 10;
49931 cost[where] += retval;
49933 return retval;
49936 /* Implement targetm.vectorize.finish_cost. */
49938 static void
49939 ix86_finish_cost (void *data, unsigned *prologue_cost,
49940 unsigned *body_cost, unsigned *epilogue_cost)
49942 unsigned *cost = (unsigned *) data;
49943 *prologue_cost = cost[vect_prologue];
49944 *body_cost = cost[vect_body];
49945 *epilogue_cost = cost[vect_epilogue];
49948 /* Implement targetm.vectorize.destroy_cost_data. */
49950 static void
49951 ix86_destroy_cost_data (void *data)
49953 free (data);
49956 /* Validate target specific memory model bits in VAL. */
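/* Usage sketch (illustrative): source such as
   __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE)
   reaches this hook with the HLE bit set; combining IX86_HLE_RELEASE
   with a weaker model such as __ATOMIC_RELAXED trips the warning below
   and falls back to MEMMODEL_SEQ_CST (keeping the HLE bit).  */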
49958 static unsigned HOST_WIDE_INT
49959 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49961 enum memmodel model = memmodel_from_int (val);
49962 bool strong;
49964 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49965 |MEMMODEL_MASK)
49966 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49968 warning (OPT_Winvalid_memory_model,
49969 "Unknown architecture specific memory model");
49970 return MEMMODEL_SEQ_CST;
49972 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49973 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49975 warning (OPT_Winvalid_memory_model,
49976 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49977 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49979 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49981 warning (OPT_Winvalid_memory_model,
49982 "HLE_RELEASE not used with RELEASE or stronger memory model");
49983 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49985 return val;
49988 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49989 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49990 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49991 or number of vecsize_mangle variants that should be emitted. */
49993 static int
49994 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49995 struct cgraph_simd_clone *clonei,
49996 tree base_type, int num)
49998 int ret = 1;
50000 if (clonei->simdlen
50001 && (clonei->simdlen < 2
50002 || clonei->simdlen > 1024
50003 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50005 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50006 "unsupported simdlen %d", clonei->simdlen);
50007 return 0;
50010 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50011 if (TREE_CODE (ret_type) != VOID_TYPE)
50012 switch (TYPE_MODE (ret_type))
50014 case QImode:
50015 case HImode:
50016 case SImode:
50017 case DImode:
50018 case SFmode:
50019 case DFmode:
50020 /* case SCmode: */
50021 /* case DCmode: */
50022 break;
50023 default:
50024 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50025 "unsupported return type %qT for simd\n", ret_type);
50026 return 0;
50029 tree t;
50030 int i;
50032 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50033 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50034 switch (TYPE_MODE (TREE_TYPE (t)))
50036 case QImode:
50037 case HImode:
50038 case SImode:
50039 case DImode:
50040 case SFmode:
50041 case DFmode:
50042 /* case SCmode: */
50043 /* case DCmode: */
50044 break;
50045 default:
50046 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50047 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
50048 return 0;
50051 if (clonei->cilk_elemental)
50053 /* Parse the processor clause here. If not present, default to 'b'. */
50054 clonei->vecsize_mangle = 'b';
50056 else if (!TREE_PUBLIC (node->decl))
50058 /* If the function isn't exported, we can pick up just one ISA
50059 for the clones. */
50060 if (TARGET_AVX512F)
50061 clonei->vecsize_mangle = 'e';
50062 else if (TARGET_AVX2)
50063 clonei->vecsize_mangle = 'd';
50064 else if (TARGET_AVX)
50065 clonei->vecsize_mangle = 'c';
50066 else
50067 clonei->vecsize_mangle = 'b';
50068 ret = 1;
50070 else
50072 clonei->vecsize_mangle = "bcde"[num];
50073 ret = 4;
50075 clonei->mask_mode = VOIDmode;
50076 switch (clonei->vecsize_mangle)
50078 case 'b':
50079 clonei->vecsize_int = 128;
50080 clonei->vecsize_float = 128;
50081 break;
50082 case 'c':
50083 clonei->vecsize_int = 128;
50084 clonei->vecsize_float = 256;
50085 break;
50086 case 'd':
50087 clonei->vecsize_int = 256;
50088 clonei->vecsize_float = 256;
50089 break;
50090 case 'e':
50091 clonei->vecsize_int = 512;
50092 clonei->vecsize_float = 512;
50093 if (TYPE_MODE (base_type) == QImode)
50094 clonei->mask_mode = DImode;
50095 else
50096 clonei->mask_mode = SImode;
50097 break;
50099 if (clonei->simdlen == 0)
50101 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50102 clonei->simdlen = clonei->vecsize_int;
50103 else
50104 clonei->simdlen = clonei->vecsize_float;
50105 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
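/* E.g. the AVX2 variant ('d') has vecsize_float == 256, so a clone
   whose characteristic type is float (32 bits) ends up with
   simdlen 256 / 32 == 8.  */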
50107 else if (clonei->simdlen > 16)
50109 /* For compatibility with ICC, use the same upper bounds
50110 for simdlen. In particular, for CTYPE below, use the return type,
50111 unless the function returns void, in which case use the characteristic
50112 type. If it is possible for given SIMDLEN to pass CTYPE value
50113 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50114 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50115 emit corresponding clone. */
50116 tree ctype = ret_type;
50117 if (TREE_CODE (ret_type) == VOID_TYPE)
50118 ctype = base_type;
50119 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50120 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50121 cnt /= clonei->vecsize_int;
50122 else
50123 cnt /= clonei->vecsize_float;
50124 if (cnt > (TARGET_64BIT ? 16 : 8))
50126 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50127 "unsupported simdlen %d", clonei->simdlen);
50128 return 0;
50131 return ret;
50134 /* Add target attribute to SIMD clone NODE if needed. */
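/* Illustrative effect: an AVX-512 ('e') clone emitted in a translation
   unit compiled without -mavx512f has an implicit target("avx512f")
   attribute applied here so that its body may use 512-bit
   instructions.  */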
50136 static void
50137 ix86_simd_clone_adjust (struct cgraph_node *node)
50139 const char *str = NULL;
50140 gcc_assert (node->decl == cfun->decl);
50141 switch (node->simdclone->vecsize_mangle)
50143 case 'b':
50144 if (!TARGET_SSE2)
50145 str = "sse2";
50146 break;
50147 case 'c':
50148 if (!TARGET_AVX)
50149 str = "avx";
50150 break;
50151 case 'd':
50152 if (!TARGET_AVX2)
50153 str = "avx2";
50154 break;
50155 case 'e':
50156 if (!TARGET_AVX512F)
50157 str = "avx512f";
50158 break;
50159 default:
50160 gcc_unreachable ();
50162 if (str == NULL)
50163 return;
50164 push_cfun (NULL);
50165 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50166 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50167 gcc_assert (ok);
50168 pop_cfun ();
50169 ix86_reset_previous_fndecl ();
50170 ix86_set_current_function (node->decl);
50173 /* If SIMD clone NODE can't be used in a vectorized loop
50174 in the current function, return -1, otherwise return the badness of using it
50175 (0 if it is the most desirable from the vecsize_mangle point of view, 1
50176 slightly less desirable, etc.). */
50178 static int
50179 ix86_simd_clone_usable (struct cgraph_node *node)
50181 switch (node->simdclone->vecsize_mangle)
50183 case 'b':
50184 if (!TARGET_SSE2)
50185 return -1;
50186 if (!TARGET_AVX)
50187 return 0;
50188 return TARGET_AVX2 ? 2 : 1;
50189 case 'c':
50190 if (!TARGET_AVX)
50191 return -1;
50192 return TARGET_AVX2 ? 1 : 0;
50193 case 'd':
50194 if (!TARGET_AVX2)
50195 return -1;
50196 return 0;
50197 case 'e':
50198 if (!TARGET_AVX512F)
50199 return -1;
50200 return 0;
50201 default:
50202 gcc_unreachable ();
50206 /* This function adjusts the unroll factor based on
50207 the hardware capabilities. For example, bdver3 has
50208 a loop buffer which makes unrolling of smaller
50209 loops less important. This function decides the
50210 unroll factor using the number of memory references
50211 (the value 32 is used) as a heuristic. */
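/* Worked example (illustrative): a loop body with 8 single-word memory
   references yields 32 / 8 = 4 from the heuristic below; references
   wider than 4 words count twice, and with more than 32 counted
   references the requested factor is returned unchanged.  */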
50213 static unsigned
50214 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50216 basic_block *bbs;
50217 rtx_insn *insn;
50218 unsigned i;
50219 unsigned mem_count = 0;
50221 if (!TARGET_ADJUST_UNROLL)
50222 return nunroll;
50224 /* Count the number of memory references within the loop body.
50225 This value determines the unrolling factor for bdver3 and bdver4
50226 architectures. */
50227 subrtx_iterator::array_type array;
50228 bbs = get_loop_body (loop);
50229 for (i = 0; i < loop->num_nodes; i++)
50230 FOR_BB_INSNS (bbs[i], insn)
50231 if (NONDEBUG_INSN_P (insn))
50232 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50233 if (const_rtx x = *iter)
50234 if (MEM_P (x))
50236 machine_mode mode = GET_MODE (x);
50237 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50238 if (n_words > 4)
50239 mem_count += 2;
50240 else
50241 mem_count += 1;
50243 free (bbs);
50245 if (mem_count && mem_count <= 32)
50246 return 32 / mem_count;
50248 return nunroll;
50252 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50254 static bool
50255 ix86_float_exceptions_rounding_supported_p (void)
50257 /* For x87 floating point with standard excess precision handling,
50258 there is no adddf3 pattern (since x87 floating point only has
50259 XFmode operations) so the default hook implementation gets this
50260 wrong. */
50261 return TARGET_80387 || TARGET_SSE_MATH;
50264 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50266 static void
50267 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50269 if (!TARGET_80387 && !TARGET_SSE_MATH)
50270 return;
50271 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50272 if (TARGET_80387)
50274 tree fenv_index_type = build_index_type (size_int (6));
50275 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50276 tree fenv_var = create_tmp_var_raw (fenv_type);
50277 TREE_ADDRESSABLE (fenv_var) = 1;
50278 tree fenv_ptr = build_pointer_type (fenv_type);
50279 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50280 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50281 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50282 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50283 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50284 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50285 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50286 tree hold_fnclex = build_call_expr (fnclex, 0);
50287 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50288 NULL_TREE, NULL_TREE);
50289 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50290 hold_fnclex);
50291 *clear = build_call_expr (fnclex, 0);
50292 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50293 tree fnstsw_call = build_call_expr (fnstsw, 0);
50294 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50295 sw_var, fnstsw_call);
50296 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50297 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50298 exceptions_var, exceptions_x87);
50299 *update = build2 (COMPOUND_EXPR, integer_type_node,
50300 sw_mod, update_mod);
50301 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50302 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50304 if (TARGET_SSE_MATH)
50306 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50307 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50308 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50309 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50310 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50311 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50312 mxcsr_orig_var, stmxcsr_hold_call);
50313 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50314 mxcsr_orig_var,
50315 build_int_cst (unsigned_type_node, 0x1f80));
50316 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50317 build_int_cst (unsigned_type_node, 0xffffffc0));
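/* MXCSR bit-layout note: 0x1f80 sets the six exception-mask bits
   (bits 7-12) and 0xffffffc0 clears the six exception-flag bits
   (bits 0-5), so the modified value has all FP exceptions masked and
   none pending.  */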
50318 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50319 mxcsr_mod_var, hold_mod_val);
50320 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50321 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50322 hold_assign_orig, hold_assign_mod);
50323 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50324 ldmxcsr_hold_call);
50325 if (*hold)
50326 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50327 else
50328 *hold = hold_all;
50329 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50330 if (*clear)
50331 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50332 ldmxcsr_clear_call);
50333 else
50334 *clear = ldmxcsr_clear_call;
50335 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50336 tree exceptions_sse = fold_convert (integer_type_node,
50337 stxmcsr_update_call);
50338 if (*update)
50340 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50341 exceptions_var, exceptions_sse);
50342 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50343 exceptions_var, exceptions_mod);
50344 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50345 exceptions_assign);
50347 else
50348 *update = build2 (MODIFY_EXPR, integer_type_node,
50349 exceptions_var, exceptions_sse);
50350 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50351 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50352 ldmxcsr_update_call);
50354 tree atomic_feraiseexcept
50355 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50356 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50357 1, exceptions_var);
50358 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50359 atomic_feraiseexcept_call);
50362 /* Return mode to be used for bounds or VOIDmode
50363 if bounds are not supported. */
50365 static enum machine_mode
50366 ix86_mpx_bound_mode ()
50368 /* Do not support pointer checker if MPX
50369 is not enabled. */
50370 if (!TARGET_MPX)
50372 if (flag_check_pointer_bounds)
50373 warning (0, "Pointer Checker requires MPX support on this target."
50374 " Use the -mmpx option to enable MPX.");
50375 return VOIDmode;
50378 return BNDmode;
50381 /* Return constant used to statically initialize constant bounds.
50383 This function is used to create special bound values. For now
50384 only INIT bounds and NONE bounds are expected. More special
50385 values may be added later. */
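/* Illustration (inferred from ix86_initialize_bounds below, which
   stores the upper bound complemented): INIT bounds (lb == 0,
   ub == -1) become the pair {0, 0}, while NONE bounds (lb == -1,
   ub == 0) become {-1, -1}.  */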
50387 static tree
50388 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50390 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50391 : build_zero_cst (pointer_sized_int_node);
50392 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50393 : build_minus_one_cst (pointer_sized_int_node);
50395 /* This function is supposed to be used to create INIT and
50396 NONE bounds only. */
50397 gcc_assert ((lb == 0 && ub == -1)
50398 || (lb == -1 && ub == 0));
50400 return build_complex (NULL, low, high);
50403 /* Generate a list of statements STMTS to initialize pointer bounds
50404 variable VAR with bounds LB and UB. Return the number of generated
50405 statements. */
50407 static int
50408 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50410 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50411 tree lhs, modify, var_p;
50413 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50414 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50416 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50417 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50418 append_to_statement_list (modify, stmts);
50420 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50421 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50422 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50423 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50424 append_to_statement_list (modify, stmts);
50426 return 2;
50429 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50430 /* For i386, a common symbol is local only for non-PIE binaries. For
50431 x86-64, a common symbol is local only for non-PIE binaries or when
50432 the linker supports copy relocations in PIE binaries. */
50434 static bool
50435 ix86_binds_local_p (const_tree exp)
50437 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50438 (!flag_pic
50439 || (TARGET_64BIT
50440 && HAVE_LD_PIE_COPYRELOC != 0)));
50442 #endif
50444 /* If MEM is in the form of [base+offset], extract the two parts
50445 of the address into BASE and OFFSET, otherwise return false. */
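/* For example, (mem (plus (reg X) (const_int 8))) yields
   *BASE = (reg X) and *OFFSET = (const_int 8), while (mem (reg X))
   yields (reg X) and const0_rtx.  */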
50447 static bool
50448 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50450 rtx addr;
50452 gcc_assert (MEM_P (mem));
50454 addr = XEXP (mem, 0);
50456 if (GET_CODE (addr) == CONST)
50457 addr = XEXP (addr, 0);
50459 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50461 *base = addr;
50462 *offset = const0_rtx;
50463 return true;
50466 if (GET_CODE (addr) == PLUS
50467 && (REG_P (XEXP (addr, 0))
50468 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50469 && CONST_INT_P (XEXP (addr, 1)))
50471 *base = XEXP (addr, 0);
50472 *offset = XEXP (addr, 1);
50473 return true;
50476 return false;
50479 /* Given OPERANDS of consecutive load/store, check if we can merge
50480 them into move multiple. LOAD is true if they are load instructions.
50481 MODE is the mode of memory operands. */
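/* Sketch of a qualifying pair (illustrative): two DImode memory
   operands whose addresses are (reg X) and (plus (reg X) (const_int 8))
   share a base, and the second offset equals the first plus
   GET_MODE_SIZE (DImode), so they pass the checks below; note that
   both register operands must also name the same register.  */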
50483 bool
50484 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50485 enum machine_mode mode)
50487 HOST_WIDE_INT offval_1, offval_2, msize;
50488 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50490 if (load)
50492 mem_1 = operands[1];
50493 mem_2 = operands[3];
50494 reg_1 = operands[0];
50495 reg_2 = operands[2];
50497 else
50499 mem_1 = operands[0];
50500 mem_2 = operands[2];
50501 reg_1 = operands[1];
50502 reg_2 = operands[3];
50505 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50507 if (REGNO (reg_1) != REGNO (reg_2))
50508 return false;
50510 /* Check if the addresses are in the form of [base+offset]. */
50511 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50512 return false;
50513 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
50514 return false;
50516 /* Check if the bases are the same. */
50517 if (!rtx_equal_p (base_1, base_2))
50518 return false;
50520 offval_1 = INTVAL (offset_1);
50521 offval_2 = INTVAL (offset_2);
50522 msize = GET_MODE_SIZE (mode);
50523 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
50524 if (offval_1 + msize != offval_2)
50525 return false;
50527 return true;
50530 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
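/* Illustrative consequence of the cases below: with -mfpmath=sse
   -msse4.1 -fno-trapping-math, floor/ceil/trunc are reported as
   supported even when optimizing for size (the TARGET_ROUND case);
   without the SSE4.1 ROUND instructions they are supported only when
   optimizing for speed.  */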
50532 static bool
50533 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
50534 optimization_type opt_type)
50536 switch (op)
50538 case asin_optab:
50539 case acos_optab:
50540 case log1p_optab:
50541 case exp_optab:
50542 case exp10_optab:
50543 case exp2_optab:
50544 case expm1_optab:
50545 case ldexp_optab:
50546 case scalb_optab:
50547 case round_optab:
50548 return opt_type == OPTIMIZE_FOR_SPEED;
50550 case rint_optab:
50551 if (SSE_FLOAT_MODE_P (mode1)
50552 && TARGET_SSE_MATH
50553 && !flag_trapping_math
50554 && !TARGET_ROUND)
50555 return opt_type == OPTIMIZE_FOR_SPEED;
50556 return true;
50558 case floor_optab:
50559 case ceil_optab:
50560 case btrunc_optab:
50561 if (SSE_FLOAT_MODE_P (mode1)
50562 && TARGET_SSE_MATH
50563 && !flag_trapping_math
50564 && TARGET_ROUND)
50565 return true;
50566 return opt_type == OPTIMIZE_FOR_SPEED;
50568 case rsqrt_optab:
50569 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
50571 default:
50572 return true;
50576 /* Address space support.
50578 This is not "far pointers" in the 16-bit sense, but an easy way
50579 to use %fs and %gs segment prefixes. Therefore:
50581 (a) All address spaces have the same modes,
50582 (b) All address spaces have the same address forms,
50583 (c) While %fs and %gs are technically subsets of the generic
50584 address space, they are probably not subsets of each other.
50585 (d) Since we have no access to the segment base register values
50586 without resorting to a system call, we cannot convert a
50587 non-default address space to a default address space.
50588 Therefore we do not claim %fs or %gs are subsets of generic.
50590 Therefore we can (mostly) use the default hooks. */
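/* Usage sketch (illustrative): this machinery backs the __seg_fs and
   __seg_gs named address spaces, e.g.
     int __seg_gs *p = (int __seg_gs *) 16;
     int v = *p;
   which loads from %gs:16 (the offset 16 is an arbitrary example).  */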
50592 /* All use of segmentation is assumed to make address 0 valid. */
50594 static bool
50595 ix86_addr_space_zero_address_valid (addr_space_t as)
50597 return as != ADDR_SPACE_GENERIC;
50600 static void
50601 ix86_init_libfuncs (void)
50603 if (TARGET_64BIT)
50605 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
50606 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
50608 else
50610 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
50611 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
50614 #if TARGET_MACHO
50615 darwin_rename_builtins ();
50616 #endif
50619 /* Generate call to __divmoddi4. */
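/* Assumed libgcc prototype (for illustration):
     long long __divmoddi4 (long long a, long long b, long long *rem);
   The quotient is the normal return value, while the remainder is
   written through the third argument, which is why the address of a
   stack slot is passed below.  */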
50621 static void
50622 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
50623 rtx op0, rtx op1,
50624 rtx *quot_p, rtx *rem_p)
50626 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
50628 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
50629 mode, 3,
50630 op0, GET_MODE (op0),
50631 op1, GET_MODE (op1),
50632 XEXP (rem, 0), Pmode);
50633 *quot_p = quot;
50634 *rem_p = rem;
50637 /* Target-specific selftests. */
50639 #if CHECKING_P
50641 namespace selftest {
50643 /* Verify that hard regs are dumped as expected (in compact mode). */
50645 static void
50646 ix86_test_dumping_hard_regs ()
50648 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
50649 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
50652 /* Run all target-specific selftests. */
50654 static void
50655 ix86_run_selftests (void)
50657 ix86_test_dumping_hard_regs ();
50660 } // namespace selftest
50662 #endif /* CHECKING_P */
50664 /* Initialize the GCC target structure. */
50665 #undef TARGET_RETURN_IN_MEMORY
50666 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50668 #undef TARGET_LEGITIMIZE_ADDRESS
50669 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50671 #undef TARGET_ATTRIBUTE_TABLE
50672 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50673 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50674 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50675 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50676 # undef TARGET_MERGE_DECL_ATTRIBUTES
50677 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50678 #endif
50680 #undef TARGET_COMP_TYPE_ATTRIBUTES
50681 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50683 #undef TARGET_INIT_BUILTINS
50684 #define TARGET_INIT_BUILTINS ix86_init_builtins
50685 #undef TARGET_BUILTIN_DECL
50686 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50687 #undef TARGET_EXPAND_BUILTIN
50688 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50690 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50691 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50692 ix86_builtin_vectorized_function
50694 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50695 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50697 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50698 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50700 #undef TARGET_BUILTIN_RECIPROCAL
50701 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50703 #undef TARGET_ASM_FUNCTION_EPILOGUE
50704 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50706 #undef TARGET_ENCODE_SECTION_INFO
50707 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50708 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50709 #else
50710 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50711 #endif
50713 #undef TARGET_ASM_OPEN_PAREN
50714 #define TARGET_ASM_OPEN_PAREN ""
50715 #undef TARGET_ASM_CLOSE_PAREN
50716 #define TARGET_ASM_CLOSE_PAREN ""
50718 #undef TARGET_ASM_BYTE_OP
50719 #define TARGET_ASM_BYTE_OP ASM_BYTE
50721 #undef TARGET_ASM_ALIGNED_HI_OP
50722 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50723 #undef TARGET_ASM_ALIGNED_SI_OP
50724 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50725 #ifdef ASM_QUAD
50726 #undef TARGET_ASM_ALIGNED_DI_OP
50727 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50728 #endif
50730 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50731 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50733 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50734 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50736 #undef TARGET_ASM_UNALIGNED_HI_OP
50737 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50738 #undef TARGET_ASM_UNALIGNED_SI_OP
50739 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50740 #undef TARGET_ASM_UNALIGNED_DI_OP
50741 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50743 #undef TARGET_PRINT_OPERAND
50744 #define TARGET_PRINT_OPERAND ix86_print_operand
50745 #undef TARGET_PRINT_OPERAND_ADDRESS
50746 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50747 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50748 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50749 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50750 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50752 #undef TARGET_SCHED_INIT_GLOBAL
50753 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50754 #undef TARGET_SCHED_ADJUST_COST
50755 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50756 #undef TARGET_SCHED_ISSUE_RATE
50757 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50758 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50759 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50760 ia32_multipass_dfa_lookahead
50761 #undef TARGET_SCHED_MACRO_FUSION_P
50762 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50763 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50764 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50766 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50767 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50769 #undef TARGET_MEMMODEL_CHECK
50770 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50772 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50773 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50775 #ifdef HAVE_AS_TLS
50776 #undef TARGET_HAVE_TLS
50777 #define TARGET_HAVE_TLS true
50778 #endif
50779 #undef TARGET_CANNOT_FORCE_CONST_MEM
50780 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50781 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50782 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50784 #undef TARGET_DELEGITIMIZE_ADDRESS
50785 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50787 #undef TARGET_MS_BITFIELD_LAYOUT_P
50788 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50790 #if TARGET_MACHO
50791 #undef TARGET_BINDS_LOCAL_P
50792 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50793 #else
50794 #undef TARGET_BINDS_LOCAL_P
50795 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50796 #endif
50797 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50798 #undef TARGET_BINDS_LOCAL_P
50799 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50800 #endif
50802 #undef TARGET_ASM_OUTPUT_MI_THUNK
50803 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50804 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50805 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50807 #undef TARGET_ASM_FILE_START
50808 #define TARGET_ASM_FILE_START x86_file_start
50810 #undef TARGET_OPTION_OVERRIDE
50811 #define TARGET_OPTION_OVERRIDE ix86_option_override
50813 #undef TARGET_REGISTER_MOVE_COST
50814 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50815 #undef TARGET_MEMORY_MOVE_COST
50816 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50817 #undef TARGET_RTX_COSTS
50818 #define TARGET_RTX_COSTS ix86_rtx_costs
50819 #undef TARGET_ADDRESS_COST
50820 #define TARGET_ADDRESS_COST ix86_address_cost
50822 #undef TARGET_FIXED_CONDITION_CODE_REGS
50823 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50824 #undef TARGET_CC_MODES_COMPATIBLE
50825 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50827 #undef TARGET_MACHINE_DEPENDENT_REORG
50828 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50830 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50831 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50833 #undef TARGET_BUILD_BUILTIN_VA_LIST
50834 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50836 #undef TARGET_FOLD_BUILTIN
50837 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50839 #undef TARGET_GIMPLE_FOLD_BUILTIN
50840 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
50842 #undef TARGET_COMPARE_VERSION_PRIORITY
50843 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50845 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50846 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50847 ix86_generate_version_dispatcher_body
50849 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50850 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50851 ix86_get_function_versions_dispatcher
50853 #undef TARGET_ENUM_VA_LIST_P
50854 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50856 #undef TARGET_FN_ABI_VA_LIST
50857 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50859 #undef TARGET_CANONICAL_VA_LIST_TYPE
50860 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50862 #undef TARGET_EXPAND_BUILTIN_VA_START
50863 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50865 #undef TARGET_MD_ASM_ADJUST
50866 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50868 #undef TARGET_PROMOTE_PROTOTYPES
50869 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50870 #undef TARGET_SETUP_INCOMING_VARARGS
50871 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50872 #undef TARGET_MUST_PASS_IN_STACK
50873 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50874 #undef TARGET_FUNCTION_ARG_ADVANCE
50875 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50876 #undef TARGET_FUNCTION_ARG
50877 #define TARGET_FUNCTION_ARG ix86_function_arg
50878 #undef TARGET_INIT_PIC_REG
50879 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50880 #undef TARGET_USE_PSEUDO_PIC_REG
50881 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50882 #undef TARGET_FUNCTION_ARG_BOUNDARY
50883 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50884 #undef TARGET_PASS_BY_REFERENCE
50885 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50886 #undef TARGET_INTERNAL_ARG_POINTER
50887 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50888 #undef TARGET_UPDATE_STACK_BOUNDARY
50889 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50890 #undef TARGET_GET_DRAP_RTX
50891 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50892 #undef TARGET_STRICT_ARGUMENT_NAMING
50893 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50894 #undef TARGET_STATIC_CHAIN
50895 #define TARGET_STATIC_CHAIN ix86_static_chain
50896 #undef TARGET_TRAMPOLINE_INIT
50897 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50898 #undef TARGET_RETURN_POPS_ARGS
50899 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50901 #undef TARGET_LEGITIMATE_COMBINED_INSN
50902 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50904 #undef TARGET_ASAN_SHADOW_OFFSET
50905 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50907 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50908 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50910 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50911 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50913 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50914 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50916 #undef TARGET_C_MODE_FOR_SUFFIX
50917 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50919 #ifdef HAVE_AS_TLS
50920 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50921 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50922 #endif
50924 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50925 #undef TARGET_INSERT_ATTRIBUTES
50926 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50927 #endif
50929 #undef TARGET_MANGLE_TYPE
50930 #define TARGET_MANGLE_TYPE ix86_mangle_type
50932 #ifdef TARGET_THREAD_SSP_OFFSET
50933 #undef TARGET_STACK_PROTECT_GUARD
50934 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
50935 #endif
50937 #if !TARGET_MACHO
50938 #undef TARGET_STACK_PROTECT_FAIL
50939 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50940 #endif
50942 #undef TARGET_FUNCTION_VALUE
50943 #define TARGET_FUNCTION_VALUE ix86_function_value
50945 #undef TARGET_FUNCTION_VALUE_REGNO_P
50946 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50948 #undef TARGET_PROMOTE_FUNCTION_MODE
50949 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50951 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50952 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50954 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50955 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50957 #undef TARGET_INSTANTIATE_DECLS
50958 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50960 #undef TARGET_SECONDARY_RELOAD
50961 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50963 #undef TARGET_CLASS_MAX_NREGS
50964 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50966 #undef TARGET_PREFERRED_RELOAD_CLASS
50967 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50968 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50969 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50970 #undef TARGET_CLASS_LIKELY_SPILLED_P
50971 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50973 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50974 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50975 ix86_builtin_vectorization_cost
50976 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50977 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50978 ix86_vectorize_vec_perm_const_ok
50979 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50980 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50981 ix86_preferred_simd_mode
50982 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50983 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50984 ix86_autovectorize_vector_sizes
50985 #undef TARGET_VECTORIZE_GET_MASK_MODE
50986 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50987 #undef TARGET_VECTORIZE_INIT_COST
50988 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50989 #undef TARGET_VECTORIZE_ADD_STMT_COST
50990 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50991 #undef TARGET_VECTORIZE_FINISH_COST
50992 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50993 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50994 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50996 #undef TARGET_SET_CURRENT_FUNCTION
50997 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50999 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
51000 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
51002 #undef TARGET_OPTION_SAVE
51003 #define TARGET_OPTION_SAVE ix86_function_specific_save
51005 #undef TARGET_OPTION_RESTORE
51006 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
51008 #undef TARGET_OPTION_POST_STREAM_IN
51009 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
51011 #undef TARGET_OPTION_PRINT
51012 #define TARGET_OPTION_PRINT ix86_function_specific_print
51014 #undef TARGET_OPTION_FUNCTION_VERSIONS
51015 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
51017 #undef TARGET_CAN_INLINE_P
51018 #define TARGET_CAN_INLINE_P ix86_can_inline_p
51020 #undef TARGET_LEGITIMATE_ADDRESS_P
51021 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
51023 #undef TARGET_REGISTER_PRIORITY
51024 #define TARGET_REGISTER_PRIORITY ix86_register_priority
51026 #undef TARGET_REGISTER_USAGE_LEVELING_P
51027 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
51029 #undef TARGET_LEGITIMATE_CONSTANT_P
51030 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
51032 #undef TARGET_FRAME_POINTER_REQUIRED
51033 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
51035 #undef TARGET_CAN_ELIMINATE
51036 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
51038 #undef TARGET_EXTRA_LIVE_ON_ENTRY
51039 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
51041 #undef TARGET_ASM_CODE_END
51042 #define TARGET_ASM_CODE_END ix86_code_end
51044 #undef TARGET_CONDITIONAL_REGISTER_USAGE
51045 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
51047 #undef TARGET_LOOP_UNROLL_ADJUST
51048 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
51050 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51051 #undef TARGET_SPILL_CLASS
51052 #define TARGET_SPILL_CLASS ix86_spill_class
51054 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
51055 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
51056 ix86_simd_clone_compute_vecsize_and_simdlen
51058 #undef TARGET_SIMD_CLONE_ADJUST
51059 #define TARGET_SIMD_CLONE_ADJUST \
51060 ix86_simd_clone_adjust
51062 #undef TARGET_SIMD_CLONE_USABLE
51063 #define TARGET_SIMD_CLONE_USABLE \
51064 ix86_simd_clone_usable
51066 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
51067 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
51068 ix86_float_exceptions_rounding_supported_p
51070 #undef TARGET_MODE_EMIT
51071 #define TARGET_MODE_EMIT ix86_emit_mode_set
51073 #undef TARGET_MODE_NEEDED
51074 #define TARGET_MODE_NEEDED ix86_mode_needed
51076 #undef TARGET_MODE_AFTER
51077 #define TARGET_MODE_AFTER ix86_mode_after
51079 #undef TARGET_MODE_ENTRY
51080 #define TARGET_MODE_ENTRY ix86_mode_entry
51082 #undef TARGET_MODE_EXIT
51083 #define TARGET_MODE_EXIT ix86_mode_exit
51085 #undef TARGET_MODE_PRIORITY
51086 #define TARGET_MODE_PRIORITY ix86_mode_priority
51088 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
51089 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
51091 #undef TARGET_LOAD_BOUNDS_FOR_ARG
51092 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
51094 #undef TARGET_STORE_BOUNDS_FOR_ARG
51095 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
51097 #undef TARGET_LOAD_RETURNED_BOUNDS
51098 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
51100 #undef TARGET_STORE_RETURNED_BOUNDS
51101 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
51103 #undef TARGET_CHKP_BOUND_MODE
51104 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
51106 #undef TARGET_BUILTIN_CHKP_FUNCTION
51107 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
51109 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
51110 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
51112 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
51113 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
51115 #undef TARGET_CHKP_INITIALIZE_BOUNDS
51116 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
51118 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
51119 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
51121 #undef TARGET_OFFLOAD_OPTIONS
51122 #define TARGET_OFFLOAD_OPTIONS \
51123 ix86_offload_options
51125 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
51126 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
51128 #undef TARGET_OPTAB_SUPPORTED_P
51129 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
51131 #undef TARGET_HARD_REGNO_SCRATCH_OK
51132 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
51134 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
51135 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
51137 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
51138 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
51140 #undef TARGET_INIT_LIBFUNCS
51141 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
51143 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
51144 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
51146 #if CHECKING_P
51147 #undef TARGET_RUN_TARGET_SELFTESTS
51148 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
51149 #endif /* #if CHECKING_P */
51151 struct gcc_target targetm = TARGET_INITIALIZER;
51153 #include "gt-i386.h"