Delete x86 deprecated pcommit instruction support
[official-gcc.git] / gcc / config / i386 / i386.c
blob 5d0917a153a7cd903518806b55702bf2e2dd4f60
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2016 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
81 /* This file should be included last. */
82 #include "target-def.h"
84 static rtx legitimize_dllimport_symbol (rtx, bool);
85 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
86 static rtx legitimize_pe_coff_symbol (rtx, bool);
87 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
89 #ifndef CHECK_STACK_LIMIT
90 #define CHECK_STACK_LIMIT (-1)
91 #endif
93 /* Return index of given mode in mult and division cost tables. */
94 #define MODE_INDEX(mode) \
95 ((mode) == QImode ? 0 \
96 : (mode) == HImode ? 1 \
97 : (mode) == SImode ? 2 \
98 : (mode) == DImode ? 3 \
99 : 4)
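/* Illustrative note (not part of the original file): MODE_INDEX picks the
   row of the per-mode cost arrays in the processor_costs tables below (the
   multiply-start and divide/mod tables), so e.g. MODE_INDEX (SImode) == 2
   selects the SImode entry, and any mode other than QI/HI/SI/DImode falls
   through to index 4, the "other" slot.  */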
101 /* Processor costs (relative to an add) */
102 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
103 #define COSTS_N_BYTES(N) ((N) * 2)
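/* Worked example (illustrative, not from the original source): with
   COSTS_N_INSNS (N) == (N) * 4 as assumed above, COSTS_N_BYTES (2) == 4
   == COSTS_N_INSNS (1), so when tuning for size a 2-byte add is priced like
   one ordinary instruction, while a 3-byte insn such as movzx costs 6 units,
   i.e. one and a half instructions' worth.  */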
105 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
107 static stringop_algs ix86_size_memcpy[2] = {
108 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
109 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
110 static stringop_algs ix86_size_memset[2] = {
111 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
112 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
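/* How these tables are read (illustrative summary; the layout follows struct
   stringop_algs in i386.h): each *_memcpy/*_memset array has two entries,
   [0] used when generating 32-bit code and [1] for 64-bit code.  An entry
   names the algorithm used for blocks of unknown size, followed by
   {max, alg, noalign} triples tried in order; max == -1 is the catch-all for
   every remaining size.  DUMMY_STRINGOP_ALGS above simply falls back to a
   library call.  */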
114 const
115 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
116 COSTS_N_BYTES (2), /* cost of an add instruction */
117 COSTS_N_BYTES (3), /* cost of a lea instruction */
118 COSTS_N_BYTES (2), /* variable shift costs */
119 COSTS_N_BYTES (3), /* constant shift costs */
120 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
121 COSTS_N_BYTES (3), /* HI */
122 COSTS_N_BYTES (3), /* SI */
123 COSTS_N_BYTES (3), /* DI */
124 COSTS_N_BYTES (5)}, /* other */
125 0, /* cost of multiply per each bit set */
126 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
127 COSTS_N_BYTES (3), /* HI */
128 COSTS_N_BYTES (3), /* SI */
129 COSTS_N_BYTES (3), /* DI */
130 COSTS_N_BYTES (5)}, /* other */
131 COSTS_N_BYTES (3), /* cost of movsx */
132 COSTS_N_BYTES (3), /* cost of movzx */
133 0, /* "large" insn */
134 2, /* MOVE_RATIO */
135 2, /* cost for loading QImode using movzbl */
136 {2, 2, 2}, /* cost of loading integer registers
137 in QImode, HImode and SImode.
138 Relative to reg-reg move (2). */
139 {2, 2, 2}, /* cost of storing integer registers */
140 2, /* cost of reg,reg fld/fst */
141 {2, 2, 2}, /* cost of loading fp registers
142 in SFmode, DFmode and XFmode */
143 {2, 2, 2}, /* cost of storing fp registers
144 in SFmode, DFmode and XFmode */
145 3, /* cost of moving MMX register */
146 {3, 3}, /* cost of loading MMX registers
147 in SImode and DImode */
148 {3, 3}, /* cost of storing MMX registers
149 in SImode and DImode */
150 3, /* cost of moving SSE register */
151 {3, 3, 3}, /* cost of loading SSE registers
152 in SImode, DImode and TImode */
153 {3, 3, 3}, /* cost of storing SSE registers
154 in SImode, DImode and TImode */
155 3, /* MMX or SSE register to integer */
156 0, /* size of l1 cache */
157 0, /* size of l2 cache */
158 0, /* size of prefetch block */
159 0, /* number of parallel prefetches */
160 2, /* Branch cost */
161 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
162 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
163 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
164 COSTS_N_BYTES (2), /* cost of FABS instruction. */
165 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
166 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
167 ix86_size_memcpy,
168 ix86_size_memset,
169 1, /* scalar_stmt_cost. */
170 1, /* scalar load_cost. */
171 1, /* scalar_store_cost. */
172 1, /* vec_stmt_cost. */
173 1, /* vec_to_scalar_cost. */
174 1, /* scalar_to_vec_cost. */
175 1, /* vec_align_load_cost. */
176 1, /* vec_unalign_load_cost. */
177 1, /* vec_store_cost. */
178 1, /* cond_taken_branch_cost. */
179 1, /* cond_not_taken_branch_cost. */
180 };
182 /* Processor costs (relative to an add) */
183 static stringop_algs i386_memcpy[2] = {
184 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
185 DUMMY_STRINGOP_ALGS};
186 static stringop_algs i386_memset[2] = {
187 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
188 DUMMY_STRINGOP_ALGS};
190 static const
191 struct processor_costs i386_cost = { /* 386 specific costs */
192 COSTS_N_INSNS (1), /* cost of an add instruction */
193 COSTS_N_INSNS (1), /* cost of a lea instruction */
194 COSTS_N_INSNS (3), /* variable shift costs */
195 COSTS_N_INSNS (2), /* constant shift costs */
196 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
197 COSTS_N_INSNS (6), /* HI */
198 COSTS_N_INSNS (6), /* SI */
199 COSTS_N_INSNS (6), /* DI */
200 COSTS_N_INSNS (6)}, /* other */
201 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
202 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
203 COSTS_N_INSNS (23), /* HI */
204 COSTS_N_INSNS (23), /* SI */
205 COSTS_N_INSNS (23), /* DI */
206 COSTS_N_INSNS (23)}, /* other */
207 COSTS_N_INSNS (3), /* cost of movsx */
208 COSTS_N_INSNS (2), /* cost of movzx */
209 15, /* "large" insn */
210 3, /* MOVE_RATIO */
211 4, /* cost for loading QImode using movzbl */
212 {2, 4, 2}, /* cost of loading integer registers
213 in QImode, HImode and SImode.
214 Relative to reg-reg move (2). */
215 {2, 4, 2}, /* cost of storing integer registers */
216 2, /* cost of reg,reg fld/fst */
217 {8, 8, 8}, /* cost of loading fp registers
218 in SFmode, DFmode and XFmode */
219 {8, 8, 8}, /* cost of storing fp registers
220 in SFmode, DFmode and XFmode */
221 2, /* cost of moving MMX register */
222 {4, 8}, /* cost of loading MMX registers
223 in SImode and DImode */
224 {4, 8}, /* cost of storing MMX registers
225 in SImode and DImode */
226 2, /* cost of moving SSE register */
227 {4, 8, 16}, /* cost of loading SSE registers
228 in SImode, DImode and TImode */
229 {4, 8, 16}, /* cost of storing SSE registers
230 in SImode, DImode and TImode */
231 3, /* MMX or SSE register to integer */
232 0, /* size of l1 cache */
233 0, /* size of l2 cache */
234 0, /* size of prefetch block */
235 0, /* number of parallel prefetches */
236 1, /* Branch cost */
237 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
238 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
239 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
240 COSTS_N_INSNS (22), /* cost of FABS instruction. */
241 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
242 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
243 i386_memcpy,
244 i386_memset,
245 1, /* scalar_stmt_cost. */
246 1, /* scalar load_cost. */
247 1, /* scalar_store_cost. */
248 1, /* vec_stmt_cost. */
249 1, /* vec_to_scalar_cost. */
250 1, /* scalar_to_vec_cost. */
251 1, /* vec_align_load_cost. */
252 2, /* vec_unalign_load_cost. */
253 1, /* vec_store_cost. */
254 3, /* cond_taken_branch_cost. */
255 1, /* cond_not_taken_branch_cost. */
256 };
258 static stringop_algs i486_memcpy[2] = {
259 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
260 DUMMY_STRINGOP_ALGS};
261 static stringop_algs i486_memset[2] = {
262 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
263 DUMMY_STRINGOP_ALGS};
265 static const
266 struct processor_costs i486_cost = { /* 486 specific costs */
267 COSTS_N_INSNS (1), /* cost of an add instruction */
268 COSTS_N_INSNS (1), /* cost of a lea instruction */
269 COSTS_N_INSNS (3), /* variable shift costs */
270 COSTS_N_INSNS (2), /* constant shift costs */
271 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
272 COSTS_N_INSNS (12), /* HI */
273 COSTS_N_INSNS (12), /* SI */
274 COSTS_N_INSNS (12), /* DI */
275 COSTS_N_INSNS (12)}, /* other */
276 1, /* cost of multiply per each bit set */
277 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
278 COSTS_N_INSNS (40), /* HI */
279 COSTS_N_INSNS (40), /* SI */
280 COSTS_N_INSNS (40), /* DI */
281 COSTS_N_INSNS (40)}, /* other */
282 COSTS_N_INSNS (3), /* cost of movsx */
283 COSTS_N_INSNS (2), /* cost of movzx */
284 15, /* "large" insn */
285 3, /* MOVE_RATIO */
286 4, /* cost for loading QImode using movzbl */
287 {2, 4, 2}, /* cost of loading integer registers
288 in QImode, HImode and SImode.
289 Relative to reg-reg move (2). */
290 {2, 4, 2}, /* cost of storing integer registers */
291 2, /* cost of reg,reg fld/fst */
292 {8, 8, 8}, /* cost of loading fp registers
293 in SFmode, DFmode and XFmode */
294 {8, 8, 8}, /* cost of storing fp registers
295 in SFmode, DFmode and XFmode */
296 2, /* cost of moving MMX register */
297 {4, 8}, /* cost of loading MMX registers
298 in SImode and DImode */
299 {4, 8}, /* cost of storing MMX registers
300 in SImode and DImode */
301 2, /* cost of moving SSE register */
302 {4, 8, 16}, /* cost of loading SSE registers
303 in SImode, DImode and TImode */
304 {4, 8, 16}, /* cost of storing SSE registers
305 in SImode, DImode and TImode */
306 3, /* MMX or SSE register to integer */
307 4, /* size of l1 cache. 486 has 8kB cache
308 shared for code and data, so 4kB is
309 not really precise. */
310 4, /* size of l2 cache */
311 0, /* size of prefetch block */
312 0, /* number of parallel prefetches */
313 1, /* Branch cost */
314 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
315 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
316 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
317 COSTS_N_INSNS (3), /* cost of FABS instruction. */
318 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
319 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
320 i486_memcpy,
321 i486_memset,
322 1, /* scalar_stmt_cost. */
323 1, /* scalar load_cost. */
324 1, /* scalar_store_cost. */
325 1, /* vec_stmt_cost. */
326 1, /* vec_to_scalar_cost. */
327 1, /* scalar_to_vec_cost. */
328 1, /* vec_align_load_cost. */
329 2, /* vec_unalign_load_cost. */
330 1, /* vec_store_cost. */
331 3, /* cond_taken_branch_cost. */
332 1, /* cond_not_taken_branch_cost. */
333 };
335 static stringop_algs pentium_memcpy[2] = {
336 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
337 DUMMY_STRINGOP_ALGS};
338 static stringop_algs pentium_memset[2] = {
339 {libcall, {{-1, rep_prefix_4_byte, false}}},
340 DUMMY_STRINGOP_ALGS};
342 static const
343 struct processor_costs pentium_cost = {
344 COSTS_N_INSNS (1), /* cost of an add instruction */
345 COSTS_N_INSNS (1), /* cost of a lea instruction */
346 COSTS_N_INSNS (4), /* variable shift costs */
347 COSTS_N_INSNS (1), /* constant shift costs */
348 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
349 COSTS_N_INSNS (11), /* HI */
350 COSTS_N_INSNS (11), /* SI */
351 COSTS_N_INSNS (11), /* DI */
352 COSTS_N_INSNS (11)}, /* other */
353 0, /* cost of multiply per each bit set */
354 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
355 COSTS_N_INSNS (25), /* HI */
356 COSTS_N_INSNS (25), /* SI */
357 COSTS_N_INSNS (25), /* DI */
358 COSTS_N_INSNS (25)}, /* other */
359 COSTS_N_INSNS (3), /* cost of movsx */
360 COSTS_N_INSNS (2), /* cost of movzx */
361 8, /* "large" insn */
362 6, /* MOVE_RATIO */
363 6, /* cost for loading QImode using movzbl */
364 {2, 4, 2}, /* cost of loading integer registers
365 in QImode, HImode and SImode.
366 Relative to reg-reg move (2). */
367 {2, 4, 2}, /* cost of storing integer registers */
368 2, /* cost of reg,reg fld/fst */
369 {2, 2, 6}, /* cost of loading fp registers
370 in SFmode, DFmode and XFmode */
371 {4, 4, 6}, /* cost of storing fp registers
372 in SFmode, DFmode and XFmode */
373 8, /* cost of moving MMX register */
374 {8, 8}, /* cost of loading MMX registers
375 in SImode and DImode */
376 {8, 8}, /* cost of storing MMX registers
377 in SImode and DImode */
378 2, /* cost of moving SSE register */
379 {4, 8, 16}, /* cost of loading SSE registers
380 in SImode, DImode and TImode */
381 {4, 8, 16}, /* cost of storing SSE registers
382 in SImode, DImode and TImode */
383 3, /* MMX or SSE register to integer */
384 8, /* size of l1 cache. */
385 8, /* size of l2 cache */
386 0, /* size of prefetch block */
387 0, /* number of parallel prefetches */
388 2, /* Branch cost */
389 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
390 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
391 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
392 COSTS_N_INSNS (1), /* cost of FABS instruction. */
393 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
394 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
395 pentium_memcpy,
396 pentium_memset,
397 1, /* scalar_stmt_cost. */
398 1, /* scalar load_cost. */
399 1, /* scalar_store_cost. */
400 1, /* vec_stmt_cost. */
401 1, /* vec_to_scalar_cost. */
402 1, /* scalar_to_vec_cost. */
403 1, /* vec_align_load_cost. */
404 2, /* vec_unalign_load_cost. */
405 1, /* vec_store_cost. */
406 3, /* cond_taken_branch_cost. */
407 1, /* cond_not_taken_branch_cost. */
408 };
410 static const
411 struct processor_costs lakemont_cost = {
412 COSTS_N_INSNS (1), /* cost of an add instruction */
413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
414 COSTS_N_INSNS (1), /* variable shift costs */
415 COSTS_N_INSNS (1), /* constant shift costs */
416 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
417 COSTS_N_INSNS (11), /* HI */
418 COSTS_N_INSNS (11), /* SI */
419 COSTS_N_INSNS (11), /* DI */
420 COSTS_N_INSNS (11)}, /* other */
421 0, /* cost of multiply per each bit set */
422 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
423 COSTS_N_INSNS (25), /* HI */
424 COSTS_N_INSNS (25), /* SI */
425 COSTS_N_INSNS (25), /* DI */
426 COSTS_N_INSNS (25)}, /* other */
427 COSTS_N_INSNS (3), /* cost of movsx */
428 COSTS_N_INSNS (2), /* cost of movzx */
429 8, /* "large" insn */
430 17, /* MOVE_RATIO */
431 6, /* cost for loading QImode using movzbl */
432 {2, 4, 2}, /* cost of loading integer registers
433 in QImode, HImode and SImode.
434 Relative to reg-reg move (2). */
435 {2, 4, 2}, /* cost of storing integer registers */
436 2, /* cost of reg,reg fld/fst */
437 {2, 2, 6}, /* cost of loading fp registers
438 in SFmode, DFmode and XFmode */
439 {4, 4, 6}, /* cost of storing fp registers
440 in SFmode, DFmode and XFmode */
441 8, /* cost of moving MMX register */
442 {8, 8}, /* cost of loading MMX registers
443 in SImode and DImode */
444 {8, 8}, /* cost of storing MMX registers
445 in SImode and DImode */
446 2, /* cost of moving SSE register */
447 {4, 8, 16}, /* cost of loading SSE registers
448 in SImode, DImode and TImode */
449 {4, 8, 16}, /* cost of storing SSE registers
450 in SImode, DImode and TImode */
451 3, /* MMX or SSE register to integer */
452 8, /* size of l1 cache. */
453 8, /* size of l2 cache */
454 0, /* size of prefetch block */
455 0, /* number of parallel prefetches */
456 2, /* Branch cost */
457 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
458 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
459 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
460 COSTS_N_INSNS (1), /* cost of FABS instruction. */
461 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
462 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
463 pentium_memcpy,
464 pentium_memset,
465 1, /* scalar_stmt_cost. */
466 1, /* scalar load_cost. */
467 1, /* scalar_store_cost. */
468 1, /* vec_stmt_cost. */
469 1, /* vec_to_scalar_cost. */
470 1, /* scalar_to_vec_cost. */
471 1, /* vec_align_load_cost. */
472 2, /* vec_unalign_load_cost. */
473 1, /* vec_store_cost. */
474 3, /* cond_taken_branch_cost. */
475 1, /* cond_not_taken_branch_cost. */
476 };
478 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
479 (we ensure the alignment). For small blocks inline loop is still a
480 noticeable win, for bigger blocks either rep movsl or rep movsb is
481 way to go. Rep movsb has apparently more expensive startup time in CPU,
482 but after 4K the difference is down in the noise. */
483 static stringop_algs pentiumpro_memcpy[2] = {
484 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
485 {8192, rep_prefix_4_byte, false},
486 {-1, rep_prefix_1_byte, false}}},
487 DUMMY_STRINGOP_ALGS};
488 static stringop_algs pentiumpro_memset[2] = {
489 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
490 {8192, rep_prefix_4_byte, false},
491 {-1, libcall, false}}},
492 DUMMY_STRINGOP_ALGS};
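/* Worked reading of the PentiumPro tables above (illustrative): a memcpy of
   unknown size uses rep movsl; for known sizes an inline loop is used up to
   128 bytes, an unrolled loop up to 1024 bytes, rep movsl up to 8192 bytes,
   and rep movsb (the -1 entry) beyond that, matching the comment that rep
   movsb's extra startup cost stops mattering for blocks past 4K.  */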
493 static const
494 struct processor_costs pentiumpro_cost = {
495 COSTS_N_INSNS (1), /* cost of an add instruction */
496 COSTS_N_INSNS (1), /* cost of a lea instruction */
497 COSTS_N_INSNS (1), /* variable shift costs */
498 COSTS_N_INSNS (1), /* constant shift costs */
499 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
500 COSTS_N_INSNS (4), /* HI */
501 COSTS_N_INSNS (4), /* SI */
502 COSTS_N_INSNS (4), /* DI */
503 COSTS_N_INSNS (4)}, /* other */
504 0, /* cost of multiply per each bit set */
505 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
506 COSTS_N_INSNS (17), /* HI */
507 COSTS_N_INSNS (17), /* SI */
508 COSTS_N_INSNS (17), /* DI */
509 COSTS_N_INSNS (17)}, /* other */
510 COSTS_N_INSNS (1), /* cost of movsx */
511 COSTS_N_INSNS (1), /* cost of movzx */
512 8, /* "large" insn */
513 6, /* MOVE_RATIO */
514 2, /* cost for loading QImode using movzbl */
515 {4, 4, 4}, /* cost of loading integer registers
516 in QImode, HImode and SImode.
517 Relative to reg-reg move (2). */
518 {2, 2, 2}, /* cost of storing integer registers */
519 2, /* cost of reg,reg fld/fst */
520 {2, 2, 6}, /* cost of loading fp registers
521 in SFmode, DFmode and XFmode */
522 {4, 4, 6}, /* cost of storing fp registers
523 in SFmode, DFmode and XFmode */
524 2, /* cost of moving MMX register */
525 {2, 2}, /* cost of loading MMX registers
526 in SImode and DImode */
527 {2, 2}, /* cost of storing MMX registers
528 in SImode and DImode */
529 2, /* cost of moving SSE register */
530 {2, 2, 8}, /* cost of loading SSE registers
531 in SImode, DImode and TImode */
532 {2, 2, 8}, /* cost of storing SSE registers
533 in SImode, DImode and TImode */
534 3, /* MMX or SSE register to integer */
535 8, /* size of l1 cache. */
536 256, /* size of l2 cache */
537 32, /* size of prefetch block */
538 6, /* number of parallel prefetches */
539 2, /* Branch cost */
540 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
541 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
542 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
543 COSTS_N_INSNS (2), /* cost of FABS instruction. */
544 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
545 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
546 pentiumpro_memcpy,
547 pentiumpro_memset,
548 1, /* scalar_stmt_cost. */
549 1, /* scalar load_cost. */
550 1, /* scalar_store_cost. */
551 1, /* vec_stmt_cost. */
552 1, /* vec_to_scalar_cost. */
553 1, /* scalar_to_vec_cost. */
554 1, /* vec_align_load_cost. */
555 2, /* vec_unalign_load_cost. */
556 1, /* vec_store_cost. */
557 3, /* cond_taken_branch_cost. */
558 1, /* cond_not_taken_branch_cost. */
559 };
561 static stringop_algs geode_memcpy[2] = {
562 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
563 DUMMY_STRINGOP_ALGS};
564 static stringop_algs geode_memset[2] = {
565 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
566 DUMMY_STRINGOP_ALGS};
567 static const
568 struct processor_costs geode_cost = {
569 COSTS_N_INSNS (1), /* cost of an add instruction */
570 COSTS_N_INSNS (1), /* cost of a lea instruction */
571 COSTS_N_INSNS (2), /* variable shift costs */
572 COSTS_N_INSNS (1), /* constant shift costs */
573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
574 COSTS_N_INSNS (4), /* HI */
575 COSTS_N_INSNS (7), /* SI */
576 COSTS_N_INSNS (7), /* DI */
577 COSTS_N_INSNS (7)}, /* other */
578 0, /* cost of multiply per each bit set */
579 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
580 COSTS_N_INSNS (23), /* HI */
581 COSTS_N_INSNS (39), /* SI */
582 COSTS_N_INSNS (39), /* DI */
583 COSTS_N_INSNS (39)}, /* other */
584 COSTS_N_INSNS (1), /* cost of movsx */
585 COSTS_N_INSNS (1), /* cost of movzx */
586 8, /* "large" insn */
587 4, /* MOVE_RATIO */
588 1, /* cost for loading QImode using movzbl */
589 {1, 1, 1}, /* cost of loading integer registers
590 in QImode, HImode and SImode.
591 Relative to reg-reg move (2). */
592 {1, 1, 1}, /* cost of storing integer registers */
593 1, /* cost of reg,reg fld/fst */
594 {1, 1, 1}, /* cost of loading fp registers
595 in SFmode, DFmode and XFmode */
596 {4, 6, 6}, /* cost of storing fp registers
597 in SFmode, DFmode and XFmode */
599 2, /* cost of moving MMX register */
600 {2, 2}, /* cost of loading MMX registers
601 in SImode and DImode */
602 {2, 2}, /* cost of storing MMX registers
603 in SImode and DImode */
604 2, /* cost of moving SSE register */
605 {2, 2, 8}, /* cost of loading SSE registers
606 in SImode, DImode and TImode */
607 {2, 2, 8}, /* cost of storing SSE registers
608 in SImode, DImode and TImode */
609 3, /* MMX or SSE register to integer */
610 64, /* size of l1 cache. */
611 128, /* size of l2 cache. */
612 32, /* size of prefetch block */
613 1, /* number of parallel prefetches */
614 1, /* Branch cost */
615 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
616 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
617 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
618 COSTS_N_INSNS (1), /* cost of FABS instruction. */
619 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
620 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
621 geode_memcpy,
622 geode_memset,
623 1, /* scalar_stmt_cost. */
624 1, /* scalar load_cost. */
625 1, /* scalar_store_cost. */
626 1, /* vec_stmt_cost. */
627 1, /* vec_to_scalar_cost. */
628 1, /* scalar_to_vec_cost. */
629 1, /* vec_align_load_cost. */
630 2, /* vec_unalign_load_cost. */
631 1, /* vec_store_cost. */
632 3, /* cond_taken_branch_cost. */
633 1, /* cond_not_taken_branch_cost. */
634 };
636 static stringop_algs k6_memcpy[2] = {
637 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
638 DUMMY_STRINGOP_ALGS};
639 static stringop_algs k6_memset[2] = {
640 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
641 DUMMY_STRINGOP_ALGS};
642 static const
643 struct processor_costs k6_cost = {
644 COSTS_N_INSNS (1), /* cost of an add instruction */
645 COSTS_N_INSNS (2), /* cost of a lea instruction */
646 COSTS_N_INSNS (1), /* variable shift costs */
647 COSTS_N_INSNS (1), /* constant shift costs */
648 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
649 COSTS_N_INSNS (3), /* HI */
650 COSTS_N_INSNS (3), /* SI */
651 COSTS_N_INSNS (3), /* DI */
652 COSTS_N_INSNS (3)}, /* other */
653 0, /* cost of multiply per each bit set */
654 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
655 COSTS_N_INSNS (18), /* HI */
656 COSTS_N_INSNS (18), /* SI */
657 COSTS_N_INSNS (18), /* DI */
658 COSTS_N_INSNS (18)}, /* other */
659 COSTS_N_INSNS (2), /* cost of movsx */
660 COSTS_N_INSNS (2), /* cost of movzx */
661 8, /* "large" insn */
662 4, /* MOVE_RATIO */
663 3, /* cost for loading QImode using movzbl */
664 {4, 5, 4}, /* cost of loading integer registers
665 in QImode, HImode and SImode.
666 Relative to reg-reg move (2). */
667 {2, 3, 2}, /* cost of storing integer registers */
668 4, /* cost of reg,reg fld/fst */
669 {6, 6, 6}, /* cost of loading fp registers
670 in SFmode, DFmode and XFmode */
671 {4, 4, 4}, /* cost of storing fp registers
672 in SFmode, DFmode and XFmode */
673 2, /* cost of moving MMX register */
674 {2, 2}, /* cost of loading MMX registers
675 in SImode and DImode */
676 {2, 2}, /* cost of storing MMX registers
677 in SImode and DImode */
678 2, /* cost of moving SSE register */
679 {2, 2, 8}, /* cost of loading SSE registers
680 in SImode, DImode and TImode */
681 {2, 2, 8}, /* cost of storing SSE registers
682 in SImode, DImode and TImode */
683 6, /* MMX or SSE register to integer */
684 32, /* size of l1 cache. */
685 32, /* size of l2 cache. Some models
686 have integrated l2 cache, but
687 optimizing for k6 is not important
688 enough to worry about that. */
689 32, /* size of prefetch block */
690 1, /* number of parallel prefetches */
691 1, /* Branch cost */
692 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
693 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
694 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
695 COSTS_N_INSNS (2), /* cost of FABS instruction. */
696 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
697 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
698 k6_memcpy,
699 k6_memset,
700 1, /* scalar_stmt_cost. */
701 1, /* scalar load_cost. */
702 1, /* scalar_store_cost. */
703 1, /* vec_stmt_cost. */
704 1, /* vec_to_scalar_cost. */
705 1, /* scalar_to_vec_cost. */
706 1, /* vec_align_load_cost. */
707 2, /* vec_unalign_load_cost. */
708 1, /* vec_store_cost. */
709 3, /* cond_taken_branch_cost. */
710 1, /* cond_not_taken_branch_cost. */
711 };
713 /* For some reason, Athlon deals better with REP prefix (relative to loops)
714 compared to K8. Alignment becomes important after 8 bytes for memcpy and
715 128 bytes for memset. */
716 static stringop_algs athlon_memcpy[2] = {
717 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
718 DUMMY_STRINGOP_ALGS};
719 static stringop_algs athlon_memset[2] = {
720 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
721 DUMMY_STRINGOP_ALGS};
722 static const
723 struct processor_costs athlon_cost = {
724 COSTS_N_INSNS (1), /* cost of an add instruction */
725 COSTS_N_INSNS (2), /* cost of a lea instruction */
726 COSTS_N_INSNS (1), /* variable shift costs */
727 COSTS_N_INSNS (1), /* constant shift costs */
728 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
729 COSTS_N_INSNS (5), /* HI */
730 COSTS_N_INSNS (5), /* SI */
731 COSTS_N_INSNS (5), /* DI */
732 COSTS_N_INSNS (5)}, /* other */
733 0, /* cost of multiply per each bit set */
734 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
735 COSTS_N_INSNS (26), /* HI */
736 COSTS_N_INSNS (42), /* SI */
737 COSTS_N_INSNS (74), /* DI */
738 COSTS_N_INSNS (74)}, /* other */
739 COSTS_N_INSNS (1), /* cost of movsx */
740 COSTS_N_INSNS (1), /* cost of movzx */
741 8, /* "large" insn */
742 9, /* MOVE_RATIO */
743 4, /* cost for loading QImode using movzbl */
744 {3, 4, 3}, /* cost of loading integer registers
745 in QImode, HImode and SImode.
746 Relative to reg-reg move (2). */
747 {3, 4, 3}, /* cost of storing integer registers */
748 4, /* cost of reg,reg fld/fst */
749 {4, 4, 12}, /* cost of loading fp registers
750 in SFmode, DFmode and XFmode */
751 {6, 6, 8}, /* cost of storing fp registers
752 in SFmode, DFmode and XFmode */
753 2, /* cost of moving MMX register */
754 {4, 4}, /* cost of loading MMX registers
755 in SImode and DImode */
756 {4, 4}, /* cost of storing MMX registers
757 in SImode and DImode */
758 2, /* cost of moving SSE register */
759 {4, 4, 6}, /* cost of loading SSE registers
760 in SImode, DImode and TImode */
761 {4, 4, 5}, /* cost of storing SSE registers
762 in SImode, DImode and TImode */
763 5, /* MMX or SSE register to integer */
764 64, /* size of l1 cache. */
765 256, /* size of l2 cache. */
766 64, /* size of prefetch block */
767 6, /* number of parallel prefetches */
768 5, /* Branch cost */
769 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
770 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
771 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
772 COSTS_N_INSNS (2), /* cost of FABS instruction. */
773 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
774 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
775 athlon_memcpy,
776 athlon_memset,
777 1, /* scalar_stmt_cost. */
778 1, /* scalar load_cost. */
779 1, /* scalar_store_cost. */
780 1, /* vec_stmt_cost. */
781 1, /* vec_to_scalar_cost. */
782 1, /* scalar_to_vec_cost. */
783 1, /* vec_align_load_cost. */
784 2, /* vec_unalign_load_cost. */
785 1, /* vec_store_cost. */
786 3, /* cond_taken_branch_cost. */
787 1, /* cond_not_taken_branch_cost. */
788 };
790 /* K8 has optimized REP instruction for medium sized blocks, but for very
791 small blocks it is better to use loop. For large blocks, libcall can
792 do nontemporary accesses and beat inline considerably. */
793 static stringop_algs k8_memcpy[2] = {
794 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
795 {-1, rep_prefix_4_byte, false}}},
796 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
797 {-1, libcall, false}}}};
798 static stringop_algs k8_memset[2] = {
799 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
800 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
801 {libcall, {{48, unrolled_loop, false},
802 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
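/* Illustrative note: K8 is the first table in this file with a real 64-bit
   row (the second initializer).  For 64-bit memcpy, known sizes up to 16
   bytes use an inline loop, sizes up to 8192 bytes use rep movsq
   (rep_prefix_8_byte), and larger or unknown sizes go through a library
   call, which, as the comment above notes, can use nontemporal accesses and
   beat the inline code.  */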
803 static const
804 struct processor_costs k8_cost = {
805 COSTS_N_INSNS (1), /* cost of an add instruction */
806 COSTS_N_INSNS (2), /* cost of a lea instruction */
807 COSTS_N_INSNS (1), /* variable shift costs */
808 COSTS_N_INSNS (1), /* constant shift costs */
809 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
810 COSTS_N_INSNS (4), /* HI */
811 COSTS_N_INSNS (3), /* SI */
812 COSTS_N_INSNS (4), /* DI */
813 COSTS_N_INSNS (5)}, /* other */
814 0, /* cost of multiply per each bit set */
815 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
816 COSTS_N_INSNS (26), /* HI */
817 COSTS_N_INSNS (42), /* SI */
818 COSTS_N_INSNS (74), /* DI */
819 COSTS_N_INSNS (74)}, /* other */
820 COSTS_N_INSNS (1), /* cost of movsx */
821 COSTS_N_INSNS (1), /* cost of movzx */
822 8, /* "large" insn */
823 9, /* MOVE_RATIO */
824 4, /* cost for loading QImode using movzbl */
825 {3, 4, 3}, /* cost of loading integer registers
826 in QImode, HImode and SImode.
827 Relative to reg-reg move (2). */
828 {3, 4, 3}, /* cost of storing integer registers */
829 4, /* cost of reg,reg fld/fst */
830 {4, 4, 12}, /* cost of loading fp registers
831 in SFmode, DFmode and XFmode */
832 {6, 6, 8}, /* cost of storing fp registers
833 in SFmode, DFmode and XFmode */
834 2, /* cost of moving MMX register */
835 {3, 3}, /* cost of loading MMX registers
836 in SImode and DImode */
837 {4, 4}, /* cost of storing MMX registers
838 in SImode and DImode */
839 2, /* cost of moving SSE register */
840 {4, 3, 6}, /* cost of loading SSE registers
841 in SImode, DImode and TImode */
842 {4, 4, 5}, /* cost of storing SSE registers
843 in SImode, DImode and TImode */
844 5, /* MMX or SSE register to integer */
845 64, /* size of l1 cache. */
846 512, /* size of l2 cache. */
847 64, /* size of prefetch block */
848 /* New AMD processors never drop prefetches; if they cannot be performed
849 immediately, they are queued. We set number of simultaneous prefetches
850 to a large constant to reflect this (it probably is not a good idea not
851 to limit number of prefetches at all, as their execution also takes some
852 time). */
853 100, /* number of parallel prefetches */
854 3, /* Branch cost */
855 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
856 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
857 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
858 COSTS_N_INSNS (2), /* cost of FABS instruction. */
859 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
860 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
862 k8_memcpy,
863 k8_memset,
864 4, /* scalar_stmt_cost. */
865 2, /* scalar load_cost. */
866 2, /* scalar_store_cost. */
867 5, /* vec_stmt_cost. */
868 0, /* vec_to_scalar_cost. */
869 2, /* scalar_to_vec_cost. */
870 2, /* vec_align_load_cost. */
871 3, /* vec_unalign_load_cost. */
872 3, /* vec_store_cost. */
873 3, /* cond_taken_branch_cost. */
874 2, /* cond_not_taken_branch_cost. */
875 };
877 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
878 very small blocks it is better to use loop. For large blocks, libcall can
879 do nontemporary accesses and beat inline considerably. */
880 static stringop_algs amdfam10_memcpy[2] = {
881 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
882 {-1, rep_prefix_4_byte, false}}},
883 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
884 {-1, libcall, false}}}};
885 static stringop_algs amdfam10_memset[2] = {
886 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
887 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
888 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
889 {-1, libcall, false}}}};
890 struct processor_costs amdfam10_cost = {
891 COSTS_N_INSNS (1), /* cost of an add instruction */
892 COSTS_N_INSNS (2), /* cost of a lea instruction */
893 COSTS_N_INSNS (1), /* variable shift costs */
894 COSTS_N_INSNS (1), /* constant shift costs */
895 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
896 COSTS_N_INSNS (4), /* HI */
897 COSTS_N_INSNS (3), /* SI */
898 COSTS_N_INSNS (4), /* DI */
899 COSTS_N_INSNS (5)}, /* other */
900 0, /* cost of multiply per each bit set */
901 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
902 COSTS_N_INSNS (35), /* HI */
903 COSTS_N_INSNS (51), /* SI */
904 COSTS_N_INSNS (83), /* DI */
905 COSTS_N_INSNS (83)}, /* other */
906 COSTS_N_INSNS (1), /* cost of movsx */
907 COSTS_N_INSNS (1), /* cost of movzx */
908 8, /* "large" insn */
909 9, /* MOVE_RATIO */
910 4, /* cost for loading QImode using movzbl */
911 {3, 4, 3}, /* cost of loading integer registers
912 in QImode, HImode and SImode.
913 Relative to reg-reg move (2). */
914 {3, 4, 3}, /* cost of storing integer registers */
915 4, /* cost of reg,reg fld/fst */
916 {4, 4, 12}, /* cost of loading fp registers
917 in SFmode, DFmode and XFmode */
918 {6, 6, 8}, /* cost of storing fp registers
919 in SFmode, DFmode and XFmode */
920 2, /* cost of moving MMX register */
921 {3, 3}, /* cost of loading MMX registers
922 in SImode and DImode */
923 {4, 4}, /* cost of storing MMX registers
924 in SImode and DImode */
925 2, /* cost of moving SSE register */
926 {4, 4, 3}, /* cost of loading SSE registers
927 in SImode, DImode and TImode */
928 {4, 4, 5}, /* cost of storing SSE registers
929 in SImode, DImode and TImode */
930 3, /* MMX or SSE register to integer */
931 /* On K8:
932 MOVD reg64, xmmreg Double FSTORE 4
933 MOVD reg32, xmmreg Double FSTORE 4
934 On AMDFAM10:
935 MOVD reg64, xmmreg Double FADD 3
936 1/1 1/1
937 MOVD reg32, xmmreg Double FADD 3
938 1/1 1/1 */
939 64, /* size of l1 cache. */
940 512, /* size of l2 cache. */
941 64, /* size of prefetch block */
942 /* New AMD processors never drop prefetches; if they cannot be performed
943 immediately, they are queued. We set number of simultaneous prefetches
944 to a large constant to reflect this (it probably is not a good idea not
945 to limit number of prefetches at all, as their execution also takes some
946 time). */
947 100, /* number of parallel prefetches */
948 2, /* Branch cost */
949 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
950 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
951 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
952 COSTS_N_INSNS (2), /* cost of FABS instruction. */
953 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
954 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
956 amdfam10_memcpy,
957 amdfam10_memset,
958 4, /* scalar_stmt_cost. */
959 2, /* scalar load_cost. */
960 2, /* scalar_store_cost. */
961 6, /* vec_stmt_cost. */
962 0, /* vec_to_scalar_cost. */
963 2, /* scalar_to_vec_cost. */
964 2, /* vec_align_load_cost. */
965 2, /* vec_unalign_load_cost. */
966 2, /* vec_store_cost. */
967 2, /* cond_taken_branch_cost. */
968 1, /* cond_not_taken_branch_cost. */
969 };
971 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
972 very small blocks it is better to use loop. For large blocks, libcall
973 can do nontemporary accesses and beat inline considerably. */
974 static stringop_algs bdver1_memcpy[2] = {
975 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
976 {-1, rep_prefix_4_byte, false}}},
977 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
978 {-1, libcall, false}}}};
979 static stringop_algs bdver1_memset[2] = {
980 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
981 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
982 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
983 {-1, libcall, false}}}};
985 const struct processor_costs bdver1_cost = {
986 COSTS_N_INSNS (1), /* cost of an add instruction */
987 COSTS_N_INSNS (1), /* cost of a lea instruction */
988 COSTS_N_INSNS (1), /* variable shift costs */
989 COSTS_N_INSNS (1), /* constant shift costs */
990 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
991 COSTS_N_INSNS (4), /* HI */
992 COSTS_N_INSNS (4), /* SI */
993 COSTS_N_INSNS (6), /* DI */
994 COSTS_N_INSNS (6)}, /* other */
995 0, /* cost of multiply per each bit set */
996 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
997 COSTS_N_INSNS (35), /* HI */
998 COSTS_N_INSNS (51), /* SI */
999 COSTS_N_INSNS (83), /* DI */
1000 COSTS_N_INSNS (83)}, /* other */
1001 COSTS_N_INSNS (1), /* cost of movsx */
1002 COSTS_N_INSNS (1), /* cost of movzx */
1003 8, /* "large" insn */
1004 9, /* MOVE_RATIO */
1005 4, /* cost for loading QImode using movzbl */
1006 {5, 5, 4}, /* cost of loading integer registers
1007 in QImode, HImode and SImode.
1008 Relative to reg-reg move (2). */
1009 {4, 4, 4}, /* cost of storing integer registers */
1010 2, /* cost of reg,reg fld/fst */
1011 {5, 5, 12}, /* cost of loading fp registers
1012 in SFmode, DFmode and XFmode */
1013 {4, 4, 8}, /* cost of storing fp registers
1014 in SFmode, DFmode and XFmode */
1015 2, /* cost of moving MMX register */
1016 {4, 4}, /* cost of loading MMX registers
1017 in SImode and DImode */
1018 {4, 4}, /* cost of storing MMX registers
1019 in SImode and DImode */
1020 2, /* cost of moving SSE register */
1021 {4, 4, 4}, /* cost of loading SSE registers
1022 in SImode, DImode and TImode */
1023 {4, 4, 4}, /* cost of storing SSE registers
1024 in SImode, DImode and TImode */
1025 2, /* MMX or SSE register to integer */
1026 /* On K8:
1027 MOVD reg64, xmmreg Double FSTORE 4
1028 MOVD reg32, xmmreg Double FSTORE 4
1029 On AMDFAM10:
1030 MOVD reg64, xmmreg Double FADD 3
1031 1/1 1/1
1032 MOVD reg32, xmmreg Double FADD 3
1033 1/1 1/1 */
1034 16, /* size of l1 cache. */
1035 2048, /* size of l2 cache. */
1036 64, /* size of prefetch block */
1037 /* New AMD processors never drop prefetches; if they cannot be performed
1038 immediately, they are queued. We set number of simultaneous prefetches
1039 to a large constant to reflect this (it probably is not a good idea not
1040 to limit number of prefetches at all, as their execution also takes some
1041 time). */
1042 100, /* number of parallel prefetches */
1043 2, /* Branch cost */
1044 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1045 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1046 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1047 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1048 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1049 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1051 bdver1_memcpy,
1052 bdver1_memset,
1053 6, /* scalar_stmt_cost. */
1054 4, /* scalar load_cost. */
1055 4, /* scalar_store_cost. */
1056 6, /* vec_stmt_cost. */
1057 0, /* vec_to_scalar_cost. */
1058 2, /* scalar_to_vec_cost. */
1059 4, /* vec_align_load_cost. */
1060 4, /* vec_unalign_load_cost. */
1061 4, /* vec_store_cost. */
1062 4, /* cond_taken_branch_cost. */
1063 2, /* cond_not_taken_branch_cost. */
1064 };
1066 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1067 very small blocks it is better to use loop. For large blocks, libcall
1068 can do nontemporary accesses and beat inline considerably. */
1070 static stringop_algs bdver2_memcpy[2] = {
1071 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1072 {-1, rep_prefix_4_byte, false}}},
1073 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1074 {-1, libcall, false}}}};
1075 static stringop_algs bdver2_memset[2] = {
1076 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1077 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1078 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1079 {-1, libcall, false}}}};
1081 const struct processor_costs bdver2_cost = {
1082 COSTS_N_INSNS (1), /* cost of an add instruction */
1083 COSTS_N_INSNS (1), /* cost of a lea instruction */
1084 COSTS_N_INSNS (1), /* variable shift costs */
1085 COSTS_N_INSNS (1), /* constant shift costs */
1086 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1087 COSTS_N_INSNS (4), /* HI */
1088 COSTS_N_INSNS (4), /* SI */
1089 COSTS_N_INSNS (6), /* DI */
1090 COSTS_N_INSNS (6)}, /* other */
1091 0, /* cost of multiply per each bit set */
1092 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1093 COSTS_N_INSNS (35), /* HI */
1094 COSTS_N_INSNS (51), /* SI */
1095 COSTS_N_INSNS (83), /* DI */
1096 COSTS_N_INSNS (83)}, /* other */
1097 COSTS_N_INSNS (1), /* cost of movsx */
1098 COSTS_N_INSNS (1), /* cost of movzx */
1099 8, /* "large" insn */
1100 9, /* MOVE_RATIO */
1101 4, /* cost for loading QImode using movzbl */
1102 {5, 5, 4}, /* cost of loading integer registers
1103 in QImode, HImode and SImode.
1104 Relative to reg-reg move (2). */
1105 {4, 4, 4}, /* cost of storing integer registers */
1106 2, /* cost of reg,reg fld/fst */
1107 {5, 5, 12}, /* cost of loading fp registers
1108 in SFmode, DFmode and XFmode */
1109 {4, 4, 8}, /* cost of storing fp registers
1110 in SFmode, DFmode and XFmode */
1111 2, /* cost of moving MMX register */
1112 {4, 4}, /* cost of loading MMX registers
1113 in SImode and DImode */
1114 {4, 4}, /* cost of storing MMX registers
1115 in SImode and DImode */
1116 2, /* cost of moving SSE register */
1117 {4, 4, 4}, /* cost of loading SSE registers
1118 in SImode, DImode and TImode */
1119 {4, 4, 4}, /* cost of storing SSE registers
1120 in SImode, DImode and TImode */
1121 2, /* MMX or SSE register to integer */
1122 /* On K8:
1123 MOVD reg64, xmmreg Double FSTORE 4
1124 MOVD reg32, xmmreg Double FSTORE 4
1125 On AMDFAM10:
1126 MOVD reg64, xmmreg Double FADD 3
1127 1/1 1/1
1128 MOVD reg32, xmmreg Double FADD 3
1129 1/1 1/1 */
1130 16, /* size of l1 cache. */
1131 2048, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 2, /* Branch cost */
1140 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1147 bdver2_memcpy,
1148 bdver2_memset,
1149 6, /* scalar_stmt_cost. */
1150 4, /* scalar load_cost. */
1151 4, /* scalar_store_cost. */
1152 6, /* vec_stmt_cost. */
1153 0, /* vec_to_scalar_cost. */
1154 2, /* scalar_to_vec_cost. */
1155 4, /* vec_align_load_cost. */
1156 4, /* vec_unalign_load_cost. */
1157 4, /* vec_store_cost. */
1158 4, /* cond_taken_branch_cost. */
1159 2, /* cond_not_taken_branch_cost. */
1160 };
1163 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1164 very small blocks it is better to use loop. For large blocks, libcall
1165 can do nontemporary accesses and beat inline considerably. */
1166 static stringop_algs bdver3_memcpy[2] = {
1167 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1168 {-1, rep_prefix_4_byte, false}}},
1169 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}};
1171 static stringop_algs bdver3_memset[2] = {
1172 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1173 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1174 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1175 {-1, libcall, false}}}};
1176 struct processor_costs bdver3_cost = {
1177 COSTS_N_INSNS (1), /* cost of an add instruction */
1178 COSTS_N_INSNS (1), /* cost of a lea instruction */
1179 COSTS_N_INSNS (1), /* variable shift costs */
1180 COSTS_N_INSNS (1), /* constant shift costs */
1181 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1182 COSTS_N_INSNS (4), /* HI */
1183 COSTS_N_INSNS (4), /* SI */
1184 COSTS_N_INSNS (6), /* DI */
1185 COSTS_N_INSNS (6)}, /* other */
1186 0, /* cost of multiply per each bit set */
1187 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1188 COSTS_N_INSNS (35), /* HI */
1189 COSTS_N_INSNS (51), /* SI */
1190 COSTS_N_INSNS (83), /* DI */
1191 COSTS_N_INSNS (83)}, /* other */
1192 COSTS_N_INSNS (1), /* cost of movsx */
1193 COSTS_N_INSNS (1), /* cost of movzx */
1194 8, /* "large" insn */
1195 9, /* MOVE_RATIO */
1196 4, /* cost for loading QImode using movzbl */
1197 {5, 5, 4}, /* cost of loading integer registers
1198 in QImode, HImode and SImode.
1199 Relative to reg-reg move (2). */
1200 {4, 4, 4}, /* cost of storing integer registers */
1201 2, /* cost of reg,reg fld/fst */
1202 {5, 5, 12}, /* cost of loading fp registers
1203 in SFmode, DFmode and XFmode */
1204 {4, 4, 8}, /* cost of storing fp registers
1205 in SFmode, DFmode and XFmode */
1206 2, /* cost of moving MMX register */
1207 {4, 4}, /* cost of loading MMX registers
1208 in SImode and DImode */
1209 {4, 4}, /* cost of storing MMX registers
1210 in SImode and DImode */
1211 2, /* cost of moving SSE register */
1212 {4, 4, 4}, /* cost of loading SSE registers
1213 in SImode, DImode and TImode */
1214 {4, 4, 4}, /* cost of storing SSE registers
1215 in SImode, DImode and TImode */
1216 2, /* MMX or SSE register to integer */
1217 16, /* size of l1 cache. */
1218 2048, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set number of simultaneous prefetches
1222 to a large constant to reflect this (it probably is not a good idea not
1223 to limit number of prefetches at all, as their execution also takes some
1224 time). */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1234 bdver3_memcpy,
1235 bdver3_memset,
1236 6, /* scalar_stmt_cost. */
1237 4, /* scalar load_cost. */
1238 4, /* scalar_store_cost. */
1239 6, /* vec_stmt_cost. */
1240 0, /* vec_to_scalar_cost. */
1241 2, /* scalar_to_vec_cost. */
1242 4, /* vec_align_load_cost. */
1243 4, /* vec_unalign_load_cost. */
1244 4, /* vec_store_cost. */
1245 4, /* cond_taken_branch_cost. */
1246 2, /* cond_not_taken_branch_cost. */
1247 };
1249 /* BDVER4 has optimized REP instruction for medium sized blocks, but for
1250 very small blocks it is better to use loop. For large blocks, libcall
1251 can do nontemporary accesses and beat inline considerably. */
1252 static stringop_algs bdver4_memcpy[2] = {
1253 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1254 {-1, rep_prefix_4_byte, false}}},
1255 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1256 {-1, libcall, false}}}};
1257 static stringop_algs bdver4_memset[2] = {
1258 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1259 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1260 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1261 {-1, libcall, false}}}};
1262 struct processor_costs bdver4_cost = {
1263 COSTS_N_INSNS (1), /* cost of an add instruction */
1264 COSTS_N_INSNS (1), /* cost of a lea instruction */
1265 COSTS_N_INSNS (1), /* variable shift costs */
1266 COSTS_N_INSNS (1), /* constant shift costs */
1267 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1268 COSTS_N_INSNS (4), /* HI */
1269 COSTS_N_INSNS (4), /* SI */
1270 COSTS_N_INSNS (6), /* DI */
1271 COSTS_N_INSNS (6)}, /* other */
1272 0, /* cost of multiply per each bit set */
1273 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1274 COSTS_N_INSNS (35), /* HI */
1275 COSTS_N_INSNS (51), /* SI */
1276 COSTS_N_INSNS (83), /* DI */
1277 COSTS_N_INSNS (83)}, /* other */
1278 COSTS_N_INSNS (1), /* cost of movsx */
1279 COSTS_N_INSNS (1), /* cost of movzx */
1280 8, /* "large" insn */
1281 9, /* MOVE_RATIO */
1282 4, /* cost for loading QImode using movzbl */
1283 {5, 5, 4}, /* cost of loading integer registers
1284 in QImode, HImode and SImode.
1285 Relative to reg-reg move (2). */
1286 {4, 4, 4}, /* cost of storing integer registers */
1287 2, /* cost of reg,reg fld/fst */
1288 {5, 5, 12}, /* cost of loading fp registers
1289 in SFmode, DFmode and XFmode */
1290 {4, 4, 8}, /* cost of storing fp registers
1291 in SFmode, DFmode and XFmode */
1292 2, /* cost of moving MMX register */
1293 {4, 4}, /* cost of loading MMX registers
1294 in SImode and DImode */
1295 {4, 4}, /* cost of storing MMX registers
1296 in SImode and DImode */
1297 2, /* cost of moving SSE register */
1298 {4, 4, 4}, /* cost of loading SSE registers
1299 in SImode, DImode and TImode */
1300 {4, 4, 4}, /* cost of storing SSE registers
1301 in SImode, DImode and TImode */
1302 2, /* MMX or SSE register to integer */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320 bdver4_memcpy,
1321 bdver4_memset,
1322 6, /* scalar_stmt_cost. */
1323 4, /* scalar load_cost. */
1324 4, /* scalar_store_cost. */
1325 6, /* vec_stmt_cost. */
1326 0, /* vec_to_scalar_cost. */
1327 2, /* scalar_to_vec_cost. */
1328 4, /* vec_align_load_cost. */
1329 4, /* vec_unalign_load_cost. */
1330 4, /* vec_store_cost. */
1331 4, /* cond_taken_branch_cost. */
1332 2, /* cond_not_taken_branch_cost. */
1333 };
1336 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1337 very small blocks it is better to use loop. For large blocks, libcall
1338 can do nontemporary accesses and beat inline considerably. */
1339 static stringop_algs znver1_memcpy[2] = {
1340 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1341 {-1, rep_prefix_4_byte, false}}},
1342 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1343 {-1, libcall, false}}}};
1344 static stringop_algs znver1_memset[2] = {
1345 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1346 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1347 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1348 {-1, libcall, false}}}};
1349 struct processor_costs znver1_cost = {
1350 COSTS_N_INSNS (1), /* cost of an add instruction. */
1351 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1352 COSTS_N_INSNS (1), /* variable shift costs. */
1353 COSTS_N_INSNS (1), /* constant shift costs. */
1354 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1355 COSTS_N_INSNS (3), /* HI. */
1356 COSTS_N_INSNS (3), /* SI. */
1357 COSTS_N_INSNS (4), /* DI. */
1358 COSTS_N_INSNS (4)}, /* other. */
1359 0, /* cost of multiply per each bit
1360 set. */
1361 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1362 COSTS_N_INSNS (35), /* HI. */
1363 COSTS_N_INSNS (51), /* SI. */
1364 COSTS_N_INSNS (83), /* DI. */
1365 COSTS_N_INSNS (83)}, /* other. */
1366 COSTS_N_INSNS (1), /* cost of movsx. */
1367 COSTS_N_INSNS (1), /* cost of movzx. */
1368 8, /* "large" insn. */
1369 9, /* MOVE_RATIO. */
1370 4, /* cost for loading QImode using
1371 movzbl. */
1372 {5, 5, 4}, /* cost of loading integer registers
1373 in QImode, HImode and SImode.
1374 Relative to reg-reg move (2). */
1375 {4, 4, 4}, /* cost of storing integer
1376 registers. */
1377 2, /* cost of reg,reg fld/fst. */
1378 {5, 5, 12}, /* cost of loading fp registers
1379 in SFmode, DFmode and XFmode. */
1380 {4, 4, 8}, /* cost of storing fp registers
1381 in SFmode, DFmode and XFmode. */
1382 2, /* cost of moving MMX register. */
1383 {4, 4}, /* cost of loading MMX registers
1384 in SImode and DImode. */
1385 {4, 4}, /* cost of storing MMX registers
1386 in SImode and DImode. */
1387 2, /* cost of moving SSE register. */
1388 {4, 4, 4}, /* cost of loading SSE registers
1389 in SImode, DImode and TImode. */
1390 {4, 4, 4}, /* cost of storing SSE registers
1391 in SImode, DImode and TImode. */
1392 2, /* MMX or SSE register to integer. */
1393 32, /* size of l1 cache. */
1394 512, /* size of l2 cache. */
1395 64, /* size of prefetch block. */
1396 /* New AMD processors never drop prefetches; if they cannot be performed
1397 immediately, they are queued. We set number of simultaneous prefetches
1398 to a large constant to reflect this (it probably is not a good idea not
1399 to limit number of prefetches at all, as their execution also takes some
1400 time). */
1401 100, /* number of parallel prefetches. */
1402 2, /* Branch cost. */
1403 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1404 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1405 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1406 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1407 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1408 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1410 znver1_memcpy,
1411 znver1_memset,
1412 6, /* scalar_stmt_cost. */
1413 4, /* scalar load_cost. */
1414 4, /* scalar_store_cost. */
1415 6, /* vec_stmt_cost. */
1416 0, /* vec_to_scalar_cost. */
1417 2, /* scalar_to_vec_cost. */
1418 4, /* vec_align_load_cost. */
1419 4, /* vec_unalign_load_cost. */
1420 4, /* vec_store_cost. */
1421 4, /* cond_taken_branch_cost. */
1422 2, /* cond_not_taken_branch_cost. */
1423 };
1425 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1426 very small blocks it is better to use a loop.  For large blocks, a libcall
1427 can do non-temporal accesses and beat the inline expansion considerably. */
1428 static stringop_algs btver1_memcpy[2] = {
1429 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1430 {-1, rep_prefix_4_byte, false}}},
1431 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1432 {-1, libcall, false}}}};
1433 static stringop_algs btver1_memset[2] = {
1434 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1435 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1436 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1437 {-1, libcall, false}}}};
1438 const struct processor_costs btver1_cost = {
1439 COSTS_N_INSNS (1), /* cost of an add instruction */
1440 COSTS_N_INSNS (2), /* cost of a lea instruction */
1441 COSTS_N_INSNS (1), /* variable shift costs */
1442 COSTS_N_INSNS (1), /* constant shift costs */
1443 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1444 COSTS_N_INSNS (4), /* HI */
1445 COSTS_N_INSNS (3), /* SI */
1446 COSTS_N_INSNS (4), /* DI */
1447 COSTS_N_INSNS (5)}, /* other */
1448 0, /* cost of multiply per each bit set */
1449 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1450 COSTS_N_INSNS (35), /* HI */
1451 COSTS_N_INSNS (51), /* SI */
1452 COSTS_N_INSNS (83), /* DI */
1453 COSTS_N_INSNS (83)}, /* other */
1454 COSTS_N_INSNS (1), /* cost of movsx */
1455 COSTS_N_INSNS (1), /* cost of movzx */
1456 8, /* "large" insn */
1457 9, /* MOVE_RATIO */
1458 4, /* cost for loading QImode using movzbl */
1459 {3, 4, 3}, /* cost of loading integer registers
1460 in QImode, HImode and SImode.
1461 Relative to reg-reg move (2). */
1462 {3, 4, 3}, /* cost of storing integer registers */
1463 4, /* cost of reg,reg fld/fst */
1464 {4, 4, 12}, /* cost of loading fp registers
1465 in SFmode, DFmode and XFmode */
1466 {6, 6, 8}, /* cost of storing fp registers
1467 in SFmode, DFmode and XFmode */
1468 2, /* cost of moving MMX register */
1469 {3, 3}, /* cost of loading MMX registers
1470 in SImode and DImode */
1471 {4, 4}, /* cost of storing MMX registers
1472 in SImode and DImode */
1473 2, /* cost of moving SSE register */
1474 {4, 4, 3}, /* cost of loading SSE registers
1475 in SImode, DImode and TImode */
1476 {4, 4, 5}, /* cost of storing SSE registers
1477 in SImode, DImode and TImode */
1478 3, /* MMX or SSE register to integer */
1479 /* On K8:
1480 MOVD reg64, xmmreg Double FSTORE 4
1481 MOVD reg32, xmmreg Double FSTORE 4
1482 On AMDFAM10:
1483 MOVD reg64, xmmreg Double FADD 3
1484 1/1 1/1
1485 MOVD reg32, xmmreg Double FADD 3
1486 1/1 1/1 */
1487 32, /* size of l1 cache. */
1488 512, /* size of l2 cache. */
1489 64, /* size of prefetch block */
1490 100, /* number of parallel prefetches */
1491 2, /* Branch cost */
1492 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1493 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1494 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1495 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1496 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1497 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1499 btver1_memcpy,
1500 btver1_memset,
1501 4, /* scalar_stmt_cost. */
1502 2, /* scalar load_cost. */
1503 2, /* scalar_store_cost. */
1504 6, /* vec_stmt_cost. */
1505 0, /* vec_to_scalar_cost. */
1506 2, /* scalar_to_vec_cost. */
1507 2, /* vec_align_load_cost. */
1508 2, /* vec_unalign_load_cost. */
1509 2, /* vec_store_cost. */
1510 2, /* cond_taken_branch_cost. */
1511 1, /* cond_not_taken_branch_cost. */
1514 static stringop_algs btver2_memcpy[2] = {
1515 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1516 {-1, rep_prefix_4_byte, false}}},
1517 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1518 {-1, libcall, false}}}};
1519 static stringop_algs btver2_memset[2] = {
1520 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1521 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1522 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1523 {-1, libcall, false}}}};
1524 const struct processor_costs btver2_cost = {
1525 COSTS_N_INSNS (1), /* cost of an add instruction */
1526 COSTS_N_INSNS (2), /* cost of a lea instruction */
1527 COSTS_N_INSNS (1), /* variable shift costs */
1528 COSTS_N_INSNS (1), /* constant shift costs */
1529 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1530 COSTS_N_INSNS (4), /* HI */
1531 COSTS_N_INSNS (3), /* SI */
1532 COSTS_N_INSNS (4), /* DI */
1533 COSTS_N_INSNS (5)}, /* other */
1534 0, /* cost of multiply per each bit set */
1535 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1536 COSTS_N_INSNS (35), /* HI */
1537 COSTS_N_INSNS (51), /* SI */
1538 COSTS_N_INSNS (83), /* DI */
1539 COSTS_N_INSNS (83)}, /* other */
1540 COSTS_N_INSNS (1), /* cost of movsx */
1541 COSTS_N_INSNS (1), /* cost of movzx */
1542 8, /* "large" insn */
1543 9, /* MOVE_RATIO */
1544 4, /* cost for loading QImode using movzbl */
1545 {3, 4, 3}, /* cost of loading integer registers
1546 in QImode, HImode and SImode.
1547 Relative to reg-reg move (2). */
1548 {3, 4, 3}, /* cost of storing integer registers */
1549 4, /* cost of reg,reg fld/fst */
1550 {4, 4, 12}, /* cost of loading fp registers
1551 in SFmode, DFmode and XFmode */
1552 {6, 6, 8}, /* cost of storing fp registers
1553 in SFmode, DFmode and XFmode */
1554 2, /* cost of moving MMX register */
1555 {3, 3}, /* cost of loading MMX registers
1556 in SImode and DImode */
1557 {4, 4}, /* cost of storing MMX registers
1558 in SImode and DImode */
1559 2, /* cost of moving SSE register */
1560 {4, 4, 3}, /* cost of loading SSE registers
1561 in SImode, DImode and TImode */
1562 {4, 4, 5}, /* cost of storing SSE registers
1563 in SImode, DImode and TImode */
1564 3, /* MMX or SSE register to integer */
1565 /* On K8:
1566 MOVD reg64, xmmreg Double FSTORE 4
1567 MOVD reg32, xmmreg Double FSTORE 4
1568 On AMDFAM10:
1569 MOVD reg64, xmmreg Double FADD 3
1570 1/1 1/1
1571 MOVD reg32, xmmreg Double FADD 3
1572 1/1 1/1 */
1573 32, /* size of l1 cache. */
1574 2048, /* size of l2 cache. */
1575 64, /* size of prefetch block */
1576 100, /* number of parallel prefetches */
1577 2, /* Branch cost */
1578 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1579 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1580 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1581 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1582 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1583 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1584 btver2_memcpy,
1585 btver2_memset,
1586 4, /* scalar_stmt_cost. */
1587 2, /* scalar load_cost. */
1588 2, /* scalar_store_cost. */
1589 6, /* vec_stmt_cost. */
1590 0, /* vec_to_scalar_cost. */
1591 2, /* scalar_to_vec_cost. */
1592 2, /* vec_align_load_cost. */
1593 2, /* vec_unalign_load_cost. */
1594 2, /* vec_store_cost. */
1595 2, /* cond_taken_branch_cost. */
1596 1, /* cond_not_taken_branch_cost. */
1599 static stringop_algs pentium4_memcpy[2] = {
1600 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1601 DUMMY_STRINGOP_ALGS};
1602 static stringop_algs pentium4_memset[2] = {
1603 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1604 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1605 DUMMY_STRINGOP_ALGS};
1607 static const
1608 struct processor_costs pentium4_cost = {
1609 COSTS_N_INSNS (1), /* cost of an add instruction */
1610 COSTS_N_INSNS (3), /* cost of a lea instruction */
1611 COSTS_N_INSNS (4), /* variable shift costs */
1612 COSTS_N_INSNS (4), /* constant shift costs */
1613 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1614 COSTS_N_INSNS (15), /* HI */
1615 COSTS_N_INSNS (15), /* SI */
1616 COSTS_N_INSNS (15), /* DI */
1617 COSTS_N_INSNS (15)}, /* other */
1618 0, /* cost of multiply per each bit set */
1619 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1620 COSTS_N_INSNS (56), /* HI */
1621 COSTS_N_INSNS (56), /* SI */
1622 COSTS_N_INSNS (56), /* DI */
1623 COSTS_N_INSNS (56)}, /* other */
1624 COSTS_N_INSNS (1), /* cost of movsx */
1625 COSTS_N_INSNS (1), /* cost of movzx */
1626 16, /* "large" insn */
1627 6, /* MOVE_RATIO */
1628 2, /* cost for loading QImode using movzbl */
1629 {4, 5, 4}, /* cost of loading integer registers
1630 in QImode, HImode and SImode.
1631 Relative to reg-reg move (2). */
1632 {2, 3, 2}, /* cost of storing integer registers */
1633 2, /* cost of reg,reg fld/fst */
1634 {2, 2, 6}, /* cost of loading fp registers
1635 in SFmode, DFmode and XFmode */
1636 {4, 4, 6}, /* cost of storing fp registers
1637 in SFmode, DFmode and XFmode */
1638 2, /* cost of moving MMX register */
1639 {2, 2}, /* cost of loading MMX registers
1640 in SImode and DImode */
1641 {2, 2}, /* cost of storing MMX registers
1642 in SImode and DImode */
1643 12, /* cost of moving SSE register */
1644 {12, 12, 12}, /* cost of loading SSE registers
1645 in SImode, DImode and TImode */
1646 {2, 2, 8}, /* cost of storing SSE registers
1647 in SImode, DImode and TImode */
1648 10, /* MMX or SSE register to integer */
1649 8, /* size of l1 cache. */
1650 256, /* size of l2 cache. */
1651 64, /* size of prefetch block */
1652 6, /* number of parallel prefetches */
1653 2, /* Branch cost */
1654 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1655 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1656 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1657 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1658 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1659 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1660 pentium4_memcpy,
1661 pentium4_memset,
1662 1, /* scalar_stmt_cost. */
1663 1, /* scalar load_cost. */
1664 1, /* scalar_store_cost. */
1665 1, /* vec_stmt_cost. */
1666 1, /* vec_to_scalar_cost. */
1667 1, /* scalar_to_vec_cost. */
1668 1, /* vec_align_load_cost. */
1669 2, /* vec_unalign_load_cost. */
1670 1, /* vec_store_cost. */
1671 3, /* cond_taken_branch_cost. */
1672 1, /* cond_not_taken_branch_cost. */
1675 static stringop_algs nocona_memcpy[2] = {
1676 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1677 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1678 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1680 static stringop_algs nocona_memset[2] = {
1681 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1682 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1683 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1684 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1686 static const
1687 struct processor_costs nocona_cost = {
1688 COSTS_N_INSNS (1), /* cost of an add instruction */
1689 COSTS_N_INSNS (1), /* cost of a lea instruction */
1690 COSTS_N_INSNS (1), /* variable shift costs */
1691 COSTS_N_INSNS (1), /* constant shift costs */
1692 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1693 COSTS_N_INSNS (10), /* HI */
1694 COSTS_N_INSNS (10), /* SI */
1695 COSTS_N_INSNS (10), /* DI */
1696 COSTS_N_INSNS (10)}, /* other */
1697 0, /* cost of multiply per each bit set */
1698 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1699 COSTS_N_INSNS (66), /* HI */
1700 COSTS_N_INSNS (66), /* SI */
1701 COSTS_N_INSNS (66), /* DI */
1702 COSTS_N_INSNS (66)}, /* other */
1703 COSTS_N_INSNS (1), /* cost of movsx */
1704 COSTS_N_INSNS (1), /* cost of movzx */
1705 16, /* "large" insn */
1706 17, /* MOVE_RATIO */
1707 4, /* cost for loading QImode using movzbl */
1708 {4, 4, 4}, /* cost of loading integer registers
1709 in QImode, HImode and SImode.
1710 Relative to reg-reg move (2). */
1711 {4, 4, 4}, /* cost of storing integer registers */
1712 3, /* cost of reg,reg fld/fst */
1713 {12, 12, 12}, /* cost of loading fp registers
1714 in SFmode, DFmode and XFmode */
1715 {4, 4, 4}, /* cost of storing fp registers
1716 in SFmode, DFmode and XFmode */
1717 6, /* cost of moving MMX register */
1718 {12, 12}, /* cost of loading MMX registers
1719 in SImode and DImode */
1720 {12, 12}, /* cost of storing MMX registers
1721 in SImode and DImode */
1722 6, /* cost of moving SSE register */
1723 {12, 12, 12}, /* cost of loading SSE registers
1724 in SImode, DImode and TImode */
1725 {12, 12, 12}, /* cost of storing SSE registers
1726 in SImode, DImode and TImode */
1727 8, /* MMX or SSE register to integer */
1728 8, /* size of l1 cache. */
1729 1024, /* size of l2 cache. */
1730 64, /* size of prefetch block */
1731 8, /* number of parallel prefetches */
1732 1, /* Branch cost */
1733 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1734 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1735 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1736 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1737 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1738 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1739 nocona_memcpy,
1740 nocona_memset,
1741 1, /* scalar_stmt_cost. */
1742 1, /* scalar load_cost. */
1743 1, /* scalar_store_cost. */
1744 1, /* vec_stmt_cost. */
1745 1, /* vec_to_scalar_cost. */
1746 1, /* scalar_to_vec_cost. */
1747 1, /* vec_align_load_cost. */
1748 2, /* vec_unalign_load_cost. */
1749 1, /* vec_store_cost. */
1750 3, /* cond_taken_branch_cost. */
1751 1, /* cond_not_taken_branch_cost. */
1754 static stringop_algs atom_memcpy[2] = {
1755 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1756 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1757 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1758 static stringop_algs atom_memset[2] = {
1759 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1760 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1761 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1762 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1763 static const
1764 struct processor_costs atom_cost = {
1765 COSTS_N_INSNS (1), /* cost of an add instruction */
1766 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1767 COSTS_N_INSNS (1), /* variable shift costs */
1768 COSTS_N_INSNS (1), /* constant shift costs */
1769 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1770 COSTS_N_INSNS (4), /* HI */
1771 COSTS_N_INSNS (3), /* SI */
1772 COSTS_N_INSNS (4), /* DI */
1773 COSTS_N_INSNS (2)}, /* other */
1774 0, /* cost of multiply per each bit set */
1775 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1776 COSTS_N_INSNS (26), /* HI */
1777 COSTS_N_INSNS (42), /* SI */
1778 COSTS_N_INSNS (74), /* DI */
1779 COSTS_N_INSNS (74)}, /* other */
1780 COSTS_N_INSNS (1), /* cost of movsx */
1781 COSTS_N_INSNS (1), /* cost of movzx */
1782 8, /* "large" insn */
1783 17, /* MOVE_RATIO */
1784 4, /* cost for loading QImode using movzbl */
1785 {4, 4, 4}, /* cost of loading integer registers
1786 in QImode, HImode and SImode.
1787 Relative to reg-reg move (2). */
1788 {4, 4, 4}, /* cost of storing integer registers */
1789 4, /* cost of reg,reg fld/fst */
1790 {12, 12, 12}, /* cost of loading fp registers
1791 in SFmode, DFmode and XFmode */
1792 {6, 6, 8}, /* cost of storing fp registers
1793 in SFmode, DFmode and XFmode */
1794 2, /* cost of moving MMX register */
1795 {8, 8}, /* cost of loading MMX registers
1796 in SImode and DImode */
1797 {8, 8}, /* cost of storing MMX registers
1798 in SImode and DImode */
1799 2, /* cost of moving SSE register */
1800 {8, 8, 8}, /* cost of loading SSE registers
1801 in SImode, DImode and TImode */
1802 {8, 8, 8}, /* cost of storing SSE registers
1803 in SImode, DImode and TImode */
1804 5, /* MMX or SSE register to integer */
1805 32, /* size of l1 cache. */
1806 256, /* size of l2 cache. */
1807 64, /* size of prefetch block */
1808 6, /* number of parallel prefetches */
1809 3, /* Branch cost */
1810 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1811 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1812 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1813 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1814 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1815 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1816 atom_memcpy,
1817 atom_memset,
1818 1, /* scalar_stmt_cost. */
1819 1, /* scalar load_cost. */
1820 1, /* scalar_store_cost. */
1821 1, /* vec_stmt_cost. */
1822 1, /* vec_to_scalar_cost. */
1823 1, /* scalar_to_vec_cost. */
1824 1, /* vec_align_load_cost. */
1825 2, /* vec_unalign_load_cost. */
1826 1, /* vec_store_cost. */
1827 3, /* cond_taken_branch_cost. */
1828 1, /* cond_not_taken_branch_cost. */
1831 static stringop_algs slm_memcpy[2] = {
1832 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1833 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1834 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1835 static stringop_algs slm_memset[2] = {
1836 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1837 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1838 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1839 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1840 static const
1841 struct processor_costs slm_cost = {
1842 COSTS_N_INSNS (1), /* cost of an add instruction */
1843 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1844 COSTS_N_INSNS (1), /* variable shift costs */
1845 COSTS_N_INSNS (1), /* constant shift costs */
1846 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1847 COSTS_N_INSNS (3), /* HI */
1848 COSTS_N_INSNS (3), /* SI */
1849 COSTS_N_INSNS (4), /* DI */
1850 COSTS_N_INSNS (2)}, /* other */
1851 0, /* cost of multiply per each bit set */
1852 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1853 COSTS_N_INSNS (26), /* HI */
1854 COSTS_N_INSNS (42), /* SI */
1855 COSTS_N_INSNS (74), /* DI */
1856 COSTS_N_INSNS (74)}, /* other */
1857 COSTS_N_INSNS (1), /* cost of movsx */
1858 COSTS_N_INSNS (1), /* cost of movzx */
1859 8, /* "large" insn */
1860 17, /* MOVE_RATIO */
1861 4, /* cost for loading QImode using movzbl */
1862 {4, 4, 4}, /* cost of loading integer registers
1863 in QImode, HImode and SImode.
1864 Relative to reg-reg move (2). */
1865 {4, 4, 4}, /* cost of storing integer registers */
1866 4, /* cost of reg,reg fld/fst */
1867 {12, 12, 12}, /* cost of loading fp registers
1868 in SFmode, DFmode and XFmode */
1869 {6, 6, 8}, /* cost of storing fp registers
1870 in SFmode, DFmode and XFmode */
1871 2, /* cost of moving MMX register */
1872 {8, 8}, /* cost of loading MMX registers
1873 in SImode and DImode */
1874 {8, 8}, /* cost of storing MMX registers
1875 in SImode and DImode */
1876 2, /* cost of moving SSE register */
1877 {8, 8, 8}, /* cost of loading SSE registers
1878 in SImode, DImode and TImode */
1879 {8, 8, 8}, /* cost of storing SSE registers
1880 in SImode, DImode and TImode */
1881 5, /* MMX or SSE register to integer */
1882 32, /* size of l1 cache. */
1883 256, /* size of l2 cache. */
1884 64, /* size of prefetch block */
1885 6, /* number of parallel prefetches */
1886 3, /* Branch cost */
1887 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1888 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1889 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1890 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1891 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1892 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1893 slm_memcpy,
1894 slm_memset,
1895 1, /* scalar_stmt_cost. */
1896 1, /* scalar load_cost. */
1897 1, /* scalar_store_cost. */
1898 1, /* vec_stmt_cost. */
1899 4, /* vec_to_scalar_cost. */
1900 1, /* scalar_to_vec_cost. */
1901 1, /* vec_align_load_cost. */
1902 2, /* vec_unalign_load_cost. */
1903 1, /* vec_store_cost. */
1904 3, /* cond_taken_branch_cost. */
1905 1, /* cond_not_taken_branch_cost. */
1908 static stringop_algs intel_memcpy[2] = {
1909 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1910 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1911 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1912 static stringop_algs intel_memset[2] = {
1913 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1914 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1915 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1916 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1917 static const
1918 struct processor_costs intel_cost = {
1919 COSTS_N_INSNS (1), /* cost of an add instruction */
1920 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1921 COSTS_N_INSNS (1), /* variable shift costs */
1922 COSTS_N_INSNS (1), /* constant shift costs */
1923 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1924 COSTS_N_INSNS (3), /* HI */
1925 COSTS_N_INSNS (3), /* SI */
1926 COSTS_N_INSNS (4), /* DI */
1927 COSTS_N_INSNS (2)}, /* other */
1928 0, /* cost of multiply per each bit set */
1929 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1930 COSTS_N_INSNS (26), /* HI */
1931 COSTS_N_INSNS (42), /* SI */
1932 COSTS_N_INSNS (74), /* DI */
1933 COSTS_N_INSNS (74)}, /* other */
1934 COSTS_N_INSNS (1), /* cost of movsx */
1935 COSTS_N_INSNS (1), /* cost of movzx */
1936 8, /* "large" insn */
1937 17, /* MOVE_RATIO */
1938 4, /* cost for loading QImode using movzbl */
1939 {4, 4, 4}, /* cost of loading integer registers
1940 in QImode, HImode and SImode.
1941 Relative to reg-reg move (2). */
1942 {4, 4, 4}, /* cost of storing integer registers */
1943 4, /* cost of reg,reg fld/fst */
1944 {12, 12, 12}, /* cost of loading fp registers
1945 in SFmode, DFmode and XFmode */
1946 {6, 6, 8}, /* cost of storing fp registers
1947 in SFmode, DFmode and XFmode */
1948 2, /* cost of moving MMX register */
1949 {8, 8}, /* cost of loading MMX registers
1950 in SImode and DImode */
1951 {8, 8}, /* cost of storing MMX registers
1952 in SImode and DImode */
1953 2, /* cost of moving SSE register */
1954 {8, 8, 8}, /* cost of loading SSE registers
1955 in SImode, DImode and TImode */
1956 {8, 8, 8}, /* cost of storing SSE registers
1957 in SImode, DImode and TImode */
1958 5, /* MMX or SSE register to integer */
1959 32, /* size of l1 cache. */
1960 256, /* size of l2 cache. */
1961 64, /* size of prefetch block */
1962 6, /* number of parallel prefetches */
1963 3, /* Branch cost */
1964 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1965 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1966 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1967 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1968 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1969 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1970 intel_memcpy,
1971 intel_memset,
1972 1, /* scalar_stmt_cost. */
1973 1, /* scalar load_cost. */
1974 1, /* scalar_store_cost. */
1975 1, /* vec_stmt_cost. */
1976 4, /* vec_to_scalar_cost. */
1977 1, /* scalar_to_vec_cost. */
1978 1, /* vec_align_load_cost. */
1979 2, /* vec_unalign_load_cost. */
1980 1, /* vec_store_cost. */
1981 3, /* cond_taken_branch_cost. */
1982 1, /* cond_not_taken_branch_cost. */
1985 /* Generic should produce code tuned for Core i7 (and newer) chips
1986 and btver1 (and newer) chips. */
1988 static stringop_algs generic_memcpy[2] = {
1989 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1990 {-1, libcall, false}}},
1991 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1992 {-1, libcall, false}}}};
1993 static stringop_algs generic_memset[2] = {
1994 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1995 {-1, libcall, false}}},
1996 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1997 {-1, libcall, false}}}};
1998 static const
1999 struct processor_costs generic_cost = {
2000 COSTS_N_INSNS (1), /* cost of an add instruction */
2001 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2002 that cost, however, our current implementation of synth_mult results in
2003 the use of unnecessary temporary registers, causing regressions on several
2004 SPECfp benchmarks. */
2005 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2006 COSTS_N_INSNS (1), /* variable shift costs */
2007 COSTS_N_INSNS (1), /* constant shift costs */
2008 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2009 COSTS_N_INSNS (4), /* HI */
2010 COSTS_N_INSNS (3), /* SI */
2011 COSTS_N_INSNS (4), /* DI */
2012 COSTS_N_INSNS (2)}, /* other */
2013 0, /* cost of multiply per each bit set */
2014 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2015 COSTS_N_INSNS (26), /* HI */
2016 COSTS_N_INSNS (42), /* SI */
2017 COSTS_N_INSNS (74), /* DI */
2018 COSTS_N_INSNS (74)}, /* other */
2019 COSTS_N_INSNS (1), /* cost of movsx */
2020 COSTS_N_INSNS (1), /* cost of movzx */
2021 8, /* "large" insn */
2022 17, /* MOVE_RATIO */
2023 4, /* cost for loading QImode using movzbl */
2024 {4, 4, 4}, /* cost of loading integer registers
2025 in QImode, HImode and SImode.
2026 Relative to reg-reg move (2). */
2027 {4, 4, 4}, /* cost of storing integer registers */
2028 4, /* cost of reg,reg fld/fst */
2029 {12, 12, 12}, /* cost of loading fp registers
2030 in SFmode, DFmode and XFmode */
2031 {6, 6, 8}, /* cost of storing fp registers
2032 in SFmode, DFmode and XFmode */
2033 2, /* cost of moving MMX register */
2034 {8, 8}, /* cost of loading MMX registers
2035 in SImode and DImode */
2036 {8, 8}, /* cost of storing MMX registers
2037 in SImode and DImode */
2038 2, /* cost of moving SSE register */
2039 {8, 8, 8}, /* cost of loading SSE registers
2040 in SImode, DImode and TImode */
2041 {8, 8, 8}, /* cost of storing SSE registers
2042 in SImode, DImode and TImode */
2043 5, /* MMX or SSE register to integer */
2044 32, /* size of l1 cache. */
2045 512, /* size of l2 cache. */
2046 64, /* size of prefetch block */
2047 6, /* number of parallel prefetches */
2048 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2049 value is increased to the perhaps more appropriate value of 5. */
2050 3, /* Branch cost */
2051 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2052 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2053 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2054 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2055 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2056 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2057 generic_memcpy,
2058 generic_memset,
2059 1, /* scalar_stmt_cost. */
2060 1, /* scalar load_cost. */
2061 1, /* scalar_store_cost. */
2062 1, /* vec_stmt_cost. */
2063 1, /* vec_to_scalar_cost. */
2064 1, /* scalar_to_vec_cost. */
2065 1, /* vec_align_load_cost. */
2066 2, /* vec_unalign_load_cost. */
2067 1, /* vec_store_cost. */
2068 3, /* cond_taken_branch_cost. */
2069 1, /* cond_not_taken_branch_cost. */
2072 /* core_cost should produce code tuned for the Core family of CPUs. */
2073 static stringop_algs core_memcpy[2] = {
2074 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2075 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2076 {-1, libcall, false}}}};
2077 static stringop_algs core_memset[2] = {
2078 {libcall, {{6, loop_1_byte, true},
2079 {24, loop, true},
2080 {8192, rep_prefix_4_byte, true},
2081 {-1, libcall, false}}},
2082 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2083 {-1, libcall, false}}}};
2085 static const
2086 struct processor_costs core_cost = {
2087 COSTS_N_INSNS (1), /* cost of an add instruction */
2088 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
2089 that cost, however, our current implementation of synth_mult results in
2090 the use of unnecessary temporary registers, causing regressions on several
2091 SPECfp benchmarks. */
2092 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2093 COSTS_N_INSNS (1), /* variable shift costs */
2094 COSTS_N_INSNS (1), /* constant shift costs */
2095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2096 COSTS_N_INSNS (4), /* HI */
2097 COSTS_N_INSNS (3), /* SI */
2098 COSTS_N_INSNS (4), /* DI */
2099 COSTS_N_INSNS (2)}, /* other */
2100 0, /* cost of multiply per each bit set */
2101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2102 COSTS_N_INSNS (26), /* HI */
2103 COSTS_N_INSNS (42), /* SI */
2104 COSTS_N_INSNS (74), /* DI */
2105 COSTS_N_INSNS (74)}, /* other */
2106 COSTS_N_INSNS (1), /* cost of movsx */
2107 COSTS_N_INSNS (1), /* cost of movzx */
2108 8, /* "large" insn */
2109 17, /* MOVE_RATIO */
2110 4, /* cost for loading QImode using movzbl */
2111 {4, 4, 4}, /* cost of loading integer registers
2112 in QImode, HImode and SImode.
2113 Relative to reg-reg move (2). */
2114 {4, 4, 4}, /* cost of storing integer registers */
2115 4, /* cost of reg,reg fld/fst */
2116 {12, 12, 12}, /* cost of loading fp registers
2117 in SFmode, DFmode and XFmode */
2118 {6, 6, 8}, /* cost of storing fp registers
2119 in SFmode, DFmode and XFmode */
2120 2, /* cost of moving MMX register */
2121 {8, 8}, /* cost of loading MMX registers
2122 in SImode and DImode */
2123 {8, 8}, /* cost of storing MMX registers
2124 in SImode and DImode */
2125 2, /* cost of moving SSE register */
2126 {8, 8, 8}, /* cost of loading SSE registers
2127 in SImode, DImode and TImode */
2128 {8, 8, 8}, /* cost of storing SSE registers
2129 in SImode, DImode and TImode */
2130 5, /* MMX or SSE register to integer */
2131 64, /* size of l1 cache. */
2132 512, /* size of l2 cache. */
2133 64, /* size of prefetch block */
2134 6, /* number of parallel prefetches */
2135 /* FIXME: perhaps a more appropriate value is 5. */
2136 3, /* Branch cost */
2137 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2138 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2139 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2140 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2141 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2142 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2143 core_memcpy,
2144 core_memset,
2145 1, /* scalar_stmt_cost. */
2146 1, /* scalar load_cost. */
2147 1, /* scalar_store_cost. */
2148 1, /* vec_stmt_cost. */
2149 1, /* vec_to_scalar_cost. */
2150 1, /* scalar_to_vec_cost. */
2151 1, /* vec_align_load_cost. */
2152 2, /* vec_unalign_load_cost. */
2153 1, /* vec_store_cost. */
2154 3, /* cond_taken_branch_cost. */
2155 1, /* cond_not_taken_branch_cost. */
2159 /* Set by -mtune. */
2160 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2162 /* Set by -mtune or -Os. */
2163 const struct processor_costs *ix86_cost = &pentium_cost;
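/* Illustration (an assumption about later option processing, not taken
   from the original code): with -mtune=haswell, ix86_tune_cost would end
   up pointing at core_cost via processor_target_table below, while -Os
   makes ix86_cost point at the size-oriented cost table instead of the
   tuning-oriented one.  */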
2165 /* Processor feature/optimization bitmasks. */
2166 #define m_386 (1U<<PROCESSOR_I386)
2167 #define m_486 (1U<<PROCESSOR_I486)
2168 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2169 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2170 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2171 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2172 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2173 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2174 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2175 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2176 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2177 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2178 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2179 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2180 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2181 #define m_KNL (1U<<PROCESSOR_KNL)
2182 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2183 #define m_INTEL (1U<<PROCESSOR_INTEL)
2185 #define m_GEODE (1U<<PROCESSOR_GEODE)
2186 #define m_K6 (1U<<PROCESSOR_K6)
2187 #define m_K6_GEODE (m_K6 | m_GEODE)
2188 #define m_K8 (1U<<PROCESSOR_K8)
2189 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2190 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2191 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2192 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2193 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2194 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2195 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2196 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2197 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2198 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2199 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2200 #define m_BTVER (m_BTVER1 | m_BTVER2)
2201 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2202 | m_ZNVER1)
2204 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
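/* Illustration: the selector of a DEF_TUNE entry in x86-tune.def is built
   from these masks; a hypothetical selector of (m_CORE_ALL | m_GENERIC)
   would enable that tuning for all Core-derived processors and for
   generic tuning.  (Example added for exposition, not copied from
   x86-tune.def.)  */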
2206 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2207 #undef DEF_TUNE
2208 #define DEF_TUNE(tune, name, selector) name,
2209 #include "x86-tune.def"
2210 #undef DEF_TUNE
2213 /* Feature tests against the various tunings. */
2214 unsigned char ix86_tune_features[X86_TUNE_LAST];
2216 /* Feature tests against the various tunings used to create ix86_tune_features
2217 based on the processor mask. */
2218 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2219 #undef DEF_TUNE
2220 #define DEF_TUNE(tune, name, selector) selector,
2221 #include "x86-tune.def"
2222 #undef DEF_TUNE
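/* Sketch of the intended use (the actual computation happens during
   option processing): each tuning flag ends up roughly as
     ix86_tune_features[i]
       = !!(initial_ix86_tune_features[i] & (1U << ix86_tune));
   i.e. a flag is enabled when the bit of the -mtune processor is set in
   the corresponding selector above.  */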
2225 /* Feature tests against the various architecture variations. */
2226 unsigned char ix86_arch_features[X86_ARCH_LAST];
2228 /* Feature tests against the various architecture variations, used to create
2229 ix86_arch_features based on the processor mask. */
2230 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2231 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2232 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2234 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2235 ~m_386,
2237 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2238 ~(m_386 | m_486),
2240 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2241 ~m_386,
2243 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2244 ~m_386,
2247 /* If the average insn count for a single function invocation is
2248 lower than this constant, emit fast (but longer) prologue and
2249 epilogue code. */
2250 #define FAST_PROLOGUE_INSN_COUNT 20
2252 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2253 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2254 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2255 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2257 /* Array of the smallest class containing reg number REGNO, indexed by
2258 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2260 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2262 /* ax, dx, cx, bx */
2263 AREG, DREG, CREG, BREG,
2264 /* si, di, bp, sp */
2265 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2266 /* FP registers */
2267 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2268 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2269 /* arg pointer */
2270 NON_Q_REGS,
2271 /* flags, fpsr, fpcr, frame */
2272 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2273 /* SSE registers */
2274 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2275 SSE_REGS, SSE_REGS,
2276 /* MMX registers */
2277 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2278 MMX_REGS, MMX_REGS,
2279 /* REX registers */
2280 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2281 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2282 /* SSE REX registers */
2283 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2284 SSE_REGS, SSE_REGS,
2285 /* AVX-512 SSE registers */
2286 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2287 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2288 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2289 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2290 /* Mask registers. */
2291 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2292 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2293 /* MPX bound registers */
2294 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2297 /* The "default" register map used in 32bit mode. */
2299 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2301 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2302 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2303 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2304 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2305 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2306 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2307 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2308 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2309 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2310 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2311 101, 102, 103, 104, /* bound registers */
2314 /* The "default" register map used in 64bit mode. */
2316 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2318 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2319 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2320 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2321 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2322 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2323 8,9,10,11,12,13,14,15, /* extended integer registers */
2324 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2325 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2326 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2327 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2328 126, 127, 128, 129, /* bound registers */
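/* Added note: in 64-bit mode GCC's ordering of the general registers
   (ax, dx, cx, bx, si, di, bp, sp) happens to coincide with the DWARF
   numbering (rax=0, rdx=1, rcx=2, rbx=3, rsi=4, rdi=5, rbp=6, rsp=7),
   which is why the first eight entries above are simply 0-7.  */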
2331 /* Define the register numbers to be used in Dwarf debugging information.
2332 The SVR4 reference port C compiler uses the following register numbers
2333 in its Dwarf output code:
2334 0 for %eax (gcc regno = 0)
2335 1 for %ecx (gcc regno = 2)
2336 2 for %edx (gcc regno = 1)
2337 3 for %ebx (gcc regno = 3)
2338 4 for %esp (gcc regno = 7)
2339 5 for %ebp (gcc regno = 6)
2340 6 for %esi (gcc regno = 4)
2341 7 for %edi (gcc regno = 5)
2342 The following three DWARF register numbers are never generated by
2343 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2344 believes these numbers have these meanings.
2345 8 for %eip (no gcc equivalent)
2346 9 for %eflags (gcc regno = 17)
2347 10 for %trapno (no gcc equivalent)
2348 It is not at all clear how we should number the FP stack registers
2349 for the x86 architecture. If the version of SDB on x86/svr4 were
2350 a bit less brain dead with respect to floating-point then we would
2351 have a precedent to follow with respect to DWARF register numbers
2352 for x86 FP registers, but the SDB on x86/svr4 is so completely
2353 broken with respect to FP registers that it is hardly worth thinking
2354 of it as something to strive for compatibility with.
2355 The version of x86/svr4 SDB I have at the moment does (partially)
2356 seem to believe that DWARF register number 11 is associated with
2357 the x86 register %st(0), but that's about all. Higher DWARF
2358 register numbers don't seem to be associated with anything in
2359 particular, and even for DWARF regno 11, SDB only seems to under-
2360 stand that it should say that a variable lives in %st(0) (when
2361 asked via an `=' command) if we said it was in DWARF regno 11,
2362 but SDB still prints garbage when asked for the value of the
2363 variable in question (via a `/' command).
2364 (Also note that the labels SDB prints for various FP stack regs
2365 when doing an `x' command are all wrong.)
2366 Note that these problems generally don't affect the native SVR4
2367 C compiler because it doesn't allow the use of -O with -g and
2368 because when it is *not* optimizing, it allocates a memory
2369 location for each floating-point variable, and the memory
2370 location is what gets described in the DWARF AT_location
2371 attribute for the variable in question.
2372 Regardless of the severe mental illness of the x86/svr4 SDB, we
2373 do something sensible here and we use the following DWARF
2374 register numbers. Note that these are all stack-top-relative
2375 numbers.
2376 11 for %st(0) (gcc regno = 8)
2377 12 for %st(1) (gcc regno = 9)
2378 13 for %st(2) (gcc regno = 10)
2379 14 for %st(3) (gcc regno = 11)
2380 15 for %st(4) (gcc regno = 12)
2381 16 for %st(5) (gcc regno = 13)
2382 17 for %st(6) (gcc regno = 14)
2383 18 for %st(7) (gcc regno = 15)
2385 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2387 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2388 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2389 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2390 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2391 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2392 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2393 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2394 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2395 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2396 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2397 101, 102, 103, 104, /* bound registers */
2400 /* Define parameter passing and return registers. */
2402 static int const x86_64_int_parameter_registers[6] =
2404 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2407 static int const x86_64_ms_abi_int_parameter_registers[4] =
2409 CX_REG, DX_REG, R8_REG, R9_REG
2412 static int const x86_64_int_return_registers[4] =
2414 AX_REG, DX_REG, DI_REG, SI_REG
2417 /* Additional registers that are clobbered by SYSV calls. */
2419 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2421 SI_REG, DI_REG,
2422 XMM6_REG, XMM7_REG,
2423 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2424 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
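/* Added note: under the SysV ABI the first six integer arguments are
   passed in rdi, rsi, rdx, rcx, r8 and r9, whereas the MS ABI uses only
   rcx, rdx, r8 and r9 and treats rsi, rdi and xmm6-xmm15 as callee-saved.
   A SysV callee is therefore free to clobber the registers listed above,
   so MS-ABI callers must treat them as clobbered across such calls.  */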
2427 /* Define the structure for the machine field in struct function. */
2429 struct GTY(()) stack_local_entry {
2430 unsigned short mode;
2431 unsigned short n;
2432 rtx rtl;
2433 struct stack_local_entry *next;
2436 /* Structure describing stack frame layout.
2437 Stack grows downward:
2439 [arguments]
2440 <- ARG_POINTER
2441 saved pc
2443 saved static chain if ix86_static_chain_on_stack
2445 saved frame pointer if frame_pointer_needed
2446 <- HARD_FRAME_POINTER
2447 [saved regs]
2448 <- regs_save_offset
2449 [padding0]
2451 [saved SSE regs]
2452 <- sse_regs_save_offset
2453 [padding1] |
2454 | <- FRAME_POINTER
2455 [va_arg registers] |
2457 [frame] |
2459 [padding2] | = to_allocate
2460 <- STACK_POINTER
2462 struct ix86_frame
2464 int nsseregs;
2465 int nregs;
2466 int va_arg_size;
2467 int red_zone_size;
2468 int outgoing_arguments_size;
2470 /* The offsets relative to ARG_POINTER. */
2471 HOST_WIDE_INT frame_pointer_offset;
2472 HOST_WIDE_INT hard_frame_pointer_offset;
2473 HOST_WIDE_INT stack_pointer_offset;
2474 HOST_WIDE_INT hfp_save_offset;
2475 HOST_WIDE_INT reg_save_offset;
2476 HOST_WIDE_INT sse_reg_save_offset;
2478 /* When save_regs_using_mov is set, emit prologue using
2479 move instead of push instructions. */
2480 bool save_regs_using_mov;
2483 /* Which cpu are we scheduling for. */
2484 enum attr_cpu ix86_schedule;
2486 /* Which cpu are we optimizing for. */
2487 enum processor_type ix86_tune;
2489 /* Which instruction set architecture to use. */
2490 enum processor_type ix86_arch;
2492 /* True if processor has SSE prefetch instruction. */
2493 unsigned char x86_prefetch_sse;
2495 /* -mstackrealign option */
2496 static const char ix86_force_align_arg_pointer_string[]
2497 = "force_align_arg_pointer";
2499 static rtx (*ix86_gen_leave) (void);
2500 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2501 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2502 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2503 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2504 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2505 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2506 static rtx (*ix86_gen_clzero) (rtx);
2507 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2508 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2509 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2510 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2511 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2512 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2514 /* Preferred alignment for stack boundary in bits. */
2515 unsigned int ix86_preferred_stack_boundary;
2517 /* Alignment for incoming stack boundary in bits specified at
2518 command line. */
2519 static unsigned int ix86_user_incoming_stack_boundary;
2521 /* Default alignment for incoming stack boundary in bits. */
2522 static unsigned int ix86_default_incoming_stack_boundary;
2524 /* Alignment for incoming stack boundary in bits. */
2525 unsigned int ix86_incoming_stack_boundary;
2527 /* Calling-ABI-specific va_list type nodes. */
2528 static GTY(()) tree sysv_va_list_type_node;
2529 static GTY(()) tree ms_va_list_type_node;
2531 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2532 char internal_label_prefix[16];
2533 int internal_label_prefix_len;
2535 /* Fence to use after loop using movnt. */
2536 tree x86_mfence;
2538 /* Register class used for passing a given 64-bit part of the argument.
2539 These represent classes as documented by the psABI, with the exception
2540 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2541 uses an SFmode or DFmode move instead of a DImode one to avoid reformatting penalties.
2543 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2544 whenever possible (the upper half contains only padding). */
2545 enum x86_64_reg_class
2547 X86_64_NO_CLASS,
2548 X86_64_INTEGER_CLASS,
2549 X86_64_INTEGERSI_CLASS,
2550 X86_64_SSE_CLASS,
2551 X86_64_SSESF_CLASS,
2552 X86_64_SSEDF_CLASS,
2553 X86_64_SSEUP_CLASS,
2554 X86_64_X87_CLASS,
2555 X86_64_X87UP_CLASS,
2556 X86_64_COMPLEX_X87_CLASS,
2557 X86_64_MEMORY_CLASS
2560 #define MAX_CLASSES 8
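/* A few illustrative classifications (sketch of the psABI rules the enum
   above models): a float argument classifies as X86_64_SSESF_CLASS, a
   double as X86_64_SSEDF_CLASS, an __int128 as two X86_64_INTEGER_CLASS
   eightbytes, and aggregates that are too large or misaligned fall back
   to X86_64_MEMORY_CLASS (passed on the stack).  */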
2562 /* Table of constants used by fldpi, fldln2, etc.... */
2563 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2564 static bool ext_80387_constants_init = 0;
2567 static struct machine_function * ix86_init_machine_status (void);
2568 static rtx ix86_function_value (const_tree, const_tree, bool);
2569 static bool ix86_function_value_regno_p (const unsigned int);
2570 static unsigned int ix86_function_arg_boundary (machine_mode,
2571 const_tree);
2572 static rtx ix86_static_chain (const_tree, bool);
2573 static int ix86_function_regparm (const_tree, const_tree);
2574 static void ix86_compute_frame_layout (struct ix86_frame *);
2575 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2576 rtx, rtx, int);
2577 static void ix86_add_new_builtins (HOST_WIDE_INT);
2578 static tree ix86_canonical_va_list_type (tree);
2579 static void predict_jump (int);
2580 static unsigned int split_stack_prologue_scratch_regno (void);
2581 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2583 enum ix86_function_specific_strings
2585 IX86_FUNCTION_SPECIFIC_ARCH,
2586 IX86_FUNCTION_SPECIFIC_TUNE,
2587 IX86_FUNCTION_SPECIFIC_MAX
2590 static char *ix86_target_string (HOST_WIDE_INT, int, int, const char *,
2591 const char *, enum fpmath_unit, bool);
2592 static void ix86_function_specific_save (struct cl_target_option *,
2593 struct gcc_options *opts);
2594 static void ix86_function_specific_restore (struct gcc_options *opts,
2595 struct cl_target_option *);
2596 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2597 static void ix86_function_specific_print (FILE *, int,
2598 struct cl_target_option *);
2599 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2600 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2601 struct gcc_options *,
2602 struct gcc_options *,
2603 struct gcc_options *);
2604 static bool ix86_can_inline_p (tree, tree);
2605 static void ix86_set_current_function (tree);
2606 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2608 static enum calling_abi ix86_function_abi (const_tree);
2611 #ifndef SUBTARGET32_DEFAULT_CPU
2612 #define SUBTARGET32_DEFAULT_CPU "i386"
2613 #endif
2615 /* Whether -mtune= or -march= were specified */
2616 static int ix86_tune_defaulted;
2617 static int ix86_arch_specified;
2619 /* Vectorization library interface and handlers. */
2620 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2622 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2623 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2625 /* Processor target table, indexed by processor number */
2626 struct ptt
2628 const char *const name; /* processor name */
2629 const struct processor_costs *cost; /* Processor costs */
2630 const int align_loop; /* Default alignments. */
2631 const int align_loop_max_skip;
2632 const int align_jump;
2633 const int align_jump_max_skip;
2634 const int align_func;
2637 /* This table must be in sync with enum processor_type in i386.h. */
2638 static const struct ptt processor_target_table[PROCESSOR_max] =
2640 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2641 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2642 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2643 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2644 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2645 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2646 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2647 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2648 {"core2", &core_cost, 16, 10, 16, 10, 16},
2649 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2650 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2651 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2652 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2653 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2654 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2655 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2656 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2657 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2658 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2659 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2660 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2661 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2662 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2663 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2664 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2665 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2666 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2667 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2668 {"znver1", &znver1_cost, 16, 10, 16, 7, 11}
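/* Illustration (assumption about how the table is consumed during option
   processing): when the user passes no explicit -falign-* options, the
   align_* fields above seed the default loop, jump and function
   alignments for the selected -mtune target; e.g. generic tuning aligns
   loops and jumps to 16 bytes with a max skip of 10.  */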
2671 static unsigned int
2672 rest_of_handle_insert_vzeroupper (void)
2674 int i;
2676 /* vzeroupper instructions are inserted immediately after reload to
2677 account for possible spills from 256-bit registers.  The pass
2678 reuses the mode switching infrastructure by re-running the mode insertion
2679 pass, so disable entities that have already been processed. */
2680 for (i = 0; i < MAX_386_ENTITIES; i++)
2681 ix86_optimize_mode_switching[i] = 0;
2683 ix86_optimize_mode_switching[AVX_U128] = 1;
2685 /* Call optimize_mode_switching. */
2686 g->get_passes ()->execute_pass_mode_switching ();
2687 return 0;
2690 /* Return true if INSN uses or defines a hard register.
2691 Hard register uses in a memory address are ignored.
2692 Clobbers and flags definitions are ignored. */
2694 static bool
2695 has_non_address_hard_reg (rtx_insn *insn)
2697 df_ref ref;
2698 FOR_EACH_INSN_DEF (ref, insn)
2699 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2700 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2701 && DF_REF_REGNO (ref) != FLAGS_REG)
2702 return true;
2704 FOR_EACH_INSN_USE (ref, insn)
2705 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2706 return true;
2708 return false;
2711 /* Check if comparison INSN may be transformed
2712 into a vector comparison.  Currently we transform
2713 only zero checks that look like:
2715 (set (reg:CCZ 17 flags)
2716 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2717 (subreg:SI (reg:DI x) 0))
2718 (const_int 0 [0]))) */
2720 static bool
2721 convertible_comparison_p (rtx_insn *insn)
2723 if (!TARGET_SSE4_1)
2724 return false;
2726 rtx def_set = single_set (insn);
2728 gcc_assert (def_set);
2730 rtx src = SET_SRC (def_set);
2731 rtx dst = SET_DEST (def_set);
2733 gcc_assert (GET_CODE (src) == COMPARE);
2735 if (GET_CODE (dst) != REG
2736 || REGNO (dst) != FLAGS_REG
2737 || GET_MODE (dst) != CCZmode)
2738 return false;
2740 rtx op1 = XEXP (src, 0);
2741 rtx op2 = XEXP (src, 1);
2743 if (op2 != CONST0_RTX (GET_MODE (op2)))
2744 return false;
2746 if (GET_CODE (op1) != IOR)
2747 return false;
2749 op2 = XEXP (op1, 1);
2750 op1 = XEXP (op1, 0);
2752 if (!SUBREG_P (op1)
2753 || !SUBREG_P (op2)
2754 || GET_MODE (op1) != SImode
2755 || GET_MODE (op2) != SImode
2756 || ((SUBREG_BYTE (op1) != 0
2757 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
2758 && (SUBREG_BYTE (op2) != 0
2759 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
2760 return false;
2762 op1 = SUBREG_REG (op1);
2763 op2 = SUBREG_REG (op2);
2765 if (op1 != op2
2766 || !REG_P (op1)
2767 || GET_MODE (op1) != DImode)
2768 return false;
2770 return true;
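/* Added note: such compares typically come from testing a 64-bit value
   for zero in 32-bit code, e.g. "if (x == 0)" for a DImode X, where the
   two SImode halves are IOR-ed together.  The STV pass can instead keep
   X in an SSE register and test it with a single SSE4.1 ptest, hence the
   TARGET_SSE4_1 check above.  */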
2773 /* The DImode version of scalar_to_vector_candidate_p. */
2775 static bool
2776 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
2778 rtx def_set = single_set (insn);
2780 if (!def_set)
2781 return false;
2783 if (has_non_address_hard_reg (insn))
2784 return false;
2786 rtx src = SET_SRC (def_set);
2787 rtx dst = SET_DEST (def_set);
2789 if (GET_CODE (src) == COMPARE)
2790 return convertible_comparison_p (insn);
2792 /* We are interested in DImode promotion only. */
2793 if ((GET_MODE (src) != DImode
2794 && !CONST_INT_P (src))
2795 || GET_MODE (dst) != DImode)
2796 return false;
2798 if (!REG_P (dst) && !MEM_P (dst))
2799 return false;
2801 switch (GET_CODE (src))
2803 case PLUS:
2804 case MINUS:
2805 case IOR:
2806 case XOR:
2807 case AND:
2808 break;
2810 case REG:
2811 return true;
2813 case MEM:
2814 case CONST_INT:
2815 return REG_P (dst);
2817 default:
2818 return false;
2821 if (!REG_P (XEXP (src, 0))
2822 && !MEM_P (XEXP (src, 0))
2823 && !CONST_INT_P (XEXP (src, 0))
2824 /* Check for andnot case. */
2825 && (GET_CODE (src) != AND
2826 || GET_CODE (XEXP (src, 0)) != NOT
2827 || !REG_P (XEXP (XEXP (src, 0), 0))))
2828 return false;
2830 if (!REG_P (XEXP (src, 1))
2831 && !MEM_P (XEXP (src, 1))
2832 && !CONST_INT_P (XEXP (src, 1)))
2833 return false;
2835 if ((GET_MODE (XEXP (src, 0)) != DImode
2836 && !CONST_INT_P (XEXP (src, 0)))
2837 || (GET_MODE (XEXP (src, 1)) != DImode
2838 && !CONST_INT_P (XEXP (src, 1))))
2839 return false;
2841 return true;
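/* Added note: an example of a DImode candidate on 32-bit targets is a
   plain "a = b | c" on 64-bit integers, which would otherwise expand to
   two 32-bit OR instructions but can be done with a single POR once the
   whole chain is kept in SSE registers.  */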
2844 /* The TImode version of scalar_to_vector_candidate_p. */
2846 static bool
2847 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2849 rtx def_set = single_set (insn);
2851 if (!def_set)
2852 return false;
2854 if (has_non_address_hard_reg (insn))
2855 return false;
2857 rtx src = SET_SRC (def_set);
2858 rtx dst = SET_DEST (def_set);
2860 /* Only TImode loads and stores are allowed. */
2861 if (GET_MODE (dst) != TImode)
2862 return false;
2864 if (MEM_P (dst))
2866 /* Check for a store.  The memory must be aligned, or unaligned stores
2867 must be optimal.  Only support stores from a register, a standard SSE
2868 constant, or a CONST_WIDE_INT generated from a piecewise store.
2870 ??? Verify the performance impact before enabling CONST_INT for
2871 __int128 stores. */
2872 if (misaligned_operand (dst, TImode)
2873 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2874 return false;
2876 switch (GET_CODE (src))
2878 default:
2879 return false;
2881 case REG:
2882 case CONST_WIDE_INT:
2883 return true;
2885 case CONST_INT:
2886 return standard_sse_constant_p (src, TImode);
2889 else if (MEM_P (src))
2891 /* Check for a load.  The memory must be aligned, or unaligned loads
2892 must be optimal. */
2893 return (REG_P (dst)
2894 && (!misaligned_operand (src, TImode)
2895 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2898 return false;
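/* Added note: an example of a TImode candidate is copying an aligned
   __int128 from one object to another, which can then be emitted as a
   single 16-byte SSE load/store pair instead of two 8-byte integer
   moves.  */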
2901 /* Return true if INSN may be converted into a vector
2902 instruction. */
2904 static bool
2905 scalar_to_vector_candidate_p (rtx_insn *insn)
2907 if (TARGET_64BIT)
2908 return timode_scalar_to_vector_candidate_p (insn);
2909 else
2910 return dimode_scalar_to_vector_candidate_p (insn);
2913 /* The DImode version of remove_non_convertible_regs. */
2915 static void
2916 dimode_remove_non_convertible_regs (bitmap candidates)
2918 bitmap_iterator bi;
2919 unsigned id;
2920 bitmap regs = BITMAP_ALLOC (NULL);
2922 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2924 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
2925 rtx reg = SET_DEST (def_set);
2927 if (!REG_P (reg)
2928 || bitmap_bit_p (regs, REGNO (reg))
2929 || HARD_REGISTER_P (reg))
2930 continue;
2932 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
2933 def;
2934 def = DF_REF_NEXT_REG (def))
2936 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2938 if (dump_file)
2939 fprintf (dump_file,
2940 "r%d has non convertible definition in insn %d\n",
2941 REGNO (reg), DF_REF_INSN_UID (def));
2943 bitmap_set_bit (regs, REGNO (reg));
2944 break;
2949 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2951 for (df_ref def = DF_REG_DEF_CHAIN (id);
2952 def;
2953 def = DF_REF_NEXT_REG (def))
2954 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2956 if (dump_file)
2957 fprintf (dump_file, "Removing insn %d from candidates list\n",
2958 DF_REF_INSN_UID (def));
2960 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2964 BITMAP_FREE (regs);
2967 /* For a register REGNO, scan instructions for its defs and uses.
2968 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2970 static void
2971 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2972 unsigned int regno)
2974 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2975 def;
2976 def = DF_REF_NEXT_REG (def))
2978 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2980 if (dump_file)
2981 fprintf (dump_file,
2982 "r%d has non convertible def in insn %d\n",
2983 regno, DF_REF_INSN_UID (def));
2985 bitmap_set_bit (regs, regno);
2986 break;
2990 for (df_ref ref = DF_REG_USE_CHAIN (regno);
2991 ref;
2992 ref = DF_REF_NEXT_REG (ref))
2994 /* Debug instructions are skipped. */
2995 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2996 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2998 if (dump_file)
2999 fprintf (dump_file,
3000 "r%d has non convertible use in insn %d\n",
3001 regno, DF_REF_INSN_UID (ref));
3003 bitmap_set_bit (regs, regno);
3004 break;
3009 /* The TImode version of remove_non_convertible_regs. */
3011 static void
3012 timode_remove_non_convertible_regs (bitmap candidates)
3014 bitmap_iterator bi;
3015 unsigned id;
3016 bitmap regs = BITMAP_ALLOC (NULL);
3018 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3020 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3021 rtx dest = SET_DEST (def_set);
3022 rtx src = SET_SRC (def_set);
3024 if ((!REG_P (dest)
3025 || bitmap_bit_p (regs, REGNO (dest))
3026 || HARD_REGISTER_P (dest))
3027 && (!REG_P (src)
3028 || bitmap_bit_p (regs, REGNO (src))
3029 || HARD_REGISTER_P (src)))
3030 continue;
3032 if (REG_P (dest))
3033 timode_check_non_convertible_regs (candidates, regs,
3034 REGNO (dest));
3036 if (REG_P (src))
3037 timode_check_non_convertible_regs (candidates, regs,
3038 REGNO (src));
3041 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3043 for (df_ref def = DF_REG_DEF_CHAIN (id);
3044 def;
3045 def = DF_REF_NEXT_REG (def))
3046 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3048 if (dump_file)
3049 fprintf (dump_file, "Removing insn %d from candidates list\n",
3050 DF_REF_INSN_UID (def));
3052 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3055 for (df_ref ref = DF_REG_USE_CHAIN (id);
3056 ref;
3057 ref = DF_REF_NEXT_REG (ref))
3058 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3060 if (dump_file)
3061 fprintf (dump_file, "Removing insn %d from candidates list\n",
3062 DF_REF_INSN_UID (ref));
3064 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3068 BITMAP_FREE (regs);
 3071 /* For a given bitmap of insn UIDs, scan all instructions and
 3072 remove an insn from CANDIDATES if it has both convertible
 3073 and non-convertible definitions.
 3075 All insns in the bitmap are conversion candidates according to
 3076 scalar_to_vector_candidate_p.  Currently this implies all insns
 3077 are single_set. */
3079 static void
3080 remove_non_convertible_regs (bitmap candidates)
3082 if (TARGET_64BIT)
3083 timode_remove_non_convertible_regs (candidates);
3084 else
3085 dimode_remove_non_convertible_regs (candidates);
3088 class scalar_chain
3090 public:
3091 scalar_chain ();
3092 virtual ~scalar_chain ();
3094 static unsigned max_id;
3096 /* ID of a chain. */
3097 unsigned int chain_id;
3098 /* A queue of instructions to be included into a chain. */
3099 bitmap queue;
3100 /* Instructions included into a chain. */
3101 bitmap insns;
3102 /* All registers defined by a chain. */
3103 bitmap defs;
 3104 /* Registers used in both vector and scalar modes. */
3105 bitmap defs_conv;
3107 void build (bitmap candidates, unsigned insn_uid);
3108 virtual int compute_convert_gain () = 0;
3109 int convert ();
3111 protected:
3112 void add_to_queue (unsigned insn_uid);
3113 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3115 private:
3116 void add_insn (bitmap candidates, unsigned insn_uid);
3117 void analyze_register_chain (bitmap candidates, df_ref ref);
3118 virtual void mark_dual_mode_def (df_ref def) = 0;
3119 virtual void convert_insn (rtx_insn *insn) = 0;
3120 virtual void convert_registers () = 0;
3123 class dimode_scalar_chain : public scalar_chain
3125 public:
3126 int compute_convert_gain ();
3127 private:
3128 void mark_dual_mode_def (df_ref def);
3129 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3130 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3131 void convert_insn (rtx_insn *insn);
3132 void convert_op (rtx *op, rtx_insn *insn);
3133 void convert_reg (unsigned regno);
3134 void make_vector_copies (unsigned regno);
3135 void convert_registers ();
3136 int vector_const_cost (rtx exp);
3139 class timode_scalar_chain : public scalar_chain
3141 public:
 3142 /* Converting from TImode to V1TImode is always faster. */
3143 int compute_convert_gain () { return 1; }
3145 private:
3146 void mark_dual_mode_def (df_ref def);
3147 void fix_debug_reg_uses (rtx reg);
3148 void convert_insn (rtx_insn *insn);
 3149 /* We don't convert registers to a different size. */
3150 void convert_registers () {}
3153 unsigned scalar_chain::max_id = 0;
3155 /* Initialize new chain. */
3157 scalar_chain::scalar_chain ()
3159 chain_id = ++max_id;
3161 if (dump_file)
3162 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3164 bitmap_obstack_initialize (NULL);
3165 insns = BITMAP_ALLOC (NULL);
3166 defs = BITMAP_ALLOC (NULL);
3167 defs_conv = BITMAP_ALLOC (NULL);
3168 queue = NULL;
3171 /* Free chain's data. */
3173 scalar_chain::~scalar_chain ()
3175 BITMAP_FREE (insns);
3176 BITMAP_FREE (defs);
3177 BITMAP_FREE (defs_conv);
3178 bitmap_obstack_release (NULL);
3181 /* Add instruction into chains' queue. */
3183 void
3184 scalar_chain::add_to_queue (unsigned insn_uid)
3186 if (bitmap_bit_p (insns, insn_uid)
3187 || bitmap_bit_p (queue, insn_uid))
3188 return;
3190 if (dump_file)
3191 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3192 insn_uid, chain_id);
3193 bitmap_set_bit (queue, insn_uid);
3196 /* For DImode conversion, mark register defined by DEF as requiring
3197 conversion. */
3199 void
3200 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3202 gcc_assert (DF_REF_REG_DEF_P (def));
3204 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3205 return;
3207 if (dump_file)
3208 fprintf (dump_file,
3209 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3210 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3212 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3215 /* For TImode conversion, it is unused. */
3217 void
3218 timode_scalar_chain::mark_dual_mode_def (df_ref)
3220 gcc_unreachable ();
3223 /* Check REF's chain to add new insns into a queue
3224 and find registers requiring conversion. */
3226 void
3227 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3229 df_link *chain;
3231 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3232 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3233 add_to_queue (DF_REF_INSN_UID (ref));
3235 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3237 unsigned uid = DF_REF_INSN_UID (chain->ref);
3239 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3240 continue;
3242 if (!DF_REF_REG_MEM_P (chain->ref))
3244 if (bitmap_bit_p (insns, uid))
3245 continue;
3247 if (bitmap_bit_p (candidates, uid))
3249 add_to_queue (uid);
3250 continue;
3254 if (DF_REF_REG_DEF_P (chain->ref))
3256 if (dump_file)
3257 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3258 DF_REF_REGNO (chain->ref), uid);
3259 mark_dual_mode_def (chain->ref);
3261 else
3263 if (dump_file)
3264 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3265 DF_REF_REGNO (chain->ref), uid);
3266 mark_dual_mode_def (ref);
3271 /* Add instruction into a chain. */
3273 void
3274 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3276 if (bitmap_bit_p (insns, insn_uid))
3277 return;
3279 if (dump_file)
3280 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3282 bitmap_set_bit (insns, insn_uid);
3284 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3285 rtx def_set = single_set (insn);
3286 if (def_set && REG_P (SET_DEST (def_set))
3287 && !HARD_REGISTER_P (SET_DEST (def_set)))
3288 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3290 df_ref ref;
3291 df_ref def;
3292 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3293 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3294 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3295 def;
3296 def = DF_REF_NEXT_REG (def))
3297 analyze_register_chain (candidates, def);
3298 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3299 if (!DF_REF_REG_MEM_P (ref))
3300 analyze_register_chain (candidates, ref);
3303 /* Build new chain starting from insn INSN_UID recursively
3304 adding all dependent uses and definitions. */
3306 void
3307 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3309 queue = BITMAP_ALLOC (NULL);
3310 bitmap_set_bit (queue, insn_uid);
3312 if (dump_file)
3313 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3315 while (!bitmap_empty_p (queue))
3317 insn_uid = bitmap_first_set_bit (queue);
3318 bitmap_clear_bit (queue, insn_uid);
3319 bitmap_clear_bit (candidates, insn_uid);
3320 add_insn (candidates, insn_uid);
3323 if (dump_file)
3325 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3326 fprintf (dump_file, " insns: ");
3327 dump_bitmap (dump_file, insns);
3328 if (!bitmap_empty_p (defs_conv))
3330 bitmap_iterator bi;
3331 unsigned id;
3332 const char *comma = "";
3333 fprintf (dump_file, " defs to convert: ");
3334 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3336 fprintf (dump_file, "%sr%d", comma, id);
3337 comma = ", ";
3339 fprintf (dump_file, "\n");
3343 BITMAP_FREE (queue);
 3346 /* Return the cost of building a vector constant
 3347 instead of using a scalar one. */
3350 dimode_scalar_chain::vector_const_cost (rtx exp)
3352 gcc_assert (CONST_INT_P (exp));
3354 if (standard_sse_constant_p (exp, V2DImode))
3355 return COSTS_N_INSNS (1);
3356 return ix86_cost->sse_load[1];
3359 /* Compute a gain for chain conversion. */
3362 dimode_scalar_chain::compute_convert_gain ()
3364 bitmap_iterator bi;
3365 unsigned insn_uid;
3366 int gain = 0;
3367 int cost = 0;
3369 if (dump_file)
3370 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3372 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3374 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3375 rtx def_set = single_set (insn);
3376 rtx src = SET_SRC (def_set);
3377 rtx dst = SET_DEST (def_set);
3379 if (REG_P (src) && REG_P (dst))
3380 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3381 else if (REG_P (src) && MEM_P (dst))
3382 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3383 else if (MEM_P (src) && REG_P (dst))
3384 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3385 else if (GET_CODE (src) == PLUS
3386 || GET_CODE (src) == MINUS
3387 || GET_CODE (src) == IOR
3388 || GET_CODE (src) == XOR
3389 || GET_CODE (src) == AND)
3391 gain += ix86_cost->add;
3392 if (CONST_INT_P (XEXP (src, 0)))
3393 gain -= vector_const_cost (XEXP (src, 0));
3394 if (CONST_INT_P (XEXP (src, 1)))
3395 gain -= vector_const_cost (XEXP (src, 1));
3397 else if (GET_CODE (src) == COMPARE)
3399 /* Assume comparison cost is the same. */
3401 else if (GET_CODE (src) == CONST_INT)
3403 if (REG_P (dst))
3404 gain += COSTS_N_INSNS (2);
3405 else if (MEM_P (dst))
3406 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3407 gain -= vector_const_cost (src);
3409 else
3410 gcc_unreachable ();
3413 if (dump_file)
3414 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3416 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3417 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3419 if (dump_file)
3420 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3422 gain -= cost;
3424 if (dump_file)
3425 fprintf (dump_file, " Total gain: %d\n", gain);
3427 return gain;
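/* Worked example (illustrative only; the cost values below are made up,
   real ones come from the active ix86_cost table): for a chain with one
   reg-reg move and one AND of two registers, assuming sse_move == 2,
   add == 2, mmxsse_to_integer == 6 and a single register with one
   definition in defs_conv,

     gain = (COSTS_N_INSNS (2) - sse_move) + add - 1 * mmxsse_to_integer
          = (8 - 2) + 2 - 6 = 2

   so the chain would be considered profitable and converted.  */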
3430 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3433 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3435 if (x == reg)
3436 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3438 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3439 int i, j;
3440 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3442 if (fmt[i] == 'e')
3443 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3444 else if (fmt[i] == 'E')
3445 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3446 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3447 reg, new_reg);
3450 return x;
3453 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3455 void
3456 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3457 rtx reg, rtx new_reg)
3459 replace_with_subreg (single_set (insn), reg, new_reg);
 3462 /* Insert the generated conversion instruction sequence INSNS
 3463 after instruction AFTER.  A new BB may be required if the
 3464 instruction has an EH region attached. */
3466 void
3467 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3469 if (!control_flow_insn_p (after))
3471 emit_insn_after (insns, after);
3472 return;
3475 basic_block bb = BLOCK_FOR_INSN (after);
3476 edge e = find_fallthru_edge (bb->succs);
3477 gcc_assert (e);
3479 basic_block new_bb = split_edge (e);
3480 emit_insn_after (insns, BB_HEAD (new_bb));
 3483 /* Make vector copies for all definitions of register REGNO
 3484 and replace its uses in the chain. */
3486 void
3487 dimode_scalar_chain::make_vector_copies (unsigned regno)
3489 rtx reg = regno_reg_rtx[regno];
3490 rtx vreg = gen_reg_rtx (DImode);
3491 df_ref ref;
3493 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3494 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3496 rtx_insn *insn = DF_REF_INSN (ref);
3498 start_sequence ();
3499 if (TARGET_SSE4_1)
3501 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3502 CONST0_RTX (V4SImode),
3503 gen_rtx_SUBREG (SImode, reg, 0)));
3504 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3505 gen_rtx_SUBREG (V4SImode, vreg, 0),
3506 gen_rtx_SUBREG (SImode, reg, 4),
3507 GEN_INT (2)));
3509 else if (TARGET_INTER_UNIT_MOVES_TO_VEC)
3511 rtx tmp = gen_reg_rtx (DImode);
3512 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3513 CONST0_RTX (V4SImode),
3514 gen_rtx_SUBREG (SImode, reg, 0)));
3515 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3516 CONST0_RTX (V4SImode),
3517 gen_rtx_SUBREG (SImode, reg, 4)));
3518 emit_insn (gen_vec_interleave_lowv4si
3519 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3520 gen_rtx_SUBREG (V4SImode, vreg, 0),
3521 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3523 else
3525 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3526 emit_move_insn (adjust_address (tmp, SImode, 0),
3527 gen_rtx_SUBREG (SImode, reg, 0));
3528 emit_move_insn (adjust_address (tmp, SImode, 4),
3529 gen_rtx_SUBREG (SImode, reg, 4));
3530 emit_move_insn (vreg, tmp);
3532 rtx_insn *seq = get_insns ();
3533 end_sequence ();
3534 emit_conversion_insns (seq, insn);
3536 if (dump_file)
3537 fprintf (dump_file,
3538 " Copied r%d to a vector register r%d for insn %d\n",
3539 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
3542 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3543 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3545 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, vreg);
3547 if (dump_file)
3548 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3549 regno, REGNO (vreg), DF_REF_INSN_UID (ref));
 3553 /* Convert all definitions of register REGNO
 3554 and fix its uses.  Scalar copies may be created
 3555 if the register is used in a non-convertible insn. */
3557 void
3558 dimode_scalar_chain::convert_reg (unsigned regno)
3560 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3561 rtx reg = regno_reg_rtx[regno];
3562 rtx scopy = NULL_RTX;
3563 df_ref ref;
3564 bitmap conv;
3566 conv = BITMAP_ALLOC (NULL);
3567 bitmap_copy (conv, insns);
3569 if (scalar_copy)
3570 scopy = gen_reg_rtx (DImode);
3572 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3574 rtx_insn *insn = DF_REF_INSN (ref);
3575 rtx def_set = single_set (insn);
3576 rtx src = SET_SRC (def_set);
3577 rtx reg = DF_REF_REG (ref);
3579 if (!MEM_P (src))
3581 replace_with_subreg_in_insn (insn, reg, reg);
3582 bitmap_clear_bit (conv, INSN_UID (insn));
3585 if (scalar_copy)
3587 rtx vcopy = gen_reg_rtx (V2DImode);
3589 start_sequence ();
3590 if (TARGET_INTER_UNIT_MOVES_FROM_VEC)
3592 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3593 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3594 gen_rtx_SUBREG (SImode, vcopy, 0));
3595 emit_move_insn (vcopy,
3596 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3597 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3598 gen_rtx_SUBREG (SImode, vcopy, 0));
3600 else
3602 rtx tmp = assign_386_stack_local (DImode, SLOT_TEMP);
3603 emit_move_insn (tmp, reg);
3604 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3605 adjust_address (tmp, SImode, 0));
3606 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3607 adjust_address (tmp, SImode, 4));
3609 rtx_insn *seq = get_insns ();
3610 end_sequence ();
3611 emit_conversion_insns (seq, insn);
3613 if (dump_file)
3614 fprintf (dump_file,
3615 " Copied r%d to a scalar register r%d for insn %d\n",
3616 regno, REGNO (scopy), INSN_UID (insn));
3620 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3621 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3623 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3625 rtx def_set = single_set (DF_REF_INSN (ref));
3626 if (!MEM_P (SET_DEST (def_set))
3627 || !REG_P (SET_SRC (def_set)))
3628 replace_with_subreg_in_insn (DF_REF_INSN (ref), reg, reg);
3629 bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
3632 /* Skip debug insns and uninitialized uses. */
3633 else if (DF_REF_CHAIN (ref)
3634 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
3636 gcc_assert (scopy);
3637 replace_rtx (DF_REF_INSN (ref), reg, scopy);
3638 df_insn_rescan (DF_REF_INSN (ref));
3641 BITMAP_FREE (conv);
 3644 /* Convert operand OP in INSN.  We should handle
 3645 memory operands and uninitialized registers.
 3646 All other register uses are converted during
 3647 register conversion. */
3649 void
3650 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
3652 *op = copy_rtx_if_shared (*op);
3654 if (GET_CODE (*op) == NOT)
3656 convert_op (&XEXP (*op, 0), insn);
3657 PUT_MODE (*op, V2DImode);
3659 else if (MEM_P (*op))
3661 rtx tmp = gen_reg_rtx (DImode);
3663 emit_insn_before (gen_move_insn (tmp, *op), insn);
3664 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
3666 if (dump_file)
3667 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
3668 INSN_UID (insn), REGNO (tmp));
3670 else if (REG_P (*op))
 3672 /* We may not have converted this register use if the
 3673 register has no definition.  Otherwise it should have
 3674 been converted in convert_reg. */
3675 df_ref ref;
3676 FOR_EACH_INSN_USE (ref, insn)
3677 if (DF_REF_REGNO (ref) == REGNO (*op))
3679 gcc_assert (!DF_REF_CHAIN (ref));
3680 break;
3682 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
3684 else if (CONST_INT_P (*op))
3686 rtx vec_cst;
3687 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
3689 /* Prefer all ones vector in case of -1. */
3690 if (constm1_operand (*op, GET_MODE (*op)))
3691 vec_cst = CONSTM1_RTX (V2DImode);
3692 else
3693 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
3694 gen_rtvec (2, *op, const0_rtx));
3696 if (!standard_sse_constant_p (vec_cst, V2DImode))
3698 start_sequence ();
3699 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
3700 rtx_insn *seq = get_insns ();
3701 end_sequence ();
3702 emit_insn_before (seq, insn);
3705 emit_insn_before (gen_move_insn (tmp, vec_cst), insn);
3706 *op = tmp;
3708 else
3710 gcc_assert (SUBREG_P (*op));
3711 gcc_assert (GET_MODE (*op) == V2DImode);
3715 /* Convert INSN to vector mode. */
3717 void
3718 dimode_scalar_chain::convert_insn (rtx_insn *insn)
3720 rtx def_set = single_set (insn);
3721 rtx src = SET_SRC (def_set);
3722 rtx dst = SET_DEST (def_set);
3723 rtx subreg;
3725 if (MEM_P (dst) && !REG_P (src))
3727 /* There are no scalar integer instructions and therefore
3728 temporary register usage is required. */
3729 rtx tmp = gen_reg_rtx (DImode);
3730 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
3731 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
3734 switch (GET_CODE (src))
3736 case PLUS:
3737 case MINUS:
3738 case IOR:
3739 case XOR:
3740 case AND:
3741 convert_op (&XEXP (src, 0), insn);
3742 convert_op (&XEXP (src, 1), insn);
3743 PUT_MODE (src, V2DImode);
3744 break;
3746 case MEM:
3747 if (!REG_P (dst))
3748 convert_op (&src, insn);
3749 break;
3751 case REG:
3752 if (!MEM_P (dst))
3753 convert_op (&src, insn);
3754 break;
3756 case SUBREG:
3757 gcc_assert (GET_MODE (src) == V2DImode);
3758 break;
3760 case COMPARE:
3761 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
3763 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
3764 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
3766 if (REG_P (src))
3767 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
3768 else
3769 subreg = copy_rtx_if_shared (src);
3770 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
3771 copy_rtx_if_shared (subreg),
3772 copy_rtx_if_shared (subreg)),
3773 insn);
3774 dst = gen_rtx_REG (CCmode, FLAGS_REG);
3775 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
3776 copy_rtx_if_shared (src)),
3777 UNSPEC_PTEST);
3778 break;
3780 case CONST_INT:
3781 convert_op (&src, insn);
3782 break;
3784 default:
3785 gcc_unreachable ();
3788 SET_SRC (def_set) = src;
3789 SET_DEST (def_set) = dst;
3791 /* Drop possible dead definitions. */
3792 PATTERN (insn) = def_set;
3794 INSN_CODE (insn) = -1;
3795 recog_memoized (insn);
3796 df_insn_rescan (insn);
3799 /* Fix uses of converted REG in debug insns. */
3801 void
3802 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
3804 if (!flag_var_tracking)
3805 return;
3807 df_ref ref;
3808 for (ref = DF_REG_USE_CHAIN (REGNO (reg));
3809 ref;
3810 ref = DF_REF_NEXT_REG (ref))
3812 rtx_insn *insn = DF_REF_INSN (ref);
3813 if (DEBUG_INSN_P (insn))
3815 /* It may be a debug insn with a TImode variable in
3816 register. */
3817 rtx val = PATTERN (insn);
3818 if (GET_MODE (val) != TImode)
3819 continue;
3820 gcc_assert (GET_CODE (val) == VAR_LOCATION);
3821 rtx loc = PAT_VAR_LOCATION_LOC (val);
3822 /* It may have been converted to TImode already. */
3823 if (GET_MODE (loc) == TImode)
3824 continue;
3825 gcc_assert (REG_P (loc)
3826 && GET_MODE (loc) == V1TImode);
 3827 /* Convert the V1TImode register, which has been updated by a
 3828 preceding SET insn, to a TImode SUBREG. */
3829 PAT_VAR_LOCATION_LOC (val) = gen_rtx_SUBREG (TImode, loc, 0);
3830 df_insn_rescan (insn);
 3835 /* Convert INSN from TImode to V1TImode. */
3837 void
3838 timode_scalar_chain::convert_insn (rtx_insn *insn)
3840 rtx def_set = single_set (insn);
3841 rtx src = SET_SRC (def_set);
3842 rtx dst = SET_DEST (def_set);
3844 switch (GET_CODE (dst))
3846 case REG:
3848 rtx tmp = find_reg_equal_equiv_note (insn);
3849 if (tmp)
3850 PUT_MODE (XEXP (tmp, 0), V1TImode);
3851 PUT_MODE (dst, V1TImode);
3852 fix_debug_reg_uses (dst);
3854 break;
3855 case MEM:
3856 PUT_MODE (dst, V1TImode);
3857 break;
3859 default:
3860 gcc_unreachable ();
3863 switch (GET_CODE (src))
3865 case REG:
3866 PUT_MODE (src, V1TImode);
3867 /* Call fix_debug_reg_uses only if SRC is never defined. */
3868 if (!DF_REG_DEF_CHAIN (REGNO (src)))
3869 fix_debug_reg_uses (src);
3870 break;
3872 case MEM:
3873 PUT_MODE (src, V1TImode);
3874 break;
3876 case CONST_WIDE_INT:
3877 if (NONDEBUG_INSN_P (insn))
 3879 /* Since there are no instructions to store a 128-bit constant,
 3880 a temporary register is required. */
3881 rtx tmp = gen_reg_rtx (V1TImode);
3882 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
3883 src = validize_mem (force_const_mem (V1TImode, src));
3884 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3885 dst = tmp;
3887 break;
3889 case CONST_INT:
3890 switch (standard_sse_constant_p (src, TImode))
3892 case 1:
3893 src = CONST0_RTX (GET_MODE (dst));
3894 break;
3895 case 2:
3896 src = CONSTM1_RTX (GET_MODE (dst));
3897 break;
3898 default:
3899 gcc_unreachable ();
3901 if (NONDEBUG_INSN_P (insn))
3903 rtx tmp = gen_reg_rtx (V1TImode);
 3904 /* Since there are no instructions to store a standard SSE
 3905 constant, a temporary register is required. */
3906 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
3907 dst = tmp;
3909 break;
3911 default:
3912 gcc_unreachable ();
3915 SET_SRC (def_set) = src;
3916 SET_DEST (def_set) = dst;
3918 /* Drop possible dead definitions. */
3919 PATTERN (insn) = def_set;
3921 INSN_CODE (insn) = -1;
3922 recog_memoized (insn);
3923 df_insn_rescan (insn);
3926 void
3927 dimode_scalar_chain::convert_registers ()
3929 bitmap_iterator bi;
3930 unsigned id;
3932 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
3933 convert_reg (id);
3935 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
3936 make_vector_copies (id);
 3939 /* Convert the whole chain, creating the required register
 3940 conversions and copies. */
3943 scalar_chain::convert ()
3945 bitmap_iterator bi;
3946 unsigned id;
3947 int converted_insns = 0;
3949 if (!dbg_cnt (stv_conversion))
3950 return 0;
3952 if (dump_file)
3953 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
3955 convert_registers ();
3957 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
3959 convert_insn (DF_INSN_UID_GET (id)->insn);
3960 converted_insns++;
3963 return converted_insns;
3966 /* Main STV pass function. Find and convert scalar
3967 instructions into vector mode when profitable. */
3969 static unsigned int
3970 convert_scalars_to_vector ()
3972 basic_block bb;
3973 bitmap candidates;
3974 int converted_insns = 0;
3976 bitmap_obstack_initialize (NULL);
3977 candidates = BITMAP_ALLOC (NULL);
3979 calculate_dominance_info (CDI_DOMINATORS);
3980 df_set_flags (DF_DEFER_INSN_RESCAN);
3981 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
3982 df_md_add_problem ();
3983 df_analyze ();
3985 /* Find all instructions we want to convert into vector mode. */
3986 if (dump_file)
3987 fprintf (dump_file, "Searching for mode conversion candidates...\n");
3989 FOR_EACH_BB_FN (bb, cfun)
3991 rtx_insn *insn;
3992 FOR_BB_INSNS (bb, insn)
3993 if (scalar_to_vector_candidate_p (insn))
3995 if (dump_file)
3996 fprintf (dump_file, " insn %d is marked as a candidate\n",
3997 INSN_UID (insn));
3999 bitmap_set_bit (candidates, INSN_UID (insn));
4003 remove_non_convertible_regs (candidates);
4005 if (bitmap_empty_p (candidates))
4006 if (dump_file)
4007 fprintf (dump_file, "There are no candidates for optimization.\n");
4009 while (!bitmap_empty_p (candidates))
4011 unsigned uid = bitmap_first_set_bit (candidates);
4012 scalar_chain *chain;
4014 if (TARGET_64BIT)
4015 chain = new timode_scalar_chain;
4016 else
4017 chain = new dimode_scalar_chain;
 4019 /* Find the instruction chain we want to convert to vector mode.
 4020 Check all uses and definitions to estimate all required
 4021 conversions. */
4022 chain->build (candidates, uid);
4024 if (chain->compute_convert_gain () > 0)
4025 converted_insns += chain->convert ();
4026 else
4027 if (dump_file)
4028 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4029 chain->chain_id);
4031 delete chain;
4034 if (dump_file)
4035 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4037 BITMAP_FREE (candidates);
4038 bitmap_obstack_release (NULL);
4039 df_process_deferred_rescans ();
 4041 /* Conversion means we may have 128-bit register spills/fills,
 4042 which require an aligned stack. */
4043 if (converted_insns)
4045 if (crtl->stack_alignment_needed < 128)
4046 crtl->stack_alignment_needed = 128;
4047 if (crtl->stack_alignment_estimated < 128)
4048 crtl->stack_alignment_estimated = 128;
4051 return 0;
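/* Illustrative example (added for clarity, not from the original sources):
   with -m32 -msse2 -mstv, a 64-bit logical operation such as

     long long f (long long a, long long b) { return a & b; }

   can be turned by this pass from a pair of 32-bit ANDs into a single
   V2DImode SSE operation on xmm registers, provided the gain computed in
   compute_convert_gain outweighs the register copy costs.  */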
4054 namespace {
4056 const pass_data pass_data_insert_vzeroupper =
4058 RTL_PASS, /* type */
4059 "vzeroupper", /* name */
4060 OPTGROUP_NONE, /* optinfo_flags */
4061 TV_MACH_DEP, /* tv_id */
4062 0, /* properties_required */
4063 0, /* properties_provided */
4064 0, /* properties_destroyed */
4065 0, /* todo_flags_start */
4066 TODO_df_finish, /* todo_flags_finish */
4069 class pass_insert_vzeroupper : public rtl_opt_pass
4071 public:
4072 pass_insert_vzeroupper(gcc::context *ctxt)
4073 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4076 /* opt_pass methods: */
4077 virtual bool gate (function *)
4079 return TARGET_AVX && !TARGET_AVX512F
4080 && TARGET_VZEROUPPER && flag_expensive_optimizations
4081 && !optimize_size;
4084 virtual unsigned int execute (function *)
4086 return rest_of_handle_insert_vzeroupper ();
4089 }; // class pass_insert_vzeroupper
4091 const pass_data pass_data_stv =
4093 RTL_PASS, /* type */
4094 "stv", /* name */
4095 OPTGROUP_NONE, /* optinfo_flags */
4096 TV_MACH_DEP, /* tv_id */
4097 0, /* properties_required */
4098 0, /* properties_provided */
4099 0, /* properties_destroyed */
4100 0, /* todo_flags_start */
4101 TODO_df_finish, /* todo_flags_finish */
4104 class pass_stv : public rtl_opt_pass
4106 public:
4107 pass_stv (gcc::context *ctxt)
4108 : rtl_opt_pass (pass_data_stv, ctxt)
4111 /* opt_pass methods: */
4112 virtual bool gate (function *)
4114 return TARGET_STV && TARGET_SSE2 && optimize > 1;
4117 virtual unsigned int execute (function *)
4119 return convert_scalars_to_vector ();
4122 }; // class pass_stv
4124 } // anon namespace
4126 rtl_opt_pass *
4127 make_pass_insert_vzeroupper (gcc::context *ctxt)
4129 return new pass_insert_vzeroupper (ctxt);
4132 rtl_opt_pass *
4133 make_pass_stv (gcc::context *ctxt)
4135 return new pass_stv (ctxt);
4138 /* Return true if a red-zone is in use. */
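/* (Background note, added for clarity: the red zone is the 128-byte area
   below the stack pointer that leaf functions may use without adjusting
   %rsp in the SysV x86-64 ABI; the Microsoft x64 ABI has no red zone,
   hence the TARGET_64BIT_MS_ABI check below.)  */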
4140 bool
4141 ix86_using_red_zone (void)
4143 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4146 /* Return a string that documents the current -m options. The caller is
4147 responsible for freeing the string. */
4149 static char *
4150 ix86_target_string (HOST_WIDE_INT isa, int flags, int ix86_flags,
4151 const char *arch, const char *tune,
4152 enum fpmath_unit fpmath, bool add_nl_p)
4154 struct ix86_target_opts
4156 const char *option; /* option string */
4157 HOST_WIDE_INT mask; /* isa mask options */
 4160 /* This table is ordered so that options like -msse4.2 that imply
 4161 preceding options are matched first. */
4162 static struct ix86_target_opts isa_opts[] =
4164 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4165 { "-mfma", OPTION_MASK_ISA_FMA },
4166 { "-mxop", OPTION_MASK_ISA_XOP },
4167 { "-mlwp", OPTION_MASK_ISA_LWP },
4168 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4169 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4170 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4171 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4172 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4173 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4174 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4175 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4176 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4177 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4178 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4179 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4180 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4181 { "-msse3", OPTION_MASK_ISA_SSE3 },
4182 { "-msse2", OPTION_MASK_ISA_SSE2 },
4183 { "-msse", OPTION_MASK_ISA_SSE },
4184 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4185 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4186 { "-mmmx", OPTION_MASK_ISA_MMX },
4187 { "-mabm", OPTION_MASK_ISA_ABM },
4188 { "-mbmi", OPTION_MASK_ISA_BMI },
4189 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4190 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4191 { "-mhle", OPTION_MASK_ISA_HLE },
4192 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4193 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4194 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4195 { "-madx", OPTION_MASK_ISA_ADX },
4196 { "-mtbm", OPTION_MASK_ISA_TBM },
4197 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4198 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4199 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4200 { "-maes", OPTION_MASK_ISA_AES },
4201 { "-msha", OPTION_MASK_ISA_SHA },
4202 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4203 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4204 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4205 { "-mf16c", OPTION_MASK_ISA_F16C },
4206 { "-mrtm", OPTION_MASK_ISA_RTM },
4207 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4208 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4209 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4210 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4211 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4212 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4213 { "-mmpx", OPTION_MASK_ISA_MPX },
4214 { "-mclwb", OPTION_MASK_ISA_CLWB },
4215 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4216 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4217 { "-mpku", OPTION_MASK_ISA_PKU },
4220 /* Flag options. */
4221 static struct ix86_target_opts flag_opts[] =
4223 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4224 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4225 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4226 { "-m80387", MASK_80387 },
4227 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4228 { "-malign-double", MASK_ALIGN_DOUBLE },
4229 { "-mcld", MASK_CLD },
4230 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4231 { "-mieee-fp", MASK_IEEE_FP },
4232 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4233 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4234 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4235 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4236 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4237 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4238 { "-mno-red-zone", MASK_NO_RED_ZONE },
4239 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4240 { "-mrecip", MASK_RECIP },
4241 { "-mrtd", MASK_RTD },
4242 { "-msseregparm", MASK_SSEREGPARM },
4243 { "-mstack-arg-probe", MASK_STACK_PROBE },
4244 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4245 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4246 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4247 { "-mvzeroupper", MASK_VZEROUPPER },
4248 { "-mstv", MASK_STV},
4249 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
4250 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
4251 { "-mprefer-avx128", MASK_PREFER_AVX128},
4254 /* Additional flag options. */
4255 static struct ix86_target_opts ix86_flag_opts[] =
4257 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4260 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts)
4261 + ARRAY_SIZE (ix86_flag_opts) + 6][2];
4263 char isa_other[40];
4264 char target_other[40];
4265 char ix86_target_other[40];
4266 unsigned num = 0;
4267 unsigned i, j;
4268 char *ret;
4269 char *ptr;
4270 size_t len;
4271 size_t line_len;
4272 size_t sep_len;
4273 const char *abi;
4275 memset (opts, '\0', sizeof (opts));
4277 /* Add -march= option. */
4278 if (arch)
4280 opts[num][0] = "-march=";
4281 opts[num++][1] = arch;
4284 /* Add -mtune= option. */
4285 if (tune)
4287 opts[num][0] = "-mtune=";
4288 opts[num++][1] = tune;
4291 /* Add -m32/-m64/-mx32. */
4292 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4294 if ((isa & OPTION_MASK_ABI_64) != 0)
4295 abi = "-m64";
4296 else
4297 abi = "-mx32";
4298 isa &= ~ (OPTION_MASK_ISA_64BIT
4299 | OPTION_MASK_ABI_64
4300 | OPTION_MASK_ABI_X32);
4302 else
4303 abi = "-m32";
4304 opts[num++][0] = abi;
4306 /* Pick out the options in isa options. */
4307 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4309 if ((isa & isa_opts[i].mask) != 0)
4311 opts[num++][0] = isa_opts[i].option;
4312 isa &= ~ isa_opts[i].mask;
4316 if (isa && add_nl_p)
4318 opts[num++][0] = isa_other;
4319 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
4320 isa);
4323 /* Add flag options. */
4324 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4326 if ((flags & flag_opts[i].mask) != 0)
4328 opts[num++][0] = flag_opts[i].option;
4329 flags &= ~ flag_opts[i].mask;
4333 if (flags && add_nl_p)
4335 opts[num++][0] = target_other;
4336 sprintf (target_other, "(other flags: %#x)", flags);
4339 /* Add additional flag options. */
4340 for (i = 0; i < ARRAY_SIZE (ix86_flag_opts); i++)
4342 if ((ix86_flags & ix86_flag_opts[i].mask) != 0)
4344 opts[num++][0] = ix86_flag_opts[i].option;
4345 ix86_flags &= ~ ix86_flag_opts[i].mask;
4349 if (ix86_flags && add_nl_p)
4351 opts[num++][0] = ix86_target_other;
4352 sprintf (ix86_target_other, "(other flags: %#x)", ix86_flags);
4355 /* Add -fpmath= option. */
4356 if (fpmath)
4358 opts[num][0] = "-mfpmath=";
4359 switch ((int) fpmath)
4361 case FPMATH_387:
4362 opts[num++][1] = "387";
4363 break;
4365 case FPMATH_SSE:
4366 opts[num++][1] = "sse";
4367 break;
4369 case FPMATH_387 | FPMATH_SSE:
4370 opts[num++][1] = "sse+387";
4371 break;
4373 default:
4374 gcc_unreachable ();
4378 /* Any options? */
4379 if (num == 0)
4380 return NULL;
4382 gcc_assert (num < ARRAY_SIZE (opts));
4384 /* Size the string. */
4385 len = 0;
4386 sep_len = (add_nl_p) ? 3 : 1;
4387 for (i = 0; i < num; i++)
4389 len += sep_len;
4390 for (j = 0; j < 2; j++)
4391 if (opts[i][j])
4392 len += strlen (opts[i][j]);
4395 /* Build the string. */
4396 ret = ptr = (char *) xmalloc (len);
4397 line_len = 0;
4399 for (i = 0; i < num; i++)
4401 size_t len2[2];
4403 for (j = 0; j < 2; j++)
4404 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4406 if (i != 0)
4408 *ptr++ = ' ';
4409 line_len++;
4411 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4413 *ptr++ = '\\';
4414 *ptr++ = '\n';
4415 line_len = 0;
4419 for (j = 0; j < 2; j++)
4420 if (opts[i][j])
4422 memcpy (ptr, opts[i][j], len2[j]);
4423 ptr += len2[j];
4424 line_len += len2[j];
4428 *ptr = '\0';
4429 gcc_assert (ret + len >= ptr);
4431 return ret;
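/* For illustration only (exact contents depend on the isa_opts/flag_opts
   tables above): for -m64 -march=haswell -mfpmath=sse this returns a
   string along the lines of

     "-march=haswell -mtune=haswell -m64 -mfma -mavx2 ... -mfpmath=sse"

   i.e. -march=/-mtune= first, then the ABI switch, then the enabled ISA
   and flag options, and finally -mfpmath=.  */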
 4434 /* Return true if profiling code should be emitted before the
 4435 prologue, otherwise false.
 4436 Note: for x86 with "hotfix" this is not supported. */
4437 static bool
4438 ix86_profile_before_prologue (void)
4440 return flag_fentry != 0;
4443 /* Function that is callable from the debugger to print the current
4444 options. */
4445 void ATTRIBUTE_UNUSED
4446 ix86_debug_options (void)
4448 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
4449 ix86_target_flags,
4450 ix86_arch_string, ix86_tune_string,
4451 ix86_fpmath, true);
4453 if (opts)
4455 fprintf (stderr, "%s\n\n", opts);
4456 free (opts);
4458 else
4459 fputs ("<no options>\n\n", stderr);
4461 return;
4464 /* Return true if T is one of the bytes we should avoid with
4465 -fmitigate-rop. */
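/* (Explanatory note, not in the original: these are the x86 return
   opcodes -- 0xc3/0xcb are near/far RET and 0xc2/0xca their "ret imm16"
   forms -- i.e. the bytes a ROP gadget typically ends with.)  */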
4467 static bool
4468 ix86_rop_should_change_byte_p (int t)
4470 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4473 static const char *stringop_alg_names[] = {
4474 #define DEF_ENUM
4475 #define DEF_ALG(alg, name) #name,
4476 #include "stringop.def"
4477 #undef DEF_ENUM
4478 #undef DEF_ALG
4481 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4482 The string is of the following form (or comma separated list of it):
4484 strategy_alg:max_size:[align|noalign]
4486 where the full size range for the strategy is either [0, max_size] or
4487 [min_size, max_size], in which min_size is the max_size + 1 of the
4488 preceding range. The last size range must have max_size == -1.
4490 Examples:
4493 -mmemcpy-strategy=libcall:-1:noalign
4495 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
4499 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
4501 This is to tell the compiler to use the following strategy for memset
4502 1) when the expected size is between [1, 16], use rep_8byte strategy;
4503 2) when the size is between [17, 2048], use vector_loop;
4504 3) when the size is > 2048, use libcall. */
4506 struct stringop_size_range
4508 int max;
4509 stringop_alg alg;
4510 bool noalign;
4513 static void
4514 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
4516 const struct stringop_algs *default_algs;
4517 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
4518 char *curr_range_str, *next_range_str;
4519 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
4520 int i = 0, n = 0;
4522 if (is_memset)
4523 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
4524 else
4525 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
4527 curr_range_str = strategy_str;
4531 int maxs;
4532 char alg_name[128];
4533 char align[16];
4534 next_range_str = strchr (curr_range_str, ',');
4535 if (next_range_str)
4536 *next_range_str++ = '\0';
4538 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
4539 alg_name, &maxs, align))
4541 error ("wrong argument %qs to option %qs", curr_range_str, opt);
4542 return;
4545 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
4547 error ("size ranges of option %qs should be increasing", opt);
4548 return;
4551 for (i = 0; i < last_alg; i++)
4552 if (!strcmp (alg_name, stringop_alg_names[i]))
4553 break;
4555 if (i == last_alg)
4557 error ("wrong strategy name %qs specified for option %qs",
4558 alg_name, opt);
4560 auto_vec <const char *> candidates;
4561 for (i = 0; i < last_alg; i++)
4562 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
4563 candidates.safe_push (stringop_alg_names[i]);
4565 char *s;
4566 const char *hint
4567 = candidates_list_and_hint (alg_name, s, candidates);
4568 if (hint)
4569 inform (input_location,
4570 "valid arguments to %qs are: %s; did you mean %qs?",
4571 opt, s, hint);
4572 else
4573 inform (input_location, "valid arguments to %qs are: %s",
4574 opt, s);
4575 XDELETEVEC (s);
4576 return;
4579 if ((stringop_alg) i == rep_prefix_8_byte
4580 && !TARGET_64BIT)
4582 /* rep; movq isn't available in 32-bit code. */
4583 error ("strategy name %qs specified for option %qs "
4584 "not supported for 32-bit code", alg_name, opt);
4585 return;
4588 input_ranges[n].max = maxs;
4589 input_ranges[n].alg = (stringop_alg) i;
4590 if (!strcmp (align, "align"))
4591 input_ranges[n].noalign = false;
4592 else if (!strcmp (align, "noalign"))
4593 input_ranges[n].noalign = true;
4594 else
4596 error ("unknown alignment %qs specified for option %qs", align, opt);
4597 return;
4599 n++;
4600 curr_range_str = next_range_str;
4602 while (curr_range_str);
4604 if (input_ranges[n - 1].max != -1)
4606 error ("the max value for the last size range should be -1"
4607 " for option %qs", opt);
4608 return;
4611 if (n > MAX_STRINGOP_ALGS)
4613 error ("too many size ranges specified in option %qs", opt);
4614 return;
4617 /* Now override the default algs array. */
4618 for (i = 0; i < n; i++)
4620 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
4621 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
4622 = input_ranges[i].alg;
4623 *const_cast<int *>(&default_algs->size[i].noalign)
4624 = input_ranges[i].noalign;
 4629 /* Parse the -mtune-ctrl= option.  When DUMP is true,
 4630 print the features that are explicitly set. */
4632 static void
4633 parse_mtune_ctrl_str (bool dump)
4635 if (!ix86_tune_ctrl_string)
4636 return;
4638 char *next_feature_string = NULL;
4639 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
4640 char *orig = curr_feature_string;
4641 int i;
4644 bool clear = false;
4646 next_feature_string = strchr (curr_feature_string, ',');
4647 if (next_feature_string)
4648 *next_feature_string++ = '\0';
4649 if (*curr_feature_string == '^')
4651 curr_feature_string++;
4652 clear = true;
4654 for (i = 0; i < X86_TUNE_LAST; i++)
4656 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
4658 ix86_tune_features[i] = !clear;
4659 if (dump)
4660 fprintf (stderr, "Explicitly %s feature %s\n",
4661 clear ? "clear" : "set", ix86_tune_feature_names[i]);
4662 break;
4665 if (i == X86_TUNE_LAST)
4666 error ("Unknown parameter to option -mtune-ctrl: %s",
4667 clear ? curr_feature_string - 1 : curr_feature_string);
4668 curr_feature_string = next_feature_string;
4670 while (curr_feature_string);
4671 free (orig);
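/* Usage sketch (illustrative; "feature_a" and "feature_b" are placeholder
   names, the real ones come from the x86 tuning feature table):

     -mtune-ctrl=feature_a,^feature_b

   explicitly sets "feature_a" and clears "feature_b"; a leading '^'
   clears a feature, and any name not found is diagnosed by the error
   above.  */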
4674 /* Helper function to set ix86_tune_features. IX86_TUNE is the
4675 processor type. */
4677 static void
4678 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
4680 unsigned int ix86_tune_mask = 1u << ix86_tune;
4681 int i;
4683 for (i = 0; i < X86_TUNE_LAST; ++i)
4685 if (ix86_tune_no_default)
4686 ix86_tune_features[i] = 0;
4687 else
4688 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4691 if (dump)
4693 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
4694 for (i = 0; i < X86_TUNE_LAST; i++)
4695 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
4696 ix86_tune_features[i] ? "on" : "off");
4699 parse_mtune_ctrl_str (dump);
4703 /* Default align_* from the processor table. */
4705 static void
4706 ix86_default_align (struct gcc_options *opts)
4708 if (opts->x_align_loops == 0)
4710 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
4711 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
4713 if (opts->x_align_jumps == 0)
4715 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
4716 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
4718 if (opts->x_align_functions == 0)
4720 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
4724 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
4726 static void
4727 ix86_override_options_after_change (void)
4729 ix86_default_align (&global_options);
4732 /* Override various settings based on options. If MAIN_ARGS_P, the
4733 options are from the command line, otherwise they are from
4734 attributes. Return true if there's an error related to march
4735 option. */
4737 static bool
4738 ix86_option_override_internal (bool main_args_p,
4739 struct gcc_options *opts,
4740 struct gcc_options *opts_set)
4742 int i;
4743 unsigned int ix86_arch_mask;
4744 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
4746 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
4747 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
4748 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
4749 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
4750 #define PTA_AES (HOST_WIDE_INT_1 << 4)
4751 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
4752 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
4753 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
4754 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
4755 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
4756 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
4757 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
4758 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
4759 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
4760 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
4761 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
4762 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
4763 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
4764 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
4765 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
4766 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
4767 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
4768 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
4769 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
4770 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
4771 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
4772 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
4773 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
4774 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
4775 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
4776 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
4777 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
4778 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
4779 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
4780 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
4781 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
4782 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
4783 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
4784 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
4785 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
4786 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
4787 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
4788 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
4789 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
4790 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
4791 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
4792 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
4793 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
4794 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
4795 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
4796 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
4797 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
4798 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
4799 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
4800 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
4801 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
4802 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
4803 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
4804 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
4805 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
4807 #define PTA_CORE2 \
4808 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
4809 | PTA_CX16 | PTA_FXSR)
4810 #define PTA_NEHALEM \
4811 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
4812 #define PTA_WESTMERE \
4813 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
4814 #define PTA_SANDYBRIDGE \
4815 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
4816 #define PTA_IVYBRIDGE \
4817 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
4818 #define PTA_HASWELL \
4819 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
4820 | PTA_FMA | PTA_MOVBE | PTA_HLE)
4821 #define PTA_BROADWELL \
4822 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
4823 #define PTA_SKYLAKE \
4824 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
4825 #define PTA_SKYLAKE_AVX512 \
4826 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
4827 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
4828 #define PTA_KNL \
4829 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
4830 #define PTA_BONNELL \
4831 (PTA_CORE2 | PTA_MOVBE)
4832 #define PTA_SILVERMONT \
4833 (PTA_WESTMERE | PTA_MOVBE)
 4835 /* If this reaches 64, we need to widen the struct pta flags below. */
4837 static struct pta
4839 const char *const name; /* processor name or nickname. */
4840 const enum processor_type processor;
4841 const enum attr_cpu schedule;
4842 const unsigned HOST_WIDE_INT flags;
4844 const processor_alias_table[] =
4846 {"i386", PROCESSOR_I386, CPU_NONE, 0},
4847 {"i486", PROCESSOR_I486, CPU_NONE, 0},
4848 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4849 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
4850 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
4851 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
4852 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
4853 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4854 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4855 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
4856 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4857 PTA_MMX | PTA_SSE | PTA_FXSR},
4858 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4859 PTA_MMX | PTA_SSE | PTA_FXSR},
4860 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4861 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4862 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4863 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4864 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4865 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
4866 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
4867 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4868 PTA_MMX | PTA_SSE | PTA_FXSR},
4869 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4870 PTA_MMX | PTA_SSE | PTA_FXSR},
4871 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
4872 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4873 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
4874 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
4875 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
4876 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
4877 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
4878 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4879 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
4880 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4881 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
4882 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
4883 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4884 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
4885 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
4886 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4887 PTA_SANDYBRIDGE},
4888 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4889 PTA_SANDYBRIDGE},
4890 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4891 PTA_IVYBRIDGE},
4892 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
4893 PTA_IVYBRIDGE},
4894 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4895 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
4896 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
4897 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
4898 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
4899 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4900 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
4901 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4902 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
4903 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
4904 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
4905 {"geode", PROCESSOR_GEODE, CPU_GEODE,
4906 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4907 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
4908 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4909 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
4910 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
4911 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4912 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
4913 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
4914 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
4915 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4916 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
4917 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4918 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
4919 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
4920 {"x86-64", PROCESSOR_K8, CPU_K8,
4921 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4922 {"eden-x2", PROCESSOR_K8, CPU_K8,
4923 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
4924 {"nano", PROCESSOR_K8, CPU_K8,
4925 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4926 | PTA_SSSE3 | PTA_FXSR},
4927 {"nano-1000", PROCESSOR_K8, CPU_K8,
4928 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4929 | PTA_SSSE3 | PTA_FXSR},
4930 {"nano-2000", PROCESSOR_K8, CPU_K8,
4931 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4932 | PTA_SSSE3 | PTA_FXSR},
4933 {"nano-3000", PROCESSOR_K8, CPU_K8,
4934 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4935 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4936 {"nano-x2", PROCESSOR_K8, CPU_K8,
4937 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4938 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4939 {"eden-x4", PROCESSOR_K8, CPU_K8,
4940 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4941 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4942 {"nano-x4", PROCESSOR_K8, CPU_K8,
4943 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4944 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
4945 {"k8", PROCESSOR_K8, CPU_K8,
4946 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4947 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4948 {"k8-sse3", PROCESSOR_K8, CPU_K8,
4949 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4950 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4951 {"opteron", PROCESSOR_K8, CPU_K8,
4952 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4953 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4954 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
4955 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4956 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4957 {"athlon64", PROCESSOR_K8, CPU_K8,
4958 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4959 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4960 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
4961 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4962 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
4963 {"athlon-fx", PROCESSOR_K8, CPU_K8,
4964 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
4965 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
4966 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
4967 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
4968 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
4969 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
4970 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
4971 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
4972 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
4973 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4974 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4975 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4976 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
4977 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
4978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4979 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4980 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4981 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
4982 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
4983 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
4984 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4985 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4986 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
4987 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
4988 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
4989 | PTA_XSAVEOPT | PTA_FSGSBASE},
4990 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
4991 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
4992 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
4993 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
4994 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
4995 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
4996 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
4997 | PTA_MOVBE | PTA_MWAITX},
4998 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
4999 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5000 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5001 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5002 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5003 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5004 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5005 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5006 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5007 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5008 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5009 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
5010 | PTA_FXSR | PTA_XSAVE},
5011 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5012 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5013 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
5014 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5015 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5016 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5018 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5019 PTA_64BIT
5020 | PTA_HLE /* flags are only used for -march switch. */ },
5023 /* -mrecip options. */
5024 static struct
5026 const char *string; /* option name */
5027 unsigned int mask; /* mask bits to set */
5029 const recip_options[] =
5031 { "all", RECIP_MASK_ALL },
5032 { "none", RECIP_MASK_NONE },
5033 { "div", RECIP_MASK_DIV },
5034 { "sqrt", RECIP_MASK_SQRT },
5035 { "vec-div", RECIP_MASK_VEC_DIV },
5036 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5039 int const pta_size = ARRAY_SIZE (processor_alias_table);
5041 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5042 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5043 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5044 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5045 #ifdef TARGET_BI_ARCH
5046 else
5048 #if TARGET_BI_ARCH == 1
5049 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5050 is on and OPTION_MASK_ABI_X32 is off. We turn off
5051 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5052 -mx32. */
5053 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5054 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5055 #else
5056 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5057 on and OPTION_MASK_ABI_64 is off. We turn off
5058 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5059 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5060 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5061 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5062 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5063 #endif
5064 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5065 && TARGET_IAMCU_P (opts->x_target_flags))
5066 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5067 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5069 #endif
5071 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5073 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5074 OPTION_MASK_ABI_64 for TARGET_X32. */
5075 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5076 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5078 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5079 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5080 | OPTION_MASK_ABI_X32
5081 | OPTION_MASK_ABI_64);
5082 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5084 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5085 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5086 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5087 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5090 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5091 SUBTARGET_OVERRIDE_OPTIONS;
5092 #endif
5094 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5095 SUBSUBTARGET_OVERRIDE_OPTIONS;
5096 #endif
5098 /* -fPIC is the default for x86_64. */
5099 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5100 opts->x_flag_pic = 2;
5102 /* Need to check -mtune=generic first. */
5103 if (opts->x_ix86_tune_string)
5105 /* As special support for cross compilers we read -mtune=native
5106 as -mtune=generic. With native compilers we won't see the
5107 -mtune=native, as it was changed by the driver. */
5108 if (!strcmp (opts->x_ix86_tune_string, "native"))
5110 opts->x_ix86_tune_string = "generic";
5112 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5113 warning (OPT_Wdeprecated,
5114 main_args_p
5115 ? "%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5116 "or %<-mtune=generic%> instead as appropriate"
5117 : "%<target(\"tune=x86-64\")%> is deprecated; use "
5118 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%> "
5119 "instead as appropriate");
5121 else
5123 if (opts->x_ix86_arch_string)
5124 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5125 if (!opts->x_ix86_tune_string)
5127 opts->x_ix86_tune_string
5128 = processor_target_table[TARGET_CPU_DEFAULT].name;
5129 ix86_tune_defaulted = 1;
5132 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5133 or defaulted. We need to use a sensible tune option. */
5134 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5136 opts->x_ix86_tune_string = "generic";
5140 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5141 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5143 /* rep; movq isn't available in 32-bit code. */
5144 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5145 opts->x_ix86_stringop_alg = no_stringop;
5148 if (!opts->x_ix86_arch_string)
5149 opts->x_ix86_arch_string
5150 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5151 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5152 else
5153 ix86_arch_specified = 1;
5155 if (opts_set->x_ix86_pmode)
5157 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5158 && opts->x_ix86_pmode == PMODE_SI)
5159 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5160 && opts->x_ix86_pmode == PMODE_DI))
5161 error ("address mode %qs not supported in the %s bit mode",
5162 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5163 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5165 else
5166 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5167 ? PMODE_DI : PMODE_SI;
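  /* Example (illustrative only, not part of the build): with -m64 (an LP64
     target) the default above selects PMODE_DI, so Pmode is DImode and
     addresses are 64 bits wide; with -m32 or -mx32 it selects PMODE_SI.
     An explicit -maddress-mode=long/short takes the opts_set branch instead
     and overrides this default.  */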
5169 if (!opts_set->x_ix86_abi)
5170 opts->x_ix86_abi = DEFAULT_ABI;
5172 /* For targets using the MS ABI, enable ms-extensions if not
5173 explicitly turned off. For non-MS-ABI targets we turn this
5174 option off. */
5175 if (!opts_set->x_flag_ms_extensions)
5176 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5178 if (opts_set->x_ix86_cmodel)
5180 switch (opts->x_ix86_cmodel)
5182 case CM_SMALL:
5183 case CM_SMALL_PIC:
5184 if (opts->x_flag_pic)
5185 opts->x_ix86_cmodel = CM_SMALL_PIC;
5186 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5187 error ("code model %qs not supported in the %s bit mode",
5188 "small", "32");
5189 break;
5191 case CM_MEDIUM:
5192 case CM_MEDIUM_PIC:
5193 if (opts->x_flag_pic)
5194 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5195 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5196 error ("code model %qs not supported in the %s bit mode",
5197 "medium", "32");
5198 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5199 error ("code model %qs not supported in x32 mode",
5200 "medium");
5201 break;
5203 case CM_LARGE:
5204 case CM_LARGE_PIC:
5205 if (opts->x_flag_pic)
5206 opts->x_ix86_cmodel = CM_LARGE_PIC;
5207 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5208 error ("code model %qs not supported in the %s bit mode",
5209 "large", "32");
5210 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5211 error ("code model %qs not supported in x32 mode",
5212 "large");
5213 break;
5215 case CM_32:
5216 if (opts->x_flag_pic)
5217 error ("code model %s does not support PIC mode", "32");
5218 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5219 error ("code model %qs not supported in the %s bit mode",
5220 "32", "64");
5221 break;
5223 case CM_KERNEL:
5224 if (opts->x_flag_pic)
5226 error ("code model %s does not support PIC mode", "kernel");
5227 opts->x_ix86_cmodel = CM_32;
5229 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5230 error ("code model %qs not supported in the %s bit mode",
5231 "kernel", "32");
5232 break;
5234 default:
5235 gcc_unreachable ();
5238 else
5240 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5241 use of rip-relative addressing. This eliminates fixups that
5242 would otherwise be needed if this object is to be placed in a
5243 DLL, and is essentially just as efficient as direct addressing. */
5244 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5245 && (TARGET_RDOS || TARGET_PECOFF))
5246 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5247 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5248 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5249 else
5250 opts->x_ix86_cmodel = CM_32;
5252 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5254 error ("-masm=intel not supported in this configuration");
5255 opts->x_ix86_asm_dialect = ASM_ATT;
5257 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5258 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5259 sorry ("%i-bit mode not compiled in",
5260 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5262 for (i = 0; i < pta_size; i++)
5263 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5265 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5267 error (main_args_p
5268 ? "%<generic%> CPU can be used only for %<-mtune=%> switch"
5269 : "%<generic%> CPU can be used only for "
5270 "%<target(\"tune=\")%> attribute");
5271 return false;
5273 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5275 error (main_args_p
5276 ? "%<intel%> CPU can be used only for %<-mtune=%> switch"
5277 : "%<intel%> CPU can be used only for "
5278 "%<target(\"tune=\")%> attribute");
5279 return false;
5282 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5283 && !(processor_alias_table[i].flags & PTA_64BIT))
5285 error ("CPU you selected does not support x86-64 "
5286 "instruction set");
5287 return false;
5290 ix86_schedule = processor_alias_table[i].schedule;
5291 ix86_arch = processor_alias_table[i].processor;
5292 /* Default cpu tuning to the architecture. */
5293 ix86_tune = ix86_arch;
5295 if (processor_alias_table[i].flags & PTA_MMX
5296 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5297 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5298 if (processor_alias_table[i].flags & PTA_3DNOW
5299 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5300 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5301 if (processor_alias_table[i].flags & PTA_3DNOW_A
5302 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5303 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5304 if (processor_alias_table[i].flags & PTA_SSE
5305 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5306 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5307 if (processor_alias_table[i].flags & PTA_SSE2
5308 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5309 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5310 if (processor_alias_table[i].flags & PTA_SSE3
5311 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5312 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5313 if (processor_alias_table[i].flags & PTA_SSSE3
5314 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5315 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5316 if (processor_alias_table[i].flags & PTA_SSE4_1
5317 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5318 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5319 if (processor_alias_table[i].flags & PTA_SSE4_2
5320 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5321 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5322 if (processor_alias_table[i].flags & PTA_AVX
5323 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5324 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5325 if (processor_alias_table[i].flags & PTA_AVX2
5326 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5327 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5328 if (processor_alias_table[i].flags & PTA_FMA
5329 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5330 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5331 if (processor_alias_table[i].flags & PTA_SSE4A
5332 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5333 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5334 if (processor_alias_table[i].flags & PTA_FMA4
5335 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5336 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5337 if (processor_alias_table[i].flags & PTA_XOP
5338 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5339 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5340 if (processor_alias_table[i].flags & PTA_LWP
5341 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5342 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5343 if (processor_alias_table[i].flags & PTA_ABM
5344 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5345 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5346 if (processor_alias_table[i].flags & PTA_BMI
5347 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5348 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5349 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5350 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5351 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5352 if (processor_alias_table[i].flags & PTA_TBM
5353 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5354 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5355 if (processor_alias_table[i].flags & PTA_BMI2
5356 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5357 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5358 if (processor_alias_table[i].flags & PTA_CX16
5359 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5360 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5361 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5362 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5363 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5364 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5365 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5366 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5367 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
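  /* Example (illustrative only): -march=x86-64 carries PTA_NO_SAHF, so a
     64-bit compilation leaves the LAHF/SAHF ISA bit alone here, while a
     32-bit compilation with the same -march still enables it (SAHF always
     exists in 32-bit mode).  An explicit -msahf/-mno-sahf wins in either
     case because of the _explicit mask check above.  */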
5368 if (processor_alias_table[i].flags & PTA_MOVBE
5369 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5370 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5371 if (processor_alias_table[i].flags & PTA_AES
5372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5373 ix86_isa_flags |= OPTION_MASK_ISA_AES;
5374 if (processor_alias_table[i].flags & PTA_SHA
5375 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5376 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5377 if (processor_alias_table[i].flags & PTA_PCLMUL
5378 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5379 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5380 if (processor_alias_table[i].flags & PTA_FSGSBASE
5381 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5382 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5383 if (processor_alias_table[i].flags & PTA_RDRND
5384 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5385 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5386 if (processor_alias_table[i].flags & PTA_F16C
5387 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5388 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5389 if (processor_alias_table[i].flags & PTA_RTM
5390 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5391 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5392 if (processor_alias_table[i].flags & PTA_HLE
5393 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5394 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5395 if (processor_alias_table[i].flags & PTA_PRFCHW
5396 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5397 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5398 if (processor_alias_table[i].flags & PTA_RDSEED
5399 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5400 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5401 if (processor_alias_table[i].flags & PTA_ADX
5402 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5403 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5404 if (processor_alias_table[i].flags & PTA_FXSR
5405 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5406 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5407 if (processor_alias_table[i].flags & PTA_XSAVE
5408 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5409 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5410 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5411 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5412 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5413 if (processor_alias_table[i].flags & PTA_AVX512F
5414 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5415 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5416 if (processor_alias_table[i].flags & PTA_AVX512ER
5417 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5418 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5419 if (processor_alias_table[i].flags & PTA_AVX512PF
5420 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5421 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5422 if (processor_alias_table[i].flags & PTA_AVX512CD
5423 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5424 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5425 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5426 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5427 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5428 if (processor_alias_table[i].flags & PTA_CLWB
5429 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5430 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5431 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5432 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5433 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5434 if (processor_alias_table[i].flags & PTA_CLZERO
5435 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5436 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5437 if (processor_alias_table[i].flags & PTA_XSAVEC
5438 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5439 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5440 if (processor_alias_table[i].flags & PTA_XSAVES
5441 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5442 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5443 if (processor_alias_table[i].flags & PTA_AVX512DQ
5444 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5445 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5446 if (processor_alias_table[i].flags & PTA_AVX512BW
5447 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5448 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5449 if (processor_alias_table[i].flags & PTA_AVX512VL
5450 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5451 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5452 if (processor_alias_table[i].flags & PTA_MPX
5453 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5454 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5455 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5456 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5457 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5458 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5459 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5460 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5461 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5462 x86_prefetch_sse = true;
5463 if (processor_alias_table[i].flags & PTA_MWAITX
5464 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5465 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5466 if (processor_alias_table[i].flags & PTA_PKU
5467 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5468 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
5470 /* Don't enable x87 instructions if only
5471 general registers are allowed. */
5472 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
5473 && !(opts_set->x_target_flags & MASK_80387))
5475 if (processor_alias_table[i].flags & PTA_NO_80387)
5476 opts->x_target_flags &= ~MASK_80387;
5477 else
5478 opts->x_target_flags |= MASK_80387;
5480 break;
5483 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
5484 error ("Intel MPX does not support x32");
5489 if (i == pta_size)
5491 error (main_args_p
5492 ? "bad value (%qs) for %<-march=%> switch"
5493 : "bad value (%qs) for %<target(\"arch=\")%> attribute",
5494 opts->x_ix86_arch_string);
5496 auto_vec <const char *> candidates;
5497 for (i = 0; i < pta_size; i++)
5498 if (strcmp (processor_alias_table[i].name, "generic")
5499 && strcmp (processor_alias_table[i].name, "intel")
5500 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5501 || (processor_alias_table[i].flags & PTA_64BIT)))
5502 candidates.safe_push (processor_alias_table[i].name);
5504 char *s;
5505 const char *hint
5506 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
5507 if (hint)
5508 inform (input_location,
5509 main_args_p
5510 ? "valid arguments to %<-march=%> switch are: "
5511 "%s; did you mean %qs?"
5512 : "valid arguments to %<target(\"arch=\")%> attribute are: "
5513 "%s; did you mean %qs?", s, hint);
5514 else
5515 inform (input_location,
5516 main_args_p
5517 ? "valid arguments to %<-march=%> switch are: %s"
5518 : "valid arguments to %<target(\"arch=\")%> attribute are: %s",
5520 XDELETEVEC (s);
5523 ix86_arch_mask = 1u << ix86_arch;
5524 for (i = 0; i < X86_ARCH_LAST; ++i)
5525 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
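  /* Example (illustrative only): ix86_arch_mask has a single bit set, at
     position ix86_arch, so an arch feature ends up enabled exactly when the
     selected -march processor's bit appears in that feature's entry in
     initial_ix86_arch_features; features whose bitmaps lack that bit stay
     off regardless of -mtune.  */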
5527 for (i = 0; i < pta_size; i++)
5528 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
5530 ix86_schedule = processor_alias_table[i].schedule;
5531 ix86_tune = processor_alias_table[i].processor;
5532 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5534 if (!(processor_alias_table[i].flags & PTA_64BIT))
5536 if (ix86_tune_defaulted)
5538 opts->x_ix86_tune_string = "x86-64";
5539 for (i = 0; i < pta_size; i++)
5540 if (! strcmp (opts->x_ix86_tune_string,
5541 processor_alias_table[i].name))
5542 break;
5543 ix86_schedule = processor_alias_table[i].schedule;
5544 ix86_tune = processor_alias_table[i].processor;
5546 else
5547 error ("CPU you selected does not support x86-64 "
5548 "instruction set");
5551 /* Intel CPUs have always interpreted SSE prefetch instructions as
5552 NOPs; so, we can enable SSE prefetch instructions even when
5553 -mtune (rather than -march) points us to a processor that has them.
5554 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
5555 higher processors. */
5556 if (TARGET_CMOV
5557 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
5558 x86_prefetch_sse = true;
5559 break;
5562 if (ix86_tune_specified && i == pta_size)
5564 error (main_args_p
5565 ? "bad value (%qs) for %<-mtune=%> switch"
5566 : "bad value (%qs) for %<target(\"tune=\")%> attribute",
5567 opts->x_ix86_tune_string);
5569 auto_vec <const char *> candidates;
5570 for (i = 0; i < pta_size; i++)
5571 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5572 || (processor_alias_table[i].flags & PTA_64BIT))
5573 candidates.safe_push (processor_alias_table[i].name);
5575 char *s;
5576 const char *hint
5577 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
5578 if (hint)
5579 inform (input_location,
5580 main_args_p
5581 ? "valid arguments to %<-mtune=%> switch are: "
5582 "%s; did you mean %qs?"
5583 : "valid arguments to %<target(\"tune=\")%> attribute are: "
5584 "%s; did you mean %qs?", s, hint);
5585 else
5586 inform (input_location,
5587 main_args_p
5588 ? "valid arguments to %<-mtune=%> switch are: %s"
5589 : "valid arguments to %<target(\"tune=\")%> attribute are: %s",
5591 XDELETEVEC (s);
5594 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
5596 #ifndef USE_IX86_FRAME_POINTER
5597 #define USE_IX86_FRAME_POINTER 0
5598 #endif
5600 #ifndef USE_X86_64_FRAME_POINTER
5601 #define USE_X86_64_FRAME_POINTER 0
5602 #endif
5604 /* Set the default values for switches whose default depends on TARGET_64BIT
5605 in case they weren't overwritten by command line options. */
5606 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5608 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5609 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
5610 if (opts->x_flag_asynchronous_unwind_tables
5611 && !opts_set->x_flag_unwind_tables
5612 && TARGET_64BIT_MS_ABI)
5613 opts->x_flag_unwind_tables = 1;
5614 if (opts->x_flag_asynchronous_unwind_tables == 2)
5615 opts->x_flag_unwind_tables
5616 = opts->x_flag_asynchronous_unwind_tables = 1;
5617 if (opts->x_flag_pcc_struct_return == 2)
5618 opts->x_flag_pcc_struct_return = 0;
5620 else
5622 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
5623 opts->x_flag_omit_frame_pointer
5624 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
5625 if (opts->x_flag_asynchronous_unwind_tables == 2)
5626 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
5627 if (opts->x_flag_pcc_struct_return == 2)
5629 /* Intel MCU psABI specifies that -freg-struct-return should
5630 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
5631 we check -miamcu so that -freg-struct-return is always
5632 turned on if -miamcu is used. */
5633 if (TARGET_IAMCU_P (opts->x_target_flags))
5634 opts->x_flag_pcc_struct_return = 0;
5635 else
5636 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
5640 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5641 /* TODO: ix86_cost should be chosen at instruction or function granularity
5642 so for cold code we use size_cost even in !optimize_size compilation. */
5643 if (opts->x_optimize_size)
5644 ix86_cost = &ix86_size_cost;
5645 else
5646 ix86_cost = ix86_tune_cost;
5648 /* Arrange to set up i386_stack_locals for all functions. */
5649 init_machine_status = ix86_init_machine_status;
5651 /* Validate -mregparm= value. */
5652 if (opts_set->x_ix86_regparm)
5654 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5655 warning (0, "-mregparm is ignored in 64-bit mode");
5656 else if (TARGET_IAMCU_P (opts->x_target_flags))
5657 warning (0, "-mregparm is ignored for Intel MCU psABI");
5658 if (opts->x_ix86_regparm > REGPARM_MAX)
5660 error ("-mregparm=%d is not between 0 and %d",
5661 opts->x_ix86_regparm, REGPARM_MAX);
5662 opts->x_ix86_regparm = 0;
5665 if (TARGET_IAMCU_P (opts->x_target_flags)
5666 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
5667 opts->x_ix86_regparm = REGPARM_MAX;
5669 /* Default align_* from the processor table. */
5670 ix86_default_align (opts);
5672 /* Provide default for -mbranch-cost= value. */
5673 if (!opts_set->x_ix86_branch_cost)
5674 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
5676 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5678 opts->x_target_flags
5679 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
5681 /* Enable by default the SSE and MMX builtins. Do allow the user to
5682 explicitly disable any of these. In particular, disabling SSE and
5683 MMX for kernel code is extremely useful. */
5684 if (!ix86_arch_specified)
5685 opts->x_ix86_isa_flags
5686 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
5687 | TARGET_SUBTARGET64_ISA_DEFAULT)
5688 & ~opts->x_ix86_isa_flags_explicit);
5690 if (TARGET_RTD_P (opts->x_target_flags))
5691 warning (0,
5692 main_args_p ? "%<-mrtd%> is ignored in 64bit mode"
5693 : "%<target(\"rtd\")%> is ignored in 64bit mode");
5695 else
5697 opts->x_target_flags
5698 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
5700 if (!ix86_arch_specified)
5701 opts->x_ix86_isa_flags
5702 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
5704 /* The i386 ABI does not specify a red zone. It still makes sense to use one
5705 when the programmer takes care to keep the stack from being destroyed. */
5706 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
5707 opts->x_target_flags |= MASK_NO_RED_ZONE;
5710 /* Keep nonleaf frame pointers. */
5711 if (opts->x_flag_omit_frame_pointer)
5712 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
5713 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
5714 opts->x_flag_omit_frame_pointer = 1;
5716 /* If we're doing fast math, we don't care about comparison order
5717 wrt NaNs. This lets us use a shorter comparison sequence. */
5718 if (opts->x_flag_finite_math_only)
5719 opts->x_target_flags &= ~MASK_IEEE_FP;
5721 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
5722 since the insns won't need emulation. */
5723 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
5724 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
5726 /* Likewise, if the target doesn't have a 387, or we've specified
5727 software floating point, don't use 387 inline intrinsics. */
5728 if (!TARGET_80387_P (opts->x_target_flags))
5729 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
5731 /* Turn on MMX builtins for -msse. */
5732 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
5733 opts->x_ix86_isa_flags
5734 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
5736 /* Enable SSE prefetch. */
5737 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
5738 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
5739 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
5740 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
5741 x86_prefetch_sse = true;
5743 /* Enable popcnt instruction for -msse4.2 or -mabm. */
5744 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
5745 || TARGET_ABM_P (opts->x_ix86_isa_flags))
5746 opts->x_ix86_isa_flags
5747 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
5749 /* Enable lzcnt instruction for -mabm. */
5750 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
5751 opts->x_ix86_isa_flags
5752 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
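  /* Example (illustrative only): "-mabm -mno-popcnt" keeps POPCNT disabled
     even though ABM would normally imply it, because OPTION_MASK_ISA_POPCNT
     is already present in x_ix86_isa_flags_explicit and is therefore masked
     out of the implied bits above; the same pattern governs the implied
     LZCNT bit.  */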
5754 /* Validate -mpreferred-stack-boundary= value or default it to
5755 PREFERRED_STACK_BOUNDARY_DEFAULT. */
5756 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
5757 if (opts_set->x_ix86_preferred_stack_boundary_arg)
5759 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5760 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
5761 int max = (TARGET_SEH ? 4 : 12);
5763 if (opts->x_ix86_preferred_stack_boundary_arg < min
5764 || opts->x_ix86_preferred_stack_boundary_arg > max)
5766 if (min == max)
5767 error ("-mpreferred-stack-boundary is not supported "
5768 "for this target");
5769 else
5770 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
5771 opts->x_ix86_preferred_stack_boundary_arg, min, max);
5773 else
5774 ix86_preferred_stack_boundary
5775 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
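  /* Example (illustrative only, assuming BITS_PER_UNIT == 8 as on x86):
     -mpreferred-stack-boundary=4 yields (1 << 4) * 8 == 128 bits, i.e. a
     16-byte-aligned stack; the minimum of 2 corresponds to 4-byte alignment
     and the maximum of 12 to 4096 bytes.  */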
5778 /* Set the default value for -mstackrealign. */
5779 if (opts->x_ix86_force_align_arg_pointer == -1)
5780 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
5782 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
5784 /* Validate -mincoming-stack-boundary= value or default it to
5785 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
5786 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
5787 if (opts_set->x_ix86_incoming_stack_boundary_arg)
5789 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
5791 if (opts->x_ix86_incoming_stack_boundary_arg < min
5792 || opts->x_ix86_incoming_stack_boundary_arg > 12)
5793 error ("-mincoming-stack-boundary=%d is not between %d and 12",
5794 opts->x_ix86_incoming_stack_boundary_arg, min);
5795 else
5797 ix86_user_incoming_stack_boundary
5798 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
5799 ix86_incoming_stack_boundary
5800 = ix86_user_incoming_stack_boundary;
5804 #ifndef NO_PROFILE_COUNTERS
5805 if (flag_nop_mcount)
5806 error ("-mnop-mcount is not compatible with this target");
5807 #endif
5808 if (flag_nop_mcount && flag_pic)
5809 error ("-mnop-mcount is not implemented for -fPIC");
5811 /* Accept -msseregparm only if at least SSE support is enabled. */
5812 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
5813 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
5814 error (main_args_p
5815 ? "%<-msseregparm%> used without SSE enabled"
5816 : "%<target(\"sseregparm\")%> used without SSE enabled");
5818 if (opts_set->x_ix86_fpmath)
5820 if (opts->x_ix86_fpmath & FPMATH_SSE)
5822 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
5824 if (TARGET_80387_P (opts->x_target_flags))
5826 warning (0, "SSE instruction set disabled, using 387 arithmetics");
5827 opts->x_ix86_fpmath = FPMATH_387;
5830 else if ((opts->x_ix86_fpmath & FPMATH_387)
5831 && !TARGET_80387_P (opts->x_target_flags))
5833 warning (0, "387 instruction set disabled, using SSE arithmetics");
5834 opts->x_ix86_fpmath = FPMATH_SSE;
5838 /* For all chips supporting SSE2, -mfpmath=sse performs better than
5839 -mfpmath=387. The latter is however the default on many targets, since
5840 the extra 80-bit precision of temporaries is considered part of the ABI.
5841 Overwrite the default at least for -ffast-math.
5842 TODO: -mfpmath=both seems to produce similarly performing code with
5843 slightly smaller binaries. It is however not clear if register
5844 allocation is ready for this setting.
5845 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
5846 codegen. We may switch to 387 with -ffast-math for size-optimized
5847 functions. */
5848 else if (fast_math_flags_set_p (&global_options)
5849 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
5850 opts->x_ix86_fpmath = FPMATH_SSE;
5851 else
5852 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
5854 /* Use external vectorized library in vectorizing intrinsics. */
5855 if (opts_set->x_ix86_veclibabi_type)
5856 switch (opts->x_ix86_veclibabi_type)
5858 case ix86_veclibabi_type_svml:
5859 ix86_veclib_handler = ix86_veclibabi_svml;
5860 break;
5862 case ix86_veclibabi_type_acml:
5863 ix86_veclib_handler = ix86_veclibabi_acml;
5864 break;
5866 default:
5867 gcc_unreachable ();
5870 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
5871 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5872 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5874 /* If stack probes are required, the space used for large function
5875 arguments on the stack must also be probed, so enable
5876 -maccumulate-outgoing-args so this happens in the prologue. */
5877 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
5878 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5880 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5881 warning (0,
5882 main_args_p
5883 ? "stack probing requires %<-maccumulate-outgoing-args%> "
5884 "for correctness"
5885 : "stack probing requires "
5886 "%<target(\"accumulate-outgoing-args\")%> for correctness");
5887 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5890 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
5891 so enable -maccumulate-outgoing-args when %ebp is fixed. */
5892 if (fixed_regs[BP_REG]
5893 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
5895 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
5896 warning (0,
5897 main_args_p
5898 ? "fixed ebp register requires %<-maccumulate-outgoing-args%>"
5899 : "fixed ebp register requires "
5900 "%<target(\"accumulate-outgoing-args\")%>");
5901 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
5904 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
5906 char *p;
5907 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
5908 p = strchr (internal_label_prefix, 'X');
5909 internal_label_prefix_len = p - internal_label_prefix;
5910 *p = '\0';
5913 /* When the scheduling description is not available, disable the scheduler
5914 pass so it won't slow down compilation and make x87 code slower. */
5915 if (!TARGET_SCHEDULE)
5916 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
5918 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
5919 ix86_tune_cost->simultaneous_prefetches,
5920 opts->x_param_values,
5921 opts_set->x_param_values);
5922 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
5923 ix86_tune_cost->prefetch_block,
5924 opts->x_param_values,
5925 opts_set->x_param_values);
5926 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
5927 ix86_tune_cost->l1_cache_size,
5928 opts->x_param_values,
5929 opts_set->x_param_values);
5930 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
5931 ix86_tune_cost->l2_cache_size,
5932 opts->x_param_values,
5933 opts_set->x_param_values);
5935 /* Restrict number of if-converted SET insns to 1. */
5936 if (TARGET_ONE_IF_CONV_INSN)
5937 maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS,
5938 1,
5939 opts->x_param_values,
5940 opts_set->x_param_values);
5942 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
5943 if (opts->x_flag_prefetch_loop_arrays < 0
5944 && HAVE_prefetch
5945 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
5946 && !opts->x_optimize_size
5947 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
5948 opts->x_flag_prefetch_loop_arrays = 1;
5950 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
5951 can be optimized to ap = __builtin_next_arg (0). */
5952 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
5953 targetm.expand_builtin_va_start = NULL;
5955 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5957 ix86_gen_leave = gen_leave_rex64;
5958 if (Pmode == DImode)
5960 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
5961 ix86_gen_tls_local_dynamic_base_64
5962 = gen_tls_local_dynamic_base_64_di;
5964 else
5966 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
5967 ix86_gen_tls_local_dynamic_base_64
5968 = gen_tls_local_dynamic_base_64_si;
5971 else
5972 ix86_gen_leave = gen_leave;
5974 if (Pmode == DImode)
5976 ix86_gen_add3 = gen_adddi3;
5977 ix86_gen_sub3 = gen_subdi3;
5978 ix86_gen_sub3_carry = gen_subdi3_carry;
5979 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
5980 ix86_gen_andsp = gen_anddi3;
5981 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
5982 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
5983 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
5984 ix86_gen_monitor = gen_sse3_monitor_di;
5985 ix86_gen_monitorx = gen_monitorx_di;
5986 ix86_gen_clzero = gen_clzero_di;
5988 else
5990 ix86_gen_add3 = gen_addsi3;
5991 ix86_gen_sub3 = gen_subsi3;
5992 ix86_gen_sub3_carry = gen_subsi3_carry;
5993 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
5994 ix86_gen_andsp = gen_andsi3;
5995 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
5996 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
5997 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
5998 ix86_gen_monitor = gen_sse3_monitor_si;
5999 ix86_gen_monitorx = gen_monitorx_si;
6000 ix86_gen_clzero = gen_clzero_si;
6003 #ifdef USE_IX86_CLD
6004 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6005 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6006 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6007 #endif
6009 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6011 if (opts->x_flag_fentry > 0)
6012 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6013 "with -fpic");
6014 opts->x_flag_fentry = 0;
6016 else if (TARGET_SEH)
6018 if (opts->x_flag_fentry == 0)
6019 sorry ("-mno-fentry isn%'t compatible with SEH");
6020 opts->x_flag_fentry = 1;
6022 else if (opts->x_flag_fentry < 0)
6024 #if defined(PROFILE_BEFORE_PROLOGUE)
6025 opts->x_flag_fentry = 1;
6026 #else
6027 opts->x_flag_fentry = 0;
6028 #endif
6031 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6032 opts->x_target_flags |= MASK_VZEROUPPER;
6033 if (!(opts_set->x_target_flags & MASK_STV))
6034 opts->x_target_flags |= MASK_STV;
6035 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6036 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
6037 stack realignment will be an extra cost the pass doesn't take into
6038 account, and the pass can't realign the stack. */
6039 if (ix86_preferred_stack_boundary < 128
6040 || ix86_incoming_stack_boundary < 128
6041 || opts->x_ix86_force_align_arg_pointer)
6042 opts->x_target_flags &= ~MASK_STV;
6043 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6044 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6045 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6046 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6047 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6048 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6049 /* Enable 128-bit AVX instruction generation
6050 for the auto-vectorizer. */
6051 if (TARGET_AVX128_OPTIMAL
6052 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6053 opts->x_target_flags |= MASK_PREFER_AVX128;
6055 if (opts->x_ix86_recip_name)
6057 char *p = ASTRDUP (opts->x_ix86_recip_name);
6058 char *q;
6059 unsigned int mask, i;
6060 bool invert;
6062 while ((q = strtok (p, ",")) != NULL)
6064 p = NULL;
6065 if (*q == '!')
6067 invert = true;
6068 q++;
6070 else
6071 invert = false;
6073 if (!strcmp (q, "default"))
6074 mask = RECIP_MASK_ALL;
6075 else
6077 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6078 if (!strcmp (q, recip_options[i].string))
6080 mask = recip_options[i].mask;
6081 break;
6084 if (i == ARRAY_SIZE (recip_options))
6086 error ("unknown option for -mrecip=%s", q);
6087 invert = false;
6088 mask = RECIP_MASK_NONE;
6092 opts->x_recip_mask_explicit |= mask;
6093 if (invert)
6094 opts->x_recip_mask &= ~mask;
6095 else
6096 opts->x_recip_mask |= mask;
6100 if (TARGET_RECIP_P (opts->x_target_flags))
6101 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6102 else if (opts_set->x_target_flags & MASK_RECIP)
6103 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
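  /* Example (illustrative only): -mrecip=all,!sqrt first ORs in
     RECIP_MASK_ALL and then, because of the leading '!', clears
     RECIP_MASK_SQRT again, so every reciprocal approximation except the
     scalar square root stays enabled.  A bare -mrecip behaves like
     -mrecip=all for any bits not already given explicitly.  */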
6105 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6106 for 64-bit Bionic. Also default long double to 64-bit for Intel
6107 MCU psABI. */
6108 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6109 && !(opts_set->x_target_flags
6110 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6111 opts->x_target_flags |= (TARGET_64BIT
6112 ? MASK_LONG_DOUBLE_128
6113 : MASK_LONG_DOUBLE_64);
6115 /* Only one of them can be active. */
6116 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6117 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6119 /* Save the initial options in case the user does function specific
6120 options. */
6121 if (main_args_p)
6122 target_option_default_node = target_option_current_node
6123 = build_target_option_node (opts);
6125 /* Handle stack protector */
6126 if (!opts_set->x_ix86_stack_protector_guard)
6127 opts->x_ix86_stack_protector_guard
6128 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6130 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6131 if (opts->x_ix86_tune_memcpy_strategy)
6133 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6134 ix86_parse_stringop_strategy_string (str, false);
6135 free (str);
6138 if (opts->x_ix86_tune_memset_strategy)
6140 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6141 ix86_parse_stringop_strategy_string (str, true);
6142 free (str);
6145 return true;
6148 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6150 static void
6151 ix86_option_override (void)
6153 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
6154 struct register_pass_info insert_vzeroupper_info
6155 = { pass_insert_vzeroupper, "reload",
6156 1, PASS_POS_INSERT_AFTER
6158 opt_pass *pass_stv = make_pass_stv (g);
6159 struct register_pass_info stv_info_dimode
6160 = { pass_stv, "combine",
6161 1, PASS_POS_INSERT_AFTER
6163 /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
6164 CONSTM1_RTX generated by the STV pass can be CSEed. */
6165 struct register_pass_info stv_info_timode
6166 = { pass_stv, "cse2",
6167 1, PASS_POS_INSERT_BEFORE
6170 ix86_option_override_internal (true, &global_options, &global_options_set);
6173 /* This needs to be done at start up. It's convenient to do it here. */
6174 register_pass (&insert_vzeroupper_info);
6175 register_pass (TARGET_64BIT ? &stv_info_timode : &stv_info_dimode);
6178 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6179 static char *
6180 ix86_offload_options (void)
6182 if (TARGET_LP64)
6183 return xstrdup ("-foffload-abi=lp64");
6184 return xstrdup ("-foffload-abi=ilp32");
6187 /* Update register usage after having seen the compiler flags. */
6189 static void
6190 ix86_conditional_register_usage (void)
6192 int i, c_mask;
6194 /* If there are no caller-saved registers, preserve all registers
6195 except fixed_regs and the registers used for the function return
6196 value, since aggregate_value_p checks call_used_regs[regno] on the
6197 return value. */
6198 if (cfun && cfun->machine->no_caller_saved_registers)
6199 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6200 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6201 call_used_regs[i] = 0;
6203 /* For 32-bit targets, squash the REX registers. */
6204 if (! TARGET_64BIT)
6206 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6207 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6208 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6209 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6210 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6211 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6214 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6215 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6217 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6219 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6221 /* Set/reset conditionally defined registers from
6222 CALL_USED_REGISTERS initializer. */
6223 if (call_used_regs[i] > 1)
6224 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6226 /* Calculate registers of CLOBBERED_REGS register set
6227 as call used registers from GENERAL_REGS register set. */
6228 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6229 && call_used_regs[i])
6230 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
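  /* Example (illustrative only): an entry in CALL_USED_REGISTERS that is
     greater than 1 encodes a bit mask of the ABIs in which the register is
     call-used; ANDing it with c_mask (SysV vs. MS, per
     CALL_USED_REGISTERS_MASK) collapses the entry to a plain 0/1 value for
     the ABI being compiled for.  */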
6233 /* If MMX is disabled, squash the registers. */
6234 if (! TARGET_MMX)
6235 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6236 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6237 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6239 /* If SSE is disabled, squash the registers. */
6240 if (! TARGET_SSE)
6241 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6242 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6243 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6245 /* If the FPU is disabled, squash the registers. */
6246 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6247 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6248 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6249 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6251 /* If AVX512F is disabled, squash the registers. */
6252 if (! TARGET_AVX512F)
6254 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6255 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6257 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6258 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6261 /* If MPX is disabled, squash the registers. */
6262 if (! TARGET_MPX)
6263 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6264 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6268 /* Save the current options */
6270 static void
6271 ix86_function_specific_save (struct cl_target_option *ptr,
6272 struct gcc_options *opts)
6274 ptr->arch = ix86_arch;
6275 ptr->schedule = ix86_schedule;
6276 ptr->prefetch_sse = x86_prefetch_sse;
6277 ptr->tune = ix86_tune;
6278 ptr->branch_cost = ix86_branch_cost;
6279 ptr->tune_defaulted = ix86_tune_defaulted;
6280 ptr->arch_specified = ix86_arch_specified;
6281 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6282 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6283 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6284 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6285 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6286 ptr->x_ix86_abi = opts->x_ix86_abi;
6287 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6288 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6289 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6290 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6291 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6292 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6293 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6294 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6295 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6296 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6297 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6298 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6299 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6300 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6301 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6302 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6303 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6304 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6305 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6306 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6308 /* The fields are char but the variables are not; make sure the
6309 values fit in the fields. */
6310 gcc_assert (ptr->arch == ix86_arch);
6311 gcc_assert (ptr->schedule == ix86_schedule);
6312 gcc_assert (ptr->tune == ix86_tune);
6313 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6316 /* Restore the current options */
6318 static void
6319 ix86_function_specific_restore (struct gcc_options *opts,
6320 struct cl_target_option *ptr)
6322 enum processor_type old_tune = ix86_tune;
6323 enum processor_type old_arch = ix86_arch;
6324 unsigned int ix86_arch_mask;
6325 int i;
6327 /* We don't change -fPIC. */
6328 opts->x_flag_pic = flag_pic;
6330 ix86_arch = (enum processor_type) ptr->arch;
6331 ix86_schedule = (enum attr_cpu) ptr->schedule;
6332 ix86_tune = (enum processor_type) ptr->tune;
6333 x86_prefetch_sse = ptr->prefetch_sse;
6334 opts->x_ix86_branch_cost = ptr->branch_cost;
6335 ix86_tune_defaulted = ptr->tune_defaulted;
6336 ix86_arch_specified = ptr->arch_specified;
6337 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6338 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6339 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6340 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6341 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6342 opts->x_ix86_abi = ptr->x_ix86_abi;
6343 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6344 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6345 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6346 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6347 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6348 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6349 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6350 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6351 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6352 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6353 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6354 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6355 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6356 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6357 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6358 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6359 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6360 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6361 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6362 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6363 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6364 /* TODO: ix86_cost should be chosen at instruction or function granularity
6365 so for cold code we use size_cost even in !optimize_size compilation. */
6366 if (opts->x_optimize_size)
6367 ix86_cost = &ix86_size_cost;
6368 else
6369 ix86_cost = ix86_tune_cost;
6371 /* Recreate the arch feature tests if the arch changed */
6372 if (old_arch != ix86_arch)
6374 ix86_arch_mask = 1u << ix86_arch;
6375 for (i = 0; i < X86_ARCH_LAST; ++i)
6376 ix86_arch_features[i]
6377 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6380 /* Recreate the tune optimization tests */
6381 if (old_tune != ix86_tune)
6382 set_ix86_tune_features (ix86_tune, false);
6385 /* Adjust target options after streaming them in. This is mainly about
6386 reconciling them with global options. */
6388 static void
6389 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6391 /* flag_pic is a global option, but ix86_cmodel is target saved option
6392 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6393 for PIC, or error out. */
6394 if (flag_pic)
6395 switch (ptr->x_ix86_cmodel)
6397 case CM_SMALL:
6398 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6399 break;
6401 case CM_MEDIUM:
6402 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6403 break;
6405 case CM_LARGE:
6406 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6407 break;
6409 case CM_KERNEL:
6410 error ("code model %s does not support PIC mode", "kernel");
6411 break;
6413 default:
6414 break;
6416 else
6417 switch (ptr->x_ix86_cmodel)
6419 case CM_SMALL_PIC:
6420 ptr->x_ix86_cmodel = CM_SMALL;
6421 break;
6423 case CM_MEDIUM_PIC:
6424 ptr->x_ix86_cmodel = CM_MEDIUM;
6425 break;
6427 case CM_LARGE_PIC:
6428 ptr->x_ix86_cmodel = CM_LARGE;
6429 break;
6431 default:
6432 break;
6436 /* Print the current options */
6438 static void
6439 ix86_function_specific_print (FILE *file, int indent,
6440 struct cl_target_option *ptr)
6442 char *target_string
6443 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
6444 ptr->x_ix86_target_flags, NULL, NULL,
6445 ptr->x_ix86_fpmath, false);
6447 gcc_assert (ptr->arch < PROCESSOR_max);
6448 fprintf (file, "%*sarch = %d (%s)\n",
6449 indent, "",
6450 ptr->arch, processor_target_table[ptr->arch].name);
6452 gcc_assert (ptr->tune < PROCESSOR_max);
6453 fprintf (file, "%*stune = %d (%s)\n",
6454 indent, "",
6455 ptr->tune, processor_target_table[ptr->tune].name);
6457 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6459 if (target_string)
6461 fprintf (file, "%*s%s\n", indent, "", target_string);
6462 free (target_string);
6467 /* Inner function to process the attribute((target(...))), take an argument and
6468 set the current options from the argument. If we have a list, recursively go
6469 over the list. */
6471 static bool
6472 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6473 struct gcc_options *opts,
6474 struct gcc_options *opts_set,
6475 struct gcc_options *enum_opts_set)
6477 char *next_optstr;
6478 bool ret = true;
6480 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6481 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6482 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6483 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6484 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
6486 enum ix86_opt_type
6488 ix86_opt_unknown,
6489 ix86_opt_yes,
6490 ix86_opt_no,
6491 ix86_opt_str,
6492 ix86_opt_enum,
6493 ix86_opt_isa
6496 static const struct
6498 const char *string;
6499 size_t len;
6500 enum ix86_opt_type type;
6501 int opt;
6502 int mask;
6503 } attrs[] = {
6504 /* isa options */
6505 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
6506 IX86_ATTR_ISA ("abm", OPT_mabm),
6507 IX86_ATTR_ISA ("bmi", OPT_mbmi),
6508 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
6509 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
6510 IX86_ATTR_ISA ("tbm", OPT_mtbm),
6511 IX86_ATTR_ISA ("aes", OPT_maes),
6512 IX86_ATTR_ISA ("sha", OPT_msha),
6513 IX86_ATTR_ISA ("avx", OPT_mavx),
6514 IX86_ATTR_ISA ("avx2", OPT_mavx2),
6515 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
6516 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
6517 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
6518 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
6519 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
6520 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
6521 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
6522 IX86_ATTR_ISA ("mmx", OPT_mmmx),
6523 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
6524 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
6525 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
6526 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
6527 IX86_ATTR_ISA ("sse", OPT_msse),
6528 IX86_ATTR_ISA ("sse2", OPT_msse2),
6529 IX86_ATTR_ISA ("sse3", OPT_msse3),
6530 IX86_ATTR_ISA ("sse4", OPT_msse4),
6531 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
6532 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
6533 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
6534 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
6535 IX86_ATTR_ISA ("fma4", OPT_mfma4),
6536 IX86_ATTR_ISA ("fma", OPT_mfma),
6537 IX86_ATTR_ISA ("xop", OPT_mxop),
6538 IX86_ATTR_ISA ("lwp", OPT_mlwp),
6539 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
6540 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
6541 IX86_ATTR_ISA ("f16c", OPT_mf16c),
6542 IX86_ATTR_ISA ("rtm", OPT_mrtm),
6543 IX86_ATTR_ISA ("hle", OPT_mhle),
6544 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
6545 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
6546 IX86_ATTR_ISA ("adx", OPT_madx),
6547 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
6548 IX86_ATTR_ISA ("xsave", OPT_mxsave),
6549 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
6550 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
6551 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
6552 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
6553 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
6554 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
6555 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
6556 IX86_ATTR_ISA ("clwb", OPT_mclwb),
6557 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
6558 IX86_ATTR_ISA ("clzero", OPT_mclzero),
6559 IX86_ATTR_ISA ("pku", OPT_mpku),
6561 /* enum options */
6562 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
6564 /* string options */
6565 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
6566 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
6568 /* flag options */
6569 IX86_ATTR_YES ("cld",
6570 OPT_mcld,
6571 MASK_CLD),
6573 IX86_ATTR_NO ("fancy-math-387",
6574 OPT_mfancy_math_387,
6575 MASK_NO_FANCY_MATH_387),
6577 IX86_ATTR_YES ("ieee-fp",
6578 OPT_mieee_fp,
6579 MASK_IEEE_FP),
6581 IX86_ATTR_YES ("inline-all-stringops",
6582 OPT_minline_all_stringops,
6583 MASK_INLINE_ALL_STRINGOPS),
6585 IX86_ATTR_YES ("inline-stringops-dynamically",
6586 OPT_minline_stringops_dynamically,
6587 MASK_INLINE_STRINGOPS_DYNAMICALLY),
6589 IX86_ATTR_NO ("align-stringops",
6590 OPT_mno_align_stringops,
6591 MASK_NO_ALIGN_STRINGOPS),
6593 IX86_ATTR_YES ("recip",
6594 OPT_mrecip,
6595 MASK_RECIP),
6599 /* If this is a list, recurse to get the options. */
6600 if (TREE_CODE (args) == TREE_LIST)
6602 bool ret = true;
6604 for (; args; args = TREE_CHAIN (args))
6605 if (TREE_VALUE (args)
6606 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
6607 p_strings, opts, opts_set,
6608 enum_opts_set))
6609 ret = false;
6611 return ret;
6614 else if (TREE_CODE (args) != STRING_CST)
6616 error ("attribute %<target%> argument not a string");
6617 return false;
6620 /* Handle multiple arguments separated by commas. */
6621 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
6623 while (next_optstr && *next_optstr != '\0')
6625 char *p = next_optstr;
6626 char *orig_p = p;
6627 char *comma = strchr (next_optstr, ',');
6628 const char *opt_string;
6629 size_t len, opt_len;
6630 int opt;
6631 bool opt_set_p;
6632 char ch;
6633 unsigned i;
6634 enum ix86_opt_type type = ix86_opt_unknown;
6635 int mask = 0;
6637 if (comma)
6639 *comma = '\0';
6640 len = comma - next_optstr;
6641 next_optstr = comma + 1;
6643 else
6645 len = strlen (p);
6646 next_optstr = NULL;
6649 /* Recognize no-xxx. */
6650 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
6652 opt_set_p = false;
6653 p += 3;
6654 len -= 3;
6656 else
6657 opt_set_p = true;
6659 /* Find the option. */
6660 ch = *p;
6661 opt = N_OPTS;
6662 for (i = 0; i < ARRAY_SIZE (attrs); i++)
6664 type = attrs[i].type;
6665 opt_len = attrs[i].len;
6666 if (ch == attrs[i].string[0]
6667 && ((type != ix86_opt_str && type != ix86_opt_enum)
6668 ? len == opt_len
6669 : len > opt_len)
6670 && memcmp (p, attrs[i].string, opt_len) == 0)
6672 opt = attrs[i].opt;
6673 mask = attrs[i].mask;
6674 opt_string = attrs[i].string;
6675 break;
6679 /* Process the option. */
6680 if (opt == N_OPTS)
6682 error ("attribute(target(\"%s\")) is unknown", orig_p);
6683 ret = false;
6686 else if (type == ix86_opt_isa)
6688 struct cl_decoded_option decoded;
6690 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
6691 ix86_handle_option (opts, opts_set,
6692 &decoded, input_location);
6695 else if (type == ix86_opt_yes || type == ix86_opt_no)
6697 if (type == ix86_opt_no)
6698 opt_set_p = !opt_set_p;
6700 if (opt_set_p)
6701 opts->x_target_flags |= mask;
6702 else
6703 opts->x_target_flags &= ~mask;
6706 else if (type == ix86_opt_str)
6708 if (p_strings[opt])
6710 error ("option(\"%s\") was already specified", opt_string);
6711 ret = false;
6713 else
6714 p_strings[opt] = xstrdup (p + opt_len);
6717 else if (type == ix86_opt_enum)
6719 bool arg_ok;
6720 int value;
6722 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
6723 if (arg_ok)
6724 set_option (opts, enum_opts_set, opt, value,
6725 p + opt_len, DK_UNSPECIFIED, input_location,
6726 global_dc);
6727 else
6729 error ("attribute(target(\"%s\")) is unknown", orig_p);
6730 ret = false;
6734 else
6735 gcc_unreachable ();
6738 return ret;
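/* Added illustrative sketch -- not part of the original i386.c.  It shows the
   kind of string the parser above accepts: a comma-separated list of ISA
   names (optionally negated with "no-"), plus "arch=", "tune=" and "fpmath="
   options.  The function below is hypothetical.  */

__attribute__((target ("avx2,fma")))
static double
example_dot_product (const double *a, const double *b, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += a[i] * b[i];     /* this body is compiled with AVX2 and FMA enabled */
  return s;
}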
6741 /* Release allocated strings. */
6742 static void
6743 release_options_strings (char **option_strings)
6745 /* Free up memory allocated to hold the strings */
6746 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
6747 free (option_strings[i]);
6750 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
6752 tree
6753 ix86_valid_target_attribute_tree (tree args,
6754 struct gcc_options *opts,
6755 struct gcc_options *opts_set)
6757 const char *orig_arch_string = opts->x_ix86_arch_string;
6758 const char *orig_tune_string = opts->x_ix86_tune_string;
6759 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
6760 int orig_tune_defaulted = ix86_tune_defaulted;
6761 int orig_arch_specified = ix86_arch_specified;
6762 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
6763 tree t = NULL_TREE;
6764 struct cl_target_option *def
6765 = TREE_TARGET_OPTION (target_option_default_node);
6766 struct gcc_options enum_opts_set;
6768 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
6770 /* Process each of the options on the chain. */
6771 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
6772 opts_set, &enum_opts_set))
6773 return error_mark_node;
6775 /* If the changed options are different from the default, rerun
6776 ix86_option_override_internal, and then save the options away.
6777 The string options are attribute options, and will be undone
6778 when we copy the save structure. */
6779 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
6780 || opts->x_target_flags != def->x_target_flags
6781 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
6782 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
6783 || enum_opts_set.x_ix86_fpmath)
6785 /* If we are using the default tune= or arch=, undo the string assigned,
6786 and use the default. */
6787 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
6789 opts->x_ix86_arch_string
6790 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
6792 /* If arch= is set, clear all bits in x_ix86_isa_flags,
6793 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
6794 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
6795 | OPTION_MASK_ABI_64
6796 | OPTION_MASK_ABI_X32
6797 | OPTION_MASK_CODE16);
6800 else if (!orig_arch_specified)
6801 opts->x_ix86_arch_string = NULL;
6803 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
6804 opts->x_ix86_tune_string
6805 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
6806 else if (orig_tune_defaulted)
6807 opts->x_ix86_tune_string = NULL;
6809 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
6810 if (enum_opts_set.x_ix86_fpmath)
6811 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6812 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6813 && TARGET_SSE_P (opts->x_ix86_isa_flags))
6815 if (TARGET_80387_P (opts->x_target_flags))
6816 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
6817 | FPMATH_387);
6818 else
6819 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
6820 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
6823 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
6824 bool r = ix86_option_override_internal (false, opts, opts_set);
6825 if (!r)
6827 release_options_strings (option_strings);
6828 return error_mark_node;
6831 /* Add any builtin functions with the new isa if any. */
6832 ix86_add_new_builtins (opts->x_ix86_isa_flags);
6834 /* Save the current options unless we are validating options for
6835 #pragma. */
6836 t = build_target_option_node (opts);
6838 opts->x_ix86_arch_string = orig_arch_string;
6839 opts->x_ix86_tune_string = orig_tune_string;
6840 opts_set->x_ix86_fpmath = orig_fpmath_set;
6842 release_options_strings (option_strings);
6845 return t;
6848 /* Hook to validate attribute((target("string"))). */
6850 static bool
6851 ix86_valid_target_attribute_p (tree fndecl,
6852 tree ARG_UNUSED (name),
6853 tree args,
6854 int ARG_UNUSED (flags))
6856 struct gcc_options func_options;
6857 tree new_target, new_optimize;
6858 bool ret = true;
6860 /* attribute((target("default"))) does nothing, beyond
6861 affecting multi-versioning. */
6862 if (TREE_VALUE (args)
6863 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
6864 && TREE_CHAIN (args) == NULL_TREE
6865 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
6866 return true;
6868 tree old_optimize = build_optimization_node (&global_options);
6870 /* Get the optimization options of the current function. */
6871 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
6873 if (!func_optimize)
6874 func_optimize = old_optimize;
6876 /* Init func_options. */
6877 memset (&func_options, 0, sizeof (func_options));
6878 init_options_struct (&func_options, NULL);
6879 lang_hooks.init_options_struct (&func_options);
6881 cl_optimization_restore (&func_options,
6882 TREE_OPTIMIZATION (func_optimize));
6884 /* Initialize func_options to the default before its target options can
6885 be set. */
6886 cl_target_option_restore (&func_options,
6887 TREE_TARGET_OPTION (target_option_default_node));
6889 new_target = ix86_valid_target_attribute_tree (args, &func_options,
6890 &global_options_set);
6892 new_optimize = build_optimization_node (&func_options);
6894 if (new_target == error_mark_node)
6895 ret = false;
6897 else if (fndecl && new_target)
6899 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
6901 if (old_optimize != new_optimize)
6902 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
6905 finalize_options_struct (&func_options);
6907 return ret;
6911 /* Hook to determine if one function can safely inline another. */
6913 static bool
6914 ix86_can_inline_p (tree caller, tree callee)
6916 bool ret = false;
6917 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
6918 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
6920 /* If callee has no option attributes, then it is ok to inline. */
6921 if (!callee_tree)
6922 ret = true;
6924 /* If caller has no option attributes, but callee does then it is not ok to
6925 inline. */
6926 else if (!caller_tree)
6927 ret = false;
6929 else
6931 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
6932 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
6934 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
6935 function can inline an SSE2 function, but an SSE2 function can't inline
6936 an SSE4 function. */
6937 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
6938 != callee_opts->x_ix86_isa_flags)
6939 ret = false;
6941 /* See if we have the same non-isa options. */
6942 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
6943 ret = false;
6945 /* See if arch, tune, etc. are the same. */
6946 else if (caller_opts->arch != callee_opts->arch)
6947 ret = false;
6949 else if (caller_opts->tune != callee_opts->tune)
6950 ret = false;
6952 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
6953 ret = false;
6955 else if (caller_opts->branch_cost != callee_opts->branch_cost)
6956 ret = false;
6958 else
6959 ret = true;
6962 return ret;
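/* Added illustrative sketch -- not part of the original i386.c.  It shows the
   subset rule implemented above; both functions are hypothetical and -O2 is
   assumed.  */

__attribute__((target ("sse2")))
static inline int
example_add_sse2 (int a, int b)
{
  return a + b;
}

__attribute__((target ("avx2")))
int
example_caller_avx2 (int a, int b)
{
  /* The callee's ISA flags (SSE2) are a subset of the caller's (AVX2 turns on
     the whole SSE chain), so inlining is permitted here.  The reverse
     direction -- an AVX2 callee into an SSE2 caller -- would be refused.  */
  return example_add_sse2 (a, b);
}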
6966 /* Remember the last target of ix86_set_current_function. */
6967 static GTY(()) tree ix86_previous_fndecl;
6969 /* Set target globals to the default (or current #pragma GCC target
6970 if active). Invalidate the ix86_previous_fndecl cache. */
6972 void
6973 ix86_reset_previous_fndecl (void)
6975 tree new_tree = target_option_current_node;
6976 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6977 if (TREE_TARGET_GLOBALS (new_tree))
6978 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6979 else if (new_tree == target_option_default_node)
6980 restore_target_globals (&default_target_globals);
6981 else
6982 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
6983 ix86_previous_fndecl = NULL_TREE;
6986 /* Set the func_type field from the function FNDECL. */
6988 static void
6989 ix86_set_func_type (tree fndecl)
6991 if (cfun->machine->func_type == TYPE_UNKNOWN)
6993 if (lookup_attribute ("interrupt",
6994 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
6996 int nargs = 0;
6997 for (tree arg = DECL_ARGUMENTS (fndecl);
6998 arg;
6999 arg = TREE_CHAIN (arg))
7000 nargs++;
7001 cfun->machine->no_caller_saved_registers = true;
7002 cfun->machine->func_type
7003 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7005 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7007 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7008 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7009 sorry ("Only DWARF debug format is supported for interrupt "
7010 "service routine.");
7012 else
7014 cfun->machine->func_type = TYPE_NORMAL;
7015 if (lookup_attribute ("no_caller_saved_registers",
7016 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7017 cfun->machine->no_caller_saved_registers = true;
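/* Added illustrative sketch -- not part of the original i386.c.  It shows the
   two handler shapes that the argument count above distinguishes.  The names
   are hypothetical; such handlers are normally built with
   -mgeneral-regs-only.  */

struct interrupt_frame;

#ifdef __x86_64__
typedef unsigned long long int uword_t;
#else
typedef unsigned int uword_t;
#endif

__attribute__((interrupt))
void
example_interrupt_handler (struct interrupt_frame *frame)
{
  (void) frame;                 /* one argument -> TYPE_INTERRUPT */
}

__attribute__((interrupt))
void
example_exception_handler (struct interrupt_frame *frame, uword_t error_code)
{
  (void) frame;
  (void) error_code;            /* two arguments -> TYPE_EXCEPTION */
}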
7022 /* Establish appropriate back-end context for processing the function
7023 FNDECL. The argument might be NULL to indicate processing at top
7024 level, outside of any function scope. */
7025 static void
7026 ix86_set_current_function (tree fndecl)
7028 /* Only change the context if the function changes. This hook is called
7029 several times in the course of compiling a function, and we don't want to
7030 slow things down too much or call target_reinit when it isn't safe. */
7031 if (fndecl == ix86_previous_fndecl)
7033 /* There may be 2 function bodies for the same function FNDECL,
7034 one is extern inline and one isn't. Call ix86_set_func_type
7035 to set the func_type field. */
7036 if (fndecl != NULL_TREE)
7037 ix86_set_func_type (fndecl);
7038 return;
7041 tree old_tree;
7042 if (ix86_previous_fndecl == NULL_TREE)
7043 old_tree = target_option_current_node;
7044 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7045 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7046 else
7047 old_tree = target_option_default_node;
7049 if (fndecl == NULL_TREE)
7051 if (old_tree != target_option_current_node)
7052 ix86_reset_previous_fndecl ();
7053 return;
7056 ix86_set_func_type (fndecl);
7058 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7059 if (new_tree == NULL_TREE)
7060 new_tree = target_option_default_node;
7062 if (old_tree != new_tree)
7064 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7065 if (TREE_TARGET_GLOBALS (new_tree))
7066 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7067 else if (new_tree == target_option_default_node)
7068 restore_target_globals (&default_target_globals);
7069 else
7070 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7072 ix86_previous_fndecl = fndecl;
7074 static bool prev_no_caller_saved_registers;
7076 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7077 Avoid expensive re-initialization of init_regs each time we switch
7078 function context. */
7079 if (TARGET_64BIT
7080 && (call_used_regs[SI_REG]
7081 == (cfun->machine->call_abi == MS_ABI)))
7082 reinit_regs ();
7083 /* Need to re-initialize init_regs if caller-saved registers are
7084 changed. */
7085 else if (prev_no_caller_saved_registers
7086 != cfun->machine->no_caller_saved_registers)
7087 reinit_regs ();
7089 if (cfun->machine->func_type != TYPE_NORMAL
7090 || cfun->machine->no_caller_saved_registers)
7092 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
7093 may change processor state. */
7094 const char *isa;
7095 if (TARGET_MPX)
7096 isa = "MPX";
7097 else if (TARGET_SSE)
7098 isa = "SSE";
7099 else if (TARGET_MMX)
7100 isa = "MMX/3Dnow";
7101 else if (TARGET_80387)
7102 isa = "80387";
7103 else
7104 isa = NULL;
7105 if (isa != NULL)
7107 if (cfun->machine->func_type != TYPE_NORMAL)
7108 sorry ("%s instructions aren't allowed in %s service routine",
7109 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7110 ? "exception" : "interrupt"));
7111 else
7112 sorry ("%s instructions aren't allowed in function with "
7113 "no_caller_saved_registers attribute", isa);
7114 /* Don't issue the same error twice. */
7115 cfun->machine->func_type = TYPE_NORMAL;
7116 cfun->machine->no_caller_saved_registers = false;
7120 prev_no_caller_saved_registers
7121 = cfun->machine->no_caller_saved_registers;
7125 /* Return true if this goes in large data/bss. */
7127 static bool
7128 ix86_in_large_data_p (tree exp)
7130 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7131 return false;
7133 if (exp == NULL_TREE)
7134 return false;
7136 /* Functions are never large data. */
7137 if (TREE_CODE (exp) == FUNCTION_DECL)
7138 return false;
7140 /* Automatic variables are never large data. */
7141 if (TREE_CODE (exp) == VAR_DECL && !is_global_var (exp))
7142 return false;
7144 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
7146 const char *section = DECL_SECTION_NAME (exp);
7147 if (strcmp (section, ".ldata") == 0
7148 || strcmp (section, ".lbss") == 0)
7149 return true;
7150 return false;
7152 else
7154 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7156 /* If this is an incomplete type with size 0, then we can't put it
7157 in data because it might be too big when completed. Also,
7158 int_size_in_bytes returns -1 if the size can vary or is larger than
7159 an integer, in which case it is also safer to assume that it goes in
7160 large data. */
7161 if (size <= 0 || size > ix86_section_threshold)
7162 return true;
7165 return false;
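/* Added illustrative sketch -- not part of the original i386.c.  With
   -mcmodel=medium and the default -mlarge-data-threshold of 65536, the
   predicate above sorts these hypothetical objects as follows.  */

static char example_big_buffer[1 << 20];   /* above the threshold: .lbss         */
static int  example_counters[16];          /* below the threshold: ordinary .bss */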
7168 /* i386-specific section flag to mark large sections. */
7169 #define SECTION_LARGE SECTION_MACH_DEP
7171 /* Switch to the appropriate section for output of DECL.
7172 DECL is either a `VAR_DECL' node or a constant of some sort.
7173 RELOC indicates whether forming the initial value of DECL requires
7174 link-time relocations. */
7176 ATTRIBUTE_UNUSED static section *
7177 x86_64_elf_select_section (tree decl, int reloc,
7178 unsigned HOST_WIDE_INT align)
7180 if (ix86_in_large_data_p (decl))
7182 const char *sname = NULL;
7183 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7184 switch (categorize_decl_for_section (decl, reloc))
7186 case SECCAT_DATA:
7187 sname = ".ldata";
7188 break;
7189 case SECCAT_DATA_REL:
7190 sname = ".ldata.rel";
7191 break;
7192 case SECCAT_DATA_REL_LOCAL:
7193 sname = ".ldata.rel.local";
7194 break;
7195 case SECCAT_DATA_REL_RO:
7196 sname = ".ldata.rel.ro";
7197 break;
7198 case SECCAT_DATA_REL_RO_LOCAL:
7199 sname = ".ldata.rel.ro.local";
7200 break;
7201 case SECCAT_BSS:
7202 sname = ".lbss";
7203 flags |= SECTION_BSS;
7204 break;
7205 case SECCAT_RODATA:
7206 case SECCAT_RODATA_MERGE_STR:
7207 case SECCAT_RODATA_MERGE_STR_INIT:
7208 case SECCAT_RODATA_MERGE_CONST:
7209 sname = ".lrodata";
7210 flags &= ~SECTION_WRITE;
7211 break;
7212 case SECCAT_SRODATA:
7213 case SECCAT_SDATA:
7214 case SECCAT_SBSS:
7215 gcc_unreachable ();
7216 case SECCAT_TEXT:
7217 case SECCAT_TDATA:
7218 case SECCAT_TBSS:
7219 /* We don't split these for the medium model. Place them into
7220 default sections and hope for the best. */
7221 break;
7223 if (sname)
7225 /* We might get called with string constants, but get_named_section
7226 doesn't like them as they are not DECLs. Also, we need to set
7227 flags in that case. */
7228 if (!DECL_P (decl))
7229 return get_section (sname, flags, NULL);
7230 return get_named_section (decl, sname, reloc);
7233 return default_elf_select_section (decl, reloc, align);
7236 /* Select a set of attributes for section NAME based on the properties
7237 of DECL and whether or not RELOC indicates that DECL's initializer
7238 might contain runtime relocations. */
7240 static unsigned int ATTRIBUTE_UNUSED
7241 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7243 unsigned int flags = default_section_type_flags (decl, name, reloc);
7245 if (ix86_in_large_data_p (decl))
7246 flags |= SECTION_LARGE;
7248 if (decl == NULL_TREE
7249 && (strcmp (name, ".ldata.rel.ro") == 0
7250 || strcmp (name, ".ldata.rel.ro.local") == 0))
7251 flags |= SECTION_RELRO;
7253 if (strcmp (name, ".lbss") == 0
7254 || strncmp (name, ".lbss.", 6) == 0
7255 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7256 flags |= SECTION_BSS;
7258 return flags;
7261 /* Build up a unique section name, expressed as a
7262 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7263 RELOC indicates whether the initial value of EXP requires
7264 link-time relocations. */
7266 static void ATTRIBUTE_UNUSED
7267 x86_64_elf_unique_section (tree decl, int reloc)
7269 if (ix86_in_large_data_p (decl))
7271 const char *prefix = NULL;
7272 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7273 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7275 switch (categorize_decl_for_section (decl, reloc))
7277 case SECCAT_DATA:
7278 case SECCAT_DATA_REL:
7279 case SECCAT_DATA_REL_LOCAL:
7280 case SECCAT_DATA_REL_RO:
7281 case SECCAT_DATA_REL_RO_LOCAL:
7282 prefix = one_only ? ".ld" : ".ldata";
7283 break;
7284 case SECCAT_BSS:
7285 prefix = one_only ? ".lb" : ".lbss";
7286 break;
7287 case SECCAT_RODATA:
7288 case SECCAT_RODATA_MERGE_STR:
7289 case SECCAT_RODATA_MERGE_STR_INIT:
7290 case SECCAT_RODATA_MERGE_CONST:
7291 prefix = one_only ? ".lr" : ".lrodata";
7292 break;
7293 case SECCAT_SRODATA:
7294 case SECCAT_SDATA:
7295 case SECCAT_SBSS:
7296 gcc_unreachable ();
7297 case SECCAT_TEXT:
7298 case SECCAT_TDATA:
7299 case SECCAT_TBSS:
7300 /* We don't split these for the medium model. Place them into
7301 default sections and hope for the best. */
7302 break;
7304 if (prefix)
7306 const char *name, *linkonce;
7307 char *string;
7309 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7310 name = targetm.strip_name_encoding (name);
7312 /* If we're using one_only, then there needs to be a .gnu.linkonce
7313 prefix to the section name. */
7314 linkonce = one_only ? ".gnu.linkonce" : "";
7316 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7318 set_decl_section_name (decl, string);
7319 return;
7322 default_unique_section (decl, reloc);
7325 #ifdef COMMON_ASM_OP
7327 #ifndef LARGECOMM_SECTION_ASM_OP
7328 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7329 #endif
7331 /* This says how to output assembler code to declare an
7332 uninitialized external linkage data object.
7334 For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
7335 for large objects. */
7336 void
7337 x86_elf_aligned_decl_common (FILE *file, tree decl,
7338 const char *name, unsigned HOST_WIDE_INT size,
7339 int align)
7341 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7342 && size > (unsigned int)ix86_section_threshold)
7344 switch_to_section (get_named_section (decl, ".lbss", 0));
7345 fputs (LARGECOMM_SECTION_ASM_OP, file);
7347 else
7348 fputs (COMMON_ASM_OP, file);
7349 assemble_name (file, name);
7350 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7351 size, align / BITS_PER_UNIT);
7353 #endif
7355 /* Utility function for targets to use in implementing
7356 ASM_OUTPUT_ALIGNED_BSS. */
7358 void
7359 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7360 unsigned HOST_WIDE_INT size, int align)
7362 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7363 && size > (unsigned int)ix86_section_threshold)
7364 switch_to_section (get_named_section (decl, ".lbss", 0));
7365 else
7366 switch_to_section (bss_section);
7367 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7368 #ifdef ASM_DECLARE_OBJECT_NAME
7369 last_assemble_variable_decl = decl;
7370 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7371 #else
7372 /* The standard thing is to just output a label for the object. */
7373 ASM_OUTPUT_LABEL (file, name);
7374 #endif /* ASM_DECLARE_OBJECT_NAME */
7375 ASM_OUTPUT_SKIP (file, size ? size : 1);
7378 /* Decide whether we must probe the stack before any space allocation
7379 on this target. It's essentially TARGET_STACK_PROBE except when
7380 -fstack-check causes the stack to be already probed differently. */
7382 bool
7383 ix86_target_stack_probe (void)
7385 /* Do not probe the stack twice if static stack checking is enabled. */
7386 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7387 return false;
7389 return TARGET_STACK_PROBE;
7392 /* Decide whether we can make a sibling call to a function. DECL is the
7393 declaration of the function being targeted by the call and EXP is the
7394 CALL_EXPR representing the call. */
7396 static bool
7397 ix86_function_ok_for_sibcall (tree decl, tree exp)
7399 tree type, decl_or_type;
7400 rtx a, b;
7401 bool bind_global = decl && !targetm.binds_local_p (decl);
7403 /* Sibling call isn't OK if there are no caller-saved registers
7404 since all registers must be preserved before return. */
7405 if (cfun->machine->no_caller_saved_registers)
7406 return false;
7408 /* If we are generating position-independent code, we cannot sibcall
7409 optimize direct calls to global functions, as the PLT requires
7410 %ebx be live. (Darwin does not have a PLT.) */
7411 if (!TARGET_MACHO
7412 && !TARGET_64BIT
7413 && flag_pic
7414 && flag_plt
7415 && bind_global)
7416 return false;
7418 /* If we need to align the outgoing stack, then sibcalling would
7419 unalign the stack, which may break the called function. */
7420 if (ix86_minimum_incoming_stack_boundary (true)
7421 < PREFERRED_STACK_BOUNDARY)
7422 return false;
7424 if (decl)
7426 decl_or_type = decl;
7427 type = TREE_TYPE (decl);
7429 else
7431 /* We're looking at the CALL_EXPR, we need the type of the function. */
7432 type = CALL_EXPR_FN (exp); /* pointer expression */
7433 type = TREE_TYPE (type); /* pointer type */
7434 type = TREE_TYPE (type); /* function type */
7435 decl_or_type = type;
7438 /* Check that the return value locations are the same. Like
7439 if we are returning floats on the 80387 register stack, we cannot
7440 make a sibcall from a function that doesn't return a float to a
7441 function that does or, conversely, from a function that does return
7442 a float to a function that doesn't; the necessary stack adjustment
7443 would not be executed. This is also the place we notice
7444 differences in the return value ABI. Note that it is ok for one
7445 of the functions to have void return type as long as the return
7446 value of the other is passed in a register. */
7447 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7448 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7449 cfun->decl, false);
7450 if (STACK_REG_P (a) || STACK_REG_P (b))
7452 if (!rtx_equal_p (a, b))
7453 return false;
7455 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7457 else if (!rtx_equal_p (a, b))
7458 return false;
7460 if (TARGET_64BIT)
7462 /* The SYSV ABI has more call-clobbered registers;
7463 disallow sibcalls from MS to SYSV. */
7464 if (cfun->machine->call_abi == MS_ABI
7465 && ix86_function_type_abi (type) == SYSV_ABI)
7466 return false;
7468 else
7470 /* If this call is indirect, we'll need to be able to use a
7471 call-clobbered register for the address of the target function.
7472 Make sure that all such registers are not used for passing
7473 parameters. Note that DLLIMPORT functions and calls to a global
7474 function via the GOT slot are indirect. */
7475 if (!decl
7476 || (bind_global && flag_pic && !flag_plt)
7477 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
7479 /* Check if regparm >= 3 since arg_reg_available is set to
7480 false if regparm == 0. If regparm is 1 or 2, there is
7481 always a call-clobbered register available.
7483 ??? The symbol indirect call doesn't need a call-clobbered
7484 register. But we don't know if this is a symbol indirect
7485 call or not here. */
7486 if (ix86_function_regparm (type, NULL) >= 3
7487 && !cfun->machine->arg_reg_available)
7488 return false;
7492 /* Otherwise okay. That also includes certain types of indirect calls. */
7493 return true;
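/* Added illustrative sketch -- not part of the original i386.c.  Two of the
   cases the predicate above distinguishes, assuming 64-bit code at -O2; all
   names are hypothetical.  */

extern long example_passthru (long);

long
example_forward (long x)
{
  return example_passthru (x);          /* same return register (%rax) and no
                                           stack realignment: typically emitted
                                           as a direct jmp                      */
}

extern long example_sysv_target (long) __attribute__((sysv_abi));

__attribute__((ms_abi)) long
example_ms_caller (long x)
{
  return example_sysv_target (x);       /* MS-ABI caller -> SYSV-ABI callee is
                                           refused: the SYSV ABI clobbers more
                                           registers                            */
}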
7496 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
7497 and "sseregparm" calling convention attributes;
7498 arguments as in struct attribute_spec.handler. */
7500 static tree
7501 ix86_handle_cconv_attribute (tree *node, tree name,
7502 tree args,
7503 int,
7504 bool *no_add_attrs)
7506 if (TREE_CODE (*node) != FUNCTION_TYPE
7507 && TREE_CODE (*node) != METHOD_TYPE
7508 && TREE_CODE (*node) != FIELD_DECL
7509 && TREE_CODE (*node) != TYPE_DECL)
7511 warning (OPT_Wattributes, "%qE attribute only applies to functions",
7512 name);
7513 *no_add_attrs = true;
7514 return NULL_TREE;
7517 /* Can combine regparm with all attributes but fastcall and thiscall. */
7518 if (is_attribute_p ("regparm", name))
7520 tree cst;
7522 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7524 error ("fastcall and regparm attributes are not compatible");
7527 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7529 error ("regparm and thiscall attributes are not compatible");
7532 cst = TREE_VALUE (args);
7533 if (TREE_CODE (cst) != INTEGER_CST)
7535 warning (OPT_Wattributes,
7536 "%qE attribute requires an integer constant argument",
7537 name);
7538 *no_add_attrs = true;
7540 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
7542 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
7543 name, REGPARM_MAX);
7544 *no_add_attrs = true;
7547 return NULL_TREE;
7550 if (TARGET_64BIT)
7552 /* Do not warn when emulating the MS ABI. */
7553 if ((TREE_CODE (*node) != FUNCTION_TYPE
7554 && TREE_CODE (*node) != METHOD_TYPE)
7555 || ix86_function_type_abi (*node) != MS_ABI)
7556 warning (OPT_Wattributes, "%qE attribute ignored",
7557 name);
7558 *no_add_attrs = true;
7559 return NULL_TREE;
7562 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
7563 if (is_attribute_p ("fastcall", name))
7565 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7567 error ("fastcall and cdecl attributes are not compatible");
7569 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7571 error ("fastcall and stdcall attributes are not compatible");
7573 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
7575 error ("fastcall and regparm attributes are not compatible");
7577 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7579 error ("fastcall and thiscall attributes are not compatible");
7583 /* Can combine stdcall with fastcall (redundant), regparm and
7584 sseregparm. */
7585 else if (is_attribute_p ("stdcall", name))
7587 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7589 error ("stdcall and cdecl attributes are not compatible");
7591 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7593 error ("stdcall and fastcall attributes are not compatible");
7595 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7597 error ("stdcall and thiscall attributes are not compatible");
7601 /* Can combine cdecl with regparm and sseregparm. */
7602 else if (is_attribute_p ("cdecl", name))
7604 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7606 error ("stdcall and cdecl attributes are not compatible");
7608 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7610 error ("fastcall and cdecl attributes are not compatible");
7612 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
7614 error ("cdecl and thiscall attributes are not compatible");
7617 else if (is_attribute_p ("thiscall", name))
7619 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
7620 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
7621 name);
7622 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
7624 error ("stdcall and thiscall attributes are not compatible");
7626 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
7628 error ("fastcall and thiscall attributes are not compatible");
7630 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
7632 error ("cdecl and thiscall attributes are not compatible");
7636 /* Can combine sseregparm with all attributes. */
7638 return NULL_TREE;
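/* Added illustrative sketch -- not part of the original i386.c.  32-bit
   declarations using the conventions handled above, plus one combination the
   handler rejects.  Names are hypothetical.  */

#ifdef __i386__
int __attribute__((stdcall))     example_stdcall (int a, int b);   /* callee pops its args */
int __attribute__((fastcall))    example_fastcall (int a, int b);  /* a in %ecx, b in %edx */
int __attribute__((regparm (3))) example_regparm (int a, int b, int c);
                                                     /* a, b, c in %eax, %edx, %ecx */

/* Mixing incompatible conventions is diagnosed, e.g.
     int __attribute__((fastcall, regparm (2))) example_bad (int, int);
   yields "fastcall and regparm attributes are not compatible".  */
#endif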
7641 /* The transactional memory builtins are implicitly regparm or fastcall
7642 depending on the ABI. Override the generic do-nothing attribute that
7643 these builtins were declared with, and replace it with one of the two
7644 attributes that we expect elsewhere. */
7646 static tree
7647 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
7648 int flags, bool *no_add_attrs)
7650 tree alt;
7652 /* In no case do we want to add the placeholder attribute. */
7653 *no_add_attrs = true;
7655 /* The 64-bit ABI is unchanged for transactional memory. */
7656 if (TARGET_64BIT)
7657 return NULL_TREE;
7659 /* ??? Is there a better way to validate 32-bit windows? We have
7660 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
7661 if (CHECK_STACK_LIMIT > 0)
7662 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
7663 else
7665 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
7666 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
7668 decl_attributes (node, alt, flags);
7670 return NULL_TREE;
7673 /* This function determines the calling convention from TYPE. */
7675 unsigned int
7676 ix86_get_callcvt (const_tree type)
7678 unsigned int ret = 0;
7679 bool is_stdarg;
7680 tree attrs;
7682 if (TARGET_64BIT)
7683 return IX86_CALLCVT_CDECL;
7685 attrs = TYPE_ATTRIBUTES (type);
7686 if (attrs != NULL_TREE)
7688 if (lookup_attribute ("cdecl", attrs))
7689 ret |= IX86_CALLCVT_CDECL;
7690 else if (lookup_attribute ("stdcall", attrs))
7691 ret |= IX86_CALLCVT_STDCALL;
7692 else if (lookup_attribute ("fastcall", attrs))
7693 ret |= IX86_CALLCVT_FASTCALL;
7694 else if (lookup_attribute ("thiscall", attrs))
7695 ret |= IX86_CALLCVT_THISCALL;
7697 /* Regparm isn't allowed for thiscall and fastcall. */
7698 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
7700 if (lookup_attribute ("regparm", attrs))
7701 ret |= IX86_CALLCVT_REGPARM;
7702 if (lookup_attribute ("sseregparm", attrs))
7703 ret |= IX86_CALLCVT_SSEREGPARM;
7706 if (IX86_BASE_CALLCVT(ret) != 0)
7707 return ret;
7710 is_stdarg = stdarg_p (type);
7711 if (TARGET_RTD && !is_stdarg)
7712 return IX86_CALLCVT_STDCALL | ret;
7714 if (ret != 0
7715 || is_stdarg
7716 || TREE_CODE (type) != METHOD_TYPE
7717 || ix86_function_type_abi (type) != MS_ABI)
7718 return IX86_CALLCVT_CDECL | ret;
7720 return IX86_CALLCVT_THISCALL;
7723 /* Return 0 if the attributes for two types are incompatible, 1 if they
7724 are compatible, and 2 if they are nearly compatible (which causes a
7725 warning to be generated). */
7727 static int
7728 ix86_comp_type_attributes (const_tree type1, const_tree type2)
7730 unsigned int ccvt1, ccvt2;
7732 if (TREE_CODE (type1) != FUNCTION_TYPE
7733 && TREE_CODE (type1) != METHOD_TYPE)
7734 return 1;
7736 ccvt1 = ix86_get_callcvt (type1);
7737 ccvt2 = ix86_get_callcvt (type2);
7738 if (ccvt1 != ccvt2)
7739 return 0;
7740 if (ix86_function_regparm (type1, NULL)
7741 != ix86_function_regparm (type2, NULL))
7742 return 0;
7744 return 1;
7747 /* Return the regparm value for a function with the indicated TYPE and DECL.
7748 DECL may be NULL when calling function indirectly
7749 or considering a libcall. */
7751 static int
7752 ix86_function_regparm (const_tree type, const_tree decl)
7754 tree attr;
7755 int regparm;
7756 unsigned int ccvt;
7758 if (TARGET_64BIT)
7759 return (ix86_function_type_abi (type) == SYSV_ABI
7760 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
7761 ccvt = ix86_get_callcvt (type);
7762 regparm = ix86_regparm;
7764 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
7766 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
7767 if (attr)
7769 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
7770 return regparm;
7773 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7774 return 2;
7775 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7776 return 1;
7778 /* Use register calling convention for local functions when possible. */
7779 if (decl
7780 && TREE_CODE (decl) == FUNCTION_DECL)
7782 cgraph_node *target = cgraph_node::get (decl);
7783 if (target)
7784 target = target->function_symbol ();
7786 /* Caller and callee must agree on the calling convention, so
7787 checking just the global `optimize' flag here would mean that with
7788 __attribute__((optimize (...))) the caller could use the regparm
7789 convention and the callee not, or vice versa. Instead look at
7790 whether the callee is optimized or not. */
7791 if (target && opt_for_fn (target->decl, optimize)
7792 && !(profile_flag && !flag_fentry))
7794 cgraph_local_info *i = &target->local;
7795 if (i && i->local && i->can_change_signature)
7797 int local_regparm, globals = 0, regno;
7799 /* Make sure no regparm register is taken by a
7800 fixed register variable. */
7801 for (local_regparm = 0; local_regparm < REGPARM_MAX;
7802 local_regparm++)
7803 if (fixed_regs[local_regparm])
7804 break;
7806 /* We don't want to use regparm(3) for nested functions as
7807 these use a static chain pointer in the third argument. */
7808 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
7809 local_regparm = 2;
7811 /* Save a register for the split stack. */
7812 if (local_regparm == 3 && flag_split_stack)
7813 local_regparm = 2;
7815 /* Each fixed register usage increases register pressure,
7816 so fewer registers should be used for argument passing.
7817 This functionality can be overridden by an explicit
7818 regparm value. */
7819 for (regno = AX_REG; regno <= DI_REG; regno++)
7820 if (fixed_regs[regno])
7821 globals++;
7823 local_regparm
7824 = globals < local_regparm ? local_regparm - globals : 0;
7826 if (local_regparm > regparm)
7827 regparm = local_regparm;
7832 return regparm;
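/* Added illustrative sketch -- not part of the original i386.c.  It shows the
   local-function promotion performed above: with -m32 -O2 and no explicit
   regparm, a static function whose address never escapes can have its
   arguments moved into registers automatically.  Names are hypothetical.  */

static int
example_local_helper (int a, int b)     /* eligible for an implicit regparm */
{
  return a * b;
}

int
example_public_entry (int a, int b)     /* exported: keeps the stack convention */
{
  return example_local_helper (a, b);
}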
7835 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
7836 DFmode (2) arguments in SSE registers for a function with the
7837 indicated TYPE and DECL. DECL may be NULL when calling function
7838 indirectly or considering a libcall. Return -1 if any FP parameter
7840 should be rejected by error. This is used in situations where we imply the
7841 SSE calling convention but the function is called from another function with
7842 SSE disabled. Otherwise return 0. */
7843 static int
7844 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
7846 gcc_assert (!TARGET_64BIT);
7848 /* Use SSE registers to pass SFmode and DFmode arguments if requested
7849 by the sseregparm attribute. */
7850 if (TARGET_SSEREGPARM
7851 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
7853 if (!TARGET_SSE)
7855 if (warn)
7857 if (decl)
7858 error ("calling %qD with attribute sseregparm without "
7859 "SSE/SSE2 enabled", decl);
7860 else
7861 error ("calling %qT with attribute sseregparm without "
7862 "SSE/SSE2 enabled", type);
7864 return 0;
7867 return 2;
7870 if (!decl)
7871 return 0;
7873 cgraph_node *target = cgraph_node::get (decl);
7874 if (target)
7875 target = target->function_symbol ();
7877 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
7878 (and DFmode for SSE2) arguments in SSE registers. */
7879 if (target
7880 /* TARGET_SSE_MATH */
7881 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
7882 && opt_for_fn (target->decl, optimize)
7883 && !(profile_flag && !flag_fentry))
7885 cgraph_local_info *i = &target->local;
7886 if (i && i->local && i->can_change_signature)
7888 /* Refuse to produce wrong code when a local function with SSE enabled
7889 is called from an SSE-disabled function.
7890 FIXME: We need a way to detect these cases across ltrans partitions
7891 and avoid using SSE calling conventions on local functions called
7892 from functions with SSE disabled. For now at least delay the
7893 warning until we know we are going to produce wrong code.
7894 See PR66047. */
7895 if (!TARGET_SSE && warn)
7896 return -1;
7897 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
7898 ->x_ix86_isa_flags) ? 2 : 1;
7902 return 0;
7905 /* Return true if EAX is live at the start of the function. Used by
7906 ix86_expand_prologue to determine if we need special help before
7907 calling allocate_stack_worker. */
7909 static bool
7910 ix86_eax_live_at_start_p (void)
7912 /* Cheat. Don't bother working forward from ix86_function_regparm
7913 to the function type to whether an actual argument is located in
7914 eax. Instead just look at cfg info, which is still close enough
7915 to correct at this point. This gives false positives for broken
7916 functions that might use uninitialized data that happens to be
7917 allocated in eax, but who cares? */
7918 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
7921 static bool
7922 ix86_keep_aggregate_return_pointer (tree fntype)
7924 tree attr;
7926 if (!TARGET_64BIT)
7928 attr = lookup_attribute ("callee_pop_aggregate_return",
7929 TYPE_ATTRIBUTES (fntype));
7930 if (attr)
7931 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
7933 /* For 32-bit MS-ABI the default is to keep aggregate
7934 return pointer. */
7935 if (ix86_function_type_abi (fntype) == MS_ABI)
7936 return true;
7938 return KEEP_AGGREGATE_RETURN_POINTER != 0;
7941 /* Value is the number of bytes of arguments automatically
7942 popped when returning from a subroutine call.
7943 FUNDECL is the declaration node of the function (as a tree),
7944 FUNTYPE is the data type of the function (as a tree),
7945 or for a library call it is an identifier node for the subroutine name.
7946 SIZE is the number of bytes of arguments passed on the stack.
7948 On the 80386, the RTD insn may be used to pop them if the number
7949 of args is fixed, but if the number is variable then the caller
7950 must pop them all. RTD can't be used for library calls now
7951 because the library is compiled with the Unix compiler.
7952 Use of RTD is a selectable option, since it is incompatible with
7953 standard Unix calling sequences. If the option is not selected,
7954 the caller must always pop the args.
7956 The attribute stdcall is equivalent to RTD on a per module basis. */
7958 static int
7959 ix86_return_pops_args (tree fundecl, tree funtype, int size)
7961 unsigned int ccvt;
7963 /* None of the 64-bit ABIs pop arguments. */
7964 if (TARGET_64BIT)
7965 return 0;
7967 ccvt = ix86_get_callcvt (funtype);
7969 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
7970 | IX86_CALLCVT_THISCALL)) != 0
7971 && ! stdarg_p (funtype))
7972 return size;
7974 /* Lose any fake structure return argument if it is passed on the stack. */
7975 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
7976 && !ix86_keep_aggregate_return_pointer (funtype))
7978 int nregs = ix86_function_regparm (funtype, fundecl);
7979 if (nregs == 0)
7980 return GET_MODE_SIZE (Pmode);
7983 return 0;
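/* Added illustrative sketch -- not part of the original i386.c.  What the
   size returned above means at the call site, for hypothetical 32-bit
   declarations.  */

#ifdef __i386__
void __attribute__((stdcall)) example_take2 (int a, int b); /* callee returns with "ret $8" */
void example_take_variadic (int a, ...);                    /* variadic: the caller pops    */
#endif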
7986 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
7988 static bool
7989 ix86_legitimate_combined_insn (rtx_insn *insn)
7991 /* Check operand constraints in case hard registers were propagated
7992 into insn pattern. This check prevents combine pass from
7993 generating insn patterns with invalid hard register operands.
7994 These invalid insns can eventually confuse reload to error out
7995 with a spill failure. See also PRs 46829 and 46843. */
7996 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
7998 int i;
8000 extract_insn (insn);
8001 preprocess_constraints (insn);
8003 int n_operands = recog_data.n_operands;
8004 int n_alternatives = recog_data.n_alternatives;
8005 for (i = 0; i < n_operands; i++)
8007 rtx op = recog_data.operand[i];
8008 machine_mode mode = GET_MODE (op);
8009 const operand_alternative *op_alt;
8010 int offset = 0;
8011 bool win;
8012 int j;
8014 /* A unary operator may be accepted by the predicate, but it
8015 is irrelevant for matching constraints. */
8016 if (UNARY_P (op))
8017 op = XEXP (op, 0);
8019 if (SUBREG_P (op))
8021 if (REG_P (SUBREG_REG (op))
8022 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8023 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8024 GET_MODE (SUBREG_REG (op)),
8025 SUBREG_BYTE (op),
8026 GET_MODE (op));
8027 op = SUBREG_REG (op);
8030 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8031 continue;
8033 op_alt = recog_op_alt;
8035 /* Operand has no constraints, anything is OK. */
8036 win = !n_alternatives;
8038 alternative_mask preferred = get_preferred_alternatives (insn);
8039 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8041 if (!TEST_BIT (preferred, j))
8042 continue;
8043 if (op_alt[i].anything_ok
8044 || (op_alt[i].matches != -1
8045 && operands_match_p
8046 (recog_data.operand[i],
8047 recog_data.operand[op_alt[i].matches]))
8048 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8050 win = true;
8051 break;
8055 if (!win)
8056 return false;
8060 return true;
8063 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8065 static unsigned HOST_WIDE_INT
8066 ix86_asan_shadow_offset (void)
8068 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8069 : HOST_WIDE_INT_C (0x7fff8000))
8070 : (HOST_WIDE_INT_1 << 29);
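/* Added illustrative sketch -- not part of the original i386.c.  It shows how
   the offset returned above is used: AddressSanitizer maps every 8 bytes of
   application memory to one shadow byte.  The helper is hypothetical.  */

static inline unsigned long long
example_asan_shadow_address (unsigned long long app_addr,
                             unsigned long long shadow_offset)
{
  /* E.g. shadow_offset is 0x7fff8000 for LP64 x86-64 Linux.  */
  return (app_addr >> 3) + shadow_offset;
}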
8073 /* Argument support functions. */
8075 /* Return true when register may be used to pass function parameters. */
8076 bool
8077 ix86_function_arg_regno_p (int regno)
8079 int i;
8080 enum calling_abi call_abi;
8081 const int *parm_regs;
8083 if (TARGET_MPX && BND_REGNO_P (regno))
8084 return true;
8086 if (!TARGET_64BIT)
8088 if (TARGET_MACHO)
8089 return (regno < REGPARM_MAX
8090 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8091 else
8092 return (regno < REGPARM_MAX
8093 || (TARGET_MMX && MMX_REGNO_P (regno)
8094 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8095 || (TARGET_SSE && SSE_REGNO_P (regno)
8096 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8099 if (TARGET_SSE && SSE_REGNO_P (regno)
8100 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8101 return true;
8103 /* TODO: The function should depend on current function ABI but
8104 builtins.c would need updating then. Therefore we use the
8105 default ABI. */
8106 call_abi = ix86_cfun_abi ();
8108 /* RAX is used as hidden argument to va_arg functions. */
8109 if (call_abi == SYSV_ABI && regno == AX_REG)
8110 return true;
8112 if (call_abi == MS_ABI)
8113 parm_regs = x86_64_ms_abi_int_parameter_registers;
8114 else
8115 parm_regs = x86_64_int_parameter_registers;
8117 for (i = 0; i < (call_abi == MS_ABI
8118 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8119 if (regno == parm_regs[i])
8120 return true;
8121 return false;
8124 /* Return true if we do not know how to pass TYPE solely in registers. */
8126 static bool
8127 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8129 if (must_pass_in_stack_var_size_or_pad (mode, type))
8130 return true;
8132 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8133 The layout_type routine is crafty and tries to trick us into passing
8134 currently unsupported vector types on the stack by using TImode. */
8135 return (!TARGET_64BIT && mode == TImode
8136 && type && TREE_CODE (type) != VECTOR_TYPE);
8139 /* Return the size, in bytes, of the area reserved for arguments passed
8140 in registers for the function represented by FNDECL, depending on the
8141 ABI format used. */
8142 int
8143 ix86_reg_parm_stack_space (const_tree fndecl)
8145 enum calling_abi call_abi = SYSV_ABI;
8146 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8147 call_abi = ix86_function_abi (fndecl);
8148 else
8149 call_abi = ix86_function_type_abi (fndecl);
8150 if (TARGET_64BIT && call_abi == MS_ABI)
8151 return 32;
8152 return 0;
8155 /* We add this as a workaround in order to use libc_has_function
8156 hook in i386.md. */
8157 bool
8158 ix86_libc_has_function (enum function_class fn_class)
8160 return targetm.libc_has_function (fn_class);
8163 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
8164 specifying the call ABI used. */
8165 enum calling_abi
8166 ix86_function_type_abi (const_tree fntype)
8168 enum calling_abi abi = ix86_abi;
8170 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8171 return abi;
8173 if (abi == SYSV_ABI
8174 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8176 if (TARGET_X32)
8177 error ("X32 does not support ms_abi attribute");
8179 abi = MS_ABI;
8181 else if (abi == MS_ABI
8182 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8183 abi = SYSV_ABI;
8185 return abi;
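/* Added illustrative sketch -- not part of the original i386.c.  Overriding
   the default 64-bit calling convention per function; names are
   hypothetical.  */

#ifdef __x86_64__
/* On a SysV system, force the Microsoft convention: the first four integer
   arguments go in %rcx, %rdx, %r8, %r9 and the caller reserves a 32-byte
   shadow area (see ix86_reg_parm_stack_space above).  */
long __attribute__((ms_abi))   example_win_style (long a, long b, long c, long d);

/* On a MinGW toolchain, force the System V convention instead.  */
long __attribute__((sysv_abi)) example_sysv_style (long a, long b, long c, long d);
#endif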
8188 static enum calling_abi
8189 ix86_function_abi (const_tree fndecl)
8191 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8194 /* Return SYSV_ABI or MS_ABI, depending on cfun,
8195 specifying the call ABI used. */
8196 enum calling_abi
8197 ix86_cfun_abi (void)
8199 return cfun ? cfun->machine->call_abi : ix86_abi;
8202 static bool
8203 ix86_function_ms_hook_prologue (const_tree fn)
8205 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8207 if (decl_function_context (fn) != NULL_TREE)
8208 error_at (DECL_SOURCE_LOCATION (fn),
8209 "ms_hook_prologue is not compatible with nested function");
8210 else
8211 return true;
8213 return false;
8216 /* Write the extra assembler code needed to declare a function properly. */
8218 void
8219 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8220 tree decl)
8222 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8224 if (is_ms_hook)
8226 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8227 unsigned int filler_cc = 0xcccccccc;
8229 for (i = 0; i < filler_count; i += 4)
8230 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8233 #ifdef SUBTARGET_ASM_UNWIND_INIT
8234 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8235 #endif
8237 ASM_OUTPUT_LABEL (asm_out_file, fname);
8239 /* Output magic byte marker, if hot-patch attribute is set. */
8240 if (is_ms_hook)
8242 if (TARGET_64BIT)
8244 /* leaq [%rsp + 0], %rsp */
8245 asm_fprintf (asm_out_file, ASM_BYTE
8246 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
8248 else
8250 /* movl.s %edi, %edi
8251 push %ebp
8252 movl.s %esp, %ebp */
8253 asm_fprintf (asm_out_file, ASM_BYTE
8254 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
8259 /* regclass.c */
8260 extern void init_regs (void);
8262 /* Implementation of the call ABI switching target hook. The call
8263 register sets specific to FNDECL are established. See also
8264 ix86_conditional_register_usage for more details. */
8265 void
8266 ix86_call_abi_override (const_tree fndecl)
8268 cfun->machine->call_abi = ix86_function_abi (fndecl);
8271 /* Return true if a pseudo register should be created and used to hold
8272 the GOT address for PIC code. */
8273 bool
8274 ix86_use_pseudo_pic_reg (void)
8276 if ((TARGET_64BIT
8277 && (ix86_cmodel == CM_SMALL_PIC
8278 || TARGET_PECOFF))
8279 || !flag_pic)
8280 return false;
8281 return true;
8284 /* Initialize large model PIC register. */
8286 static void
8287 ix86_init_large_pic_reg (unsigned int tmp_regno)
8289 rtx_code_label *label;
8290 rtx tmp_reg;
8292 gcc_assert (Pmode == DImode);
8293 label = gen_label_rtx ();
8294 emit_label (label);
8295 LABEL_PRESERVE_P (label) = 1;
8296 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8297 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8298 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8299 label));
8300 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8301 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8302 pic_offset_table_rtx, tmp_reg));
8305 /* Create and initialize PIC register if required. */
8306 static void
8307 ix86_init_pic_reg (void)
8309 edge entry_edge;
8310 rtx_insn *seq;
8312 if (!ix86_use_pseudo_pic_reg ())
8313 return;
8315 start_sequence ();
8317 if (TARGET_64BIT)
8319 if (ix86_cmodel == CM_LARGE_PIC)
8320 ix86_init_large_pic_reg (R11_REG);
8321 else
8322 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8324 else
8326 /* If there is a future mcount call in the function, it is more profitable
8327 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8328 rtx reg = crtl->profile
8329 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8330 : pic_offset_table_rtx;
8331 rtx_insn *insn = emit_insn (gen_set_got (reg));
8332 RTX_FRAME_RELATED_P (insn) = 1;
8333 if (crtl->profile)
8334 emit_move_insn (pic_offset_table_rtx, reg);
8335 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8338 seq = get_insns ();
8339 end_sequence ();
8341 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8342 insert_insn_on_edge (seq, entry_edge);
8343 commit_one_edge_insertion (entry_edge);
8346 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8347 for a call to a function whose data type is FNTYPE.
8348 For a library call, FNTYPE is 0. */
8350 void
8351 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8352 tree fntype, /* tree ptr for function decl */
8353 rtx libname, /* SYMBOL_REF of library name or 0 */
8354 tree fndecl,
8355 int caller)
8357 struct cgraph_local_info *i = NULL;
8358 struct cgraph_node *target = NULL;
8360 memset (cum, 0, sizeof (*cum));
8362 if (fndecl)
8364 target = cgraph_node::get (fndecl);
8365 if (target)
8367 target = target->function_symbol ();
8368 i = cgraph_node::local_info (target->decl);
8369 cum->call_abi = ix86_function_abi (target->decl);
8371 else
8372 cum->call_abi = ix86_function_abi (fndecl);
8374 else
8375 cum->call_abi = ix86_function_type_abi (fntype);
8377 cum->caller = caller;
8379 /* Set up the number of registers to use for passing arguments. */
8380 cum->nregs = ix86_regparm;
8381 if (TARGET_64BIT)
8383 cum->nregs = (cum->call_abi == SYSV_ABI
8384 ? X86_64_REGPARM_MAX
8385 : X86_64_MS_REGPARM_MAX);
8387 if (TARGET_SSE)
8389 cum->sse_nregs = SSE_REGPARM_MAX;
8390 if (TARGET_64BIT)
8392 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8393 ? X86_64_SSE_REGPARM_MAX
8394 : X86_64_MS_SSE_REGPARM_MAX);
8397 if (TARGET_MMX)
8398 cum->mmx_nregs = MMX_REGPARM_MAX;
8399 cum->warn_avx512f = true;
8400 cum->warn_avx = true;
8401 cum->warn_sse = true;
8402 cum->warn_mmx = true;
8404 /* Because the type might mismatch between caller and callee, we need to
8405 use the actual type of the function for local calls.
8406 FIXME: cgraph_analyze can be told to actually record if a function uses
8407 va_start, so for local functions maybe_vaarg can be made more aggressive,
8408 helping K&R code.
8409 FIXME: once the type system is fixed, we won't need this code anymore. */
8410 if (i && i->local && i->can_change_signature)
8411 fntype = TREE_TYPE (target->decl);
8412 cum->stdarg = stdarg_p (fntype);
8413 cum->maybe_vaarg = (fntype
8414 ? (!prototype_p (fntype) || stdarg_p (fntype))
8415 : !libname);
8417 cum->bnd_regno = FIRST_BND_REG;
8418 cum->bnds_in_bt = 0;
8419 cum->force_bnd_pass = 0;
8420 cum->decl = fndecl;
8422 if (!TARGET_64BIT)
8424 /* If there are variable arguments, then we won't pass anything
8425 in registers in 32-bit mode. */
8426 if (stdarg_p (fntype))
8428 cum->nregs = 0;
8429 /* Since in 32-bit mode variable arguments are always passed on the
8430 stack, there is a scratch register available for an indirect
8431 sibcall. */
8432 cfun->machine->arg_reg_available = true;
8433 cum->sse_nregs = 0;
8434 cum->mmx_nregs = 0;
8435 cum->warn_avx512f = false;
8436 cum->warn_avx = false;
8437 cum->warn_sse = false;
8438 cum->warn_mmx = false;
8439 return;
8442 /* Use ecx and edx registers if function has fastcall attribute,
8443 else look for regparm information. */
8444 if (fntype)
8446 unsigned int ccvt = ix86_get_callcvt (fntype);
8447 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8449 cum->nregs = 1;
8450 cum->fastcall = 1; /* Same first register as in fastcall. */
8452 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8454 cum->nregs = 2;
8455 cum->fastcall = 1;
8457 else
8458 cum->nregs = ix86_function_regparm (fntype, fndecl);
8461 /* Set up the number of SSE registers used for passing SFmode
8462 and DFmode arguments. Warn for mismatching ABI. */
8463 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
8466 cfun->machine->arg_reg_available = (cum->nregs > 0);
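/* Illustrative sketch of the 32-bit conventions selected above (assumes the
   standard ia32 ABI; none of these declarations come from this file):

       int __attribute__ ((regparm (3))) f (int a, int b, int c);
           /* a in %eax, b in %edx, c in %ecx   (cum->nregs == 3)  */
       int __attribute__ ((fastcall)) g (int a, int b, int c);
           /* a in %ecx, b in %edx, c on the stack   (cum->nregs == 2)  */
       int __attribute__ ((thiscall)) h (void *obj, int a);
           /* obj in %ecx, a on the stack   (cum->nregs == 1)  */

   Variadic 32-bit functions fall back to passing everything on the stack,
   which is why cum->nregs is cleared for stdarg_p (fntype) above.  */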
8469 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
8470 But in the case of vector types, it is some vector mode.
8472 When we have only some of our vector isa extensions enabled, then there
8473 are some modes for which vector_mode_supported_p is false. For these
8474 modes, the generic vector support in gcc will choose some non-vector mode
8475 in order to implement the type. By computing the natural mode, we'll
8476 select the proper ABI location for the operand and not depend on whatever
8477 the middle-end decides to do with these vector types.
8479 The middle-end can't deal with vector types larger than 16 bytes. In this
8480 case, we return the original mode and warn about the ABI change if CUM isn't
8481 NULL.
8483 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
8484 available for function return value. */
8486 static machine_mode
8487 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
8488 bool in_return)
8490 machine_mode mode = TYPE_MODE (type);
8492 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
8494 HOST_WIDE_INT size = int_size_in_bytes (type);
8495 if ((size == 8 || size == 16 || size == 32 || size == 64)
8496 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
8497 && TYPE_VECTOR_SUBPARTS (type) > 1)
8499 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
8501 /* There are no XFmode vector modes. */
8502 if (innermode == XFmode)
8503 return mode;
8505 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
8506 mode = MIN_MODE_VECTOR_FLOAT;
8507 else
8508 mode = MIN_MODE_VECTOR_INT;
8510 /* Get the mode which has this inner mode and number of units. */
8511 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
8512 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
8513 && GET_MODE_INNER (mode) == innermode)
8515 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
8517 static bool warnedavx512f;
8518 static bool warnedavx512f_ret;
8520 if (cum && cum->warn_avx512f && !warnedavx512f)
8522 if (warning (OPT_Wpsabi, "AVX512F vector argument "
8523 "without AVX512F enabled changes the ABI"))
8524 warnedavx512f = true;
8526 else if (in_return && !warnedavx512f_ret)
8528 if (warning (OPT_Wpsabi, "AVX512F vector return "
8529 "without AVX512F enabled changes the ABI"))
8530 warnedavx512f_ret = true;
8533 return TYPE_MODE (type);
8535 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
8537 static bool warnedavx;
8538 static bool warnedavx_ret;
8540 if (cum && cum->warn_avx && !warnedavx)
8542 if (warning (OPT_Wpsabi, "AVX vector argument "
8543 "without AVX enabled changes the ABI"))
8544 warnedavx = true;
8546 else if (in_return && !warnedavx_ret)
8548 if (warning (OPT_Wpsabi, "AVX vector return "
8549 "without AVX enabled changes the ABI"))
8550 warnedavx_ret = true;
8553 return TYPE_MODE (type);
8555 else if (((size == 8 && TARGET_64BIT) || size == 16)
8556 && !TARGET_SSE
8557 && !TARGET_IAMCU)
8559 static bool warnedsse;
8560 static bool warnedsse_ret;
8562 if (cum && cum->warn_sse && !warnedsse)
8564 if (warning (OPT_Wpsabi, "SSE vector argument "
8565 "without SSE enabled changes the ABI"))
8566 warnedsse = true;
8568 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
8570 if (warning (OPT_Wpsabi, "SSE vector return "
8571 "without SSE enabled changes the ABI"))
8572 warnedsse_ret = true;
8575 else if ((size == 8 && !TARGET_64BIT)
8576 && (!cfun
8577 || cfun->machine->func_type == TYPE_NORMAL)
8578 && !TARGET_MMX
8579 && !TARGET_IAMCU)
8581 static bool warnedmmx;
8582 static bool warnedmmx_ret;
8584 if (cum && cum->warn_mmx && !warnedmmx)
8586 if (warning (OPT_Wpsabi, "MMX vector argument "
8587 "without MMX enabled changes the ABI"))
8588 warnedmmx = true;
8590 else if (in_return && !warnedmmx_ret)
8592 if (warning (OPT_Wpsabi, "MMX vector return "
8593 "without MMX enabled changes the ABI"))
8594 warnedmmx_ret = true;
8597 return mode;
8600 gcc_unreachable ();
8604 return mode;
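/* Illustrative sketch (the typedef below is hypothetical, not from this
   file): given

       typedef int v8si __attribute__ ((vector_size (32)));
       void callee (v8si x);

   compiled with -mavx, the loop above finds V8SImode and the argument is
   passed in a YMM register.  Compiled without -mavx, the code instead
   returns TYPE_MODE (type) and, with -Wpsabi, emits "AVX vector argument
   without AVX enabled changes the ABI", since the value is then passed in
   memory.  */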
8607 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
8608 this may not agree with the mode that the type system has chosen for the
8609 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
8610 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
8612 static rtx
8613 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
8614 unsigned int regno)
8616 rtx tmp;
8618 if (orig_mode != BLKmode)
8619 tmp = gen_rtx_REG (orig_mode, regno);
8620 else
8622 tmp = gen_rtx_REG (mode, regno);
8623 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
8624 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
8627 return tmp;
8630 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
8631 of this code is to classify each 8bytes of incoming argument by the register
8632 class and assign registers accordingly. */
8634 /* Return the union class of CLASS1 and CLASS2.
8635 See the x86-64 PS ABI for details. */
8637 static enum x86_64_reg_class
8638 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
8640 /* Rule #1: If both classes are equal, this is the resulting class. */
8641 if (class1 == class2)
8642 return class1;
8644 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
8645 the other class. */
8646 if (class1 == X86_64_NO_CLASS)
8647 return class2;
8648 if (class2 == X86_64_NO_CLASS)
8649 return class1;
8651 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
8652 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
8653 return X86_64_MEMORY_CLASS;
8655 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
8656 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
8657 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
8658 return X86_64_INTEGERSI_CLASS;
8659 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
8660 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
8661 return X86_64_INTEGER_CLASS;
8663 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
8664 MEMORY is used. */
8665 if (class1 == X86_64_X87_CLASS
8666 || class1 == X86_64_X87UP_CLASS
8667 || class1 == X86_64_COMPLEX_X87_CLASS
8668 || class2 == X86_64_X87_CLASS
8669 || class2 == X86_64_X87UP_CLASS
8670 || class2 == X86_64_COMPLEX_X87_CLASS)
8671 return X86_64_MEMORY_CLASS;
8673 /* Rule #6: Otherwise class SSE is used. */
8674 return X86_64_SSE_CLASS;
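/* Worked example of the merge rules (the struct is hypothetical, not part
   of this file):

       struct s { float f; int i; };      /* 8 bytes -> one eightbyte  */

   The float field classifies as X86_64_SSESF_CLASS and the int field as
   X86_64_INTEGER_CLASS; rule #4 merges them to X86_64_INTEGER_CLASS, so
   the whole struct travels in a single general-purpose register rather
   than an XMM register.  */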
8677 /* Classify the argument of type TYPE and mode MODE.
8678 CLASSES will be filled by the register class used to pass each word
8679 of the operand. The number of words is returned. In case the parameter
8680 should be passed in memory, 0 is returned. As a special case for zero
8681 sized containers, classes[0] will be NO_CLASS and 1 is returned.
8683 BIT_OFFSET is used internally for handling records and specifies the
8684 offset in bits modulo 512 to avoid overflow cases.
8686 See the x86-64 PS ABI for details.
8689 static int
8690 classify_argument (machine_mode mode, const_tree type,
8691 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
8693 HOST_WIDE_INT bytes =
8694 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8695 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
8697 /* Variable sized entities are always passed/returned in memory. */
8698 if (bytes < 0)
8699 return 0;
8701 if (mode != VOIDmode
8702 && targetm.calls.must_pass_in_stack (mode, type))
8703 return 0;
8705 if (type && AGGREGATE_TYPE_P (type))
8707 int i;
8708 tree field;
8709 enum x86_64_reg_class subclasses[MAX_CLASSES];
8711 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
8712 if (bytes > 64)
8713 return 0;
8715 for (i = 0; i < words; i++)
8716 classes[i] = X86_64_NO_CLASS;
8718 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
8719 signal the memory class, so handle it as a special case. */
8720 if (!words)
8722 classes[0] = X86_64_NO_CLASS;
8723 return 1;
8726 /* Classify each field of record and merge classes. */
8727 switch (TREE_CODE (type))
8729 case RECORD_TYPE:
8730 /* And now merge the fields of structure. */
8731 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8733 if (TREE_CODE (field) == FIELD_DECL)
8735 int num;
8737 if (TREE_TYPE (field) == error_mark_node)
8738 continue;
8740 /* Bitfields are always classified as integer. Handle them
8741 early, since later code would consider them to be
8742 misaligned integers. */
8743 if (DECL_BIT_FIELD (field))
8745 for (i = (int_bit_position (field)
8746 + (bit_offset % 64)) / 8 / 8;
8747 i < ((int_bit_position (field) + (bit_offset % 64))
8748 + tree_to_shwi (DECL_SIZE (field))
8749 + 63) / 8 / 8; i++)
8750 classes[i] =
8751 merge_classes (X86_64_INTEGER_CLASS,
8752 classes[i]);
8754 else
8756 int pos;
8758 type = TREE_TYPE (field);
8760 /* Flexible array member is ignored. */
8761 if (TYPE_MODE (type) == BLKmode
8762 && TREE_CODE (type) == ARRAY_TYPE
8763 && TYPE_SIZE (type) == NULL_TREE
8764 && TYPE_DOMAIN (type) != NULL_TREE
8765 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
8766 == NULL_TREE))
8768 static bool warned;
8770 if (!warned && warn_psabi)
8772 warned = true;
8773 inform (input_location,
8774 "the ABI of passing struct with"
8775 " a flexible array member has"
8776 " changed in GCC 4.4");
8778 continue;
8780 num = classify_argument (TYPE_MODE (type), type,
8781 subclasses,
8782 (int_bit_position (field)
8783 + bit_offset) % 512);
8784 if (!num)
8785 return 0;
8786 pos = (int_bit_position (field)
8787 + (bit_offset % 64)) / 8 / 8;
8788 for (i = 0; i < num && (i + pos) < words; i++)
8789 classes[i + pos] =
8790 merge_classes (subclasses[i], classes[i + pos]);
8794 break;
8796 case ARRAY_TYPE:
8797 /* Arrays are handled as small records. */
8799 int num;
8800 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
8801 TREE_TYPE (type), subclasses, bit_offset);
8802 if (!num)
8803 return 0;
8805 /* The partial classes are now full classes. */
8806 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
8807 subclasses[0] = X86_64_SSE_CLASS;
8808 if (subclasses[0] == X86_64_INTEGERSI_CLASS
8809 && !((bit_offset % 64) == 0 && bytes == 4))
8810 subclasses[0] = X86_64_INTEGER_CLASS;
8812 for (i = 0; i < words; i++)
8813 classes[i] = subclasses[i % num];
8815 break;
8817 case UNION_TYPE:
8818 case QUAL_UNION_TYPE:
8819 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
8821 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8823 if (TREE_CODE (field) == FIELD_DECL)
8825 int num;
8827 if (TREE_TYPE (field) == error_mark_node)
8828 continue;
8830 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
8831 TREE_TYPE (field), subclasses,
8832 bit_offset);
8833 if (!num)
8834 return 0;
8835 for (i = 0; i < num && i < words; i++)
8836 classes[i] = merge_classes (subclasses[i], classes[i]);
8839 break;
8841 default:
8842 gcc_unreachable ();
8845 if (words > 2)
8847 /* When the size exceeds 16 bytes, if the first eightbyte isn't
8848 X86_64_SSE_CLASS or any of the other ones isn't
8849 X86_64_SSEUP_CLASS, everything should be passed in
8850 memory. */
8851 if (classes[0] != X86_64_SSE_CLASS)
8852 return 0;
8854 for (i = 1; i < words; i++)
8855 if (classes[i] != X86_64_SSEUP_CLASS)
8856 return 0;
8859 /* Final merger cleanup. */
8860 for (i = 0; i < words; i++)
8862 /* If one class is MEMORY, everything should be passed in
8863 memory. */
8864 if (classes[i] == X86_64_MEMORY_CLASS)
8865 return 0;
8867 /* The X86_64_SSEUP_CLASS should be always preceded by
8868 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
8869 if (classes[i] == X86_64_SSEUP_CLASS
8870 && classes[i - 1] != X86_64_SSE_CLASS
8871 && classes[i - 1] != X86_64_SSEUP_CLASS)
8873 /* The first one should never be X86_64_SSEUP_CLASS. */
8874 gcc_assert (i != 0);
8875 classes[i] = X86_64_SSE_CLASS;
8878 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
8879 everything should be passed in memory. */
8880 if (classes[i] == X86_64_X87UP_CLASS
8881 && (classes[i - 1] != X86_64_X87_CLASS))
8883 static bool warned;
8885 /* The first one should never be X86_64_X87UP_CLASS. */
8886 gcc_assert (i != 0);
8887 if (!warned && warn_psabi)
8889 warned = true;
8890 inform (input_location,
8891 "the ABI of passing union with long double"
8892 " has changed in GCC 4.4");
8894 return 0;
8897 return words;
8900 /* Compute the alignment needed. We align all types to natural boundaries,
8901 with the exception of XFmode, which is aligned to 64 bits. */
8902 if (mode != VOIDmode && mode != BLKmode)
8904 int mode_alignment = GET_MODE_BITSIZE (mode);
8906 if (mode == XFmode)
8907 mode_alignment = 128;
8908 else if (mode == XCmode)
8909 mode_alignment = 256;
8910 if (COMPLEX_MODE_P (mode))
8911 mode_alignment /= 2;
8912 /* Misaligned fields are always returned in memory. */
8913 if (bit_offset % mode_alignment)
8914 return 0;
8917 /* for V1xx modes, just use the base mode */
8918 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
8919 && GET_MODE_UNIT_SIZE (mode) == bytes)
8920 mode = GET_MODE_INNER (mode);
8922 /* Classification of atomic types. */
8923 switch (mode)
8925 case SDmode:
8926 case DDmode:
8927 classes[0] = X86_64_SSE_CLASS;
8928 return 1;
8929 case TDmode:
8930 classes[0] = X86_64_SSE_CLASS;
8931 classes[1] = X86_64_SSEUP_CLASS;
8932 return 2;
8933 case DImode:
8934 case SImode:
8935 case HImode:
8936 case QImode:
8937 case CSImode:
8938 case CHImode:
8939 case CQImode:
8941 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
8943 /* Analyze last 128 bits only. */
8944 size = (size - 1) & 0x7f;
8946 if (size < 32)
8948 classes[0] = X86_64_INTEGERSI_CLASS;
8949 return 1;
8951 else if (size < 64)
8953 classes[0] = X86_64_INTEGER_CLASS;
8954 return 1;
8956 else if (size < 64+32)
8958 classes[0] = X86_64_INTEGER_CLASS;
8959 classes[1] = X86_64_INTEGERSI_CLASS;
8960 return 2;
8962 else if (size < 64+64)
8964 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8965 return 2;
8967 else
8968 gcc_unreachable ();
8970 case CDImode:
8971 case TImode:
8972 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
8973 return 2;
8974 case COImode:
8975 case OImode:
8976 /* OImode shouldn't be used directly. */
8977 gcc_unreachable ();
8978 case CTImode:
8979 return 0;
8980 case SFmode:
8981 if (!(bit_offset % 64))
8982 classes[0] = X86_64_SSESF_CLASS;
8983 else
8984 classes[0] = X86_64_SSE_CLASS;
8985 return 1;
8986 case DFmode:
8987 classes[0] = X86_64_SSEDF_CLASS;
8988 return 1;
8989 case XFmode:
8990 classes[0] = X86_64_X87_CLASS;
8991 classes[1] = X86_64_X87UP_CLASS;
8992 return 2;
8993 case TFmode:
8994 classes[0] = X86_64_SSE_CLASS;
8995 classes[1] = X86_64_SSEUP_CLASS;
8996 return 2;
8997 case SCmode:
8998 classes[0] = X86_64_SSE_CLASS;
8999 if (!(bit_offset % 64))
9000 return 1;
9001 else
9003 static bool warned;
9005 if (!warned && warn_psabi)
9007 warned = true;
9008 inform (input_location,
9009 "the ABI of passing structure with complex float"
9010 " member has changed in GCC 4.4");
9012 classes[1] = X86_64_SSESF_CLASS;
9013 return 2;
9015 case DCmode:
9016 classes[0] = X86_64_SSEDF_CLASS;
9017 classes[1] = X86_64_SSEDF_CLASS;
9018 return 2;
9019 case XCmode:
9020 classes[0] = X86_64_COMPLEX_X87_CLASS;
9021 return 1;
9022 case TCmode:
9023 /* This mode is larger than 16 bytes. */
9024 return 0;
9025 case V8SFmode:
9026 case V8SImode:
9027 case V32QImode:
9028 case V16HImode:
9029 case V4DFmode:
9030 case V4DImode:
9031 classes[0] = X86_64_SSE_CLASS;
9032 classes[1] = X86_64_SSEUP_CLASS;
9033 classes[2] = X86_64_SSEUP_CLASS;
9034 classes[3] = X86_64_SSEUP_CLASS;
9035 return 4;
9036 case V8DFmode:
9037 case V16SFmode:
9038 case V8DImode:
9039 case V16SImode:
9040 case V32HImode:
9041 case V64QImode:
9042 classes[0] = X86_64_SSE_CLASS;
9043 classes[1] = X86_64_SSEUP_CLASS;
9044 classes[2] = X86_64_SSEUP_CLASS;
9045 classes[3] = X86_64_SSEUP_CLASS;
9046 classes[4] = X86_64_SSEUP_CLASS;
9047 classes[5] = X86_64_SSEUP_CLASS;
9048 classes[6] = X86_64_SSEUP_CLASS;
9049 classes[7] = X86_64_SSEUP_CLASS;
9050 return 8;
9051 case V4SFmode:
9052 case V4SImode:
9053 case V16QImode:
9054 case V8HImode:
9055 case V2DFmode:
9056 case V2DImode:
9057 classes[0] = X86_64_SSE_CLASS;
9058 classes[1] = X86_64_SSEUP_CLASS;
9059 return 2;
9060 case V1TImode:
9061 case V1DImode:
9062 case V2SFmode:
9063 case V2SImode:
9064 case V4HImode:
9065 case V8QImode:
9066 classes[0] = X86_64_SSE_CLASS;
9067 return 1;
9068 case BLKmode:
9069 case VOIDmode:
9070 return 0;
9071 default:
9072 gcc_assert (VECTOR_MODE_P (mode));
9074 if (bytes > 16)
9075 return 0;
9077 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9079 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9080 classes[0] = X86_64_INTEGERSI_CLASS;
9081 else
9082 classes[0] = X86_64_INTEGER_CLASS;
9083 classes[1] = X86_64_INTEGER_CLASS;
9084 return 1 + (bytes > 8);
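/* Worked example of the classification above (the struct is hypothetical,
   not part of this file), assuming the SysV x86-64 psABI:

       struct s { double d; int i; };     /* 16 bytes -> 2 eightbytes  */

   classify_argument fills classes[0] = X86_64_SSEDF_CLASS (the double) and
   classes[1] = X86_64_INTEGER_CLASS (the int) and returns 2, so as the
   first argument of a call the struct is split between %xmm0 and %rdi.
   An aggregate larger than 16 bytes whose eightbytes are not all
   SSE/SSEUP makes this function return 0, and the value is passed in
   memory instead.  */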
9088 /* Examine the argument and set the number of registers required in each
9089 class. Return true iff the parameter should be passed in memory. */
9091 static bool
9092 examine_argument (machine_mode mode, const_tree type, int in_return,
9093 int *int_nregs, int *sse_nregs)
9095 enum x86_64_reg_class regclass[MAX_CLASSES];
9096 int n = classify_argument (mode, type, regclass, 0);
9098 *int_nregs = 0;
9099 *sse_nregs = 0;
9101 if (!n)
9102 return true;
9103 for (n--; n >= 0; n--)
9104 switch (regclass[n])
9106 case X86_64_INTEGER_CLASS:
9107 case X86_64_INTEGERSI_CLASS:
9108 (*int_nregs)++;
9109 break;
9110 case X86_64_SSE_CLASS:
9111 case X86_64_SSESF_CLASS:
9112 case X86_64_SSEDF_CLASS:
9113 (*sse_nregs)++;
9114 break;
9115 case X86_64_NO_CLASS:
9116 case X86_64_SSEUP_CLASS:
9117 break;
9118 case X86_64_X87_CLASS:
9119 case X86_64_X87UP_CLASS:
9120 case X86_64_COMPLEX_X87_CLASS:
9121 if (!in_return)
9122 return true;
9123 break;
9124 case X86_64_MEMORY_CLASS:
9125 gcc_unreachable ();
9128 return false;
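/* Illustrative use (hypothetical values, not from this file): for the
   struct { double d; int i; } example above, examine_argument sets
   *int_nregs = 1 and *sse_nregs = 1 and returns false; callers then check
   those counts against cum->nregs and cum->sse_nregs to decide whether
   the argument still fits in registers.  */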
9131 /* Construct container for the argument used by GCC interface. See
9132 FUNCTION_ARG for the detailed description. */
9134 static rtx
9135 construct_container (machine_mode mode, machine_mode orig_mode,
9136 const_tree type, int in_return, int nintregs, int nsseregs,
9137 const int *intreg, int sse_regno)
9139 /* The following variables hold the static issued_error state. */
9140 static bool issued_sse_arg_error;
9141 static bool issued_sse_ret_error;
9142 static bool issued_x87_ret_error;
9144 machine_mode tmpmode;
9145 int bytes =
9146 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9147 enum x86_64_reg_class regclass[MAX_CLASSES];
9148 int n;
9149 int i;
9150 int nexps = 0;
9151 int needed_sseregs, needed_intregs;
9152 rtx exp[MAX_CLASSES];
9153 rtx ret;
9155 n = classify_argument (mode, type, regclass, 0);
9156 if (!n)
9157 return NULL;
9158 if (examine_argument (mode, type, in_return, &needed_intregs,
9159 &needed_sseregs))
9160 return NULL;
9161 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9162 return NULL;
9164 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9165 some less clueful developer tries to use floating-point anyway. */
9166 if (needed_sseregs && !TARGET_SSE)
9168 if (in_return)
9170 if (!issued_sse_ret_error)
9172 error ("SSE register return with SSE disabled");
9173 issued_sse_ret_error = true;
9176 else if (!issued_sse_arg_error)
9178 error ("SSE register argument with SSE disabled");
9179 issued_sse_arg_error = true;
9181 return NULL;
9184 /* Likewise, error if the ABI requires us to return values in the
9185 x87 registers and the user specified -mno-80387. */
9186 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9187 for (i = 0; i < n; i++)
9188 if (regclass[i] == X86_64_X87_CLASS
9189 || regclass[i] == X86_64_X87UP_CLASS
9190 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9192 if (!issued_x87_ret_error)
9194 error ("x87 register return with x87 disabled");
9195 issued_x87_ret_error = true;
9197 return NULL;
9200 /* First construct simple cases. Avoid SCmode, since we want to use a
9201 single register to pass this type. */
9202 if (n == 1 && mode != SCmode)
9203 switch (regclass[0])
9205 case X86_64_INTEGER_CLASS:
9206 case X86_64_INTEGERSI_CLASS:
9207 return gen_rtx_REG (mode, intreg[0]);
9208 case X86_64_SSE_CLASS:
9209 case X86_64_SSESF_CLASS:
9210 case X86_64_SSEDF_CLASS:
9211 if (mode != BLKmode)
9212 return gen_reg_or_parallel (mode, orig_mode,
9213 SSE_REGNO (sse_regno));
9214 break;
9215 case X86_64_X87_CLASS:
9216 case X86_64_COMPLEX_X87_CLASS:
9217 return gen_rtx_REG (mode, FIRST_STACK_REG);
9218 case X86_64_NO_CLASS:
9219 /* Zero sized array, struct or class. */
9220 return NULL;
9221 default:
9222 gcc_unreachable ();
9224 if (n == 2
9225 && regclass[0] == X86_64_SSE_CLASS
9226 && regclass[1] == X86_64_SSEUP_CLASS
9227 && mode != BLKmode)
9228 return gen_reg_or_parallel (mode, orig_mode,
9229 SSE_REGNO (sse_regno));
9230 if (n == 4
9231 && regclass[0] == X86_64_SSE_CLASS
9232 && regclass[1] == X86_64_SSEUP_CLASS
9233 && regclass[2] == X86_64_SSEUP_CLASS
9234 && regclass[3] == X86_64_SSEUP_CLASS
9235 && mode != BLKmode)
9236 return gen_reg_or_parallel (mode, orig_mode,
9237 SSE_REGNO (sse_regno));
9238 if (n == 8
9239 && regclass[0] == X86_64_SSE_CLASS
9240 && regclass[1] == X86_64_SSEUP_CLASS
9241 && regclass[2] == X86_64_SSEUP_CLASS
9242 && regclass[3] == X86_64_SSEUP_CLASS
9243 && regclass[4] == X86_64_SSEUP_CLASS
9244 && regclass[5] == X86_64_SSEUP_CLASS
9245 && regclass[6] == X86_64_SSEUP_CLASS
9246 && regclass[7] == X86_64_SSEUP_CLASS
9247 && mode != BLKmode)
9248 return gen_reg_or_parallel (mode, orig_mode,
9249 SSE_REGNO (sse_regno));
9250 if (n == 2
9251 && regclass[0] == X86_64_X87_CLASS
9252 && regclass[1] == X86_64_X87UP_CLASS)
9253 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9255 if (n == 2
9256 && regclass[0] == X86_64_INTEGER_CLASS
9257 && regclass[1] == X86_64_INTEGER_CLASS
9258 && (mode == CDImode || mode == TImode)
9259 && intreg[0] + 1 == intreg[1])
9260 return gen_rtx_REG (mode, intreg[0]);
9262 /* Otherwise figure out the entries of the PARALLEL. */
9263 for (i = 0; i < n; i++)
9265 int pos;
9267 switch (regclass[i])
9269 case X86_64_NO_CLASS:
9270 break;
9271 case X86_64_INTEGER_CLASS:
9272 case X86_64_INTEGERSI_CLASS:
9273 /* Merge TImodes on aligned occasions here too. */
9274 if (i * 8 + 8 > bytes)
9275 tmpmode
9276 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9277 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9278 tmpmode = SImode;
9279 else
9280 tmpmode = DImode;
9281 /* We've requested 24 bytes for which we
9282 don't have a mode. Use DImode. */
9283 if (tmpmode == BLKmode)
9284 tmpmode = DImode;
9285 exp [nexps++]
9286 = gen_rtx_EXPR_LIST (VOIDmode,
9287 gen_rtx_REG (tmpmode, *intreg),
9288 GEN_INT (i*8));
9289 intreg++;
9290 break;
9291 case X86_64_SSESF_CLASS:
9292 exp [nexps++]
9293 = gen_rtx_EXPR_LIST (VOIDmode,
9294 gen_rtx_REG (SFmode,
9295 SSE_REGNO (sse_regno)),
9296 GEN_INT (i*8));
9297 sse_regno++;
9298 break;
9299 case X86_64_SSEDF_CLASS:
9300 exp [nexps++]
9301 = gen_rtx_EXPR_LIST (VOIDmode,
9302 gen_rtx_REG (DFmode,
9303 SSE_REGNO (sse_regno)),
9304 GEN_INT (i*8));
9305 sse_regno++;
9306 break;
9307 case X86_64_SSE_CLASS:
9308 pos = i;
9309 switch (n)
9311 case 1:
9312 tmpmode = DImode;
9313 break;
9314 case 2:
9315 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9317 tmpmode = TImode;
9318 i++;
9320 else
9321 tmpmode = DImode;
9322 break;
9323 case 4:
9324 gcc_assert (i == 0
9325 && regclass[1] == X86_64_SSEUP_CLASS
9326 && regclass[2] == X86_64_SSEUP_CLASS
9327 && regclass[3] == X86_64_SSEUP_CLASS);
9328 tmpmode = OImode;
9329 i += 3;
9330 break;
9331 case 8:
9332 gcc_assert (i == 0
9333 && regclass[1] == X86_64_SSEUP_CLASS
9334 && regclass[2] == X86_64_SSEUP_CLASS
9335 && regclass[3] == X86_64_SSEUP_CLASS
9336 && regclass[4] == X86_64_SSEUP_CLASS
9337 && regclass[5] == X86_64_SSEUP_CLASS
9338 && regclass[6] == X86_64_SSEUP_CLASS
9339 && regclass[7] == X86_64_SSEUP_CLASS);
9340 tmpmode = XImode;
9341 i += 7;
9342 break;
9343 default:
9344 gcc_unreachable ();
9346 exp [nexps++]
9347 = gen_rtx_EXPR_LIST (VOIDmode,
9348 gen_rtx_REG (tmpmode,
9349 SSE_REGNO (sse_regno)),
9350 GEN_INT (pos*8));
9351 sse_regno++;
9352 break;
9353 default:
9354 gcc_unreachable ();
9358 /* Empty aligned struct, union or class. */
9359 if (nexps == 0)
9360 return NULL;
9362 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9363 for (i = 0; i < nexps; i++)
9364 XVECEXP (ret, 0, i) = exp [i];
9365 return ret;
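/* Illustrative sketch of the PARALLEL built above for the hypothetical
   struct { double d; int i; } used earlier (register choices assume it is
   the first argument of a SysV call); roughly:

       (parallel:BLK [
          (expr_list (reg:DF xmm0) (const_int 0))
          (expr_list (reg:DI di)   (const_int 8)) ])

   i.e. bytes 0-7 travel in %xmm0 and bytes 8-15 in %rdi.  */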
9368 /* Update the data in CUM to advance over an argument of mode MODE
9369 and data type TYPE. (TYPE is null for libcalls where that information
9370 may not be available.)
9372 Return the number of integer registers advanced over. */
9374 static int
9375 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9376 const_tree type, HOST_WIDE_INT bytes,
9377 HOST_WIDE_INT words)
9379 int res = 0;
9380 bool error_p = false;
9382 if (TARGET_IAMCU)
9384 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9385 bytes in registers. */
9386 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9387 goto pass_in_reg;
9388 return res;
9391 switch (mode)
9393 default:
9394 break;
9396 case BLKmode:
9397 if (bytes < 0)
9398 break;
9399 /* FALLTHRU */
9401 case DImode:
9402 case SImode:
9403 case HImode:
9404 case QImode:
9405 pass_in_reg:
9406 cum->words += words;
9407 cum->nregs -= words;
9408 cum->regno += words;
9409 if (cum->nregs >= 0)
9410 res = words;
9411 if (cum->nregs <= 0)
9413 cum->nregs = 0;
9414 cfun->machine->arg_reg_available = false;
9415 cum->regno = 0;
9417 break;
9419 case OImode:
9420 /* OImode shouldn't be used directly. */
9421 gcc_unreachable ();
9423 case DFmode:
9424 if (cum->float_in_sse == -1)
9425 error_p = 1;
9426 if (cum->float_in_sse < 2)
9427 break;
9428 /* FALLTHRU */
9429 case SFmode:
9430 if (cum->float_in_sse == -1)
9431 error_p = 1;
9432 if (cum->float_in_sse < 1)
9433 break;
9434 /* FALLTHRU */
9436 case V8SFmode:
9437 case V8SImode:
9438 case V64QImode:
9439 case V32HImode:
9440 case V16SImode:
9441 case V8DImode:
9442 case V16SFmode:
9443 case V8DFmode:
9444 case V32QImode:
9445 case V16HImode:
9446 case V4DFmode:
9447 case V4DImode:
9448 case TImode:
9449 case V16QImode:
9450 case V8HImode:
9451 case V4SImode:
9452 case V2DImode:
9453 case V4SFmode:
9454 case V2DFmode:
9455 if (!type || !AGGREGATE_TYPE_P (type))
9457 cum->sse_words += words;
9458 cum->sse_nregs -= 1;
9459 cum->sse_regno += 1;
9460 if (cum->sse_nregs <= 0)
9462 cum->sse_nregs = 0;
9463 cum->sse_regno = 0;
9466 break;
9468 case V8QImode:
9469 case V4HImode:
9470 case V2SImode:
9471 case V2SFmode:
9472 case V1TImode:
9473 case V1DImode:
9474 if (!type || !AGGREGATE_TYPE_P (type))
9476 cum->mmx_words += words;
9477 cum->mmx_nregs -= 1;
9478 cum->mmx_regno += 1;
9479 if (cum->mmx_nregs <= 0)
9481 cum->mmx_nregs = 0;
9482 cum->mmx_regno = 0;
9485 break;
9487 if (error_p)
9489 cum->float_in_sse = 0;
9490 error ("calling %qD with SSE calling convention without "
9491 "SSE/SSE2 enabled", cum->decl);
9492 sorry ("this is a GCC bug that can be worked around by adding "
9493 "attribute used to function called");
9496 return res;
9499 static int
9500 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
9501 const_tree type, HOST_WIDE_INT words, bool named)
9503 int int_nregs, sse_nregs;
9505 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
9506 if (!named && (VALID_AVX512F_REG_MODE (mode)
9507 || VALID_AVX256_REG_MODE (mode)))
9508 return 0;
9510 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
9511 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
9513 cum->nregs -= int_nregs;
9514 cum->sse_nregs -= sse_nregs;
9515 cum->regno += int_nregs;
9516 cum->sse_regno += sse_nregs;
9517 return int_nregs;
9519 else
9521 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
9522 cum->words = ROUND_UP (cum->words, align);
9523 cum->words += words;
9524 return 0;
9528 static int
9529 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
9530 HOST_WIDE_INT words)
9532 /* Otherwise, this should be passed indirect. */
9533 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
9535 cum->words += words;
9536 if (cum->nregs > 0)
9538 cum->nregs -= 1;
9539 cum->regno += 1;
9540 return 1;
9542 return 0;
9545 /* Update the data in CUM to advance over an argument of mode MODE and
9546 data type TYPE. (TYPE is null for libcalls where that information
9547 may not be available.) */
9549 static void
9550 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
9551 const_tree type, bool named)
9553 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9554 HOST_WIDE_INT bytes, words;
9555 int nregs;
9557 /* The argument of an interrupt handler is a special case and is
9558 handled in ix86_function_arg. */
9559 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9560 return;
9562 if (mode == BLKmode)
9563 bytes = int_size_in_bytes (type);
9564 else
9565 bytes = GET_MODE_SIZE (mode);
9566 words = CEIL (bytes, UNITS_PER_WORD);
9568 if (type)
9569 mode = type_natural_mode (type, NULL, false);
9571 if ((type && POINTER_BOUNDS_TYPE_P (type))
9572 || POINTER_BOUNDS_MODE_P (mode))
9574 /* If we pass bounds in the Bounds Table, then just update the remaining bounds count. */
9575 if (cum->bnds_in_bt)
9577 cum->bnds_in_bt--;
9578 return;
9582 /* Update the remaining number of bounds to force. */
9582 if (cum->force_bnd_pass)
9583 cum->force_bnd_pass--;
9585 cum->bnd_regno++;
9587 return;
9590 /* The first arg not going to Bounds Tables resets this counter. */
9591 cum->bnds_in_bt = 0;
9592 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
9593 the passed and received types do not match. If bounds do not follow an
9594 unnamed arg, still pretend the required number of bounds were passed. */
9595 if (cum->force_bnd_pass)
9597 cum->bnd_regno += cum->force_bnd_pass;
9598 cum->force_bnd_pass = 0;
9601 if (TARGET_64BIT)
9603 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9605 if (call_abi == MS_ABI)
9606 nregs = function_arg_advance_ms_64 (cum, bytes, words);
9607 else
9608 nregs = function_arg_advance_64 (cum, mode, type, words, named);
9610 else
9611 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
9613 /* For stdarg we expect bounds to be passed for each value passed
9614 in register. */
9615 if (cum->stdarg)
9616 cum->force_bnd_pass = nregs;
9617 /* For pointers passed in memory we expect bounds passed in Bounds
9618 Table. */
9619 if (!nregs)
9620 cum->bnds_in_bt = chkp_type_bounds_count (type);
9623 /* Define where to put the arguments to a function.
9624 Value is zero to push the argument on the stack,
9625 or a hard register in which to store the argument.
9627 MODE is the argument's machine mode.
9628 TYPE is the data type of the argument (as a tree).
9629 This is null for libcalls where that information may
9630 not be available.
9631 CUM is a variable of type CUMULATIVE_ARGS which gives info about
9632 the preceding args and about the function being called.
9633 NAMED is nonzero if this argument is a named parameter
9634 (otherwise it is an extra parameter matching an ellipsis). */
9636 static rtx
9637 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9638 machine_mode orig_mode, const_tree type,
9639 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
9641 bool error_p = false;
9642 /* Avoid the AL settings for the Unix64 ABI. */
9643 if (mode == VOIDmode)
9644 return constm1_rtx;
9646 if (TARGET_IAMCU)
9648 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9649 bytes in registers. */
9650 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9651 goto pass_in_reg;
9652 return NULL_RTX;
9655 switch (mode)
9657 default:
9658 break;
9660 case BLKmode:
9661 if (bytes < 0)
9662 break;
9663 /* FALLTHRU */
9664 case DImode:
9665 case SImode:
9666 case HImode:
9667 case QImode:
9668 pass_in_reg:
9669 if (words <= cum->nregs)
9671 int regno = cum->regno;
9673 /* Fastcall allocates the first two DWORD (SImode) or
9674 smaller arguments to ECX and EDX if the argument isn't an
9675 aggregate type. */
9676 if (cum->fastcall)
9678 if (mode == BLKmode
9679 || mode == DImode
9680 || (type && AGGREGATE_TYPE_P (type)))
9681 break;
9683 /* ECX, not EAX, is the first allocated register. */
9684 if (regno == AX_REG)
9685 regno = CX_REG;
9687 return gen_rtx_REG (mode, regno);
9689 break;
9691 case DFmode:
9692 if (cum->float_in_sse == -1)
9693 error_p = 1;
9694 if (cum->float_in_sse < 2)
9695 break;
9696 /* FALLTHRU */
9697 case SFmode:
9698 if (cum->float_in_sse == -1)
9699 error_p = 1;
9700 if (cum->float_in_sse < 1)
9701 break;
9702 /* FALLTHRU */
9703 case TImode:
9704 /* In 32bit, we pass TImode in xmm registers. */
9705 case V16QImode:
9706 case V8HImode:
9707 case V4SImode:
9708 case V2DImode:
9709 case V4SFmode:
9710 case V2DFmode:
9711 if (!type || !AGGREGATE_TYPE_P (type))
9713 if (cum->sse_nregs)
9714 return gen_reg_or_parallel (mode, orig_mode,
9715 cum->sse_regno + FIRST_SSE_REG);
9717 break;
9719 case OImode:
9720 case XImode:
9721 /* OImode and XImode shouldn't be used directly. */
9722 gcc_unreachable ();
9724 case V64QImode:
9725 case V32HImode:
9726 case V16SImode:
9727 case V8DImode:
9728 case V16SFmode:
9729 case V8DFmode:
9730 case V8SFmode:
9731 case V8SImode:
9732 case V32QImode:
9733 case V16HImode:
9734 case V4DFmode:
9735 case V4DImode:
9736 if (!type || !AGGREGATE_TYPE_P (type))
9738 if (cum->sse_nregs)
9739 return gen_reg_or_parallel (mode, orig_mode,
9740 cum->sse_regno + FIRST_SSE_REG);
9742 break;
9744 case V8QImode:
9745 case V4HImode:
9746 case V2SImode:
9747 case V2SFmode:
9748 case V1TImode:
9749 case V1DImode:
9750 if (!type || !AGGREGATE_TYPE_P (type))
9752 if (cum->mmx_nregs)
9753 return gen_reg_or_parallel (mode, orig_mode,
9754 cum->mmx_regno + FIRST_MMX_REG);
9756 break;
9758 if (error_p)
9760 cum->float_in_sse = 0;
9761 error ("calling %qD with SSE calling convention without "
9762 "SSE/SSE2 enabled", cum->decl);
9763 sorry ("this is a GCC bug that can be worked around by adding "
9764 "attribute used to function called");
9767 return NULL_RTX;
9770 static rtx
9771 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9772 machine_mode orig_mode, const_tree type, bool named)
9774 /* Handle a hidden AL argument containing the number of registers
9775 for varargs x86-64 functions. */
9776 if (mode == VOIDmode)
9777 return GEN_INT (cum->maybe_vaarg
9778 ? (cum->sse_nregs < 0
9779 ? X86_64_SSE_REGPARM_MAX
9780 : cum->sse_regno)
9781 : -1);
9783 switch (mode)
9785 default:
9786 break;
9788 case V8SFmode:
9789 case V8SImode:
9790 case V32QImode:
9791 case V16HImode:
9792 case V4DFmode:
9793 case V4DImode:
9794 case V16SFmode:
9795 case V16SImode:
9796 case V64QImode:
9797 case V32HImode:
9798 case V8DFmode:
9799 case V8DImode:
9800 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9801 if (!named)
9802 return NULL;
9803 break;
9806 return construct_container (mode, orig_mode, type, 0, cum->nregs,
9807 cum->sse_nregs,
9808 &x86_64_int_parameter_registers [cum->regno],
9809 cum->sse_regno);
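/* Illustrative sketch (the call shown is hypothetical): the VOIDmode case
   above implements the SysV requirement that variadic calls report in %al
   how many vector registers carry arguments, e.g.

       printf ("%f\n", 1.5);

   passes 1.5 in %xmm0 and is preceded by "movl $1, %eax", so the callee's
   va_arg machinery knows whether it must spill the XMM save area.  */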
9812 static rtx
9813 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
9814 machine_mode orig_mode, bool named,
9815 HOST_WIDE_INT bytes)
9817 unsigned int regno;
9819 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
9820 We use the value -2 to specify that the current function call is MS ABI. */
9821 if (mode == VOIDmode)
9822 return GEN_INT (-2);
9824 /* If we've run out of registers, it goes on the stack. */
9825 if (cum->nregs == 0)
9826 return NULL_RTX;
9828 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
9830 /* Only floating point modes are passed in anything but integer regs. */
9831 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
9833 if (named)
9834 regno = cum->regno + FIRST_SSE_REG;
9835 else
9837 rtx t1, t2;
9839 /* Unnamed floating parameters are passed in both the
9840 SSE and integer registers. */
9841 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
9842 t2 = gen_rtx_REG (mode, regno);
9843 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
9844 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
9845 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
9848 /* Handle aggregate types passed in registers. */
9849 if (orig_mode == BLKmode)
9851 if (bytes > 0 && bytes <= 8)
9852 mode = (bytes > 4 ? DImode : SImode);
9853 if (mode == BLKmode)
9854 mode = DImode;
9857 return gen_reg_or_parallel (mode, orig_mode, regno);
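/* Illustrative sketch of the MS x64 convention implemented above (the
   prototype is hypothetical):

       void callee (int a, double b, int c, double d);
           /* a in %ecx, b in %xmm1, c in %r8d, d in %xmm3  */

   Register slots are positional, so a double as the second argument uses
   %xmm1 and leaves %rdx unused; for an unnamed (variadic) double the
   PARALLEL above makes the caller load both the XMM register and the
   matching integer register.  */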
9860 /* Return where to put the arguments to a function.
9861 Return zero to push the argument on the stack, or a hard register in which to store the argument.
9863 MODE is the argument's machine mode. TYPE is the data type of the
9864 argument. It is null for libcalls where that information may not be
9865 available. CUM gives information about the preceding args and about
9866 the function being called. NAMED is nonzero if this argument is a
9867 named parameter (otherwise it is an extra parameter matching an
9868 ellipsis). */
9870 static rtx
9871 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
9872 const_tree type, bool named)
9874 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9875 machine_mode mode = omode;
9876 HOST_WIDE_INT bytes, words;
9877 rtx arg;
9879 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
9881 gcc_assert (type != NULL_TREE);
9882 if (POINTER_TYPE_P (type))
9884 /* This is the pointer argument. */
9885 gcc_assert (TYPE_MODE (type) == Pmode);
9886 if (cfun->machine->func_type == TYPE_INTERRUPT)
9887 /* -WORD(AP) in the current frame in interrupt handler. */
9888 arg = plus_constant (Pmode, arg_pointer_rtx,
9889 -UNITS_PER_WORD);
9890 else
9891 /* (AP) in the current frame in exception handler. */
9892 arg = arg_pointer_rtx;
9894 else
9896 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
9897 && TREE_CODE (type) == INTEGER_TYPE
9898 && TYPE_MODE (type) == word_mode);
9899 /* The integer argument is the error code at -WORD(AP) in
9900 the current frame in exception handler. */
9901 arg = gen_rtx_MEM (word_mode,
9902 plus_constant (Pmode,
9903 arg_pointer_rtx,
9904 -UNITS_PER_WORD));
9906 return arg;
9909 /* All pointer bounds arguments are handled separately here. */
9910 if ((type && POINTER_BOUNDS_TYPE_P (type))
9911 || POINTER_BOUNDS_MODE_P (mode))
9913 /* Return NULL if bounds are forced to go in Bounds Table. */
9914 if (cum->bnds_in_bt)
9915 arg = NULL;
9916 /* Return the next available bound reg if any. */
9917 else if (cum->bnd_regno <= LAST_BND_REG)
9918 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
9919 /* Return the next special slot number otherwise. */
9920 else
9921 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
9923 return arg;
9926 if (mode == BLKmode)
9927 bytes = int_size_in_bytes (type);
9928 else
9929 bytes = GET_MODE_SIZE (mode);
9930 words = CEIL (bytes, UNITS_PER_WORD);
9932 /* To simplify the code below, represent vector types with a vector mode
9933 even if MMX/SSE are not active. */
9934 if (type && TREE_CODE (type) == VECTOR_TYPE)
9935 mode = type_natural_mode (type, cum, false);
9937 if (TARGET_64BIT)
9939 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9941 if (call_abi == MS_ABI)
9942 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
9943 else
9944 arg = function_arg_64 (cum, mode, omode, type, named);
9946 else
9947 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
9949 return arg;
9952 /* A C expression that indicates when an argument must be passed by
9953 reference. If nonzero for an argument, a copy of that argument is
9954 made in memory and a pointer to the argument is passed instead of
9955 the argument itself. The pointer is passed in whatever way is
9956 appropriate for passing a pointer to that type. */
9958 static bool
9959 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
9960 const_tree type, bool)
9962 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9964 /* Bounds are never passed by reference. */
9965 if ((type && POINTER_BOUNDS_TYPE_P (type))
9966 || POINTER_BOUNDS_MODE_P (mode))
9967 return false;
9969 if (TARGET_64BIT)
9971 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
9973 /* See Windows x64 Software Convention. */
9974 if (call_abi == MS_ABI)
9976 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
9978 if (type)
9980 /* Arrays are passed by reference. */
9981 if (TREE_CODE (type) == ARRAY_TYPE)
9982 return true;
9984 if (RECORD_OR_UNION_TYPE_P (type))
9986 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
9987 are passed by reference. */
9988 msize = int_size_in_bytes (type);
9992 /* __m128 is passed by reference. */
9993 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
9995 else if (type && int_size_in_bytes (type) == -1)
9996 return true;
9999 return false;
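/* Illustrative sketch (hypothetical types) of the MS-ABI rules above:

       struct s3 { char c[3]; };     /* 3 bytes  -> passed by reference  */
       struct s8 { double d; };      /* 8 bytes  -> passed by value in a register  */
       __m128 v;                     /* 16 bytes -> passed by reference  */

   Under the SysV ABI nothing here is passed by reference; large aggregates
   are instead forced to memory by the classification code above.  */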
10002 /* Return true when TYPE should be 128bit aligned for 32bit argument
10003 passing ABI. XXX: This function is obsolete and is only used for
10004 checking psABI compatibility with previous versions of GCC. */
10006 static bool
10007 ix86_compat_aligned_value_p (const_tree type)
10009 machine_mode mode = TYPE_MODE (type);
10010 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10011 || mode == TDmode
10012 || mode == TFmode
10013 || mode == TCmode)
10014 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10015 return true;
10016 if (TYPE_ALIGN (type) < 128)
10017 return false;
10019 if (AGGREGATE_TYPE_P (type))
10021 /* Walk the aggregates recursively. */
10022 switch (TREE_CODE (type))
10024 case RECORD_TYPE:
10025 case UNION_TYPE:
10026 case QUAL_UNION_TYPE:
10028 tree field;
10030 /* Walk all the structure fields. */
10031 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10033 if (TREE_CODE (field) == FIELD_DECL
10034 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10035 return true;
10037 break;
10040 case ARRAY_TYPE:
10041 /* Just for use if some language passes arrays by value. */
10042 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10043 return true;
10044 break;
10046 default:
10047 gcc_unreachable ();
10050 return false;
10053 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10054 XXX: This function is obsolete and is only used for checking psABI
10055 compatibility with previous versions of GCC. */
10057 static unsigned int
10058 ix86_compat_function_arg_boundary (machine_mode mode,
10059 const_tree type, unsigned int align)
10061 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10062 natural boundaries. */
10063 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10065 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10066 make an exception for SSE modes since these require 128bit
10067 alignment.
10069 The handling here differs from field_alignment. ICC aligns MMX
10070 arguments to 4 byte boundaries, while structure fields are aligned
10071 to 8 byte boundaries. */
10072 if (!type)
10074 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10075 align = PARM_BOUNDARY;
10077 else
10079 if (!ix86_compat_aligned_value_p (type))
10080 align = PARM_BOUNDARY;
10083 if (align > BIGGEST_ALIGNMENT)
10084 align = BIGGEST_ALIGNMENT;
10085 return align;
10088 /* Return true when TYPE should be 128bit aligned for 32bit argument
10089 passing ABI. */
10091 static bool
10092 ix86_contains_aligned_value_p (const_tree type)
10094 machine_mode mode = TYPE_MODE (type);
10096 if (mode == XFmode || mode == XCmode)
10097 return false;
10099 if (TYPE_ALIGN (type) < 128)
10100 return false;
10102 if (AGGREGATE_TYPE_P (type))
10104 /* Walk the aggregates recursively. */
10105 switch (TREE_CODE (type))
10107 case RECORD_TYPE:
10108 case UNION_TYPE:
10109 case QUAL_UNION_TYPE:
10111 tree field;
10113 /* Walk all the structure fields. */
10114 for (field = TYPE_FIELDS (type);
10115 field;
10116 field = DECL_CHAIN (field))
10118 if (TREE_CODE (field) == FIELD_DECL
10119 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10120 return true;
10122 break;
10125 case ARRAY_TYPE:
10126 /* Just for use if some language passes arrays by value. */
10127 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10128 return true;
10129 break;
10131 default:
10132 gcc_unreachable ();
10135 else
10136 return TYPE_ALIGN (type) >= 128;
10138 return false;
10141 /* Gives the alignment boundary, in bits, of an argument with the
10142 specified mode and type. */
10144 static unsigned int
10145 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10147 unsigned int align;
10148 if (type)
10150 /* Since the main variant type is used for the call, convert the type
10151 to its main variant. */
10152 type = TYPE_MAIN_VARIANT (type);
10153 align = TYPE_ALIGN (type);
10155 else
10156 align = GET_MODE_ALIGNMENT (mode);
10157 if (align < PARM_BOUNDARY)
10158 align = PARM_BOUNDARY;
10159 else
10161 static bool warned;
10162 unsigned int saved_align = align;
10164 if (!TARGET_64BIT)
10166 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10167 if (!type)
10169 if (mode == XFmode || mode == XCmode)
10170 align = PARM_BOUNDARY;
10172 else if (!ix86_contains_aligned_value_p (type))
10173 align = PARM_BOUNDARY;
10175 if (align < 128)
10176 align = PARM_BOUNDARY;
10179 if (warn_psabi
10180 && !warned
10181 && align != ix86_compat_function_arg_boundary (mode, type,
10182 saved_align))
10184 warned = true;
10185 inform (input_location,
10186 "The ABI for passing parameters with %d-byte"
10187 " alignment has changed in GCC 4.6",
10188 align / BITS_PER_UNIT);
10192 return align;
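/* Illustrative sketch (the typedef is hypothetical, not from this file):
   on ia32 an argument of type

       typedef float v4sf __attribute__ ((vector_size (16)));

   keeps its 128-bit alignment here, while a plain `double' argument is
   dropped back to the 32-bit PARM_BOUNDARY; the inform call above notes
   cases where this answer differs from what GCC releases before 4.6
   returned.  */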
10195 /* Return true if N is a possible register number of function value. */
10197 static bool
10198 ix86_function_value_regno_p (const unsigned int regno)
10200 switch (regno)
10202 case AX_REG:
10203 return true;
10204 case DX_REG:
10205 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10206 case DI_REG:
10207 case SI_REG:
10208 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10210 case BND0_REG:
10211 case BND1_REG:
10212 return chkp_function_instrumented_p (current_function_decl);
10214 /* Complex values are returned in %st(0)/%st(1) pair. */
10215 case ST0_REG:
10216 case ST1_REG:
10217 /* TODO: The function should depend on current function ABI but
10218 builtins.c would need updating then. Therefore we use the
10219 default ABI. */
10220 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10221 return false;
10222 return TARGET_FLOAT_RETURNS_IN_80387;
10224 /* Complex values are returned in %xmm0/%xmm1 pair. */
10225 case XMM0_REG:
10226 case XMM1_REG:
10227 return TARGET_SSE;
10229 case MM0_REG:
10230 if (TARGET_MACHO || TARGET_64BIT)
10231 return false;
10232 return TARGET_MMX;
10235 return false;
10238 /* Define how to find the value returned by a function.
10239 VALTYPE is the data type of the value (as a tree).
10240 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10241 otherwise, FUNC is 0. */
10243 static rtx
10244 function_value_32 (machine_mode orig_mode, machine_mode mode,
10245 const_tree fntype, const_tree fn)
10247 unsigned int regno;
10249 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10250 we normally prevent this case when mmx is not available. However
10251 some ABIs may require the result to be returned like DImode. */
10252 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10253 regno = FIRST_MMX_REG;
10255 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10256 we prevent this case when sse is not available. However some ABIs
10257 may require the result to be returned like integer TImode. */
10258 else if (mode == TImode
10259 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10260 regno = FIRST_SSE_REG;
10262 /* 32-byte vector modes in %ymm0. */
10263 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10264 regno = FIRST_SSE_REG;
10266 /* 64-byte vector modes in %zmm0. */
10267 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10268 regno = FIRST_SSE_REG;
10270 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10271 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10272 regno = FIRST_FLOAT_REG;
10273 else
10274 /* Most things go in %eax. */
10275 regno = AX_REG;
10277 /* Override FP return register with %xmm0 for local functions when
10278 SSE math is enabled or for functions with sseregparm attribute. */
10279 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10281 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10282 if (sse_level == -1)
10284 error ("calling %qD with SSE caling convention without "
10285 "SSE/SSE2 enabled", fn);
10286 sorry ("this is a GCC bug that can be worked around by adding "
10287 "attribute used to function called");
10289 else if ((sse_level >= 1 && mode == SFmode)
10290 || (sse_level == 2 && mode == DFmode))
10291 regno = FIRST_SSE_REG;
10294 /* OImode shouldn't be used directly. */
10295 gcc_assert (mode != OImode);
10297 return gen_rtx_REG (orig_mode, regno);
10300 static rtx
10301 function_value_64 (machine_mode orig_mode, machine_mode mode,
10302 const_tree valtype)
10304 rtx ret;
10306 /* Handle libcalls, which don't provide a type node. */
10307 if (valtype == NULL)
10309 unsigned int regno;
10311 switch (mode)
10313 case SFmode:
10314 case SCmode:
10315 case DFmode:
10316 case DCmode:
10317 case TFmode:
10318 case SDmode:
10319 case DDmode:
10320 case TDmode:
10321 regno = FIRST_SSE_REG;
10322 break;
10323 case XFmode:
10324 case XCmode:
10325 regno = FIRST_FLOAT_REG;
10326 break;
10327 case TCmode:
10328 return NULL;
10329 default:
10330 regno = AX_REG;
10333 return gen_rtx_REG (mode, regno);
10335 else if (POINTER_TYPE_P (valtype))
10337 /* Pointers are always returned in word_mode. */
10338 mode = word_mode;
10341 ret = construct_container (mode, orig_mode, valtype, 1,
10342 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10343 x86_64_int_return_registers, 0);
10345 /* For zero-sized structures, construct_container returns NULL, but we
10346 need to keep the rest of the compiler happy by returning a meaningful value. */
10347 if (!ret)
10348 ret = gen_rtx_REG (orig_mode, AX_REG);
10350 return ret;
10353 static rtx
10354 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10355 const_tree valtype)
10357 unsigned int regno = AX_REG;
10359 if (TARGET_SSE)
10361 switch (GET_MODE_SIZE (mode))
10363 case 16:
10364 if (valtype != NULL_TREE
10365 && !VECTOR_INTEGER_TYPE_P (valtype)
10366 && !VECTOR_INTEGER_TYPE_P (valtype)
10367 && !INTEGRAL_TYPE_P (valtype)
10368 && !VECTOR_FLOAT_TYPE_P (valtype))
10369 break;
10370 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10371 && !COMPLEX_MODE_P (mode))
10372 regno = FIRST_SSE_REG;
10373 break;
10374 case 8:
10375 case 4:
10376 if (mode == SFmode || mode == DFmode)
10377 regno = FIRST_SSE_REG;
10378 break;
10379 default:
10380 break;
10383 return gen_rtx_REG (orig_mode, regno);
10386 static rtx
10387 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10388 machine_mode orig_mode, machine_mode mode)
10390 const_tree fn, fntype;
10392 fn = NULL_TREE;
10393 if (fntype_or_decl && DECL_P (fntype_or_decl))
10394 fn = fntype_or_decl;
10395 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10397 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10398 || POINTER_BOUNDS_MODE_P (mode))
10399 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10400 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10401 return function_value_ms_64 (orig_mode, mode, valtype);
10402 else if (TARGET_64BIT)
10403 return function_value_64 (orig_mode, mode, valtype);
10404 else
10405 return function_value_32 (orig_mode, mode, fntype, fn);
10408 static rtx
10409 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10411 machine_mode mode, orig_mode;
10413 orig_mode = TYPE_MODE (valtype);
10414 mode = type_natural_mode (valtype, NULL, true);
10415 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10418 /* Return an RTX representing a place where a function returns
10419 or receives pointer bounds, or NULL if no bounds are returned.
10421 VALTYPE is a data type of a value returned by the function.
10423 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10424 or FUNCTION_TYPE of the function.
10426 If OUTGOING is false, return a place in which the caller will
10427 see the return value. Otherwise, return a place where a
10428 function returns a value. */
10430 static rtx
10431 ix86_function_value_bounds (const_tree valtype,
10432 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10433 bool outgoing ATTRIBUTE_UNUSED)
10435 rtx res = NULL_RTX;
10437 if (BOUNDED_TYPE_P (valtype))
10438 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10439 else if (chkp_type_has_pointer (valtype))
10441 bitmap slots;
10442 rtx bounds[2];
10443 bitmap_iterator bi;
10444 unsigned i, bnd_no = 0;
10446 bitmap_obstack_initialize (NULL);
10447 slots = BITMAP_ALLOC (NULL);
10448 chkp_find_bound_slots (valtype, slots);
10450 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
10452 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
10453 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
10454 gcc_assert (bnd_no < 2);
10455 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
10458 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
10460 BITMAP_FREE (slots);
10461 bitmap_obstack_release (NULL);
10463 else
10464 res = NULL_RTX;
10466 return res;
10469 /* Pointer function arguments and return values are promoted to
10470 word_mode for normal functions. */
10472 static machine_mode
10473 ix86_promote_function_mode (const_tree type, machine_mode mode,
10474 int *punsignedp, const_tree fntype,
10475 int for_return)
10477 if (cfun->machine->func_type == TYPE_NORMAL
10478 && type != NULL_TREE
10479 && POINTER_TYPE_P (type))
10481 *punsignedp = POINTERS_EXTEND_UNSIGNED;
10482 return word_mode;
10484 return default_promote_function_mode (type, mode, punsignedp, fntype,
10485 for_return);
10488 /* Return true if a structure, union or array with MODE containing FIELD
10489 should be accessed using BLKmode. */
10491 static bool
10492 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
10494 /* Union with XFmode must be in BLKmode. */
10495 return (mode == XFmode
10496 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
10497 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
10500 static rtx
10501 ix86_libcall_value (machine_mode mode)
10503 return ix86_function_value_1 (NULL, NULL, mode, mode);
10506 /* Return true iff type is returned in memory. */
10508 static bool
10509 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
10511 #ifdef SUBTARGET_RETURN_IN_MEMORY
10512 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
10513 #else
10514 const machine_mode mode = type_natural_mode (type, NULL, true);
10515 HOST_WIDE_INT size;
10517 if (POINTER_BOUNDS_TYPE_P (type))
10518 return false;
10520 if (TARGET_64BIT)
10522 if (ix86_function_type_abi (fntype) == MS_ABI)
10524 size = int_size_in_bytes (type);
10526 /* __m128 is returned in xmm0. */
10527 if ((!type || VECTOR_INTEGER_TYPE_P (type)
10528 || INTEGRAL_TYPE_P (type)
10529 || VECTOR_FLOAT_TYPE_P (type))
10530 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10531 && !COMPLEX_MODE_P (mode)
10532 && (GET_MODE_SIZE (mode) == 16 || size == 16))
10533 return false;
10535 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
10536 return size != 1 && size != 2 && size != 4 && size != 8;
10538 else
10540 int needed_intregs, needed_sseregs;
10542 return examine_argument (mode, type, 1,
10543 &needed_intregs, &needed_sseregs);
10546 else
10548 size = int_size_in_bytes (type);
10550 /* Intel MCU psABI returns scalars and aggregates no larger than 8
10551 bytes in registers. */
10552 if (TARGET_IAMCU)
10553 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
10555 if (mode == BLKmode)
10556 return true;
10558 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
10559 return false;
10561 if (VECTOR_MODE_P (mode) || mode == TImode)
10563 /* User-created vectors small enough to fit in EAX. */
10564 if (size < 8)
10565 return false;
10567 /* Unless the ABI prescribes otherwise,
10568 MMX/3dNow values are returned in MM0 if available. */
10570 if (size == 8)
10571 return TARGET_VECT8_RETURNS || !TARGET_MMX;
10573 /* SSE values are returned in XMM0 if available. */
10574 if (size == 16)
10575 return !TARGET_SSE;
10577 /* AVX values are returned in YMM0 if available. */
10578 if (size == 32)
10579 return !TARGET_AVX;
10581 /* AVX512F values are returned in ZMM0 if available. */
10582 if (size == 64)
10583 return !TARGET_AVX512F;
10586 if (mode == XFmode)
10587 return false;
10589 if (size > 12)
10590 return true;
10592 /* OImode shouldn't be used directly. */
10593 gcc_assert (mode != OImode);
10595 return false;
10597 #endif
10601 /* Create the va_list data type. */
10603 static tree
10604 ix86_build_builtin_va_list_64 (void)
10606 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
10608 record = lang_hooks.types.make_type (RECORD_TYPE);
10609 type_decl = build_decl (BUILTINS_LOCATION,
10610 TYPE_DECL, get_identifier ("__va_list_tag"), record);
10612 f_gpr = build_decl (BUILTINS_LOCATION,
10613 FIELD_DECL, get_identifier ("gp_offset"),
10614 unsigned_type_node);
10615 f_fpr = build_decl (BUILTINS_LOCATION,
10616 FIELD_DECL, get_identifier ("fp_offset"),
10617 unsigned_type_node);
10618 f_ovf = build_decl (BUILTINS_LOCATION,
10619 FIELD_DECL, get_identifier ("overflow_arg_area"),
10620 ptr_type_node);
10621 f_sav = build_decl (BUILTINS_LOCATION,
10622 FIELD_DECL, get_identifier ("reg_save_area"),
10623 ptr_type_node);
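/* Illustration: the record built here corresponds to the SysV AMD64
   psABI va_list element, roughly

     struct __va_list_tag {
       unsigned int gp_offset;       (0 .. 48, bumped by 8 per GPR consumed)
       unsigned int fp_offset;       (48 .. 176, bumped by 16 per SSE reg)
       void *overflow_arg_area;      (next stack-passed argument)
       void *reg_save_area;          (start of the register save area)
     };

   assuming the standard limits of 6 integer and 8 SSE argument registers. */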
10625 va_list_gpr_counter_field = f_gpr;
10626 va_list_fpr_counter_field = f_fpr;
10628 DECL_FIELD_CONTEXT (f_gpr) = record;
10629 DECL_FIELD_CONTEXT (f_fpr) = record;
10630 DECL_FIELD_CONTEXT (f_ovf) = record;
10631 DECL_FIELD_CONTEXT (f_sav) = record;
10633 TYPE_STUB_DECL (record) = type_decl;
10634 TYPE_NAME (record) = type_decl;
10635 TYPE_FIELDS (record) = f_gpr;
10636 DECL_CHAIN (f_gpr) = f_fpr;
10637 DECL_CHAIN (f_fpr) = f_ovf;
10638 DECL_CHAIN (f_ovf) = f_sav;
10640 layout_type (record);
10642 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
10643 NULL_TREE, TYPE_ATTRIBUTES (record));
10645 /* The correct type is an array type of one element. */
10646 return build_array_type (record, build_index_type (size_zero_node));
10649 /* Setup the builtin va_list data type and for 64-bit the additional
10650 calling convention specific va_list data types. */
10652 static tree
10653 ix86_build_builtin_va_list (void)
10655 if (TARGET_64BIT)
10657 /* Initialize ABI specific va_list builtin types.
10659 In lto1, we can encounter two va_list types:
10660 - one as a result of the type-merge across TUs, and
10661 - the one constructed here.
10662 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
10663 a type identity check in canonical_va_list_type based on
10664 TYPE_MAIN_VARIANT (which we used to have) will not work.
10665 Instead, we tag each va_list_type_node with its unique attribute, and
10666 look for the attribute in the type identity check in
10667 canonical_va_list_type.
10669 Tagging sysv_va_list_type_node directly with the attribute is
10670 problematic since it's an array of one record, which degrades into a
10671 pointer to the record when used as a parameter (see build_va_arg comments
10672 for an example), dropping the attribute in the process. So we tag the
10673 record instead. */
10675 /* For SYSV_ABI we use an array of one record. */
10676 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
10678 /* For MS_ABI we use plain pointer to argument area. */
10679 tree char_ptr_type = build_pointer_type (char_type_node);
10680 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
10681 TYPE_ATTRIBUTES (char_ptr_type));
10682 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
10684 return ((ix86_abi == MS_ABI)
10685 ? ms_va_list_type_node
10686 : sysv_va_list_type_node);
10688 else
10690 /* For i386 we use plain pointer to argument area. */
10691 return build_pointer_type (char_type_node);
10695 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
10697 static void
10698 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
10700 rtx save_area, mem;
10701 alias_set_type set;
10702 int i, max;
10704 /* GPR size of varargs save area. */
10705 if (cfun->va_list_gpr_size)
10706 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
10707 else
10708 ix86_varargs_gpr_size = 0;
10710 /* FPR size of varargs save area. We don't need it if we don't pass
10711 anything in SSE registers. */
10712 if (TARGET_SSE && cfun->va_list_fpr_size)
10713 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
10714 else
10715 ix86_varargs_fpr_size = 0;
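/* With the standard SysV limits (X86_64_REGPARM_MAX == 6,
   X86_64_SSE_REGPARM_MAX == 8) this gives a save area of at most
   6 * 8 == 48 bytes of GPR slots followed by 8 * 16 == 128 bytes of
   SSE slots, 176 bytes in total. */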
10717 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
10718 return;
10720 save_area = frame_pointer_rtx;
10721 set = get_varargs_alias_set ();
10723 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10724 if (max > X86_64_REGPARM_MAX)
10725 max = X86_64_REGPARM_MAX;
10727 for (i = cum->regno; i < max; i++)
10729 mem = gen_rtx_MEM (word_mode,
10730 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
10731 MEM_NOTRAP_P (mem) = 1;
10732 set_mem_alias_set (mem, set);
10733 emit_move_insn (mem,
10734 gen_rtx_REG (word_mode,
10735 x86_64_int_parameter_registers[i]));
10738 if (ix86_varargs_fpr_size)
10740 machine_mode smode;
10741 rtx_code_label *label;
10742 rtx test;
10744 /* Now emit code to save SSE registers. The AX parameter contains number
10745 of SSE parameter registers used to call this function, though all we
10746 actually check here is the zero/non-zero status. */
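/* Illustration: the SysV ABI requires the caller to set %al to an upper
   bound on the number of vector registers actually used, so e.g. a call
   such as printf ("%f", x) is made with %al >= 1, while a call passing no
   floating point arguments may be made with %al == 0, letting us skip the
   SSE saves entirely. */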
10748 label = gen_label_rtx ();
10749 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
10750 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
10751 label));
10753 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
10754 we used movdqa (i.e. TImode) instead? Perhaps even better would
10755 be if we could determine the real mode of the data, via a hook
10756 into pass_stdarg. Ignore all that for now. */
10757 smode = V4SFmode;
10758 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
10759 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
10761 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
10762 if (max > X86_64_SSE_REGPARM_MAX)
10763 max = X86_64_SSE_REGPARM_MAX;
10765 for (i = cum->sse_regno; i < max; ++i)
10767 mem = plus_constant (Pmode, save_area,
10768 i * 16 + ix86_varargs_gpr_size);
10769 mem = gen_rtx_MEM (smode, mem);
10770 MEM_NOTRAP_P (mem) = 1;
10771 set_mem_alias_set (mem, set);
10772 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
10774 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
10777 emit_label (label);
10781 static void
10782 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
10784 alias_set_type set = get_varargs_alias_set ();
10785 int i;
10787 /* Reset to zero, as a sysv va_arg might have been used
10788 before. */
10789 ix86_varargs_gpr_size = 0;
10790 ix86_varargs_fpr_size = 0;
10792 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
10794 rtx reg, mem;
10796 mem = gen_rtx_MEM (Pmode,
10797 plus_constant (Pmode, virtual_incoming_args_rtx,
10798 i * UNITS_PER_WORD));
10799 MEM_NOTRAP_P (mem) = 1;
10800 set_mem_alias_set (mem, set);
10802 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
10803 emit_move_insn (mem, reg);
10807 static void
10808 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10809 tree type, int *, int no_rtl)
10811 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10812 CUMULATIVE_ARGS next_cum;
10813 tree fntype;
10815 /* This argument doesn't appear to be used anymore. Which is good,
10816 because the old code here didn't suppress rtl generation. */
10817 gcc_assert (!no_rtl);
10819 if (!TARGET_64BIT)
10820 return;
10822 fntype = TREE_TYPE (current_function_decl);
10824 /* For varargs, we do not want to skip the dummy va_dcl argument.
10825 For stdargs, we do want to skip the last named argument. */
10826 next_cum = *cum;
10827 if (stdarg_p (fntype))
10828 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10829 true);
10831 if (cum->call_abi == MS_ABI)
10832 setup_incoming_varargs_ms_64 (&next_cum);
10833 else
10834 setup_incoming_varargs_64 (&next_cum);
10837 static void
10838 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
10839 enum machine_mode mode,
10840 tree type,
10841 int *pretend_size ATTRIBUTE_UNUSED,
10842 int no_rtl)
10844 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10845 CUMULATIVE_ARGS next_cum;
10846 tree fntype;
10847 rtx save_area;
10848 int bnd_reg, i, max;
10850 gcc_assert (!no_rtl);
10852 /* Do nothing if we use plain pointer to argument area. */
10853 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
10854 return;
10856 fntype = TREE_TYPE (current_function_decl);
10858 /* For varargs, we do not want to skip the dummy va_dcl argument.
10859 For stdargs, we do want to skip the last named argument. */
10860 next_cum = *cum;
10861 if (stdarg_p (fntype))
10862 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
10863 true);
10864 save_area = frame_pointer_rtx;
10866 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
10867 if (max > X86_64_REGPARM_MAX)
10868 max = X86_64_REGPARM_MAX;
10870 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
10871 if (chkp_function_instrumented_p (current_function_decl))
10872 for (i = cum->regno; i < max; i++)
10874 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
10875 rtx ptr = gen_rtx_REG (Pmode,
10876 x86_64_int_parameter_registers[i]);
10877 rtx bounds;
10879 if (bnd_reg <= LAST_BND_REG)
10880 bounds = gen_rtx_REG (BNDmode, bnd_reg);
10881 else
10883 rtx ldx_addr =
10884 plus_constant (Pmode, arg_pointer_rtx,
10885 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
10886 bounds = gen_reg_rtx (BNDmode);
10887 emit_insn (BNDmode == BND64mode
10888 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
10889 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
10892 emit_insn (BNDmode == BND64mode
10893 ? gen_bnd64_stx (addr, ptr, bounds)
10894 : gen_bnd32_stx (addr, ptr, bounds));
10896 bnd_reg++;
10901 /* Checks if TYPE is of kind va_list char *. */
10903 static bool
10904 is_va_list_char_pointer (tree type)
10906 tree canonic;
10908 /* For 32-bit it is always true. */
10909 if (!TARGET_64BIT)
10910 return true;
10911 canonic = ix86_canonical_va_list_type (type);
10912 return (canonic == ms_va_list_type_node
10913 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
10916 /* Implement va_start. */
10918 static void
10919 ix86_va_start (tree valist, rtx nextarg)
10921 HOST_WIDE_INT words, n_gpr, n_fpr;
10922 tree f_gpr, f_fpr, f_ovf, f_sav;
10923 tree gpr, fpr, ovf, sav, t;
10924 tree type;
10925 rtx ovf_rtx;
10927 if (flag_split_stack
10928 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10930 unsigned int scratch_regno;
10932 /* When we are splitting the stack, we can't refer to the stack
10933 arguments using internal_arg_pointer, because they may be on
10934 the old stack. The split stack prologue will arrange to
10935 leave a pointer to the old stack arguments in a scratch
10936 register, which we here copy to a pseudo-register. The split
10937 stack prologue can't set the pseudo-register directly because
10938 it (the prologue) runs before any registers have been saved. */
10940 scratch_regno = split_stack_prologue_scratch_regno ();
10941 if (scratch_regno != INVALID_REGNUM)
10943 rtx reg;
10944 rtx_insn *seq;
10946 reg = gen_reg_rtx (Pmode);
10947 cfun->machine->split_stack_varargs_pointer = reg;
10949 start_sequence ();
10950 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
10951 seq = get_insns ();
10952 end_sequence ();
10954 push_topmost_sequence ();
10955 emit_insn_after (seq, entry_of_function ());
10956 pop_topmost_sequence ();
10960 /* Only 64bit target needs something special. */
10961 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10963 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
10964 std_expand_builtin_va_start (valist, nextarg);
10965 else
10967 rtx va_r, next;
10969 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
10970 next = expand_binop (ptr_mode, add_optab,
10971 cfun->machine->split_stack_varargs_pointer,
10972 crtl->args.arg_offset_rtx,
10973 NULL_RTX, 0, OPTAB_LIB_WIDEN);
10974 convert_move (va_r, next, 0);
10976 /* Store zero bounds for va_list. */
10977 if (chkp_function_instrumented_p (current_function_decl))
10978 chkp_expand_bounds_reset_for_mem (valist,
10979 make_tree (TREE_TYPE (valist),
10980 next));
10983 return;
10986 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10987 f_fpr = DECL_CHAIN (f_gpr);
10988 f_ovf = DECL_CHAIN (f_fpr);
10989 f_sav = DECL_CHAIN (f_ovf);
10991 valist = build_simple_mem_ref (valist);
10992 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
10993 /* The following should be folded into the MEM_REF offset. */
10994 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
10995 f_gpr, NULL_TREE);
10996 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
10997 f_fpr, NULL_TREE);
10998 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
10999 f_ovf, NULL_TREE);
11000 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11001 f_sav, NULL_TREE);
11003 /* Count number of gp and fp argument registers used. */
11004 words = crtl->args.info.words;
11005 n_gpr = crtl->args.info.regno;
11006 n_fpr = crtl->args.info.sse_regno;
11008 if (cfun->va_list_gpr_size)
11010 type = TREE_TYPE (gpr);
11011 t = build2 (MODIFY_EXPR, type,
11012 gpr, build_int_cst (type, n_gpr * 8));
11013 TREE_SIDE_EFFECTS (t) = 1;
11014 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11017 if (TARGET_SSE && cfun->va_list_fpr_size)
11019 type = TREE_TYPE (fpr);
11020 t = build2 (MODIFY_EXPR, type, fpr,
11021 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11022 TREE_SIDE_EFFECTS (t) = 1;
11023 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
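/* Illustration: for a prototype such as f (int a, double b, ...),
   n_gpr == 1 and n_fpr == 1, so gp_offset starts at 8 and fp_offset at
   8 * X86_64_REGPARM_MAX + 16 == 64 with the standard limit of 6 GPRs;
   va_arg then begins with the second integer and second SSE register. */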
11026 /* Find the overflow area. */
11027 type = TREE_TYPE (ovf);
11028 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11029 ovf_rtx = crtl->args.internal_arg_pointer;
11030 else
11031 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11032 t = make_tree (type, ovf_rtx);
11033 if (words != 0)
11034 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11036 /* Store zero bounds for overflow area pointer. */
11037 if (chkp_function_instrumented_p (current_function_decl))
11038 chkp_expand_bounds_reset_for_mem (ovf, t);
11040 t = build2 (MODIFY_EXPR, type, ovf, t);
11041 TREE_SIDE_EFFECTS (t) = 1;
11042 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11044 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11046 /* Find the register save area.
11047 The function prologue saves it right above the stack frame. */
11048 type = TREE_TYPE (sav);
11049 t = make_tree (type, frame_pointer_rtx);
11050 if (!ix86_varargs_gpr_size)
11051 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11053 /* Store zero bounds for save area pointer. */
11054 if (chkp_function_instrumented_p (current_function_decl))
11055 chkp_expand_bounds_reset_for_mem (sav, t);
11057 t = build2 (MODIFY_EXPR, type, sav, t);
11058 TREE_SIDE_EFFECTS (t) = 1;
11059 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11063 /* Implement va_arg. */
11065 static tree
11066 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11067 gimple_seq *post_p)
11069 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11070 tree f_gpr, f_fpr, f_ovf, f_sav;
11071 tree gpr, fpr, ovf, sav, t;
11072 int size, rsize;
11073 tree lab_false, lab_over = NULL_TREE;
11074 tree addr, t2;
11075 rtx container;
11076 int indirect_p = 0;
11077 tree ptrtype;
11078 machine_mode nat_mode;
11079 unsigned int arg_boundary;
11081 /* Only 64bit target needs something special. */
11082 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11083 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11085 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11086 f_fpr = DECL_CHAIN (f_gpr);
11087 f_ovf = DECL_CHAIN (f_fpr);
11088 f_sav = DECL_CHAIN (f_ovf);
11090 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11091 valist, f_gpr, NULL_TREE);
11093 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11094 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11095 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11097 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11098 if (indirect_p)
11099 type = build_pointer_type (type);
11100 size = int_size_in_bytes (type);
11101 rsize = CEIL (size, UNITS_PER_WORD);
11103 nat_mode = type_natural_mode (type, NULL, false);
11104 switch (nat_mode)
11106 case V8SFmode:
11107 case V8SImode:
11108 case V32QImode:
11109 case V16HImode:
11110 case V4DFmode:
11111 case V4DImode:
11112 case V16SFmode:
11113 case V16SImode:
11114 case V64QImode:
11115 case V32HImode:
11116 case V8DFmode:
11117 case V8DImode:
11118 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
11119 if (!TARGET_64BIT_MS_ABI)
11121 container = NULL;
11122 break;
11124 /* FALLTHRU */
11126 default:
11127 container = construct_container (nat_mode, TYPE_MODE (type),
11128 type, 0, X86_64_REGPARM_MAX,
11129 X86_64_SSE_REGPARM_MAX, intreg,
11131 break;
11134 /* Pull the value out of the saved registers. */
11136 addr = create_tmp_var (ptr_type_node, "addr");
11138 if (container)
11140 int needed_intregs, needed_sseregs;
11141 bool need_temp;
11142 tree int_addr, sse_addr;
11144 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11145 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11147 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11149 need_temp = (!REG_P (container)
11150 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11151 || TYPE_ALIGN (type) > 128));
11153 /* In case we are passing a structure, verify that it occupies a consecutive
11154 block of the register save area. If not, we need to do moves. */
11155 if (!need_temp && !REG_P (container))
11157 /* Verify that all registers are strictly consecutive */
11158 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11160 int i;
11162 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11164 rtx slot = XVECEXP (container, 0, i);
11165 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11166 || INTVAL (XEXP (slot, 1)) != i * 16)
11167 need_temp = true;
11170 else
11172 int i;
11174 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11176 rtx slot = XVECEXP (container, 0, i);
11177 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11178 || INTVAL (XEXP (slot, 1)) != i * 8)
11179 need_temp = true;
11183 if (!need_temp)
11185 int_addr = addr;
11186 sse_addr = addr;
11188 else
11190 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11191 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11194 /* First ensure that we fit completely in registers. */
11195 if (needed_intregs)
11197 t = build_int_cst (TREE_TYPE (gpr),
11198 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11199 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11200 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11201 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11202 gimplify_and_add (t, pre_p);
11204 if (needed_sseregs)
11206 t = build_int_cst (TREE_TYPE (fpr),
11207 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11208 + X86_64_REGPARM_MAX * 8);
11209 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11210 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11211 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11212 gimplify_and_add (t, pre_p);
11215 /* Compute index to start of area used for integer regs. */
11216 if (needed_intregs)
11218 /* int_addr = gpr + sav; */
11219 t = fold_build_pointer_plus (sav, gpr);
11220 gimplify_assign (int_addr, t, pre_p);
11222 if (needed_sseregs)
11224 /* sse_addr = fpr + sav; */
11225 t = fold_build_pointer_plus (sav, fpr);
11226 gimplify_assign (sse_addr, t, pre_p);
11228 if (need_temp)
11230 int i, prev_size = 0;
11231 tree temp = create_tmp_var (type, "va_arg_tmp");
11233 /* addr = &temp; */
11234 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11235 gimplify_assign (addr, t, pre_p);
11237 for (i = 0; i < XVECLEN (container, 0); i++)
11239 rtx slot = XVECEXP (container, 0, i);
11240 rtx reg = XEXP (slot, 0);
11241 machine_mode mode = GET_MODE (reg);
11242 tree piece_type;
11243 tree addr_type;
11244 tree daddr_type;
11245 tree src_addr, src;
11246 int src_offset;
11247 tree dest_addr, dest;
11248 int cur_size = GET_MODE_SIZE (mode);
11250 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11251 prev_size = INTVAL (XEXP (slot, 1));
11252 if (prev_size + cur_size > size)
11254 cur_size = size - prev_size;
11255 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11256 if (mode == BLKmode)
11257 mode = QImode;
11259 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11260 if (mode == GET_MODE (reg))
11261 addr_type = build_pointer_type (piece_type);
11262 else
11263 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11264 true);
11265 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11266 true);
11268 if (SSE_REGNO_P (REGNO (reg)))
11270 src_addr = sse_addr;
11271 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11273 else
11275 src_addr = int_addr;
11276 src_offset = REGNO (reg) * 8;
11278 src_addr = fold_convert (addr_type, src_addr);
11279 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11281 dest_addr = fold_convert (daddr_type, addr);
11282 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11283 if (cur_size == GET_MODE_SIZE (mode))
11285 src = build_va_arg_indirect_ref (src_addr);
11286 dest = build_va_arg_indirect_ref (dest_addr);
11288 gimplify_assign (dest, src, pre_p);
11290 else
11292 tree copy
11293 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11294 3, dest_addr, src_addr,
11295 size_int (cur_size));
11296 gimplify_and_add (copy, pre_p);
11298 prev_size += cur_size;
11302 if (needed_intregs)
11304 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11305 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11306 gimplify_assign (gpr, t, pre_p);
11309 if (needed_sseregs)
11311 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11312 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11313 gimplify_assign (unshare_expr (fpr), t, pre_p);
11316 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11318 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11321 /* ... otherwise out of the overflow area. */
11323 /* When the caller aligns a parameter on the stack and the requested
11324 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, the parameter is
11325 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
11326 with the caller. */
11327 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11328 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11329 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11331 /* Care for on-stack alignment if needed. */
11332 if (arg_boundary <= 64 || size == 0)
11333 t = ovf;
11334 else
11336 HOST_WIDE_INT align = arg_boundary / 8;
11337 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11338 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11339 build_int_cst (TREE_TYPE (t), -align));
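/* This rounds the overflow pointer up to the next multiple of ALIGN:
   e.g. for a 32-byte aligned type and an overflow area currently at
   address 0x...c8, t becomes (0x...c8 + 31) & -32 == 0x...e0. */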
11342 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11343 gimplify_assign (addr, t, pre_p);
11345 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11346 gimplify_assign (unshare_expr (ovf), t, pre_p);
11348 if (container)
11349 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11351 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11352 addr = fold_convert (ptrtype, addr);
11354 if (indirect_p)
11355 addr = build_va_arg_indirect_ref (addr);
11356 return build_va_arg_indirect_ref (addr);
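/* Illustrative sketch (pseudo-C, assuming the standard 6-GPR limit) of the
   code the function above emits for a small integer argument:

     if (ap->gp_offset >= 6 * 8)
       goto lab_false;
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;
     goto lab_over;
   lab_false:
     addr = ap->overflow_arg_area;       (after alignment rounding, if any)
     ap->overflow_arg_area = addr + 8;
   lab_over:
     result = *(int *) addr;  */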
11359 /* Return true if OPNUM's MEM should be matched
11360 in movabs* patterns. */
11362 bool
11363 ix86_check_movabs (rtx insn, int opnum)
11365 rtx set, mem;
11367 set = PATTERN (insn);
11368 if (GET_CODE (set) == PARALLEL)
11369 set = XVECEXP (set, 0, 0);
11370 gcc_assert (GET_CODE (set) == SET);
11371 mem = XEXP (set, opnum);
11372 while (SUBREG_P (mem))
11373 mem = SUBREG_REG (mem);
11374 gcc_assert (MEM_P (mem));
11375 return volatile_ok || !MEM_VOLATILE_P (mem);
11378 /* Return false if INSN contains a MEM with a non-default address space. */
11379 bool
11380 ix86_check_no_addr_space (rtx insn)
11382 subrtx_var_iterator::array_type array;
11383 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11385 rtx x = *iter;
11386 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11387 return false;
11389 return true;
11392 /* Initialize the table of extra 80387 mathematical constants. */
11394 static void
11395 init_ext_80387_constants (void)
11397 static const char * cst[5] =
11399 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11400 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11401 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11402 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11403 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11405 int i;
11407 for (i = 0; i < 5; i++)
11409 real_from_string (&ext_80387_constants_table[i], cst[i]);
11410 /* Ensure each constant is rounded to XFmode precision. */
11411 real_convert (&ext_80387_constants_table[i],
11412 XFmode, &ext_80387_constants_table[i]);
11415 ext_80387_constants_init = 1;
11418 /* Return non-zero if the constant is something that
11419 can be loaded with a special instruction. */
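/* The returned value encodes the load strategy: -1 for "not an x87
   constant", 0 for "no special instruction", 1 for fldz, 2 for fld1,
   3..7 for the fldlg2/fldln2/fldl2e/fldl2t/fldpi table below, and 8/9
   for -0.0 and -1.0, which are loaded as fldz;fchs and fld1;fchs. */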
11422 standard_80387_constant_p (rtx x)
11424 machine_mode mode = GET_MODE (x);
11426 const REAL_VALUE_TYPE *r;
11428 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11429 return -1;
11431 if (x == CONST0_RTX (mode))
11432 return 1;
11433 if (x == CONST1_RTX (mode))
11434 return 2;
11436 r = CONST_DOUBLE_REAL_VALUE (x);
11438 /* For XFmode constants, try to find a special 80387 instruction when
11439 optimizing for size or on those CPUs that benefit from them. */
11440 if (mode == XFmode
11441 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11443 int i;
11445 if (! ext_80387_constants_init)
11446 init_ext_80387_constants ();
11448 for (i = 0; i < 5; i++)
11449 if (real_identical (r, &ext_80387_constants_table[i]))
11450 return i + 3;
11453 /* Load of the constant -0.0 or -1.0 will be split as
11454 fldz;fchs or fld1;fchs sequence. */
11455 if (real_isnegzero (r))
11456 return 8;
11457 if (real_identical (r, &dconstm1))
11458 return 9;
11460 return 0;
11463 /* Return the opcode of the special instruction to be used to load
11464 the constant X. */
11466 const char *
11467 standard_80387_constant_opcode (rtx x)
11469 switch (standard_80387_constant_p (x))
11471 case 1:
11472 return "fldz";
11473 case 2:
11474 return "fld1";
11475 case 3:
11476 return "fldlg2";
11477 case 4:
11478 return "fldln2";
11479 case 5:
11480 return "fldl2e";
11481 case 6:
11482 return "fldl2t";
11483 case 7:
11484 return "fldpi";
11485 case 8:
11486 case 9:
11487 return "#";
11488 default:
11489 gcc_unreachable ();
11493 /* Return the CONST_DOUBLE representing the 80387 constant that is
11494 loaded by the specified special instruction. The argument IDX
11495 matches the return value from standard_80387_constant_p. */
11498 standard_80387_constant_rtx (int idx)
11500 int i;
11502 if (! ext_80387_constants_init)
11503 init_ext_80387_constants ();
11505 switch (idx)
11507 case 3:
11508 case 4:
11509 case 5:
11510 case 6:
11511 case 7:
11512 i = idx - 3;
11513 break;
11515 default:
11516 gcc_unreachable ();
11519 return const_double_from_real_value (ext_80387_constants_table[i],
11520 XFmode);
11523 /* Return 1 if X is all zero bits and 2 if X is all one bits
11524 in a supported SSE/AVX vector mode; return 0 otherwise. */
11527 standard_sse_constant_p (rtx x, machine_mode pred_mode)
11529 machine_mode mode;
11531 if (!TARGET_SSE)
11532 return 0;
11534 mode = GET_MODE (x);
11536 if (x == const0_rtx || const0_operand (x, mode))
11537 return 1;
11539 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11541 /* VOIDmode integer constant, get mode from the predicate. */
11542 if (mode == VOIDmode)
11543 mode = pred_mode;
11545 switch (GET_MODE_SIZE (mode))
11547 case 64:
11548 if (TARGET_AVX512F)
11549 return 2;
11550 break;
11551 case 32:
11552 if (TARGET_AVX2)
11553 return 2;
11554 break;
11555 case 16:
11556 if (TARGET_SSE2)
11557 return 2;
11558 break;
11559 case 0:
11560 /* VOIDmode */
11561 gcc_unreachable ();
11562 default:
11563 break;
11567 return 0;
11570 /* Return the opcode of the special instruction to be used to load
11571 the constant X. */
11573 const char *
11574 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
11576 machine_mode mode;
11578 gcc_assert (TARGET_SSE);
11580 mode = GET_MODE (x);
11582 if (x == const0_rtx || const0_operand (x, mode))
11584 switch (get_attr_mode (insn))
11586 case MODE_XI:
11587 return "vpxord\t%g0, %g0, %g0";
11588 case MODE_OI:
11589 return (TARGET_AVX512VL
11590 ? "vpxord\t%x0, %x0, %x0"
11591 : "vpxor\t%x0, %x0, %x0");
11592 case MODE_TI:
11593 return (TARGET_AVX512VL
11594 ? "vpxord\t%t0, %t0, %t0"
11595 : "%vpxor\t%0, %d0");
11597 case MODE_V8DF:
11598 return (TARGET_AVX512DQ
11599 ? "vxorpd\t%g0, %g0, %g0"
11600 : "vpxorq\t%g0, %g0, %g0");
11601 case MODE_V4DF:
11602 return "vxorpd\t%x0, %x0, %x0";
11603 case MODE_V2DF:
11604 return "%vxorpd\t%0, %d0";
11606 case MODE_V16SF:
11607 return (TARGET_AVX512DQ
11608 ? "vxorps\t%g0, %g0, %g0"
11609 : "vpxord\t%g0, %g0, %g0");
11610 case MODE_V8SF:
11611 return "vxorps\t%x0, %x0, %x0";
11612 case MODE_V4SF:
11613 return "%vxorps\t%0, %d0";
11615 default:
11616 gcc_unreachable ();
11619 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
11621 enum attr_mode insn_mode = get_attr_mode (insn);
11623 switch (insn_mode)
11625 case MODE_XI:
11626 case MODE_V8DF:
11627 case MODE_V16SF:
11628 gcc_assert (TARGET_AVX512F);
11629 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
11631 case MODE_OI:
11632 case MODE_V4DF:
11633 case MODE_V8SF:
11634 gcc_assert (TARGET_AVX2);
11635 /* FALLTHRU */
11636 case MODE_TI:
11637 case MODE_V2DF:
11638 case MODE_V4SF:
11639 gcc_assert (TARGET_SSE2);
11640 return (TARGET_AVX
11641 ? "vpcmpeqd\t%0, %0, %0"
11642 : "pcmpeqd\t%0, %0");
11644 default:
11645 gcc_unreachable ();
11649 gcc_unreachable ();
11652 /* Returns true if INSN can be transformed from a memory load
11653 to a supported FP constant load. */
11655 bool
11656 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
11658 rtx src = find_constant_src (insn);
11660 gcc_assert (REG_P (dst));
11662 if (src == NULL
11663 || (SSE_REGNO_P (REGNO (dst))
11664 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
11665 || (STACK_REGNO_P (REGNO (dst))
11666 && standard_80387_constant_p (src) < 1))
11667 return false;
11669 return true;
11672 /* Returns true if OP contains a symbol reference */
11674 bool
11675 symbolic_reference_mentioned_p (rtx op)
11677 const char *fmt;
11678 int i;
11680 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
11681 return true;
11683 fmt = GET_RTX_FORMAT (GET_CODE (op));
11684 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
11686 if (fmt[i] == 'E')
11688 int j;
11690 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
11691 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
11692 return true;
11695 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
11696 return true;
11699 return false;
11702 /* Return true if it is appropriate to emit `ret' instructions in the
11703 body of a function. Do this only if the epilogue is simple, needing a
11704 couple of insns. Prior to reloading, we can't tell how many registers
11705 must be saved, so return false then. Return false if there is no frame
11706 marker to de-allocate. */
11708 bool
11709 ix86_can_use_return_insn_p (void)
11711 struct ix86_frame frame;
11713 /* Don't use `ret' instruction in interrupt handler. */
11714 if (! reload_completed
11715 || frame_pointer_needed
11716 || cfun->machine->func_type != TYPE_NORMAL)
11717 return 0;
11719 /* Don't allow more than 32k pop, since that's all we can do
11720 with one instruction. */
11721 if (crtl->args.pops_args && crtl->args.size >= 32768)
11722 return 0;
11724 ix86_compute_frame_layout (&frame);
11725 return (frame.stack_pointer_offset == UNITS_PER_WORD
11726 && (frame.nregs + frame.nsseregs) == 0);
11729 /* Value should be nonzero if functions must have frame pointers.
11730 Zero means the frame pointer need not be set up (and parms may
11731 be accessed via the stack pointer) in functions that seem suitable. */
11733 static bool
11734 ix86_frame_pointer_required (void)
11736 /* If we accessed previous frames, then the generated code expects
11737 to be able to access the saved ebp value in our frame. */
11738 if (cfun->machine->accesses_prev_frame)
11739 return true;
11741 /* Several x86 OSes need a frame pointer for other reasons,
11742 usually pertaining to setjmp. */
11743 if (SUBTARGET_FRAME_POINTER_REQUIRED)
11744 return true;
11746 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
11747 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
11748 return true;
11750 /* Win64 SEH, very large frames need a frame-pointer as maximum stack
11751 allocation is 4GB. */
11752 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
11753 return true;
11755 /* SSE saves require frame-pointer when stack is misaligned. */
11756 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
11757 return true;
11759 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
11760 turns off the frame pointer by default. Turn it back on now if
11761 we've not got a leaf function. */
11762 if (TARGET_OMIT_LEAF_FRAME_POINTER
11763 && (!crtl->is_leaf
11764 || ix86_current_function_calls_tls_descriptor))
11765 return true;
11767 if (crtl->profile && !flag_fentry)
11768 return true;
11770 return false;
11773 /* Record that the current function accesses previous call frames. */
11775 void
11776 ix86_setup_frame_addresses (void)
11778 cfun->machine->accesses_prev_frame = 1;
11781 #ifndef USE_HIDDEN_LINKONCE
11782 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
11783 # define USE_HIDDEN_LINKONCE 1
11784 # else
11785 # define USE_HIDDEN_LINKONCE 0
11786 # endif
11787 #endif
11789 static int pic_labels_used;
11791 /* Fills in the label name that should be used for a pc thunk for
11792 the given register. */
11794 static void
11795 get_pc_thunk_name (char name[32], unsigned int regno)
11797 gcc_assert (!TARGET_64BIT);
11799 if (USE_HIDDEN_LINKONCE)
11800 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11801 else
11802 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
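/* Illustration: for %ebx with hidden linkonce support this produces the
   well-known name "__x86.get_pc_thunk.bx"; otherwise an internal label of
   the form LPR<regno> is generated instead. */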
11806 /* This function generates code for -fpic that loads %ebx with
11807 the return address of the caller and then returns. */
11809 static void
11810 ix86_code_end (void)
11812 rtx xops[2];
11813 int regno;
11815 for (regno = AX_REG; regno <= SP_REG; regno++)
11817 char name[32];
11818 tree decl;
11820 if (!(pic_labels_used & (1 << regno)))
11821 continue;
11823 get_pc_thunk_name (name, regno);
11825 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11826 get_identifier (name),
11827 build_function_type_list (void_type_node, NULL_TREE));
11828 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11829 NULL_TREE, void_type_node);
11830 TREE_PUBLIC (decl) = 1;
11831 TREE_STATIC (decl) = 1;
11832 DECL_IGNORED_P (decl) = 1;
11834 #if TARGET_MACHO
11835 if (TARGET_MACHO)
11837 switch_to_section (darwin_sections[text_coal_section]);
11838 fputs ("\t.weak_definition\t", asm_out_file);
11839 assemble_name (asm_out_file, name);
11840 fputs ("\n\t.private_extern\t", asm_out_file);
11841 assemble_name (asm_out_file, name);
11842 putc ('\n', asm_out_file);
11843 ASM_OUTPUT_LABEL (asm_out_file, name);
11844 DECL_WEAK (decl) = 1;
11846 else
11847 #endif
11848 if (USE_HIDDEN_LINKONCE)
11850 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11852 targetm.asm_out.unique_section (decl, 0);
11853 switch_to_section (get_named_section (decl, NULL, 0));
11855 targetm.asm_out.globalize_label (asm_out_file, name);
11856 fputs ("\t.hidden\t", asm_out_file);
11857 assemble_name (asm_out_file, name);
11858 putc ('\n', asm_out_file);
11859 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11861 else
11863 switch_to_section (text_section);
11864 ASM_OUTPUT_LABEL (asm_out_file, name);
11867 DECL_INITIAL (decl) = make_node (BLOCK);
11868 current_function_decl = decl;
11869 allocate_struct_function (decl, false);
11870 init_function_start (decl);
11871 first_function_block_is_cold = false;
11872 /* Make sure unwind info is emitted for the thunk if needed. */
11873 final_start_function (emit_barrier (), asm_out_file, 1);
11875 /* Pad stack IP move with 4 instructions (two NOPs count
11876 as one instruction). */
11877 if (TARGET_PAD_SHORT_FUNCTION)
11879 int i = 8;
11881 while (i--)
11882 fputs ("\tnop\n", asm_out_file);
11885 xops[0] = gen_rtx_REG (Pmode, regno);
11886 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11887 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11888 output_asm_insn ("%!ret", NULL);
11889 final_end_function ();
11890 init_insn_lengths ();
11891 free_after_compilation (cfun);
11892 set_cfun (NULL);
11893 current_function_decl = NULL;
11896 if (flag_split_stack)
11897 file_end_indicate_split_stack ();
11900 /* Emit code for the SET_GOT patterns. */
11902 const char *
11903 output_set_got (rtx dest, rtx label)
11905 rtx xops[3];
11907 xops[0] = dest;
11909 if (TARGET_VXWORKS_RTP && flag_pic)
11911 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11912 xops[2] = gen_rtx_MEM (Pmode,
11913 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11914 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11916 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11917 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11918 an unadorned address. */
11919 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11920 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11921 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11922 return "";
11925 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11927 if (flag_pic)
11929 char name[32];
11930 get_pc_thunk_name (name, REGNO (dest));
11931 pic_labels_used |= 1 << REGNO (dest);
11933 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11934 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11935 output_asm_insn ("%!call\t%X2", xops);
11937 #if TARGET_MACHO
11938 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11939 This is what will be referenced by the Mach-O PIC subsystem. */
11940 if (machopic_should_output_picbase_label () || !label)
11941 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11943 /* When we are restoring the pic base at the site of a nonlocal label,
11944 and we decided to emit the pic base above, we will still output a
11945 local label used for calculating the correction offset (even though
11946 the offset will be 0 in that case). */
11947 if (label)
11948 targetm.asm_out.internal_label (asm_out_file, "L",
11949 CODE_LABEL_NUMBER (label));
11950 #endif
11952 else
11954 if (TARGET_MACHO)
11955 /* We don't need a pic base, we're not producing pic. */
11956 gcc_unreachable ();
11958 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11959 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11960 targetm.asm_out.internal_label (asm_out_file, "L",
11961 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11964 if (!TARGET_MACHO)
11965 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11967 return "";
11970 /* Generate an "push" pattern for input ARG. */
11972 static rtx
11973 gen_push (rtx arg)
11975 struct machine_function *m = cfun->machine;
11977 if (m->fs.cfa_reg == stack_pointer_rtx)
11978 m->fs.cfa_offset += UNITS_PER_WORD;
11979 m->fs.sp_offset += UNITS_PER_WORD;
11981 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11982 arg = gen_rtx_REG (word_mode, REGNO (arg));
11984 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11985 gen_rtx_PRE_DEC (Pmode,
11986 stack_pointer_rtx)),
11987 arg);
11990 /* Generate an "pop" pattern for input ARG. */
11992 static rtx
11993 gen_pop (rtx arg)
11995 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11996 arg = gen_rtx_REG (word_mode, REGNO (arg));
11998 return gen_rtx_SET (arg,
11999 gen_rtx_MEM (word_mode,
12000 gen_rtx_POST_INC (Pmode,
12001 stack_pointer_rtx)));
12004 /* Return >= 0 if there is an unused call-clobbered register available
12005 for the entire function. */
12007 static unsigned int
12008 ix86_select_alt_pic_regnum (void)
12010 if (ix86_use_pseudo_pic_reg ())
12011 return INVALID_REGNUM;
12013 if (crtl->is_leaf
12014 && !crtl->profile
12015 && !ix86_current_function_calls_tls_descriptor)
12017 int i, drap;
12018 /* Can't use the same register for both PIC and DRAP. */
12019 if (crtl->drap_reg)
12020 drap = REGNO (crtl->drap_reg);
12021 else
12022 drap = -1;
12023 for (i = 2; i >= 0; --i)
12024 if (i != drap && !df_regs_ever_live_p (i))
12025 return i;
12028 return INVALID_REGNUM;
12031 /* Return true if REGNO is used by the epilogue. */
12033 bool
12034 ix86_epilogue_uses (int regno)
12036 /* If there are no caller-saved registers, we preserve all registers,
12037 except for MMX and x87 registers which aren't supported when saving
12038 and restoring registers. Don't explicitly save SP register since
12039 it is always preserved. */
12040 return (epilogue_completed
12041 && cfun->machine->no_caller_saved_registers
12042 && !fixed_regs[regno]
12043 && !STACK_REGNO_P (regno)
12044 && !MMX_REGNO_P (regno));
12047 /* Return nonzero if register REGNO can be used as a scratch register
12048 in peephole2. */
12050 static bool
12051 ix86_hard_regno_scratch_ok (unsigned int regno)
12053 /* If there are no caller-saved registers, we can't use any register
12054 as a scratch register after epilogue and use REGNO as scratch
12055 register only if it has been used before to avoid saving and
12056 restoring it. */
12057 return (!cfun->machine->no_caller_saved_registers
12058 || (!epilogue_completed
12059 && df_regs_ever_live_p (regno)));
12062 /* Return TRUE if we need to save REGNO. */
12064 static bool
12065 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
12067 /* If there are no caller-saved registers, we preserve all registers,
12068 except for MMX and x87 registers which aren't supported when saving
12069 and restoring registers. Don't explicitly save SP register since
12070 it is always preserved. */
12071 if (cfun->machine->no_caller_saved_registers)
12073 /* Don't preserve registers used for function return value. */
12074 rtx reg = crtl->return_rtx;
12075 if (reg)
12077 unsigned int i = REGNO (reg);
12078 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12079 while (nregs-- > 0)
12080 if ((i + nregs) == regno)
12081 return false;
12083 reg = crtl->return_bnd;
12084 if (reg)
12086 i = REGNO (reg);
12087 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12088 while (nregs-- > 0)
12089 if ((i + nregs) == regno)
12090 return false;
12094 return (df_regs_ever_live_p (regno)
12095 && !fixed_regs[regno]
12096 && !STACK_REGNO_P (regno)
12097 && !MMX_REGNO_P (regno)
12098 && (regno != HARD_FRAME_POINTER_REGNUM
12099 || !frame_pointer_needed));
12102 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12103 && pic_offset_table_rtx)
12105 if (ix86_use_pseudo_pic_reg ())
12107 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12108 _mcount in prologue. */
12109 if (!TARGET_64BIT && flag_pic && crtl->profile)
12110 return true;
12112 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12113 || crtl->profile
12114 || crtl->calls_eh_return
12115 || crtl->uses_const_pool
12116 || cfun->has_nonlocal_label)
12117 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12120 if (crtl->calls_eh_return && maybe_eh_return)
12122 unsigned i;
12123 for (i = 0; ; i++)
12125 unsigned test = EH_RETURN_DATA_REGNO (i);
12126 if (test == INVALID_REGNUM)
12127 break;
12128 if (test == regno)
12129 return true;
12133 if (crtl->drap_reg
12134 && regno == REGNO (crtl->drap_reg)
12135 && !cfun->machine->no_drap_save_restore)
12136 return true;
12138 return (df_regs_ever_live_p (regno)
12139 && !call_used_regs[regno]
12140 && !fixed_regs[regno]
12141 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12144 /* Return the number of saved general purpose registers. */
12146 static int
12147 ix86_nsaved_regs (void)
12149 int nregs = 0;
12150 int regno;
12152 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12153 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12154 nregs ++;
12155 return nregs;
12158 /* Return number of saved SSE registers. */
12160 static int
12161 ix86_nsaved_sseregs (void)
12163 int nregs = 0;
12164 int regno;
12166 if (!TARGET_64BIT_MS_ABI)
12167 return 0;
12168 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12169 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12170 nregs ++;
12171 return nregs;
12174 /* Given FROM and TO register numbers, say whether this elimination is
12175 allowed. If stack alignment is needed, we can only replace argument
12176 pointer with hard frame pointer, or replace frame pointer with stack
12177 pointer. Otherwise, frame pointer elimination is automatically
12178 handled and all other eliminations are valid. */
12180 static bool
12181 ix86_can_eliminate (const int from, const int to)
12183 if (stack_realign_fp)
12184 return ((from == ARG_POINTER_REGNUM
12185 && to == HARD_FRAME_POINTER_REGNUM)
12186 || (from == FRAME_POINTER_REGNUM
12187 && to == STACK_POINTER_REGNUM));
12188 else
12189 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12192 /* Return the offset between two registers, one to be eliminated, and the other
12193 its replacement, at the start of a routine. */
12195 HOST_WIDE_INT
12196 ix86_initial_elimination_offset (int from, int to)
12198 struct ix86_frame frame;
12199 ix86_compute_frame_layout (&frame);
12201 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12202 return frame.hard_frame_pointer_offset;
12203 else if (from == FRAME_POINTER_REGNUM
12204 && to == HARD_FRAME_POINTER_REGNUM)
12205 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12206 else
12208 gcc_assert (to == STACK_POINTER_REGNUM);
12210 if (from == ARG_POINTER_REGNUM)
12211 return frame.stack_pointer_offset;
12213 gcc_assert (from == FRAME_POINTER_REGNUM);
12214 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12218 /* In a dynamically-aligned function, we can't know the offset from
12219 stack pointer to frame pointer, so we must ensure that setjmp
12220 eliminates fp against the hard fp (%ebp) rather than trying to
12221 index from %esp up to the top of the frame across a gap that is
12222 of unknown (at compile-time) size. */
12223 static rtx
12224 ix86_builtin_setjmp_frame_value (void)
12226 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12229 /* When using -fsplit-stack, the allocation routines set a field in
12230 the TCB to the bottom of the stack plus this much space, measured
12231 in bytes. */
12233 #define SPLIT_STACK_AVAILABLE 256
12235 /* Fill structure ix86_frame about frame of currently computed function. */
12237 static void
12238 ix86_compute_frame_layout (struct ix86_frame *frame)
12240 unsigned HOST_WIDE_INT stack_alignment_needed;
12241 HOST_WIDE_INT offset;
12242 unsigned HOST_WIDE_INT preferred_alignment;
12243 HOST_WIDE_INT size = get_frame_size ();
12244 HOST_WIDE_INT to_allocate;
12246 frame->nregs = ix86_nsaved_regs ();
12247 frame->nsseregs = ix86_nsaved_sseregs ();
12249 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
12250 except for function prologues, leaf functions and when the default
12251 incoming stack boundary is overridden at the command line or via the
12252 force_align_arg_pointer attribute. */
12253 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12254 && (!crtl->is_leaf || cfun->calls_alloca != 0
12255 || ix86_current_function_calls_tls_descriptor
12256 || ix86_incoming_stack_boundary < 128))
12258 crtl->preferred_stack_boundary = 128;
12259 crtl->stack_alignment_needed = 128;
12262 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12263 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12265 gcc_assert (!size || stack_alignment_needed);
12266 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12267 gcc_assert (preferred_alignment <= stack_alignment_needed);
12269 /* For SEH we have to limit the amount of code movement into the prologue.
12270 At present we do this via a BLOCKAGE, at which point there's very little
12271 scheduling that can be done, which means that there's very little point
12272 in doing anything except PUSHs. */
12273 if (TARGET_SEH)
12274 cfun->machine->use_fast_prologue_epilogue = false;
12276 /* During reload iteration the number of registers saved can change.
12277 Recompute the value as needed. Do not recompute when the number of
12278 registers didn't change, as reload calls this function multiple times
12279 and does not expect the decision to change within a single iteration. */
12280 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
12281 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
12283 int count = frame->nregs;
12284 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12286 cfun->machine->use_fast_prologue_epilogue_nregs = count;
12288 /* The fast prologue uses move instead of push to save registers. This
12289 is significantly longer, but also executes faster as modern hardware
12290 can execute the moves in parallel, but can't do that for push/pop.
12292 Be careful about choosing which prologue to emit: when the function takes
12293 many instructions to execute, we may use the slow version, as we may when
12294 the function is known to be outside a hot spot (this is known with
12295 feedback only). Weight the size of the function by the number of registers
12296 to save, as it is cheap to use one or two push instructions but very
12297 slow to use many of them. */
12298 if (count)
12299 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12300 if (node->frequency < NODE_FREQUENCY_NORMAL
12301 || (flag_branch_probabilities
12302 && node->frequency < NODE_FREQUENCY_HOT))
12303 cfun->machine->use_fast_prologue_epilogue = false;
12304 else
12305 cfun->machine->use_fast_prologue_epilogue
12306 = !expensive_function_p (count);
12309 frame->save_regs_using_mov
12310 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
12311 /* If static stack checking is enabled and done with probes,
12312 the registers need to be saved before allocating the frame. */
12313 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
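/* Rough picture of the offsets computed below, growing away from the
   CFA (exact values depend on the alignment decisions that follow):

     return address (+ pushed static chain, + saved frame pointer)
     GPR save area                  -> reg_save_offset
     SSE save area (64-bit MS ABI)  -> sse_reg_save_offset
     va_arg register save area
     local variables                -> frame_pointer_offset
     outgoing argument area         -> stack_pointer_offset  */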
12315 /* Skip return address. */
12316 offset = UNITS_PER_WORD;
12318 /* Skip pushed static chain. */
12319 if (ix86_static_chain_on_stack)
12320 offset += UNITS_PER_WORD;
12322 /* Skip saved base pointer. */
12323 if (frame_pointer_needed)
12324 offset += UNITS_PER_WORD;
12325 frame->hfp_save_offset = offset;
12327 /* The traditional frame pointer location is at the top of the frame. */
12328 frame->hard_frame_pointer_offset = offset;
12330 /* Register save area */
12331 offset += frame->nregs * UNITS_PER_WORD;
12332 frame->reg_save_offset = offset;
12334 /* On SEH target, registers are pushed just before the frame pointer
12335 location. */
12336 if (TARGET_SEH)
12337 frame->hard_frame_pointer_offset = offset;
12339 /* Align and set SSE register save area. */
12340 if (frame->nsseregs)
12342 /* The only ABI that has saved SSE registers (Win64) also has a
12343 16-byte aligned default stack, and thus we don't need to be
12344 within the re-aligned local stack frame to save them. In case
12345 incoming stack boundary is aligned to less than 16 bytes,
12346 unaligned move of SSE register will be emitted, so there is
12347 no point to round up the SSE register save area outside the
12348 re-aligned local stack frame to 16 bytes. */
12349 if (ix86_incoming_stack_boundary >= 128)
12350 offset = ROUND_UP (offset, 16);
12351 offset += frame->nsseregs * 16;
12353 frame->sse_reg_save_offset = offset;
12355 /* The re-aligned stack starts here. Values before this point are not
12356 directly comparable with values below this point. In order to make
12357 sure that no value happens to be the same before and after, force
12358 the alignment computation below to add a non-zero value. */
12359 if (stack_realign_fp)
12360 offset = ROUND_UP (offset, stack_alignment_needed);
12362 /* Va-arg area */
12363 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12364 offset += frame->va_arg_size;
12366 /* Align start of frame for local function. */
12367 if (stack_realign_fp
12368 || offset != frame->sse_reg_save_offset
12369 || size != 0
12370 || !crtl->is_leaf
12371 || cfun->calls_alloca
12372 || ix86_current_function_calls_tls_descriptor)
12373 offset = ROUND_UP (offset, stack_alignment_needed);
12375 /* Frame pointer points here. */
12376 frame->frame_pointer_offset = offset;
12378 offset += size;
12380 /* Add the outgoing arguments area. It can be skipped if we eliminated
12381 all the function calls as dead code.
12382 Skipping is however impossible when the function calls alloca: the
12383 alloca expander assumes that the last crtl->outgoing_args_size bytes
12384 of the stack frame are unused. */
12385 if (ACCUMULATE_OUTGOING_ARGS
12386 && (!crtl->is_leaf || cfun->calls_alloca
12387 || ix86_current_function_calls_tls_descriptor))
12389 offset += crtl->outgoing_args_size;
12390 frame->outgoing_arguments_size = crtl->outgoing_args_size;
12392 else
12393 frame->outgoing_arguments_size = 0;
12395 /* Align stack boundary. Only needed if we're calling another function
12396 or using alloca. */
12397 if (!crtl->is_leaf || cfun->calls_alloca
12398 || ix86_current_function_calls_tls_descriptor)
12399 offset = ROUND_UP (offset, preferred_alignment);
12401 /* We've reached end of stack frame. */
12402 frame->stack_pointer_offset = offset;
12404 /* Size prologue needs to allocate. */
12405 to_allocate = offset - frame->sse_reg_save_offset;
12407 if ((!to_allocate && frame->nregs <= 1)
12408 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
12409 frame->save_regs_using_mov = false;
12411 if (ix86_using_red_zone ()
12412 && crtl->sp_is_unchanging
12413 && crtl->is_leaf
12414 && !ix86_pc_thunk_call_expanded
12415 && !ix86_current_function_calls_tls_descriptor)
12417 frame->red_zone_size = to_allocate;
12418 if (frame->save_regs_using_mov)
12419 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
12420 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
12421 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
12423 else
12424 frame->red_zone_size = 0;
12425 frame->stack_pointer_offset -= frame->red_zone_size;
12427 /* The SEH frame pointer location is near the bottom of the frame.
12428 This is enforced by the fact that the difference between the
12429 stack pointer and the frame pointer is limited to 240 bytes in
12430 the unwind data structure. */
12431 if (TARGET_SEH)
12433 HOST_WIDE_INT diff;
12435 /* If we can leave the frame pointer where it is, do so. This also
12436 returns the establisher frame for __builtin_frame_address (0). */
12437 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
12438 if (diff <= SEH_MAX_FRAME_SIZE
12439 && (diff > 240 || (diff & 15) != 0)
12440 && !crtl->accesses_prior_frames)
12442 /* Ideally we'd determine what portion of the local stack frame
12443 (within the constraint of the lowest 240) is most heavily used.
12444 But without that complication, simply bias the frame pointer
12445 by 128 bytes so as to maximize the amount of the local stack
12446 frame that is addressable with 8-bit offsets. */
12447 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
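/* Illustrative sketch (not part of the build): how the offsets computed above
   stack up for a hypothetical 64-bit frame, using the same ROUND_UP
   discipline.  The concrete numbers (two SSE saves, 40 bytes of locals,
   32 bytes of outgoing arguments, 16-byte alignment) are made up for the
   example, and the va_arg save area is ignored.  */
#if 0
#include <stdio.h>

#define EX_ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int
main (void)
{
  long offset = 16;                    /* after GPR saves, say */
  long sse_save, fp_off, sp_off;

  offset = EX_ROUND_UP (offset, 16);   /* align the SSE save area */
  offset += 2 * 16;                    /* two 16-byte SSE saves */
  sse_save = offset;

  offset = EX_ROUND_UP (offset, 16);   /* align the start of the locals */
  fp_off = offset;                     /* frame pointer points here */
  offset += 40;                        /* local variables */

  offset += 32;                        /* outgoing argument block */
  sp_off = EX_ROUND_UP (offset, 16);   /* final stack pointer offset */

  printf ("sse_reg_save_offset  = %ld\n", sse_save);
  printf ("frame_pointer_offset = %ld\n", fp_off);
  printf ("stack_pointer_offset = %ld\n", sp_off);
  return 0;
}
#endif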
12452 /* This is semi-inlined memory_address_length, but simplified
12453 since we know that we're always dealing with reg+offset, and
12454 to avoid having to create and discard all that rtl. */
12456 static inline int
12457 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
12459 int len = 4;
12461 if (offset == 0)
12463 /* EBP and R13 cannot be encoded without an offset. */
12464 len = (regno == BP_REG || regno == R13_REG);
12466 else if (IN_RANGE (offset, -128, 127))
12467 len = 1;
12469 /* ESP and R12 must be encoded with a SIB byte. */
12470 if (regno == SP_REG || regno == R12_REG)
12471 len++;
12473 return len;
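/* Illustrative sketch (not part of the build): the same encoding-length rule
   as a standalone function, assuming the usual ModRM/SIB constraints --
   (%ebp)/(%r13) always need at least a disp8, (%esp)/(%r12) always need a
   SIB byte, and anything outside [-128, 127] needs a disp32.  */
#if 0
#include <stdio.h>

static int
baseaddr_len_example (int is_bp_or_r13, int is_sp_or_r12, long offset)
{
  int len = 4;                              /* assume disp32 */
  if (offset == 0)
    len = is_bp_or_r13 ? 1 : 0;             /* BP/R13 still need a disp8 */
  else if (offset >= -128 && offset <= 127)
    len = 1;                                /* disp8 */
  if (is_sp_or_r12)
    len++;                                  /* SIB byte */
  return len;
}

int
main (void)
{
  printf ("(%%rbx):     %d extra bytes\n", baseaddr_len_example (0, 0, 0));
  printf ("(%%rbp):     %d\n", baseaddr_len_example (1, 0, 0));
  printf ("16(%%rsp):   %d\n", baseaddr_len_example (0, 1, 16));
  printf ("1024(%%rbx): %d\n", baseaddr_len_example (0, 0, 1024));
  return 0;
}
#endif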
12476 /* Return an RTX that points to CFA_OFFSET within the stack frame.
12477 The valid base registers are taken from CFUN->MACHINE->FS. */
12479 static rtx
12480 choose_baseaddr (HOST_WIDE_INT cfa_offset)
12482 const struct machine_function *m = cfun->machine;
12483 rtx base_reg = NULL;
12484 HOST_WIDE_INT base_offset = 0;
12486 if (m->use_fast_prologue_epilogue)
12488 /* Choose the base register most likely to allow the most scheduling
12489 opportunities. Generally FP is valid throughout the function,
12490 while DRAP must be reloaded within the epilogue. But choose either
12491 over the SP due to increased encoding size. */
12493 if (m->fs.fp_valid)
12495 base_reg = hard_frame_pointer_rtx;
12496 base_offset = m->fs.fp_offset - cfa_offset;
12498 else if (m->fs.drap_valid)
12500 base_reg = crtl->drap_reg;
12501 base_offset = 0 - cfa_offset;
12503 else if (m->fs.sp_valid)
12505 base_reg = stack_pointer_rtx;
12506 base_offset = m->fs.sp_offset - cfa_offset;
12509 else
12511 HOST_WIDE_INT toffset;
12512 int len = 16, tlen;
12514 /* Choose the base register with the smallest address encoding.
12515 With a tie, choose FP > DRAP > SP. */
12516 if (m->fs.sp_valid)
12518 base_reg = stack_pointer_rtx;
12519 base_offset = m->fs.sp_offset - cfa_offset;
12520 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
12522 if (m->fs.drap_valid)
12524 toffset = 0 - cfa_offset;
12525 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
12526 if (tlen <= len)
12528 base_reg = crtl->drap_reg;
12529 base_offset = toffset;
12530 len = tlen;
12533 if (m->fs.fp_valid)
12535 toffset = m->fs.fp_offset - cfa_offset;
12536 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
12537 if (tlen <= len)
12539 base_reg = hard_frame_pointer_rtx;
12540 base_offset = toffset;
12541 len = tlen;
12545 gcc_assert (base_reg != NULL);
12547 return plus_constant (Pmode, base_reg, base_offset);
12550 /* Emit code to save registers in the prologue. */
12552 static void
12553 ix86_emit_save_regs (void)
12555 unsigned int regno;
12556 rtx_insn *insn;
12558 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12559 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12561 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12562 RTX_FRAME_RELATED_P (insn) = 1;
12566 /* Emit a single register save at CFA - CFA_OFFSET. */
12568 static void
12569 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12570 HOST_WIDE_INT cfa_offset)
12572 struct machine_function *m = cfun->machine;
12573 rtx reg = gen_rtx_REG (mode, regno);
12574 rtx mem, addr, base, insn;
12575 unsigned int align;
12577 addr = choose_baseaddr (cfa_offset);
12578 mem = gen_frame_mem (mode, addr);
12580 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
12581 align = MIN (GET_MODE_ALIGNMENT (mode), INCOMING_STACK_BOUNDARY);
12582 set_mem_align (mem, align);
12584 insn = emit_insn (gen_rtx_SET (mem, reg));
12585 RTX_FRAME_RELATED_P (insn) = 1;
12587 base = addr;
12588 if (GET_CODE (base) == PLUS)
12589 base = XEXP (base, 0);
12590 gcc_checking_assert (REG_P (base));
12592 /* When saving registers into a re-aligned local stack frame, avoid
12593 any tricky guessing by dwarf2out. */
12594 if (m->fs.realigned)
12596 gcc_checking_assert (stack_realign_drap);
12598 if (regno == REGNO (crtl->drap_reg))
12600 /* A bit of a hack. We force the DRAP register to be saved in
12601 the re-aligned stack frame, which provides us with a copy
12602 of the CFA that will last past the prologue. Install it. */
12603 gcc_checking_assert (cfun->machine->fs.fp_valid);
12604 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12605 cfun->machine->fs.fp_offset - cfa_offset);
12606 mem = gen_rtx_MEM (mode, addr);
12607 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12609 else
12611 /* The frame pointer is a stable reference within the
12612 aligned frame. Use it. */
12613 gcc_checking_assert (cfun->machine->fs.fp_valid);
12614 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12615 cfun->machine->fs.fp_offset - cfa_offset);
12616 mem = gen_rtx_MEM (mode, addr);
12617 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12621 /* The memory may not be relative to the current CFA register,
12622 which means that we may need to generate a new pattern for
12623 use by the unwind info. */
12624 else if (base != m->fs.cfa_reg)
12626 addr = plus_constant (Pmode, m->fs.cfa_reg,
12627 m->fs.cfa_offset - cfa_offset);
12628 mem = gen_rtx_MEM (mode, addr);
12629 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12633 /* Emit code to save registers using MOV insns.
12634 First register is stored at CFA - CFA_OFFSET. */
12635 static void
12636 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12638 unsigned int regno;
12640 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12641 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true))
12643 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12644 cfa_offset -= UNITS_PER_WORD;
12648 /* Emit code to save SSE registers using MOV insns.
12649 First register is stored at CFA - CFA_OFFSET. */
12650 static void
12651 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12653 unsigned int regno;
12655 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12656 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
12658 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12659 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12663 static GTY(()) rtx queued_cfa_restores;
12665 /* Add a REG_CFA_RESTORE REG note to INSN or queue it until the next stack
12666 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12667 Don't add the note if the previously saved value will be left untouched
12668 within the stack red zone until return, as unwinders can find the same value
12669 in the register and on the stack. */
12671 static void
12672 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12674 if (!crtl->shrink_wrapped
12675 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12676 return;
12678 if (insn)
12680 add_reg_note (insn, REG_CFA_RESTORE, reg);
12681 RTX_FRAME_RELATED_P (insn) = 1;
12683 else
12684 queued_cfa_restores
12685 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12688 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12690 static void
12691 ix86_add_queued_cfa_restore_notes (rtx insn)
12693 rtx last;
12694 if (!queued_cfa_restores)
12695 return;
12696 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12698 XEXP (last, 1) = REG_NOTES (insn);
12699 REG_NOTES (insn) = queued_cfa_restores;
12700 queued_cfa_restores = NULL_RTX;
12701 RTX_FRAME_RELATED_P (insn) = 1;
12704 /* Expand prologue or epilogue stack adjustment.
12705 The pattern exists to put a dependency on all ebp-based memory accesses.
12706 STYLE should be negative if instructions should be marked as frame related,
12707 zero if %r11 register is live and cannot be freely used and positive
12708 otherwise. */
12710 static void
12711 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12712 int style, bool set_cfa)
12714 struct machine_function *m = cfun->machine;
12715 rtx insn;
12716 bool add_frame_related_expr = false;
12718 if (Pmode == SImode)
12719 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12720 else if (x86_64_immediate_operand (offset, DImode))
12721 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12722 else
12724 rtx tmp;
12725 /* r11 is used by indirect sibcall return as well, set before the
12726 epilogue and used after the epilogue. */
12727 if (style)
12728 tmp = gen_rtx_REG (DImode, R11_REG);
12729 else
12731 gcc_assert (src != hard_frame_pointer_rtx
12732 && dest != hard_frame_pointer_rtx);
12733 tmp = hard_frame_pointer_rtx;
12735 insn = emit_insn (gen_rtx_SET (tmp, offset));
12736 if (style < 0)
12737 add_frame_related_expr = true;
12739 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12742 insn = emit_insn (insn);
12743 if (style >= 0)
12744 ix86_add_queued_cfa_restore_notes (insn);
12746 if (set_cfa)
12748 rtx r;
12750 gcc_assert (m->fs.cfa_reg == src);
12751 m->fs.cfa_offset += INTVAL (offset);
12752 m->fs.cfa_reg = dest;
12754 r = gen_rtx_PLUS (Pmode, src, offset);
12755 r = gen_rtx_SET (dest, r);
12756 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12757 RTX_FRAME_RELATED_P (insn) = 1;
12759 else if (style < 0)
12761 RTX_FRAME_RELATED_P (insn) = 1;
12762 if (add_frame_related_expr)
12764 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12765 r = gen_rtx_SET (dest, r);
12766 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12770 if (dest == stack_pointer_rtx)
12772 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12773 bool valid = m->fs.sp_valid;
12775 if (src == hard_frame_pointer_rtx)
12777 valid = m->fs.fp_valid;
12778 ooffset = m->fs.fp_offset;
12780 else if (src == crtl->drap_reg)
12782 valid = m->fs.drap_valid;
12783 ooffset = 0;
12785 else
12787 /* Else there are two possibilities: SP itself, which we set
12788 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
12789 taken care of by hand along the eh_return path. */
12790 gcc_checking_assert (src == stack_pointer_rtx
12791 || offset == const0_rtx);
12794 m->fs.sp_offset = ooffset - INTVAL (offset);
12795 m->fs.sp_valid = valid;
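/* Illustrative sketch (not part of the build): the sp_offset bookkeeping
   performed above, in isolation.  sp_offset measures how far the stack
   pointer sits below the CFA, so adding a negative offset (allocating)
   increases it.  The starting value of 8 (only the return address pushed)
   is just an example for 64-bit code.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long sp_offset = 8;              /* CFA - SP right after the call insn */

  sp_offset = sp_offset - (-32);   /* "add $-32, %rsp": allocate 32 bytes */
  printf ("after allocation:   sp_offset = %ld\n", sp_offset);  /* 40 */

  sp_offset = sp_offset - 32;      /* "add $32, %rsp": release them again */
  printf ("after deallocation: sp_offset = %ld\n", sp_offset);  /* 8 */
  return 0;
}
#endif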
12799 /* Find an available register to be used as dynamic realign argument
12800 pointer register. Such a register will be written in the prologue and
12801 used at the beginning of the function body, so it must not be
12802 1. parameter passing register.
12803 2. GOT pointer.
12804 We reuse static-chain register if it is available. Otherwise, we
12805 use DI for i386 and R13 for x86-64. We chose R13 since it has
12806 shorter encoding.
12808 Return: the regno of chosen register. */
12810 static unsigned int
12811 find_drap_reg (void)
12813 tree decl = cfun->decl;
12815 /* Always use callee-saved register if there are no caller-saved
12816 registers. */
12817 if (TARGET_64BIT)
12819 /* Use R13 for a nested function or a function that needs a static chain.
12820 Since a function with a tail call may use any caller-saved
12821 register in the epilogue, DRAP must not use a caller-saved
12822 register in such a case. */
12823 if (DECL_STATIC_CHAIN (decl)
12824 || cfun->machine->no_caller_saved_registers
12825 || crtl->tail_call_emit)
12826 return R13_REG;
12828 return R10_REG;
12830 else
12832 /* Use DI for a nested function or a function that needs a static chain.
12833 Since a function with a tail call may use any caller-saved
12834 register in the epilogue, DRAP must not use a caller-saved
12835 register in such a case. */
12836 if (DECL_STATIC_CHAIN (decl)
12837 || cfun->machine->no_caller_saved_registers
12838 || crtl->tail_call_emit)
12839 return DI_REG;
12841 /* Reuse static chain register if it isn't used for parameter
12842 passing. */
12843 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12845 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12846 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12847 return CX_REG;
12849 return DI_REG;
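/* Illustrative sketch (not part of the build): the DRAP register decision
   tree above, reduced to plain flags.  The helper name and its arguments
   are invented for the example.  */
#if 0
#include <stdio.h>

static const char *
drap_reg_example (int is_64bit, int needs_static_chain_or_tail_call,
                  int regparm, int fastcall_or_thiscall)
{
  if (is_64bit)
    return needs_static_chain_or_tail_call ? "r13" : "r10";
  if (needs_static_chain_or_tail_call)
    return "edi";
  /* Reuse the static chain register when it is not a parameter register.  */
  if (regparm <= 2 && !fastcall_or_thiscall)
    return "ecx";
  return "edi";
}

int
main (void)
{
  printf ("64-bit, plain:     %s\n", drap_reg_example (1, 0, 0, 0));
  printf ("64-bit, nested:    %s\n", drap_reg_example (1, 1, 0, 0));
  printf ("32-bit, regparm=2: %s\n", drap_reg_example (0, 0, 2, 0));
  printf ("32-bit, fastcall:  %s\n", drap_reg_example (0, 0, 2, 1));
  return 0;
}
#endif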
12853 /* Handle a "force_align_arg_pointer" attribute. */
12855 static tree
12856 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12857 tree, int, bool *no_add_attrs)
12859 if (TREE_CODE (*node) != FUNCTION_TYPE
12860 && TREE_CODE (*node) != METHOD_TYPE
12861 && TREE_CODE (*node) != FIELD_DECL
12862 && TREE_CODE (*node) != TYPE_DECL)
12864 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12865 name);
12866 *no_add_attrs = true;
12869 return NULL_TREE;
12872 /* Return minimum incoming stack alignment. */
12874 static unsigned int
12875 ix86_minimum_incoming_stack_boundary (bool sibcall)
12877 unsigned int incoming_stack_boundary;
12879 /* Stack of an interrupt handler is always aligned to MIN_STACK_BOUNDARY. */
12881 if (cfun->machine->func_type != TYPE_NORMAL)
12882 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12883 /* Prefer the one specified at command line. */
12884 else if (ix86_user_incoming_stack_boundary)
12885 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12886 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12887 if -mstackrealign is used, this isn't a sibcall check, and the
12888 estimated stack alignment is 128 bits. */
12889 else if (!sibcall
12890 && ix86_force_align_arg_pointer
12891 && crtl->stack_alignment_estimated == 128)
12892 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12893 else
12894 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12896 /* Incoming stack alignment can be changed on individual functions
12897 via force_align_arg_pointer attribute. We use the smallest
12898 incoming stack boundary. */
12899 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12900 && lookup_attribute (ix86_force_align_arg_pointer_string,
12901 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12902 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12904 /* The incoming stack frame has to be aligned at least at
12905 parm_stack_boundary. */
12906 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12907 incoming_stack_boundary = crtl->parm_stack_boundary;
12909 /* Stack at entrance of main is aligned by runtime. We use the
12910 smallest incoming stack boundary. */
12911 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12912 && DECL_NAME (current_function_decl)
12913 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12914 && DECL_FILE_SCOPE_P (current_function_decl))
12915 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12917 return incoming_stack_boundary;
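/* Illustrative sketch (not part of the build): the clamping performed above,
   with boundaries expressed in bits.  The concrete values (MIN = 32,
   MAIN = 128, default = 128) mirror typical ia32/x86-64 settings but are
   hard-coded only for the example; the interrupt-handler and -mstackrealign
   special cases are omitted.  */
#if 0
#include <stdio.h>

static unsigned int
incoming_boundary_example (unsigned int user, unsigned int parm,
                           int has_align_attribute, int is_main)
{
  unsigned int min_boundary = 32, main_boundary = 128, def = 128;
  unsigned int b = user ? user : def;

  if (has_align_attribute && b > min_boundary)
    b = min_boundary;              /* attribute lowers the guarantee */
  if (b < parm)
    b = parm;                      /* parameters force a minimum */
  if (is_main && b > main_boundary)
    b = main_boundary;             /* runtime aligns main's stack */
  return b;
}

int
main (void)
{
  printf ("%u\n", incoming_boundary_example (0, 32, 0, 0));    /* 128 */
  printf ("%u\n", incoming_boundary_example (256, 32, 0, 1));  /* 128 */
  printf ("%u\n", incoming_boundary_example (0, 64, 1, 0));    /* 64 */
  return 0;
}
#endif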
12920 /* Update incoming stack boundary and estimated stack alignment. */
12922 static void
12923 ix86_update_stack_boundary (void)
12925 ix86_incoming_stack_boundary
12926 = ix86_minimum_incoming_stack_boundary (false);
12928 /* x86_64 varargs need 16-byte stack alignment for the register save
12929 area. */
12930 if (TARGET_64BIT
12931 && cfun->stdarg
12932 && crtl->stack_alignment_estimated < 128)
12933 crtl->stack_alignment_estimated = 128;
12935 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12936 if (ix86_tls_descriptor_calls_expanded_in_cfun
12937 && crtl->preferred_stack_boundary < 128)
12938 crtl->preferred_stack_boundary = 128;
12941 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12942 needed or an rtx for DRAP otherwise. */
12944 static rtx
12945 ix86_get_drap_rtx (void)
12947 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
12948 crtl->need_drap = true;
12950 if (stack_realign_drap)
12952 /* Assign DRAP to vDRAP and return vDRAP. */
12953 unsigned int regno = find_drap_reg ();
12954 rtx drap_vreg;
12955 rtx arg_ptr;
12956 rtx_insn *seq, *insn;
12958 arg_ptr = gen_rtx_REG (Pmode, regno);
12959 crtl->drap_reg = arg_ptr;
12961 start_sequence ();
12962 drap_vreg = copy_to_reg (arg_ptr);
12963 seq = get_insns ();
12964 end_sequence ();
12966 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12967 if (!optimize)
12969 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12970 RTX_FRAME_RELATED_P (insn) = 1;
12972 return drap_vreg;
12974 else
12975 return NULL;
12978 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12980 static rtx
12981 ix86_internal_arg_pointer (void)
12983 return virtual_incoming_args_rtx;
12986 struct scratch_reg {
12987 rtx reg;
12988 bool saved;
12991 /* Return a short-lived scratch register for use on function entry.
12992 In 32-bit mode, it is valid only after the registers are saved
12993 in the prologue. This register must be released by means of
12994 release_scratch_register_on_entry once it is dead. */
12996 static void
12997 get_scratch_register_on_entry (struct scratch_reg *sr)
12999 int regno;
13001 sr->saved = false;
13003 if (TARGET_64BIT)
13005 /* We always use R11 in 64-bit mode. */
13006 regno = R11_REG;
13008 else
13010 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13011 bool fastcall_p
13012 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13013 bool thiscall_p
13014 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13015 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13016 int regparm = ix86_function_regparm (fntype, decl);
13017 int drap_regno
13018 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13020 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13021 for the static chain register. */
13022 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13023 && drap_regno != AX_REG)
13024 regno = AX_REG;
13025 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13026 for the static chain register. */
13027 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13028 regno = AX_REG;
13029 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13030 regno = DX_REG;
13031 /* ecx is the static chain register. */
13032 else if (regparm < 3 && !fastcall_p && !thiscall_p
13033 && !static_chain_p
13034 && drap_regno != CX_REG)
13035 regno = CX_REG;
13036 else if (ix86_save_reg (BX_REG, true))
13037 regno = BX_REG;
13038 /* esi is the static chain register. */
13039 else if (!(regparm == 3 && static_chain_p)
13040 && ix86_save_reg (SI_REG, true))
13041 regno = SI_REG;
13042 else if (ix86_save_reg (DI_REG, true))
13043 regno = DI_REG;
13044 else
13046 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13047 sr->saved = true;
13051 sr->reg = gen_rtx_REG (Pmode, regno);
13052 if (sr->saved)
13054 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13055 RTX_FRAME_RELATED_P (insn) = 1;
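/* Illustrative sketch (not part of the build): the 32-bit scratch register
   preference order above, boiled down to flags.  The helper name and its
   arguments are invented for the example, and it ignores the DRAP-collision
   checks for brevity.  */
#if 0
#include <stdio.h>

static const char *
scratch_reg_example (int regparm, int fastcall, int thiscall,
                     int static_chain)
{
  if (regparm < 1 || (fastcall && !static_chain))
    return "eax";
  if (thiscall && !static_chain)
    return "eax";
  if (regparm < 2 && !thiscall)
    return "edx";
  if (regparm < 3 && !fastcall && !thiscall && !static_chain)
    return "ecx";
  return "ebx/esi/edi if already saved, otherwise push/pop one";
}

int
main (void)
{
  printf ("default:    %s\n", scratch_reg_example (0, 0, 0, 0));
  printf ("regparm(3): %s\n", scratch_reg_example (3, 0, 0, 0));
  printf ("fastcall:   %s\n", scratch_reg_example (2, 1, 0, 0));
  return 0;
}
#endif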
13059 /* Release a scratch register obtained from the preceding function. */
13061 static void
13062 release_scratch_register_on_entry (struct scratch_reg *sr)
13064 if (sr->saved)
13066 struct machine_function *m = cfun->machine;
13067 rtx x, insn = emit_insn (gen_pop (sr->reg));
13069 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13070 RTX_FRAME_RELATED_P (insn) = 1;
13071 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13072 x = gen_rtx_SET (stack_pointer_rtx, x);
13073 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13074 m->fs.sp_offset -= UNITS_PER_WORD;
13078 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13080 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13082 static void
13083 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13085 /* We skip the probe for the first interval + a small dope of 4 words and
13086 probe that many bytes past the specified size to maintain a protection
13087 area at the bottom of the stack. */
13088 const int dope = 4 * UNITS_PER_WORD;
13089 rtx size_rtx = GEN_INT (size), last;
13091 /* See if we have a constant small number of probes to generate. If so,
13092 that's the easy case. The run-time loop is made up of 9 insns in the
13093 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13094 for n # of intervals. */
13095 if (size <= 4 * PROBE_INTERVAL)
13097 HOST_WIDE_INT i, adjust;
13098 bool first_probe = true;
13100 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13101 values of N from 1 until it exceeds SIZE. If only one probe is
13102 needed, this will not generate any code. Then adjust and probe
13103 to PROBE_INTERVAL + SIZE. */
13104 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13106 if (first_probe)
13108 adjust = 2 * PROBE_INTERVAL + dope;
13109 first_probe = false;
13111 else
13112 adjust = PROBE_INTERVAL;
13114 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13115 plus_constant (Pmode, stack_pointer_rtx,
13116 -adjust)));
13117 emit_stack_probe (stack_pointer_rtx);
13120 if (first_probe)
13121 adjust = size + PROBE_INTERVAL + dope;
13122 else
13123 adjust = size + PROBE_INTERVAL - i;
13125 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13126 plus_constant (Pmode, stack_pointer_rtx,
13127 -adjust)));
13128 emit_stack_probe (stack_pointer_rtx);
13130 /* Adjust back to account for the additional first interval. */
13131 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13132 plus_constant (Pmode, stack_pointer_rtx,
13133 PROBE_INTERVAL + dope)));
13136 /* Otherwise, do the same as above, but in a loop. Note that we must be
13137 extra careful with variables wrapping around because we might be at
13138 the very top (or the very bottom) of the address space and we have
13139 to be able to handle this case properly; in particular, we use an
13140 equality test for the loop condition. */
13141 else
13143 HOST_WIDE_INT rounded_size;
13144 struct scratch_reg sr;
13146 get_scratch_register_on_entry (&sr);
13149 /* Step 1: round SIZE to the previous multiple of the interval. */
13151 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13154 /* Step 2: compute initial and final value of the loop counter. */
13156 /* SP = SP_0 + PROBE_INTERVAL. */
13157 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13158 plus_constant (Pmode, stack_pointer_rtx,
13159 - (PROBE_INTERVAL + dope))));
13161 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13162 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13163 emit_insn (gen_rtx_SET (sr.reg,
13164 plus_constant (Pmode, stack_pointer_rtx,
13165 -rounded_size)));
13166 else
13168 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13169 emit_insn (gen_rtx_SET (sr.reg,
13170 gen_rtx_PLUS (Pmode, sr.reg,
13171 stack_pointer_rtx)));
13175 /* Step 3: the loop
13179 SP = SP + PROBE_INTERVAL
13180 probe at SP
13182 while (SP != LAST_ADDR)
13184 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13185 values of N from 1 until it is equal to ROUNDED_SIZE. */
13187 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13190 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13191 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13193 if (size != rounded_size)
13195 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13196 plus_constant (Pmode, stack_pointer_rtx,
13197 rounded_size - size)));
13198 emit_stack_probe (stack_pointer_rtx);
13201 /* Adjust back to account for the additional first interval. */
13202 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13203 plus_constant (Pmode, stack_pointer_rtx,
13204 PROBE_INTERVAL + dope)));
13206 release_scratch_register_on_entry (&sr);
13209 /* Even if the stack pointer isn't the CFA register, we need to correctly
13210 describe the adjustments made to it, in particular differentiate the
13211 frame-related ones from the frame-unrelated ones. */
13212 if (size > 0)
13214 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13215 XVECEXP (expr, 0, 0)
13216 = gen_rtx_SET (stack_pointer_rtx,
13217 plus_constant (Pmode, stack_pointer_rtx, -size));
13218 XVECEXP (expr, 0, 1)
13219 = gen_rtx_SET (stack_pointer_rtx,
13220 plus_constant (Pmode, stack_pointer_rtx,
13221 PROBE_INTERVAL + dope + size));
13222 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13223 RTX_FRAME_RELATED_P (last) = 1;
13225 cfun->machine->fs.sp_offset += size;
13228 /* Make sure nothing is scheduled before we are done. */
13229 emit_insn (gen_blockage ());
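/* Illustrative sketch (not part of the build): the probe schedule used in the
   constant small-size case above.  PROBE_INTERVAL and the 4-word dope are
   hard-coded (4096 and 32) purely for the example.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const long interval = 4096, dope = 32;
  long size = 10000;            /* bytes the prologue must allocate */
  long sp = 0, i;               /* SP delta relative to function entry */
  int first = 1;

  for (i = interval; i < size; i += interval)
    {
      sp -= first ? 2 * interval + dope : interval;
      first = 0;
      printf ("probe at sp%+ld\n", sp);
    }
  sp -= first ? size + interval + dope : size + interval - i;
  printf ("probe at sp%+ld\n", sp);

  sp += interval + dope;        /* give back the skipped first interval */
  printf ("final sp delta %+ld (== -size)\n", sp);
  return 0;
}
#endif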
13232 /* Adjust the stack pointer up to REG while probing it. */
13234 const char *
13235 output_adjust_stack_and_probe (rtx reg)
13237 static int labelno = 0;
13238 char loop_lab[32];
13239 rtx xops[2];
13241 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13243 /* Loop. */
13244 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13246 /* SP = SP + PROBE_INTERVAL. */
13247 xops[0] = stack_pointer_rtx;
13248 xops[1] = GEN_INT (PROBE_INTERVAL);
13249 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13251 /* Probe at SP. */
13252 xops[1] = const0_rtx;
13253 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13255 /* Test if SP == LAST_ADDR. */
13256 xops[0] = stack_pointer_rtx;
13257 xops[1] = reg;
13258 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13260 /* Branch. */
13261 fputs ("\tjne\t", asm_out_file);
13262 assemble_name_raw (asm_out_file, loop_lab);
13263 fputc ('\n', asm_out_file);
13265 return "";
13268 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13269 inclusive. These are offsets from the current stack pointer. */
13271 static void
13272 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
13274 /* See if we have a constant small number of probes to generate. If so,
13275 that's the easy case. The run-time loop is made up of 6 insns in the
13276 generic case while the compile-time loop is made up of n insns for n #
13277 of intervals. */
13278 if (size <= 6 * PROBE_INTERVAL)
13280 HOST_WIDE_INT i;
13282 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
13283 it exceeds SIZE. If only one probe is needed, this will not
13284 generate any code. Then probe at FIRST + SIZE. */
13285 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13286 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13287 -(first + i)));
13289 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
13290 -(first + size)));
13293 /* Otherwise, do the same as above, but in a loop. Note that we must be
13294 extra careful with variables wrapping around because we might be at
13295 the very top (or the very bottom) of the address space and we have
13296 to be able to handle this case properly; in particular, we use an
13297 equality test for the loop condition. */
13298 else
13300 HOST_WIDE_INT rounded_size, last;
13301 struct scratch_reg sr;
13303 get_scratch_register_on_entry (&sr);
13306 /* Step 1: round SIZE to the previous multiple of the interval. */
13308 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13311 /* Step 2: compute initial and final value of the loop counter. */
13313 /* TEST_OFFSET = FIRST. */
13314 emit_move_insn (sr.reg, GEN_INT (-first));
13316 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13317 last = first + rounded_size;
13320 /* Step 3: the loop
13324 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13325 probe at TEST_ADDR
13327 while (TEST_ADDR != LAST_ADDR)
13329 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13330 until it is equal to ROUNDED_SIZE. */
13332 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13335 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13336 that SIZE is equal to ROUNDED_SIZE. */
13338 if (size != rounded_size)
13339 emit_stack_probe (plus_constant (Pmode,
13340 gen_rtx_PLUS (Pmode,
13341 stack_pointer_rtx,
13342 sr.reg),
13343 rounded_size - size));
13345 release_scratch_register_on_entry (&sr);
13348 /* Make sure nothing is scheduled before we are done. */
13349 emit_insn (gen_blockage ());
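/* Illustrative sketch (not part of the build): the fixed-SP probe schedule
   used in the constant small-size case above.  PROBE_INTERVAL is taken to
   be 4096 and the FIRST/SIZE values are made up for the example.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const long interval = 4096;
  long first = 128;             /* skip the part already protected */
  long size = 9000;
  long i;

  for (i = interval; i < size; i += interval)
    printf ("probe at sp-%ld\n", first + i);
  printf ("probe at sp-%ld\n", first + size);
  return 0;
}
#endif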
13352 /* Probe a range of stack addresses from REG to END, inclusive. These are
13353 offsets from the current stack pointer. */
13355 const char *
13356 output_probe_stack_range (rtx reg, rtx end)
13358 static int labelno = 0;
13359 char loop_lab[32];
13360 rtx xops[3];
13362 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13364 /* Loop. */
13365 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13367 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13368 xops[0] = reg;
13369 xops[1] = GEN_INT (PROBE_INTERVAL);
13370 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13372 /* Probe at TEST_ADDR. */
13373 xops[0] = stack_pointer_rtx;
13374 xops[1] = reg;
13375 xops[2] = const0_rtx;
13376 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13378 /* Test if TEST_ADDR == LAST_ADDR. */
13379 xops[0] = reg;
13380 xops[1] = end;
13381 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13383 /* Branch. */
13384 fputs ("\tjne\t", asm_out_file);
13385 assemble_name_raw (asm_out_file, loop_lab);
13386 fputc ('\n', asm_out_file);
13388 return "";
13391 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
13392 to be generated in correct form. */
13393 static void
13394 ix86_finalize_stack_realign_flags (void)
13396 /* Check if stack realign is really needed after reload, and
13397 store the result in cfun. */
13398 unsigned int incoming_stack_boundary
13399 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13400 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13401 unsigned int stack_realign
13402 = (incoming_stack_boundary
13403 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13404 ? crtl->max_used_stack_slot_alignment
13405 : crtl->stack_alignment_needed));
13407 if (crtl->stack_realign_finalized)
13409 /* After stack_realign_needed is finalized, we can no longer
13410 change it. */
13411 gcc_assert (crtl->stack_realign_needed == stack_realign);
13412 return;
13415 /* If the only reason for frame_pointer_needed is that we conservatively
13416 assumed stack realignment might be needed, but in the end nothing that
13417 needed the stack alignment had been spilled, clear frame_pointer_needed
13418 and say we don't need stack realignment. */
13419 if (stack_realign
13420 && frame_pointer_needed
13421 && crtl->is_leaf
13422 && flag_omit_frame_pointer
13423 && crtl->sp_is_unchanging
13424 && !ix86_current_function_calls_tls_descriptor
13425 && !crtl->accesses_prior_frames
13426 && !cfun->calls_alloca
13427 && !crtl->calls_eh_return
13428 /* See ira_setup_eliminable_regset for the rationale. */
13429 && !(STACK_CHECK_MOVING_SP
13430 && flag_stack_check
13431 && flag_exceptions
13432 && cfun->can_throw_non_call_exceptions)
13433 && !ix86_frame_pointer_required ()
13434 && get_frame_size () == 0
13435 && ix86_nsaved_sseregs () == 0
13436 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13438 HARD_REG_SET set_up_by_prologue, prologue_used;
13439 basic_block bb;
13441 CLEAR_HARD_REG_SET (prologue_used);
13442 CLEAR_HARD_REG_SET (set_up_by_prologue);
13443 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13444 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13445 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13446 HARD_FRAME_POINTER_REGNUM);
13447 FOR_EACH_BB_FN (bb, cfun)
13449 rtx_insn *insn;
13450 FOR_BB_INSNS (bb, insn)
13451 if (NONDEBUG_INSN_P (insn)
13452 && requires_stack_frame_p (insn, prologue_used,
13453 set_up_by_prologue))
13455 crtl->stack_realign_needed = stack_realign;
13456 crtl->stack_realign_finalized = true;
13457 return;
13461 /* If drap has been set, but it actually isn't live at the start
13462 of the function, there is no reason to set it up. */
13463 if (crtl->drap_reg)
13465 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13466 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
13468 crtl->drap_reg = NULL_RTX;
13469 crtl->need_drap = false;
13472 else
13473 cfun->machine->no_drap_save_restore = true;
13475 frame_pointer_needed = false;
13476 stack_realign = false;
13477 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13478 crtl->stack_alignment_needed = incoming_stack_boundary;
13479 crtl->stack_alignment_estimated = incoming_stack_boundary;
13480 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13481 crtl->preferred_stack_boundary = incoming_stack_boundary;
13482 df_finish_pass (true);
13483 df_scan_alloc (NULL);
13484 df_scan_blocks ();
13485 df_compute_regs_ever_live (true);
13486 df_analyze ();
13489 crtl->stack_realign_needed = stack_realign;
13490 crtl->stack_realign_finalized = true;
13493 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13495 static void
13496 ix86_elim_entry_set_got (rtx reg)
13498 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13499 rtx_insn *c_insn = BB_HEAD (bb);
13500 if (!NONDEBUG_INSN_P (c_insn))
13501 c_insn = next_nonnote_nondebug_insn (c_insn);
13502 if (c_insn && NONJUMP_INSN_P (c_insn))
13504 rtx pat = PATTERN (c_insn);
13505 if (GET_CODE (pat) == PARALLEL)
13507 rtx vec = XVECEXP (pat, 0, 0);
13508 if (GET_CODE (vec) == SET
13509 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13510 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13511 delete_insn (c_insn);
13516 /* Expand the prologue into a bunch of separate insns. */
13518 void
13519 ix86_expand_prologue (void)
13521 struct machine_function *m = cfun->machine;
13522 rtx insn, t;
13523 struct ix86_frame frame;
13524 HOST_WIDE_INT allocate;
13525 bool int_registers_saved;
13526 bool sse_registers_saved;
13527 rtx static_chain = NULL_RTX;
13529 ix86_finalize_stack_realign_flags ();
13531 /* DRAP should not coexist with stack_realign_fp */
13532 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13534 memset (&m->fs, 0, sizeof (m->fs));
13536 /* Initialize CFA state for before the prologue. */
13537 m->fs.cfa_reg = stack_pointer_rtx;
13538 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13540 /* Track SP offset to the CFA. We continue tracking this after we've
13541 swapped the CFA register away from SP. In the case of re-alignment
13542 this is fudged; we're interested in offsets within the local frame. */
13543 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13544 m->fs.sp_valid = true;
13546 ix86_compute_frame_layout (&frame);
13548 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13550 /* We should have already generated an error for any use of
13551 ms_hook on a nested function. */
13552 gcc_checking_assert (!ix86_static_chain_on_stack);
13554 /* Check if profiling is active and we shall use profiling before
13555 prologue variant. If so sorry. */
13556 if (crtl->profile && flag_fentry != 0)
13557 sorry ("ms_hook_prologue attribute isn%'t compatible "
13558 "with -mfentry for 32-bit");
13560 /* In ix86_asm_output_function_label we emitted:
13561 8b ff movl.s %edi,%edi
13562 55 push %ebp
13563 8b ec movl.s %esp,%ebp
13565 This matches the hookable function prologue in Win32 API
13566 functions in Microsoft Windows XP Service Pack 2 and newer.
13567 Wine uses this to enable Windows apps to hook the Win32 API
13568 functions provided by Wine.
13570 What that means is that we've already set up the frame pointer. */
13572 if (frame_pointer_needed
13573 && !(crtl->drap_reg && crtl->stack_realign_needed))
13575 rtx push, mov;
13577 /* We've decided to use the frame pointer already set up.
13578 Describe this to the unwinder by pretending that both
13579 push and mov insns happen right here.
13581 Putting the unwind info here at the end of the ms_hook
13582 is done so that we can make absolutely certain we get
13583 the required byte sequence at the start of the function,
13584 rather than relying on an assembler that can produce
13585 the exact encoding required.
13587 However it does mean (in the unpatched case) that we have
13588 a 1 insn window where the asynchronous unwind info is
13589 incorrect. However, if we placed the unwind info at
13590 its correct location we would have incorrect unwind info
13591 in the patched case. Which is probably all moot since
13592 I don't expect Wine generates dwarf2 unwind info for the
13593 system libraries that use this feature. */
13595 insn = emit_insn (gen_blockage ());
13597 push = gen_push (hard_frame_pointer_rtx);
13598 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13599 stack_pointer_rtx);
13600 RTX_FRAME_RELATED_P (push) = 1;
13601 RTX_FRAME_RELATED_P (mov) = 1;
13603 RTX_FRAME_RELATED_P (insn) = 1;
13604 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13605 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13607 /* Note that gen_push incremented m->fs.cfa_offset, even
13608 though we didn't emit the push insn here. */
13609 m->fs.cfa_reg = hard_frame_pointer_rtx;
13610 m->fs.fp_offset = m->fs.cfa_offset;
13611 m->fs.fp_valid = true;
13613 else
13615 /* The frame pointer is not needed so pop %ebp again.
13616 This leaves us with a pristine state. */
13617 emit_insn (gen_pop (hard_frame_pointer_rtx));
13621 /* The first insn of a function that accepts its static chain on the
13622 stack is to push the register that would be filled in by a direct
13623 call. This insn will be skipped by the trampoline. */
13624 else if (ix86_static_chain_on_stack)
13626 static_chain = ix86_static_chain (cfun->decl, false);
13627 insn = emit_insn (gen_push (static_chain));
13628 emit_insn (gen_blockage ());
13630 /* We don't want to interpret this push insn as a register save,
13631 only as a stack adjustment. The real copy of the register as
13632 a save will be done later, if needed. */
13633 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13634 t = gen_rtx_SET (stack_pointer_rtx, t);
13635 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13636 RTX_FRAME_RELATED_P (insn) = 1;
13639 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13640 DRAP is needed and stack realignment is really needed after reload. */
13641 if (stack_realign_drap)
13643 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13645 /* Can't use DRAP in interrupt function. */
13646 if (cfun->machine->func_type != TYPE_NORMAL)
13647 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13648 "in interrupt service routine. This may be worked "
13649 "around by avoiding functions with aggregate return.");
13651 /* Only need to push parameter pointer reg if it is caller saved. */
13652 if (!call_used_regs[REGNO (crtl->drap_reg)])
13654 /* Push arg pointer reg */
13655 insn = emit_insn (gen_push (crtl->drap_reg));
13656 RTX_FRAME_RELATED_P (insn) = 1;
13659 /* Grab the argument pointer. */
13660 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13661 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13662 RTX_FRAME_RELATED_P (insn) = 1;
13663 m->fs.cfa_reg = crtl->drap_reg;
13664 m->fs.cfa_offset = 0;
13666 /* Align the stack. */
13667 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13668 stack_pointer_rtx,
13669 GEN_INT (-align_bytes)));
13670 RTX_FRAME_RELATED_P (insn) = 1;
13672 /* Replicate the return address on the stack so that return
13673 address can be reached via (argp - 1) slot. This is needed
13674 to implement macro RETURN_ADDR_RTX and intrinsic function
13675 expand_builtin_return_addr etc. */
13676 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13677 t = gen_frame_mem (word_mode, t);
13678 insn = emit_insn (gen_push (t));
13679 RTX_FRAME_RELATED_P (insn) = 1;
13681 /* For the purposes of frame and register save area addressing,
13682 we've started over with a new frame. */
13683 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13684 m->fs.realigned = true;
13686 if (static_chain)
13688 /* Replicate static chain on the stack so that static chain
13689 can be reached via (argp - 2) slot. This is needed for
13690 nested function with stack realignment. */
13691 insn = emit_insn (gen_push (static_chain));
13692 RTX_FRAME_RELATED_P (insn) = 1;
13696 int_registers_saved = (frame.nregs == 0);
13697 sse_registers_saved = (frame.nsseregs == 0);
13699 if (frame_pointer_needed && !m->fs.fp_valid)
13701 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13702 slower on all targets. Also sdb doesn't like it. */
13703 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13704 RTX_FRAME_RELATED_P (insn) = 1;
13706 /* Push registers now, before setting the frame pointer
13707 on SEH target. */
13708 if (!int_registers_saved
13709 && TARGET_SEH
13710 && !frame.save_regs_using_mov)
13712 ix86_emit_save_regs ();
13713 int_registers_saved = true;
13714 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13717 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13719 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13720 RTX_FRAME_RELATED_P (insn) = 1;
13722 if (m->fs.cfa_reg == stack_pointer_rtx)
13723 m->fs.cfa_reg = hard_frame_pointer_rtx;
13724 m->fs.fp_offset = m->fs.sp_offset;
13725 m->fs.fp_valid = true;
13729 if (!int_registers_saved)
13731 /* If saving registers via PUSH, do so now. */
13732 if (!frame.save_regs_using_mov)
13734 ix86_emit_save_regs ();
13735 int_registers_saved = true;
13736 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13739 /* When using the red zone we may start register saving before allocating
13740 the stack frame, saving one cycle of the prologue. However, avoid
13741 doing this if we have to probe the stack; at least on x86_64 the
13742 stack probe can turn into a call that clobbers a red zone location. */
13743 else if (ix86_using_red_zone ()
13744 && (! TARGET_STACK_PROBE
13745 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13747 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13748 int_registers_saved = true;
13752 if (stack_realign_fp)
13754 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13755 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13757 /* The computation of the size of the re-aligned stack frame means
13758 that we must allocate the size of the register save area before
13759 performing the actual alignment. Otherwise we cannot guarantee
13760 that there's enough storage above the realignment point. */
13761 if (m->fs.sp_offset != frame.sse_reg_save_offset)
13762 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13763 GEN_INT (m->fs.sp_offset
13764 - frame.sse_reg_save_offset),
13765 -1, false);
13767 /* Align the stack. */
13768 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13769 stack_pointer_rtx,
13770 GEN_INT (-align_bytes)));
13772 /* For the purposes of register save area addressing, the stack
13773 pointer is no longer valid. As for the value of sp_offset,
13774 see ix86_compute_frame_layout, which we need to match in order
13775 to pass verification of stack_pointer_offset at the end. */
13776 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13777 m->fs.sp_valid = false;
13780 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13782 if (flag_stack_usage_info)
13784 /* We start to count from ARG_POINTER. */
13785 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13787 /* If it was realigned, take into account the fake frame. */
13788 if (stack_realign_drap)
13790 if (ix86_static_chain_on_stack)
13791 stack_size += UNITS_PER_WORD;
13793 if (!call_used_regs[REGNO (crtl->drap_reg)])
13794 stack_size += UNITS_PER_WORD;
13796 /* This over-estimates by 1 minimal-stack-alignment-unit but
13797 mitigates that by counting in the new return address slot. */
13798 current_function_dynamic_stack_size
13799 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13802 current_function_static_stack_size = stack_size;
13805 /* On SEH target with very large frame size, allocate an area to save
13806 SSE registers (as the very large allocation won't be described). */
13807 if (TARGET_SEH
13808 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13809 && !sse_registers_saved)
13811 HOST_WIDE_INT sse_size =
13812 frame.sse_reg_save_offset - frame.reg_save_offset;
13814 gcc_assert (int_registers_saved);
13816 /* No need to do stack checking as the area will be immediately
13817 written. */
13818 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13819 GEN_INT (-sse_size), -1,
13820 m->fs.cfa_reg == stack_pointer_rtx);
13821 allocate -= sse_size;
13822 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13823 sse_registers_saved = true;
13826 /* The stack has already been decremented by the instruction calling us
13827 so probe if the size is non-negative to preserve the protection area. */
13828 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
13830 /* We expect the registers to be saved when probes are used. */
13831 gcc_assert (int_registers_saved);
13833 if (STACK_CHECK_MOVING_SP)
13835 if (!(crtl->is_leaf && !cfun->calls_alloca
13836 && allocate <= PROBE_INTERVAL))
13838 ix86_adjust_stack_and_probe (allocate);
13839 allocate = 0;
13842 else
13844 HOST_WIDE_INT size = allocate;
13846 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13847 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
13849 if (TARGET_STACK_PROBE)
13851 if (crtl->is_leaf && !cfun->calls_alloca)
13853 if (size > PROBE_INTERVAL)
13854 ix86_emit_probe_stack_range (0, size);
13856 else
13857 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
13859 else
13861 if (crtl->is_leaf && !cfun->calls_alloca)
13863 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
13864 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
13865 size - STACK_CHECK_PROTECT);
13867 else
13868 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
13873 if (allocate == 0)
13875 else if (!ix86_target_stack_probe ()
13876 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13878 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13879 GEN_INT (-allocate), -1,
13880 m->fs.cfa_reg == stack_pointer_rtx);
13882 else
13884 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13885 rtx r10 = NULL;
13886 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13887 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13888 bool eax_live = ix86_eax_live_at_start_p ();
13889 bool r10_live = false;
13891 if (TARGET_64BIT)
13892 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13894 if (eax_live)
13896 insn = emit_insn (gen_push (eax));
13897 allocate -= UNITS_PER_WORD;
13898 /* Note that SEH directives need to continue tracking the stack
13899 pointer even after the frame pointer has been set up. */
13900 if (sp_is_cfa_reg || TARGET_SEH)
13902 if (sp_is_cfa_reg)
13903 m->fs.cfa_offset += UNITS_PER_WORD;
13904 RTX_FRAME_RELATED_P (insn) = 1;
13905 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13906 gen_rtx_SET (stack_pointer_rtx,
13907 plus_constant (Pmode, stack_pointer_rtx,
13908 -UNITS_PER_WORD)));
13912 if (r10_live)
13914 r10 = gen_rtx_REG (Pmode, R10_REG);
13915 insn = emit_insn (gen_push (r10));
13916 allocate -= UNITS_PER_WORD;
13917 if (sp_is_cfa_reg || TARGET_SEH)
13919 if (sp_is_cfa_reg)
13920 m->fs.cfa_offset += UNITS_PER_WORD;
13921 RTX_FRAME_RELATED_P (insn) = 1;
13922 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13923 gen_rtx_SET (stack_pointer_rtx,
13924 plus_constant (Pmode, stack_pointer_rtx,
13925 -UNITS_PER_WORD)));
13929 emit_move_insn (eax, GEN_INT (allocate));
13930 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13932 /* Use the fact that AX still contains ALLOCATE. */
13933 adjust_stack_insn = (Pmode == DImode
13934 ? gen_pro_epilogue_adjust_stack_di_sub
13935 : gen_pro_epilogue_adjust_stack_si_sub);
13937 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13938 stack_pointer_rtx, eax));
13940 if (sp_is_cfa_reg || TARGET_SEH)
13942 if (sp_is_cfa_reg)
13943 m->fs.cfa_offset += allocate;
13944 RTX_FRAME_RELATED_P (insn) = 1;
13945 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13946 gen_rtx_SET (stack_pointer_rtx,
13947 plus_constant (Pmode, stack_pointer_rtx,
13948 -allocate)));
13950 m->fs.sp_offset += allocate;
13952 /* Use stack_pointer_rtx for relative addressing so that code
13953 works for realigned stack, too. */
13954 if (r10_live && eax_live)
13956 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13957 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13958 gen_frame_mem (word_mode, t));
13959 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13960 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13961 gen_frame_mem (word_mode, t));
13963 else if (eax_live || r10_live)
13965 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13966 emit_move_insn (gen_rtx_REG (word_mode,
13967 (eax_live ? AX_REG : R10_REG)),
13968 gen_frame_mem (word_mode, t));
13971 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13973 /* If we haven't already set up the frame pointer, do so now. */
13974 if (frame_pointer_needed && !m->fs.fp_valid)
13976 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13977 GEN_INT (frame.stack_pointer_offset
13978 - frame.hard_frame_pointer_offset));
13979 insn = emit_insn (insn);
13980 RTX_FRAME_RELATED_P (insn) = 1;
13981 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13983 if (m->fs.cfa_reg == stack_pointer_rtx)
13984 m->fs.cfa_reg = hard_frame_pointer_rtx;
13985 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13986 m->fs.fp_valid = true;
13989 if (!int_registers_saved)
13990 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13991 if (!sse_registers_saved)
13992 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13994 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
13995 in PROLOGUE. */
13996 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13998 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13999 insn = emit_insn (gen_set_got (pic));
14000 RTX_FRAME_RELATED_P (insn) = 1;
14001 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14002 emit_insn (gen_prologue_use (pic));
14003 /* Delete an already emitted SET_GOT if it exists and is allocated to
14004 REAL_PIC_OFFSET_TABLE_REGNUM. */
14005 ix86_elim_entry_set_got (pic);
14008 if (crtl->drap_reg && !crtl->stack_realign_needed)
14010 /* vDRAP is set up, but after reload it turns out that stack realignment
14011 isn't necessary; here we emit prologue code to set up DRAP
14012 without the stack realignment adjustment. */
14013 t = choose_baseaddr (0);
14014 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14017 /* Prevent instructions from being scheduled into register save push
14018 sequence when access to the redzone area is done through frame pointer.
14019 The offset between the frame pointer and the stack pointer is calculated
14020 relative to the value of the stack pointer at the end of the function
14021 prologue, and moving instructions that access redzone area via frame
14022 pointer inside push sequence violates this assumption. */
14023 if (frame_pointer_needed && frame.red_zone_size)
14024 emit_insn (gen_memory_blockage ());
14026 /* SEH requires that the prologue end within 256 bytes of the start of
14027 the function. Prevent instruction schedules that would extend that.
14028 Further, prevent alloca modifications to the stack pointer from being
14029 combined with prologue modifications. */
14030 if (TARGET_SEH)
14031 emit_insn (gen_prologue_use (stack_pointer_rtx));
14034 /* Emit code to restore REG using a POP insn. */
14036 static void
14037 ix86_emit_restore_reg_using_pop (rtx reg)
14039 struct machine_function *m = cfun->machine;
14040 rtx_insn *insn = emit_insn (gen_pop (reg));
14042 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14043 m->fs.sp_offset -= UNITS_PER_WORD;
14045 if (m->fs.cfa_reg == crtl->drap_reg
14046 && REGNO (reg) == REGNO (crtl->drap_reg))
14048 /* Previously we'd represented the CFA as an expression
14049 like *(%ebp - 8). We've just popped that value from
14050 the stack, which means we need to reset the CFA to
14051 the drap register. This will remain until we restore
14052 the stack pointer. */
14053 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14054 RTX_FRAME_RELATED_P (insn) = 1;
14056 /* This means that the DRAP register is valid for addressing too. */
14057 m->fs.drap_valid = true;
14058 return;
14061 if (m->fs.cfa_reg == stack_pointer_rtx)
14063 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14064 x = gen_rtx_SET (stack_pointer_rtx, x);
14065 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14066 RTX_FRAME_RELATED_P (insn) = 1;
14068 m->fs.cfa_offset -= UNITS_PER_WORD;
14071 /* When the frame pointer is the CFA, and we pop it, we are
14072 swapping back to the stack pointer as the CFA. This happens
14073 for stack frames that don't allocate other data, so we assume
14074 the stack pointer is now pointing at the return address, i.e.
14075 the function entry state, which makes the offset be 1 word. */
14076 if (reg == hard_frame_pointer_rtx)
14078 m->fs.fp_valid = false;
14079 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14081 m->fs.cfa_reg = stack_pointer_rtx;
14082 m->fs.cfa_offset -= UNITS_PER_WORD;
14084 add_reg_note (insn, REG_CFA_DEF_CFA,
14085 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14086 GEN_INT (m->fs.cfa_offset)));
14087 RTX_FRAME_RELATED_P (insn) = 1;
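/* Illustrative sketch (not part of the build): how popping a register moves
   the tracked offsets above when the stack pointer is the CFA register.
   UNITS_PER_WORD is taken to be 8 only for the example.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long word = 8;
  long sp_offset = 24;          /* CFA - SP before the pop */
  long cfa_offset = 24;         /* CFA is currently SP + cfa_offset */

  /* pop %reg: SP rises by one word, so both distances shrink.  */
  sp_offset -= word;
  cfa_offset -= word;

  printf ("sp_offset = %ld, cfa_offset = %ld\n", sp_offset, cfa_offset);
  return 0;
}
#endif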
14092 /* Emit code to restore saved registers using POP insns. */
14094 static void
14095 ix86_emit_restore_regs_using_pop (void)
14097 unsigned int regno;
14099 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14100 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false))
14101 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14104 /* Emit code and notes for the LEAVE instruction. */
14106 static void
14107 ix86_emit_leave (void)
14109 struct machine_function *m = cfun->machine;
14110 rtx_insn *insn = emit_insn (ix86_gen_leave ());
14112 ix86_add_queued_cfa_restore_notes (insn);
14114 gcc_assert (m->fs.fp_valid);
14115 m->fs.sp_valid = true;
14116 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14117 m->fs.fp_valid = false;
14119 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14121 m->fs.cfa_reg = stack_pointer_rtx;
14122 m->fs.cfa_offset = m->fs.sp_offset;
14124 add_reg_note (insn, REG_CFA_DEF_CFA,
14125 plus_constant (Pmode, stack_pointer_rtx,
14126 m->fs.sp_offset));
14127 RTX_FRAME_RELATED_P (insn) = 1;
14129 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14130 m->fs.fp_offset);
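/* Illustrative sketch (not part of the build): the state transition that
   "leave" implies for the tracked frame state above.  UNITS_PER_WORD is
   taken to be 8 only for the example.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long word = 8;
  long fp_offset = 16;          /* CFA - %rbp while the frame is live */
  long sp_offset;
  int fp_valid = 1, sp_valid = 0;

  /* leave == mov %rbp, %rsp; pop %rbp: SP lands one word above the old
     frame pointer slot and the frame pointer stops being a valid base.  */
  sp_offset = fp_offset - word;
  sp_valid = 1;
  fp_valid = 0;

  printf ("sp_offset = %ld, sp_valid = %d, fp_valid = %d\n",
          sp_offset, sp_valid, fp_valid);
  return 0;
}
#endif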
14133 /* Emit code to restore saved registers using MOV insns.
14134 First register is restored from CFA - CFA_OFFSET. */
14135 static void
14136 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14137 bool maybe_eh_return)
14139 struct machine_function *m = cfun->machine;
14140 unsigned int regno;
14142 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14143 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14145 rtx reg = gen_rtx_REG (word_mode, regno);
14146 rtx mem;
14147 rtx_insn *insn;
14149 mem = choose_baseaddr (cfa_offset);
14150 mem = gen_frame_mem (word_mode, mem);
14151 insn = emit_move_insn (reg, mem);
14153 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14155 /* Previously we'd represented the CFA as an expression
14156 like *(%ebp - 8). We've just popped that value from
14157 the stack, which means we need to reset the CFA to
14158 the drap register. This will remain until we restore
14159 the stack pointer. */
14160 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14161 RTX_FRAME_RELATED_P (insn) = 1;
14163 /* This means that the DRAP register is valid for addressing. */
14164 m->fs.drap_valid = true;
14166 else
14167 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14169 cfa_offset -= UNITS_PER_WORD;
14173 /* Emit code to restore saved SSE registers using MOV insns.
14174 First register is restored from CFA - CFA_OFFSET. */
14175 static void
14176 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14177 bool maybe_eh_return)
14179 unsigned int regno;
14181 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14182 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
14184 rtx reg = gen_rtx_REG (V4SFmode, regno);
14185 rtx mem;
14186 unsigned int align;
14188 mem = choose_baseaddr (cfa_offset);
14189 mem = gen_rtx_MEM (V4SFmode, mem);
14191 /* The location is aligned up to INCOMING_STACK_BOUNDARY. */
14192 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), INCOMING_STACK_BOUNDARY);
14193 set_mem_align (mem, align);
14194 emit_insn (gen_rtx_SET (reg, mem));
14196 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14198 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14202 /* Restore function stack, frame, and registers. */
14204 void
14205 ix86_expand_epilogue (int style)
14207 struct machine_function *m = cfun->machine;
14208 struct machine_frame_state frame_state_save = m->fs;
14209 struct ix86_frame frame;
14210 bool restore_regs_via_mov;
14211 bool using_drap;
14213 ix86_finalize_stack_realign_flags ();
14214 ix86_compute_frame_layout (&frame);
14216 m->fs.sp_valid = (!frame_pointer_needed
14217 || (crtl->sp_is_unchanging
14218 && !stack_realign_fp));
14219 gcc_assert (!m->fs.sp_valid
14220 || m->fs.sp_offset == frame.stack_pointer_offset);
14222 /* The FP must be valid if the frame pointer is present. */
14223 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14224 gcc_assert (!m->fs.fp_valid
14225 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14227 /* We must have *some* valid pointer to the stack frame. */
14228 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14230 /* The DRAP is never valid at this point. */
14231 gcc_assert (!m->fs.drap_valid);
14233 /* See the comment about red zone and frame
14234 pointer usage in ix86_expand_prologue. */
14235 if (frame_pointer_needed && frame.red_zone_size)
14236 emit_insn (gen_memory_blockage ());
14238 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14239 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14241 /* Determine the CFA offset of the end of the red-zone. */
14242 m->fs.red_zone_offset = 0;
14243 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14245 /* The red-zone begins below the return address. */
14246 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
14248 /* When the register save area is in the aligned portion of
14249 the stack, determine the maximum runtime displacement that
14250 matches up with the aligned frame. */
14251 if (stack_realign_drap)
14252 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14253 + UNITS_PER_WORD);
14256 /* Special care must be taken for the normal return case of a function
14257 using eh_return: the eax and edx registers are marked as saved, but
14258 not restored along this path. Adjust the save location to match. */
14259 if (crtl->calls_eh_return && style != 2)
14260 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
14262 /* EH_RETURN requires the use of moves to function properly. */
14263 if (crtl->calls_eh_return)
14264 restore_regs_via_mov = true;
14265 /* SEH requires the use of pops to identify the epilogue. */
14266 else if (TARGET_SEH)
14267 restore_regs_via_mov = false;
14268 /* If we're only restoring one register and sp is not valid then
14269 use a move instruction to restore the register, since it's
14270 less work than reloading sp and popping the register. */
14271 else if (!m->fs.sp_valid && frame.nregs <= 1)
14272 restore_regs_via_mov = true;
14273 else if (TARGET_EPILOGUE_USING_MOVE
14274 && cfun->machine->use_fast_prologue_epilogue
14275 && (frame.nregs > 1
14276 || m->fs.sp_offset != frame.reg_save_offset))
14277 restore_regs_via_mov = true;
14278 else if (frame_pointer_needed
14279 && !frame.nregs
14280 && m->fs.sp_offset != frame.reg_save_offset)
14281 restore_regs_via_mov = true;
14282 else if (frame_pointer_needed
14283 && TARGET_USE_LEAVE
14284 && cfun->machine->use_fast_prologue_epilogue
14285 && frame.nregs == 1)
14286 restore_regs_via_mov = true;
14287 else
14288 restore_regs_via_mov = false;
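/* At this point restore_regs_via_mov says whether the saved general
   registers will be reloaded with explicit moves from their stack slots
   or popped off the stack one at a time.  */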
14290 if (restore_regs_via_mov || frame.nsseregs)
14292 /* Ensure that the entire register save area is addressable via
14293 the stack pointer, if we will restore via sp. */
14294 if (TARGET_64BIT
14295 && m->fs.sp_offset > 0x7fffffff
14296 && !(m->fs.fp_valid || m->fs.drap_valid)
14297 && (frame.nsseregs + frame.nregs) != 0)
14299 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14300 GEN_INT (m->fs.sp_offset
14301 - frame.sse_reg_save_offset),
14302 style,
14303 m->fs.cfa_reg == stack_pointer_rtx);
14307 /* If there are any SSE registers to restore, then we have to do it
14308 via moves, since there's obviously no pop for SSE regs. */
14309 if (frame.nsseregs)
14310 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14311 style == 2);
14313 if (restore_regs_via_mov)
14315 rtx t;
14317 if (frame.nregs)
14318 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14320 /* eh_return epilogues need %ecx added to the stack pointer. */
14321 if (style == 2)
14323 rtx sa = EH_RETURN_STACKADJ_RTX;
14324 rtx_insn *insn;
14326 /* %ecx can't be used for both DRAP register and eh_return. */
14327 if (crtl->drap_reg)
14328 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14330 /* regparm nested functions don't work with eh_return. */
14331 gcc_assert (!ix86_static_chain_on_stack);
14333 if (frame_pointer_needed)
14335 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14336 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14337 emit_insn (gen_rtx_SET (sa, t));
14339 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14340 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14342 /* Note that we use SA as a temporary CFA, as the return
14343 address is at the proper place relative to it. We
14344 pretend this happens at the FP restore insn because
14345 prior to this insn the FP would be stored at the wrong
14346 offset relative to SA, and after this insn we have no
14347 other reasonable register to use for the CFA. We don't
14348 bother resetting the CFA to the SP for the duration of
14349 the return insn. */
14350 add_reg_note (insn, REG_CFA_DEF_CFA,
14351 plus_constant (Pmode, sa, UNITS_PER_WORD));
14352 ix86_add_queued_cfa_restore_notes (insn);
14353 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14354 RTX_FRAME_RELATED_P (insn) = 1;
14356 m->fs.cfa_reg = sa;
14357 m->fs.cfa_offset = UNITS_PER_WORD;
14358 m->fs.fp_valid = false;
14360 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14361 const0_rtx, style, false);
14363 else
14365 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14366 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14367 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14368 ix86_add_queued_cfa_restore_notes (insn);
14370 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14371 if (m->fs.cfa_offset != UNITS_PER_WORD)
14373 m->fs.cfa_offset = UNITS_PER_WORD;
14374 add_reg_note (insn, REG_CFA_DEF_CFA,
14375 plus_constant (Pmode, stack_pointer_rtx,
14376 UNITS_PER_WORD));
14377 RTX_FRAME_RELATED_P (insn) = 1;
14380 m->fs.sp_offset = UNITS_PER_WORD;
14381 m->fs.sp_valid = true;
14384 else
14386 /* SEH requires that the function end with (1) a stack adjustment
14387 if necessary, (2) a sequence of pops, and (3) a return or
14388 jump instruction. Prevent insns from the function body from
14389 being scheduled into this sequence. */
14390 if (TARGET_SEH)
14392 /* Prevent a catch region from being adjacent to the standard
14393 epilogue sequence.  Unfortunately neither crtl->uses_eh_lsda nor
14394 several other flags that would be interesting to test are
14395 set up yet.  */
14396 if (flag_non_call_exceptions)
14397 emit_insn (gen_nops (const1_rtx));
14398 else
14399 emit_insn (gen_blockage ());
14402 /* First step is to deallocate the stack frame so that we can
14403 pop the registers. Also do it on SEH target for very large
14404 frame as the emitted instructions aren't allowed by the ABI in
14405 epilogues. */
14406 if (!m->fs.sp_valid
14407 || (TARGET_SEH
14408 && (m->fs.sp_offset - frame.reg_save_offset
14409 >= SEH_MAX_FRAME_SIZE)))
14411 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14412 GEN_INT (m->fs.fp_offset
14413 - frame.reg_save_offset),
14414 style, false);
14416 else if (m->fs.sp_offset != frame.reg_save_offset)
14418 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14419 GEN_INT (m->fs.sp_offset
14420 - frame.reg_save_offset),
14421 style,
14422 m->fs.cfa_reg == stack_pointer_rtx);
14425 ix86_emit_restore_regs_using_pop ();
14428 /* If we used a frame pointer and haven't already got rid of it,
14429 then do so now. */
14430 if (m->fs.fp_valid)
14432 /* If the stack pointer is valid and pointing at the frame
14433 pointer store address, then we only need a pop. */
14434 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
14435 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14436 /* Leave results in shorter dependency chains on CPUs that are
14437 able to grok it fast. */
14438 else if (TARGET_USE_LEAVE
14439 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14440 || !cfun->machine->use_fast_prologue_epilogue)
14441 ix86_emit_leave ();
14442 else
14444 pro_epilogue_adjust_stack (stack_pointer_rtx,
14445 hard_frame_pointer_rtx,
14446 const0_rtx, style, !using_drap);
14447 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14451 if (using_drap)
14453 int param_ptr_offset = UNITS_PER_WORD;
14454 rtx_insn *insn;
14456 gcc_assert (stack_realign_drap);
14458 if (ix86_static_chain_on_stack)
14459 param_ptr_offset += UNITS_PER_WORD;
14460 if (!call_used_regs[REGNO (crtl->drap_reg)])
14461 param_ptr_offset += UNITS_PER_WORD;
14463 insn = emit_insn (gen_rtx_SET
14464 (stack_pointer_rtx,
14465 gen_rtx_PLUS (Pmode,
14466 crtl->drap_reg,
14467 GEN_INT (-param_ptr_offset))));
14468 m->fs.cfa_reg = stack_pointer_rtx;
14469 m->fs.cfa_offset = param_ptr_offset;
14470 m->fs.sp_offset = param_ptr_offset;
14471 m->fs.realigned = false;
14473 add_reg_note (insn, REG_CFA_DEF_CFA,
14474 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14475 GEN_INT (param_ptr_offset)));
14476 RTX_FRAME_RELATED_P (insn) = 1;
14478 if (!call_used_regs[REGNO (crtl->drap_reg)])
14479 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14482 /* At this point the stack pointer must be valid, and we must have
14483 restored all of the registers. We may not have deallocated the
14484 entire stack frame. We've delayed this until now because it may
14485 be possible to merge the local stack deallocation with the
14486 deallocation forced by ix86_static_chain_on_stack. */
14487 gcc_assert (m->fs.sp_valid);
14488 gcc_assert (!m->fs.fp_valid);
14489 gcc_assert (!m->fs.realigned);
14490 if (m->fs.sp_offset != UNITS_PER_WORD)
14492 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14493 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14494 style, true);
14496 else
14497 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14499 /* Sibcall epilogues don't want a return instruction. */
14500 if (style == 0)
14502 m->fs = frame_state_save;
14503 return;
14506 if (cfun->machine->func_type != TYPE_NORMAL)
14508 /* Return with the "IRET" instruction from an interrupt handler.
14509 Pop the 'ERROR_CODE' off the stack before the 'IRET'
14510 instruction in an exception handler. */
14511 if (cfun->machine->func_type == TYPE_EXCEPTION)
14513 rtx r = plus_constant (Pmode, stack_pointer_rtx,
14514 UNITS_PER_WORD);
14515 emit_insn (gen_rtx_SET (stack_pointer_rtx, r));
14517 emit_jump_insn (gen_interrupt_return ());
14519 else if (crtl->args.pops_args && crtl->args.size)
14521 rtx popc = GEN_INT (crtl->args.pops_args);
14523 /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
14524 address, do an explicit add, and jump indirectly to the caller. */
14526 if (crtl->args.pops_args >= 65536)
14528 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14529 rtx_insn *insn;
14531 /* There is no "pascal" calling convention in any 64bit ABI. */
14532 gcc_assert (!TARGET_64BIT);
14534 insn = emit_insn (gen_pop (ecx));
14535 m->fs.cfa_offset -= UNITS_PER_WORD;
14536 m->fs.sp_offset -= UNITS_PER_WORD;
14538 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14539 x = gen_rtx_SET (stack_pointer_rtx, x);
14540 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14541 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14542 RTX_FRAME_RELATED_P (insn) = 1;
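/* The REG_CFA_REGISTER note above records that the return address,
   just popped from the stack, now lives in %ecx, so the unwinder can
   still locate it before the indirect jump back to the caller.  */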
14544 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14545 popc, -1, true);
14546 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14548 else
14549 emit_jump_insn (gen_simple_return_pop_internal (popc));
14551 else
14552 emit_jump_insn (gen_simple_return_internal ());
14554 /* Restore the state back to the state from the prologue,
14555 so that it's correct for the next epilogue. */
14556 m->fs = frame_state_save;
14559 /* Reset from the function's potential modifications. */
14561 static void
14562 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
14564 if (pic_offset_table_rtx
14565 && !ix86_use_pseudo_pic_reg ())
14566 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14567 #if TARGET_MACHO
14568 /* Mach-O doesn't support labels at the end of objects, so if
14569 it looks like we might want one, insert a NOP. */
14571 rtx_insn *insn = get_last_insn ();
14572 rtx_insn *deleted_debug_label = NULL;
14573 while (insn
14574 && NOTE_P (insn)
14575 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14577 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14578 notes only, instead set their CODE_LABEL_NUMBER to -1,
14579 otherwise there would be code generation differences
14580 in between -g and -g0. */
14581 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14582 deleted_debug_label = insn;
14583 insn = PREV_INSN (insn);
14585 if (insn
14586 && (LABEL_P (insn)
14587 || (NOTE_P (insn)
14588 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
14589 fputs ("\tnop\n", file);
14590 else if (deleted_debug_label)
14591 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14592 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14593 CODE_LABEL_NUMBER (insn) = -1;
14595 #endif
14599 /* Return a scratch register to use in the split stack prologue. The
14600 split stack prologue is used for -fsplit-stack.  It consists of the first
14601 instructions in the function, even before the regular prologue.
14602 The scratch register can be any caller-saved register which is not
14603 used for parameters or for the static chain. */
14605 static unsigned int
14606 split_stack_prologue_scratch_regno (void)
14608 if (TARGET_64BIT)
14609 return R11_REG;
14610 else
14612 bool is_fastcall, is_thiscall;
14613 int regparm;
14615 is_fastcall = (lookup_attribute ("fastcall",
14616 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14617 != NULL);
14618 is_thiscall = (lookup_attribute ("thiscall",
14619 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14620 != NULL);
14621 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14623 if (is_fastcall)
14625 if (DECL_STATIC_CHAIN (cfun->decl))
14627 sorry ("-fsplit-stack does not support fastcall with "
14628 "nested function");
14629 return INVALID_REGNUM;
14631 return AX_REG;
14633 else if (is_thiscall)
14635 if (!DECL_STATIC_CHAIN (cfun->decl))
14636 return DX_REG;
14637 return AX_REG;
14639 else if (regparm < 3)
14641 if (!DECL_STATIC_CHAIN (cfun->decl))
14642 return CX_REG;
14643 else
14645 if (regparm >= 2)
14647 sorry ("-fsplit-stack does not support 2 register "
14648 "parameters for a nested function");
14649 return INVALID_REGNUM;
14651 return DX_REG;
14654 else
14656 /* FIXME: We could make this work by pushing a register
14657 around the addition and comparison. */
14658 sorry ("-fsplit-stack does not support 3 register parameters");
14659 return INVALID_REGNUM;
14664 /* A SYMBOL_REF for the function which allocates new stack space for
14665 -fsplit-stack. */
14667 static GTY(()) rtx split_stack_fn;
14669 /* A SYMBOL_REF for the more stack function when using the large
14670 model. */
14672 static GTY(()) rtx split_stack_fn_large;
14674 /* Handle -fsplit-stack. These are the first instructions in the
14675 function, even before the regular prologue. */
14677 void
14678 ix86_expand_split_stack_prologue (void)
14680 struct ix86_frame frame;
14681 HOST_WIDE_INT allocate;
14682 unsigned HOST_WIDE_INT args_size;
14683 rtx_code_label *label;
14684 rtx limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
14685 rtx scratch_reg = NULL_RTX;
14686 rtx_code_label *varargs_label = NULL;
14687 rtx fn;
14689 gcc_assert (flag_split_stack && reload_completed);
14691 ix86_finalize_stack_realign_flags ();
14692 ix86_compute_frame_layout (&frame);
14693 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14695 /* This is the label we will branch to if we have enough stack
14696 space. We expect the basic block reordering pass to reverse this
14697 branch if optimizing, so that we branch in the unlikely case. */
14698 label = gen_label_rtx ();
14700 /* We need to compare the stack pointer minus the frame size with
14701 the stack boundary in the TCB. The stack boundary always gives
14702 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14703 can compare directly. Otherwise we need to do an addition. */
14705 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
14706 UNSPEC_STACK_CHECK);
14707 limit = gen_rtx_CONST (Pmode, limit);
14708 limit = gen_rtx_MEM (Pmode, limit);
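/* LIMIT is now a memory reference to the stack-boundary field in the
   TCB; the UNSPEC_STACK_CHECK address is presumably printed later as a
   thread-pointer (%fs/%gs) relative location.  */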
14709 if (allocate < SPLIT_STACK_AVAILABLE)
14710 current = stack_pointer_rtx;
14711 else
14713 unsigned int scratch_regno;
14714 rtx offset;
14716 /* We need a scratch register to hold the stack pointer minus
14717 the required frame size. Since this is the very start of the
14718 function, the scratch register can be any caller-saved
14719 register which is not used for parameters. */
14720 offset = GEN_INT (- allocate);
14721 scratch_regno = split_stack_prologue_scratch_regno ();
14722 if (scratch_regno == INVALID_REGNUM)
14723 return;
14724 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14725 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14727 /* We don't use ix86_gen_add3 in this case because it will
14728 want to split to lea, but when not optimizing the insn
14729 will not be split after this point. */
14730 emit_insn (gen_rtx_SET (scratch_reg,
14731 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14732 offset)));
14734 else
14736 emit_move_insn (scratch_reg, offset);
14737 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14738 stack_pointer_rtx));
14740 current = scratch_reg;
14743 ix86_expand_branch (GEU, current, limit, label);
14744 jump_insn = get_last_insn ();
14745 JUMP_LABEL (jump_insn) = label;
14747 /* Mark the jump as very likely to be taken. */
14748 add_int_reg_note (jump_insn, REG_BR_PROB,
14749 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
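/* REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100 is 99% of REG_BR_PROB_BASE,
   i.e. the branch to LABEL is predicted taken 99% of the time.  */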
14751 if (split_stack_fn == NULL_RTX)
14753 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14754 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14756 fn = split_stack_fn;
14758 /* Get more stack space. We pass in the desired stack space and the
14759 size of the arguments to copy to the new stack. In 32-bit mode
14760 we push the parameters; __morestack will return on a new stack
14761 anyhow. In 64-bit mode we pass the parameters in r10 and
14762 r11. */
14763 allocate_rtx = GEN_INT (allocate);
14764 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14765 call_fusage = NULL_RTX;
14766 if (TARGET_64BIT)
14768 rtx reg10, reg11;
14770 reg10 = gen_rtx_REG (Pmode, R10_REG);
14771 reg11 = gen_rtx_REG (Pmode, R11_REG);
14773 /* If this function uses a static chain, it will be in %r10.
14774 Preserve it across the call to __morestack. */
14775 if (DECL_STATIC_CHAIN (cfun->decl))
14777 rtx rax;
14779 rax = gen_rtx_REG (word_mode, AX_REG);
14780 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14781 use_reg (&call_fusage, rax);
14784 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14785 && !TARGET_PECOFF)
14787 HOST_WIDE_INT argval;
14789 gcc_assert (Pmode == DImode);
14790 /* When using the large model we need to load the address
14791 into a register, and we've run out of registers. So we
14792 switch to a different calling convention, and we call a
14793 different function: __morestack_large. We pass the
14794 argument size in the upper 32 bits of r10 and pass the
14795 frame size in the lower 32 bits. */
14796 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14797 gcc_assert ((args_size & 0xffffffff) == args_size);
14799 if (split_stack_fn_large == NULL_RTX)
14801 split_stack_fn_large =
14802 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14803 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14805 if (ix86_cmodel == CM_LARGE_PIC)
14807 rtx_code_label *label;
14808 rtx x;
14810 label = gen_label_rtx ();
14811 emit_label (label);
14812 LABEL_PRESERVE_P (label) = 1;
14813 emit_insn (gen_set_rip_rex64 (reg10, label));
14814 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14815 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14816 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14817 UNSPEC_GOT);
14818 x = gen_rtx_CONST (Pmode, x);
14819 emit_move_insn (reg11, x);
14820 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14821 x = gen_const_mem (Pmode, x);
14822 emit_move_insn (reg11, x);
14824 else
14825 emit_move_insn (reg11, split_stack_fn_large);
14827 fn = reg11;
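/* Pack the two values into a single 64-bit immediate as described
   above: ARGS_SIZE goes in the upper 32 bits and ALLOCATE in the lower
   32 bits.  The shift is written as two 16-bit shifts, presumably to
   avoid a shift count equal to the width of a 32-bit HOST_WIDE_INT.  */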
14829 argval = ((args_size << 16) << 16) + allocate;
14830 emit_move_insn (reg10, GEN_INT (argval));
14832 else
14834 emit_move_insn (reg10, allocate_rtx);
14835 emit_move_insn (reg11, GEN_INT (args_size));
14836 use_reg (&call_fusage, reg11);
14839 use_reg (&call_fusage, reg10);
14841 else
14843 emit_insn (gen_push (GEN_INT (args_size)));
14844 emit_insn (gen_push (allocate_rtx));
14846 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14847 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14848 NULL_RTX, false);
14849 add_function_usage_to (call_insn, call_fusage);
14851 /* In order to make call/return prediction work right, we now need
14852 to execute a return instruction. See
14853 libgcc/config/i386/morestack.S for the details on how this works.
14855 For flow purposes gcc must not see this as a return
14856 instruction--we need control flow to continue at the subsequent
14857 label. Therefore, we use an unspec. */
14858 gcc_assert (crtl->args.pops_args < 65536);
14859 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14861 /* If we are in 64-bit mode and this function uses a static chain,
14862 we saved %r10 in %rax before calling __morestack. */
14863 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14864 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14865 gen_rtx_REG (word_mode, AX_REG));
14867 /* If this function calls va_start, we need to store a pointer to
14868 the arguments on the old stack, because they may not have been
14869 all copied to the new stack. At this point the old stack can be
14870 found at the frame pointer value used by __morestack, because
14871 __morestack has set that up before calling back to us. Here we
14872 store that pointer in a scratch register, and in
14873 ix86_expand_prologue we store the scratch register in a stack
14874 slot. */
14875 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14877 unsigned int scratch_regno;
14878 rtx frame_reg;
14879 int words;
14881 scratch_regno = split_stack_prologue_scratch_regno ();
14882 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14883 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14885 /* 64-bit:
14886 fp -> old fp value
14887 return address within this function
14888 return address of caller of this function
14889 stack arguments
14890 So we add three words to get to the stack arguments.
14892 32-bit:
14893 fp -> old fp value
14894 return address within this function
14895 first argument to __morestack
14896 second argument to __morestack
14897 return address of caller of this function
14898 stack arguments
14899 So we add five words to get to the stack arguments.  */
14901 words = TARGET_64BIT ? 3 : 5;
14902 emit_insn (gen_rtx_SET (scratch_reg,
14903 gen_rtx_PLUS (Pmode, frame_reg,
14904 GEN_INT (words * UNITS_PER_WORD))));
14906 varargs_label = gen_label_rtx ();
14907 emit_jump_insn (gen_jump (varargs_label));
14908 JUMP_LABEL (get_last_insn ()) = varargs_label;
14910 emit_barrier ();
14913 emit_label (label);
14914 LABEL_NUSES (label) = 1;
14916 /* If this function calls va_start, we now have to set the scratch
14917 register for the case where we do not call __morestack. In this
14918 case we need to set it based on the stack pointer. */
14919 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14921 emit_insn (gen_rtx_SET (scratch_reg,
14922 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14923 GEN_INT (UNITS_PER_WORD))));
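/* When __morestack was not called, only the return address separates
   the stack pointer from the incoming stack arguments, hence the single
   UNITS_PER_WORD adjustment here.  */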
14925 emit_label (varargs_label);
14926 LABEL_NUSES (varargs_label) = 1;
14930 /* We may have to tell the dataflow pass that the split stack prologue
14931 is initializing a scratch register. */
14933 static void
14934 ix86_live_on_entry (bitmap regs)
14936 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14938 gcc_assert (flag_split_stack);
14939 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14943 /* Extract the parts of an RTL expression that is a valid memory address
14944 for an instruction. Return 0 if the structure of the address is
14945 grossly off. Return -1 if the address contains ASHIFT, so it is not
14946 strictly valid, but is still used for computing the length of an lea instruction.  */
14948 int
14949 ix86_decompose_address (rtx addr, struct ix86_address *out)
14951 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14952 rtx base_reg, index_reg;
14953 HOST_WIDE_INT scale = 1;
14954 rtx scale_rtx = NULL_RTX;
14955 rtx tmp;
14956 int retval = 1;
14957 addr_space_t seg = ADDR_SPACE_GENERIC;
14959 /* Allow zero-extended SImode addresses,
14960 they will be emitted with addr32 prefix. */
14961 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14963 if (GET_CODE (addr) == ZERO_EXTEND
14964 && GET_MODE (XEXP (addr, 0)) == SImode)
14966 addr = XEXP (addr, 0);
14967 if (CONST_INT_P (addr))
14968 return 0;
14970 else if (GET_CODE (addr) == AND
14971 && const_32bit_mask (XEXP (addr, 1), DImode))
14973 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14974 if (addr == NULL_RTX)
14975 return 0;
14977 if (CONST_INT_P (addr))
14978 return 0;
14982 /* Allow SImode subregs of DImode addresses,
14983 they will be emitted with addr32 prefix. */
14984 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14986 if (SUBREG_P (addr)
14987 && GET_MODE (SUBREG_REG (addr)) == DImode)
14989 addr = SUBREG_REG (addr);
14990 if (CONST_INT_P (addr))
14991 return 0;
14995 if (REG_P (addr))
14996 base = addr;
14997 else if (SUBREG_P (addr))
14999 if (REG_P (SUBREG_REG (addr)))
15000 base = addr;
15001 else
15002 return 0;
15004 else if (GET_CODE (addr) == PLUS)
15006 rtx addends[4], op;
15007 int n = 0, i;
15009 op = addr;
15012 if (n >= 4)
15013 return 0;
15014 addends[n++] = XEXP (op, 1);
15015 op = XEXP (op, 0);
15017 while (GET_CODE (op) == PLUS);
15018 if (n >= 4)
15019 return 0;
15020 addends[n] = op;
15022 for (i = n; i >= 0; --i)
15024 op = addends[i];
15025 switch (GET_CODE (op))
15027 case MULT:
15028 if (index)
15029 return 0;
15030 index = XEXP (op, 0);
15031 scale_rtx = XEXP (op, 1);
15032 break;
15034 case ASHIFT:
15035 if (index)
15036 return 0;
15037 index = XEXP (op, 0);
15038 tmp = XEXP (op, 1);
15039 if (!CONST_INT_P (tmp))
15040 return 0;
15041 scale = INTVAL (tmp);
15042 if ((unsigned HOST_WIDE_INT) scale > 3)
15043 return 0;
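/* The shift count 0..3 maps to a multiplier of 1, 2, 4 or 8, the only
   scales the SIB byte can encode.  */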
15044 scale = 1 << scale;
15045 break;
15047 case ZERO_EXTEND:
15048 op = XEXP (op, 0);
15049 if (GET_CODE (op) != UNSPEC)
15050 return 0;
15051 /* FALLTHRU */
15053 case UNSPEC:
15054 if (XINT (op, 1) == UNSPEC_TP
15055 && TARGET_TLS_DIRECT_SEG_REFS
15056 && seg == ADDR_SPACE_GENERIC)
15057 seg = DEFAULT_TLS_SEG_REG;
15058 else
15059 return 0;
15060 break;
15062 case SUBREG:
15063 if (!REG_P (SUBREG_REG (op)))
15064 return 0;
15065 /* FALLTHRU */
15067 case REG:
15068 if (!base)
15069 base = op;
15070 else if (!index)
15071 index = op;
15072 else
15073 return 0;
15074 break;
15076 case CONST:
15077 case CONST_INT:
15078 case SYMBOL_REF:
15079 case LABEL_REF:
15080 if (disp)
15081 return 0;
15082 disp = op;
15083 break;
15085 default:
15086 return 0;
15090 else if (GET_CODE (addr) == MULT)
15092 index = XEXP (addr, 0); /* index*scale */
15093 scale_rtx = XEXP (addr, 1);
15095 else if (GET_CODE (addr) == ASHIFT)
15097 /* We're called for lea too, which implements ashift on occasion. */
15098 index = XEXP (addr, 0);
15099 tmp = XEXP (addr, 1);
15100 if (!CONST_INT_P (tmp))
15101 return 0;
15102 scale = INTVAL (tmp);
15103 if ((unsigned HOST_WIDE_INT) scale > 3)
15104 return 0;
15105 scale = 1 << scale;
15106 retval = -1;
15108 else
15109 disp = addr; /* displacement */
15111 if (index)
15113 if (REG_P (index))
15115 else if (SUBREG_P (index)
15116 && REG_P (SUBREG_REG (index)))
15118 else
15119 return 0;
15122 /* Extract the integral value of scale. */
15123 if (scale_rtx)
15125 if (!CONST_INT_P (scale_rtx))
15126 return 0;
15127 scale = INTVAL (scale_rtx);
15130 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15131 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15133 /* Avoid useless 0 displacement. */
15134 if (disp == const0_rtx && (base || index))
15135 disp = NULL_RTX;
15137 /* Allow arg pointer and stack pointer as index if there is no scaling. */
15138 if (base_reg && index_reg && scale == 1
15139 && (index_reg == arg_pointer_rtx
15140 || index_reg == frame_pointer_rtx
15141 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
15143 std::swap (base, index);
15144 std::swap (base_reg, index_reg);
15147 /* Special case: %ebp cannot be encoded as a base without a displacement.
15148 Similarly %r13. */
15149 if (!disp
15150 && base_reg
15151 && (base_reg == hard_frame_pointer_rtx
15152 || base_reg == frame_pointer_rtx
15153 || base_reg == arg_pointer_rtx
15154 || (REG_P (base_reg)
15155 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
15156 || REGNO (base_reg) == R13_REG))))
15157 disp = const0_rtx;
15159 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
15160 Avoid this by transforming it to [%esi+0].
15161 Reload calls address legitimization without cfun defined, so we need
15162 to test cfun for being non-NULL. */
15163 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15164 && base_reg && !index_reg && !disp
15165 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
15166 disp = const0_rtx;
15168 /* Special case: encode reg+reg instead of reg*2. */
15169 if (!base && index && scale == 2)
15170 base = index, base_reg = index_reg, scale = 1;
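/* E.g. an index-only address scaled by 2, (,%eax,2), becomes (%eax,%eax);
   this avoids the 32-bit displacement the next special case would
   otherwise have to add, since a scaled index cannot be encoded without
   a base or displacement.  */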
15172 /* Special case: scaling cannot be encoded without base or displacement. */
15173 if (!base && !disp && index && scale != 1)
15174 disp = const0_rtx;
15176 out->base = base;
15177 out->index = index;
15178 out->disp = disp;
15179 out->scale = scale;
15180 out->seg = seg;
15182 return retval;
15185 /* Return cost of the memory address x.
15186 For i386, it is better to use a complex address than let gcc copy
15187 the address into a reg and make a new pseudo. But not if the address
15188 requires two regs - that would mean more pseudos with longer
15189 lifetimes. */
15190 static int
15191 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15193 struct ix86_address parts;
15194 int cost = 1;
15195 int ok = ix86_decompose_address (x, &parts);
15197 gcc_assert (ok);
15199 if (parts.base && SUBREG_P (parts.base))
15200 parts.base = SUBREG_REG (parts.base);
15201 if (parts.index && SUBREG_P (parts.index))
15202 parts.index = SUBREG_REG (parts.index);
15204 /* Attempt to minimize number of registers in the address by increasing
15205 address cost for each used register. We don't increase address cost
15206 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
15207 is not invariant itself it most likely means that base or index is not
15208 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15209 which is not profitable for x86. */
15210 if (parts.base
15211 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15212 && (current_pass->type == GIMPLE_PASS
15213 || !pic_offset_table_rtx
15214 || !REG_P (parts.base)
15215 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15216 cost++;
15218 if (parts.index
15219 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15220 && (current_pass->type == GIMPLE_PASS
15221 || !pic_offset_table_rtx
15222 || !REG_P (parts.index)
15223 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15224 cost++;
15226 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15227 since its predecode logic can't detect the length of such instructions
15228 and they degenerate to vector decoded.  Increase the cost of such
15229 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
15230 to split such addresses or even refuse such addresses at all.
15232 Following addressing modes are affected:
15233 [base+scale*index]
15234 [scale*index+disp]
15235 [base+index]
15237 The first and last case may be avoidable by explicitly coding the zero into the
15238 memory address, but I don't have an AMD-K6 machine handy to check this
15239 theory. */
15241 if (TARGET_K6
15242 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15243 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15244 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15245 cost += 10;
15247 return cost;
15250 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15251 this is used to form addresses to local data when -fPIC is in
15252 use. */
15254 static bool
15255 darwin_local_data_pic (rtx disp)
15257 return (GET_CODE (disp) == UNSPEC
15258 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15261 /* True if operand X should be loaded from GOT. */
15263 bool
15264 ix86_force_load_from_GOT_p (rtx x)
15266 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15267 && !TARGET_PECOFF && !TARGET_MACHO
15268 && !flag_plt && !flag_pic
15269 && ix86_cmodel != CM_LARGE
15270 && GET_CODE (x) == SYMBOL_REF
15271 && SYMBOL_REF_FUNCTION_P (x)
15272 && !SYMBOL_REF_LOCAL_P (x));
15275 /* Determine if a given RTX is a valid constant. We already know this
15276 satisfies CONSTANT_P. */
15278 static bool
15279 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15281 /* Pointer bounds constants are not valid. */
15282 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15283 return false;
15285 switch (GET_CODE (x))
15287 case CONST:
15288 x = XEXP (x, 0);
15290 if (GET_CODE (x) == PLUS)
15292 if (!CONST_INT_P (XEXP (x, 1)))
15293 return false;
15294 x = XEXP (x, 0);
15297 if (TARGET_MACHO && darwin_local_data_pic (x))
15298 return true;
15300 /* Only some unspecs are valid as "constants". */
15301 if (GET_CODE (x) == UNSPEC)
15302 switch (XINT (x, 1))
15304 case UNSPEC_GOT:
15305 case UNSPEC_GOTOFF:
15306 case UNSPEC_PLTOFF:
15307 return TARGET_64BIT;
15308 case UNSPEC_TPOFF:
15309 case UNSPEC_NTPOFF:
15310 x = XVECEXP (x, 0, 0);
15311 return (GET_CODE (x) == SYMBOL_REF
15312 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15313 case UNSPEC_DTPOFF:
15314 x = XVECEXP (x, 0, 0);
15315 return (GET_CODE (x) == SYMBOL_REF
15316 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15317 default:
15318 return false;
15321 /* We must have drilled down to a symbol. */
15322 if (GET_CODE (x) == LABEL_REF)
15323 return true;
15324 if (GET_CODE (x) != SYMBOL_REF)
15325 return false;
15326 /* FALLTHRU */
15328 case SYMBOL_REF:
15329 /* TLS symbols are never valid. */
15330 if (SYMBOL_REF_TLS_MODEL (x))
15331 return false;
15333 /* DLLIMPORT symbols are never valid. */
15334 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15335 && SYMBOL_REF_DLLIMPORT_P (x))
15336 return false;
15338 #if TARGET_MACHO
15339 /* mdynamic-no-pic */
15340 if (MACHO_DYNAMIC_NO_PIC_P)
15341 return machopic_symbol_defined_p (x);
15342 #endif
15344 /* External function address should be loaded
15345 via the GOT slot to avoid PLT. */
15346 if (ix86_force_load_from_GOT_p (x))
15347 return false;
15349 break;
15351 CASE_CONST_SCALAR_INT:
15352 switch (mode)
15354 case TImode:
15355 if (TARGET_64BIT)
15356 return true;
15357 /* FALLTHRU */
15358 case OImode:
15359 case XImode:
15360 if (!standard_sse_constant_p (x, mode))
15361 return false;
15362 default:
15363 break;
15365 break;
15367 case CONST_VECTOR:
15368 if (!standard_sse_constant_p (x, mode))
15369 return false;
15371 default:
15372 break;
15375 /* Otherwise we handle everything else in the move patterns. */
15376 return true;
15379 /* Determine if it's legal to put X into the constant pool. This
15380 is not possible for the address of thread-local symbols, which
15381 is checked above. */
15383 static bool
15384 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15386 /* We can put any immediate constant in memory. */
15387 switch (GET_CODE (x))
15389 CASE_CONST_ANY:
15390 return false;
15392 default:
15393 break;
15396 return !ix86_legitimate_constant_p (mode, x);
15399 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15400 otherwise zero. */
15402 static bool
15403 is_imported_p (rtx x)
15405 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15406 || GET_CODE (x) != SYMBOL_REF)
15407 return false;
15409 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15413 /* Nonzero if the constant value X is a legitimate general operand
15414 when generating PIC code. It is given that flag_pic is on and
15415 that X satisfies CONSTANT_P. */
15417 bool
15418 legitimate_pic_operand_p (rtx x)
15420 rtx inner;
15422 switch (GET_CODE (x))
15424 case CONST:
15425 inner = XEXP (x, 0);
15426 if (GET_CODE (inner) == PLUS
15427 && CONST_INT_P (XEXP (inner, 1)))
15428 inner = XEXP (inner, 0);
15430 /* Only some unspecs are valid as "constants". */
15431 if (GET_CODE (inner) == UNSPEC)
15432 switch (XINT (inner, 1))
15434 case UNSPEC_GOT:
15435 case UNSPEC_GOTOFF:
15436 case UNSPEC_PLTOFF:
15437 return TARGET_64BIT;
15438 case UNSPEC_TPOFF:
15439 x = XVECEXP (inner, 0, 0);
15440 return (GET_CODE (x) == SYMBOL_REF
15441 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15442 case UNSPEC_MACHOPIC_OFFSET:
15443 return legitimate_pic_address_disp_p (x);
15444 default:
15445 return false;
15447 /* FALLTHRU */
15449 case SYMBOL_REF:
15450 case LABEL_REF:
15451 return legitimate_pic_address_disp_p (x);
15453 default:
15454 return true;
15458 /* Determine if a given CONST RTX is a valid memory displacement
15459 in PIC mode. */
15461 bool
15462 legitimate_pic_address_disp_p (rtx disp)
15464 bool saw_plus;
15466 /* In 64bit mode we can allow direct addresses of symbols and labels
15467 when they are not dynamic symbols. */
15468 if (TARGET_64BIT)
15470 rtx op0 = disp, op1;
15472 switch (GET_CODE (disp))
15474 case LABEL_REF:
15475 return true;
15477 case CONST:
15478 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15479 break;
15480 op0 = XEXP (XEXP (disp, 0), 0);
15481 op1 = XEXP (XEXP (disp, 0), 1);
15482 if (!CONST_INT_P (op1)
15483 || INTVAL (op1) >= 16*1024*1024
15484 || INTVAL (op1) < -16*1024*1024)
15485 break;
15486 if (GET_CODE (op0) == LABEL_REF)
15487 return true;
15488 if (GET_CODE (op0) == CONST
15489 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15490 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15491 return true;
15492 if (GET_CODE (op0) == UNSPEC
15493 && XINT (op0, 1) == UNSPEC_PCREL)
15494 return true;
15495 if (GET_CODE (op0) != SYMBOL_REF)
15496 break;
15497 /* FALLTHRU */
15499 case SYMBOL_REF:
15500 /* TLS references should always be enclosed in UNSPEC.
15501 A dllimported symbol always needs to be resolved. */
15502 if (SYMBOL_REF_TLS_MODEL (op0)
15503 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15504 return false;
15506 if (TARGET_PECOFF)
15508 if (is_imported_p (op0))
15509 return true;
15511 if (SYMBOL_REF_FAR_ADDR_P (op0)
15512 || !SYMBOL_REF_LOCAL_P (op0))
15513 break;
15515 /* Function symbols need to be resolved only for
15516 the large model.
15517 For the small model we don't need to resolve anything
15518 here. */
15519 if ((ix86_cmodel != CM_LARGE_PIC
15520 && SYMBOL_REF_FUNCTION_P (op0))
15521 || ix86_cmodel == CM_SMALL_PIC)
15522 return true;
15523 /* Non-external symbols don't need to be resolved for
15524 the large and medium models. */
15525 if ((ix86_cmodel == CM_LARGE_PIC
15526 || ix86_cmodel == CM_MEDIUM_PIC)
15527 && !SYMBOL_REF_EXTERNAL_P (op0))
15528 return true;
15530 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15531 && (SYMBOL_REF_LOCAL_P (op0)
15532 || (HAVE_LD_PIE_COPYRELOC
15533 && flag_pie
15534 && !SYMBOL_REF_WEAK (op0)
15535 && !SYMBOL_REF_FUNCTION_P (op0)))
15536 && ix86_cmodel != CM_LARGE_PIC)
15537 return true;
15538 break;
15540 default:
15541 break;
15544 if (GET_CODE (disp) != CONST)
15545 return false;
15546 disp = XEXP (disp, 0);
15548 if (TARGET_64BIT)
15550 /* It is unsafe to allow PLUS expressions.  This limits the allowed distance
15551 of GOT tables.  We should not need these anyway. */
15552 if (GET_CODE (disp) != UNSPEC
15553 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15554 && XINT (disp, 1) != UNSPEC_GOTOFF
15555 && XINT (disp, 1) != UNSPEC_PCREL
15556 && XINT (disp, 1) != UNSPEC_PLTOFF))
15557 return false;
15559 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15560 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15561 return false;
15562 return true;
15565 saw_plus = false;
15566 if (GET_CODE (disp) == PLUS)
15568 if (!CONST_INT_P (XEXP (disp, 1)))
15569 return false;
15570 disp = XEXP (disp, 0);
15571 saw_plus = true;
15574 if (TARGET_MACHO && darwin_local_data_pic (disp))
15575 return true;
15577 if (GET_CODE (disp) != UNSPEC)
15578 return false;
15580 switch (XINT (disp, 1))
15582 case UNSPEC_GOT:
15583 if (saw_plus)
15584 return false;
15585 /* We need to check for both symbols and labels because VxWorks loads
15586 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15587 details. */
15588 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15589 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15590 case UNSPEC_GOTOFF:
15591 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15592 While the ABI also specifies a 32bit relocation, we don't produce it in
15593 the small PIC model at all.  */
15594 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15595 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15596 && !TARGET_64BIT)
15597 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15598 return false;
15599 case UNSPEC_GOTTPOFF:
15600 case UNSPEC_GOTNTPOFF:
15601 case UNSPEC_INDNTPOFF:
15602 if (saw_plus)
15603 return false;
15604 disp = XVECEXP (disp, 0, 0);
15605 return (GET_CODE (disp) == SYMBOL_REF
15606 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15607 case UNSPEC_NTPOFF:
15608 disp = XVECEXP (disp, 0, 0);
15609 return (GET_CODE (disp) == SYMBOL_REF
15610 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15611 case UNSPEC_DTPOFF:
15612 disp = XVECEXP (disp, 0, 0);
15613 return (GET_CODE (disp) == SYMBOL_REF
15614 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15617 return false;
15620 /* Determine if OP is a suitable RTX for an address register.
15621 Return the naked register if a register or a register subreg is
15622 found, otherwise return NULL_RTX. */
15624 static rtx
15625 ix86_validate_address_register (rtx op)
15627 machine_mode mode = GET_MODE (op);
15629 /* Only SImode or DImode registers can form the address. */
15630 if (mode != SImode && mode != DImode)
15631 return NULL_RTX;
15633 if (REG_P (op))
15634 return op;
15635 else if (SUBREG_P (op))
15637 rtx reg = SUBREG_REG (op);
15639 if (!REG_P (reg))
15640 return NULL_RTX;
15642 mode = GET_MODE (reg);
15644 /* Don't allow SUBREGs that span more than a word. It can
15645 lead to spill failures when the register is one word out
15646 of a two word structure. */
15647 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15648 return NULL_RTX;
15650 /* Allow only SUBREGs of non-eliminable hard registers. */
15651 if (register_no_elim_operand (reg, mode))
15652 return reg;
15655 /* Op is not a register. */
15656 return NULL_RTX;
15659 /* Recognizes RTL expressions that are valid memory addresses for an
15660 instruction. The MODE argument is the machine mode for the MEM
15661 expression that wants to use this address.
15663 It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
15664 convert common non-canonical forms to canonical form so that they will
15665 be recognized. */
15667 static bool
15668 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15670 struct ix86_address parts;
15671 rtx base, index, disp;
15672 HOST_WIDE_INT scale;
15673 addr_space_t seg;
15675 if (ix86_decompose_address (addr, &parts) <= 0)
15676 /* Decomposition failed. */
15677 return false;
15679 base = parts.base;
15680 index = parts.index;
15681 disp = parts.disp;
15682 scale = parts.scale;
15683 seg = parts.seg;
15685 /* Validate base register. */
15686 if (base)
15688 rtx reg = ix86_validate_address_register (base);
15690 if (reg == NULL_RTX)
15691 return false;
15693 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15694 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15695 /* Base is not valid. */
15696 return false;
15699 /* Validate index register. */
15700 if (index)
15702 rtx reg = ix86_validate_address_register (index);
15704 if (reg == NULL_RTX)
15705 return false;
15707 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15708 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15709 /* Index is not valid. */
15710 return false;
15713 /* Index and base should have the same mode. */
15714 if (base && index
15715 && GET_MODE (base) != GET_MODE (index))
15716 return false;
15718 /* Address override works only on the (%reg) part of %fs:(%reg). */
15719 if (seg != ADDR_SPACE_GENERIC
15720 && ((base && GET_MODE (base) != word_mode)
15721 || (index && GET_MODE (index) != word_mode)))
15722 return false;
15724 /* Validate scale factor. */
15725 if (scale != 1)
15727 if (!index)
15728 /* Scale without index. */
15729 return false;
15731 if (scale != 2 && scale != 4 && scale != 8)
15732 /* Scale is not a valid multiplier. */
15733 return false;
15736 /* Validate displacement. */
15737 if (disp)
15739 if (GET_CODE (disp) == CONST
15740 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15741 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15742 switch (XINT (XEXP (disp, 0), 1))
15744 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15745 when used.  While the ABI also specifies 32bit relocations, we
15746 don't produce them at all and use IP-relative addressing instead.
15747 Allow GOT in 32bit mode for both PIC and non-PIC if symbol
15748 should be loaded via GOT. */
15749 case UNSPEC_GOT:
15750 if (!TARGET_64BIT
15751 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15752 goto is_legitimate_pic;
15753 /* FALLTHRU */
15754 case UNSPEC_GOTOFF:
15755 gcc_assert (flag_pic);
15756 if (!TARGET_64BIT)
15757 goto is_legitimate_pic;
15759 /* 64bit address unspec. */
15760 return false;
15762 case UNSPEC_GOTPCREL:
15763 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15764 goto is_legitimate_pic;
15765 /* FALLTHRU */
15766 case UNSPEC_PCREL:
15767 gcc_assert (flag_pic);
15768 goto is_legitimate_pic;
15770 case UNSPEC_GOTTPOFF:
15771 case UNSPEC_GOTNTPOFF:
15772 case UNSPEC_INDNTPOFF:
15773 case UNSPEC_NTPOFF:
15774 case UNSPEC_DTPOFF:
15775 break;
15777 case UNSPEC_STACK_CHECK:
15778 gcc_assert (flag_split_stack);
15779 break;
15781 default:
15782 /* Invalid address unspec. */
15783 return false;
15786 else if (SYMBOLIC_CONST (disp)
15787 && (flag_pic
15788 || (TARGET_MACHO
15789 #if TARGET_MACHO
15790 && MACHOPIC_INDIRECT
15791 && !machopic_operand_p (disp)
15792 #endif
15796 is_legitimate_pic:
15797 if (TARGET_64BIT && (index || base))
15799 /* foo@dtpoff(%rX) is ok. */
15800 if (GET_CODE (disp) != CONST
15801 || GET_CODE (XEXP (disp, 0)) != PLUS
15802 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15803 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15804 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15805 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15806 /* Non-constant pic memory reference. */
15807 return false;
15809 else if ((!TARGET_MACHO || flag_pic)
15810 && ! legitimate_pic_address_disp_p (disp))
15811 /* Displacement is an invalid pic construct. */
15812 return false;
15813 #if TARGET_MACHO
15814 else if (MACHO_DYNAMIC_NO_PIC_P
15815 && !ix86_legitimate_constant_p (Pmode, disp))
15816 /* displacement must be referenced via non_lazy_pointer */
15817 return false;
15818 #endif
15820 /* This code used to verify that a symbolic pic displacement
15821 includes the pic_offset_table_rtx register.
15823 While this is a good idea, unfortunately these constructs may
15824 be created by the "adds using lea" optimization for incorrect
15825 code like:
15827 int a;
15828 int foo(int i)
15830 return *(&a+i);
15833 This code is nonsensical, but results in addressing the
15834 GOT table with a pic_offset_table_rtx base.  We can't
15835 just refuse it easily, since it gets matched by the
15836 "addsi3" pattern, which later gets split to lea when the
15837 output register differs from the input.  While this
15838 could be handled by a separate addsi pattern for this case
15839 that never results in lea, disabling this test seems to be
15840 the easier and correct fix for the crash. */
15842 else if (GET_CODE (disp) != LABEL_REF
15843 && !CONST_INT_P (disp)
15844 && (GET_CODE (disp) != CONST
15845 || !ix86_legitimate_constant_p (Pmode, disp))
15846 && (GET_CODE (disp) != SYMBOL_REF
15847 || !ix86_legitimate_constant_p (Pmode, disp)))
15848 /* Displacement is not constant. */
15849 return false;
15850 else if (TARGET_64BIT
15851 && !x86_64_immediate_operand (disp, VOIDmode))
15852 /* Displacement is out of range. */
15853 return false;
15854 /* In x32 mode, constant addresses are sign extended to 64bit, so
15855 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15856 else if (TARGET_X32 && !(index || base)
15857 && CONST_INT_P (disp)
15858 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15859 return false;
15862 /* Everything looks valid. */
15863 return true;
15866 /* Determine if a given RTX is a valid constant address. */
15868 bool
15869 constant_address_p (rtx x)
15871 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15874 /* Return a unique alias set for the GOT. */
15876 static alias_set_type
15877 ix86_GOT_alias_set (void)
15879 static alias_set_type set = -1;
15880 if (set == -1)
15881 set = new_alias_set ();
15882 return set;
15885 /* Return a legitimate reference for ORIG (an address) using the
15886 register REG. If REG is 0, a new pseudo is generated.
15888 There are two types of references that must be handled:
15890 1. Global data references must load the address from the GOT, via
15891 the PIC reg. An insn is emitted to do this load, and the reg is
15892 returned.
15894 2. Static data references, constant pool addresses, and code labels
15895 compute the address as an offset from the GOT, whose base is in
15896 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15897 differentiate them from global data objects. The returned
15898 address is the PIC reg + an unspec constant.
15900 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15901 reg also appears in the address. */
15903 static rtx
15904 legitimize_pic_address (rtx orig, rtx reg)
15906 rtx addr = orig;
15907 rtx new_rtx = orig;
15909 #if TARGET_MACHO
15910 if (TARGET_MACHO && !TARGET_64BIT)
15912 if (reg == 0)
15913 reg = gen_reg_rtx (Pmode);
15914 /* Use the generic Mach-O PIC machinery. */
15915 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15917 #endif
15919 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15921 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15922 if (tmp)
15923 return tmp;
15926 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15927 new_rtx = addr;
15928 else if ((!TARGET_64BIT
15929 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15930 && !TARGET_PECOFF
15931 && gotoff_operand (addr, Pmode))
15933 /* This symbol may be referenced via a displacement
15934 from the PIC base address (@GOTOFF). */
15935 if (GET_CODE (addr) == CONST)
15936 addr = XEXP (addr, 0);
15938 if (GET_CODE (addr) == PLUS)
15940 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15941 UNSPEC_GOTOFF);
15942 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15944 else
15945 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15947 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15949 if (TARGET_64BIT)
15950 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15952 if (reg != 0)
15954 gcc_assert (REG_P (reg));
15955 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15956 new_rtx, reg, 1, OPTAB_DIRECT);
15958 else
15959 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
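/* The result is the PIC register plus sym@GOTOFF, i.e. the object's
   address computed as a displacement from the GOT base.  */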
15961 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15962 /* We can't use @GOTOFF for text labels
15963 on VxWorks, see gotoff_operand. */
15964 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15966 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15967 if (tmp)
15968 return tmp;
15970 /* For x64 PE-COFF there is no GOT table,
15971 so we use the address directly. */
15972 if (TARGET_64BIT && TARGET_PECOFF)
15974 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15975 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15977 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15979 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15980 UNSPEC_GOTPCREL);
15981 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15982 new_rtx = gen_const_mem (Pmode, new_rtx);
15983 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
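/* I.e. the symbol's address is loaded from its GOT slot with a
   sym@GOTPCREL(%rip) memory reference.  */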
15985 else
15987 /* This symbol must be referenced via a load
15988 from the Global Offset Table (@GOT). */
15989 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15990 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15991 if (TARGET_64BIT)
15992 new_rtx = force_reg (Pmode, new_rtx);
15993 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15994 new_rtx = gen_const_mem (Pmode, new_rtx);
15995 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15998 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16000 else
16002 if (CONST_INT_P (addr)
16003 && !x86_64_immediate_operand (addr, VOIDmode))
16004 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16005 else if (GET_CODE (addr) == CONST)
16007 addr = XEXP (addr, 0);
16009 /* We must match stuff we generate before. Assume the only
16010 unspecs that can get here are ours. Not that we could do
16011 anything with them anyway.... */
16012 if (GET_CODE (addr) == UNSPEC
16013 || (GET_CODE (addr) == PLUS
16014 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16015 return orig;
16016 gcc_assert (GET_CODE (addr) == PLUS);
16019 if (GET_CODE (addr) == PLUS)
16021 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16023 /* Check first to see if this is a constant
16024 offset from a @GOTOFF symbol reference. */
16025 if (!TARGET_PECOFF
16026 && gotoff_operand (op0, Pmode)
16027 && CONST_INT_P (op1))
16029 if (!TARGET_64BIT)
16031 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16032 UNSPEC_GOTOFF);
16033 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16034 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16036 if (reg != 0)
16038 gcc_assert (REG_P (reg));
16039 new_rtx = expand_simple_binop (Pmode, PLUS,
16040 pic_offset_table_rtx,
16041 new_rtx, reg, 1,
16042 OPTAB_DIRECT);
16044 else
16045 new_rtx
16046 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16048 else
16050 if (INTVAL (op1) < -16*1024*1024
16051 || INTVAL (op1) >= 16*1024*1024)
16053 if (!x86_64_immediate_operand (op1, Pmode))
16054 op1 = force_reg (Pmode, op1);
16056 new_rtx
16057 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16061 else
16063 rtx base = legitimize_pic_address (op0, reg);
16064 machine_mode mode = GET_MODE (base);
16065 new_rtx
16066 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16068 if (CONST_INT_P (new_rtx))
16070 if (INTVAL (new_rtx) < -16*1024*1024
16071 || INTVAL (new_rtx) >= 16*1024*1024)
16073 if (!x86_64_immediate_operand (new_rtx, mode))
16074 new_rtx = force_reg (mode, new_rtx);
16076 new_rtx
16077 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16079 else
16080 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16082 else
16084 /* For %rip addressing, we have to use
16085 just disp32, not base nor index. */
16086 if (TARGET_64BIT
16087 && (GET_CODE (base) == SYMBOL_REF
16088 || GET_CODE (base) == LABEL_REF))
16089 base = force_reg (mode, base);
16090 if (GET_CODE (new_rtx) == PLUS
16091 && CONSTANT_P (XEXP (new_rtx, 1)))
16093 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16094 new_rtx = XEXP (new_rtx, 1);
16096 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16101 return new_rtx;
16104 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16106 static rtx
16107 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16109 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16111 if (GET_MODE (tp) != tp_mode)
16113 gcc_assert (GET_MODE (tp) == SImode);
16114 gcc_assert (tp_mode == DImode);
16116 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16119 if (to_reg)
16120 tp = copy_to_mode_reg (tp_mode, tp);
16122 return tp;
16125 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16127 static GTY(()) rtx ix86_tls_symbol;
16129 static rtx
16130 ix86_tls_get_addr (void)
16132 if (!ix86_tls_symbol)
16134 const char *sym
16135 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16136 ? "___tls_get_addr" : "__tls_get_addr");
16138 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16141 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16143 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16144 UNSPEC_PLTOFF);
16145 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16146 gen_rtx_CONST (Pmode, unspec));
16149 return ix86_tls_symbol;
16152 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16154 static GTY(()) rtx ix86_tls_module_base_symbol;
16157 ix86_tls_module_base (void)
16159 if (!ix86_tls_module_base_symbol)
16161 ix86_tls_module_base_symbol
16162 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16164 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16165 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16168 return ix86_tls_module_base_symbol;
16171 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16172 false if we expect this to be used for a memory address and true if
16173 we expect to load the address into a register. */
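/* In outline (a summary of the cases below): the global- and local-dynamic
   models call __tls_get_addr (or use the GNU2 TLS descriptor sequence),
   initial-exec loads the symbol's offset from the GOT and adds the thread
   pointer, and local-exec adds a link-time constant offset directly to the
   thread pointer.  */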
16175 static rtx
16176 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16178 rtx dest, base, off;
16179 rtx pic = NULL_RTX, tp = NULL_RTX;
16180 machine_mode tp_mode = Pmode;
16181 int type;
16183 /* Fall back to global dynamic model if tool chain cannot support local
16184 dynamic. */
16185 if (TARGET_SUN_TLS && !TARGET_64BIT
16186 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16187 && model == TLS_MODEL_LOCAL_DYNAMIC)
16188 model = TLS_MODEL_GLOBAL_DYNAMIC;
16190 switch (model)
16192 case TLS_MODEL_GLOBAL_DYNAMIC:
16193 dest = gen_reg_rtx (Pmode);
16195 if (!TARGET_64BIT)
16197 if (flag_pic && !TARGET_PECOFF)
16198 pic = pic_offset_table_rtx;
16199 else
16201 pic = gen_reg_rtx (Pmode);
16202 emit_insn (gen_set_got (pic));
16206 if (TARGET_GNU2_TLS)
16208 if (TARGET_64BIT)
16209 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16210 else
16211 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16213 tp = get_thread_pointer (Pmode, true);
16214 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16216 if (GET_MODE (x) != Pmode)
16217 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16219 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16221 else
16223 rtx caddr = ix86_tls_get_addr ();
16225 if (TARGET_64BIT)
16227 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16228 rtx_insn *insns;
16230 start_sequence ();
16231 emit_call_insn
16232 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16233 insns = get_insns ();
16234 end_sequence ();
16236 if (GET_MODE (x) != Pmode)
16237 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16239 RTL_CONST_CALL_P (insns) = 1;
16240 emit_libcall_block (insns, dest, rax, x);
16242 else
16243 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16245 break;
16247 case TLS_MODEL_LOCAL_DYNAMIC:
16248 base = gen_reg_rtx (Pmode);
16250 if (!TARGET_64BIT)
16252 if (flag_pic)
16253 pic = pic_offset_table_rtx;
16254 else
16256 pic = gen_reg_rtx (Pmode);
16257 emit_insn (gen_set_got (pic));
16261 if (TARGET_GNU2_TLS)
16263 rtx tmp = ix86_tls_module_base ();
16265 if (TARGET_64BIT)
16266 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16267 else
16268 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16270 tp = get_thread_pointer (Pmode, true);
16271 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16272 gen_rtx_MINUS (Pmode, tmp, tp));
16274 else
16276 rtx caddr = ix86_tls_get_addr ();
16278 if (TARGET_64BIT)
16280 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16281 rtx_insn *insns;
16282 rtx eqv;
16284 start_sequence ();
16285 emit_call_insn
16286 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16287 insns = get_insns ();
16288 end_sequence ();
16290 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16291 share the LD_BASE result with other LD model accesses. */
16292 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16293 UNSPEC_TLS_LD_BASE);
16295 RTL_CONST_CALL_P (insns) = 1;
16296 emit_libcall_block (insns, base, rax, eqv);
16298 else
16299 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16302 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16303 off = gen_rtx_CONST (Pmode, off);
16305 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16307 if (TARGET_GNU2_TLS)
16309 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16311 if (GET_MODE (x) != Pmode)
16312 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16314 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16316 break;
16318 case TLS_MODEL_INITIAL_EXEC:
16319 if (TARGET_64BIT)
16321 if (TARGET_SUN_TLS && !TARGET_X32)
16323 /* The Sun linker took the AMD64 TLS spec literally
16324 and can only handle %rax as the destination of the
16325 initial executable code sequence. */
16327 dest = gen_reg_rtx (DImode);
16328 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16329 return dest;
16332 /* Generate DImode references to avoid %fs:(%reg32)
16333 problems and linker IE->LE relaxation bug. */
16334 tp_mode = DImode;
16335 pic = NULL;
16336 type = UNSPEC_GOTNTPOFF;
16338 else if (flag_pic)
16340 pic = pic_offset_table_rtx;
16341 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16343 else if (!TARGET_ANY_GNU_TLS)
16345 pic = gen_reg_rtx (Pmode);
16346 emit_insn (gen_set_got (pic));
16347 type = UNSPEC_GOTTPOFF;
16349 else
16351 pic = NULL;
16352 type = UNSPEC_INDNTPOFF;
16355 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16356 off = gen_rtx_CONST (tp_mode, off);
16357 if (pic)
16358 off = gen_rtx_PLUS (tp_mode, pic, off);
16359 off = gen_const_mem (tp_mode, off);
16360 set_mem_alias_set (off, ix86_GOT_alias_set ());
16362 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16364 base = get_thread_pointer (tp_mode,
16365 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16366 off = force_reg (tp_mode, off);
16367 return gen_rtx_PLUS (tp_mode, base, off);
16369 else
16371 base = get_thread_pointer (Pmode, true);
16372 dest = gen_reg_rtx (Pmode);
16373 emit_insn (ix86_gen_sub3 (dest, base, off));
16375 break;
16377 case TLS_MODEL_LOCAL_EXEC:
16378 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16379 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16380 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16381 off = gen_rtx_CONST (Pmode, off);
16383 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16385 base = get_thread_pointer (Pmode,
16386 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16387 return gen_rtx_PLUS (Pmode, base, off);
16389 else
16391 base = get_thread_pointer (Pmode, true);
16392 dest = gen_reg_rtx (Pmode);
16393 emit_insn (ix86_gen_sub3 (dest, base, off));
16395 break;
16397 default:
16398 gcc_unreachable ();
16401 return dest;
16404 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16405 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16406 unique refptr-DECL symbol corresponding to symbol DECL. */
16408 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16410 static inline hashval_t hash (tree_map *m) { return m->hash; }
16411 static inline bool
16412 equal (tree_map *a, tree_map *b)
16414 return a->base.from == b->base.from;
16417 static int
16418 keep_cache_entry (tree_map *&m)
16420 return ggc_marked_p (m->base.from);
16424 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16426 static tree
16427 get_dllimport_decl (tree decl, bool beimport)
16429 struct tree_map *h, in;
16430 const char *name;
16431 const char *prefix;
16432 size_t namelen, prefixlen;
16433 char *imp_name;
16434 tree to;
16435 rtx rtl;
16437 if (!dllimport_map)
16438 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16440 in.hash = htab_hash_pointer (decl);
16441 in.base.from = decl;
16442 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16443 h = *loc;
16444 if (h)
16445 return h->to;
16447 *loc = h = ggc_alloc<tree_map> ();
16448 h->hash = in.hash;
16449 h->base.from = decl;
16450 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16451 VAR_DECL, NULL, ptr_type_node);
16452 DECL_ARTIFICIAL (to) = 1;
16453 DECL_IGNORED_P (to) = 1;
16454 DECL_EXTERNAL (to) = 1;
16455 TREE_READONLY (to) = 1;
16457 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16458 name = targetm.strip_name_encoding (name);
16459 if (beimport)
16460 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16461 ? "*__imp_" : "*__imp__";
16462 else
16463 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
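/* For example, on a target where user_label_prefix is empty, a dllimport
   reference to a hypothetical symbol foo uses "*__imp_foo" and a refptr
   stub uses "*.refptr.foo"; the leading '*' keeps the assembler-name
   machinery from prefixing the name again.  */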
16464 namelen = strlen (name);
16465 prefixlen = strlen (prefix);
16466 imp_name = (char *) alloca (namelen + prefixlen + 1);
16467 memcpy (imp_name, prefix, prefixlen);
16468 memcpy (imp_name + prefixlen, name, namelen + 1);
16470 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16471 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16472 SET_SYMBOL_REF_DECL (rtl, to);
16473 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16474 if (!beimport)
16476 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16477 #ifdef SUB_TARGET_RECORD_STUB
16478 SUB_TARGET_RECORD_STUB (name);
16479 #endif
16482 rtl = gen_const_mem (Pmode, rtl);
16483 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16485 SET_DECL_RTL (to, rtl);
16486 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16488 return to;
16491 /* Expand SYMBOL into its corresponding far-address symbol.
16492 WANT_REG is true if we require the result be a register. */
16494 static rtx
16495 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16497 tree imp_decl;
16498 rtx x;
16500 gcc_assert (SYMBOL_REF_DECL (symbol));
16501 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16503 x = DECL_RTL (imp_decl);
16504 if (want_reg)
16505 x = force_reg (Pmode, x);
16506 return x;
16509 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16510 true if we require the result be a register. */
16512 static rtx
16513 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16515 tree imp_decl;
16516 rtx x;
16518 gcc_assert (SYMBOL_REF_DECL (symbol));
16519 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16521 x = DECL_RTL (imp_decl);
16522 if (want_reg)
16523 x = force_reg (Pmode, x);
16524 return x;
16527 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
16528 is true if we require the result be a register. */
16530 static rtx
16531 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16533 if (!TARGET_PECOFF)
16534 return NULL_RTX;
16536 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16538 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16539 return legitimize_dllimport_symbol (addr, inreg);
16540 if (GET_CODE (addr) == CONST
16541 && GET_CODE (XEXP (addr, 0)) == PLUS
16542 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16543 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16545 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16546 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16550 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16551 return NULL_RTX;
16552 if (GET_CODE (addr) == SYMBOL_REF
16553 && !is_imported_p (addr)
16554 && SYMBOL_REF_EXTERNAL_P (addr)
16555 && SYMBOL_REF_DECL (addr))
16556 return legitimize_pe_coff_extern_decl (addr, inreg);
16558 if (GET_CODE (addr) == CONST
16559 && GET_CODE (XEXP (addr, 0)) == PLUS
16560 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16561 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16562 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16563 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16565 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16566 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16568 return NULL_RTX;
16571 /* Try machine-dependent ways of modifying an illegitimate address
16572 to be legitimate. If we find one, return the new, valid address.
16573 This macro is used in only one place: `memory_address' in explow.c.
16575 OLDX is the address as it was before break_out_memory_refs was called.
16576 In some cases it is useful to look at this to decide what needs to be done.
16578 It is always safe for this macro to do nothing. It exists to recognize
16579 opportunities to optimize the output.
16581 For the 80386, we handle X+REG by loading X into a register R and
16582 using R+REG. R will go in a general reg and indexing will be used.
16583 However, if REG is a broken-out memory address or multiplication,
16584 nothing needs to be done because REG can certainly go in a general reg.
16586 When -fpic is used, special handling is needed for symbolic references.
16587 See comments by legitimize_pic_address in i386.c for details. */
16589 static rtx
16590 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16592 bool changed = false;
16593 unsigned log;
16595 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16596 if (log)
16597 return legitimize_tls_address (x, (enum tls_model) log, false);
16598 if (GET_CODE (x) == CONST
16599 && GET_CODE (XEXP (x, 0)) == PLUS
16600 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16601 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16603 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16604 (enum tls_model) log, false);
16605 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16608 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16610 rtx tmp = legitimize_pe_coff_symbol (x, true);
16611 if (tmp)
16612 return tmp;
16615 if (flag_pic && SYMBOLIC_CONST (x))
16616 return legitimize_pic_address (x, 0);
16618 #if TARGET_MACHO
16619 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16620 return machopic_indirect_data_reference (x, 0);
16621 #endif
16623 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
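/* e.g. (ashift r 2) is rewritten as (mult r 4), which matches the
   scaled-index form recognized by the address legitimacy checks.  */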
16624 if (GET_CODE (x) == ASHIFT
16625 && CONST_INT_P (XEXP (x, 1))
16626 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16628 changed = true;
16629 log = INTVAL (XEXP (x, 1));
16630 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16631 GEN_INT (1 << log));
16634 if (GET_CODE (x) == PLUS)
16636 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16638 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16639 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16640 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16642 changed = true;
16643 log = INTVAL (XEXP (XEXP (x, 0), 1));
16644 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16645 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16646 GEN_INT (1 << log));
16649 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16650 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16651 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16653 changed = true;
16654 log = INTVAL (XEXP (XEXP (x, 1), 1));
16655 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16656 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16657 GEN_INT (1 << log));
16660 /* Put multiply first if it isn't already. */
16661 if (GET_CODE (XEXP (x, 1)) == MULT)
16663 std::swap (XEXP (x, 0), XEXP (x, 1));
16664 changed = true;
16667 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16668 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16669 created by virtual register instantiation, register elimination, and
16670 similar optimizations. */
16671 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16673 changed = true;
16674 x = gen_rtx_PLUS (Pmode,
16675 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16676 XEXP (XEXP (x, 1), 0)),
16677 XEXP (XEXP (x, 1), 1));
16680 /* Canonicalize
16681 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16682 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16683 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16684 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16685 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16686 && CONSTANT_P (XEXP (x, 1)))
16688 rtx constant;
16689 rtx other = NULL_RTX;
16691 if (CONST_INT_P (XEXP (x, 1)))
16693 constant = XEXP (x, 1);
16694 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16696 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16698 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16699 other = XEXP (x, 1);
16701 else
16702 constant = 0;
16704 if (constant)
16706 changed = true;
16707 x = gen_rtx_PLUS (Pmode,
16708 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16709 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16710 plus_constant (Pmode, other,
16711 INTVAL (constant)));
16715 if (changed && ix86_legitimate_address_p (mode, x, false))
16716 return x;
16718 if (GET_CODE (XEXP (x, 0)) == MULT)
16720 changed = true;
16721 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16724 if (GET_CODE (XEXP (x, 1)) == MULT)
16726 changed = true;
16727 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16730 if (changed
16731 && REG_P (XEXP (x, 1))
16732 && REG_P (XEXP (x, 0)))
16733 return x;
16735 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16737 changed = true;
16738 x = legitimize_pic_address (x, 0);
16741 if (changed && ix86_legitimate_address_p (mode, x, false))
16742 return x;
16744 if (REG_P (XEXP (x, 0)))
16746 rtx temp = gen_reg_rtx (Pmode);
16747 rtx val = force_operand (XEXP (x, 1), temp);
16748 if (val != temp)
16750 val = convert_to_mode (Pmode, val, 1);
16751 emit_move_insn (temp, val);
16754 XEXP (x, 1) = temp;
16755 return x;
16758 else if (REG_P (XEXP (x, 1)))
16760 rtx temp = gen_reg_rtx (Pmode);
16761 rtx val = force_operand (XEXP (x, 0), temp);
16762 if (val != temp)
16764 val = convert_to_mode (Pmode, val, 1);
16765 emit_move_insn (temp, val);
16768 XEXP (x, 0) = temp;
16769 return x;
16773 return x;
16776 /* Print an integer constant expression in assembler syntax. Addition
16777 and subtraction are the only arithmetic that may appear in these
16778 expressions. FILE is the stdio stream to write to, X is the rtx, and
16779 CODE is the operand print code from the output string. */
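/* For instance, (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) is
   printed as "foo@GOTOFF"; the symbol name is illustrative.  */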
16781 static void
16782 output_pic_addr_const (FILE *file, rtx x, int code)
16784 char buf[256];
16786 switch (GET_CODE (x))
16788 case PC:
16789 gcc_assert (flag_pic);
16790 putc ('.', file);
16791 break;
16793 case SYMBOL_REF:
16794 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16795 output_addr_const (file, x);
16796 else
16798 const char *name = XSTR (x, 0);
16800 /* Mark the decl as referenced so that cgraph will
16801 output the function. */
16802 if (SYMBOL_REF_DECL (x))
16803 mark_decl_referenced (SYMBOL_REF_DECL (x));
16805 #if TARGET_MACHO
16806 if (MACHOPIC_INDIRECT
16807 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16808 name = machopic_indirection_name (x, /*stub_p=*/true);
16809 #endif
16810 assemble_name (file, name);
16812 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16813 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16814 fputs ("@PLT", file);
16815 break;
16817 case LABEL_REF:
16818 x = XEXP (x, 0);
16819 /* FALLTHRU */
16820 case CODE_LABEL:
16821 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16822 assemble_name (asm_out_file, buf);
16823 break;
16825 case CONST_INT:
16826 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16827 break;
16829 case CONST:
16830 /* This used to output parentheses around the expression,
16831 but that does not work on the 386 (either ATT or BSD assembler). */
16832 output_pic_addr_const (file, XEXP (x, 0), code);
16833 break;
16835 case CONST_DOUBLE:
16836 /* We can't handle floating point constants;
16837 TARGET_PRINT_OPERAND must handle them. */
16838 output_operand_lossage ("floating constant misused");
16839 break;
16841 case PLUS:
16842 /* Some assemblers need integer constants to appear first. */
16843 if (CONST_INT_P (XEXP (x, 0)))
16845 output_pic_addr_const (file, XEXP (x, 0), code);
16846 putc ('+', file);
16847 output_pic_addr_const (file, XEXP (x, 1), code);
16849 else
16851 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16852 output_pic_addr_const (file, XEXP (x, 1), code);
16853 putc ('+', file);
16854 output_pic_addr_const (file, XEXP (x, 0), code);
16856 break;
16858 case MINUS:
16859 if (!TARGET_MACHO)
16860 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16861 output_pic_addr_const (file, XEXP (x, 0), code);
16862 putc ('-', file);
16863 output_pic_addr_const (file, XEXP (x, 1), code);
16864 if (!TARGET_MACHO)
16865 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16866 break;
16868 case UNSPEC:
16869 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
16871 bool f = i386_asm_output_addr_const_extra (file, x);
16872 gcc_assert (f);
16873 break;
16876 gcc_assert (XVECLEN (x, 0) == 1);
16877 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16878 switch (XINT (x, 1))
16880 case UNSPEC_GOT:
16881 fputs ("@GOT", file);
16882 break;
16883 case UNSPEC_GOTOFF:
16884 fputs ("@GOTOFF", file);
16885 break;
16886 case UNSPEC_PLTOFF:
16887 fputs ("@PLTOFF", file);
16888 break;
16889 case UNSPEC_PCREL:
16890 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16891 "(%rip)" : "[rip]", file);
16892 break;
16893 case UNSPEC_GOTPCREL:
16894 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16895 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16896 break;
16897 case UNSPEC_GOTTPOFF:
16898 /* FIXME: This might be @TPOFF in Sun ld too. */
16899 fputs ("@gottpoff", file);
16900 break;
16901 case UNSPEC_TPOFF:
16902 fputs ("@tpoff", file);
16903 break;
16904 case UNSPEC_NTPOFF:
16905 if (TARGET_64BIT)
16906 fputs ("@tpoff", file);
16907 else
16908 fputs ("@ntpoff", file);
16909 break;
16910 case UNSPEC_DTPOFF:
16911 fputs ("@dtpoff", file);
16912 break;
16913 case UNSPEC_GOTNTPOFF:
16914 if (TARGET_64BIT)
16915 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16916 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16917 else
16918 fputs ("@gotntpoff", file);
16919 break;
16920 case UNSPEC_INDNTPOFF:
16921 fputs ("@indntpoff", file);
16922 break;
16923 #if TARGET_MACHO
16924 case UNSPEC_MACHOPIC_OFFSET:
16925 putc ('-', file);
16926 machopic_output_function_base_name (file);
16927 break;
16928 #endif
16929 default:
16930 output_operand_lossage ("invalid UNSPEC as operand");
16931 break;
16933 break;
16935 default:
16936 output_operand_lossage ("invalid expression as operand");
16940 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16941 We need to emit DTP-relative relocations. */
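/* A sketch of the output, assuming ASM_LONG is "\t.long\t": SIZE == 4 emits
   ".long foo@dtpoff", while SIZE == 8 emits ".long foo@dtpoff, 0", leaving
   the upper half zero.  */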
16943 static void ATTRIBUTE_UNUSED
16944 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16946 fputs (ASM_LONG, file);
16947 output_addr_const (file, x);
16948 fputs ("@dtpoff", file);
16949 switch (size)
16951 case 4:
16952 break;
16953 case 8:
16954 fputs (", 0", file);
16955 break;
16956 default:
16957 gcc_unreachable ();
16961 /* Return true if X is a representation of the PIC register. This copes
16962 with calls from ix86_find_base_term, where the register might have
16963 been replaced by a cselib value. */
16965 static bool
16966 ix86_pic_register_p (rtx x)
16968 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16969 return (pic_offset_table_rtx
16970 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16971 else if (!REG_P (x))
16972 return false;
16973 else if (pic_offset_table_rtx)
16975 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16976 return true;
16977 if (HARD_REGISTER_P (x)
16978 && !HARD_REGISTER_P (pic_offset_table_rtx)
16979 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16980 return true;
16981 return false;
16983 else
16984 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16987 /* Helper function for ix86_delegitimize_address.
16988 Attempt to delegitimize TLS local-exec accesses. */
16990 static rtx
16991 ix86_delegitimize_tls_address (rtx orig_x)
16993 rtx x = orig_x, unspec;
16994 struct ix86_address addr;
16996 if (!TARGET_TLS_DIRECT_SEG_REFS)
16997 return orig_x;
16998 if (MEM_P (x))
16999 x = XEXP (x, 0);
17000 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17001 return orig_x;
17002 if (ix86_decompose_address (x, &addr) == 0
17003 || addr.seg != DEFAULT_TLS_SEG_REG
17004 || addr.disp == NULL_RTX
17005 || GET_CODE (addr.disp) != CONST)
17006 return orig_x;
17007 unspec = XEXP (addr.disp, 0);
17008 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17009 unspec = XEXP (unspec, 0);
17010 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17011 return orig_x;
17012 x = XVECEXP (unspec, 0, 0);
17013 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17014 if (unspec != XEXP (addr.disp, 0))
17015 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17016 if (addr.index)
17018 rtx idx = addr.index;
17019 if (addr.scale != 1)
17020 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17021 x = gen_rtx_PLUS (Pmode, idx, x);
17023 if (addr.base)
17024 x = gen_rtx_PLUS (Pmode, addr.base, x);
17025 if (MEM_P (orig_x))
17026 x = replace_equiv_address_nv (orig_x, x);
17027 return x;
17030 /* In the name of slightly smaller debug output, and to cater to
17031 general assembler lossage, recognize PIC+GOTOFF and turn it back
17032 into a direct symbol reference.
17034 On Darwin, this is necessary to avoid a crash, because Darwin
17035 has a different PIC label for each routine but the DWARF debugging
17036 information is not associated with any particular routine, so it's
17037 necessary to remove references to the PIC label from RTL stored by
17038 the DWARF output code. */
17040 static rtx
17041 ix86_delegitimize_address (rtx x)
17043 rtx orig_x = delegitimize_mem_from_attrs (x);
17044 /* addend is NULL or some rtx if x is something+GOTOFF where
17045 something doesn't include the PIC register. */
17046 rtx addend = NULL_RTX;
17047 /* reg_addend is NULL or a multiple of some register. */
17048 rtx reg_addend = NULL_RTX;
17049 /* const_addend is NULL or a const_int. */
17050 rtx const_addend = NULL_RTX;
17051 /* This is the result, or NULL. */
17052 rtx result = NULL_RTX;
17054 x = orig_x;
17056 if (MEM_P (x))
17057 x = XEXP (x, 0);
17059 if (TARGET_64BIT)
17061 if (GET_CODE (x) == CONST
17062 && GET_CODE (XEXP (x, 0)) == PLUS
17063 && GET_MODE (XEXP (x, 0)) == Pmode
17064 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17065 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17066 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17068 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17069 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17070 if (MEM_P (orig_x))
17071 x = replace_equiv_address_nv (orig_x, x);
17072 return x;
17075 if (GET_CODE (x) == CONST
17076 && GET_CODE (XEXP (x, 0)) == UNSPEC
17077 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17078 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17079 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17081 x = XVECEXP (XEXP (x, 0), 0, 0);
17082 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17084 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17085 if (x == NULL_RTX)
17086 return orig_x;
17088 return x;
17091 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17092 return ix86_delegitimize_tls_address (orig_x);
17094 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17095 and -mcmodel=medium -fpic. */
17098 if (GET_CODE (x) != PLUS
17099 || GET_CODE (XEXP (x, 1)) != CONST)
17100 return ix86_delegitimize_tls_address (orig_x);
17102 if (ix86_pic_register_p (XEXP (x, 0)))
17103 /* %ebx + GOT/GOTOFF */
17105 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17107 /* %ebx + %reg * scale + GOT/GOTOFF */
17108 reg_addend = XEXP (x, 0);
17109 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17110 reg_addend = XEXP (reg_addend, 1);
17111 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17112 reg_addend = XEXP (reg_addend, 0);
17113 else
17115 reg_addend = NULL_RTX;
17116 addend = XEXP (x, 0);
17119 else
17120 addend = XEXP (x, 0);
17122 x = XEXP (XEXP (x, 1), 0);
17123 if (GET_CODE (x) == PLUS
17124 && CONST_INT_P (XEXP (x, 1)))
17126 const_addend = XEXP (x, 1);
17127 x = XEXP (x, 0);
17130 if (GET_CODE (x) == UNSPEC
17131 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17132 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17133 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17134 && !MEM_P (orig_x) && !addend)))
17135 result = XVECEXP (x, 0, 0);
17137 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17138 && !MEM_P (orig_x))
17139 result = XVECEXP (x, 0, 0);
17141 if (! result)
17142 return ix86_delegitimize_tls_address (orig_x);
17144 if (const_addend)
17145 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17146 if (reg_addend)
17147 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17148 if (addend)
17150 /* If the rest of original X doesn't involve the PIC register, add
17151 addend and subtract pic_offset_table_rtx. This can happen e.g.
17152 for code like:
17153 leal (%ebx, %ecx, 4), %ecx
17155 movl foo@GOTOFF(%ecx), %edx
17156 in which case we return (%ecx - %ebx) + foo
17157 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17158 and reload has completed. */
17159 if (pic_offset_table_rtx
17160 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17161 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17162 pic_offset_table_rtx),
17163 result);
17164 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
17166 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17167 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17168 result = gen_rtx_PLUS (Pmode, tmp, result);
17170 else
17171 return orig_x;
17173 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17175 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17176 if (result == NULL_RTX)
17177 return orig_x;
17179 return result;
17182 /* If X is a machine specific address (i.e. a symbol or label being
17183 referenced as a displacement from the GOT implemented using an
17184 UNSPEC), then return the base term. Otherwise return X. */
17187 ix86_find_base_term (rtx x)
17189 rtx term;
17191 if (TARGET_64BIT)
17193 if (GET_CODE (x) != CONST)
17194 return x;
17195 term = XEXP (x, 0);
17196 if (GET_CODE (term) == PLUS
17197 && CONST_INT_P (XEXP (term, 1)))
17198 term = XEXP (term, 0);
17199 if (GET_CODE (term) != UNSPEC
17200 || (XINT (term, 1) != UNSPEC_GOTPCREL
17201 && XINT (term, 1) != UNSPEC_PCREL))
17202 return x;
17204 return XVECEXP (term, 0, 0);
17207 return ix86_delegitimize_address (x);
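/* Print to FILE the one- or two-letter condition suffix (e.g. "e", "g", "b")
   for comparison CODE evaluated in flags mode MODE.  If REVERSE, print the
   suffix for the inverted condition.  FP selects the alternate spellings
   needed for fcmov-style floating-point comparisons (e.g. "nbe" instead of
   "a").  */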
17210 static void
17211 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17212 bool fp, FILE *file)
17214 const char *suffix;
17216 if (mode == CCFPmode || mode == CCFPUmode)
17218 code = ix86_fp_compare_code_to_integer (code);
17219 mode = CCmode;
17221 if (reverse)
17222 code = reverse_condition (code);
17224 switch (code)
17226 case EQ:
17227 switch (mode)
17229 case CCAmode:
17230 suffix = "a";
17231 break;
17232 case CCCmode:
17233 suffix = "c";
17234 break;
17235 case CCOmode:
17236 suffix = "o";
17237 break;
17238 case CCPmode:
17239 suffix = "p";
17240 break;
17241 case CCSmode:
17242 suffix = "s";
17243 break;
17244 default:
17245 suffix = "e";
17246 break;
17248 break;
17249 case NE:
17250 switch (mode)
17252 case CCAmode:
17253 suffix = "na";
17254 break;
17255 case CCCmode:
17256 suffix = "nc";
17257 break;
17258 case CCOmode:
17259 suffix = "no";
17260 break;
17261 case CCPmode:
17262 suffix = "np";
17263 break;
17264 case CCSmode:
17265 suffix = "ns";
17266 break;
17267 default:
17268 suffix = "ne";
17269 break;
17271 break;
17272 case GT:
17273 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17274 suffix = "g";
17275 break;
17276 case GTU:
17277 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17278 Those same assemblers have the same but opposite lossage on cmov. */
17279 if (mode == CCmode)
17280 suffix = fp ? "nbe" : "a";
17281 else
17282 gcc_unreachable ();
17283 break;
17284 case LT:
17285 switch (mode)
17287 case CCNOmode:
17288 case CCGOCmode:
17289 suffix = "s";
17290 break;
17292 case CCmode:
17293 case CCGCmode:
17294 suffix = "l";
17295 break;
17297 default:
17298 gcc_unreachable ();
17300 break;
17301 case LTU:
17302 if (mode == CCmode)
17303 suffix = "b";
17304 else if (mode == CCCmode)
17305 suffix = fp ? "b" : "c";
17306 else
17307 gcc_unreachable ();
17308 break;
17309 case GE:
17310 switch (mode)
17312 case CCNOmode:
17313 case CCGOCmode:
17314 suffix = "ns";
17315 break;
17317 case CCmode:
17318 case CCGCmode:
17319 suffix = "ge";
17320 break;
17322 default:
17323 gcc_unreachable ();
17325 break;
17326 case GEU:
17327 if (mode == CCmode)
17328 suffix = "nb";
17329 else if (mode == CCCmode)
17330 suffix = fp ? "nb" : "nc";
17331 else
17332 gcc_unreachable ();
17333 break;
17334 case LE:
17335 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17336 suffix = "le";
17337 break;
17338 case LEU:
17339 if (mode == CCmode)
17340 suffix = "be";
17341 else
17342 gcc_unreachable ();
17343 break;
17344 case UNORDERED:
17345 suffix = fp ? "u" : "p";
17346 break;
17347 case ORDERED:
17348 suffix = fp ? "nu" : "np";
17349 break;
17350 default:
17351 gcc_unreachable ();
17353 fputs (suffix, file);
17356 /* Print the name of register X to FILE based on its machine mode and number.
17357 If CODE is 'w', pretend the mode is HImode.
17358 If CODE is 'b', pretend the mode is QImode.
17359 If CODE is 'k', pretend the mode is SImode.
17360 If CODE is 'q', pretend the mode is DImode.
17361 If CODE is 'x', pretend the mode is V4SFmode.
17362 If CODE is 't', pretend the mode is V8SFmode.
17363 If CODE is 'g', pretend the mode is V16SFmode.
17364 If CODE is 'h', pretend the reg is the 'high' byte register.
17365 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
17366 If CODE is 'd', duplicate the operand for an AVX instruction.
17369 void
17370 print_reg (rtx x, int code, FILE *file)
17372 const char *reg;
17373 int msize;
17374 unsigned int regno;
17375 bool duplicated;
17377 if (ASSEMBLER_DIALECT == ASM_ATT)
17378 putc ('%', file);
17380 if (x == pc_rtx)
17382 gcc_assert (TARGET_64BIT);
17383 fputs ("rip", file);
17384 return;
17387 if (code == 'y' && STACK_TOP_P (x))
17389 fputs ("st(0)", file);
17390 return;
17393 if (code == 'w')
17394 msize = 2;
17395 else if (code == 'b')
17396 msize = 1;
17397 else if (code == 'k')
17398 msize = 4;
17399 else if (code == 'q')
17400 msize = 8;
17401 else if (code == 'h')
17402 msize = 0;
17403 else if (code == 'x')
17404 msize = 16;
17405 else if (code == 't')
17406 msize = 32;
17407 else if (code == 'g')
17408 msize = 64;
17409 else
17410 msize = GET_MODE_SIZE (GET_MODE (x));
17412 regno = true_regnum (x);
17414 gcc_assert (regno != ARG_POINTER_REGNUM
17415 && regno != FRAME_POINTER_REGNUM
17416 && regno != FPSR_REG
17417 && regno != FPCR_REG);
17419 if (regno == FLAGS_REG)
17421 output_operand_lossage ("invalid use of asm flag output");
17422 return;
17425 duplicated = code == 'd' && TARGET_AVX;
17427 switch (msize)
17429 case 8:
17430 case 4:
17431 if (LEGACY_INT_REGNO_P (regno))
17432 putc (msize == 8 && TARGET_64BIT ? 'r' : 'e', file);
17433 /* FALLTHRU */
17434 case 16:
17435 case 12:
17436 case 2:
17437 normal:
17438 reg = hi_reg_name[regno];
17439 break;
17440 case 1:
17441 if (regno >= ARRAY_SIZE (qi_reg_name))
17442 goto normal;
17443 reg = qi_reg_name[regno];
17444 break;
17445 case 0:
17446 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17447 goto normal;
17448 reg = qi_high_reg_name[regno];
17449 break;
17450 case 32:
17451 case 64:
17452 if (SSE_REGNO_P (regno))
17454 gcc_assert (!duplicated);
17455 putc (msize == 32 ? 'y' : 'z', file);
17456 reg = hi_reg_name[regno] + 1;
17457 break;
17459 goto normal;
17460 default:
17461 gcc_unreachable ();
17464 fputs (reg, file);
17466 /* Irritatingly, AMD extended registers use a
17467 different naming convention: "r%d[bwd]". */
17468 if (REX_INT_REGNO_P (regno))
17470 gcc_assert (TARGET_64BIT);
17471 switch (msize)
17473 case 0:
17474 error ("extended registers have no high halves");
17475 break;
17476 case 1:
17477 putc ('b', file);
17478 break;
17479 case 2:
17480 putc ('w', file);
17481 break;
17482 case 4:
17483 putc ('d', file);
17484 break;
17485 case 8:
17486 /* no suffix */
17487 break;
17488 default:
17489 error ("unsupported operand size for extended register");
17490 break;
17492 return;
17495 if (duplicated)
17497 if (ASSEMBLER_DIALECT == ASM_ATT)
17498 fprintf (file, ", %%%s", reg);
17499 else
17500 fprintf (file, ", %s", reg);
17504 /* Meaning of CODE:
17505 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17506 C -- print opcode suffix for set/cmov insn.
17507 c -- like C, but print reversed condition
17508 F,f -- likewise, but for floating-point.
17509 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17510 otherwise nothing
17511 R -- print embedded rounding and sae.
17512 r -- print only sae.
17513 z -- print the opcode suffix for the size of the current operand.
17514 Z -- likewise, with special suffixes for x87 instructions.
17515 * -- print a star (in certain assembler syntax)
17516 A -- print an absolute memory reference.
17517 E -- print address with DImode register names if TARGET_64BIT.
17518 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17519 s -- print a shift double count, followed by the assembler's argument
17520 delimiter.
17521 b -- print the QImode name of the register for the indicated operand.
17522 %b0 would print %al if operands[0] is reg 0.
17523 w -- likewise, print the HImode name of the register.
17524 k -- likewise, print the SImode name of the register.
17525 q -- likewise, print the DImode name of the register.
17526 x -- likewise, print the V4SFmode name of the register.
17527 t -- likewise, print the V8SFmode name of the register.
17528 g -- likewise, print the V16SFmode name of the register.
17529 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17530 y -- print "st(0)" instead of "st" as a register.
17531 d -- print duplicated register operand for AVX instruction.
17532 D -- print condition for SSE cmp instruction.
17533 P -- if PIC, print an @PLT suffix.
17534 p -- print raw symbol name.
17535 X -- don't print any sort of PIC '@' suffix for a symbol.
17536 & -- print some in-use local-dynamic symbol name.
17537 H -- print a memory address offset by 8; used for sse high-parts
17538 Y -- print condition for XOP pcom* instruction.
17539 + -- print a branch hint as 'cs' or 'ds' prefix
17540 ; -- print a semicolon (after prefixes due to bug in older gas).
17541 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17542 @ -- print a segment register of thread base pointer load
17543 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17544 ! -- print MPX prefix for jxx/call/ret instructions if required.
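   For example, in an output template such as "add%z0\t{%1, %0|%0, %1}",
   "%z0" expands to the size suffix implied by operands[0]'s mode and the
   {att|intel} braces select the operand order for the active assembler
   dialect.  */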
17547 void
17548 ix86_print_operand (FILE *file, rtx x, int code)
17550 if (code)
17552 switch (code)
17554 case 'A':
17555 switch (ASSEMBLER_DIALECT)
17557 case ASM_ATT:
17558 putc ('*', file);
17559 break;
17561 case ASM_INTEL:
17562 /* Intel syntax. For absolute addresses, registers should not
17563 be surrounded by brackets. */
17564 if (!REG_P (x))
17566 putc ('[', file);
17567 ix86_print_operand (file, x, 0);
17568 putc (']', file);
17569 return;
17571 break;
17573 default:
17574 gcc_unreachable ();
17577 ix86_print_operand (file, x, 0);
17578 return;
17580 case 'E':
17581 /* Wrap address in an UNSPEC to declare special handling. */
17582 if (TARGET_64BIT)
17583 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17585 output_address (VOIDmode, x);
17586 return;
17588 case 'L':
17589 if (ASSEMBLER_DIALECT == ASM_ATT)
17590 putc ('l', file);
17591 return;
17593 case 'W':
17594 if (ASSEMBLER_DIALECT == ASM_ATT)
17595 putc ('w', file);
17596 return;
17598 case 'B':
17599 if (ASSEMBLER_DIALECT == ASM_ATT)
17600 putc ('b', file);
17601 return;
17603 case 'Q':
17604 if (ASSEMBLER_DIALECT == ASM_ATT)
17605 putc ('l', file);
17606 return;
17608 case 'S':
17609 if (ASSEMBLER_DIALECT == ASM_ATT)
17610 putc ('s', file);
17611 return;
17613 case 'T':
17614 if (ASSEMBLER_DIALECT == ASM_ATT)
17615 putc ('t', file);
17616 return;
17618 case 'O':
17619 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17620 if (ASSEMBLER_DIALECT != ASM_ATT)
17621 return;
17623 switch (GET_MODE_SIZE (GET_MODE (x)))
17625 case 2:
17626 putc ('w', file);
17627 break;
17629 case 4:
17630 putc ('l', file);
17631 break;
17633 case 8:
17634 putc ('q', file);
17635 break;
17637 default:
17638 output_operand_lossage
17639 ("invalid operand size for operand code 'O'");
17640 return;
17643 putc ('.', file);
17644 #endif
17645 return;
17647 case 'z':
17648 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17650 /* Opcodes don't get size suffixes if using Intel opcodes. */
17651 if (ASSEMBLER_DIALECT == ASM_INTEL)
17652 return;
17654 switch (GET_MODE_SIZE (GET_MODE (x)))
17656 case 1:
17657 putc ('b', file);
17658 return;
17660 case 2:
17661 putc ('w', file);
17662 return;
17664 case 4:
17665 putc ('l', file);
17666 return;
17668 case 8:
17669 putc ('q', file);
17670 return;
17672 default:
17673 output_operand_lossage
17674 ("invalid operand size for operand code 'z'");
17675 return;
17679 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17680 warning
17681 (0, "non-integer operand used with operand code 'z'");
17682 /* FALLTHRU */
17684 case 'Z':
17685 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17686 if (ASSEMBLER_DIALECT == ASM_INTEL)
17687 return;
17689 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17691 switch (GET_MODE_SIZE (GET_MODE (x)))
17693 case 2:
17694 #ifdef HAVE_AS_IX86_FILDS
17695 putc ('s', file);
17696 #endif
17697 return;
17699 case 4:
17700 putc ('l', file);
17701 return;
17703 case 8:
17704 #ifdef HAVE_AS_IX86_FILDQ
17705 putc ('q', file);
17706 #else
17707 fputs ("ll", file);
17708 #endif
17709 return;
17711 default:
17712 break;
17715 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17717 /* 387 opcodes don't get size suffixes
17718 if the operands are registers. */
17719 if (STACK_REG_P (x))
17720 return;
17722 switch (GET_MODE_SIZE (GET_MODE (x)))
17724 case 4:
17725 putc ('s', file);
17726 return;
17728 case 8:
17729 putc ('l', file);
17730 return;
17732 case 12:
17733 case 16:
17734 putc ('t', file);
17735 return;
17737 default:
17738 break;
17741 else
17743 output_operand_lossage
17744 ("invalid operand type used with operand code 'Z'");
17745 return;
17748 output_operand_lossage
17749 ("invalid operand size for operand code 'Z'");
17750 return;
17752 case 'd':
17753 case 'b':
17754 case 'w':
17755 case 'k':
17756 case 'q':
17757 case 'h':
17758 case 't':
17759 case 'g':
17760 case 'y':
17761 case 'x':
17762 case 'X':
17763 case 'P':
17764 case 'p':
17765 break;
17767 case 's':
17768 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17770 ix86_print_operand (file, x, 0);
17771 fputs (", ", file);
17773 return;
17775 case 'Y':
17776 switch (GET_CODE (x))
17778 case NE:
17779 fputs ("neq", file);
17780 break;
17781 case EQ:
17782 fputs ("eq", file);
17783 break;
17784 case GE:
17785 case GEU:
17786 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17787 break;
17788 case GT:
17789 case GTU:
17790 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17791 break;
17792 case LE:
17793 case LEU:
17794 fputs ("le", file);
17795 break;
17796 case LT:
17797 case LTU:
17798 fputs ("lt", file);
17799 break;
17800 case UNORDERED:
17801 fputs ("unord", file);
17802 break;
17803 case ORDERED:
17804 fputs ("ord", file);
17805 break;
17806 case UNEQ:
17807 fputs ("ueq", file);
17808 break;
17809 case UNGE:
17810 fputs ("nlt", file);
17811 break;
17812 case UNGT:
17813 fputs ("nle", file);
17814 break;
17815 case UNLE:
17816 fputs ("ule", file);
17817 break;
17818 case UNLT:
17819 fputs ("ult", file);
17820 break;
17821 case LTGT:
17822 fputs ("une", file);
17823 break;
17824 default:
17825 output_operand_lossage ("operand is not a condition code, "
17826 "invalid operand code 'Y'");
17827 return;
17829 return;
17831 case 'D':
17832 /* Little bit of braindamage here. The SSE compare instructions
17833 use completely different names for the comparisons than the
17834 fp conditional moves do. */
17835 switch (GET_CODE (x))
17837 case UNEQ:
17838 if (TARGET_AVX)
17840 fputs ("eq_us", file);
17841 break;
17843 /* FALLTHRU */
17844 case EQ:
17845 fputs ("eq", file);
17846 break;
17847 case UNLT:
17848 if (TARGET_AVX)
17850 fputs ("nge", file);
17851 break;
17853 /* FALLTHRU */
17854 case LT:
17855 fputs ("lt", file);
17856 break;
17857 case UNLE:
17858 if (TARGET_AVX)
17860 fputs ("ngt", file);
17861 break;
17863 /* FALLTHRU */
17864 case LE:
17865 fputs ("le", file);
17866 break;
17867 case UNORDERED:
17868 fputs ("unord", file);
17869 break;
17870 case LTGT:
17871 if (TARGET_AVX)
17873 fputs ("neq_oq", file);
17874 break;
17876 /* FALLTHRU */
17877 case NE:
17878 fputs ("neq", file);
17879 break;
17880 case GE:
17881 if (TARGET_AVX)
17883 fputs ("ge", file);
17884 break;
17886 /* FALLTHRU */
17887 case UNGE:
17888 fputs ("nlt", file);
17889 break;
17890 case GT:
17891 if (TARGET_AVX)
17893 fputs ("gt", file);
17894 break;
17896 /* FALLTHRU */
17897 case UNGT:
17898 fputs ("nle", file);
17899 break;
17900 case ORDERED:
17901 fputs ("ord", file);
17902 break;
17903 default:
17904 output_operand_lossage ("operand is not a condition code, "
17905 "invalid operand code 'D'");
17906 return;
17908 return;
17910 case 'F':
17911 case 'f':
17912 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17913 if (ASSEMBLER_DIALECT == ASM_ATT)
17914 putc ('.', file);
17915 gcc_fallthrough ();
17916 #endif
17918 case 'C':
17919 case 'c':
17920 if (!COMPARISON_P (x))
17922 output_operand_lossage ("operand is not a condition code, "
17923 "invalid operand code '%c'", code);
17924 return;
17926 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17927 code == 'c' || code == 'f',
17928 code == 'F' || code == 'f',
17929 file);
17930 return;
17932 case 'H':
17933 if (!offsettable_memref_p (x))
17935 output_operand_lossage ("operand is not an offsettable memory "
17936 "reference, invalid operand code 'H'");
17937 return;
17939 /* It doesn't actually matter what mode we use here, as we're
17940 only going to use this for printing. */
17941 x = adjust_address_nv (x, DImode, 8);
17942 /* Output 'qword ptr' for intel assembler dialect. */
17943 if (ASSEMBLER_DIALECT == ASM_INTEL)
17944 code = 'q';
17945 break;
17947 case 'K':
17948 gcc_assert (CONST_INT_P (x));
17950 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17951 #ifdef HAVE_AS_IX86_HLE
17952 fputs ("xacquire ", file);
17953 #else
17954 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17955 #endif
17956 else if (INTVAL (x) & IX86_HLE_RELEASE)
17957 #ifdef HAVE_AS_IX86_HLE
17958 fputs ("xrelease ", file);
17959 #else
17960 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17961 #endif
17962 /* We do not want to print the value of the operand. */
17963 return;
17965 case 'N':
17966 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17967 fputs ("{z}", file);
17968 return;
17970 case 'r':
17971 gcc_assert (CONST_INT_P (x));
17972 gcc_assert (INTVAL (x) == ROUND_SAE);
17974 if (ASSEMBLER_DIALECT == ASM_INTEL)
17975 fputs (", ", file);
17977 fputs ("{sae}", file);
17979 if (ASSEMBLER_DIALECT == ASM_ATT)
17980 fputs (", ", file);
17982 return;
17984 case 'R':
17985 gcc_assert (CONST_INT_P (x));
17987 if (ASSEMBLER_DIALECT == ASM_INTEL)
17988 fputs (", ", file);
17990 switch (INTVAL (x))
17992 case ROUND_NEAREST_INT | ROUND_SAE:
17993 fputs ("{rn-sae}", file);
17994 break;
17995 case ROUND_NEG_INF | ROUND_SAE:
17996 fputs ("{rd-sae}", file);
17997 break;
17998 case ROUND_POS_INF | ROUND_SAE:
17999 fputs ("{ru-sae}", file);
18000 break;
18001 case ROUND_ZERO | ROUND_SAE:
18002 fputs ("{rz-sae}", file);
18003 break;
18004 default:
18005 gcc_unreachable ();
18008 if (ASSEMBLER_DIALECT == ASM_ATT)
18009 fputs (", ", file);
18011 return;
18013 case '*':
18014 if (ASSEMBLER_DIALECT == ASM_ATT)
18015 putc ('*', file);
18016 return;
18018 case '&':
18020 const char *name = get_some_local_dynamic_name ();
18021 if (name == NULL)
18022 output_operand_lossage ("'%%&' used without any "
18023 "local dynamic TLS references");
18024 else
18025 assemble_name (file, name);
18026 return;
18029 case '+':
18031 rtx x;
18033 if (!optimize
18034 || optimize_function_for_size_p (cfun)
18035 || !TARGET_BRANCH_PREDICTION_HINTS)
18036 return;
18038 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18039 if (x)
18041 int pred_val = XINT (x, 0);
18043 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18044 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18046 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18047 bool cputaken
18048 = final_forward_branch_p (current_output_insn) == 0;
18050 /* Emit hints only when the default branch prediction
18051 heuristics would fail. */
18052 if (taken != cputaken)
18054 /* We use 3e (DS) prefix for taken branches and
18055 2e (CS) prefix for not taken branches. */
18056 if (taken)
18057 fputs ("ds ; ", file);
18058 else
18059 fputs ("cs ; ", file);
18063 return;
18066 case ';':
18067 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18068 putc (';', file);
18069 #endif
18070 return;
18072 case '@':
18073 if (ASSEMBLER_DIALECT == ASM_ATT)
18074 putc ('%', file);
18076 /* The kernel uses a different segment register for performance
18077 reasons; a system call would not have to trash the userspace
18078 segment register, which would be expensive. */
18079 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
18080 fputs ("fs", file);
18081 else
18082 fputs ("gs", file);
18083 return;
18085 case '~':
18086 putc (TARGET_AVX2 ? 'i' : 'f', file);
18087 return;
18089 case '^':
18090 if (TARGET_64BIT && Pmode != word_mode)
18091 fputs ("addr32 ", file);
18092 return;
18094 case '!':
18095 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18096 fputs ("bnd ", file);
18097 return;
18099 default:
18100 output_operand_lossage ("invalid operand code '%c'", code);
18104 if (REG_P (x))
18105 print_reg (x, code, file);
18107 else if (MEM_P (x))
18109 rtx addr = XEXP (x, 0);
18111 /* No `byte ptr' prefix for call instructions ... */
18112 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18114 machine_mode mode = GET_MODE (x);
18115 const char *size;
18117 /* Check for explicit size override codes. */
18118 if (code == 'b')
18119 size = "BYTE";
18120 else if (code == 'w')
18121 size = "WORD";
18122 else if (code == 'k')
18123 size = "DWORD";
18124 else if (code == 'q')
18125 size = "QWORD";
18126 else if (code == 'x')
18127 size = "XMMWORD";
18128 else if (code == 't')
18129 size = "YMMWORD";
18130 else if (code == 'g')
18131 size = "ZMMWORD";
18132 else if (mode == BLKmode)
18133 /* ... or BLKmode operands, when not overridden. */
18134 size = NULL;
18135 else
18136 switch (GET_MODE_SIZE (mode))
18138 case 1: size = "BYTE"; break;
18139 case 2: size = "WORD"; break;
18140 case 4: size = "DWORD"; break;
18141 case 8: size = "QWORD"; break;
18142 case 12: size = "TBYTE"; break;
18143 case 16:
18144 if (mode == XFmode)
18145 size = "TBYTE";
18146 else
18147 size = "XMMWORD";
18148 break;
18149 case 32: size = "YMMWORD"; break;
18150 case 64: size = "ZMMWORD"; break;
18151 default:
18152 gcc_unreachable ();
18154 if (size)
18156 fputs (size, file);
18157 fputs (" PTR ", file);
18161 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18162 output_operand_lossage ("invalid constraints for operand");
18163 else
18164 ix86_print_operand_address_as
18165 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18168 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18170 long l;
18172 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18174 if (ASSEMBLER_DIALECT == ASM_ATT)
18175 putc ('$', file);
18176 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18177 if (code == 'q')
18178 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18179 (unsigned long long) (int) l);
18180 else
18181 fprintf (file, "0x%08x", (unsigned int) l);
18184 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18186 long l[2];
18188 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18190 if (ASSEMBLER_DIALECT == ASM_ATT)
18191 putc ('$', file);
18192 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18195 /* These float cases don't actually occur as immediate operands. */
18196 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18198 char dstr[30];
18200 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18201 fputs (dstr, file);
18204 else
18206 /* We have patterns that allow zero sets of memory, for instance.
18207 In 64-bit mode, we should probably support all 8-byte vectors,
18208 since we can in fact encode that into an immediate. */
18209 if (GET_CODE (x) == CONST_VECTOR)
18211 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18212 x = const0_rtx;
18215 if (code != 'P' && code != 'p')
18217 if (CONST_INT_P (x))
18219 if (ASSEMBLER_DIALECT == ASM_ATT)
18220 putc ('$', file);
18222 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18223 || GET_CODE (x) == LABEL_REF)
18225 if (ASSEMBLER_DIALECT == ASM_ATT)
18226 putc ('$', file);
18227 else
18228 fputs ("OFFSET FLAT:", file);
18231 if (CONST_INT_P (x))
18232 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18233 else if (flag_pic || MACHOPIC_INDIRECT)
18234 output_pic_addr_const (file, x, code);
18235 else
18236 output_addr_const (file, x);
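/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P: return true if CODE is one
   of the punctuation characters ix86_print_operand handles without a
   matching operand.  */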
18240 static bool
18241 ix86_print_operand_punct_valid_p (unsigned char code)
18243 return (code == '@' || code == '*' || code == '+' || code == '&'
18244 || code == ';' || code == '~' || code == '^' || code == '!');
18247 /* Print to FILE a memory operand whose address is ADDR in address space AS; NO_RIP suppresses the %rip-relative form of the address. */
18249 static void
18250 ix86_print_operand_address_as (FILE *file, rtx addr,
18251 addr_space_t as, bool no_rip)
18253 struct ix86_address parts;
18254 rtx base, index, disp;
18255 int scale;
18256 int ok;
18257 bool vsib = false;
18258 int code = 0;
18260 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18262 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18263 gcc_assert (parts.index == NULL_RTX);
18264 parts.index = XVECEXP (addr, 0, 1);
18265 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18266 addr = XVECEXP (addr, 0, 0);
18267 vsib = true;
18269 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18271 gcc_assert (TARGET_64BIT);
18272 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18273 code = 'q';
18275 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18277 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18278 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18279 if (parts.base != NULL_RTX)
18281 parts.index = parts.base;
18282 parts.scale = 1;
18284 parts.base = XVECEXP (addr, 0, 0);
18285 addr = XVECEXP (addr, 0, 0);
18287 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18289 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18290 gcc_assert (parts.index == NULL_RTX);
18291 parts.index = XVECEXP (addr, 0, 1);
18292 addr = XVECEXP (addr, 0, 0);
18294 else
18295 ok = ix86_decompose_address (addr, &parts);
18297 gcc_assert (ok);
18299 base = parts.base;
18300 index = parts.index;
18301 disp = parts.disp;
18302 scale = parts.scale;
18304 if (ADDR_SPACE_GENERIC_P (as))
18305 as = parts.seg;
18306 else
18307 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18309 if (!ADDR_SPACE_GENERIC_P (as))
18311 const char *string;
18313 if (as == ADDR_SPACE_SEG_FS)
18314 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18315 else if (as == ADDR_SPACE_SEG_GS)
18316 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18317 else
18318 gcc_unreachable ();
18319 fputs (string, file);
18322 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18323 if (TARGET_64BIT && !base && !index && !no_rip)
18325 rtx symbol = disp;
18327 if (GET_CODE (disp) == CONST
18328 && GET_CODE (XEXP (disp, 0)) == PLUS
18329 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18330 symbol = XEXP (XEXP (disp, 0), 0);
18332 if (GET_CODE (symbol) == LABEL_REF
18333 || (GET_CODE (symbol) == SYMBOL_REF
18334 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18335 base = pc_rtx;
18338 if (!base && !index)
18340 /* Displacement only requires special attention. */
18341 if (CONST_INT_P (disp))
18343 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == ADDR_SPACE_GENERIC)
18344 fputs ("ds:", file);
18345 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18347 /* Load the external function address via the GOT slot to avoid PLT. */
18348 else if (GET_CODE (disp) == CONST
18349 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18350 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18351 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18352 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18353 output_pic_addr_const (file, disp, 0);
18354 else if (flag_pic)
18355 output_pic_addr_const (file, disp, 0);
18356 else
18357 output_addr_const (file, disp);
18359 else
18361 /* Print SImode register names to force addr32 prefix. */
18362 if (SImode_address_operand (addr, VOIDmode))
18364 if (flag_checking)
18366 gcc_assert (TARGET_64BIT);
18367 switch (GET_CODE (addr))
18369 case SUBREG:
18370 gcc_assert (GET_MODE (addr) == SImode);
18371 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18372 break;
18373 case ZERO_EXTEND:
18374 case AND:
18375 gcc_assert (GET_MODE (addr) == DImode);
18376 break;
18377 default:
18378 gcc_unreachable ();
18381 gcc_assert (!code);
18382 code = 'k';
18384 else if (code == 0
18385 && TARGET_X32
18386 && disp
18387 && CONST_INT_P (disp)
18388 && INTVAL (disp) < -16*1024*1024)
18390 /* X32 runs in 64-bit mode, where displacement, DISP, in
18391 address DISP(%r64), is encoded as 32-bit immediate sign-
18392 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18393 address is %r64 + 0xffffffffbffffd00. When %r64 <
18394 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18395 which is invalid for x32. The correct address is %r64
18396 - 0x40000300 == 0xf7ffdd64. To properly encode
18397 -0x40000300(%r64) for x32, we zero-extend negative
18398 displacement by forcing addr32 prefix which truncates
18399 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18400 zero-extend all negative displacements, including -1(%rsp).
18401 However, for small negative displacements, sign-extension
18402 won't cause overflow. We only zero-extend negative
18403 displacements if they are < -16*1024*1024, which is also used
18404 to check legitimate address displacements for PIC. */
18405 code = 'k';
18408 if (ASSEMBLER_DIALECT == ASM_ATT)
18410 if (disp)
18412 if (flag_pic)
18413 output_pic_addr_const (file, disp, 0);
18414 else if (GET_CODE (disp) == LABEL_REF)
18415 output_asm_label (disp);
18416 else
18417 output_addr_const (file, disp);
18420 putc ('(', file);
18421 if (base)
18422 print_reg (base, code, file);
18423 if (index)
18425 putc (',', file);
18426 print_reg (index, vsib ? 0 : code, file);
18427 if (scale != 1 || vsib)
18428 fprintf (file, ",%d", scale);
18430 putc (')', file);
18432 else
18434 rtx offset = NULL_RTX;
18436 if (disp)
18438 /* Pull out the offset of a symbol; print any symbol itself. */
18439 if (GET_CODE (disp) == CONST
18440 && GET_CODE (XEXP (disp, 0)) == PLUS
18441 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18443 offset = XEXP (XEXP (disp, 0), 1);
18444 disp = gen_rtx_CONST (VOIDmode,
18445 XEXP (XEXP (disp, 0), 0));
18448 if (flag_pic)
18449 output_pic_addr_const (file, disp, 0);
18450 else if (GET_CODE (disp) == LABEL_REF)
18451 output_asm_label (disp);
18452 else if (CONST_INT_P (disp))
18453 offset = disp;
18454 else
18455 output_addr_const (file, disp);
18458 putc ('[', file);
18459 if (base)
18461 print_reg (base, code, file);
18462 if (offset)
18464 if (INTVAL (offset) >= 0)
18465 putc ('+', file);
18466 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18469 else if (offset)
18470 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18471 else
18472 putc ('0', file);
18474 if (index)
18476 putc ('+', file);
18477 print_reg (index, vsib ? 0 : code, file);
18478 if (scale != 1 || vsib)
18479 fprintf (file, "*%d", scale);
18481 putc (']', file);
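/* Illustrative example (not part of the code above): with base %rax,
   index %rbx, scale 4 and displacement 8, the AT&T branch prints
   "8(%rax,%rbx,4)" while the Intel branch prints "[rax+8+rbx*4]";
   a constant-only address comes out as "ds:8" in Intel syntax per the
   displacement-only case handled earlier.  */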
18486 static void
18487 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18489 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18492 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18494 static bool
18495 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18497 rtx op;
18499 if (GET_CODE (x) != UNSPEC)
18500 return false;
18502 op = XVECEXP (x, 0, 0);
18503 switch (XINT (x, 1))
18505 case UNSPEC_GOTTPOFF:
18506 output_addr_const (file, op);
18507 /* FIXME: This might be @TPOFF in Sun ld. */
18508 fputs ("@gottpoff", file);
18509 break;
18510 case UNSPEC_TPOFF:
18511 output_addr_const (file, op);
18512 fputs ("@tpoff", file);
18513 break;
18514 case UNSPEC_NTPOFF:
18515 output_addr_const (file, op);
18516 if (TARGET_64BIT)
18517 fputs ("@tpoff", file);
18518 else
18519 fputs ("@ntpoff", file);
18520 break;
18521 case UNSPEC_DTPOFF:
18522 output_addr_const (file, op);
18523 fputs ("@dtpoff", file);
18524 break;
18525 case UNSPEC_GOTNTPOFF:
18526 output_addr_const (file, op);
18527 if (TARGET_64BIT)
18528 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18529 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18530 else
18531 fputs ("@gotntpoff", file);
18532 break;
18533 case UNSPEC_INDNTPOFF:
18534 output_addr_const (file, op);
18535 fputs ("@indntpoff", file);
18536 break;
18537 #if TARGET_MACHO
18538 case UNSPEC_MACHOPIC_OFFSET:
18539 output_addr_const (file, op);
18540 putc ('-', file);
18541 machopic_output_function_base_name (file);
18542 break;
18543 #endif
18545 case UNSPEC_STACK_CHECK:
18547 int offset;
18549 gcc_assert (flag_split_stack);
18551 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
18552 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
18553 #else
18554 gcc_unreachable ();
18555 #endif
18557 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
18559 break;
18561 default:
18562 return false;
18565 return true;
18568 /* Split one or more double-mode RTL references into pairs of half-mode
18569 references. The RTL can be REG, offsettable MEM, integer constant, or
18570 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18571 split and "num" is its length. lo_half and hi_half are output arrays
18572 that parallel "operands". */
18574 void
18575 split_double_mode (machine_mode mode, rtx operands[],
18576 int num, rtx lo_half[], rtx hi_half[])
18578 machine_mode half_mode;
18579 unsigned int byte;
18581 switch (mode)
18583 case TImode:
18584 half_mode = DImode;
18585 break;
18586 case DImode:
18587 half_mode = SImode;
18588 break;
18589 default:
18590 gcc_unreachable ();
18593 byte = GET_MODE_SIZE (half_mode);
18595 while (num--)
18597 rtx op = operands[num];
18599 /* simplify_subreg refuses to split volatile memory addresses,
18600 but we still have to handle them. */
18601 if (MEM_P (op))
18603 lo_half[num] = adjust_address (op, half_mode, 0);
18604 hi_half[num] = adjust_address (op, half_mode, byte);
18606 else
18608 lo_half[num] = simplify_gen_subreg (half_mode, op,
18609 GET_MODE (op) == VOIDmode
18610 ? mode : GET_MODE (op), 0);
18611 hi_half[num] = simplify_gen_subreg (half_mode, op,
18612 GET_MODE (op) == VOIDmode
18613 ? mode : GET_MODE (op), byte);
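/* A minimal usage sketch (hypothetical variable names, for illustration
   only):

     rtx lo[1], hi[1];
     split_double_mode (TImode, &op, 1, lo, hi);

   leaves the DImode low half of OP in lo[0] and its high half in hi[0];
   MEM operands are split with adjust_address, everything else with
   simplify_gen_subreg, as done above.  */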
18618 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18619 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18620 is the expression of the binary operation. The output may either be
18621 emitted here, or returned to the caller, like all output_* functions.
18623 There is no guarantee that the operands are the same mode, as they
18624 might be within FLOAT or FLOAT_EXTEND expressions. */
18626 #ifndef SYSV386_COMPAT
18627 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18628 wants to fix the assemblers because that causes incompatibility
18629 with gcc. No-one wants to fix gcc because that causes
18630 incompatibility with assemblers... You can use the option of
18631 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18632 #define SYSV386_COMPAT 1
18633 #endif
18635 const char *
18636 output_387_binary_op (rtx insn, rtx *operands)
18638 static char buf[40];
18639 const char *p;
18640 const char *ssep;
18641 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
18643 /* Even if we do not want to check the inputs, this documents the input
18644 constraints, which helps in understanding the following code. */
18645 if (flag_checking)
18647 if (STACK_REG_P (operands[0])
18648 && ((REG_P (operands[1])
18649 && REGNO (operands[0]) == REGNO (operands[1])
18650 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18651 || (REG_P (operands[2])
18652 && REGNO (operands[0]) == REGNO (operands[2])
18653 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18654 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18655 ; /* ok */
18656 else
18657 gcc_assert (is_sse);
18660 switch (GET_CODE (operands[3]))
18662 case PLUS:
18663 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18664 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18665 p = "fiadd";
18666 else
18667 p = "fadd";
18668 ssep = "vadd";
18669 break;
18671 case MINUS:
18672 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18673 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18674 p = "fisub";
18675 else
18676 p = "fsub";
18677 ssep = "vsub";
18678 break;
18680 case MULT:
18681 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18682 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18683 p = "fimul";
18684 else
18685 p = "fmul";
18686 ssep = "vmul";
18687 break;
18689 case DIV:
18690 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18691 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18692 p = "fidiv";
18693 else
18694 p = "fdiv";
18695 ssep = "vdiv";
18696 break;
18698 default:
18699 gcc_unreachable ();
18702 if (is_sse)
18704 if (TARGET_AVX)
18706 strcpy (buf, ssep);
18707 if (GET_MODE (operands[0]) == SFmode)
18708 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
18709 else
18710 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
18712 else
18714 strcpy (buf, ssep + 1);
18715 if (GET_MODE (operands[0]) == SFmode)
18716 strcat (buf, "ss\t{%2, %0|%0, %2}");
18717 else
18718 strcat (buf, "sd\t{%2, %0|%0, %2}");
18720 return buf;
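/* For illustration: with AVX enabled, an SFmode PLUS takes the branch
   above and yields the template "vaddss\t{%2, %1, %0|%0, %1, %2}",
   while the non-AVX branch drops the leading 'v' and produces the
   two-operand form "addss\t{%2, %0|%0, %2}".  */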
18722 strcpy (buf, p);
18724 switch (GET_CODE (operands[3]))
18726 case MULT:
18727 case PLUS:
18728 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18729 std::swap (operands[1], operands[2]);
18731 /* We now know operands[0] == operands[1]. */
18733 if (MEM_P (operands[2]))
18735 p = "%Z2\t%2";
18736 break;
18739 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18741 if (STACK_TOP_P (operands[0]))
18742 /* How is it that we are storing to a dead operand[2]?
18743 Well, presumably operands[1] is dead too. We can't
18744 store the result to st(0) as st(0) gets popped on this
18745 instruction. Instead store to operands[2] (which I
18746 think has to be st(1)). st(1) will be popped later.
18747 gcc <= 2.8.1 didn't have this check and generated
18748 assembly code that the Unixware assembler rejected. */
18749 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18750 else
18751 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18752 break;
18755 if (STACK_TOP_P (operands[0]))
18756 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18757 else
18758 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18759 break;
18761 case MINUS:
18762 case DIV:
18763 if (MEM_P (operands[1]))
18765 p = "r%Z1\t%1";
18766 break;
18769 if (MEM_P (operands[2]))
18771 p = "%Z2\t%2";
18772 break;
18775 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18777 #if SYSV386_COMPAT
18778 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18779 derived assemblers, confusingly reverse the direction of
18780 the operation for fsub{r} and fdiv{r} when the
18781 destination register is not st(0). The Intel assembler
18782 doesn't have this brain damage. Read !SYSV386_COMPAT to
18783 figure out what the hardware really does. */
18784 if (STACK_TOP_P (operands[0]))
18785 p = "{p\t%0, %2|rp\t%2, %0}";
18786 else
18787 p = "{rp\t%2, %0|p\t%0, %2}";
18788 #else
18789 if (STACK_TOP_P (operands[0]))
18790 /* As above for fmul/fadd, we can't store to st(0). */
18791 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18792 else
18793 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18794 #endif
18795 break;
18798 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18800 #if SYSV386_COMPAT
18801 if (STACK_TOP_P (operands[0]))
18802 p = "{rp\t%0, %1|p\t%1, %0}";
18803 else
18804 p = "{p\t%1, %0|rp\t%0, %1}";
18805 #else
18806 if (STACK_TOP_P (operands[0]))
18807 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18808 else
18809 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18810 #endif
18811 break;
18814 if (STACK_TOP_P (operands[0]))
18816 if (STACK_TOP_P (operands[1]))
18817 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18818 else
18819 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18820 break;
18822 else if (STACK_TOP_P (operands[1]))
18824 #if SYSV386_COMPAT
18825 p = "{\t%1, %0|r\t%0, %1}";
18826 #else
18827 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18828 #endif
18830 else
18832 #if SYSV386_COMPAT
18833 p = "{r\t%2, %0|\t%0, %2}";
18834 #else
18835 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18836 #endif
18838 break;
18840 default:
18841 gcc_unreachable ();
18844 strcat (buf, p);
18845 return buf;
18848 /* Return needed mode for entity in optimize_mode_switching pass. */
18850 static int
18851 ix86_dirflag_mode_needed (rtx_insn *insn)
18853 if (CALL_P (insn))
18855 if (cfun->machine->func_type == TYPE_NORMAL)
18856 return X86_DIRFLAG_ANY;
18857 else
18858 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18859 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18862 if (recog_memoized (insn) < 0)
18863 return X86_DIRFLAG_ANY;
18865 if (get_attr_type (insn) == TYPE_STR)
18867 /* Emit cld instruction if stringops are used in the function. */
18868 if (cfun->machine->func_type == TYPE_NORMAL)
18869 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18870 else
18871 return X86_DIRFLAG_RESET;
18874 return X86_DIRFLAG_ANY;
18877 /* Check if a 256bit AVX register is referenced inside of EXP. */
18879 static bool
18880 ix86_check_avx256_register (const_rtx exp)
18882 if (SUBREG_P (exp))
18883 exp = SUBREG_REG (exp);
18885 return (REG_P (exp)
18886 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
18889 /* Return needed mode for entity in optimize_mode_switching pass. */
18891 static int
18892 ix86_avx_u128_mode_needed (rtx_insn *insn)
18894 if (CALL_P (insn))
18896 rtx link;
18898 /* Needed mode is set to AVX_U128_CLEAN if there are
18899 no 256bit modes used in function arguments. */
18900 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18901 link;
18902 link = XEXP (link, 1))
18904 if (GET_CODE (XEXP (link, 0)) == USE)
18906 rtx arg = XEXP (XEXP (link, 0), 0);
18908 if (ix86_check_avx256_register (arg))
18909 return AVX_U128_DIRTY;
18913 return AVX_U128_CLEAN;
18916 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
18917 changes state only when a 256bit register is written to, but we need
18918 to prevent the compiler from moving optimal insertion point above
18919 eventual read from 256bit register. */
18920 subrtx_iterator::array_type array;
18921 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18922 if (ix86_check_avx256_register (*iter))
18923 return AVX_U128_DIRTY;
18925 return AVX_U128_ANY;
18928 /* Return mode that i387 must be switched into
18929 prior to the execution of insn. */
18931 static int
18932 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18934 enum attr_i387_cw mode;
18936 /* The mode UNINITIALIZED is used to store the control word after a
18937 function call or ASM pattern. The mode ANY specifies that the function
18938 has no requirements on the control word and makes no changes in the
18939 bits we are interested in. */
18941 if (CALL_P (insn)
18942 || (NONJUMP_INSN_P (insn)
18943 && (asm_noperands (PATTERN (insn)) >= 0
18944 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18945 return I387_CW_UNINITIALIZED;
18947 if (recog_memoized (insn) < 0)
18948 return I387_CW_ANY;
18950 mode = get_attr_i387_cw (insn);
18952 switch (entity)
18954 case I387_TRUNC:
18955 if (mode == I387_CW_TRUNC)
18956 return mode;
18957 break;
18959 case I387_FLOOR:
18960 if (mode == I387_CW_FLOOR)
18961 return mode;
18962 break;
18964 case I387_CEIL:
18965 if (mode == I387_CW_CEIL)
18966 return mode;
18967 break;
18969 case I387_MASK_PM:
18970 if (mode == I387_CW_MASK_PM)
18971 return mode;
18972 break;
18974 default:
18975 gcc_unreachable ();
18978 return I387_CW_ANY;
18981 /* Return mode that entity must be switched into
18982 prior to the execution of insn. */
18984 static int
18985 ix86_mode_needed (int entity, rtx_insn *insn)
18987 switch (entity)
18989 case X86_DIRFLAG:
18990 return ix86_dirflag_mode_needed (insn);
18991 case AVX_U128:
18992 return ix86_avx_u128_mode_needed (insn);
18993 case I387_TRUNC:
18994 case I387_FLOOR:
18995 case I387_CEIL:
18996 case I387_MASK_PM:
18997 return ix86_i387_mode_needed (entity, insn);
18998 default:
18999 gcc_unreachable ();
19001 return 0;
19004 /* Check if a 256bit AVX register is referenced in stores. */
19006 static void
19007 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
19009 if (ix86_check_avx256_register (dest))
19011 bool *used = (bool *) data;
19012 *used = true;
19016 /* Calculate mode of upper 128bit AVX registers after the insn. */
19018 static int
19019 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19021 rtx pat = PATTERN (insn);
19023 if (vzeroupper_operation (pat, VOIDmode)
19024 || vzeroall_operation (pat, VOIDmode))
19025 return AVX_U128_CLEAN;
19027 /* We know that state is clean after CALL insn if there are no
19028 256bit registers used in the function return register. */
19029 if (CALL_P (insn))
19031 bool avx_reg256_found = false;
19032 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
19034 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19037 /* Otherwise, return current mode. Remember that if insn
19038 references AVX 256bit registers, the mode was already changed
19039 to DIRTY from MODE_NEEDED. */
19040 return mode;
19043 /* Return the mode that an insn results in. */
19045 static int
19046 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19048 switch (entity)
19050 case X86_DIRFLAG:
19051 return mode;
19052 case AVX_U128:
19053 return ix86_avx_u128_mode_after (mode, insn);
19054 case I387_TRUNC:
19055 case I387_FLOOR:
19056 case I387_CEIL:
19057 case I387_MASK_PM:
19058 return mode;
19059 default:
19060 gcc_unreachable ();
19064 static int
19065 ix86_dirflag_mode_entry (void)
19067 /* For TARGET_CLD or in the interrupt handler we can't assume
19068 direction flag state at function entry. */
19069 if (TARGET_CLD
19070 || cfun->machine->func_type != TYPE_NORMAL)
19071 return X86_DIRFLAG_ANY;
19073 return X86_DIRFLAG_RESET;
19076 static int
19077 ix86_avx_u128_mode_entry (void)
19079 tree arg;
19081 /* Entry mode is set to AVX_U128_DIRTY if there are
19082 256bit modes used in function arguments. */
19083 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19084 arg = TREE_CHAIN (arg))
19086 rtx incoming = DECL_INCOMING_RTL (arg);
19088 if (incoming && ix86_check_avx256_register (incoming))
19089 return AVX_U128_DIRTY;
19092 return AVX_U128_CLEAN;
19095 /* Return a mode that ENTITY is assumed to be
19096 switched to at function entry. */
19098 static int
19099 ix86_mode_entry (int entity)
19101 switch (entity)
19103 case X86_DIRFLAG:
19104 return ix86_dirflag_mode_entry ();
19105 case AVX_U128:
19106 return ix86_avx_u128_mode_entry ();
19107 case I387_TRUNC:
19108 case I387_FLOOR:
19109 case I387_CEIL:
19110 case I387_MASK_PM:
19111 return I387_CW_ANY;
19112 default:
19113 gcc_unreachable ();
19117 static int
19118 ix86_avx_u128_mode_exit (void)
19120 rtx reg = crtl->return_rtx;
19122 /* Exit mode is set to AVX_U128_DIRTY if there are
19123 256bit modes used in the function return register. */
19124 if (reg && ix86_check_avx256_register (reg))
19125 return AVX_U128_DIRTY;
19127 return AVX_U128_CLEAN;
19130 /* Return a mode that ENTITY is assumed to be
19131 switched to at function exit. */
19133 static int
19134 ix86_mode_exit (int entity)
19136 switch (entity)
19138 case X86_DIRFLAG:
19139 return X86_DIRFLAG_ANY;
19140 case AVX_U128:
19141 return ix86_avx_u128_mode_exit ();
19142 case I387_TRUNC:
19143 case I387_FLOOR:
19144 case I387_CEIL:
19145 case I387_MASK_PM:
19146 return I387_CW_ANY;
19147 default:
19148 gcc_unreachable ();
19152 static int
19153 ix86_mode_priority (int, int n)
19155 return n;
19158 /* Output code to initialize control word copies used by trunc?f?i and
19159 rounding patterns. CURRENT_MODE is set to current control word,
19160 while NEW_MODE is set to new control word. */
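/* For reference: the x87 control word keeps its rounding-control field
   in bits 10-11 (00 = to nearest, 01 = down, 10 = up, 11 = toward zero)
   and the precision-exception mask in bit 5, which is what the 0x0c00,
   0x0400, 0x0800 and 0x0020 constants below select.  */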
19162 static void
19163 emit_i387_cw_initialization (int mode)
19165 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19166 rtx new_mode;
19168 enum ix86_stack_slot slot;
19170 rtx reg = gen_reg_rtx (HImode);
19172 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19173 emit_move_insn (reg, copy_rtx (stored_mode));
19175 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19176 || optimize_insn_for_size_p ())
19178 switch (mode)
19180 case I387_CW_TRUNC:
19181 /* round toward zero (truncate) */
19182 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19183 slot = SLOT_CW_TRUNC;
19184 break;
19186 case I387_CW_FLOOR:
19187 /* round down toward -oo */
19188 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19189 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19190 slot = SLOT_CW_FLOOR;
19191 break;
19193 case I387_CW_CEIL:
19194 /* round up toward +oo */
19195 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19196 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19197 slot = SLOT_CW_CEIL;
19198 break;
19200 case I387_CW_MASK_PM:
19201 /* mask precision exception for nearbyint() */
19202 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19203 slot = SLOT_CW_MASK_PM;
19204 break;
19206 default:
19207 gcc_unreachable ();
19210 else
19212 switch (mode)
19214 case I387_CW_TRUNC:
19215 /* round toward zero (truncate) */
19216 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19217 slot = SLOT_CW_TRUNC;
19218 break;
19220 case I387_CW_FLOOR:
19221 /* round down toward -oo */
19222 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19223 slot = SLOT_CW_FLOOR;
19224 break;
19226 case I387_CW_CEIL:
19227 /* round up toward +oo */
19228 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19229 slot = SLOT_CW_CEIL;
19230 break;
19232 case I387_CW_MASK_PM:
19233 /* mask precision exception for nearbyint() */
19234 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19235 slot = SLOT_CW_MASK_PM;
19236 break;
19238 default:
19239 gcc_unreachable ();
19243 gcc_assert (slot < MAX_386_STACK_LOCALS);
19245 new_mode = assign_386_stack_local (HImode, slot);
19246 emit_move_insn (new_mode, reg);
19249 /* Emit vzeroupper. */
19251 void
19252 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19254 int i;
19256 /* Cancel automatic vzeroupper insertion if there are
19257 live call-saved SSE registers at the insertion point. */
19259 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19260 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19261 return;
19263 if (TARGET_64BIT)
19264 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19265 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19266 return;
19268 emit_insn (gen_avx_vzeroupper ());
19273 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19274 is the set of hard registers live at the point where the insn(s)
19275 are to be inserted. */
19277 static void
19278 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19279 HARD_REG_SET regs_live)
19281 switch (entity)
19283 case X86_DIRFLAG:
19284 if (mode == X86_DIRFLAG_RESET)
19285 emit_insn (gen_cld ());
19286 break;
19287 case AVX_U128:
19288 if (mode == AVX_U128_CLEAN)
19289 ix86_avx_emit_vzeroupper (regs_live);
19290 break;
19291 case I387_TRUNC:
19292 case I387_FLOOR:
19293 case I387_CEIL:
19294 case I387_MASK_PM:
19295 if (mode != I387_CW_ANY
19296 && mode != I387_CW_UNINITIALIZED)
19297 emit_i387_cw_initialization (mode);
19298 break;
19299 default:
19300 gcc_unreachable ();
19304 /* Output code for INSN to convert a float to a signed int. OPERANDS
19305 are the insn operands. The output may be [HSD]Imode and the input
19306 operand may be [SDX]Fmode. */
19308 const char *
19309 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19311 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19312 int dimode_p = GET_MODE (operands[0]) == DImode;
19313 int round_mode = get_attr_i387_cw (insn);
19315 /* Jump through a hoop or two for DImode, since the hardware has no
19316 non-popping instruction. We used to do this a different way, but
19317 that was somewhat fragile and broke with post-reload splitters. */
19318 if ((dimode_p || fisttp) && !stack_top_dies)
19319 output_asm_insn ("fld\t%y1", operands);
19321 gcc_assert (STACK_TOP_P (operands[1]));
19322 gcc_assert (MEM_P (operands[0]));
19323 gcc_assert (GET_MODE (operands[1]) != TFmode);
19325 if (fisttp)
19326 output_asm_insn ("fisttp%Z0\t%0", operands);
19327 else
19329 if (round_mode != I387_CW_ANY)
19330 output_asm_insn ("fldcw\t%3", operands);
19331 if (stack_top_dies || dimode_p)
19332 output_asm_insn ("fistp%Z0\t%0", operands);
19333 else
19334 output_asm_insn ("fist%Z0\t%0", operands);
19335 if (round_mode != I387_CW_ANY)
19336 output_asm_insn ("fldcw\t%2", operands);
19339 return "";
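/* As a sketch (illustrative only), an SImode truncation without fisttp
   typically assembles to

     fldcw  %3        # load the round-toward-zero control word
     fistpl %0
     fldcw  %2        # restore the original control word

   where %2 and %3 name the stack slots prepared by
   emit_i387_cw_initialization; with fisttp a single "fisttpl %0"
   suffices and no control word switching is needed.  */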
19342 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19343 have the values zero or one, indicates the ffreep insn's operand
19344 from the OPERANDS array. */
19346 static const char *
19347 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19349 if (TARGET_USE_FFREEP)
19350 #ifdef HAVE_AS_IX86_FFREEP
19351 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19352 #else
19354 static char retval[32];
19355 int regno = REGNO (operands[opno]);
19357 gcc_assert (STACK_REGNO_P (regno));
19359 regno -= FIRST_STACK_REG;
19361 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19362 return retval;
19364 #endif
19366 return opno ? "fstp\t%y1" : "fstp\t%y0";
19370 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19371 should be used. UNORDERED_P is true when fucom should be used. */
19373 const char *
19374 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
19376 int stack_top_dies;
19377 rtx cmp_op0, cmp_op1;
19378 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
19380 if (eflags_p)
19382 cmp_op0 = operands[0];
19383 cmp_op1 = operands[1];
19385 else
19387 cmp_op0 = operands[1];
19388 cmp_op1 = operands[2];
19391 if (is_sse)
19393 if (GET_MODE (operands[0]) == SFmode)
19394 if (unordered_p)
19395 return "%vucomiss\t{%1, %0|%0, %1}";
19396 else
19397 return "%vcomiss\t{%1, %0|%0, %1}";
19398 else
19399 if (unordered_p)
19400 return "%vucomisd\t{%1, %0|%0, %1}";
19401 else
19402 return "%vcomisd\t{%1, %0|%0, %1}";
19405 gcc_assert (STACK_TOP_P (cmp_op0));
19407 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
19409 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
19411 if (stack_top_dies)
19413 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
19414 return output_387_ffreep (operands, 1);
19416 else
19417 return "ftst\n\tfnstsw\t%0";
19420 if (STACK_REG_P (cmp_op1)
19421 && stack_top_dies
19422 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
19423 && REGNO (cmp_op1) != FIRST_STACK_REG)
19425 /* If both the top of the 387 stack dies, and the other operand
19426 is also a stack register that dies, then this must be a
19427 `fcompp' float compare */
19429 if (eflags_p)
19431 /* There is no double popping fcomi variant. Fortunately,
19432 eflags is immune from the fstp's cc clobbering. */
19433 if (unordered_p)
19434 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
19435 else
19436 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
19437 return output_387_ffreep (operands, 0);
19439 else
19441 if (unordered_p)
19442 return "fucompp\n\tfnstsw\t%0";
19443 else
19444 return "fcompp\n\tfnstsw\t%0";
19447 else
19449 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
19451 static const char * const alt[16] =
19453 "fcom%Z2\t%y2\n\tfnstsw\t%0",
19454 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
19455 "fucom%Z2\t%y2\n\tfnstsw\t%0",
19456 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
19458 "ficom%Z2\t%y2\n\tfnstsw\t%0",
19459 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
19460 NULL,
19461 NULL,
19463 "fcomi\t{%y1, %0|%0, %y1}",
19464 "fcomip\t{%y1, %0|%0, %y1}",
19465 "fucomi\t{%y1, %0|%0, %y1}",
19466 "fucomip\t{%y1, %0|%0, %y1}",
19468 NULL,
19469 NULL,
19470 NULL,
19471 NULL
19474 int mask;
19475 const char *ret;
19477 mask = eflags_p << 3;
19478 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
19479 mask |= unordered_p << 1;
19480 mask |= stack_top_dies;
19482 gcc_assert (mask < 16);
19483 ret = alt[mask];
19484 gcc_assert (ret);
19486 return ret;
19490 void
19491 ix86_output_addr_vec_elt (FILE *file, int value)
19493 const char *directive = ASM_LONG;
19495 #ifdef ASM_QUAD
19496 if (TARGET_LP64)
19497 directive = ASM_QUAD;
19498 #else
19499 gcc_assert (!TARGET_64BIT);
19500 #endif
19502 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19505 void
19506 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19508 const char *directive = ASM_LONG;
19510 #ifdef ASM_QUAD
19511 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19512 directive = ASM_QUAD;
19513 #else
19514 gcc_assert (!TARGET_64BIT);
19515 #endif
19516 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19517 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19518 fprintf (file, "%s%s%d-%s%d\n",
19519 directive, LPREFIX, value, LPREFIX, rel);
19520 else if (HAVE_AS_GOTOFF_IN_DATA)
19521 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19522 #if TARGET_MACHO
19523 else if (TARGET_MACHO)
19525 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19526 machopic_output_function_base_name (file);
19527 putc ('\n', file);
19529 #endif
19530 else
19531 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19532 GOT_SYMBOL_NAME, LPREFIX, value);
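/* E.g. on 64-bit targets with a DImode case vector this emits
   ".quad .L5-.L2", on 32-bit PIC targets with GOTOFF-in-data support
   ".long .L5@GOTOFF", and otherwise the GOT-relative form (assuming the
   usual ".L" local label prefix; label numbers are illustrative).  */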
19535 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19536 for the target. */
19538 void
19539 ix86_expand_clear (rtx dest)
19541 rtx tmp;
19543 /* We play register width games, which are only valid after reload. */
19544 gcc_assert (reload_completed);
19546 /* Avoid HImode and its attendant prefix byte. */
19547 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19548 dest = gen_rtx_REG (SImode, REGNO (dest));
19549 tmp = gen_rtx_SET (dest, const0_rtx);
19551 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19553 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19554 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19557 emit_insn (tmp);
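/* Illustratively, clearing %eax yields "xorl %eax, %eax" (with the flags
   clobber attached in the parallel above) unless TARGET_USE_MOV0 is set
   and we are not optimizing for size, in which case a plain
   "movl $0, %eax" is emitted instead.  */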
19560 /* X is an unchanging MEM. If it is a constant pool reference, return
19561 the constant pool rtx, else NULL. */
19564 maybe_get_pool_constant (rtx x)
19566 x = ix86_delegitimize_address (XEXP (x, 0));
19568 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
19569 return get_pool_constant (x);
19571 return NULL_RTX;
19574 void
19575 ix86_expand_move (machine_mode mode, rtx operands[])
19577 rtx op0, op1;
19578 rtx tmp, addend = NULL_RTX;
19579 enum tls_model model;
19581 op0 = operands[0];
19582 op1 = operands[1];
19584 switch (GET_CODE (op1))
19586 case CONST:
19587 tmp = XEXP (op1, 0);
19589 if (GET_CODE (tmp) != PLUS
19590 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19591 break;
19593 op1 = XEXP (tmp, 0);
19594 addend = XEXP (tmp, 1);
19595 /* FALLTHRU */
19597 case SYMBOL_REF:
19598 model = SYMBOL_REF_TLS_MODEL (op1);
19600 if (model)
19601 op1 = legitimize_tls_address (op1, model, true);
19602 else if (ix86_force_load_from_GOT_p (op1))
19604 /* Load the external function address via GOT slot to avoid PLT. */
19605 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19606 (TARGET_64BIT
19607 ? UNSPEC_GOTPCREL
19608 : UNSPEC_GOT));
19609 op1 = gen_rtx_CONST (Pmode, op1);
19610 op1 = gen_const_mem (Pmode, op1);
19611 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19613 else
19615 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19616 if (tmp)
19618 op1 = tmp;
19619 if (!addend)
19620 break;
19622 else
19624 op1 = operands[1];
19625 break;
19629 if (addend)
19631 op1 = force_operand (op1, NULL_RTX);
19632 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19633 op0, 1, OPTAB_DIRECT);
19635 else
19636 op1 = force_operand (op1, op0);
19638 if (op1 == op0)
19639 return;
19641 op1 = convert_to_mode (mode, op1, 1);
19643 default:
19644 break;
19647 if ((flag_pic || MACHOPIC_INDIRECT)
19648 && symbolic_operand (op1, mode))
19650 if (TARGET_MACHO && !TARGET_64BIT)
19652 #if TARGET_MACHO
19653 /* dynamic-no-pic */
19654 if (MACHOPIC_INDIRECT)
19656 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19657 ? op0 : gen_reg_rtx (Pmode);
19658 op1 = machopic_indirect_data_reference (op1, temp);
19659 if (MACHOPIC_PURE)
19660 op1 = machopic_legitimize_pic_address (op1, mode,
19661 temp == op1 ? 0 : temp);
19663 if (op0 != op1 && GET_CODE (op0) != MEM)
19665 rtx insn = gen_rtx_SET (op0, op1);
19666 emit_insn (insn);
19667 return;
19669 if (GET_CODE (op0) == MEM)
19670 op1 = force_reg (Pmode, op1);
19671 else
19673 rtx temp = op0;
19674 if (GET_CODE (temp) != REG)
19675 temp = gen_reg_rtx (Pmode);
19676 temp = legitimize_pic_address (op1, temp);
19677 if (temp == op0)
19678 return;
19679 op1 = temp;
19681 /* dynamic-no-pic */
19682 #endif
19684 else
19686 if (MEM_P (op0))
19687 op1 = force_reg (mode, op1);
19688 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19690 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19691 op1 = legitimize_pic_address (op1, reg);
19692 if (op0 == op1)
19693 return;
19694 op1 = convert_to_mode (mode, op1, 1);
19698 else
19700 if (MEM_P (op0)
19701 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19702 || !push_operand (op0, mode))
19703 && MEM_P (op1))
19704 op1 = force_reg (mode, op1);
19706 if (push_operand (op0, mode)
19707 && ! general_no_elim_operand (op1, mode))
19708 op1 = copy_to_mode_reg (mode, op1);
19710 /* Force large constants in 64bit compilation into register
19711 to get them CSEed. */
19712 if (can_create_pseudo_p ()
19713 && (mode == DImode) && TARGET_64BIT
19714 && immediate_operand (op1, mode)
19715 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19716 && !register_operand (op0, mode)
19717 && optimize)
19718 op1 = copy_to_mode_reg (mode, op1);
19720 if (can_create_pseudo_p ()
19721 && CONST_DOUBLE_P (op1))
19723 /* If we are loading a floating point constant to a register,
19724 force the value to memory now, since we'll get better code
19725 out of the back end. */
19727 op1 = validize_mem (force_const_mem (mode, op1));
19728 if (!register_operand (op0, mode))
19730 rtx temp = gen_reg_rtx (mode);
19731 emit_insn (gen_rtx_SET (temp, op1));
19732 emit_move_insn (op0, temp);
19733 return;
19738 emit_insn (gen_rtx_SET (op0, op1));
19741 void
19742 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19744 rtx op0 = operands[0], op1 = operands[1];
19745 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19746 psABI since the biggest alignment there is 4 bytes. */
19747 unsigned int align = (TARGET_IAMCU
19748 ? GET_MODE_BITSIZE (mode)
19749 : GET_MODE_ALIGNMENT (mode));
19751 if (push_operand (op0, VOIDmode))
19752 op0 = emit_move_resolve_push (mode, op0);
19754 /* Force constants other than zero into memory. We do not know how
19755 the instructions used to build constants modify the upper 64 bits
19756 of the register; once we have that information we may be able
19757 to handle some of them more efficiently. */
19758 if (can_create_pseudo_p ()
19759 && (CONSTANT_P (op1)
19760 || (SUBREG_P (op1)
19761 && CONSTANT_P (SUBREG_REG (op1))))
19762 && ((register_operand (op0, mode)
19763 && !standard_sse_constant_p (op1, mode))
19764 /* ix86_expand_vector_move_misalign() does not like constants. */
19765 || (SSE_REG_MODE_P (mode)
19766 && MEM_P (op0)
19767 && MEM_ALIGN (op0) < align)))
19769 if (SUBREG_P (op1))
19771 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19772 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19773 if (r)
19774 r = validize_mem (r);
19775 else
19776 r = force_reg (imode, SUBREG_REG (op1));
19777 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19779 else
19780 op1 = validize_mem (force_const_mem (mode, op1));
19783 /* We need to check memory alignment for SSE mode since attribute
19784 can make operands unaligned. */
19785 if (can_create_pseudo_p ()
19786 && SSE_REG_MODE_P (mode)
19787 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19788 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19790 rtx tmp[2];
19792 /* ix86_expand_vector_move_misalign() does not like both
19793 arguments in memory. */
19794 if (!register_operand (op0, mode)
19795 && !register_operand (op1, mode))
19796 op1 = force_reg (mode, op1);
19798 tmp[0] = op0; tmp[1] = op1;
19799 ix86_expand_vector_move_misalign (mode, tmp);
19800 return;
19803 /* Make operand1 a register if it isn't already. */
19804 if (can_create_pseudo_p ()
19805 && !register_operand (op0, mode)
19806 && !register_operand (op1, mode))
19808 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19809 return;
19812 emit_insn (gen_rtx_SET (op0, op1));
19815 /* Split 32-byte AVX unaligned load and store if needed. */
19817 static void
19818 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19820 rtx m;
19821 rtx (*extract) (rtx, rtx, rtx);
19822 machine_mode mode;
19824 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19825 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19827 emit_insn (gen_rtx_SET (op0, op1));
19828 return;
19831 rtx orig_op0 = NULL_RTX;
19832 mode = GET_MODE (op0);
19833 switch (GET_MODE_CLASS (mode))
19835 case MODE_VECTOR_INT:
19836 case MODE_INT:
19837 if (mode != V32QImode)
19839 if (!MEM_P (op0))
19841 orig_op0 = op0;
19842 op0 = gen_reg_rtx (V32QImode);
19844 else
19845 op0 = gen_lowpart (V32QImode, op0);
19846 op1 = gen_lowpart (V32QImode, op1);
19847 mode = V32QImode;
19849 break;
19850 case MODE_VECTOR_FLOAT:
19851 break;
19852 default:
19853 gcc_unreachable ();
19856 switch (mode)
19858 default:
19859 gcc_unreachable ();
19860 case V32QImode:
19861 extract = gen_avx_vextractf128v32qi;
19862 mode = V16QImode;
19863 break;
19864 case V8SFmode:
19865 extract = gen_avx_vextractf128v8sf;
19866 mode = V4SFmode;
19867 break;
19868 case V4DFmode:
19869 extract = gen_avx_vextractf128v4df;
19870 mode = V2DFmode;
19871 break;
19874 if (MEM_P (op1))
19876 rtx r = gen_reg_rtx (mode);
19877 m = adjust_address (op1, mode, 0);
19878 emit_move_insn (r, m);
19879 m = adjust_address (op1, mode, 16);
19880 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19881 emit_move_insn (op0, r);
19883 else if (MEM_P (op0))
19885 m = adjust_address (op0, mode, 0);
19886 emit_insn (extract (m, op1, const0_rtx));
19887 m = adjust_address (op0, mode, 16);
19888 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19890 else
19891 gcc_unreachable ();
19893 if (orig_op0)
19894 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
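/* Sketch of the result for an unaligned 32-byte load split by the
   function above (illustrative AT&T syntax):

     vmovups      mem, %xmm0
     vinsertf128  $0x1, mem+16, %ymm0, %ymm0

   and, symmetrically, an unaligned 32-byte store becomes two 16-byte
   stores through vextractf128.  */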
19897 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19898 straight to ix86_expand_vector_move. */
19899 /* Code generation for scalar reg-reg moves of single and double precision data:
19900 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19901 movaps reg, reg
19902 else
19903 movss reg, reg
19904 if (x86_sse_partial_reg_dependency == true)
19905 movapd reg, reg
19906 else
19907 movsd reg, reg
19909 Code generation for scalar loads of double precision data:
19910 if (x86_sse_split_regs == true)
19911 movlpd mem, reg (gas syntax)
19912 else
19913 movsd mem, reg
19915 Code generation for unaligned packed loads of single precision data
19916 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19917 if (x86_sse_unaligned_move_optimal)
19918 movups mem, reg
19920 if (x86_sse_partial_reg_dependency == true)
19922 xorps reg, reg
19923 movlps mem, reg
19924 movhps mem+8, reg
19926 else
19928 movlps mem, reg
19929 movhps mem+8, reg
19932 Code generation for unaligned packed loads of double precision data
19933 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19934 if (x86_sse_unaligned_move_optimal)
19935 movupd mem, reg
19937 if (x86_sse_split_regs == true)
19939 movlpd mem, reg
19940 movhpd mem+8, reg
19942 else
19944 movsd mem, reg
19945 movhpd mem+8, reg
19949 void
19950 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19952 rtx op0, op1, m;
19954 op0 = operands[0];
19955 op1 = operands[1];
19957 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19958 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19960 emit_insn (gen_rtx_SET (op0, op1));
19961 return;
19964 if (TARGET_AVX)
19966 if (GET_MODE_SIZE (mode) == 32)
19967 ix86_avx256_split_vector_move_misalign (op0, op1);
19968 else
19969 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19970 emit_insn (gen_rtx_SET (op0, op1));
19971 return;
19974 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19975 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19977 emit_insn (gen_rtx_SET (op0, op1));
19978 return;
19981 /* ??? If we have typed data, then it would appear that using
19982 movdqu is the only way to get unaligned data loaded with
19983 integer type. */
19984 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19986 emit_insn (gen_rtx_SET (op0, op1));
19987 return;
19990 if (MEM_P (op1))
19992 if (TARGET_SSE2 && mode == V2DFmode)
19994 rtx zero;
19996 /* When SSE registers are split into halves, we can avoid
19997 writing to the top half twice. */
19998 if (TARGET_SSE_SPLIT_REGS)
20000 emit_clobber (op0);
20001 zero = op0;
20003 else
20005 /* ??? Not sure about the best option for the Intel chips.
20006 The following would seem to satisfy; the register is
20007 entirely cleared, breaking the dependency chain. We
20008 then store to the upper half, with a dependency depth
20009 of one. A rumor has it that Intel recommends two movsd
20010 followed by an unpacklpd, but this is unconfirmed. And
20011 given that the dependency depth of the unpacklpd would
20012 still be one, I'm not sure why this would be better. */
20013 zero = CONST0_RTX (V2DFmode);
20016 m = adjust_address (op1, DFmode, 0);
20017 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20018 m = adjust_address (op1, DFmode, 8);
20019 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20021 else
20023 rtx t;
20025 if (mode != V4SFmode)
20026 t = gen_reg_rtx (V4SFmode);
20027 else
20028 t = op0;
20030 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20031 emit_move_insn (t, CONST0_RTX (V4SFmode));
20032 else
20033 emit_clobber (t);
20035 m = adjust_address (op1, V2SFmode, 0);
20036 emit_insn (gen_sse_loadlps (t, t, m));
20037 m = adjust_address (op1, V2SFmode, 8);
20038 emit_insn (gen_sse_loadhps (t, t, m));
20039 if (mode != V4SFmode)
20040 emit_move_insn (op0, gen_lowpart (mode, t));
20043 else if (MEM_P (op0))
20045 if (TARGET_SSE2 && mode == V2DFmode)
20047 m = adjust_address (op0, DFmode, 0);
20048 emit_insn (gen_sse2_storelpd (m, op1));
20049 m = adjust_address (op0, DFmode, 8);
20050 emit_insn (gen_sse2_storehpd (m, op1));
20052 else
20054 if (mode != V4SFmode)
20055 op1 = gen_lowpart (V4SFmode, op1);
20057 m = adjust_address (op0, V2SFmode, 0);
20058 emit_insn (gen_sse_storelps (m, op1));
20059 m = adjust_address (op0, V2SFmode, 8);
20060 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20063 else
20064 gcc_unreachable ();
20067 /* Helper function of ix86_fixup_binary_operands to canonicalize
20068 operand order. Returns true if the operands should be swapped. */
20070 static bool
20071 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20072 rtx operands[])
20074 rtx dst = operands[0];
20075 rtx src1 = operands[1];
20076 rtx src2 = operands[2];
20078 /* If the operation is not commutative, we can't do anything. */
20079 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
20080 return false;
20082 /* Highest priority is that src1 should match dst. */
20083 if (rtx_equal_p (dst, src1))
20084 return false;
20085 if (rtx_equal_p (dst, src2))
20086 return true;
20088 /* Next highest priority is that immediate constants come second. */
20089 if (immediate_operand (src2, mode))
20090 return false;
20091 if (immediate_operand (src1, mode))
20092 return true;
20094 /* Lowest priority is that memory references should come second. */
20095 if (MEM_P (src2))
20096 return false;
20097 if (MEM_P (src1))
20098 return true;
20100 return false;
20104 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20105 destination to use for the operation. If different from the true
20106 destination in operands[0], a copy operation will be required. */
20109 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20110 rtx operands[])
20112 rtx dst = operands[0];
20113 rtx src1 = operands[1];
20114 rtx src2 = operands[2];
20116 /* Canonicalize operand order. */
20117 if (ix86_swap_binary_operands_p (code, mode, operands))
20119 /* It is invalid to swap operands of different modes. */
20120 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20122 std::swap (src1, src2);
20125 /* Both source operands cannot be in memory. */
20126 if (MEM_P (src1) && MEM_P (src2))
20128 /* Optimization: Only read from memory once. */
20129 if (rtx_equal_p (src1, src2))
20131 src2 = force_reg (mode, src2);
20132 src1 = src2;
20134 else if (rtx_equal_p (dst, src1))
20135 src2 = force_reg (mode, src2);
20136 else
20137 src1 = force_reg (mode, src1);
20140 /* If the destination is memory, and we do not have matching source
20141 operands, do things in registers. */
20142 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20143 dst = gen_reg_rtx (mode);
20145 /* Source 1 cannot be a constant. */
20146 if (CONSTANT_P (src1))
20147 src1 = force_reg (mode, src1);
20149 /* Source 1 cannot be a non-matching memory. */
20150 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20151 src1 = force_reg (mode, src1);
20153 /* Improve address combine. */
20154 if (code == PLUS
20155 && GET_MODE_CLASS (mode) == MODE_INT
20156 && MEM_P (src2))
20157 src2 = force_reg (mode, src2);
20159 operands[1] = src1;
20160 operands[2] = src2;
20161 return dst;
20164 /* Similarly, but assume that the destination has already been
20165 set up properly. */
20167 void
20168 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20169 machine_mode mode, rtx operands[])
20171 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20172 gcc_assert (dst == operands[0]);
20175 /* Attempt to expand a binary operator. Make the expansion closer to the
20176 actual machine than just general_operand, which would allow 3 separate
20177 memory references (one output, two inputs) in a single insn. */
20179 void
20180 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20181 rtx operands[])
20183 rtx src1, src2, dst, op, clob;
20185 dst = ix86_fixup_binary_operands (code, mode, operands);
20186 src1 = operands[1];
20187 src2 = operands[2];
20189 /* Emit the instruction. */
20191 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20193 if (reload_completed
20194 && code == PLUS
20195 && !rtx_equal_p (dst, src1))
20197 /* This is going to be an LEA; avoid splitting it later. */
20198 emit_insn (op);
20200 else
20202 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20203 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20206 /* Fix up the destination if needed. */
20207 if (dst != operands[0])
20208 emit_move_insn (operands[0], dst);
20211 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20212 the given OPERANDS. */
20214 void
20215 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20216 rtx operands[])
20218 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20219 if (SUBREG_P (operands[1]))
20221 op1 = operands[1];
20222 op2 = operands[2];
20224 else if (SUBREG_P (operands[2]))
20226 op1 = operands[2];
20227 op2 = operands[1];
20229 /* Optimize (__m128i) d | (__m128i) e and similar code
20230 when d and e are float vectors into a float vector logical
20231 insn. In C/C++ without using intrinsics there is no other way
20232 to express a vector logical operation on float vectors than
20233 to cast them temporarily to integer vectors. */
20234 if (op1
20235 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20236 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20237 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20238 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20239 && SUBREG_BYTE (op1) == 0
20240 && (GET_CODE (op2) == CONST_VECTOR
20241 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20242 && SUBREG_BYTE (op2) == 0))
20243 && can_create_pseudo_p ())
20245 rtx dst;
20246 switch (GET_MODE (SUBREG_REG (op1)))
20248 case V4SFmode:
20249 case V8SFmode:
20250 case V16SFmode:
20251 case V2DFmode:
20252 case V4DFmode:
20253 case V8DFmode:
20254 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20255 if (GET_CODE (op2) == CONST_VECTOR)
20257 op2 = gen_lowpart (GET_MODE (dst), op2);
20258 op2 = force_reg (GET_MODE (dst), op2);
20260 else
20262 op1 = operands[1];
20263 op2 = SUBREG_REG (operands[2]);
20264 if (!vector_operand (op2, GET_MODE (dst)))
20265 op2 = force_reg (GET_MODE (dst), op2);
20267 op1 = SUBREG_REG (op1);
20268 if (!vector_operand (op1, GET_MODE (dst)))
20269 op1 = force_reg (GET_MODE (dst), op1);
20270 emit_insn (gen_rtx_SET (dst,
20271 gen_rtx_fmt_ee (code, GET_MODE (dst),
20272 op1, op2)));
20273 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20274 return;
20275 default:
20276 break;
20279 if (!vector_operand (operands[1], mode))
20280 operands[1] = force_reg (mode, operands[1]);
20281 if (!vector_operand (operands[2], mode))
20282 operands[2] = force_reg (mode, operands[2]);
20283 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20284 emit_insn (gen_rtx_SET (operands[0],
20285 gen_rtx_fmt_ee (code, mode, operands[1],
20286 operands[2])));
20289 /* Return TRUE or FALSE depending on whether the binary operator meets the
20290 appropriate constraints. */
20292 bool
20293 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20294 rtx operands[3])
20296 rtx dst = operands[0];
20297 rtx src1 = operands[1];
20298 rtx src2 = operands[2];
20300 /* Both source operands cannot be in memory. */
20301 if (MEM_P (src1) && MEM_P (src2))
20302 return false;
20304 /* Canonicalize operand order for commutative operators. */
20305 if (ix86_swap_binary_operands_p (code, mode, operands))
20306 std::swap (src1, src2);
20308 /* If the destination is memory, we must have a matching source operand. */
20309 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20310 return false;
20312 /* Source 1 cannot be a constant. */
20313 if (CONSTANT_P (src1))
20314 return false;
20316 /* Source 1 cannot be a non-matching memory. */
20317 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20318 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20319 return (code == AND
20320 && (mode == HImode
20321 || mode == SImode
20322 || (TARGET_64BIT && mode == DImode))
20323 && satisfies_constraint_L (src2));
20325 return true;
20328 /* Attempt to expand a unary operator. Make the expansion closer to the
20329 actual machine than just general_operand, which would allow 2 separate
20330 memory references (one output, one input) in a single insn. */
20332 void
20333 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20334 rtx operands[])
20336 bool matching_memory = false;
20337 rtx src, dst, op, clob;
20339 dst = operands[0];
20340 src = operands[1];
20342 /* If the destination is memory, and we do not have matching source
20343 operands, do things in registers. */
20344 if (MEM_P (dst))
20346 if (rtx_equal_p (dst, src))
20347 matching_memory = true;
20348 else
20349 dst = gen_reg_rtx (mode);
20352 /* When source operand is memory, destination must match. */
20353 if (MEM_P (src) && !matching_memory)
20354 src = force_reg (mode, src);
20356 /* Emit the instruction. */
20358 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20360 if (code == NOT)
20361 emit_insn (op);
20362 else
20364 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20365 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20368 /* Fix up the destination if needed. */
20369 if (dst != operands[0])
20370 emit_move_insn (operands[0], dst);
20373 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20374 divisor are within the range [0-255]. */
20376 void
20377 ix86_split_idivmod (machine_mode mode, rtx operands[],
20378 bool signed_p)
20380 rtx_code_label *end_label, *qimode_label;
20381 rtx insn, div, mod;
20382 rtx scratch, tmp0, tmp1, tmp2;
20383 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20384 rtx (*gen_zero_extend) (rtx, rtx);
20385 rtx (*gen_test_ccno_1) (rtx, rtx);
20387 switch (mode)
20389 case SImode:
20390 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20391 gen_test_ccno_1 = gen_testsi_ccno_1;
20392 gen_zero_extend = gen_zero_extendqisi2;
20393 break;
20394 case DImode:
20395 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20396 gen_test_ccno_1 = gen_testdi_ccno_1;
20397 gen_zero_extend = gen_zero_extendqidi2;
20398 break;
20399 default:
20400 gcc_unreachable ();
20403 end_label = gen_label_rtx ();
20404 qimode_label = gen_label_rtx ();
20406 scratch = gen_reg_rtx (mode);
20408 /* Use 8bit unsigned divmod if dividend and divisor are within
20409 the range [0-255]. */
20410 emit_move_insn (scratch, operands[2]);
20411 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20412 scratch, 1, OPTAB_DIRECT);
20413 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20414 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20415 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20416 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20417 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20418 pc_rtx);
20419 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20420 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20421 JUMP_LABEL (insn) = qimode_label;
20423 /* Generate the original signed/unsigned divmod. */
20424 div = gen_divmod4_1 (operands[0], operands[1],
20425 operands[2], operands[3]);
20426 emit_insn (div);
20428 /* Branch to the end. */
20429 emit_jump_insn (gen_jump (end_label));
20430 emit_barrier ();
20432 /* Generate 8bit unsigned divide. */
20433 emit_label (qimode_label);
20434 /* Don't use operands[0] for result of 8bit divide since not all
20435 registers support QImode ZERO_EXTRACT. */
20436 tmp0 = lowpart_subreg (HImode, scratch, mode);
20437 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20438 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20439 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20441 if (signed_p)
20443 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
20444 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
20446 else
20448 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
20449 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
20452 /* Extract remainder from AH. */
20453 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
20454 if (REG_P (operands[1]))
20455 insn = emit_move_insn (operands[1], tmp1);
20456 else
20458 /* Need a new scratch register since the old one has result
20459 of 8bit divide. */
20460 scratch = gen_reg_rtx (mode);
20461 emit_move_insn (scratch, tmp1);
20462 insn = emit_move_insn (operands[1], scratch);
20464 set_unique_reg_note (insn, REG_EQUAL, mod);
20466 /* Zero extend quotient from AL. */
20467 tmp1 = gen_lowpart (QImode, tmp0);
20468 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20469 set_unique_reg_note (insn, REG_EQUAL, div);
20471 emit_label (end_label);
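/* Schematically, the expansion above produces (illustrative pseudo
   assembly for SImode; label names are made up):

       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch        # both operands within [0, 255]?
       je     .Lqimode
       ...full 32-bit div/idiv...
       jmp    .Ldone
     .Lqimode:
       ...8-bit divb via udivmodhiqi3, AL = quotient, AH = remainder...
     .Ldone:
*/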
20474 #define LEA_MAX_STALL (3)
20475 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20477 /* Increase given DISTANCE in half-cycles according to
20478 dependencies between PREV and NEXT instructions.
20479 Add 1 half-cycle if there is no dependency and
20480 go to the next cycle if there is some dependency. */
20482 static unsigned int
20483 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20485 df_ref def, use;
20487 if (!prev || !next)
20488 return distance + (distance & 1) + 2;
20490 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20491 return distance + 1;
20493 FOR_EACH_INSN_USE (use, next)
20494 FOR_EACH_INSN_DEF (def, prev)
20495 if (!DF_REF_IS_ARTIFICIAL (def)
20496 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20497 return distance + (distance & 1) + 2;
20499 return distance + 1;
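/* Worked example: with DISTANCE currently 3 half-cycles, a dependency
   between PREV and NEXT rounds the count up to an even number of
   half-cycles and then charges a full extra cycle, 3 + 1 + 2 = 6,
   whereas independent instructions just add one half-cycle, 3 + 1 = 4.  */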
20502 /* Function checks if instruction INSN defines register number
20503 REGNO1 or REGNO2. */
20505 static bool
20506 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20507 rtx_insn *insn)
20509 df_ref def;
20511 FOR_EACH_INSN_DEF (def, insn)
20512 if (DF_REF_REG_DEF_P (def)
20513 && !DF_REF_IS_ARTIFICIAL (def)
20514 && (regno1 == DF_REF_REGNO (def)
20515 || regno2 == DF_REF_REGNO (def)))
20516 return true;
20518 return false;
20521 /* Function checks if instruction INSN uses register number
20522 REGNO as a part of address expression. */
20524 static bool
20525 insn_uses_reg_mem (unsigned int regno, rtx insn)
20527 df_ref use;
20529 FOR_EACH_INSN_USE (use, insn)
20530 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20531 return true;
20533 return false;
20536 /* Search backward for a non-agu definition of register number REGNO1
20537 or register number REGNO2 in the basic block, starting from instruction
20538 START up to the head of the basic block or instruction INSN.
20540 Sets *FOUND to true if a definition was found
20541 and to false otherwise.
20543 The distance in half-cycles between START and the found instruction or the
20544 head of the BB is added to DISTANCE and returned. */
20546 static int
20547 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20548 rtx_insn *insn, int distance,
20549 rtx_insn *start, bool *found)
20551 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20552 rtx_insn *prev = start;
20553 rtx_insn *next = NULL;
20555 *found = false;
20557 while (prev
20558 && prev != insn
20559 && distance < LEA_SEARCH_THRESHOLD)
20561 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20563 distance = increase_distance (prev, next, distance);
20564 if (insn_defines_reg (regno1, regno2, prev))
20566 if (recog_memoized (prev) < 0
20567 || get_attr_type (prev) != TYPE_LEA)
20569 *found = true;
20570 return distance;
20574 next = prev;
20576 if (prev == BB_HEAD (bb))
20577 break;
20579 prev = PREV_INSN (prev);
20582 return distance;
20585 /* Search backward for a non-agu definition of register number REGNO1
20586 or register number REGNO2 in INSN's basic block until we
20587 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20588 2. Reach a neighbor BB's boundary, or
20589 3. Reach an agu definition.
20590 Returns the distance between the non-agu definition point and INSN.
20591 If there is no definition point, returns -1. */
20593 static int
20594 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20595 rtx_insn *insn)
20597 basic_block bb = BLOCK_FOR_INSN (insn);
20598 int distance = 0;
20599 bool found = false;
20601 if (insn != BB_HEAD (bb))
20602 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20603 distance, PREV_INSN (insn),
20604 &found);
20606 if (!found && distance < LEA_SEARCH_THRESHOLD)
20608 edge e;
20609 edge_iterator ei;
20610 bool simple_loop = false;
20612 FOR_EACH_EDGE (e, ei, bb->preds)
20613 if (e->src == bb)
20615 simple_loop = true;
20616 break;
20619 if (simple_loop)
20620 distance = distance_non_agu_define_in_bb (regno1, regno2,
20621 insn, distance,
20622 BB_END (bb), &found);
20623 else
20625 int shortest_dist = -1;
20626 bool found_in_bb = false;
20628 FOR_EACH_EDGE (e, ei, bb->preds)
20630 int bb_dist
20631 = distance_non_agu_define_in_bb (regno1, regno2,
20632 insn, distance,
20633 BB_END (e->src),
20634 &found_in_bb);
20635 if (found_in_bb)
20637 if (shortest_dist < 0)
20638 shortest_dist = bb_dist;
20639 else if (bb_dist > 0)
20640 shortest_dist = MIN (bb_dist, shortest_dist);
20642 found = true;
20646 distance = shortest_dist;
20650 /* get_attr_type may modify recog data. We want to make sure
20651 that recog data is valid for instruction INSN, on which
20652 distance_non_agu_define is called. INSN is unchanged here. */
20653 extract_insn_cached (insn);
20655 if (!found)
20656 return -1;
20658 return distance >> 1;
20661 /* Return the distance in half-cycles between INSN and the next
20662 insn that uses register number REGNO in a memory address, added
20663 to DISTANCE. Return -1 if REGNO is set.
20665 Set *FOUND to true if a register usage was found and
20666 to false otherwise.
20667 Set *REDEFINED to true if a register redefinition was
20668 found and to false otherwise. */
20670 static int
20671 distance_agu_use_in_bb (unsigned int regno,
20672 rtx_insn *insn, int distance, rtx_insn *start,
20673 bool *found, bool *redefined)
20675 basic_block bb = NULL;
20676 rtx_insn *next = start;
20677 rtx_insn *prev = NULL;
20679 *found = false;
20680 *redefined = false;
20682 if (start != NULL_RTX)
20684 bb = BLOCK_FOR_INSN (start);
20685 if (start != BB_HEAD (bb))
20686 /* If insn and start belong to the same bb, set prev to insn,
20687 so the call to increase_distance will increase the distance
20688 between insns by 1. */
20689 prev = insn;
20692 while (next
20693 && next != insn
20694 && distance < LEA_SEARCH_THRESHOLD)
20696 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20698 distance = increase_distance(prev, next, distance);
20699 if (insn_uses_reg_mem (regno, next))
20701 /* Return DISTANCE if OP0 is used in memory
20702 address in NEXT. */
20703 *found = true;
20704 return distance;
20707 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20709 /* Return -1 if OP0 is set in NEXT. */
20710 *redefined = true;
20711 return -1;
20714 prev = next;
20717 if (next == BB_END (bb))
20718 break;
20720 next = NEXT_INSN (next);
20723 return distance;
20726 /* Return the distance between INSN and the next insn that uses
20727 register number REGNO0 in a memory address. Return -1 if no such
20728 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
20730 static int
20731 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20733 basic_block bb = BLOCK_FOR_INSN (insn);
20734 int distance = 0;
20735 bool found = false;
20736 bool redefined = false;
20738 if (insn != BB_END (bb))
20739 distance = distance_agu_use_in_bb (regno0, insn, distance,
20740 NEXT_INSN (insn),
20741 &found, &redefined);
20743 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20745 edge e;
20746 edge_iterator ei;
20747 bool simple_loop = false;
20749 FOR_EACH_EDGE (e, ei, bb->succs)
20750 if (e->dest == bb)
20752 simple_loop = true;
20753 break;
20756 if (simple_loop)
20757 distance = distance_agu_use_in_bb (regno0, insn,
20758 distance, BB_HEAD (bb),
20759 &found, &redefined);
20760 else
20762 int shortest_dist = -1;
20763 bool found_in_bb = false;
20764 bool redefined_in_bb = false;
20766 FOR_EACH_EDGE (e, ei, bb->succs)
20768 int bb_dist
20769 = distance_agu_use_in_bb (regno0, insn,
20770 distance, BB_HEAD (e->dest),
20771 &found_in_bb, &redefined_in_bb);
20772 if (found_in_bb)
20774 if (shortest_dist < 0)
20775 shortest_dist = bb_dist;
20776 else if (bb_dist > 0)
20777 shortest_dist = MIN (bb_dist, shortest_dist);
20779 found = true;
20783 distance = shortest_dist;
20787 if (!found || redefined)
20788 return -1;
20790 return distance >> 1;
20793 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20794 there is a dilemma of choosing LEA or ADD.
20795 Negative value: ADD is preferred over LEA
20796 Zero: Neutral
20797 Positive value: LEA is preferred over ADD. */
20798 #define IX86_LEA_PRIORITY 0
20800 /* Return true if using the lea INSN has a performance advantage
20801 over a sequence of instructions. The instruction sequence has
20802 SPLIT_COST cycles higher latency than the lea itself. */
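/* A hypothetical example: with dist_define = 2 half-cycles,
   split_cost = 1 and dist_use = 4, the adjusted definition distance
   becomes 2 + 1 + IX86_LEA_PRIORITY = 3, which is below dist_use,
   so the split sequence wins and this function returns false. */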
20804 static bool
20805 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20806 unsigned int regno2, int split_cost, bool has_scale)
20808 int dist_define, dist_use;
20810 /* For Silvermont, if a 2-source or 3-source LEA is used for
20811 non-destructive destination purposes, or because we want the
20812 ability to use SCALE, the use of LEA is justified. */
20813 if (TARGET_SILVERMONT || TARGET_INTEL)
20815 if (has_scale)
20816 return true;
20817 if (split_cost < 1)
20818 return false;
20819 if (regno0 == regno1 || regno0 == regno2)
20820 return false;
20821 return true;
20824 dist_define = distance_non_agu_define (regno1, regno2, insn);
20825 dist_use = distance_agu_use (regno0, insn);
20827 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20829 /* If there is no non-AGU operand definition, no AGU
20830 operand usage and the split cost is 0, then both the lea
20831 and non-lea variants have the same priority. Currently
20832 we prefer lea for 64-bit code and non-lea for 32-bit
20833 code. */
20834 if (dist_use < 0 && split_cost == 0)
20835 return TARGET_64BIT || IX86_LEA_PRIORITY;
20836 else
20837 return true;
20840 /* With a longer definition distance, lea is preferable.
20841 Here we adjust it to take the splitting cost and the
20842 lea priority into account. */
20843 dist_define += split_cost + IX86_LEA_PRIORITY;
20845 /* If there is no use in a memory address then we just check
20846 that the split cost exceeds the AGU stall. */
20847 if (dist_use < 0)
20848 return dist_define > LEA_MAX_STALL;
20850 /* If this insn has both backward non-agu dependence and forward
20851 agu dependence, the one with short distance takes effect. */
20852 return dist_define >= dist_use;
20855 /* Return true if it is legal to clobber flags by INSN and
20856 false otherwise. */
20858 static bool
20859 ix86_ok_to_clobber_flags (rtx_insn *insn)
20861 basic_block bb = BLOCK_FOR_INSN (insn);
20862 df_ref use;
20863 bitmap live;
20865 while (insn)
20867 if (NONDEBUG_INSN_P (insn))
20869 FOR_EACH_INSN_USE (use, insn)
20870 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20871 return false;
20873 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20874 return true;
20877 if (insn == BB_END (bb))
20878 break;
20880 insn = NEXT_INSN (insn);
20883 live = df_get_live_out(bb);
20884 return !REGNO_REG_SET_P (live, FLAGS_REG);
20887 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20888 move and add to avoid AGU stalls. */
20890 bool
20891 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20893 unsigned int regno0, regno1, regno2;
20895 /* Check if we need to optimize. */
20896 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20897 return false;
20899 /* Check it is correct to split here. */
20900 if (!ix86_ok_to_clobber_flags(insn))
20901 return false;
20903 regno0 = true_regnum (operands[0]);
20904 regno1 = true_regnum (operands[1]);
20905 regno2 = true_regnum (operands[2]);
20907 /* We need to split only adds with a non-destructive
20908 destination operand. */
20909 if (regno0 == regno1 || regno0 == regno2)
20910 return false;
20911 else
20912 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20915 /* Return true if we should emit lea instruction instead of mov
20916 instruction. */
20918 bool
20919 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20921 unsigned int regno0, regno1;
20923 /* Check if we need to optimize. */
20924 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20925 return false;
20927 /* Use lea for reg to reg moves only. */
20928 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20929 return false;
20931 regno0 = true_regnum (operands[0]);
20932 regno1 = true_regnum (operands[1]);
20934 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20937 /* Return true if we need to split lea into a sequence of
20938 instructions to avoid AGU stalls. */
20940 bool
20941 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20943 unsigned int regno0, regno1, regno2;
20944 int split_cost;
20945 struct ix86_address parts;
20946 int ok;
20948 /* Check we need to optimize. */
20949 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20950 return false;
20952 /* The "at least two components" test below might not catch simple
20953 move or zero extension insns if parts.base is non-NULL and parts.disp
20954 is const0_rtx as the only components in the address, e.g. if the
20955 register is %rbp or %r13. As this test is much cheaper and moves or
20956 zero extensions are the common case, do this check first. */
20957 if (REG_P (operands[1])
20958 || (SImode_address_operand (operands[1], VOIDmode)
20959 && REG_P (XEXP (operands[1], 0))))
20960 return false;
20962 /* Check if it is OK to split here. */
20963 if (!ix86_ok_to_clobber_flags (insn))
20964 return false;
20966 ok = ix86_decompose_address (operands[1], &parts);
20967 gcc_assert (ok);
20969 /* There should be at least two components in the address. */
20970 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20971 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20972 return false;
20974 /* We should not split into add if a non-legitimate PIC
20975 operand is used as the displacement. */
20976 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20977 return false;
20979 regno0 = true_regnum (operands[0]);
20980 regno1 = INVALID_REGNUM;
20981 regno2 = INVALID_REGNUM;
20983 if (parts.base)
20984 regno1 = true_regnum (parts.base);
20985 if (parts.index)
20986 regno2 = true_regnum (parts.index);
20988 split_cost = 0;
20990 /* Compute how many cycles we will add to execution time
20991 if we split the lea into a sequence of instructions. */
20992 if (parts.base || parts.index)
20994 /* Have to use a mov instruction if the non-destructive
20995 destination form is used. */
20996 if (regno1 != regno0 && regno2 != regno0)
20997 split_cost += 1;
20999 /* Have to add index to base if both exist. */
21000 if (parts.base && parts.index)
21001 split_cost += 1;
21003 /* Have to use shift and adds if scale is 2 or greater. */
21004 if (parts.scale > 1)
21006 if (regno0 != regno1)
21007 split_cost += 1;
21008 else if (regno2 == regno0)
21009 split_cost += 4;
21010 else
21011 split_cost += parts.scale;
21014 /* Have to use an add instruction with an immediate if
21015 disp is nonzero. */
21016 if (parts.disp && parts.disp != const0_rtx)
21017 split_cost += 1;
21019 /* Subtract the price of lea. */
21020 split_cost -= 1;
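/* As an illustration with hypothetical operands: for
   lea 4(%ebx,%ecx,2), %eax, where %eax differs from both %ebx and
   %ecx, the computation above gives 1 (mov) + 1 (add base)
   + 1 (shift for the scale) + 1 (add the displacement) - 1 (the lea
   itself) = 3 extra cycles. */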
21023 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21024 parts.scale > 1);
21027 /* Emit x86 binary operator CODE in mode MODE, where the first operand
21028 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
21030 static void
21031 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21032 rtx dst, rtx src)
21034 rtx op, clob;
21036 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21037 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21039 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
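/* For instance, ix86_emit_binop (PLUS, SImode, dst, src) emits
   roughly
     (parallel [(set (reg:SI dst) (plus:SI (reg:SI dst) (reg:SI src)))
                (clobber (reg:CC FLAGS_REG))]). */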
21042 /* Return true if regno1 def is nearest to the insn. */
21044 static bool
21045 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21047 rtx_insn *prev = insn;
21048 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21050 if (insn == start)
21051 return false;
21052 while (prev && prev != start)
21054 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21056 prev = PREV_INSN (prev);
21057 continue;
21059 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21060 return true;
21061 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21062 return false;
21063 prev = PREV_INSN (prev);
21066 /* None of the regs is defined in the bb. */
21067 return false;
21070 /* Split lea instructions into a sequence of instructions
21071 which are executed on ALU to avoid AGU stalls.
21072 It is assumed that it is allowed to clobber flags register
21073 at lea position. */
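/* A sketch of the transformation: lea 8(%ebx,%ecx,4), %eax, with
   %eax distinct from %ebx and %ecx, may be emitted as
     mov %ecx, %eax
     shl $2, %eax
     add %ebx, %eax
     add $8, %eax
   The exact sequence depends on how the registers overlap. */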
21075 void
21076 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21078 unsigned int regno0, regno1, regno2;
21079 struct ix86_address parts;
21080 rtx target, tmp;
21081 int ok, adds;
21083 ok = ix86_decompose_address (operands[1], &parts);
21084 gcc_assert (ok);
21086 target = gen_lowpart (mode, operands[0]);
21088 regno0 = true_regnum (target);
21089 regno1 = INVALID_REGNUM;
21090 regno2 = INVALID_REGNUM;
21092 if (parts.base)
21094 parts.base = gen_lowpart (mode, parts.base);
21095 regno1 = true_regnum (parts.base);
21098 if (parts.index)
21100 parts.index = gen_lowpart (mode, parts.index);
21101 regno2 = true_regnum (parts.index);
21104 if (parts.disp)
21105 parts.disp = gen_lowpart (mode, parts.disp);
21107 if (parts.scale > 1)
21109 /* Case r1 = r1 + ... */
21110 if (regno1 == regno0)
21112 /* If we have the case r1 = r1 + C * r2 then we
21113 would have to use multiplication, which is very
21114 expensive. Assume the cost model is wrong if we
21115 get such a case here. */
21116 gcc_assert (regno2 != regno0);
21118 for (adds = parts.scale; adds > 0; adds--)
21119 ix86_emit_binop (PLUS, mode, target, parts.index);
21121 else
21123 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21124 if (regno0 != regno2)
21125 emit_insn (gen_rtx_SET (target, parts.index));
21127 /* Use shift for scaling. */
21128 ix86_emit_binop (ASHIFT, mode, target,
21129 GEN_INT (exact_log2 (parts.scale)));
21131 if (parts.base)
21132 ix86_emit_binop (PLUS, mode, target, parts.base);
21134 if (parts.disp && parts.disp != const0_rtx)
21135 ix86_emit_binop (PLUS, mode, target, parts.disp);
21138 else if (!parts.base && !parts.index)
21140 gcc_assert(parts.disp);
21141 emit_insn (gen_rtx_SET (target, parts.disp));
21143 else
21145 if (!parts.base)
21147 if (regno0 != regno2)
21148 emit_insn (gen_rtx_SET (target, parts.index));
21150 else if (!parts.index)
21152 if (regno0 != regno1)
21153 emit_insn (gen_rtx_SET (target, parts.base));
21155 else
21157 if (regno0 == regno1)
21158 tmp = parts.index;
21159 else if (regno0 == regno2)
21160 tmp = parts.base;
21161 else
21163 rtx tmp1;
21165 /* Find better operand for SET instruction, depending
21166 on which definition is farther from the insn. */
21167 if (find_nearest_reg_def (insn, regno1, regno2))
21168 tmp = parts.index, tmp1 = parts.base;
21169 else
21170 tmp = parts.base, tmp1 = parts.index;
21172 emit_insn (gen_rtx_SET (target, tmp));
21174 if (parts.disp && parts.disp != const0_rtx)
21175 ix86_emit_binop (PLUS, mode, target, parts.disp);
21177 ix86_emit_binop (PLUS, mode, target, tmp1);
21178 return;
21181 ix86_emit_binop (PLUS, mode, target, tmp);
21184 if (parts.disp && parts.disp != const0_rtx)
21185 ix86_emit_binop (PLUS, mode, target, parts.disp);
21189 /* Return true if it is OK to optimize an ADD operation into an LEA
21190 operation to avoid flag register consumption. For most processors,
21191 ADD is faster than LEA. For processors like BONNELL, if the
21192 destination register of the LEA holds an actual address which will be
21193 used soon, LEA is better, and otherwise ADD is better. */
21195 bool
21196 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21198 unsigned int regno0 = true_regnum (operands[0]);
21199 unsigned int regno1 = true_regnum (operands[1]);
21200 unsigned int regno2 = true_regnum (operands[2]);
21202 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21203 if (regno0 != regno1 && regno0 != regno2)
21204 return true;
21206 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21207 return false;
21209 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21212 /* Return true if destination reg of SET_BODY is shift count of
21213 USE_BODY. */
21215 static bool
21216 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21218 rtx set_dest;
21219 rtx shift_rtx;
21220 int i;
21222 /* Retrieve destination of SET_BODY. */
21223 switch (GET_CODE (set_body))
21225 case SET:
21226 set_dest = SET_DEST (set_body);
21227 if (!set_dest || !REG_P (set_dest))
21228 return false;
21229 break;
21230 case PARALLEL:
21231 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21232 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21233 use_body))
21234 return true;
21235 /* FALLTHROUGH */
21236 default:
21237 return false;
21240 /* Retrieve shift count of USE_BODY. */
21241 switch (GET_CODE (use_body))
21243 case SET:
21244 shift_rtx = XEXP (use_body, 1);
21245 break;
21246 case PARALLEL:
21247 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21248 if (ix86_dep_by_shift_count_body (set_body,
21249 XVECEXP (use_body, 0, i)))
21250 return true;
21251 /* FALLTHROUGH */
21252 default:
21253 return false;
21256 if (shift_rtx
21257 && (GET_CODE (shift_rtx) == ASHIFT
21258 || GET_CODE (shift_rtx) == LSHIFTRT
21259 || GET_CODE (shift_rtx) == ASHIFTRT
21260 || GET_CODE (shift_rtx) == ROTATE
21261 || GET_CODE (shift_rtx) == ROTATERT))
21263 rtx shift_count = XEXP (shift_rtx, 1);
21265 /* Return true if shift count is dest of SET_BODY. */
21266 if (REG_P (shift_count))
21268 /* Add this check since it can be invoked before register
21269 allocation in the pre-reload scheduler. */
21270 if (reload_completed
21271 && true_regnum (set_dest) == true_regnum (shift_count))
21272 return true;
21273 else if (REGNO (set_dest) == REGNO (shift_count))
21274 return true;
21278 return false;
21281 /* Return true if destination reg of SET_INSN is shift count of
21282 USE_INSN. */
21284 bool
21285 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21287 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21288 PATTERN (use_insn));
21291 /* Return TRUE or FALSE depending on whether the unary operator meets the
21292 appropriate constraints. */
21294 bool
21295 ix86_unary_operator_ok (enum rtx_code,
21296 machine_mode,
21297 rtx operands[2])
21299 /* If one of the operands is memory, source and destination must match. */
21300 if ((MEM_P (operands[0])
21301 || MEM_P (operands[1]))
21302 && ! rtx_equal_p (operands[0], operands[1]))
21303 return false;
21304 return true;
21307 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21308 are ok, keeping in mind the possible movddup alternative. */
21310 bool
21311 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21313 if (MEM_P (operands[0]))
21314 return rtx_equal_p (operands[0], operands[1 + high]);
21315 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21316 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21317 return true;
21320 /* Post-reload splitter for converting an SF or DFmode value in an
21321 SSE register into an unsigned SImode. */
21323 void
21324 ix86_split_convert_uns_si_sse (rtx operands[])
21326 machine_mode vecmode;
21327 rtx value, large, zero_or_two31, input, two31, x;
21329 large = operands[1];
21330 zero_or_two31 = operands[2];
21331 input = operands[3];
21332 two31 = operands[4];
21333 vecmode = GET_MODE (large);
21334 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21336 /* Load up the value into the low element. We must ensure that the other
21337 elements are valid floats -- zero is the easiest such value. */
21338 if (MEM_P (input))
21340 if (vecmode == V4SFmode)
21341 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21342 else
21343 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21345 else
21347 input = gen_rtx_REG (vecmode, REGNO (input));
21348 emit_move_insn (value, CONST0_RTX (vecmode));
21349 if (vecmode == V4SFmode)
21350 emit_insn (gen_sse_movss (value, value, input));
21351 else
21352 emit_insn (gen_sse2_movsd (value, value, input));
21355 emit_move_insn (large, two31);
21356 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21358 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21359 emit_insn (gen_rtx_SET (large, x));
21361 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21362 emit_insn (gen_rtx_SET (zero_or_two31, x));
21364 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21365 emit_insn (gen_rtx_SET (value, x));
21367 large = gen_rtx_REG (V4SImode, REGNO (large));
21368 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21370 x = gen_rtx_REG (V4SImode, REGNO (value));
21371 if (vecmode == V4SFmode)
21372 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21373 else
21374 emit_insn (gen_sse2_cvttpd2dq (x, value));
21375 value = x;
21377 emit_insn (gen_xorv4si3 (value, value, large));
21380 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21381 Expects the 64-bit DImode to be supplied in a pair of integral
21382 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21383 -mfpmath=sse, !optimize_size only. */
21385 void
21386 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21388 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21389 rtx int_xmm, fp_xmm;
21390 rtx biases, exponents;
21391 rtx x;
21393 int_xmm = gen_reg_rtx (V4SImode);
21394 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21395 emit_insn (gen_movdi_to_sse (int_xmm, input));
21396 else if (TARGET_SSE_SPLIT_REGS)
21398 emit_clobber (int_xmm);
21399 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21401 else
21403 x = gen_reg_rtx (V2DImode);
21404 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21405 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21408 x = gen_rtx_CONST_VECTOR (V4SImode,
21409 gen_rtvec (4, GEN_INT (0x43300000UL),
21410 GEN_INT (0x45300000UL),
21411 const0_rtx, const0_rtx));
21412 exponents = validize_mem (force_const_mem (V4SImode, x));
21414 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21415 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21417 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21418 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21419 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21420 (0x1.0p84 + double(fp_value_hi_xmm)).
21421 Note these exponents differ by 32. */
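/* Worked example: for the input 2^33 + 5 the low word is 5 and the
   high word is 2, so the two doubles are (0x1.0p52 + 5) and
   (0x1.0p84 + 2 * 2^32); after the bias subtraction below they are
   5.0 and 2^33, and their sum reproduces 2^33 + 5 exactly. */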
21423 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21425 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21426 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21427 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21428 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21429 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21430 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21431 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21432 biases = validize_mem (force_const_mem (V2DFmode, biases));
21433 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21435 /* Add the upper and lower DFmode values together. */
21436 if (TARGET_SSE3)
21437 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21438 else
21440 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21441 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21442 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21445 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21448 /* Not used, but eases macroization of patterns. */
21449 void
21450 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21452 gcc_unreachable ();
21455 /* Convert an unsigned SImode value into a DFmode. Only currently used
21456 for SSE, but applicable anywhere. */
21458 void
21459 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21461 REAL_VALUE_TYPE TWO31r;
21462 rtx x, fp;
21464 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21465 NULL, 1, OPTAB_DIRECT);
21467 fp = gen_reg_rtx (DFmode);
21468 emit_insn (gen_floatsidf2 (fp, x));
21470 real_ldexp (&TWO31r, &dconst1, 31);
21471 x = const_double_from_real_value (TWO31r, DFmode);
21473 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21474 if (x != target)
21475 emit_move_insn (target, x);
21478 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21479 32-bit mode; otherwise we have a direct convert instruction. */
21481 void
21482 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21484 REAL_VALUE_TYPE TWO32r;
21485 rtx fp_lo, fp_hi, x;
21487 fp_lo = gen_reg_rtx (DFmode);
21488 fp_hi = gen_reg_rtx (DFmode);
21490 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21492 real_ldexp (&TWO32r, &dconst1, 32);
21493 x = const_double_from_real_value (TWO32r, DFmode);
21494 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21496 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21498 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21499 0, OPTAB_DIRECT);
21500 if (x != target)
21501 emit_move_insn (target, x);
21504 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21505 For x86_32, -mfpmath=sse, !optimize_size only. */
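/* As an example, 0x12345678 splits into int_hi = 0x1234 and
   int_lo = 0x5678; each half converts to SFmode exactly, and the
   final fp_hi * 2^16 + fp_lo addition rounds only once, matching a
   direct unsigned-to-float conversion. */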
21506 void
21507 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21509 REAL_VALUE_TYPE ONE16r;
21510 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21512 real_ldexp (&ONE16r, &dconst1, 16);
21513 x = const_double_from_real_value (ONE16r, SFmode);
21514 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21515 NULL, 0, OPTAB_DIRECT);
21516 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21517 NULL, 0, OPTAB_DIRECT);
21518 fp_hi = gen_reg_rtx (SFmode);
21519 fp_lo = gen_reg_rtx (SFmode);
21520 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21521 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21522 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21523 0, OPTAB_DIRECT);
21524 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21525 0, OPTAB_DIRECT);
21526 if (!rtx_equal_p (target, fp_hi))
21527 emit_move_insn (target, fp_hi);
21530 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21531 a vector of unsigned ints VAL to vector of floats TARGET. */
21533 void
21534 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21536 rtx tmp[8];
21537 REAL_VALUE_TYPE TWO16r;
21538 machine_mode intmode = GET_MODE (val);
21539 machine_mode fltmode = GET_MODE (target);
21540 rtx (*cvt) (rtx, rtx);
21542 if (intmode == V4SImode)
21543 cvt = gen_floatv4siv4sf2;
21544 else
21545 cvt = gen_floatv8siv8sf2;
21546 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21547 tmp[0] = force_reg (intmode, tmp[0]);
21548 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21549 OPTAB_DIRECT);
21550 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21551 NULL_RTX, 1, OPTAB_DIRECT);
21552 tmp[3] = gen_reg_rtx (fltmode);
21553 emit_insn (cvt (tmp[3], tmp[1]));
21554 tmp[4] = gen_reg_rtx (fltmode);
21555 emit_insn (cvt (tmp[4], tmp[2]));
21556 real_ldexp (&TWO16r, &dconst1, 16);
21557 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21558 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21559 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21560 OPTAB_DIRECT);
21561 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21562 OPTAB_DIRECT);
21563 if (tmp[7] != target)
21564 emit_move_insn (target, tmp[7]);
21567 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21568 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21569 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21570 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
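/* For instance, a lane holding 3000000000 (>= 0x1p31) has 0x1p31
   subtracted before the signed conversion, giving 852516352, and the
   corresponding lane of *XORP gets 0x80000000 so that xoring it back
   in afterwards restores 3000000000; lanes below 0x1p31 are
   converted unchanged. */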
21572 rtx
21573 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21575 REAL_VALUE_TYPE TWO31r;
21576 rtx two31r, tmp[4];
21577 machine_mode mode = GET_MODE (val);
21578 machine_mode scalarmode = GET_MODE_INNER (mode);
21579 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21580 rtx (*cmp) (rtx, rtx, rtx, rtx);
21581 int i;
21583 for (i = 0; i < 3; i++)
21584 tmp[i] = gen_reg_rtx (mode);
21585 real_ldexp (&TWO31r, &dconst1, 31);
21586 two31r = const_double_from_real_value (TWO31r, scalarmode);
21587 two31r = ix86_build_const_vector (mode, 1, two31r);
21588 two31r = force_reg (mode, two31r);
21589 switch (mode)
21591 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21592 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21593 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21594 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21595 default: gcc_unreachable ();
21597 tmp[3] = gen_rtx_LE (mode, two31r, val);
21598 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21599 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21600 0, OPTAB_DIRECT);
21601 if (intmode == V4SImode || TARGET_AVX2)
21602 *xorp = expand_simple_binop (intmode, ASHIFT,
21603 gen_lowpart (intmode, tmp[0]),
21604 GEN_INT (31), NULL_RTX, 0,
21605 OPTAB_DIRECT);
21606 else
21608 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21609 two31 = ix86_build_const_vector (intmode, 1, two31);
21610 *xorp = expand_simple_binop (intmode, AND,
21611 gen_lowpart (intmode, tmp[0]),
21612 two31, NULL_RTX, 0,
21613 OPTAB_DIRECT);
21615 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21616 0, OPTAB_DIRECT);
21619 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21620 then replicate the value for all elements of the vector
21621 register. */
21623 rtx
21624 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21626 int i, n_elt;
21627 rtvec v;
21628 machine_mode scalar_mode;
21630 switch (mode)
21632 case V64QImode:
21633 case V32QImode:
21634 case V16QImode:
21635 case V32HImode:
21636 case V16HImode:
21637 case V8HImode:
21638 case V16SImode:
21639 case V8SImode:
21640 case V4SImode:
21641 case V8DImode:
21642 case V4DImode:
21643 case V2DImode:
21644 gcc_assert (vect);
21645 /* FALLTHRU */
21646 case V16SFmode:
21647 case V8SFmode:
21648 case V4SFmode:
21649 case V8DFmode:
21650 case V4DFmode:
21651 case V2DFmode:
21652 n_elt = GET_MODE_NUNITS (mode);
21653 v = rtvec_alloc (n_elt);
21654 scalar_mode = GET_MODE_INNER (mode);
21656 RTVEC_ELT (v, 0) = value;
21658 for (i = 1; i < n_elt; ++i)
21659 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21661 return gen_rtx_CONST_VECTOR (mode, v);
21663 default:
21664 gcc_unreachable ();
21668 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21669 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21670 for an SSE register. If VECT is true, then replicate the mask for
21671 all elements of the vector register. If INVERT is true, then create
21672 a mask excluding the sign bit. */
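/* For example, for V4SFmode each mask element is 0x80000000 (just
   the sign bit), or 0x7fffffff when INVERT is true; for DFmode
   elements it is 0x8000000000000000 resp. 0x7fffffffffffffff. */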
21674 rtx
21675 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21677 machine_mode vec_mode, imode;
21678 wide_int w;
21679 rtx mask, v;
21681 switch (mode)
21683 case V16SImode:
21684 case V16SFmode:
21685 case V8SImode:
21686 case V4SImode:
21687 case V8SFmode:
21688 case V4SFmode:
21689 vec_mode = mode;
21690 imode = SImode;
21691 break;
21693 case V8DImode:
21694 case V4DImode:
21695 case V2DImode:
21696 case V8DFmode:
21697 case V4DFmode:
21698 case V2DFmode:
21699 vec_mode = mode;
21700 imode = DImode;
21701 break;
21703 case TImode:
21704 case TFmode:
21705 vec_mode = VOIDmode;
21706 imode = TImode;
21707 break;
21709 default:
21710 gcc_unreachable ();
21713 machine_mode inner_mode = GET_MODE_INNER (mode);
21714 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21715 GET_MODE_BITSIZE (inner_mode));
21716 if (invert)
21717 w = wi::bit_not (w);
21719 /* Force this value into the low part of a fp vector constant. */
21720 mask = immed_wide_int_const (w, imode);
21721 mask = gen_lowpart (inner_mode, mask);
21723 if (vec_mode == VOIDmode)
21724 return force_reg (inner_mode, mask);
21726 v = ix86_build_const_vector (vec_mode, vect, mask);
21727 return force_reg (vec_mode, v);
21730 /* Generate code for floating point ABS or NEG. */
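/* The mask built here is what the patterns ultimately apply: NEG is
   implemented as an XOR with the sign-bit mask (e.g. xorps for
   SFmode) and ABS as an AND with the inverted mask (e.g. andps),
   which is why INVERT is passed as (code == ABS) below. */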
21732 void
21733 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21734 rtx operands[])
21736 rtx mask, set, dst, src;
21737 bool use_sse = false;
21738 bool vector_mode = VECTOR_MODE_P (mode);
21739 machine_mode vmode = mode;
21741 if (vector_mode)
21742 use_sse = true;
21743 else if (mode == TFmode)
21744 use_sse = true;
21745 else if (TARGET_SSE_MATH)
21747 use_sse = SSE_FLOAT_MODE_P (mode);
21748 if (mode == SFmode)
21749 vmode = V4SFmode;
21750 else if (mode == DFmode)
21751 vmode = V2DFmode;
21754 /* NEG and ABS performed with SSE use bitwise mask operations.
21755 Create the appropriate mask now. */
21756 if (use_sse)
21757 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21758 else
21759 mask = NULL_RTX;
21761 dst = operands[0];
21762 src = operands[1];
21764 set = gen_rtx_fmt_e (code, mode, src);
21765 set = gen_rtx_SET (dst, set);
21767 if (mask)
21769 rtx use, clob;
21770 rtvec par;
21772 use = gen_rtx_USE (VOIDmode, mask);
21773 if (vector_mode)
21774 par = gen_rtvec (2, set, use);
21775 else
21777 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21778 par = gen_rtvec (3, set, use, clob);
21780 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21782 else
21783 emit_insn (set);
21786 /* Expand a copysign operation. Special case operand 0 being a constant. */
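/* The underlying identity is
   copysign (x, y) = (x & ~signmask) | (y & signmask),
   e.g. copysign (3.0, -0.0) == -3.0; the code below merely arranges
   the masks and picks the constant or variable form. */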
21788 void
21789 ix86_expand_copysign (rtx operands[])
21791 machine_mode mode, vmode;
21792 rtx dest, op0, op1, mask, nmask;
21794 dest = operands[0];
21795 op0 = operands[1];
21796 op1 = operands[2];
21798 mode = GET_MODE (dest);
21800 if (mode == SFmode)
21801 vmode = V4SFmode;
21802 else if (mode == DFmode)
21803 vmode = V2DFmode;
21804 else
21805 vmode = mode;
21807 if (CONST_DOUBLE_P (op0))
21809 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21811 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21812 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21814 if (mode == SFmode || mode == DFmode)
21816 if (op0 == CONST0_RTX (mode))
21817 op0 = CONST0_RTX (vmode);
21818 else
21820 rtx v = ix86_build_const_vector (vmode, false, op0);
21822 op0 = force_reg (vmode, v);
21825 else if (op0 != CONST0_RTX (mode))
21826 op0 = force_reg (mode, op0);
21828 mask = ix86_build_signbit_mask (vmode, 0, 0);
21830 if (mode == SFmode)
21831 copysign_insn = gen_copysignsf3_const;
21832 else if (mode == DFmode)
21833 copysign_insn = gen_copysigndf3_const;
21834 else
21835 copysign_insn = gen_copysigntf3_const;
21837 emit_insn (copysign_insn (dest, op0, op1, mask));
21839 else
21841 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21843 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21844 mask = ix86_build_signbit_mask (vmode, 0, 0);
21846 if (mode == SFmode)
21847 copysign_insn = gen_copysignsf3_var;
21848 else if (mode == DFmode)
21849 copysign_insn = gen_copysigndf3_var;
21850 else
21851 copysign_insn = gen_copysigntf3_var;
21853 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21857 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21858 be a constant, and so has already been expanded into a vector constant. */
21860 void
21861 ix86_split_copysign_const (rtx operands[])
21863 machine_mode mode, vmode;
21864 rtx dest, op0, mask, x;
21866 dest = operands[0];
21867 op0 = operands[1];
21868 mask = operands[3];
21870 mode = GET_MODE (dest);
21871 vmode = GET_MODE (mask);
21873 dest = lowpart_subreg (vmode, dest, mode);
21874 x = gen_rtx_AND (vmode, dest, mask);
21875 emit_insn (gen_rtx_SET (dest, x));
21877 if (op0 != CONST0_RTX (vmode))
21879 x = gen_rtx_IOR (vmode, dest, op0);
21880 emit_insn (gen_rtx_SET (dest, x));
21884 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21885 so we have to do two masks. */
21887 void
21888 ix86_split_copysign_var (rtx operands[])
21890 machine_mode mode, vmode;
21891 rtx dest, scratch, op0, op1, mask, nmask, x;
21893 dest = operands[0];
21894 scratch = operands[1];
21895 op0 = operands[2];
21896 op1 = operands[3];
21897 nmask = operands[4];
21898 mask = operands[5];
21900 mode = GET_MODE (dest);
21901 vmode = GET_MODE (mask);
21903 if (rtx_equal_p (op0, op1))
21905 /* Shouldn't happen often (it's useless, obviously), but when it does
21906 we'd generate incorrect code if we continue below. */
21907 emit_move_insn (dest, op0);
21908 return;
21911 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21913 gcc_assert (REGNO (op1) == REGNO (scratch));
21915 x = gen_rtx_AND (vmode, scratch, mask);
21916 emit_insn (gen_rtx_SET (scratch, x));
21918 dest = mask;
21919 op0 = lowpart_subreg (vmode, op0, mode);
21920 x = gen_rtx_NOT (vmode, dest);
21921 x = gen_rtx_AND (vmode, x, op0);
21922 emit_insn (gen_rtx_SET (dest, x));
21924 else
21926 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21928 x = gen_rtx_AND (vmode, scratch, mask);
21930 else /* alternative 2,4 */
21932 gcc_assert (REGNO (mask) == REGNO (scratch));
21933 op1 = lowpart_subreg (vmode, op1, mode);
21934 x = gen_rtx_AND (vmode, scratch, op1);
21936 emit_insn (gen_rtx_SET (scratch, x));
21938 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21940 dest = lowpart_subreg (vmode, op0, mode);
21941 x = gen_rtx_AND (vmode, dest, nmask);
21943 else /* alternative 3,4 */
21945 gcc_assert (REGNO (nmask) == REGNO (dest));
21946 dest = nmask;
21947 op0 = lowpart_subreg (vmode, op0, mode);
21948 x = gen_rtx_AND (vmode, dest, op0);
21950 emit_insn (gen_rtx_SET (dest, x));
21953 x = gen_rtx_IOR (vmode, dest, scratch);
21954 emit_insn (gen_rtx_SET (dest, x));
21957 /* Return TRUE or FALSE depending on whether the first SET in INSN
21958 has source and destination with matching CC modes, and whether the
21959 CC mode is at least as constrained as REQ_MODE. */
21961 bool
21962 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21964 rtx set;
21965 machine_mode set_mode;
21967 set = PATTERN (insn);
21968 if (GET_CODE (set) == PARALLEL)
21969 set = XVECEXP (set, 0, 0);
21970 gcc_assert (GET_CODE (set) == SET);
21971 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21973 set_mode = GET_MODE (SET_DEST (set));
21974 switch (set_mode)
21976 case CCNOmode:
21977 if (req_mode != CCNOmode
21978 && (req_mode != CCmode
21979 || XEXP (SET_SRC (set), 1) != const0_rtx))
21980 return false;
21981 break;
21982 case CCmode:
21983 if (req_mode == CCGCmode)
21984 return false;
21985 /* FALLTHRU */
21986 case CCGCmode:
21987 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21988 return false;
21989 /* FALLTHRU */
21990 case CCGOCmode:
21991 if (req_mode == CCZmode)
21992 return false;
21993 /* FALLTHRU */
21994 case CCZmode:
21995 break;
21997 case CCAmode:
21998 case CCCmode:
21999 case CCOmode:
22000 case CCPmode:
22001 case CCSmode:
22002 if (set_mode != req_mode)
22003 return false;
22004 break;
22006 default:
22007 gcc_unreachable ();
22010 return GET_MODE (SET_SRC (set)) == set_mode;
22013 /* Generate insn patterns to do an integer compare of OPERANDS. */
22015 static rtx
22016 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22018 machine_mode cmpmode;
22019 rtx tmp, flags;
22021 cmpmode = SELECT_CC_MODE (code, op0, op1);
22022 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22024 /* This is very simple, but making the interface the same as in the
22025 FP case makes the rest of the code easier. */
22026 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22027 emit_insn (gen_rtx_SET (flags, tmp));
22029 /* Return the test that should be put into the flags user, i.e.
22030 the bcc, scc, or cmov instruction. */
22031 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22034 /* Figure out whether to use ordered or unordered fp comparisons.
22035 Return the appropriate mode to use. */
22037 machine_mode
22038 ix86_fp_compare_mode (enum rtx_code)
22040 /* ??? In order to make all comparisons reversible, we do all comparisons
22041 non-trapping when compiling for IEEE. Once gcc is able to distinguish
22042 between all forms of trapping and nontrapping comparisons, we can make
22043 inequality comparisons trapping again, since it results in better code when
22044 using FCOM based compares. */
22045 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
22048 machine_mode
22049 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22051 machine_mode mode = GET_MODE (op0);
22053 if (SCALAR_FLOAT_MODE_P (mode))
22055 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22056 return ix86_fp_compare_mode (code);
22059 switch (code)
22061 /* Only zero flag is needed. */
22062 case EQ: /* ZF=0 */
22063 case NE: /* ZF!=0 */
22064 return CCZmode;
22065 /* Codes needing carry flag. */
22066 case GEU: /* CF=0 */
22067 case LTU: /* CF=1 */
22068 /* Detect overflow checks. They need just the carry flag. */
22069 if (GET_CODE (op0) == PLUS
22070 && (rtx_equal_p (op1, XEXP (op0, 0))
22071 || rtx_equal_p (op1, XEXP (op0, 1))))
22072 return CCCmode;
22073 else
22074 return CCmode;
22075 case GTU: /* CF=0 & ZF=0 */
22076 case LEU: /* CF=1 | ZF=1 */
22077 return CCmode;
22078 /* Codes possibly doable only with sign flag when
22079 comparing against zero. */
22080 case GE: /* SF=OF or SF=0 */
22081 case LT: /* SF<>OF or SF=1 */
22082 if (op1 == const0_rtx)
22083 return CCGOCmode;
22084 else
22085 /* For other cases Carry flag is not required. */
22086 return CCGCmode;
22087 /* Codes doable only with the sign flag when comparing
22088 against zero, but we lack a jump instruction for it,
22089 so we need to use relational tests against overflow,
22090 which thus needs to be zero. */
22091 case GT: /* ZF=0 & SF=OF */
22092 case LE: /* ZF=1 | SF<>OF */
22093 if (op1 == const0_rtx)
22094 return CCNOmode;
22095 else
22096 return CCGCmode;
22097 /* The strcmp pattern does (use flags) and combine may ask us for the
22098 proper mode. */
22099 case USE:
22100 return CCmode;
22101 default:
22102 gcc_unreachable ();
22106 /* Return the fixed registers used for condition codes. */
22108 static bool
22109 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22111 *p1 = FLAGS_REG;
22112 *p2 = FPSR_REG;
22113 return true;
22116 /* If two condition code modes are compatible, return a condition code
22117 mode which is compatible with both. Otherwise, return
22118 VOIDmode. */
22120 static machine_mode
22121 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22123 if (m1 == m2)
22124 return m1;
22126 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22127 return VOIDmode;
22129 if ((m1 == CCGCmode && m2 == CCGOCmode)
22130 || (m1 == CCGOCmode && m2 == CCGCmode))
22131 return CCGCmode;
22133 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
22134 return m2;
22135 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
22136 return m1;
22138 switch (m1)
22140 default:
22141 gcc_unreachable ();
22143 case CCmode:
22144 case CCGCmode:
22145 case CCGOCmode:
22146 case CCNOmode:
22147 case CCAmode:
22148 case CCCmode:
22149 case CCOmode:
22150 case CCPmode:
22151 case CCSmode:
22152 case CCZmode:
22153 switch (m2)
22155 default:
22156 return VOIDmode;
22158 case CCmode:
22159 case CCGCmode:
22160 case CCGOCmode:
22161 case CCNOmode:
22162 case CCAmode:
22163 case CCCmode:
22164 case CCOmode:
22165 case CCPmode:
22166 case CCSmode:
22167 case CCZmode:
22168 return CCmode;
22171 case CCFPmode:
22172 case CCFPUmode:
22173 /* These are only compatible with themselves, which we already
22174 checked above. */
22175 return VOIDmode;
22180 /* Return a comparison we can do that is equivalent to
22181 swap_condition (code), apart possibly from orderedness.
22182 But never change orderedness if TARGET_IEEE_FP, returning
22183 UNKNOWN in that case if necessary. */
22185 static enum rtx_code
22186 ix86_fp_swap_condition (enum rtx_code code)
22188 switch (code)
22190 case GT: /* GTU - CF=0 & ZF=0 */
22191 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22192 case GE: /* GEU - CF=0 */
22193 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22194 case UNLT: /* LTU - CF=1 */
22195 return TARGET_IEEE_FP ? UNKNOWN : GT;
22196 case UNLE: /* LEU - CF=1 | ZF=1 */
22197 return TARGET_IEEE_FP ? UNKNOWN : GE;
22198 default:
22199 return swap_condition (code);
22203 /* Return the cost of comparison CODE using the best strategy for performance.
22204 All following functions use the number of instructions as a cost metric.
22205 In the future this should be tweaked to compute bytes for optimize_size and
22206 take into account the performance of various instructions on various CPUs. */
22208 static int
22209 ix86_fp_comparison_cost (enum rtx_code code)
22211 int arith_cost;
22213 /* The cost of code using bit-twiddling on %ah. */
22214 switch (code)
22216 case UNLE:
22217 case UNLT:
22218 case LTGT:
22219 case GT:
22220 case GE:
22221 case UNORDERED:
22222 case ORDERED:
22223 case UNEQ:
22224 arith_cost = 4;
22225 break;
22226 case LT:
22227 case NE:
22228 case EQ:
22229 case UNGE:
22230 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22231 break;
22232 case LE:
22233 case UNGT:
22234 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22235 break;
22236 default:
22237 gcc_unreachable ();
22240 switch (ix86_fp_comparison_strategy (code))
22242 case IX86_FPCMP_COMI:
22243 return arith_cost > 4 ? 3 : 2;
22244 case IX86_FPCMP_SAHF:
22245 return arith_cost > 4 ? 4 : 3;
22246 default:
22247 return arith_cost;
22251 /* Return the strategy to use for floating-point comparison. We assume that
22252 fcomi is always preferable where available, since that is also true when
22253 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22255 enum ix86_fpcmp_strategy
22256 ix86_fp_comparison_strategy (enum rtx_code)
22258 /* Do fcomi/sahf based test when profitable. */
22260 if (TARGET_CMOVE)
22261 return IX86_FPCMP_COMI;
22263 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22264 return IX86_FPCMP_SAHF;
22266 return IX86_FPCMP_ARITH;
22269 /* Swap, force into registers, or otherwise massage the two operands
22270 to a fp comparison. The operands are updated in place; the new
22271 comparison code is returned. */
22273 static enum rtx_code
22274 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22276 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
22277 rtx op0 = *pop0, op1 = *pop1;
22278 machine_mode op_mode = GET_MODE (op0);
22279 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22281 /* All of the unordered compare instructions only work on registers.
22282 The same is true of the fcomi compare instructions. The XFmode
22283 compare instructions require registers except when comparing
22284 against zero or when converting operand 1 from fixed point to
22285 floating point. */
22287 if (!is_sse
22288 && (fpcmp_mode == CCFPUmode
22289 || (op_mode == XFmode
22290 && ! (standard_80387_constant_p (op0) == 1
22291 || standard_80387_constant_p (op1) == 1)
22292 && GET_CODE (op1) != FLOAT)
22293 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22295 op0 = force_reg (op_mode, op0);
22296 op1 = force_reg (op_mode, op1);
22298 else
22300 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22301 things around if they appear profitable, otherwise force op0
22302 into a register. */
22304 if (standard_80387_constant_p (op0) == 0
22305 || (MEM_P (op0)
22306 && ! (standard_80387_constant_p (op1) == 0
22307 || MEM_P (op1))))
22309 enum rtx_code new_code = ix86_fp_swap_condition (code);
22310 if (new_code != UNKNOWN)
22312 std::swap (op0, op1);
22313 code = new_code;
22317 if (!REG_P (op0))
22318 op0 = force_reg (op_mode, op0);
22320 if (CONSTANT_P (op1))
22322 int tmp = standard_80387_constant_p (op1);
22323 if (tmp == 0)
22324 op1 = validize_mem (force_const_mem (op_mode, op1));
22325 else if (tmp == 1)
22327 if (TARGET_CMOVE)
22328 op1 = force_reg (op_mode, op1);
22330 else
22331 op1 = force_reg (op_mode, op1);
22335 /* Try to rearrange the comparison to make it cheaper. */
22336 if (ix86_fp_comparison_cost (code)
22337 > ix86_fp_comparison_cost (swap_condition (code))
22338 && (REG_P (op1) || can_create_pseudo_p ()))
22340 std::swap (op0, op1);
22341 code = swap_condition (code);
22342 if (!REG_P (op0))
22343 op0 = force_reg (op_mode, op0);
22346 *pop0 = op0;
22347 *pop1 = op1;
22348 return code;
22351 /* Convert comparison codes we use to represent FP comparison to integer
22352 code that will result in proper branch. Return UNKNOWN if no such code
22353 is available. */
22355 enum rtx_code
22356 ix86_fp_compare_code_to_integer (enum rtx_code code)
22358 switch (code)
22360 case GT:
22361 return GTU;
22362 case GE:
22363 return GEU;
22364 case ORDERED:
22365 case UNORDERED:
22366 return code;
22367 case UNEQ:
22368 return EQ;
22369 case UNLT:
22370 return LTU;
22371 case UNLE:
22372 return LEU;
22373 case LTGT:
22374 return NE;
22375 default:
22376 return UNKNOWN;
22380 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22382 static rtx
22383 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22385 machine_mode fpcmp_mode, intcmp_mode;
22386 rtx tmp, tmp2;
22388 fpcmp_mode = ix86_fp_compare_mode (code);
22389 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22391 /* Do fcomi/sahf based test when profitable. */
22392 switch (ix86_fp_comparison_strategy (code))
22394 case IX86_FPCMP_COMI:
22395 intcmp_mode = fpcmp_mode;
22396 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22397 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22398 emit_insn (tmp);
22399 break;
22401 case IX86_FPCMP_SAHF:
22402 intcmp_mode = fpcmp_mode;
22403 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22404 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
22406 if (!scratch)
22407 scratch = gen_reg_rtx (HImode);
22408 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22409 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22410 break;
22412 case IX86_FPCMP_ARITH:
22413 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22414 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
22415 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22416 if (!scratch)
22417 scratch = gen_reg_rtx (HImode);
22418 emit_insn (gen_rtx_SET (scratch, tmp2));
22420 /* In the unordered case, we have to check C2 for NaNs, which
22421 doesn't happen to work out to anything nice combination-wise.
22422 So do some bit twiddling on the value we've got in AH to come
22423 up with an appropriate set of condition codes. */
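/* The constants below test x87 status-word condition bits as they
   appear in AH after fnstsw: 0x01 is C0 ("below"), 0x04 is C2
   (unordered), 0x40 is C3 ("equal"), so e.g. 0x45 tests C0|C2|C3 at
   once. */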
22425 intcmp_mode = CCNOmode;
22426 switch (code)
22428 case GT:
22429 case UNGT:
22430 if (code == GT || !TARGET_IEEE_FP)
22432 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22433 code = EQ;
22435 else
22437 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22438 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22439 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22440 intcmp_mode = CCmode;
22441 code = GEU;
22443 break;
22444 case LT:
22445 case UNLT:
22446 if (code == LT && TARGET_IEEE_FP)
22448 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22449 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22450 intcmp_mode = CCmode;
22451 code = EQ;
22453 else
22455 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
22456 code = NE;
22458 break;
22459 case GE:
22460 case UNGE:
22461 if (code == GE || !TARGET_IEEE_FP)
22463 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
22464 code = EQ;
22466 else
22468 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22469 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
22470 code = NE;
22472 break;
22473 case LE:
22474 case UNLE:
22475 if (code == LE && TARGET_IEEE_FP)
22477 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22478 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22479 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22480 intcmp_mode = CCmode;
22481 code = LTU;
22483 else
22485 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
22486 code = NE;
22488 break;
22489 case EQ:
22490 case UNEQ:
22491 if (code == EQ && TARGET_IEEE_FP)
22493 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22494 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22495 intcmp_mode = CCmode;
22496 code = EQ;
22498 else
22500 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22501 code = NE;
22503 break;
22504 case NE:
22505 case LTGT:
22506 if (code == NE && TARGET_IEEE_FP)
22508 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
22509 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
22510 GEN_INT (0x40)));
22511 code = NE;
22513 else
22515 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
22516 code = EQ;
22518 break;
22520 case UNORDERED:
22521 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22522 code = NE;
22523 break;
22524 case ORDERED:
22525 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
22526 code = EQ;
22527 break;
22529 default:
22530 gcc_unreachable ();
22532 break;
22534 default:
22535 gcc_unreachable();
22538 /* Return the test that should be put into the flags user, i.e.
22539 the bcc, scc, or cmov instruction. */
22540 return gen_rtx_fmt_ee (code, VOIDmode,
22541 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22542 const0_rtx);
22545 static rtx
22546 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22548 rtx ret;
22550 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22551 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22553 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22555 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22556 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22558 else
22559 ret = ix86_expand_int_compare (code, op0, op1);
22561 return ret;
22564 void
22565 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22567 machine_mode mode = GET_MODE (op0);
22568 rtx tmp;
22570 /* Handle the special case of a vector comparison with a boolean result;
22571 transform it using the ptest instruction. */
22572 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22574 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22575 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22577 gcc_assert (code == EQ || code == NE);
22578 /* Generate XOR since we can't check that one operand is zero vector. */
22579 tmp = gen_reg_rtx (mode);
22580 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22581 tmp = gen_lowpart (p_mode, tmp);
22582 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22583 gen_rtx_UNSPEC (CCmode,
22584 gen_rtvec (2, tmp, tmp),
22585 UNSPEC_PTEST)));
22586 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22587 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22588 gen_rtx_LABEL_REF (VOIDmode, label),
22589 pc_rtx);
22590 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22591 return;
22594 switch (mode)
22596 case SFmode:
22597 case DFmode:
22598 case XFmode:
22599 case QImode:
22600 case HImode:
22601 case SImode:
22602 simple:
22603 tmp = ix86_expand_compare (code, op0, op1);
22604 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22605 gen_rtx_LABEL_REF (VOIDmode, label),
22606 pc_rtx);
22607 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22608 return;
22610 case DImode:
22611 if (TARGET_64BIT)
22612 goto simple;
22613 /* For a 32-bit target a DImode comparison may be performed in
22614 SSE registers. To allow this we should avoid splitting
22615 into SImode, which is achieved by doing the xor in DImode
22616 and then comparing with zero (a form recognized by the
22617 STV pass). We don't compare using xor when optimizing
22618 for size. */
22619 if (!optimize_insn_for_size_p ()
22620 && TARGET_STV
22621 && (code == EQ || code == NE))
22623 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22624 op1 = const0_rtx;
22626 /* FALLTHRU */
22627 case TImode:
22628 /* Expand a double-word (DImode/TImode) branch into multiple compare+branch. */
22630 rtx lo[2], hi[2];
22631 rtx_code_label *label2;
22632 enum rtx_code code1, code2, code3;
22633 machine_mode submode;
22635 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22637 std::swap (op0, op1);
22638 code = swap_condition (code);
22641 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22642 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22644 submode = mode == DImode ? SImode : DImode;
22646 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22647 avoid two branches. This costs one extra insn, so disable when
22648 optimizing for size. */
22650 if ((code == EQ || code == NE)
22651 && (!optimize_insn_for_size_p ()
22652 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22654 rtx xor0, xor1;
22656 xor1 = hi[0];
22657 if (hi[1] != const0_rtx)
22658 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22659 NULL_RTX, 0, OPTAB_WIDEN);
22661 xor0 = lo[0];
22662 if (lo[1] != const0_rtx)
22663 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22664 NULL_RTX, 0, OPTAB_WIDEN);
22666 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22667 NULL_RTX, 0, OPTAB_WIDEN);
22669 ix86_expand_branch (code, tmp, const0_rtx, label);
22670 return;
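/* A concrete instance (an illustration, not from the original comments):
an equality test of two DImode values on a 32-bit target becomes
xor hi0, hi1; xor lo0, lo1; or; jne/je, i.e. one branch instead of two. */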
22673 /* Otherwise, if we are doing a less-than or greater-than-or-equal
22674 comparison, op1 is a constant and its low word is zero, then we can
22675 just examine the high word. Similarly for a low word of -1 and
22676 less-than-or-equal or greater-than. */
22678 if (CONST_INT_P (hi[1]))
22679 switch (code)
22681 case LT: case LTU: case GE: case GEU:
22682 if (lo[1] == const0_rtx)
22684 ix86_expand_branch (code, hi[0], hi[1], label);
22685 return;
22687 break;
22688 case LE: case LEU: case GT: case GTU:
22689 if (lo[1] == constm1_rtx)
22691 ix86_expand_branch (code, hi[0], hi[1], label);
22692 return;
22694 break;
22695 default:
22696 break;
22699 /* Otherwise, we need two or three jumps. */
22701 label2 = gen_label_rtx ();
22703 code1 = code;
22704 code2 = swap_condition (code);
22705 code3 = unsigned_condition (code);
22707 switch (code)
22709 case LT: case GT: case LTU: case GTU:
22710 break;
22712 case LE: code1 = LT; code2 = GT; break;
22713 case GE: code1 = GT; code2 = LT; break;
22714 case LEU: code1 = LTU; code2 = GTU; break;
22715 case GEU: code1 = GTU; code2 = LTU; break;
22717 case EQ: code1 = UNKNOWN; code2 = NE; break;
22718 case NE: code2 = UNKNOWN; break;
22720 default:
22721 gcc_unreachable ();
22725 * a < b =>
22726 * if (hi(a) < hi(b)) goto true;
22727 * if (hi(a) > hi(b)) goto false;
22728 * if (lo(a) < lo(b)) goto true;
22729 * false:
22732 if (code1 != UNKNOWN)
22733 ix86_expand_branch (code1, hi[0], hi[1], label);
22734 if (code2 != UNKNOWN)
22735 ix86_expand_branch (code2, hi[0], hi[1], label2);
22737 ix86_expand_branch (code3, lo[0], lo[1], label);
22739 if (code2 != UNKNOWN)
22740 emit_label (label2);
22741 return;
22744 default:
22745 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22746 goto simple;
22750 /* Split branch based on floating point condition. */
22751 void
22752 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
22753 rtx target1, rtx target2, rtx tmp)
22755 rtx condition;
22756 rtx i;
22758 if (target2 != pc_rtx)
22760 std::swap (target1, target2);
22761 code = reverse_condition_maybe_unordered (code);
22764 condition = ix86_expand_fp_compare (code, op1, op2,
22765 tmp);
22767 i = emit_jump_insn (gen_rtx_SET
22768 (pc_rtx,
22769 gen_rtx_IF_THEN_ELSE (VOIDmode,
22770 condition, target1, target2)));
22771 if (split_branch_probability >= 0)
22772 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
22775 void
22776 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22778 rtx ret;
22780 gcc_assert (GET_MODE (dest) == QImode);
22782 ret = ix86_expand_compare (code, op0, op1);
22783 PUT_MODE (ret, QImode);
22784 emit_insn (gen_rtx_SET (dest, ret));
22787 /* Expand a comparison setting or clearing the carry flag. Return true when
22788 successful and set *POP to the comparison operation. */
22789 static bool
22790 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22792 machine_mode mode =
22793 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22795 /* Do not handle double-word-mode compares; they go through a special path. */
22796 if (mode == (TARGET_64BIT ? TImode : DImode))
22797 return false;
22799 if (SCALAR_FLOAT_MODE_P (mode))
22801 rtx compare_op;
22802 rtx_insn *compare_seq;
22804 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22806 /* Shortcut: the following common codes never translate
22807 into carry-flag compares. */
22808 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22809 || code == ORDERED || code == UNORDERED)
22810 return false;
22812 /* These comparisons require the zero flag; swap the operands so they won't. */
22813 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22814 && !TARGET_IEEE_FP)
22816 std::swap (op0, op1);
22817 code = swap_condition (code);
22820 /* Try to expand the comparison and verify that we end up with
22821 a carry-flag-based comparison. This fails only when we decide
22822 to expand the comparison using arithmetic, which is not a
22823 common scenario. */
22824 start_sequence ();
22825 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22826 compare_seq = get_insns ();
22827 end_sequence ();
22829 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
22830 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
22831 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22832 else
22833 code = GET_CODE (compare_op);
22835 if (code != LTU && code != GEU)
22836 return false;
22838 emit_insn (compare_seq);
22839 *pop = compare_op;
22840 return true;
22843 if (!INTEGRAL_MODE_P (mode))
22844 return false;
22846 switch (code)
22848 case LTU:
22849 case GEU:
22850 break;
22852 /* Convert a==0 into (unsigned)a<1. */
22853 case EQ:
22854 case NE:
22855 if (op1 != const0_rtx)
22856 return false;
22857 op1 = const1_rtx;
22858 code = (code == EQ ? LTU : GEU);
22859 break;
22861 /* Convert a>b into b<a or a>=b+1. */
22862 case GTU:
22863 case LEU:
22864 if (CONST_INT_P (op1))
22866 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22867 /* Bail out on overflow. We could still swap the operands, but that
22868 would force loading the constant into a register. */
22869 if (op1 == const0_rtx
22870 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22871 return false;
22872 code = (code == GTU ? GEU : LTU);
22874 else
22876 std::swap (op0, op1);
22877 code = (code == GTU ? LTU : GEU);
22879 break;
22881 /* Convert a>=0 into (unsigned)a<0x80000000. */
22882 case LT:
22883 case GE:
22884 if (mode == DImode || op1 != const0_rtx)
22885 return false;
22886 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22887 code = (code == LT ? GEU : LTU);
22888 break;
22889 case LE:
22890 case GT:
22891 if (mode == DImode || op1 != constm1_rtx)
22892 return false;
22893 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22894 code = (code == LE ? GEU : LTU);
22895 break;
22897 default:
22898 return false;
22900 /* Swapping the operands may cause a constant to appear as the first operand. */
22901 if (!nonimmediate_operand (op0, VOIDmode))
22903 if (!can_create_pseudo_p ())
22904 return false;
22905 op0 = force_reg (mode, op0);
22907 *pop = ix86_expand_compare (code, op0, op1);
22908 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22909 return true;
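/* Summary of the rewrites above as a worked sketch (reader's note):
a == 0 becomes (unsigned) a < 1; an unsigned a > 5 with a constant
becomes a >= 6; a signed SImode a >= 0 becomes (unsigned) a < 0x80000000.
Each form leaves the result in the carry flag where sbb/adc can use it. */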
22912 bool
22913 ix86_expand_int_movcc (rtx operands[])
22915 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22916 rtx_insn *compare_seq;
22917 rtx compare_op;
22918 machine_mode mode = GET_MODE (operands[0]);
22919 bool sign_bit_compare_p = false;
22920 rtx op0 = XEXP (operands[1], 0);
22921 rtx op1 = XEXP (operands[1], 1);
22923 if (GET_MODE (op0) == TImode
22924 || (GET_MODE (op0) == DImode
22925 && !TARGET_64BIT))
22926 return false;
22928 start_sequence ();
22929 compare_op = ix86_expand_compare (code, op0, op1);
22930 compare_seq = get_insns ();
22931 end_sequence ();
22933 compare_code = GET_CODE (compare_op);
22935 if ((op1 == const0_rtx && (code == GE || code == LT))
22936 || (op1 == constm1_rtx && (code == GT || code == LE)))
22937 sign_bit_compare_p = true;
22939 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22940 HImode insns, we'd be swallowed in word prefix ops. */
22942 if ((mode != HImode || TARGET_FAST_PREFIX)
22943 && (mode != (TARGET_64BIT ? TImode : DImode))
22944 && CONST_INT_P (operands[2])
22945 && CONST_INT_P (operands[3]))
22947 rtx out = operands[0];
22948 HOST_WIDE_INT ct = INTVAL (operands[2]);
22949 HOST_WIDE_INT cf = INTVAL (operands[3]);
22950 HOST_WIDE_INT diff;
22952 diff = ct - cf;
22953 /* Sign-bit compares are better done using shifts than by using
22954 sbb. */
22955 if (sign_bit_compare_p
22956 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22958 /* Detect overlap between destination and compare sources. */
22959 rtx tmp = out;
22961 if (!sign_bit_compare_p)
22963 rtx flags;
22964 bool fpcmp = false;
22966 compare_code = GET_CODE (compare_op);
22968 flags = XEXP (compare_op, 0);
22970 if (GET_MODE (flags) == CCFPmode
22971 || GET_MODE (flags) == CCFPUmode)
22973 fpcmp = true;
22974 compare_code
22975 = ix86_fp_compare_code_to_integer (compare_code);
22978 /* To simplify the rest of the code, restrict to the GEU case. */
22979 if (compare_code == LTU)
22981 std::swap (ct, cf);
22982 compare_code = reverse_condition (compare_code);
22983 code = reverse_condition (code);
22985 else
22987 if (fpcmp)
22988 PUT_CODE (compare_op,
22989 reverse_condition_maybe_unordered
22990 (GET_CODE (compare_op)));
22991 else
22992 PUT_CODE (compare_op,
22993 reverse_condition (GET_CODE (compare_op)));
22995 diff = ct - cf;
22997 if (reg_overlap_mentioned_p (out, op0)
22998 || reg_overlap_mentioned_p (out, op1))
22999 tmp = gen_reg_rtx (mode);
23001 if (mode == DImode)
23002 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23003 else
23004 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23005 flags, compare_op));
23007 else
23009 if (code == GT || code == GE)
23010 code = reverse_condition (code);
23011 else
23013 std::swap (ct, cf);
23014 diff = ct - cf;
23016 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23019 if (diff == 1)
23022 * cmpl op0,op1
23023 * sbbl dest,dest
23024 * [addl dest, ct]
23026 * Size 5 - 8.
23028 if (ct)
23029 tmp = expand_simple_binop (mode, PLUS,
23030 tmp, GEN_INT (ct),
23031 copy_rtx (tmp), 1, OPTAB_DIRECT);
23033 else if (cf == -1)
23036 * cmpl op0,op1
23037 * sbbl dest,dest
23038 * orl $ct, dest
23040 * Size 8.
23042 tmp = expand_simple_binop (mode, IOR,
23043 tmp, GEN_INT (ct),
23044 copy_rtx (tmp), 1, OPTAB_DIRECT);
23046 else if (diff == -1 && ct)
23049 * cmpl op0,op1
23050 * sbbl dest,dest
23051 * notl dest
23052 * [addl dest, cf]
23054 * Size 8 - 11.
23056 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23057 if (cf)
23058 tmp = expand_simple_binop (mode, PLUS,
23059 copy_rtx (tmp), GEN_INT (cf),
23060 copy_rtx (tmp), 1, OPTAB_DIRECT);
23062 else
23065 * cmpl op0,op1
23066 * sbbl dest,dest
23067 * [notl dest]
23068 * andl cf - ct, dest
23069 * [addl dest, ct]
23071 * Size 8 - 11.
23074 if (cf == 0)
23076 cf = ct;
23077 ct = 0;
23078 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23081 tmp = expand_simple_binop (mode, AND,
23082 copy_rtx (tmp),
23083 gen_int_mode (cf - ct, mode),
23084 copy_rtx (tmp), 1, OPTAB_DIRECT);
23085 if (ct)
23086 tmp = expand_simple_binop (mode, PLUS,
23087 copy_rtx (tmp), GEN_INT (ct),
23088 copy_rtx (tmp), 1, OPTAB_DIRECT);
23091 if (!rtx_equal_p (tmp, out))
23092 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23094 return true;
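/* A worked instance of the and/add form above (illustrative numbers, not
from the sources): for dest = flag ? 7 : 12, "sbb dest,dest; and $5, dest;
add $7, dest" gives 12 when the carry was set (-1 & 5 = 5, plus 7) and 7
when it was clear (0 & 5 = 0, plus 7). */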
23097 if (diff < 0)
23099 machine_mode cmp_mode = GET_MODE (op0);
23100 enum rtx_code new_code;
23102 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23104 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23106 /* We may be reversing an unordered compare to a normal compare, which
23107 is not valid in general (we may convert a non-trapping condition
23108 into a trapping one); however, on i386 we currently emit all
23109 comparisons unordered. */
23110 new_code = reverse_condition_maybe_unordered (code);
23112 else
23113 new_code = ix86_reverse_condition (code, cmp_mode);
23114 if (new_code != UNKNOWN)
23116 std::swap (ct, cf);
23117 diff = -diff;
23118 code = new_code;
23122 compare_code = UNKNOWN;
23123 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23124 && CONST_INT_P (op1))
23126 if (op1 == const0_rtx
23127 && (code == LT || code == GE))
23128 compare_code = code;
23129 else if (op1 == constm1_rtx)
23131 if (code == LE)
23132 compare_code = LT;
23133 else if (code == GT)
23134 compare_code = GE;
23138 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23139 if (compare_code != UNKNOWN
23140 && GET_MODE (op0) == GET_MODE (out)
23141 && (cf == -1 || ct == -1))
23143 /* If the lea code below could be used, only optimize
23144 if it results in a 2-insn sequence. */
23146 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23147 || diff == 3 || diff == 5 || diff == 9)
23148 || (compare_code == LT && ct == -1)
23149 || (compare_code == GE && cf == -1))
23152 * notl op1 (if necessary)
23153 * sarl $31, op1
23154 * orl cf, op1
23156 if (ct != -1)
23158 cf = ct;
23159 ct = -1;
23160 code = reverse_condition (code);
23163 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23165 out = expand_simple_binop (mode, IOR,
23166 out, GEN_INT (cf),
23167 out, 1, OPTAB_DIRECT);
23168 if (out != operands[0])
23169 emit_move_insn (operands[0], out);
23171 return true;
23176 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23177 || diff == 3 || diff == 5 || diff == 9)
23178 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23179 && (mode != DImode
23180 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23183 * xorl dest,dest
23184 * cmpl op1,op2
23185 * setcc dest
23186 * lea cf(dest*(ct-cf)),dest
23188 * Size 14.
23190 * This also catches the degenerate setcc-only case.
23193 rtx tmp;
23194 int nops;
23196 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23198 nops = 0;
23199 /* On x86_64 the lea instruction operates on Pmode, so we need
23200 the arithmetic done in the proper mode to match. */
23201 if (diff == 1)
23202 tmp = copy_rtx (out);
23203 else
23205 rtx out1;
23206 out1 = copy_rtx (out);
23207 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23208 nops++;
23209 if (diff & 1)
23211 tmp = gen_rtx_PLUS (mode, tmp, out1);
23212 nops++;
23215 if (cf != 0)
23217 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23218 nops++;
23220 if (!rtx_equal_p (tmp, out))
23222 if (nops == 1)
23223 out = force_operand (tmp, copy_rtx (out));
23224 else
23225 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23227 if (!rtx_equal_p (out, operands[0]))
23228 emit_move_insn (operands[0], copy_rtx (out));
23230 return true;
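/* A worked instance of the lea form above (illustrative numbers, not from
the sources): for dest = flag ? 5 : 2 we have ct - cf = 3, so after setcc
leaves 0 or 1 in dest, a single "lea 2(dest,dest,2), dest" produces 2 or 5
with no branch. */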
23234 * General case: Jumpful:
23235 * xorl dest,dest cmpl op1, op2
23236 * cmpl op1, op2 movl ct, dest
23237 * setcc dest jcc 1f
23238 * decl dest movl cf, dest
23239 * andl (cf-ct),dest 1:
23240 * addl ct,dest
23242 * Size 20. Size 14.
23244 * This is reasonably steep, but branch mispredict costs are
23245 * high on modern CPUs, so consider failing only if optimizing
23246 * for space.
23249 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23250 && BRANCH_COST (optimize_insn_for_speed_p (),
23251 false) >= 2)
23253 if (cf == 0)
23255 machine_mode cmp_mode = GET_MODE (op0);
23256 enum rtx_code new_code;
23258 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23260 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23262 /* We may be reversing an unordered compare to a normal compare,
23263 which is not valid in general (we may convert a non-trapping
23264 condition into a trapping one); however, on i386 we currently
23265 emit all comparisons unordered. */
23266 new_code = reverse_condition_maybe_unordered (code);
23268 else
23270 new_code = ix86_reverse_condition (code, cmp_mode);
23271 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23272 compare_code = reverse_condition (compare_code);
23275 if (new_code != UNKNOWN)
23277 cf = ct;
23278 ct = 0;
23279 code = new_code;
23283 if (compare_code != UNKNOWN)
23285 /* notl op1 (if needed)
23286 sarl $31, op1
23287 andl (cf-ct), op1
23288 addl ct, op1
23290 For x < 0 (resp. x <= -1) there will be no notl,
23291 so if possible swap the constants to get rid of the
23292 complement.
23293 True/false will be -1/0 while code below (store flag
23294 followed by decrement) is 0/-1, so the constants need
23295 to be exchanged once more. */
23297 if (compare_code == GE || !cf)
23299 code = reverse_condition (code);
23300 compare_code = LT;
23302 else
23303 std::swap (ct, cf);
23305 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23307 else
23309 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23311 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23312 constm1_rtx,
23313 copy_rtx (out), 1, OPTAB_DIRECT);
23316 out = expand_simple_binop (mode, AND, copy_rtx (out),
23317 gen_int_mode (cf - ct, mode),
23318 copy_rtx (out), 1, OPTAB_DIRECT);
23319 if (ct)
23320 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23321 copy_rtx (out), 1, OPTAB_DIRECT);
23322 if (!rtx_equal_p (out, operands[0]))
23323 emit_move_insn (operands[0], copy_rtx (out));
23325 return true;
23329 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23331 /* Try a few things more with specific constants and a variable. */
23333 optab op;
23334 rtx var, orig_out, out, tmp;
23336 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23337 return false;
23339 /* If one of the two operands is an interesting constant, load a
23340 constant with the above and mask it in with a logical operation. */
23342 if (CONST_INT_P (operands[2]))
23344 var = operands[3];
23345 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23346 operands[3] = constm1_rtx, op = and_optab;
23347 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23348 operands[3] = const0_rtx, op = ior_optab;
23349 else
23350 return false;
23352 else if (CONST_INT_P (operands[3]))
23354 var = operands[2];
23355 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23356 operands[2] = constm1_rtx, op = and_optab;
23357 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23358 operands[2] = const0_rtx, op = ior_optab;
23359 else
23360 return false;
23362 else
23363 return false;
23365 orig_out = operands[0];
23366 tmp = gen_reg_rtx (mode);
23367 operands[0] = tmp;
23369 /* Recurse to get the constant loaded. */
23370 if (!ix86_expand_int_movcc (operands))
23371 return false;
23373 /* Mask in the interesting variable. */
23374 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23375 OPTAB_WIDEN);
23376 if (!rtx_equal_p (out, orig_out))
23377 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23379 return true;
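/* For instance (an illustration, not from the original comments):
dest = flag ? 0 : x recurses to compute tmp = flag ? 0 : -1 and then ANDs
x into it, while dest = flag ? -1 : x recurses to tmp = flag ? -1 : 0 and
ORs x in. */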
23383 * For comparison with above,
23385 * movl cf,dest
23386 * movl ct,tmp
23387 * cmpl op1,op2
23388 * cmovcc tmp,dest
23390 * Size 15.
23393 if (! nonimmediate_operand (operands[2], mode))
23394 operands[2] = force_reg (mode, operands[2]);
23395 if (! nonimmediate_operand (operands[3], mode))
23396 operands[3] = force_reg (mode, operands[3]);
23398 if (! register_operand (operands[2], VOIDmode)
23399 && (mode == QImode
23400 || ! register_operand (operands[3], VOIDmode)))
23401 operands[2] = force_reg (mode, operands[2]);
23403 if (mode == QImode
23404 && ! register_operand (operands[3], VOIDmode))
23405 operands[3] = force_reg (mode, operands[3]);
23407 emit_insn (compare_seq);
23408 emit_insn (gen_rtx_SET (operands[0],
23409 gen_rtx_IF_THEN_ELSE (mode,
23410 compare_op, operands[2],
23411 operands[3])));
23412 return true;
23415 /* Swap, force into registers, or otherwise massage the two operands
23416 to an SSE comparison with a mask result. Thus we differ a bit from
23417 ix86_prepare_fp_compare_args which expects to produce a flags result.
23419 The DEST operand exists to help determine whether to commute commutative
23420 operators. The POP0/POP1 operands are updated in place. The new
23421 comparison code is returned, or UNKNOWN if not implementable. */
23423 static enum rtx_code
23424 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23425 rtx *pop0, rtx *pop1)
23427 switch (code)
23429 case LTGT:
23430 case UNEQ:
23431 /* AVX supports all the needed comparisons. */
23432 if (TARGET_AVX)
23433 break;
23434 /* We have no LTGT as an operator. We could implement it with
23435 NE & ORDERED, but this requires an extra temporary. It's
23436 not clear that it's worth it. */
23437 return UNKNOWN;
23439 case LT:
23440 case LE:
23441 case UNGT:
23442 case UNGE:
23443 /* These are supported directly. */
23444 break;
23446 case EQ:
23447 case NE:
23448 case UNORDERED:
23449 case ORDERED:
23450 /* AVX has 3-operand comparisons; no need to swap anything. */
23451 if (TARGET_AVX)
23452 break;
23453 /* For commutative operators, try to canonicalize the destination
23454 operand to be first in the comparison - this helps reload to
23455 avoid extra moves. */
23456 if (!dest || !rtx_equal_p (dest, *pop1))
23457 break;
23458 /* FALLTHRU */
23460 case GE:
23461 case GT:
23462 case UNLE:
23463 case UNLT:
23464 /* These are not supported directly before AVX, and furthermore
23465 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23466 comparison operands to transform into something that is
23467 supported. */
23468 std::swap (*pop0, *pop1);
23469 code = swap_condition (code);
23470 break;
23472 default:
23473 gcc_unreachable ();
23476 return code;
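/* Background for the swaps above (reader's note, not from the original
comments): the pre-AVX cmpps/cmppd immediate only encodes EQ, LT, LE,
UNORD, NEQ, NLT, NLE and ORD, so GT/GE/UNLE/UNLT have to be rewritten as
their swapped-operand forms, whereas the 32 AVX vcmpps predicates cover
GT, GE, LTGT and UNEQ directly. */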
23479 /* Detect conditional moves that exactly match min/max operational
23480 semantics. Note that this is IEEE safe, as long as we don't
23481 interchange the operands.
23483 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23484 and TRUE if the operation is successful and instructions are emitted. */
23486 static bool
23487 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23488 rtx cmp_op1, rtx if_true, rtx if_false)
23490 machine_mode mode;
23491 bool is_min;
23492 rtx tmp;
23494 if (code == LT)
23496 else if (code == UNGE)
23497 std::swap (if_true, if_false);
23498 else
23499 return false;
23501 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23502 is_min = true;
23503 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23504 is_min = false;
23505 else
23506 return false;
23508 mode = GET_MODE (dest);
23510 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23511 but MODE may be a vector mode and thus not appropriate. */
23512 if (!flag_finite_math_only || flag_signed_zeros)
23514 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23515 rtvec v;
23517 if_true = force_reg (mode, if_true);
23518 v = gen_rtvec (2, if_true, if_false);
23519 tmp = gen_rtx_UNSPEC (mode, v, u);
23521 else
23523 code = is_min ? SMIN : SMAX;
23524 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23527 emit_insn (gen_rtx_SET (dest, tmp));
23528 return true;
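/* Why preserving operand order is IEEE-safe here (reader's note): minss
and minps return the second (source) operand when the comparison is
unordered or the operands are both zero, which is exactly what
"a < b ? a : b" requires when the compare is false, so the mapping onto
min keeps NaN and signed-zero behaviour intact. */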
23531 /* Expand an SSE vector comparison. Return the register with the result. */
23533 static rtx
23534 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23535 rtx op_true, rtx op_false)
23537 machine_mode mode = GET_MODE (dest);
23538 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23540 /* In the general case the result of the comparison can have a different mode than the operands. */
23541 machine_mode cmp_mode;
23543 /* In AVX512F the result of comparison is an integer mask. */
23544 bool maskcmp = false;
23545 rtx x;
23547 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23549 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
23550 gcc_assert (cmp_mode != BLKmode);
23552 maskcmp = true;
23554 else
23555 cmp_mode = cmp_ops_mode;
23558 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23559 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23560 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23562 if (optimize
23563 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23564 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23565 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23567 /* Compare patterns for int modes are unspec in AVX512F only. */
23568 if (maskcmp && (code == GT || code == EQ))
23570 rtx (*gen)(rtx, rtx, rtx);
23572 switch (cmp_ops_mode)
23574 case V64QImode:
23575 gcc_assert (TARGET_AVX512BW);
23576 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23577 break;
23578 case V32HImode:
23579 gcc_assert (TARGET_AVX512BW);
23580 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23581 break;
23582 case V16SImode:
23583 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23584 break;
23585 case V8DImode:
23586 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23587 break;
23588 default:
23589 gen = NULL;
23592 if (gen)
23594 emit_insn (gen (dest, cmp_op0, cmp_op1));
23595 return dest;
23598 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23600 if (cmp_mode != mode && !maskcmp)
23602 x = force_reg (cmp_ops_mode, x);
23603 convert_move (dest, x, false);
23605 else
23606 emit_insn (gen_rtx_SET (dest, x));
23608 return dest;
23611 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23612 operations. This is used for both scalar and vector conditional moves. */
23614 void
23615 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23617 machine_mode mode = GET_MODE (dest);
23618 machine_mode cmpmode = GET_MODE (cmp);
23620 /* In AVX512F the result of comparison is an integer mask. */
23621 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23623 rtx t2, t3, x;
23625 /* If we have an integer mask and an FP value then we need
23626 to cast the mask to FP mode. */
23627 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23629 cmp = force_reg (cmpmode, cmp);
23630 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23633 if (vector_all_ones_operand (op_true, mode)
23634 && rtx_equal_p (op_false, CONST0_RTX (mode))
23635 && !maskcmp)
23637 emit_insn (gen_rtx_SET (dest, cmp));
23639 else if (op_false == CONST0_RTX (mode)
23640 && !maskcmp)
23642 op_true = force_reg (mode, op_true);
23643 x = gen_rtx_AND (mode, cmp, op_true);
23644 emit_insn (gen_rtx_SET (dest, x));
23646 else if (op_true == CONST0_RTX (mode)
23647 && !maskcmp)
23649 op_false = force_reg (mode, op_false);
23650 x = gen_rtx_NOT (mode, cmp);
23651 x = gen_rtx_AND (mode, x, op_false);
23652 emit_insn (gen_rtx_SET (dest, x));
23654 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23655 && !maskcmp)
23657 op_false = force_reg (mode, op_false);
23658 x = gen_rtx_IOR (mode, cmp, op_false);
23659 emit_insn (gen_rtx_SET (dest, x));
23661 else if (TARGET_XOP
23662 && !maskcmp)
23664 op_true = force_reg (mode, op_true);
23666 if (!nonimmediate_operand (op_false, mode))
23667 op_false = force_reg (mode, op_false);
23669 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23670 op_true,
23671 op_false)));
23673 else
23675 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23676 rtx d = dest;
23678 if (!nonimmediate_operand (op_true, mode))
23679 op_true = force_reg (mode, op_true);
23681 op_false = force_reg (mode, op_false);
23683 switch (mode)
23685 case V4SFmode:
23686 if (TARGET_SSE4_1)
23687 gen = gen_sse4_1_blendvps;
23688 break;
23689 case V2DFmode:
23690 if (TARGET_SSE4_1)
23691 gen = gen_sse4_1_blendvpd;
23692 break;
23693 case V16QImode:
23694 case V8HImode:
23695 case V4SImode:
23696 case V2DImode:
23697 if (TARGET_SSE4_1)
23699 gen = gen_sse4_1_pblendvb;
23700 if (mode != V16QImode)
23701 d = gen_reg_rtx (V16QImode);
23702 op_false = gen_lowpart (V16QImode, op_false);
23703 op_true = gen_lowpart (V16QImode, op_true);
23704 cmp = gen_lowpart (V16QImode, cmp);
23706 break;
23707 case V8SFmode:
23708 if (TARGET_AVX)
23709 gen = gen_avx_blendvps256;
23710 break;
23711 case V4DFmode:
23712 if (TARGET_AVX)
23713 gen = gen_avx_blendvpd256;
23714 break;
23715 case V32QImode:
23716 case V16HImode:
23717 case V8SImode:
23718 case V4DImode:
23719 if (TARGET_AVX2)
23721 gen = gen_avx2_pblendvb;
23722 if (mode != V32QImode)
23723 d = gen_reg_rtx (V32QImode);
23724 op_false = gen_lowpart (V32QImode, op_false);
23725 op_true = gen_lowpart (V32QImode, op_true);
23726 cmp = gen_lowpart (V32QImode, cmp);
23728 break;
23730 case V64QImode:
23731 gen = gen_avx512bw_blendmv64qi;
23732 break;
23733 case V32HImode:
23734 gen = gen_avx512bw_blendmv32hi;
23735 break;
23736 case V16SImode:
23737 gen = gen_avx512f_blendmv16si;
23738 break;
23739 case V8DImode:
23740 gen = gen_avx512f_blendmv8di;
23741 break;
23742 case V8DFmode:
23743 gen = gen_avx512f_blendmv8df;
23744 break;
23745 case V16SFmode:
23746 gen = gen_avx512f_blendmv16sf;
23747 break;
23749 default:
23750 break;
23753 if (gen != NULL)
23755 emit_insn (gen (d, op_false, op_true, cmp));
23756 if (d != dest)
23757 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23759 else
23761 op_true = force_reg (mode, op_true);
23763 t2 = gen_reg_rtx (mode);
23764 if (optimize)
23765 t3 = gen_reg_rtx (mode);
23766 else
23767 t3 = dest;
23769 x = gen_rtx_AND (mode, op_true, cmp);
23770 emit_insn (gen_rtx_SET (t2, x));
23772 x = gen_rtx_NOT (mode, cmp);
23773 x = gen_rtx_AND (mode, x, op_false);
23774 emit_insn (gen_rtx_SET (t3, x));
23776 x = gen_rtx_IOR (mode, t3, t2);
23777 emit_insn (gen_rtx_SET (dest, x));
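/* Reader's note: the fallback above is the classic blend emulation
dest = (cmp & op_true) | (~cmp & op_false), which typically maps onto
pand/pandn/por or andps/andnps/orps when no blendv-style instruction is
available. */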
23782 /* Expand a floating-point conditional move. Return true if successful. */
23784 bool
23785 ix86_expand_fp_movcc (rtx operands[])
23787 machine_mode mode = GET_MODE (operands[0]);
23788 enum rtx_code code = GET_CODE (operands[1]);
23789 rtx tmp, compare_op;
23790 rtx op0 = XEXP (operands[1], 0);
23791 rtx op1 = XEXP (operands[1], 1);
23793 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23795 machine_mode cmode;
23797 /* Since we have no cmove for SSE registers, don't force bad register
23798 allocation just to gain access to it. Deny movcc when the
23799 comparison mode doesn't match the move mode. */
23800 cmode = GET_MODE (op0);
23801 if (cmode == VOIDmode)
23802 cmode = GET_MODE (op1);
23803 if (cmode != mode)
23804 return false;
23806 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23807 if (code == UNKNOWN)
23808 return false;
23810 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23811 operands[2], operands[3]))
23812 return true;
23814 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23815 operands[2], operands[3]);
23816 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23817 return true;
23820 if (GET_MODE (op0) == TImode
23821 || (GET_MODE (op0) == DImode
23822 && !TARGET_64BIT))
23823 return false;
23825 /* The floating point conditional move instructions don't directly
23826 support conditions resulting from a signed integer comparison. */
23828 compare_op = ix86_expand_compare (code, op0, op1);
23829 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23831 tmp = gen_reg_rtx (QImode);
23832 ix86_expand_setcc (tmp, code, op0, op1);
23834 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23837 emit_insn (gen_rtx_SET (operands[0],
23838 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23839 operands[2], operands[3])));
23841 return true;
23844 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23846 static int
23847 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23849 switch (code)
23851 case EQ:
23852 return 0;
23853 case LT:
23854 case LTU:
23855 return 1;
23856 case LE:
23857 case LEU:
23858 return 2;
23859 case NE:
23860 return 4;
23861 case GE:
23862 case GEU:
23863 return 5;
23864 case GT:
23865 case GTU:
23866 return 6;
23867 default:
23868 gcc_unreachable ();
23872 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23874 static int
23875 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23877 switch (code)
23879 case EQ:
23880 return 0x00;
23881 case NE:
23882 return 0x04;
23883 case GT:
23884 return 0x0e;
23885 case LE:
23886 return 0x02;
23887 case GE:
23888 return 0x0d;
23889 case LT:
23890 return 0x01;
23891 case UNLE:
23892 return 0x0a;
23893 case UNLT:
23894 return 0x09;
23895 case UNGE:
23896 return 0x05;
23897 case UNGT:
23898 return 0x06;
23899 case UNEQ:
23900 return 0x18;
23901 case LTGT:
23902 return 0x0c;
23903 case ORDERED:
23904 return 0x07;
23905 case UNORDERED:
23906 return 0x03;
23907 default:
23908 gcc_unreachable ();
23912 /* Return immediate value to be used in UNSPEC_PCMP
23913 for comparison CODE in MODE. */
23915 static int
23916 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23918 if (FLOAT_MODE_P (mode))
23919 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23920 return ix86_int_cmp_code_to_pcmp_immediate (code);
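/* Reader's note: these values follow the hardware immediate encodings;
the integer table matches the vpcmp{b,w,d,q} predicates (0 = eq, 1 = lt,
2 = le, 4 = neq, 5 = nlt, 6 = nle) and the FP table matches the
vcmpps/vcmppd predicate byte (e.g. 0x01 = LT_OS, 0x0e = GT_OS). */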
23923 /* Expand AVX-512 vector comparison. */
23925 bool
23926 ix86_expand_mask_vec_cmp (rtx operands[])
23928 machine_mode mask_mode = GET_MODE (operands[0]);
23929 machine_mode cmp_mode = GET_MODE (operands[2]);
23930 enum rtx_code code = GET_CODE (operands[1]);
23931 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23932 int unspec_code;
23933 rtx unspec;
23935 switch (code)
23937 case LEU:
23938 case GTU:
23939 case GEU:
23940 case LTU:
23941 unspec_code = UNSPEC_UNSIGNED_PCMP;
23942 break;
23944 default:
23945 unspec_code = UNSPEC_PCMP;
23948 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23949 operands[3], imm),
23950 unspec_code);
23951 emit_insn (gen_rtx_SET (operands[0], unspec));
23953 return true;
23956 /* Expand fp vector comparison. */
23958 bool
23959 ix86_expand_fp_vec_cmp (rtx operands[])
23961 enum rtx_code code = GET_CODE (operands[1]);
23962 rtx cmp;
23964 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23965 &operands[2], &operands[3]);
23966 if (code == UNKNOWN)
23968 rtx temp;
23969 switch (GET_CODE (operands[1]))
23971 case LTGT:
23972 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23973 operands[3], NULL, NULL);
23974 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23975 operands[3], NULL, NULL);
23976 code = AND;
23977 break;
23978 case UNEQ:
23979 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23980 operands[3], NULL, NULL);
23981 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23982 operands[3], NULL, NULL);
23983 code = IOR;
23984 break;
23985 default:
23986 gcc_unreachable ();
23988 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23989 OPTAB_DIRECT);
23991 else
23992 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23993 operands[1], operands[2]);
23995 if (operands[0] != cmp)
23996 emit_move_insn (operands[0], cmp);
23998 return true;
24001 static rtx
24002 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24003 rtx op_true, rtx op_false, bool *negate)
24005 machine_mode data_mode = GET_MODE (dest);
24006 machine_mode mode = GET_MODE (cop0);
24007 rtx x;
24009 *negate = false;
24011 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24012 if (TARGET_XOP
24013 && (mode == V16QImode || mode == V8HImode
24014 || mode == V4SImode || mode == V2DImode))
24016 else
24018 /* Canonicalize the comparison to EQ, GT, GTU. */
24019 switch (code)
24021 case EQ:
24022 case GT:
24023 case GTU:
24024 break;
24026 case NE:
24027 case LE:
24028 case LEU:
24029 code = reverse_condition (code);
24030 *negate = true;
24031 break;
24033 case GE:
24034 case GEU:
24035 code = reverse_condition (code);
24036 *negate = true;
24037 /* FALLTHRU */
24039 case LT:
24040 case LTU:
24041 std::swap (cop0, cop1);
24042 code = swap_condition (code);
24043 break;
24045 default:
24046 gcc_unreachable ();
24049 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24050 if (mode == V2DImode)
24052 switch (code)
24054 case EQ:
24055 /* SSE4.1 supports EQ. */
24056 if (!TARGET_SSE4_1)
24057 return NULL;
24058 break;
24060 case GT:
24061 case GTU:
24062 /* SSE4.2 supports GT/GTU. */
24063 if (!TARGET_SSE4_2)
24064 return NULL;
24065 break;
24067 default:
24068 gcc_unreachable ();
24072 /* Unsigned parallel compare is not supported by the hardware.
24073 Play some tricks to turn this into a signed comparison
24074 against 0. */
24075 if (code == GTU)
24077 cop0 = force_reg (mode, cop0);
24079 switch (mode)
24081 case V16SImode:
24082 case V8DImode:
24083 case V8SImode:
24084 case V4DImode:
24085 case V4SImode:
24086 case V2DImode:
24088 rtx t1, t2, mask;
24089 rtx (*gen_sub3) (rtx, rtx, rtx);
24091 switch (mode)
24093 case V16SImode: gen_sub3 = gen_subv16si3; break;
24094 case V8DImode: gen_sub3 = gen_subv8di3; break;
24095 case V8SImode: gen_sub3 = gen_subv8si3; break;
24096 case V4DImode: gen_sub3 = gen_subv4di3; break;
24097 case V4SImode: gen_sub3 = gen_subv4si3; break;
24098 case V2DImode: gen_sub3 = gen_subv2di3; break;
24099 default:
24100 gcc_unreachable ();
24102 /* Subtract (-(INT MAX) - 1) from both operands to make
24103 them signed. */
24104 mask = ix86_build_signbit_mask (mode, true, false);
24105 t1 = gen_reg_rtx (mode);
24106 emit_insn (gen_sub3 (t1, cop0, mask));
24108 t2 = gen_reg_rtx (mode);
24109 emit_insn (gen_sub3 (t2, cop1, mask));
24111 cop0 = t1;
24112 cop1 = t2;
24113 code = GT;
24115 break;
24117 case V64QImode:
24118 case V32HImode:
24119 case V32QImode:
24120 case V16HImode:
24121 case V16QImode:
24122 case V8HImode:
24123 /* Perform a parallel unsigned saturating subtraction. */
24124 x = gen_reg_rtx (mode);
24125 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24126 cop1)));
24128 cop0 = x;
24129 cop1 = CONST0_RTX (mode);
24130 code = EQ;
24131 *negate = !*negate;
24132 break;
24134 default:
24135 gcc_unreachable ();
24140 if (*negate)
24141 std::swap (op_true, op_false);
24143 /* Allow the comparison to be done in one mode, but the movcc to
24144 happen in another mode. */
24145 if (data_mode == mode)
24147 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24148 op_true, op_false);
24150 else
24152 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24153 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24154 op_true, op_false);
24155 if (GET_MODE (x) == mode)
24156 x = gen_lowpart (data_mode, x);
24159 return x;
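/* Example of the saturating-subtract trick above for the narrow modes
(an illustration, not from the sources): a V16QImode unsigned "a > b" is
computed as psubusb t, a, b; pcmpeqb t, zero, with the result negated via
*negate, since the saturating difference a - b is nonzero exactly when
a > b. */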
24162 /* Expand integer vector comparison. */
24164 bool
24165 ix86_expand_int_vec_cmp (rtx operands[])
24167 rtx_code code = GET_CODE (operands[1]);
24168 bool negate = false;
24169 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24170 operands[3], NULL, NULL, &negate);
24172 if (!cmp)
24173 return false;
24175 if (negate)
24176 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24177 CONST0_RTX (GET_MODE (cmp)),
24178 NULL, NULL, &negate);
24180 gcc_assert (!negate);
24182 if (operands[0] != cmp)
24183 emit_move_insn (operands[0], cmp);
24185 return true;
24188 /* Expand a floating-point vector conditional move; a vcond operation
24189 rather than a movcc operation. */
24191 bool
24192 ix86_expand_fp_vcond (rtx operands[])
24194 enum rtx_code code = GET_CODE (operands[3]);
24195 rtx cmp;
24197 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24198 &operands[4], &operands[5]);
24199 if (code == UNKNOWN)
24201 rtx temp;
24202 switch (GET_CODE (operands[3]))
24204 case LTGT:
24205 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24206 operands[5], operands[0], operands[0]);
24207 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24208 operands[5], operands[1], operands[2]);
24209 code = AND;
24210 break;
24211 case UNEQ:
24212 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24213 operands[5], operands[0], operands[0]);
24214 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24215 operands[5], operands[1], operands[2]);
24216 code = IOR;
24217 break;
24218 default:
24219 gcc_unreachable ();
24221 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24222 OPTAB_DIRECT);
24223 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24224 return true;
24227 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24228 operands[5], operands[1], operands[2]))
24229 return true;
24231 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24232 operands[1], operands[2]);
24233 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24234 return true;
24237 /* Expand a signed/unsigned integral vector conditional move. */
24239 bool
24240 ix86_expand_int_vcond (rtx operands[])
24242 machine_mode data_mode = GET_MODE (operands[0]);
24243 machine_mode mode = GET_MODE (operands[4]);
24244 enum rtx_code code = GET_CODE (operands[3]);
24245 bool negate = false;
24246 rtx x, cop0, cop1;
24248 cop0 = operands[4];
24249 cop1 = operands[5];
24251 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24252 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24253 if ((code == LT || code == GE)
24254 && data_mode == mode
24255 && cop1 == CONST0_RTX (mode)
24256 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24257 && GET_MODE_UNIT_SIZE (data_mode) > 1
24258 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24259 && (GET_MODE_SIZE (data_mode) == 16
24260 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24262 rtx negop = operands[2 - (code == LT)];
24263 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24264 if (negop == CONST1_RTX (data_mode))
24266 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24267 operands[0], 1, OPTAB_DIRECT);
24268 if (res != operands[0])
24269 emit_move_insn (operands[0], res);
24270 return true;
24272 else if (GET_MODE_INNER (data_mode) != DImode
24273 && vector_all_ones_operand (negop, data_mode))
24275 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24276 operands[0], 0, OPTAB_DIRECT);
24277 if (res != operands[0])
24278 emit_move_insn (operands[0], res);
24279 return true;
24283 if (!nonimmediate_operand (cop1, mode))
24284 cop1 = force_reg (mode, cop1);
24285 if (!general_operand (operands[1], data_mode))
24286 operands[1] = force_reg (data_mode, operands[1]);
24287 if (!general_operand (operands[2], data_mode))
24288 operands[2] = force_reg (data_mode, operands[2]);
24290 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24291 operands[1], operands[2], &negate);
24293 if (!x)
24294 return false;
24296 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24297 operands[2-negate]);
24298 return true;
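/* Reader's note restating the shortcut near the top: a V4SImode vcond of
the form "x < 0 ? -1 : 0" becomes a single arithmetic shift (psrad $31)
and "x < 0 ? 1 : 0" becomes a logical shift (psrld $31), avoiding the
compare and blend entirely. */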
24301 /* AVX512F does support 64-byte integer vector operations,
24302 thus the longest vector we are faced with is V64QImode. */
24303 #define MAX_VECT_LEN 64
24305 struct expand_vec_perm_d
24307 rtx target, op0, op1;
24308 unsigned char perm[MAX_VECT_LEN];
24309 machine_mode vmode;
24310 unsigned char nelt;
24311 bool one_operand_p;
24312 bool testing_p;
24315 static bool
24316 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
24317 struct expand_vec_perm_d *d)
24319 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
24320 expanders, so args are either in d, or in op0, op1 etc. */
24321 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24322 machine_mode maskmode = mode;
24323 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24325 switch (mode)
24327 case V8HImode:
24328 if (TARGET_AVX512VL && TARGET_AVX512BW)
24329 gen = gen_avx512vl_vpermi2varv8hi3;
24330 break;
24331 case V16HImode:
24332 if (TARGET_AVX512VL && TARGET_AVX512BW)
24333 gen = gen_avx512vl_vpermi2varv16hi3;
24334 break;
24335 case V64QImode:
24336 if (TARGET_AVX512VBMI)
24337 gen = gen_avx512bw_vpermi2varv64qi3;
24338 break;
24339 case V32HImode:
24340 if (TARGET_AVX512BW)
24341 gen = gen_avx512bw_vpermi2varv32hi3;
24342 break;
24343 case V4SImode:
24344 if (TARGET_AVX512VL)
24345 gen = gen_avx512vl_vpermi2varv4si3;
24346 break;
24347 case V8SImode:
24348 if (TARGET_AVX512VL)
24349 gen = gen_avx512vl_vpermi2varv8si3;
24350 break;
24351 case V16SImode:
24352 if (TARGET_AVX512F)
24353 gen = gen_avx512f_vpermi2varv16si3;
24354 break;
24355 case V4SFmode:
24356 if (TARGET_AVX512VL)
24358 gen = gen_avx512vl_vpermi2varv4sf3;
24359 maskmode = V4SImode;
24361 break;
24362 case V8SFmode:
24363 if (TARGET_AVX512VL)
24365 gen = gen_avx512vl_vpermi2varv8sf3;
24366 maskmode = V8SImode;
24368 break;
24369 case V16SFmode:
24370 if (TARGET_AVX512F)
24372 gen = gen_avx512f_vpermi2varv16sf3;
24373 maskmode = V16SImode;
24375 break;
24376 case V2DImode:
24377 if (TARGET_AVX512VL)
24378 gen = gen_avx512vl_vpermi2varv2di3;
24379 break;
24380 case V4DImode:
24381 if (TARGET_AVX512VL)
24382 gen = gen_avx512vl_vpermi2varv4di3;
24383 break;
24384 case V8DImode:
24385 if (TARGET_AVX512F)
24386 gen = gen_avx512f_vpermi2varv8di3;
24387 break;
24388 case V2DFmode:
24389 if (TARGET_AVX512VL)
24391 gen = gen_avx512vl_vpermi2varv2df3;
24392 maskmode = V2DImode;
24394 break;
24395 case V4DFmode:
24396 if (TARGET_AVX512VL)
24398 gen = gen_avx512vl_vpermi2varv4df3;
24399 maskmode = V4DImode;
24401 break;
24402 case V8DFmode:
24403 if (TARGET_AVX512F)
24405 gen = gen_avx512f_vpermi2varv8df3;
24406 maskmode = V8DImode;
24408 break;
24409 default:
24410 break;
24413 if (gen == NULL)
24414 return false;
24416 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
24417 expanders, so args are either in d, or in op0, op1 etc. */
24418 if (d)
24420 rtx vec[64];
24421 target = d->target;
24422 op0 = d->op0;
24423 op1 = d->op1;
24424 for (int i = 0; i < d->nelt; ++i)
24425 vec[i] = GEN_INT (d->perm[i]);
24426 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24429 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
24430 return true;
24433 /* Expand a variable vector permutation. */
24435 void
24436 ix86_expand_vec_perm (rtx operands[])
24438 rtx target = operands[0];
24439 rtx op0 = operands[1];
24440 rtx op1 = operands[2];
24441 rtx mask = operands[3];
24442 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24443 machine_mode mode = GET_MODE (op0);
24444 machine_mode maskmode = GET_MODE (mask);
24445 int w, e, i;
24446 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24448 /* Number of elements in the vector. */
24449 w = GET_MODE_NUNITS (mode);
24450 e = GET_MODE_UNIT_SIZE (mode);
24451 gcc_assert (w <= 64);
24453 if (TARGET_AVX512F && one_operand_shuffle)
24455 rtx (*gen) (rtx, rtx, rtx) = NULL;
24456 switch (mode)
24458 case V16SImode:
24459 gen = gen_avx512f_permvarv16si;
24460 break;
24461 case V16SFmode:
24462 gen = gen_avx512f_permvarv16sf;
24463 break;
24464 case V8DImode:
24465 gen = gen_avx512f_permvarv8di;
24466 break;
24467 case V8DFmode:
24468 gen = gen_avx512f_permvarv8df;
24469 break;
24470 default:
24471 break;
24473 if (gen != NULL)
24475 emit_insn (gen (target, op0, mask));
24476 return;
24480 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
24481 return;
24483 if (TARGET_AVX2)
24485 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24487 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24488 a constant shuffle operand. With a tiny bit of effort we can
24489 use VPERMD instead. A re-interpretation stall for V4DFmode is
24490 unfortunate but there's no avoiding it.
24491 Similarly, for V16HImode we don't have instructions for variable
24492 shuffling, while for V32QImode we can use vpshufb; vpshufb;
24493 vpermq; vpor after preparing suitable masks. */
24495 if (mode == V16HImode)
24497 maskmode = mode = V32QImode;
24498 w = 32;
24499 e = 1;
24501 else
24503 maskmode = mode = V8SImode;
24504 w = 8;
24505 e = 4;
24507 t1 = gen_reg_rtx (maskmode);
24509 /* Replicate the low bits of the V4DImode mask into V8SImode:
24510 mask = { A B C D }
24511 t1 = { A A B B C C D D }. */
24512 for (i = 0; i < w / 2; ++i)
24513 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24514 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24515 vt = force_reg (maskmode, vt);
24516 mask = gen_lowpart (maskmode, mask);
24517 if (maskmode == V8SImode)
24518 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24519 else
24520 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24522 /* Multiply the shuffle indices by two. */
24523 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24524 OPTAB_DIRECT);
24526 /* Add one to the odd shuffle indices:
24527 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24528 for (i = 0; i < w / 2; ++i)
24530 vec[i * 2] = const0_rtx;
24531 vec[i * 2 + 1] = const1_rtx;
24533 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24534 vt = validize_mem (force_const_mem (maskmode, vt));
24535 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24536 OPTAB_DIRECT);
24538 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24539 operands[3] = mask = t1;
24540 target = gen_reg_rtx (mode);
24541 op0 = gen_lowpart (mode, op0);
24542 op1 = gen_lowpart (mode, op1);
24545 switch (mode)
24547 case V8SImode:
24548 /* The VPERMD and VPERMPS instructions already properly ignore
24549 the high bits of the shuffle elements. No need for us to
24550 perform an AND ourselves. */
24551 if (one_operand_shuffle)
24553 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24554 if (target != operands[0])
24555 emit_move_insn (operands[0],
24556 gen_lowpart (GET_MODE (operands[0]), target));
24558 else
24560 t1 = gen_reg_rtx (V8SImode);
24561 t2 = gen_reg_rtx (V8SImode);
24562 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24563 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24564 goto merge_two;
24566 return;
24568 case V8SFmode:
24569 mask = gen_lowpart (V8SImode, mask);
24570 if (one_operand_shuffle)
24571 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24572 else
24574 t1 = gen_reg_rtx (V8SFmode);
24575 t2 = gen_reg_rtx (V8SFmode);
24576 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24577 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24578 goto merge_two;
24580 return;
24582 case V4SImode:
24583 /* By combining the two 128-bit input vectors into one 256-bit
24584 input vector, we can use VPERMD and VPERMPS for the full
24585 two-operand shuffle. */
24586 t1 = gen_reg_rtx (V8SImode);
24587 t2 = gen_reg_rtx (V8SImode);
24588 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24589 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24590 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24591 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24592 return;
24594 case V4SFmode:
24595 t1 = gen_reg_rtx (V8SFmode);
24596 t2 = gen_reg_rtx (V8SImode);
24597 mask = gen_lowpart (V4SImode, mask);
24598 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24599 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24600 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24601 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24602 return;
24604 case V32QImode:
24605 t1 = gen_reg_rtx (V32QImode);
24606 t2 = gen_reg_rtx (V32QImode);
24607 t3 = gen_reg_rtx (V32QImode);
24608 vt2 = GEN_INT (-128);
24609 for (i = 0; i < 32; i++)
24610 vec[i] = vt2;
24611 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24612 vt = force_reg (V32QImode, vt);
24613 for (i = 0; i < 32; i++)
24614 vec[i] = i < 16 ? vt2 : const0_rtx;
24615 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24616 vt2 = force_reg (V32QImode, vt2);
24617 /* From mask create two adjusted masks, which contain the same
24618 bits as mask in the low 7 bits of each vector element.
24619 The first mask will have the most significant bit clear
24620 if it requests element from the same 128-bit lane
24621 and MSB set if it requests element from the other 128-bit lane.
24622 The second mask will have the opposite values of the MSB,
24623 and additionally will have its 128-bit lanes swapped.
24624 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24625 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24626 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24627 stands for the other 12 bytes. */
24628 /* The bit saying whether an element is from the same lane or the other
24629 lane is bit 4, so shift it up by 3 to the MSB position. */
24630 t5 = gen_reg_rtx (V4DImode);
24631 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24632 GEN_INT (3)));
24633 /* Clear MSB bits from the mask just in case it had them set. */
24634 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24635 /* After this t1 will have MSB set for elements from other lane. */
24636 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24637 /* Clear bits other than MSB. */
24638 emit_insn (gen_andv32qi3 (t1, t1, vt));
24639 /* Or in the lower bits from mask into t3. */
24640 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24641 /* And invert MSB bits in t1, so MSB is set for elements from the same
24642 lane. */
24643 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24644 /* Swap 128-bit lanes in t3. */
24645 t6 = gen_reg_rtx (V4DImode);
24646 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24647 const2_rtx, GEN_INT (3),
24648 const0_rtx, const1_rtx));
24649 /* And or in the lower bits from mask into t1. */
24650 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24651 if (one_operand_shuffle)
24653 /* Each of these shuffles will put 0s in places where an
24654 element from the other 128-bit lane is needed; otherwise
24655 it will shuffle in the requested value. */
24656 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24657 gen_lowpart (V32QImode, t6)));
24658 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24659 /* For t3 the 128-bit lanes are swapped again. */
24660 t7 = gen_reg_rtx (V4DImode);
24661 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24662 const2_rtx, GEN_INT (3),
24663 const0_rtx, const1_rtx));
24664 /* And ORing both together leads to the result. */
24665 emit_insn (gen_iorv32qi3 (target, t1,
24666 gen_lowpart (V32QImode, t7)));
24667 if (target != operands[0])
24668 emit_move_insn (operands[0],
24669 gen_lowpart (GET_MODE (operands[0]), target));
24670 return;
24673 t4 = gen_reg_rtx (V32QImode);
24674 /* Similar to the one_operand_shuffle code above, just
24675 repeated twice, once for each operand. The merge_two:
24676 code will merge the two results together. */
24677 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24678 gen_lowpart (V32QImode, t6)));
24679 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24680 gen_lowpart (V32QImode, t6)));
24681 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24682 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24683 t7 = gen_reg_rtx (V4DImode);
24684 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24685 const2_rtx, GEN_INT (3),
24686 const0_rtx, const1_rtx));
24687 t8 = gen_reg_rtx (V4DImode);
24688 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24689 const2_rtx, GEN_INT (3),
24690 const0_rtx, const1_rtx));
24691 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24692 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24693 t1 = t4;
24694 t2 = t3;
24695 goto merge_two;
24697 default:
24698 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24699 break;
24703 if (TARGET_XOP)
24705 /* The XOP VPPERM insn supports three inputs. By ignoring the
24706 one_operand_shuffle special case, we avoid creating another
24707 set of constant vectors in memory. */
24708 one_operand_shuffle = false;
24710 /* mask = mask & {2*w-1, ...} */
24711 vt = GEN_INT (2*w - 1);
24713 else
24715 /* mask = mask & {w-1, ...} */
24716 vt = GEN_INT (w - 1);
24719 for (i = 0; i < w; i++)
24720 vec[i] = vt;
24721 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24722 mask = expand_simple_binop (maskmode, AND, mask, vt,
24723 NULL_RTX, 0, OPTAB_DIRECT);
24725 /* For non-QImode operations, convert the word permutation control
24726 into a byte permutation control. */
24727 if (mode != V16QImode)
24729 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24730 GEN_INT (exact_log2 (e)),
24731 NULL_RTX, 0, OPTAB_DIRECT);
24733 /* Convert mask to vector of chars. */
24734 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24736 /* Replicate each of the input bytes into byte positions:
24737 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24738 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24739 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24740 for (i = 0; i < 16; ++i)
24741 vec[i] = GEN_INT (i/e * e);
24742 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24743 vt = validize_mem (force_const_mem (V16QImode, vt));
24744 if (TARGET_XOP)
24745 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24746 else
24747 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24749 /* Convert it into the byte positions by doing
24750 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24751 for (i = 0; i < 16; ++i)
24752 vec[i] = GEN_INT (i % e);
24753 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24754 vt = validize_mem (force_const_mem (V16QImode, vt));
24755 emit_insn (gen_addv16qi3 (mask, mask, vt));
24758 /* The actual shuffle operations all operate on V16QImode. */
24759 op0 = gen_lowpart (V16QImode, op0);
24760 op1 = gen_lowpart (V16QImode, op1);
24762 if (TARGET_XOP)
24764 if (GET_MODE (target) != V16QImode)
24765 target = gen_reg_rtx (V16QImode);
24766 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24767 if (target != operands[0])
24768 emit_move_insn (operands[0],
24769 gen_lowpart (GET_MODE (operands[0]), target));
24771 else if (one_operand_shuffle)
24773 if (GET_MODE (target) != V16QImode)
24774 target = gen_reg_rtx (V16QImode);
24775 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24776 if (target != operands[0])
24777 emit_move_insn (operands[0],
24778 gen_lowpart (GET_MODE (operands[0]), target));
24780 else
24782 rtx xops[6];
24783 bool ok;
24785 /* Shuffle the two input vectors independently. */
24786 t1 = gen_reg_rtx (V16QImode);
24787 t2 = gen_reg_rtx (V16QImode);
24788 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24789 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24791 merge_two:
24792 /* Then merge them together. The key is whether any given control
24793 element contained a bit set that indicates the second word. */
24794 mask = operands[3];
24795 vt = GEN_INT (w);
24796 if (maskmode == V2DImode && !TARGET_SSE4_1)
24798 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24799 more shuffle to convert the V2DI input mask into a V4SI
24800 input mask. At that point the masking done by expand_int_vcond
24801 will work as desired. */
24802 rtx t3 = gen_reg_rtx (V4SImode);
24803 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24804 const0_rtx, const0_rtx,
24805 const2_rtx, const2_rtx));
24806 mask = t3;
24807 maskmode = V4SImode;
24808 e = w = 4;
24811 for (i = 0; i < w; i++)
24812 vec[i] = vt;
24813 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24814 vt = force_reg (maskmode, vt);
24815 mask = expand_simple_binop (maskmode, AND, mask, vt,
24816 NULL_RTX, 0, OPTAB_DIRECT);
24818 if (GET_MODE (target) != mode)
24819 target = gen_reg_rtx (mode);
24820 xops[0] = target;
24821 xops[1] = gen_lowpart (mode, t2);
24822 xops[2] = gen_lowpart (mode, t1);
24823 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24824 xops[4] = mask;
24825 xops[5] = vt;
24826 ok = ix86_expand_int_vcond (xops);
24827 gcc_assert (ok);
24828 if (target != operands[0])
24829 emit_move_insn (operands[0],
24830 gen_lowpart (GET_MODE (operands[0]), target));
24834 /* Unpack SRC into DEST, which has the next wider integer vector type. UNSIGNED_P is
24835 true if we should do zero extension, else sign extension. HIGH_P is
24836 true if we want the N/2 high elements, else the low elements. */
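/* For example, widening the high half of a V8HImode SRC with UNSIGNED_P set
   is done on SSE4.1 targets by shifting the upper 8 bytes down with a
   V1TImode logical shift and then using the zero-extending unpack
   (typically pmovzxwd); without SSE4.1 the same result comes from
   interleaving SRC with a zero vector (or, for the signed case, with a
   sign-bit mask computed by a GT comparison against zero).  */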
24838 void
24839 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24841 machine_mode imode = GET_MODE (src);
24842 rtx tmp;
24844 if (TARGET_SSE4_1)
24846 rtx (*unpack)(rtx, rtx);
24847 rtx (*extract)(rtx, rtx) = NULL;
24848 machine_mode halfmode = BLKmode;
24850 switch (imode)
24852 case V64QImode:
24853 if (unsigned_p)
24854 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24855 else
24856 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24857 halfmode = V32QImode;
24858 extract
24859 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24860 break;
24861 case V32QImode:
24862 if (unsigned_p)
24863 unpack = gen_avx2_zero_extendv16qiv16hi2;
24864 else
24865 unpack = gen_avx2_sign_extendv16qiv16hi2;
24866 halfmode = V16QImode;
24867 extract
24868 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24869 break;
24870 case V32HImode:
24871 if (unsigned_p)
24872 unpack = gen_avx512f_zero_extendv16hiv16si2;
24873 else
24874 unpack = gen_avx512f_sign_extendv16hiv16si2;
24875 halfmode = V16HImode;
24876 extract
24877 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24878 break;
24879 case V16HImode:
24880 if (unsigned_p)
24881 unpack = gen_avx2_zero_extendv8hiv8si2;
24882 else
24883 unpack = gen_avx2_sign_extendv8hiv8si2;
24884 halfmode = V8HImode;
24885 extract
24886 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24887 break;
24888 case V16SImode:
24889 if (unsigned_p)
24890 unpack = gen_avx512f_zero_extendv8siv8di2;
24891 else
24892 unpack = gen_avx512f_sign_extendv8siv8di2;
24893 halfmode = V8SImode;
24894 extract
24895 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24896 break;
24897 case V8SImode:
24898 if (unsigned_p)
24899 unpack = gen_avx2_zero_extendv4siv4di2;
24900 else
24901 unpack = gen_avx2_sign_extendv4siv4di2;
24902 halfmode = V4SImode;
24903 extract
24904 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24905 break;
24906 case V16QImode:
24907 if (unsigned_p)
24908 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24909 else
24910 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24911 break;
24912 case V8HImode:
24913 if (unsigned_p)
24914 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24915 else
24916 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24917 break;
24918 case V4SImode:
24919 if (unsigned_p)
24920 unpack = gen_sse4_1_zero_extendv2siv2di2;
24921 else
24922 unpack = gen_sse4_1_sign_extendv2siv2di2;
24923 break;
24924 default:
24925 gcc_unreachable ();
24928 if (GET_MODE_SIZE (imode) >= 32)
24930 tmp = gen_reg_rtx (halfmode);
24931 emit_insn (extract (tmp, src));
24933 else if (high_p)
24935 /* Shift higher 8 bytes to lower 8 bytes. */
24936 tmp = gen_reg_rtx (V1TImode);
24937 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24938 GEN_INT (64)));
24939 tmp = gen_lowpart (imode, tmp);
24941 else
24942 tmp = src;
24944 emit_insn (unpack (dest, tmp));
24946 else
24948 rtx (*unpack)(rtx, rtx, rtx);
24950 switch (imode)
24952 case V16QImode:
24953 if (high_p)
24954 unpack = gen_vec_interleave_highv16qi;
24955 else
24956 unpack = gen_vec_interleave_lowv16qi;
24957 break;
24958 case V8HImode:
24959 if (high_p)
24960 unpack = gen_vec_interleave_highv8hi;
24961 else
24962 unpack = gen_vec_interleave_lowv8hi;
24963 break;
24964 case V4SImode:
24965 if (high_p)
24966 unpack = gen_vec_interleave_highv4si;
24967 else
24968 unpack = gen_vec_interleave_lowv4si;
24969 break;
24970 default:
24971 gcc_unreachable ();
24974 if (unsigned_p)
24975 tmp = force_reg (imode, CONST0_RTX (imode));
24976 else
24977 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24978 src, pc_rtx, pc_rtx);
24980 rtx tmp2 = gen_reg_rtx (imode);
24981 emit_insn (unpack (tmp2, src, tmp));
24982 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24986 /* Expand conditional increment or decrement using adc/sbb instructions.
24987 The default case using setcc followed by the conditional move can be
24988 done by generic code. */
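/* For example, "if (a < b) x++;" with unsigned operands can be expanded
   here as a compare that leaves the condition in the carry flag followed
   by an add-with-carry of zero; the sbb form handles the decrement and
   the reversed-condition cases the same way.  */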
24989 bool
24990 ix86_expand_int_addcc (rtx operands[])
24992 enum rtx_code code = GET_CODE (operands[1]);
24993 rtx flags;
24994 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24995 rtx compare_op;
24996 rtx val = const0_rtx;
24997 bool fpcmp = false;
24998 machine_mode mode;
24999 rtx op0 = XEXP (operands[1], 0);
25000 rtx op1 = XEXP (operands[1], 1);
25002 if (operands[3] != const1_rtx
25003 && operands[3] != constm1_rtx)
25004 return false;
25005 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25006 return false;
25007 code = GET_CODE (compare_op);
25009 flags = XEXP (compare_op, 0);
25011 if (GET_MODE (flags) == CCFPmode
25012 || GET_MODE (flags) == CCFPUmode)
25014 fpcmp = true;
25015 code = ix86_fp_compare_code_to_integer (code);
25018 if (code != LTU)
25020 val = constm1_rtx;
25021 if (fpcmp)
25022 PUT_CODE (compare_op,
25023 reverse_condition_maybe_unordered
25024 (GET_CODE (compare_op)));
25025 else
25026 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25029 mode = GET_MODE (operands[0]);
25031 /* Construct either adc or sbb insn. */
25032 if ((code == LTU) == (operands[3] == constm1_rtx))
25034 switch (mode)
25036 case QImode:
25037 insn = gen_subqi3_carry;
25038 break;
25039 case HImode:
25040 insn = gen_subhi3_carry;
25041 break;
25042 case SImode:
25043 insn = gen_subsi3_carry;
25044 break;
25045 case DImode:
25046 insn = gen_subdi3_carry;
25047 break;
25048 default:
25049 gcc_unreachable ();
25052 else
25054 switch (mode)
25056 case QImode:
25057 insn = gen_addqi3_carry;
25058 break;
25059 case HImode:
25060 insn = gen_addhi3_carry;
25061 break;
25062 case SImode:
25063 insn = gen_addsi3_carry;
25064 break;
25065 case DImode:
25066 insn = gen_adddi3_carry;
25067 break;
25068 default:
25069 gcc_unreachable ();
25072 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25074 return true;
25078 /* Split OPERAND into half-mode (or smaller) parts stored in PARTS. Similar to split_double_mode,
25079 but works for floating point parameters and non-offsettable memories.
25080 For pushes, it returns just stack offsets; the values will be saved
25081 in the right order. At most four parts are generated. */
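/* For instance, on ia32 an XFmode operand is returned as three SImode
   parts and a TFmode operand as four, while on x86-64 an XFmode operand
   becomes a DImode part plus an SImode part holding the upper bits.  */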
25083 static int
25084 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25086 int size;
25088 if (!TARGET_64BIT)
25089 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25090 else
25091 size = (GET_MODE_SIZE (mode) + 4) / 8;
25093 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25094 gcc_assert (size >= 2 && size <= 4);
25096 /* Optimize constant pool reference to immediates. This is used by fp
25097 moves, that force all constants to memory to allow combining. */
25098 if (MEM_P (operand) && MEM_READONLY_P (operand))
25100 rtx tmp = maybe_get_pool_constant (operand);
25101 if (tmp)
25102 operand = tmp;
25105 if (MEM_P (operand) && !offsettable_memref_p (operand))
25107 /* The only non-offsettable memories we handle are pushes. */
25108 int ok = push_operand (operand, VOIDmode);
25110 gcc_assert (ok);
25112 operand = copy_rtx (operand);
25113 PUT_MODE (operand, word_mode);
25114 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25115 return size;
25118 if (GET_CODE (operand) == CONST_VECTOR)
25120 machine_mode imode = int_mode_for_mode (mode);
25121 /* Caution: if we looked through a constant pool memory above,
25122 the operand may actually have a different mode now. That's
25123 ok, since we want to pun this all the way back to an integer. */
25124 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25125 gcc_assert (operand != NULL);
25126 mode = imode;
25129 if (!TARGET_64BIT)
25131 if (mode == DImode)
25132 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25133 else
25135 int i;
25137 if (REG_P (operand))
25139 gcc_assert (reload_completed);
25140 for (i = 0; i < size; i++)
25141 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25143 else if (offsettable_memref_p (operand))
25145 operand = adjust_address (operand, SImode, 0);
25146 parts[0] = operand;
25147 for (i = 1; i < size; i++)
25148 parts[i] = adjust_address (operand, SImode, 4 * i);
25150 else if (CONST_DOUBLE_P (operand))
25152 const REAL_VALUE_TYPE *r;
25153 long l[4];
25155 r = CONST_DOUBLE_REAL_VALUE (operand);
25156 switch (mode)
25158 case TFmode:
25159 real_to_target (l, r, mode);
25160 parts[3] = gen_int_mode (l[3], SImode);
25161 parts[2] = gen_int_mode (l[2], SImode);
25162 break;
25163 case XFmode:
25164 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25165 long double may not be 80-bit. */
25166 real_to_target (l, r, mode);
25167 parts[2] = gen_int_mode (l[2], SImode);
25168 break;
25169 case DFmode:
25170 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25171 break;
25172 default:
25173 gcc_unreachable ();
25175 parts[1] = gen_int_mode (l[1], SImode);
25176 parts[0] = gen_int_mode (l[0], SImode);
25178 else
25179 gcc_unreachable ();
25182 else
25184 if (mode == TImode)
25185 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25186 if (mode == XFmode || mode == TFmode)
25188 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25189 if (REG_P (operand))
25191 gcc_assert (reload_completed);
25192 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25193 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25195 else if (offsettable_memref_p (operand))
25197 operand = adjust_address (operand, DImode, 0);
25198 parts[0] = operand;
25199 parts[1] = adjust_address (operand, upper_mode, 8);
25201 else if (CONST_DOUBLE_P (operand))
25203 long l[4];
25205 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25207 /* real_to_target puts 32-bit pieces in each long. */
25208 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25209 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25210 << 32), DImode);
25212 if (upper_mode == SImode)
25213 parts[1] = gen_int_mode (l[2], SImode);
25214 else
25215 parts[1]
25216 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25217 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25218 << 32), DImode);
25220 else
25221 gcc_unreachable ();
25225 return size;
25228 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25229 The value is split into word-sized parts; operands 2-5 are used to
25230 hold the destination parts and operands 6-9 the source parts, in
25231 the order in which the moves are finally emitted. */
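/* For instance, a DImode register-to-register move on ia32 becomes two
   SImode moves, emitted in an order chosen below so that no source part
   is overwritten before it has been read.  */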
25233 void
25234 ix86_split_long_move (rtx operands[])
25236 rtx part[2][4];
25237 int nparts, i, j;
25238 int push = 0;
25239 int collisions = 0;
25240 machine_mode mode = GET_MODE (operands[0]);
25241 bool collisionparts[4];
25243 /* The DFmode expanders may ask us to move a double.
25244 For a 64-bit target this is a single move. By hiding that fact
25245 here we simplify the i386.md splitters. */
25246 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25248 /* Optimize constant pool reference to immediates. This is used by
25249 fp moves, that force all constants to memory to allow combining. */
25251 if (MEM_P (operands[1])
25252 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25253 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25254 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25255 if (push_operand (operands[0], VOIDmode))
25257 operands[0] = copy_rtx (operands[0]);
25258 PUT_MODE (operands[0], word_mode);
25260 else
25261 operands[0] = gen_lowpart (DImode, operands[0]);
25262 operands[1] = gen_lowpart (DImode, operands[1]);
25263 emit_move_insn (operands[0], operands[1]);
25264 return;
25267 /* The only non-offsettable memory we handle is push. */
25268 if (push_operand (operands[0], VOIDmode))
25269 push = 1;
25270 else
25271 gcc_assert (!MEM_P (operands[0])
25272 || offsettable_memref_p (operands[0]));
25274 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25275 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25277 /* When emitting a push, take care of source operands on the stack. */
25278 if (push && MEM_P (operands[1])
25279 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25281 rtx src_base = XEXP (part[1][nparts - 1], 0);
25283 /* Compensate for the stack decrement by 4. */
25284 if (!TARGET_64BIT && nparts == 3
25285 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25286 src_base = plus_constant (Pmode, src_base, 4);
25288 /* src_base refers to the stack pointer and is
25289 automatically decreased by the emitted pushes. */
25290 for (i = 0; i < nparts; i++)
25291 part[1][i] = change_address (part[1][i],
25292 GET_MODE (part[1][i]), src_base);
25295 /* We need to do the copy in the right order in case an address register
25296 of the source overlaps the destination. */
25297 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25299 rtx tmp;
25301 for (i = 0; i < nparts; i++)
25303 collisionparts[i]
25304 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25305 if (collisionparts[i])
25306 collisions++;
25309 /* Collision in the middle part can be handled by reordering. */
25310 if (collisions == 1 && nparts == 3 && collisionparts [1])
25312 std::swap (part[0][1], part[0][2]);
25313 std::swap (part[1][1], part[1][2]);
25315 else if (collisions == 1
25316 && nparts == 4
25317 && (collisionparts [1] || collisionparts [2]))
25319 if (collisionparts [1])
25321 std::swap (part[0][1], part[0][2]);
25322 std::swap (part[1][1], part[1][2]);
25324 else
25326 std::swap (part[0][2], part[0][3]);
25327 std::swap (part[1][2], part[1][3]);
25331 /* If there are more collisions, we can't handle them by reordering.
25332 Do an lea into the last part and use only one colliding move. */
25333 else if (collisions > 1)
25335 rtx base, addr, tls_base = NULL_RTX;
25337 collisions = 1;
25339 base = part[0][nparts - 1];
25341 /* Handle the case when the last part isn't valid for lea.
25342 Happens in 64-bit mode storing the 12-byte XFmode. */
25343 if (GET_MODE (base) != Pmode)
25344 base = gen_rtx_REG (Pmode, REGNO (base));
25346 addr = XEXP (part[1][0], 0);
25347 if (TARGET_TLS_DIRECT_SEG_REFS)
25349 struct ix86_address parts;
25350 int ok = ix86_decompose_address (addr, &parts);
25351 gcc_assert (ok);
25352 if (parts.seg == DEFAULT_TLS_SEG_REG)
25354 /* It is not valid to use %gs: or %fs: in
25355 an lea though, so we need to remove it from the
25356 address used for the lea and add it to each individual
25357 memory load instead. */
25358 addr = copy_rtx (addr);
25359 rtx *x = &addr;
25360 while (GET_CODE (*x) == PLUS)
25362 for (i = 0; i < 2; i++)
25364 rtx u = XEXP (*x, i);
25365 if (GET_CODE (u) == ZERO_EXTEND)
25366 u = XEXP (u, 0);
25367 if (GET_CODE (u) == UNSPEC
25368 && XINT (u, 1) == UNSPEC_TP)
25370 tls_base = XEXP (*x, i);
25371 *x = XEXP (*x, 1 - i);
25372 break;
25375 if (tls_base)
25376 break;
25377 x = &XEXP (*x, 0);
25379 gcc_assert (tls_base);
25382 emit_insn (gen_rtx_SET (base, addr));
25383 if (tls_base)
25384 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
25385 part[1][0] = replace_equiv_address (part[1][0], base);
25386 for (i = 1; i < nparts; i++)
25388 if (tls_base)
25389 base = copy_rtx (base);
25390 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25391 part[1][i] = replace_equiv_address (part[1][i], tmp);
25396 if (push)
25398 if (!TARGET_64BIT)
25400 if (nparts == 3)
25402 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25403 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25404 stack_pointer_rtx, GEN_INT (-4)));
25405 emit_move_insn (part[0][2], part[1][2]);
25407 else if (nparts == 4)
25409 emit_move_insn (part[0][3], part[1][3]);
25410 emit_move_insn (part[0][2], part[1][2]);
25413 else
25415 /* In 64-bit mode we don't have a 32-bit push available. In case this is a
25416 register, it is OK - we will just use the larger counterpart. We also
25417 retype memory - this comes from an attempt to avoid a REX prefix on
25418 moving the second half of a TFmode value. */
25419 if (GET_MODE (part[1][1]) == SImode)
25421 switch (GET_CODE (part[1][1]))
25423 case MEM:
25424 part[1][1] = adjust_address (part[1][1], DImode, 0);
25425 break;
25427 case REG:
25428 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25429 break;
25431 default:
25432 gcc_unreachable ();
25435 if (GET_MODE (part[1][0]) == SImode)
25436 part[1][0] = part[1][1];
25439 emit_move_insn (part[0][1], part[1][1]);
25440 emit_move_insn (part[0][0], part[1][0]);
25441 return;
25444 /* Choose the correct order so as not to overwrite the source before it is copied. */
25445 if ((REG_P (part[0][0])
25446 && REG_P (part[1][1])
25447 && (REGNO (part[0][0]) == REGNO (part[1][1])
25448 || (nparts == 3
25449 && REGNO (part[0][0]) == REGNO (part[1][2]))
25450 || (nparts == 4
25451 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25452 || (collisions > 0
25453 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25455 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25457 operands[2 + i] = part[0][j];
25458 operands[6 + i] = part[1][j];
25461 else
25463 for (i = 0; i < nparts; i++)
25465 operands[2 + i] = part[0][i];
25466 operands[6 + i] = part[1][i];
25470 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25471 if (optimize_insn_for_size_p ())
25473 for (j = 0; j < nparts - 1; j++)
25474 if (CONST_INT_P (operands[6 + j])
25475 && operands[6 + j] != const0_rtx
25476 && REG_P (operands[2 + j]))
25477 for (i = j; i < nparts - 1; i++)
25478 if (CONST_INT_P (operands[7 + i])
25479 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25480 operands[7 + i] = operands[2 + j];
25483 for (i = 0; i < nparts; i++)
25484 emit_move_insn (operands[2 + i], operands[6 + i]);
25486 return;
25489 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25490 left shift by a constant, either using a single shift or
25491 a sequence of add instructions. */
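/* For example, a constant shift by 1 is emitted as a single add of the
   half-wide register to itself; on tunings where a few adds are cheaper
   than a shift by a constant (and we are not optimizing for size) slightly
   larger counts may also become repeated adds, otherwise a single shift
   instruction is used.  */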
25493 static void
25494 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25496 rtx (*insn)(rtx, rtx, rtx);
25498 if (count == 1
25499 || (count * ix86_cost->add <= ix86_cost->shift_const
25500 && !optimize_insn_for_size_p ()))
25502 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25503 while (count-- > 0)
25504 emit_insn (insn (operand, operand, operand));
25506 else
25508 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25509 emit_insn (insn (operand, operand, GEN_INT (count)));
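/* Split a double-word left shift, OPERANDS[0] = OPERANDS[1] << OPERANDS[2],
   into operations on the two half-wide parts.  Roughly: constant counts of
   at least half the width just move the low input into the high result and
   clear the low result; smaller constant counts use shld for the high part
   plus a shift (or adds) of the low part; variable counts emit an shld/shift
   pair followed by a fixup for counts of half the width or more, using
   SCRATCH and a conditional move when available.  The ashr/lshr splitters
   below follow the same structure.  */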
25513 void
25514 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25516 rtx (*gen_ashl3)(rtx, rtx, rtx);
25517 rtx (*gen_shld)(rtx, rtx, rtx);
25518 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25520 rtx low[2], high[2];
25521 int count;
25523 if (CONST_INT_P (operands[2]))
25525 split_double_mode (mode, operands, 2, low, high);
25526 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25528 if (count >= half_width)
25530 emit_move_insn (high[0], low[1]);
25531 emit_move_insn (low[0], const0_rtx);
25533 if (count > half_width)
25534 ix86_expand_ashl_const (high[0], count - half_width, mode);
25536 else
25538 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25540 if (!rtx_equal_p (operands[0], operands[1]))
25541 emit_move_insn (operands[0], operands[1]);
25543 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25544 ix86_expand_ashl_const (low[0], count, mode);
25546 return;
25549 split_double_mode (mode, operands, 1, low, high);
25551 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25553 if (operands[1] == const1_rtx)
25555 /* Assuming we've chosen QImode-capable registers, then 1 << N
25556 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25557 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25559 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25561 ix86_expand_clear (low[0]);
25562 ix86_expand_clear (high[0]);
25563 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25565 d = gen_lowpart (QImode, low[0]);
25566 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25567 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25568 emit_insn (gen_rtx_SET (d, s));
25570 d = gen_lowpart (QImode, high[0]);
25571 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25572 s = gen_rtx_NE (QImode, flags, const0_rtx);
25573 emit_insn (gen_rtx_SET (d, s));
25576 /* Otherwise, we can get the same results by manually performing
25577 a bit extract operation on bit 5/6, and then performing the two
25578 shifts. The two methods of getting 0/1 into low/high are exactly
25579 the same size. Avoiding the shift in the bit extract case helps
25580 pentium4 a bit; no one else seems to care much either way. */
25581 else
25583 machine_mode half_mode;
25584 rtx (*gen_lshr3)(rtx, rtx, rtx);
25585 rtx (*gen_and3)(rtx, rtx, rtx);
25586 rtx (*gen_xor3)(rtx, rtx, rtx);
25587 HOST_WIDE_INT bits;
25588 rtx x;
25590 if (mode == DImode)
25592 half_mode = SImode;
25593 gen_lshr3 = gen_lshrsi3;
25594 gen_and3 = gen_andsi3;
25595 gen_xor3 = gen_xorsi3;
25596 bits = 5;
25598 else
25600 half_mode = DImode;
25601 gen_lshr3 = gen_lshrdi3;
25602 gen_and3 = gen_anddi3;
25603 gen_xor3 = gen_xordi3;
25604 bits = 6;
25607 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25608 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25609 else
25610 x = gen_lowpart (half_mode, operands[2]);
25611 emit_insn (gen_rtx_SET (high[0], x));
25613 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25614 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25615 emit_move_insn (low[0], high[0]);
25616 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25619 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25620 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25621 return;
25624 if (operands[1] == constm1_rtx)
25626 /* For -1 << N, we can avoid the shld instruction, because we
25627 know that we're shifting 0...31/63 ones into a -1. */
25628 emit_move_insn (low[0], constm1_rtx);
25629 if (optimize_insn_for_size_p ())
25630 emit_move_insn (high[0], low[0]);
25631 else
25632 emit_move_insn (high[0], constm1_rtx);
25634 else
25636 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25638 if (!rtx_equal_p (operands[0], operands[1]))
25639 emit_move_insn (operands[0], operands[1]);
25641 split_double_mode (mode, operands, 1, low, high);
25642 emit_insn (gen_shld (high[0], low[0], operands[2]));
25645 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25647 if (TARGET_CMOVE && scratch)
25649 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25650 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25652 ix86_expand_clear (scratch);
25653 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25655 else
25657 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25658 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25660 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25664 void
25665 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25667 rtx (*gen_ashr3)(rtx, rtx, rtx)
25668 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25669 rtx (*gen_shrd)(rtx, rtx, rtx);
25670 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25672 rtx low[2], high[2];
25673 int count;
25675 if (CONST_INT_P (operands[2]))
25677 split_double_mode (mode, operands, 2, low, high);
25678 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25680 if (count == GET_MODE_BITSIZE (mode) - 1)
25682 emit_move_insn (high[0], high[1]);
25683 emit_insn (gen_ashr3 (high[0], high[0],
25684 GEN_INT (half_width - 1)));
25685 emit_move_insn (low[0], high[0]);
25688 else if (count >= half_width)
25690 emit_move_insn (low[0], high[1]);
25691 emit_move_insn (high[0], low[0]);
25692 emit_insn (gen_ashr3 (high[0], high[0],
25693 GEN_INT (half_width - 1)));
25695 if (count > half_width)
25696 emit_insn (gen_ashr3 (low[0], low[0],
25697 GEN_INT (count - half_width)));
25699 else
25701 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25703 if (!rtx_equal_p (operands[0], operands[1]))
25704 emit_move_insn (operands[0], operands[1]);
25706 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25707 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25710 else
25712 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25714 if (!rtx_equal_p (operands[0], operands[1]))
25715 emit_move_insn (operands[0], operands[1]);
25717 split_double_mode (mode, operands, 1, low, high);
25719 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25720 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25722 if (TARGET_CMOVE && scratch)
25724 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25725 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25727 emit_move_insn (scratch, high[0]);
25728 emit_insn (gen_ashr3 (scratch, scratch,
25729 GEN_INT (half_width - 1)));
25730 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25731 scratch));
25733 else
25735 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25736 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25738 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25743 void
25744 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25746 rtx (*gen_lshr3)(rtx, rtx, rtx)
25747 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25748 rtx (*gen_shrd)(rtx, rtx, rtx);
25749 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25751 rtx low[2], high[2];
25752 int count;
25754 if (CONST_INT_P (operands[2]))
25756 split_double_mode (mode, operands, 2, low, high);
25757 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25759 if (count >= half_width)
25761 emit_move_insn (low[0], high[1]);
25762 ix86_expand_clear (high[0]);
25764 if (count > half_width)
25765 emit_insn (gen_lshr3 (low[0], low[0],
25766 GEN_INT (count - half_width)));
25768 else
25770 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25772 if (!rtx_equal_p (operands[0], operands[1]))
25773 emit_move_insn (operands[0], operands[1]);
25775 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25776 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25779 else
25781 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25783 if (!rtx_equal_p (operands[0], operands[1]))
25784 emit_move_insn (operands[0], operands[1]);
25786 split_double_mode (mode, operands, 1, low, high);
25788 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25789 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25791 if (TARGET_CMOVE && scratch)
25793 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25794 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25796 ix86_expand_clear (scratch);
25797 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25798 scratch));
25800 else
25802 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25803 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25805 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25810 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
25811 static void
25812 predict_jump (int prob)
25814 rtx insn = get_last_insn ();
25815 gcc_assert (JUMP_P (insn));
25816 add_int_reg_note (insn, REG_BR_PROB, prob);
25819 /* Helper function for the string operations below. Test whether the bits of
25820 VARIABLE selected by VALUE are zero (i.e. VARIABLE is aligned to VALUE bytes); if so, jump to the returned label. */
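/* Typical use in the expanders below:
       label = ix86_expand_aligntest (count, 4, true);
       ... emit a 4-byte move or store ...
       emit_label (label);
   so the emitted operation is executed only when the tested bit of COUNT
   (here bit 2) is set.  */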
25821 static rtx_code_label *
25822 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25824 rtx_code_label *label = gen_label_rtx ();
25825 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25826 if (GET_MODE (variable) == DImode)
25827 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25828 else
25829 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25830 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25831 1, label);
25832 if (epilogue)
25833 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25834 else
25835 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25836 return label;
25839 /* Decrease COUNTREG by VALUE. */
25840 static void
25841 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25843 rtx (*gen_add)(rtx, rtx, rtx)
25844 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25846 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25849 /* Zero-extend the possibly SImode EXP to a Pmode register. */
25850 rtx
25851 ix86_zero_extend_to_Pmode (rtx exp)
25853 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25856 /* Divide COUNTREG by SCALE. */
25857 static rtx
25858 scale_counter (rtx countreg, int scale)
25860 rtx sc;
25862 if (scale == 1)
25863 return countreg;
25864 if (CONST_INT_P (countreg))
25865 return GEN_INT (INTVAL (countreg) / scale);
25866 gcc_assert (REG_P (countreg));
25868 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25869 GEN_INT (exact_log2 (scale)),
25870 NULL, 1, OPTAB_DIRECT);
25871 return sc;
25874 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25875 DImode for constant loop counts. */
25877 static machine_mode
25878 counter_mode (rtx count_exp)
25880 if (GET_MODE (count_exp) != VOIDmode)
25881 return GET_MODE (count_exp);
25882 if (!CONST_INT_P (count_exp))
25883 return Pmode;
25884 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25885 return DImode;
25886 return SImode;
25889 /* Copy the address to a Pmode register. This is used for x32 to
25890 truncate DImode TLS address to a SImode register. */
25892 static rtx
25893 ix86_copy_addr_to_reg (rtx addr)
25895 rtx reg;
25896 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25898 reg = copy_addr_to_reg (addr);
25899 REG_POINTER (reg) = 1;
25900 return reg;
25902 else
25904 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25905 reg = copy_to_mode_reg (DImode, addr);
25906 REG_POINTER (reg) = 1;
25907 return gen_rtx_SUBREG (SImode, reg, 0);
25911 /* When ISSETMEM is FALSE, output a simple loop to copy the memory pointed to by SRCPTR
25912 to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall size is COUNT,
25913 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
25914 the memory to VALUE (assumed to be in MODE).
25916 The size is rounded down to a whole number of chunks moved at once.
25917 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
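/* For example, with MODE == SImode and UNROLL == 4 each loop iteration
   handles 16 bytes; COUNT is first masked down to a multiple of 16 and any
   remaining tail bytes are left for the epilogue code emitted elsewhere.  */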
25920 static void
25921 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25922 rtx destptr, rtx srcptr, rtx value,
25923 rtx count, machine_mode mode, int unroll,
25924 int expected_size, bool issetmem)
25926 rtx_code_label *out_label, *top_label;
25927 rtx iter, tmp;
25928 machine_mode iter_mode = counter_mode (count);
25929 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25930 rtx piece_size = GEN_INT (piece_size_n);
25931 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25932 rtx size;
25933 int i;
25935 top_label = gen_label_rtx ();
25936 out_label = gen_label_rtx ();
25937 iter = gen_reg_rtx (iter_mode);
25939 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25940 NULL, 1, OPTAB_DIRECT);
25941 /* Those two should combine. */
25942 if (piece_size == const1_rtx)
25944 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25945 true, out_label);
25946 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25948 emit_move_insn (iter, const0_rtx);
25950 emit_label (top_label);
25952 tmp = convert_modes (Pmode, iter_mode, iter, true);
25954 /* This assert could be relaxed - in that case we'll need to compute
25955 the smallest power of two containing PIECE_SIZE_N and pass it to
25956 offset_address. */
25957 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25958 destmem = offset_address (destmem, tmp, piece_size_n);
25959 destmem = adjust_address (destmem, mode, 0);
25961 if (!issetmem)
25963 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25964 srcmem = adjust_address (srcmem, mode, 0);
25966 /* When unrolling for chips that reorder memory reads and writes,
25967 we could save registers by using a single temporary.
25968 Also, using 4 temporaries is overkill in 32-bit mode. */
25969 if (!TARGET_64BIT && 0)
25971 for (i = 0; i < unroll; i++)
25973 if (i)
25975 destmem =
25976 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25977 srcmem =
25978 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25980 emit_move_insn (destmem, srcmem);
25983 else
25985 rtx tmpreg[4];
25986 gcc_assert (unroll <= 4);
25987 for (i = 0; i < unroll; i++)
25989 tmpreg[i] = gen_reg_rtx (mode);
25990 if (i)
25992 srcmem =
25993 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25995 emit_move_insn (tmpreg[i], srcmem);
25997 for (i = 0; i < unroll; i++)
25999 if (i)
26001 destmem =
26002 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26004 emit_move_insn (destmem, tmpreg[i]);
26008 else
26009 for (i = 0; i < unroll; i++)
26011 if (i)
26012 destmem =
26013 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26014 emit_move_insn (destmem, value);
26017 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26018 true, OPTAB_LIB_WIDEN);
26019 if (tmp != iter)
26020 emit_move_insn (iter, tmp);
26022 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26023 true, top_label);
26024 if (expected_size != -1)
26026 expected_size /= GET_MODE_SIZE (mode) * unroll;
26027 if (expected_size == 0)
26028 predict_jump (0);
26029 else if (expected_size > REG_BR_PROB_BASE)
26030 predict_jump (REG_BR_PROB_BASE - 1);
26031 else
26032 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26034 else
26035 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26036 iter = ix86_zero_extend_to_Pmode (iter);
26037 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26038 true, OPTAB_LIB_WIDEN);
26039 if (tmp != destptr)
26040 emit_move_insn (destptr, tmp);
26041 if (!issetmem)
26043 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26044 true, OPTAB_LIB_WIDEN);
26045 if (tmp != srcptr)
26046 emit_move_insn (srcptr, tmp);
26048 emit_label (out_label);
26051 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
26052 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26053 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26054 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26055 ORIG_VALUE is the original value passed to memset to fill the memory with.
26056 The other arguments have the same meaning as for the previous function. */
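/* For example, a memset to zero whose constant byte count is a multiple of
   4 is emitted as an SImode rep stos (rep stosl) rather than a byte-wise
   rep stosb, with the count register scaled down by 4 accordingly.  */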
26058 static void
26059 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26060 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26061 rtx count,
26062 machine_mode mode, bool issetmem)
26064 rtx destexp;
26065 rtx srcexp;
26066 rtx countreg;
26067 HOST_WIDE_INT rounded_count;
26069 /* If possible, it is shorter to use rep movs.
26070 TODO: Maybe it is better to move this logic to decide_alg. */
26071 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26072 && (!issetmem || orig_value == const0_rtx))
26073 mode = SImode;
26075 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26076 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26078 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26079 GET_MODE_SIZE (mode)));
26080 if (mode != QImode)
26082 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26083 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26084 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26086 else
26087 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26088 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26090 rounded_count
26091 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26092 destmem = shallow_copy_rtx (destmem);
26093 set_mem_size (destmem, rounded_count);
26095 else if (MEM_SIZE_KNOWN_P (destmem))
26096 clear_mem_size (destmem);
26098 if (issetmem)
26100 value = force_reg (mode, gen_lowpart (mode, value));
26101 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26103 else
26105 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26106 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26107 if (mode != QImode)
26109 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26110 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26111 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26113 else
26114 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26115 if (CONST_INT_P (count))
26117 rounded_count
26118 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26119 srcmem = shallow_copy_rtx (srcmem);
26120 set_mem_size (srcmem, rounded_count);
26122 else
26124 if (MEM_SIZE_KNOWN_P (srcmem))
26125 clear_mem_size (srcmem);
26127 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26128 destexp, srcexp));
26132 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26133 DESTMEM.
26134 SRCMEM is passed by pointer so that it can be updated on return.
26135 The return value is the updated DST. */
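/* For instance, a 16-byte piece on a 64-bit SSE target is copied through a
   V2DImode temporary register (one vector load plus one vector store),
   while an 8-byte piece goes through a DImode temporary.  */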
26136 static rtx
26137 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26138 HOST_WIDE_INT size_to_move)
26140 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26141 enum insn_code code;
26142 machine_mode move_mode;
26143 int piece_size, i;
26145 /* Find the widest mode in which we could perform moves.
26146 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
26147 it until a move of that size is supported. */
26148 piece_size = 1 << floor_log2 (size_to_move);
26149 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26150 code = optab_handler (mov_optab, move_mode);
26151 while (code == CODE_FOR_nothing && piece_size > 1)
26153 piece_size >>= 1;
26154 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
26155 code = optab_handler (mov_optab, move_mode);
26158 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26159 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26160 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26162 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26163 move_mode = mode_for_vector (word_mode, nunits);
26164 code = optab_handler (mov_optab, move_mode);
26165 if (code == CODE_FOR_nothing)
26167 move_mode = word_mode;
26168 piece_size = GET_MODE_SIZE (move_mode);
26169 code = optab_handler (mov_optab, move_mode);
26172 gcc_assert (code != CODE_FOR_nothing);
26174 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26175 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26177 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26178 gcc_assert (size_to_move % piece_size == 0);
26179 adjust = GEN_INT (piece_size);
26180 for (i = 0; i < size_to_move; i += piece_size)
26182 /* We move from memory to memory, so we'll need to do it via
26183 a temporary register. */
26184 tempreg = gen_reg_rtx (move_mode);
26185 emit_insn (GEN_FCN (code) (tempreg, src));
26186 emit_insn (GEN_FCN (code) (dst, tempreg));
26188 emit_move_insn (destptr,
26189 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26190 emit_move_insn (srcptr,
26191 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26193 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26194 piece_size);
26195 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26196 piece_size);
26199 /* Update DST and SRC rtx. */
26200 *srcmem = src;
26201 return dst;
26204 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26205 static void
26206 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26207 rtx destptr, rtx srcptr, rtx count, int max_size)
26209 rtx src, dest;
26210 if (CONST_INT_P (count))
26212 HOST_WIDE_INT countval = INTVAL (count);
26213 HOST_WIDE_INT epilogue_size = countval % max_size;
26214 int i;
26216 /* For now MAX_SIZE should be a power of 2. This assert could be
26217 relaxed, but it'll require a bit more complicated epilogue
26218 expanding. */
26219 gcc_assert ((max_size & (max_size - 1)) == 0);
26220 for (i = max_size; i >= 1; i >>= 1)
26222 if (epilogue_size & i)
26223 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26225 return;
26227 if (max_size > 8)
26229 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26230 count, 1, OPTAB_DIRECT);
26231 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26232 count, QImode, 1, 4, false);
26233 return;
26236 /* When single stringop instructions are available, we can cheaply advance the
26237 dest and src pointers. Otherwise we save code size by maintaining an offset
26238 (zero is readily available from the preceding rep operation) and using x86 addressing modes. */
26240 if (TARGET_SINGLE_STRINGOP)
26242 if (max_size > 4)
26244 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26245 src = change_address (srcmem, SImode, srcptr);
26246 dest = change_address (destmem, SImode, destptr);
26247 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26248 emit_label (label);
26249 LABEL_NUSES (label) = 1;
26251 if (max_size > 2)
26253 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26254 src = change_address (srcmem, HImode, srcptr);
26255 dest = change_address (destmem, HImode, destptr);
26256 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26257 emit_label (label);
26258 LABEL_NUSES (label) = 1;
26260 if (max_size > 1)
26262 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26263 src = change_address (srcmem, QImode, srcptr);
26264 dest = change_address (destmem, QImode, destptr);
26265 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26266 emit_label (label);
26267 LABEL_NUSES (label) = 1;
26270 else
26272 rtx offset = force_reg (Pmode, const0_rtx);
26273 rtx tmp;
26275 if (max_size > 4)
26277 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26278 src = change_address (srcmem, SImode, srcptr);
26279 dest = change_address (destmem, SImode, destptr);
26280 emit_move_insn (dest, src);
26281 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26282 true, OPTAB_LIB_WIDEN);
26283 if (tmp != offset)
26284 emit_move_insn (offset, tmp);
26285 emit_label (label);
26286 LABEL_NUSES (label) = 1;
26288 if (max_size > 2)
26290 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26291 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26292 src = change_address (srcmem, HImode, tmp);
26293 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26294 dest = change_address (destmem, HImode, tmp);
26295 emit_move_insn (dest, src);
26296 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26297 true, OPTAB_LIB_WIDEN);
26298 if (tmp != offset)
26299 emit_move_insn (offset, tmp);
26300 emit_label (label);
26301 LABEL_NUSES (label) = 1;
26303 if (max_size > 1)
26305 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26306 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26307 src = change_address (srcmem, QImode, tmp);
26308 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26309 dest = change_address (destmem, QImode, tmp);
26310 emit_move_insn (dest, src);
26311 emit_label (label);
26312 LABEL_NUSES (label) = 1;
26317 /* This function emits stores to fill SIZE_TO_MOVE bytes starting at DESTMEM
26318 with the value PROMOTED_VAL.
26319 DESTPTR is the destination address register, advanced as the stores are emitted.
26320 The return value is the updated DST. */
26321 static rtx
26322 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26323 HOST_WIDE_INT size_to_move)
26325 rtx dst = destmem, adjust;
26326 enum insn_code code;
26327 machine_mode move_mode;
26328 int piece_size, i;
26330 /* Normally store in the mode of PROMOTED_VAL; if SIZE_TO_MOVE is
26331 smaller than that mode, narrow both the mode and PROMOTED_VAL
26332 accordingly. */
26333 move_mode = GET_MODE (promoted_val);
26334 if (move_mode == VOIDmode)
26335 move_mode = QImode;
26336 if (size_to_move < GET_MODE_SIZE (move_mode))
26338 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
26339 promoted_val = gen_lowpart (move_mode, promoted_val);
26341 piece_size = GET_MODE_SIZE (move_mode);
26342 code = optab_handler (mov_optab, move_mode);
26343 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26345 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26347 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26348 gcc_assert (size_to_move % piece_size == 0);
26349 adjust = GEN_INT (piece_size);
26350 for (i = 0; i < size_to_move; i += piece_size)
26352 if (piece_size <= GET_MODE_SIZE (word_mode))
26354 emit_insn (gen_strset (destptr, dst, promoted_val));
26355 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26356 piece_size);
26357 continue;
26360 emit_insn (GEN_FCN (code) (dst, promoted_val));
26362 emit_move_insn (destptr,
26363 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26365 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26366 piece_size);
26369 /* Update DST rtx. */
26370 return dst;
26372 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26373 static void
26374 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26375 rtx count, int max_size)
26377 count =
26378 expand_simple_binop (counter_mode (count), AND, count,
26379 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26380 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26381 gen_lowpart (QImode, value), count, QImode,
26382 1, max_size / 2, true);
26385 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26386 static void
26387 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26388 rtx count, int max_size)
26390 rtx dest;
26392 if (CONST_INT_P (count))
26394 HOST_WIDE_INT countval = INTVAL (count);
26395 HOST_WIDE_INT epilogue_size = countval % max_size;
26396 int i;
26398 /* For now MAX_SIZE should be a power of 2. This assert could be
26399 relaxed, but it'll require a bit more complicated epilogue
26400 expanding. */
26401 gcc_assert ((max_size & (max_size - 1)) == 0);
26402 for (i = max_size; i >= 1; i >>= 1)
26404 if (epilogue_size & i)
26406 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26407 destmem = emit_memset (destmem, destptr, vec_value, i);
26408 else
26409 destmem = emit_memset (destmem, destptr, value, i);
26412 return;
26414 if (max_size > 32)
26416 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26417 return;
26419 if (max_size > 16)
26421 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26422 if (TARGET_64BIT)
26424 dest = change_address (destmem, DImode, destptr);
26425 emit_insn (gen_strset (destptr, dest, value));
26426 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26427 emit_insn (gen_strset (destptr, dest, value));
26429 else
26431 dest = change_address (destmem, SImode, destptr);
26432 emit_insn (gen_strset (destptr, dest, value));
26433 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26434 emit_insn (gen_strset (destptr, dest, value));
26435 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26436 emit_insn (gen_strset (destptr, dest, value));
26437 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26438 emit_insn (gen_strset (destptr, dest, value));
26440 emit_label (label);
26441 LABEL_NUSES (label) = 1;
26443 if (max_size > 8)
26445 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26446 if (TARGET_64BIT)
26448 dest = change_address (destmem, DImode, destptr);
26449 emit_insn (gen_strset (destptr, dest, value));
26451 else
26453 dest = change_address (destmem, SImode, destptr);
26454 emit_insn (gen_strset (destptr, dest, value));
26455 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26456 emit_insn (gen_strset (destptr, dest, value));
26458 emit_label (label);
26459 LABEL_NUSES (label) = 1;
26461 if (max_size > 4)
26463 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26464 dest = change_address (destmem, SImode, destptr);
26465 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26466 emit_label (label);
26467 LABEL_NUSES (label) = 1;
26469 if (max_size > 2)
26471 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26472 dest = change_address (destmem, HImode, destptr);
26473 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26474 emit_label (label);
26475 LABEL_NUSES (label) = 1;
26477 if (max_size > 1)
26479 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26480 dest = change_address (destmem, QImode, destptr);
26481 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26482 emit_label (label);
26483 LABEL_NUSES (label) = 1;
26487 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26488 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26489 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26490 ignored.
26491 Return value is updated DESTMEM. */
26492 static rtx
26493 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26494 rtx destptr, rtx srcptr, rtx value,
26495 rtx vec_value, rtx count, int align,
26496 int desired_alignment, bool issetmem)
26498 int i;
26499 for (i = 1; i < desired_alignment; i <<= 1)
26501 if (align <= i)
26503 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26504 if (issetmem)
26506 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26507 destmem = emit_memset (destmem, destptr, vec_value, i);
26508 else
26509 destmem = emit_memset (destmem, destptr, value, i);
26511 else
26512 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26513 ix86_adjust_counter (count, i);
26514 emit_label (label);
26515 LABEL_NUSES (label) = 1;
26516 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26519 return destmem;
26522 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26523 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26524 and jump to DONE_LABEL. */
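/* E.g. with SIZE == 16 on an SSE target this emits one 16-byte copy (or
   store) at the start of the block and another ending exactly at
   DESTPTR + COUNT; the two overlapping accesses cover any length from
   16 to 31 bytes.  */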
26525 static void
26526 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26527 rtx destptr, rtx srcptr,
26528 rtx value, rtx vec_value,
26529 rtx count, int size,
26530 rtx done_label, bool issetmem)
26532 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26533 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
26534 rtx modesize;
26535 int n;
26537 /* If we do not have vector value to copy, we must reduce size. */
26538 if (issetmem)
26540 if (!vec_value)
26542 if (GET_MODE (value) == VOIDmode && size > 8)
26543 mode = Pmode;
26544 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26545 mode = GET_MODE (value);
26547 else
26548 mode = GET_MODE (vec_value), value = vec_value;
26550 else
26552 /* Choose appropriate vector mode. */
26553 if (size >= 32)
26554 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26555 else if (size >= 16)
26556 mode = TARGET_SSE ? V16QImode : DImode;
26557 srcmem = change_address (srcmem, mode, srcptr);
26559 destmem = change_address (destmem, mode, destptr);
26560 modesize = GEN_INT (GET_MODE_SIZE (mode));
26561 gcc_assert (GET_MODE_SIZE (mode) <= size);
26562 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26564 if (issetmem)
26565 emit_move_insn (destmem, gen_lowpart (mode, value));
26566 else
26568 emit_move_insn (destmem, srcmem);
26569 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26571 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26574 destmem = offset_address (destmem, count, 1);
26575 destmem = offset_address (destmem, GEN_INT (-2 * size),
26576 GET_MODE_SIZE (mode));
26577 if (!issetmem)
26579 srcmem = offset_address (srcmem, count, 1);
26580 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26581 GET_MODE_SIZE (mode));
26583 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26585 if (issetmem)
26586 emit_move_insn (destmem, gen_lowpart (mode, value));
26587 else
26589 emit_move_insn (destmem, srcmem);
26590 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26592 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26594 emit_jump_insn (gen_jump (done_label));
26595 emit_barrier ();
26597 emit_label (label);
26598 LABEL_NUSES (label) = 1;
26601 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2),
26602 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26603 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
26604 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26605 DONE_LABEL is a label after the whole copying sequence. The label is created
26606 on demand if *DONE_LABEL is NULL.
26607 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
26608 bounds after the initial copies.
26610 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26611 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26612 we will dispatch to a library call for large blocks.
26614 In pseudocode we do:
26616 if (COUNT < SIZE)
26618 Assume that SIZE is 4. Bigger sizes are handled analogously
26619 if (COUNT & 4)
26621 copy 4 bytes from SRCPTR to DESTPTR
26622 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26623 goto done_label
26625 if (!COUNT)
26626 goto done_label;
26627 copy 1 byte from SRCPTR to DESTPTR
26628 if (COUNT & 2)
26630 copy 2 bytes from SRCPTR to DESTPTR
26631 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26634 else
26636 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26637 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26639 OLD_DESTPTR = DESTPTR;
26640 Align DESTPTR up to DESIRED_ALIGN
26641 SRCPTR += DESTPTR - OLD_DESTPTR
26642 COUNT -= DESTPTR - OLD_DESTPTR
26643 if (DYNAMIC_CHECK)
26644 Round COUNT down to multiple of SIZE
26645 << optional caller supplied zero size guard is here >>
26646 << optional caller supplied dynamic check is here >>
26647 << caller supplied main copy loop is here >>
26649 done_label:
26651 static void
26652 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26653 rtx *destptr, rtx *srcptr,
26654 machine_mode mode,
26655 rtx value, rtx vec_value,
26656 rtx *count,
26657 rtx_code_label **done_label,
26658 int size,
26659 int desired_align,
26660 int align,
26661 unsigned HOST_WIDE_INT *min_size,
26662 bool dynamic_check,
26663 bool issetmem)
26665 rtx_code_label *loop_label = NULL, *label;
26666 int n;
26667 rtx modesize;
26668 int prolog_size = 0;
26669 rtx mode_value;
26671 /* Choose the proper value to copy. */
26672 if (issetmem && VECTOR_MODE_P (mode))
26673 mode_value = vec_value;
26674 else
26675 mode_value = value;
26676 gcc_assert (GET_MODE_SIZE (mode) <= size);
26678 /* See if block is big or small, handle small blocks. */
26679 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26681 int size2 = size;
26682 loop_label = gen_label_rtx ();
26684 if (!*done_label)
26685 *done_label = gen_label_rtx ();
26687 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26688 1, loop_label);
26689 size2 >>= 1;
26691 /* Handle sizes > 3. */
26692 for (;size2 > 2; size2 >>= 1)
26693 expand_small_movmem_or_setmem (destmem, srcmem,
26694 *destptr, *srcptr,
26695 value, vec_value,
26696 *count,
26697 size2, *done_label, issetmem);
26698 /* Nothing to copy? Jump to DONE_LABEL if so */
26699 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26700 1, *done_label);
26702 /* Do a byte copy. */
26703 destmem = change_address (destmem, QImode, *destptr);
26704 if (issetmem)
26705 emit_move_insn (destmem, gen_lowpart (QImode, value));
26706 else
26708 srcmem = change_address (srcmem, QImode, *srcptr);
26709 emit_move_insn (destmem, srcmem);
26712 /* Handle sizes 2 and 3. */
26713 label = ix86_expand_aligntest (*count, 2, false);
26714 destmem = change_address (destmem, HImode, *destptr);
26715 destmem = offset_address (destmem, *count, 1);
26716 destmem = offset_address (destmem, GEN_INT (-2), 2);
26717 if (issetmem)
26718 emit_move_insn (destmem, gen_lowpart (HImode, value));
26719 else
26721 srcmem = change_address (srcmem, HImode, *srcptr);
26722 srcmem = offset_address (srcmem, *count, 1);
26723 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26724 emit_move_insn (destmem, srcmem);
26727 emit_label (label);
26728 LABEL_NUSES (label) = 1;
26729 emit_jump_insn (gen_jump (*done_label));
26730 emit_barrier ();
26732 else
26733 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26734 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26736 /* Start memcpy for COUNT >= SIZE. */
26737 if (loop_label)
26739 emit_label (loop_label);
26740 LABEL_NUSES (loop_label) = 1;
26743 /* Copy first desired_align bytes. */
26744 if (!issetmem)
26745 srcmem = change_address (srcmem, mode, *srcptr);
26746 destmem = change_address (destmem, mode, *destptr);
26747 modesize = GEN_INT (GET_MODE_SIZE (mode));
26748 for (n = 0; prolog_size < desired_align - align; n++)
26750 if (issetmem)
26751 emit_move_insn (destmem, mode_value);
26752 else
26754 emit_move_insn (destmem, srcmem);
26755 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26757 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26758 prolog_size += GET_MODE_SIZE (mode);
26762 /* Copy last SIZE bytes. */
26763 destmem = offset_address (destmem, *count, 1);
26764 destmem = offset_address (destmem,
26765 GEN_INT (-size - prolog_size),
26767 if (issetmem)
26768 emit_move_insn (destmem, mode_value);
26769 else
26771 srcmem = offset_address (srcmem, *count, 1);
26772 srcmem = offset_address (srcmem,
26773 GEN_INT (-size - prolog_size),
26775 emit_move_insn (destmem, srcmem);
26777 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26779 destmem = offset_address (destmem, modesize, 1);
26780 if (issetmem)
26781 emit_move_insn (destmem, mode_value);
26782 else
26784 srcmem = offset_address (srcmem, modesize, 1);
26785 emit_move_insn (destmem, srcmem);
26789 /* Align destination. */
26790 if (desired_align > 1 && desired_align > align)
26792 rtx saveddest = *destptr;
26794 gcc_assert (desired_align <= size);
26795 /* Align destptr up, place it to new register. */
26796 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26797 GEN_INT (prolog_size),
26798 NULL_RTX, 1, OPTAB_DIRECT);
26799 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26800 REG_POINTER (*destptr) = 1;
26801 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26802 GEN_INT (-desired_align),
26803 *destptr, 1, OPTAB_DIRECT);
26804 /* See how many bytes we skipped. */
26805 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26806 *destptr,
26807 saveddest, 1, OPTAB_DIRECT);
26808 /* Adjust srcptr and count. */
26809 if (!issetmem)
26810 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26811 saveddest, *srcptr, 1, OPTAB_DIRECT);
26812 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26813 saveddest, *count, 1, OPTAB_DIRECT);
26814 /* We copied at most size + prolog_size. */
26815 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26816 *min_size
26817 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26818 else
26819 *min_size = 0;
26821 /* Our loops always round down the block size, but for dispatch to the
26822 library call we need the precise value. */
26823 if (dynamic_check)
26824 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26825 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26827 else
26829 gcc_assert (prolog_size == 0);
26830 /* Decrease count, so we won't end up copying last word twice. */
26831 if (!CONST_INT_P (*count))
26832 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26833 constm1_rtx, *count, 1, OPTAB_DIRECT);
26834 else
26835 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26836 (unsigned HOST_WIDE_INT)size));
26837 if (*min_size)
26838 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26843 /* This function is like the previous one, except here we know how many bytes
26844 need to be copied. That allows us to update alignment not only of DST, which
26845 is returned, but also of SRC, which is passed as a pointer for that
26846 reason. */
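/* Illustrative sketch (not part of the emitted code): with
   DESIRED_ALIGN == 8 and ALIGN_BYTES == 7, the loop below walks the set
   bits of ALIGN_BYTES and emits roughly

     memcpy (dst, src, 1);  dst += 1;  src += 1;   -- bit 0
     memcpy (dst, src, 2);  dst += 2;  src += 2;   -- bit 1
     memcpy (dst, src, 4);  dst += 4;  src += 4;   -- bit 2

   after which exactly ALIGN_BYTES bytes have been copied and DST is
   aligned to DESIRED_ALIGN.  */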
26847 static rtx
26848 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26849 rtx srcreg, rtx value, rtx vec_value,
26850 int desired_align, int align_bytes,
26851 bool issetmem)
26853 rtx src = NULL;
26854 rtx orig_dst = dst;
26855 rtx orig_src = NULL;
26856 int piece_size = 1;
26857 int copied_bytes = 0;
26859 if (!issetmem)
26861 gcc_assert (srcp != NULL);
26862 src = *srcp;
26863 orig_src = src;
26866 for (piece_size = 1;
26867 piece_size <= desired_align && copied_bytes < align_bytes;
26868 piece_size <<= 1)
26870 if (align_bytes & piece_size)
26872 if (issetmem)
26874 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26875 dst = emit_memset (dst, destreg, vec_value, piece_size);
26876 else
26877 dst = emit_memset (dst, destreg, value, piece_size);
26879 else
26880 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26881 copied_bytes += piece_size;
26884 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26885 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26886 if (MEM_SIZE_KNOWN_P (orig_dst))
26887 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26889 if (!issetmem)
26891 int src_align_bytes = get_mem_align_offset (src, desired_align
26892 * BITS_PER_UNIT);
26893 if (src_align_bytes >= 0)
26894 src_align_bytes = desired_align - src_align_bytes;
26895 if (src_align_bytes >= 0)
26897 unsigned int src_align;
26898 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26900 if ((src_align_bytes & (src_align - 1))
26901 == (align_bytes & (src_align - 1)))
26902 break;
26904 if (src_align > (unsigned int) desired_align)
26905 src_align = desired_align;
26906 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26907 set_mem_align (src, src_align * BITS_PER_UNIT);
26909 if (MEM_SIZE_KNOWN_P (orig_src))
26910 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26911 *srcp = src;
26914 return dst;
26917 /* Return true if ALG can be used in current context.
26918 Assume we expand memset if MEMSET is true. */
26919 static bool
26920 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26922 if (alg == no_stringop)
26923 return false;
26924 if (alg == vector_loop)
26925 return TARGET_SSE || TARGET_AVX;
26926 /* Algorithms using the rep prefix want at least edi and ecx;
26927 additionally, memset wants eax and memcpy wants esi. Don't
26928 consider such algorithms if the user has appropriated those
26929 registers for their own purposes, or if we have a non-default
26930 address space, since some string insns cannot override the segment. */
26931 if (alg == rep_prefix_1_byte
26932 || alg == rep_prefix_4_byte
26933 || alg == rep_prefix_8_byte)
26935 if (have_as)
26936 return false;
26937 if (fixed_regs[CX_REG]
26938 || fixed_regs[DI_REG]
26939 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26940 return false;
26942 return true;
26945 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26946 static enum stringop_alg
26947 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26948 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26949 bool memset, bool zero_memset, bool have_as,
26950 int *dynamic_check, bool *noalign, bool recur)
26952 const struct stringop_algs *algs;
26953 bool optimize_for_speed;
26954 int max = 0;
26955 const struct processor_costs *cost;
26956 int i;
26957 bool any_alg_usable_p = false;
26959 *noalign = false;
26960 *dynamic_check = -1;
26962 /* Even if the string operation call is cold, we still might spend a lot
26963 of time processing large blocks. */
26964 if (optimize_function_for_size_p (cfun)
26965 || (optimize_insn_for_size_p ()
26966 && (max_size < 256
26967 || (expected_size != -1 && expected_size < 256))))
26968 optimize_for_speed = false;
26969 else
26970 optimize_for_speed = true;
26972 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26973 if (memset)
26974 algs = &cost->memset[TARGET_64BIT != 0];
26975 else
26976 algs = &cost->memcpy[TARGET_64BIT != 0];
26978 /* Find the maximal size for the user-defined algorithm. */
26979 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26981 enum stringop_alg candidate = algs->size[i].alg;
26982 bool usable = alg_usable_p (candidate, memset, have_as);
26983 any_alg_usable_p |= usable;
26985 if (candidate != libcall && candidate && usable)
26986 max = algs->size[i].max;
26989 /* If the expected size is not known but the max size is small enough
26990 that the inline version is a win, set the expected size into
26991 the range. */
26992 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26993 && expected_size == -1)
26994 expected_size = min_size / 2 + max_size / 2;
26996 /* If user specified the algorithm, honor it if possible. */
26997 if (ix86_stringop_alg != no_stringop
26998 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26999 return ix86_stringop_alg;
27000 /* rep; movq or rep; movl is the smallest variant. */
27001 else if (!optimize_for_speed)
27003 *noalign = true;
27004 if (!count || (count & 3) || (memset && !zero_memset))
27005 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27006 ? rep_prefix_1_byte : loop_1_byte;
27007 else
27008 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27009 ? rep_prefix_4_byte : loop;
27011 /* Very tiny blocks are best handled via the loop, since REP is expensive to
27012 set up. */
27013 else if (expected_size != -1 && expected_size < 4)
27014 return loop_1_byte;
27015 else if (expected_size != -1)
27017 enum stringop_alg alg = libcall;
27018 bool alg_noalign = false;
27019 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27021 /* We get here if the algorithms that were not libcall-based
27022 were rep-prefix based and we are unable to use rep prefixes
27023 based on global register usage. Break out of the loop and
27024 use the heuristic below. */
27025 if (algs->size[i].max == 0)
27026 break;
27027 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27029 enum stringop_alg candidate = algs->size[i].alg;
27031 if (candidate != libcall
27032 && alg_usable_p (candidate, memset, have_as))
27034 alg = candidate;
27035 alg_noalign = algs->size[i].noalign;
27037 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27038 last non-libcall inline algorithm. */
27039 if (TARGET_INLINE_ALL_STRINGOPS)
27041 /* When the current size is best to be copied by a libcall,
27042 but we are still forced to inline, run the heuristic below
27043 that will pick code for medium sized blocks. */
27044 if (alg != libcall)
27046 *noalign = alg_noalign;
27047 return alg;
27049 else if (!any_alg_usable_p)
27050 break;
27052 else if (alg_usable_p (candidate, memset, have_as))
27054 *noalign = algs->size[i].noalign;
27055 return candidate;
27060 /* When asked to inline the call anyway, try to pick a meaningful choice.
27061 We look for the maximal size of block that is faster to copy by hand and
27062 take blocks of at most that size, guessing that the average size will
27063 be roughly half of the block.
27065 If this turns out to be bad, we might simply specify the preferred
27066 choice in ix86_costs. */
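/* For example (an illustration of the heuristic, not a guarantee): if the
   cost table allows inline expansion for blocks of up to MAX == 8192
   bytes, we guess an average block of 8192 / 2 == 4096 bytes and re-run
   decide_alg with that expected size; when no maximum is known,
   4096 / 2 == 2048 is used instead.  */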
27067 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27068 && (algs->unknown_size == libcall
27069 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27071 enum stringop_alg alg;
27072 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27074 /* If there aren't any usable algorithms or if recursing already,
27075 then recursing on smaller sizes or same size isn't going to
27076 find anything. Just return the simple byte-at-a-time copy loop. */
27077 if (!any_alg_usable_p || recur)
27079 /* Pick something reasonable. */
27080 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27081 *dynamic_check = 128;
27082 return loop_1_byte;
27084 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27085 zero_memset, have_as, dynamic_check, noalign, true);
27086 gcc_assert (*dynamic_check == -1);
27087 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27088 *dynamic_check = max;
27089 else
27090 gcc_assert (alg != libcall);
27091 return alg;
27093 return (alg_usable_p (algs->unknown_size, memset, have_as)
27094 ? algs->unknown_size : libcall);
27097 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27098 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27099 static int
27100 decide_alignment (int align,
27101 enum stringop_alg alg,
27102 int expected_size,
27103 machine_mode move_mode)
27105 int desired_align = 0;
27107 gcc_assert (alg != no_stringop);
27109 if (alg == libcall)
27110 return 0;
27111 if (move_mode == VOIDmode)
27112 return 0;
27114 desired_align = GET_MODE_SIZE (move_mode);
27115 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
27116 copying a whole cache line at once. */
27117 if (TARGET_PENTIUMPRO
27118 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27119 desired_align = 8;
27121 if (optimize_size)
27122 desired_align = 1;
27123 if (desired_align < align)
27124 desired_align = align;
27125 if (expected_size != -1 && expected_size < 4)
27126 desired_align = align;
27128 return desired_align;
27132 /* Helper function for memset. For a QImode value 0xXY produce
27133 0xXYXYXYXY of the width specified by MODE. This is essentially
27134 a multiplication by 0x01010101, but we can do slightly better than
27135 synth_mult by unwinding the sequence by hand on CPUs with
27136 slow multiply. */
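/* Worked example (illustrative only): broadcasting 0xAB to SImode can be
   done either as the multiplication

     0xAB * 0x01010101 == 0xABABABAB

   or, on CPUs with a slow multiply, by a shift/or sequence such as

     v |= v << 8;    -- 0x0000ABAB
     v |= v << 16;   -- 0xABABABAB

   with one more shift-by-32/or step for DImode.  The code below picks
   between these variants based on the cost tables.  */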
27137 static rtx
27138 promote_duplicated_reg (machine_mode mode, rtx val)
27140 machine_mode valmode = GET_MODE (val);
27141 rtx tmp;
27142 int nops = mode == DImode ? 3 : 2;
27144 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27145 if (val == const0_rtx)
27146 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27147 if (CONST_INT_P (val))
27149 HOST_WIDE_INT v = INTVAL (val) & 255;
27151 v |= v << 8;
27152 v |= v << 16;
27153 if (mode == DImode)
27154 v |= (v << 16) << 16;
27155 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27158 if (valmode == VOIDmode)
27159 valmode = QImode;
27160 if (valmode != QImode)
27161 val = gen_lowpart (QImode, val);
27162 if (mode == QImode)
27163 return val;
27164 if (!TARGET_PARTIAL_REG_STALL)
27165 nops--;
27166 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27167 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27168 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27169 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27171 rtx reg = convert_modes (mode, QImode, val, true);
27172 tmp = promote_duplicated_reg (mode, const1_rtx);
27173 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27174 OPTAB_DIRECT);
27176 else
27178 rtx reg = convert_modes (mode, QImode, val, true);
27180 if (!TARGET_PARTIAL_REG_STALL)
27181 if (mode == SImode)
27182 emit_insn (gen_insvsi_1 (reg, reg));
27183 else
27184 emit_insn (gen_insvdi_1 (reg, reg));
27185 else
27187 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27188 NULL, 1, OPTAB_DIRECT);
27189 reg =
27190 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27192 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27193 NULL, 1, OPTAB_DIRECT);
27194 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27195 if (mode == SImode)
27196 return reg;
27197 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27198 NULL, 1, OPTAB_DIRECT);
27199 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27200 return reg;
27204 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27205 be needed by the main loop copying SIZE_NEEDED chunks and by the prologue getting
27206 alignment from ALIGN to DESIRED_ALIGN. */
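/* For instance (illustrative only): on a 64-bit target with
   SIZE_NEEDED == 16 the value is promoted to DImode (0xXY becomes
   0xXYXYXYXYXYXYXYXY), while with SIZE_NEEDED == 2 and no extra
   alignment work an HImode promotion (0xXYXY) is enough.  */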
27207 static rtx
27208 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27209 int align)
27211 rtx promoted_val;
27213 if (TARGET_64BIT
27214 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27215 promoted_val = promote_duplicated_reg (DImode, val);
27216 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27217 promoted_val = promote_duplicated_reg (SImode, val);
27218 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27219 promoted_val = promote_duplicated_reg (HImode, val);
27220 else
27221 promoted_val = val;
27223 return promoted_val;
27226 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27227 operations when profitable. The code depends upon architecture, block size
27228 and alignment, but always has one of the following overall structures:
27230 Aligned move sequence:
27232 1) Prologue guard: Conditional that jumps up to epilogues for small
27233 blocks that can be handled by the epilogue alone. This is faster
27234 but also needed for correctness, since the prologue assumes the block
27235 is larger than the desired alignment.
27237 Optional dynamic check for size and libcall for large
27238 blocks is emitted here too, with -minline-stringops-dynamically.
27240 2) Prologue: copy first few bytes in order to get destination
27241 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27242 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27243 copied. We emit either a jump tree on power of two sized
27244 blocks, or a byte loop.
27246 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27247 with specified algorithm.
27249 4) Epilogue: code copying tail of the block that is too small to be
27250 handled by main body (or up to size guarded by prologue guard).
27252 Misaligned move sequence
27254 1) misaligned move prologue/epilogue containing:
27255 a) Prologue handling small memory blocks and jumping to done_label
27256 (skipped if blocks are known to be large enough)
27257 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
27258 needed by a single possibly misaligned move
27259 (skipped if alignment is not needed)
27260 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
27262 2) Zero size guard dispatching to done_label, if needed
27264 3) Dispatch to a library call, if needed
27266 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27267 with specified algorithm. */
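/* As a concrete illustration (a sketch only; the exact code depends on
   the chosen algorithm and target), a variable-sized memset expanded
   with the aligned sequence looks roughly like:

     if (count < epilogue_size_needed)
       goto epilogue;
     <copy a few bytes so that dest becomes DESIRED_ALIGN aligned>
     <main loop or rep-prefixed insn storing size_needed bytes per step>
   epilogue:
     <store the remaining count % size_needed bytes>
  */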
27268 bool
27269 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27270 rtx align_exp, rtx expected_align_exp,
27271 rtx expected_size_exp, rtx min_size_exp,
27272 rtx max_size_exp, rtx probable_max_size_exp,
27273 bool issetmem)
27275 rtx destreg;
27276 rtx srcreg = NULL;
27277 rtx_code_label *label = NULL;
27278 rtx tmp;
27279 rtx_code_label *jump_around_label = NULL;
27280 HOST_WIDE_INT align = 1;
27281 unsigned HOST_WIDE_INT count = 0;
27282 HOST_WIDE_INT expected_size = -1;
27283 int size_needed = 0, epilogue_size_needed;
27284 int desired_align = 0, align_bytes = 0;
27285 enum stringop_alg alg;
27286 rtx promoted_val = NULL;
27287 rtx vec_promoted_val = NULL;
27288 bool force_loopy_epilogue = false;
27289 int dynamic_check;
27290 bool need_zero_guard = false;
27291 bool noalign;
27292 machine_mode move_mode = VOIDmode;
27293 int unroll_factor = 1;
27294 /* TODO: Once value ranges are available, fill in proper data. */
27295 unsigned HOST_WIDE_INT min_size = 0;
27296 unsigned HOST_WIDE_INT max_size = -1;
27297 unsigned HOST_WIDE_INT probable_max_size = -1;
27298 bool misaligned_prologue_used = false;
27299 bool have_as;
27301 if (CONST_INT_P (align_exp))
27302 align = INTVAL (align_exp);
27303 /* i386 can do misaligned access at a reasonably increased cost. */
27304 if (CONST_INT_P (expected_align_exp)
27305 && INTVAL (expected_align_exp) > align)
27306 align = INTVAL (expected_align_exp);
27307 /* ALIGN is the minimum of destination and source alignment, but we care here
27308 just about destination alignment. */
27309 else if (!issetmem
27310 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27311 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27313 if (CONST_INT_P (count_exp))
27315 min_size = max_size = probable_max_size = count = expected_size
27316 = INTVAL (count_exp);
27317 /* When COUNT is 0, there is nothing to do. */
27318 if (!count)
27319 return true;
27321 else
27323 if (min_size_exp)
27324 min_size = INTVAL (min_size_exp);
27325 if (max_size_exp)
27326 max_size = INTVAL (max_size_exp);
27327 if (probable_max_size_exp)
27328 probable_max_size = INTVAL (probable_max_size_exp);
27329 if (CONST_INT_P (expected_size_exp))
27330 expected_size = INTVAL (expected_size_exp);
27333 /* Make sure we don't need to care about overflow later on. */
27334 if (count > (HOST_WIDE_INT_1U << 30))
27335 return false;
27337 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27338 if (!issetmem)
27339 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27341 /* Step 0: Decide on preferred algorithm, desired alignment and
27342 size of chunks to be copied by main loop. */
27343 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27344 issetmem,
27345 issetmem && val_exp == const0_rtx, have_as,
27346 &dynamic_check, &noalign, false);
27347 if (alg == libcall)
27348 return false;
27349 gcc_assert (alg != no_stringop);
27351 /* For now the vector version of memset is generated only for memory zeroing, as
27352 creating the promoted vector value is very cheap in this case. */
27353 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27354 alg = unrolled_loop;
27356 if (!count)
27357 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27358 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27359 if (!issetmem)
27360 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27362 unroll_factor = 1;
27363 move_mode = word_mode;
27364 switch (alg)
27366 case libcall:
27367 case no_stringop:
27368 case last_alg:
27369 gcc_unreachable ();
27370 case loop_1_byte:
27371 need_zero_guard = true;
27372 move_mode = QImode;
27373 break;
27374 case loop:
27375 need_zero_guard = true;
27376 break;
27377 case unrolled_loop:
27378 need_zero_guard = true;
27379 unroll_factor = (TARGET_64BIT ? 4 : 2);
27380 break;
27381 case vector_loop:
27382 need_zero_guard = true;
27383 unroll_factor = 4;
27384 /* Find the widest supported mode. */
27385 move_mode = word_mode;
27386 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
27387 != CODE_FOR_nothing)
27388 move_mode = GET_MODE_WIDER_MODE (move_mode);
27390 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27391 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27392 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27394 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27395 move_mode = mode_for_vector (word_mode, nunits);
27396 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27397 move_mode = word_mode;
27399 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27400 break;
27401 case rep_prefix_8_byte:
27402 move_mode = DImode;
27403 break;
27404 case rep_prefix_4_byte:
27405 move_mode = SImode;
27406 break;
27407 case rep_prefix_1_byte:
27408 move_mode = QImode;
27409 break;
27411 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27412 epilogue_size_needed = size_needed;
27414 /* If we are going to call any library calls conditionally, make sure any
27415 pending stack adjustment happen before the first conditional branch,
27416 otherwise they will be emitted before the library call only and won't
27417 happen from the other branches. */
27418 if (dynamic_check != -1)
27419 do_pending_stack_adjust ();
27421 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27422 if (!TARGET_ALIGN_STRINGOPS || noalign)
27423 align = desired_align;
27425 /* Step 1: Prologue guard. */
27427 /* Alignment code needs count to be in register. */
27428 if (CONST_INT_P (count_exp) && desired_align > align)
27430 if (INTVAL (count_exp) > desired_align
27431 && INTVAL (count_exp) > size_needed)
27433 align_bytes
27434 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27435 if (align_bytes <= 0)
27436 align_bytes = 0;
27437 else
27438 align_bytes = desired_align - align_bytes;
27440 if (align_bytes == 0)
27441 count_exp = force_reg (counter_mode (count_exp), count_exp);
27443 gcc_assert (desired_align >= 1 && align >= 1);
27445 /* Misaligned move sequences handle both prologue and epilogue at once.
27446 Default code generation results in smaller code for large alignments
27447 and also avoids redundant work when sizes are known precisely. */
27448 misaligned_prologue_used
27449 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27450 && MAX (desired_align, epilogue_size_needed) <= 32
27451 && desired_align <= epilogue_size_needed
27452 && ((desired_align > align && !align_bytes)
27453 || (!count && epilogue_size_needed > 1)));
27455 /* Do the cheap promotion to allow better CSE across the
27456 main loop and epilogue (i.e. one load of the big constant in
27457 front of all the code).
27458 For now the misaligned move sequences do not have a fast path
27459 without broadcasting. */
27460 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27462 if (alg == vector_loop)
27464 gcc_assert (val_exp == const0_rtx);
27465 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27466 promoted_val = promote_duplicated_reg_to_size (val_exp,
27467 GET_MODE_SIZE (word_mode),
27468 desired_align, align);
27470 else
27472 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27473 desired_align, align);
27476 /* Misaligned move sequences handle both prologues and epilogues at once.
27477 Default code generation results in smaller code for large alignments and
27478 also avoids redundant work when sizes are known precisely. */
27479 if (misaligned_prologue_used)
27481 /* The misaligned move prologue handles small blocks by itself. */
27482 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27483 (dst, src, &destreg, &srcreg,
27484 move_mode, promoted_val, vec_promoted_val,
27485 &count_exp,
27486 &jump_around_label,
27487 desired_align < align
27488 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27489 desired_align, align, &min_size, dynamic_check, issetmem);
27490 if (!issetmem)
27491 src = change_address (src, BLKmode, srcreg);
27492 dst = change_address (dst, BLKmode, destreg);
27493 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27494 epilogue_size_needed = 0;
27495 if (need_zero_guard
27496 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27498 /* It is possible that we copied enough so the main loop will not
27499 execute. */
27500 gcc_assert (size_needed > 1);
27501 if (jump_around_label == NULL_RTX)
27502 jump_around_label = gen_label_rtx ();
27503 emit_cmp_and_jump_insns (count_exp,
27504 GEN_INT (size_needed),
27505 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27506 if (expected_size == -1
27507 || expected_size < (desired_align - align) / 2 + size_needed)
27508 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27509 else
27510 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27513 /* Ensure that alignment prologue won't copy past end of block. */
27514 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27516 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27517 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27518 Make sure it is power of 2. */
27519 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27521 /* To improve performance of small blocks, we jump around the VAL
27522 promoting code. This means that if the promoted VAL is not constant,
27523 we might not use it in the epilogue and have to use the byte
27524 loop variant. */
27525 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27526 force_loopy_epilogue = true;
27527 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27528 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27530 /* If main algorithm works on QImode, no epilogue is needed.
27531 For small sizes just don't align anything. */
27532 if (size_needed == 1)
27533 desired_align = align;
27534 else
27535 goto epilogue;
27537 else if (!count
27538 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27540 label = gen_label_rtx ();
27541 emit_cmp_and_jump_insns (count_exp,
27542 GEN_INT (epilogue_size_needed),
27543 LTU, 0, counter_mode (count_exp), 1, label);
27544 if (expected_size == -1 || expected_size < epilogue_size_needed)
27545 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27546 else
27547 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27551 /* Emit code to decide on runtime whether library call or inline should be
27552 used. */
27553 if (dynamic_check != -1)
27555 if (!issetmem && CONST_INT_P (count_exp))
27557 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27559 emit_block_copy_via_libcall (dst, src, count_exp);
27560 count_exp = const0_rtx;
27561 goto epilogue;
27564 else
27566 rtx_code_label *hot_label = gen_label_rtx ();
27567 if (jump_around_label == NULL_RTX)
27568 jump_around_label = gen_label_rtx ();
27569 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27570 LEU, 0, counter_mode (count_exp),
27571 1, hot_label);
27572 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27573 if (issetmem)
27574 set_storage_via_libcall (dst, count_exp, val_exp);
27575 else
27576 emit_block_copy_via_libcall (dst, src, count_exp);
27577 emit_jump (jump_around_label);
27578 emit_label (hot_label);
27582 /* Step 2: Alignment prologue. */
27583 /* Do the expensive promotion once we branched off the small blocks. */
27584 if (issetmem && !promoted_val)
27585 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27586 desired_align, align);
27588 if (desired_align > align && !misaligned_prologue_used)
27590 if (align_bytes == 0)
27592 /* Except for the first move in the prologue, we no longer know
27593 the constant offset in the aliasing info. It doesn't seem worth
27594 the pain to maintain it for the first move, so throw away
27595 the info early. */
27596 dst = change_address (dst, BLKmode, destreg);
27597 if (!issetmem)
27598 src = change_address (src, BLKmode, srcreg);
27599 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27600 promoted_val, vec_promoted_val,
27601 count_exp, align, desired_align,
27602 issetmem);
27603 /* At most desired_align - align bytes are copied. */
27604 if (min_size < (unsigned)(desired_align - align))
27605 min_size = 0;
27606 else
27607 min_size -= desired_align - align;
27609 else
27611 /* If we know how many bytes need to be stored before dst is
27612 sufficiently aligned, maintain aliasing info accurately. */
27613 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27614 srcreg,
27615 promoted_val,
27616 vec_promoted_val,
27617 desired_align,
27618 align_bytes,
27619 issetmem);
27621 count_exp = plus_constant (counter_mode (count_exp),
27622 count_exp, -align_bytes);
27623 count -= align_bytes;
27624 min_size -= align_bytes;
27625 max_size -= align_bytes;
27627 if (need_zero_guard
27628 && min_size < (unsigned HOST_WIDE_INT) size_needed
27629 && (count < (unsigned HOST_WIDE_INT) size_needed
27630 || (align_bytes == 0
27631 && count < ((unsigned HOST_WIDE_INT) size_needed
27632 + desired_align - align))))
27634 /* It is possible that we copied enough so the main loop will not
27635 execute. */
27636 gcc_assert (size_needed > 1);
27637 if (label == NULL_RTX)
27638 label = gen_label_rtx ();
27639 emit_cmp_and_jump_insns (count_exp,
27640 GEN_INT (size_needed),
27641 LTU, 0, counter_mode (count_exp), 1, label);
27642 if (expected_size == -1
27643 || expected_size < (desired_align - align) / 2 + size_needed)
27644 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27645 else
27646 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27649 if (label && size_needed == 1)
27651 emit_label (label);
27652 LABEL_NUSES (label) = 1;
27653 label = NULL;
27654 epilogue_size_needed = 1;
27655 if (issetmem)
27656 promoted_val = val_exp;
27658 else if (label == NULL_RTX && !misaligned_prologue_used)
27659 epilogue_size_needed = size_needed;
27661 /* Step 3: Main loop. */
27663 switch (alg)
27665 case libcall:
27666 case no_stringop:
27667 case last_alg:
27668 gcc_unreachable ();
27669 case loop_1_byte:
27670 case loop:
27671 case unrolled_loop:
27672 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27673 count_exp, move_mode, unroll_factor,
27674 expected_size, issetmem);
27675 break;
27676 case vector_loop:
27677 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27678 vec_promoted_val, count_exp, move_mode,
27679 unroll_factor, expected_size, issetmem);
27680 break;
27681 case rep_prefix_8_byte:
27682 case rep_prefix_4_byte:
27683 case rep_prefix_1_byte:
27684 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27685 val_exp, count_exp, move_mode, issetmem);
27686 break;
27688 /* Properly adjust the offset of src and dest memory for aliasing. */
27689 if (CONST_INT_P (count_exp))
27691 if (!issetmem)
27692 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27693 (count / size_needed) * size_needed);
27694 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27695 (count / size_needed) * size_needed);
27697 else
27699 if (!issetmem)
27700 src = change_address (src, BLKmode, srcreg);
27701 dst = change_address (dst, BLKmode, destreg);
27704 /* Step 4: Epilogue to copy the remaining bytes. */
27705 epilogue:
27706 if (label)
27708 /* When the main loop is done, COUNT_EXP might hold original count,
27709 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27710 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27711 bytes. Compensate if needed. */
27713 if (size_needed < epilogue_size_needed)
27715 tmp =
27716 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27717 GEN_INT (size_needed - 1), count_exp, 1,
27718 OPTAB_DIRECT);
27719 if (tmp != count_exp)
27720 emit_move_insn (count_exp, tmp);
27722 emit_label (label);
27723 LABEL_NUSES (label) = 1;
27726 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27728 if (force_loopy_epilogue)
27729 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27730 epilogue_size_needed);
27731 else
27733 if (issetmem)
27734 expand_setmem_epilogue (dst, destreg, promoted_val,
27735 vec_promoted_val, count_exp,
27736 epilogue_size_needed);
27737 else
27738 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27739 epilogue_size_needed);
27742 if (jump_around_label)
27743 emit_label (jump_around_label);
27744 return true;
27748 /* Expand the appropriate insns for doing strlen if not just doing
27749 repnz; scasb
27751 out = result, initialized with the start address
27752 align_rtx = alignment of the address.
27753 scratch = scratch register, initialized with the start address when
27754 not aligned, otherwise undefined
27756 This is just the body. It needs the initializations mentioned above and
27757 some address computing at the end. These things are done in i386.md. */
27759 static void
27760 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27762 int align;
27763 rtx tmp;
27764 rtx_code_label *align_2_label = NULL;
27765 rtx_code_label *align_3_label = NULL;
27766 rtx_code_label *align_4_label = gen_label_rtx ();
27767 rtx_code_label *end_0_label = gen_label_rtx ();
27768 rtx mem;
27769 rtx tmpreg = gen_reg_rtx (SImode);
27770 rtx scratch = gen_reg_rtx (SImode);
27771 rtx cmp;
27773 align = 0;
27774 if (CONST_INT_P (align_rtx))
27775 align = INTVAL (align_rtx);
27777 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27779 /* Is there a known alignment and is it less than 4? */
27780 if (align < 4)
27782 rtx scratch1 = gen_reg_rtx (Pmode);
27783 emit_move_insn (scratch1, out);
27784 /* Is there a known alignment and is it not 2? */
27785 if (align != 2)
27787 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27788 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27790 /* Leave just the 3 lower bits. */
27791 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27792 NULL_RTX, 0, OPTAB_WIDEN);
27794 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27795 Pmode, 1, align_4_label);
27796 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27797 Pmode, 1, align_2_label);
27798 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27799 Pmode, 1, align_3_label);
27801 else
27803 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27804 check whether it is aligned to a 4-byte boundary. */
27806 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27807 NULL_RTX, 0, OPTAB_WIDEN);
27809 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27810 Pmode, 1, align_4_label);
27813 mem = change_address (src, QImode, out);
27815 /* Now compare the bytes. */
27817 /* Compare the first n unaligned byte on a byte per byte basis. */
27818 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27819 QImode, 1, end_0_label);
27821 /* Increment the address. */
27822 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27824 /* Not needed with an alignment of 2 */
27825 if (align != 2)
27827 emit_label (align_2_label);
27829 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27830 end_0_label);
27832 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27834 emit_label (align_3_label);
27837 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27838 end_0_label);
27840 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27843 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27844 align this loop; that only makes the program larger and does not help to
27845 speed it up. */
27846 emit_label (align_4_label);
27848 mem = change_address (src, SImode, out);
27849 emit_move_insn (scratch, mem);
27850 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27852 /* This formula yields a nonzero result iff one of the bytes is zero.
27853 This saves three branches inside the loop and many cycles. */
27855 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27856 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27857 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27858 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27859 gen_int_mode (0x80808080, SImode)));
27860 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27861 align_4_label);
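/* Illustrative restatement of the formula above in plain C (assuming
   <stdint.h>; this is not part of the emitted code):

     uint32_t has_zero_byte (uint32_t v)
     {
       return (v - 0x01010101U) & ~v & 0x80808080U;
     }

   For a zero byte, the subtraction borrows and leaves its sign bit set
   while ~v also has that bit set; for a word with no zero byte every
   such sign bit is cleared by one of the two masks, so the result is
   nonzero exactly when v contains a zero byte.  E.g.
   has_zero_byte (0x12003456) yields 0x00800000.  */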
27863 if (TARGET_CMOVE)
27865 rtx reg = gen_reg_rtx (SImode);
27866 rtx reg2 = gen_reg_rtx (Pmode);
27867 emit_move_insn (reg, tmpreg);
27868 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27870 /* If zero is not in the first two bytes, move two bytes forward. */
27871 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27872 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27873 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27874 emit_insn (gen_rtx_SET (tmpreg,
27875 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27876 reg,
27877 tmpreg)));
27878 /* Emit lea manually to avoid clobbering of flags. */
27879 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27881 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27882 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27883 emit_insn (gen_rtx_SET (out,
27884 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27885 reg2,
27886 out)));
27888 else
27890 rtx_code_label *end_2_label = gen_label_rtx ();
27891 /* Is zero in the first two bytes? */
27893 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27894 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27895 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27896 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27897 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27898 pc_rtx);
27899 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27900 JUMP_LABEL (tmp) = end_2_label;
27902 /* Not in the first two. Move two bytes forward. */
27903 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27904 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27906 emit_label (end_2_label);
27910 /* Avoid branch in fixing the byte. */
27911 tmpreg = gen_lowpart (QImode, tmpreg);
27912 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27913 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27914 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27915 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27917 emit_label (end_0_label);
27920 /* Expand strlen. */
27922 bool
27923 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27925 rtx addr, scratch1, scratch2, scratch3, scratch4;
27927 /* The generic case of the strlen expander is long. Avoid
27928 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
27930 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27931 && !TARGET_INLINE_ALL_STRINGOPS
27932 && !optimize_insn_for_size_p ()
27933 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27934 return false;
27936 addr = force_reg (Pmode, XEXP (src, 0));
27937 scratch1 = gen_reg_rtx (Pmode);
27939 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27940 && !optimize_insn_for_size_p ())
27942 /* Well it seems that some optimizer does not combine a call like
27943 foo(strlen(bar), strlen(bar));
27944 when the move and the subtraction are done here. It does calculate
27945 the length just once when these instructions are done inside of
27946 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
27947 often used and I use one fewer register for the lifetime of
27948 output_strlen_unroll() this is better. */
27950 emit_move_insn (out, addr);
27952 ix86_expand_strlensi_unroll_1 (out, src, align);
27954 /* strlensi_unroll_1 returns the address of the zero at the end of
27955 the string, like memchr(), so compute the length by subtracting
27956 the start address. */
27957 emit_insn (ix86_gen_sub3 (out, out, addr));
27959 else
27961 rtx unspec;
27963 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27964 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27965 return false;
27966 /* Can't use this for non-default address spaces. */
27967 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27968 return false;
27970 scratch2 = gen_reg_rtx (Pmode);
27971 scratch3 = gen_reg_rtx (Pmode);
27972 scratch4 = force_reg (Pmode, constm1_rtx);
27974 emit_move_insn (scratch3, addr);
27975 eoschar = force_reg (QImode, eoschar);
27977 src = replace_equiv_address_nv (src, scratch3);
27979 /* If .md starts supporting :P, this can be done in .md. */
27980 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27981 scratch4), UNSPEC_SCAS);
27982 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27983 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27984 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
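/* Note on the arithmetic just emitted: with the classic repnz scasb
   idiom the count register starts at -1 and is decremented once per
   byte scanned, including the terminating zero, so it ends up holding
   -(len + 2).  Hence len = ~count - 1, which is exactly the NOT
   followed by the ADD of -1 above.  */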
27986 return true;
27989 /* For a given symbol (function) construct code to compute the address of its PLT
27990 entry in the large x86-64 PIC model. */
27991 static rtx
27992 construct_plt_address (rtx symbol)
27994 rtx tmp, unspec;
27996 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27997 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27998 gcc_assert (Pmode == DImode);
28000 tmp = gen_reg_rtx (Pmode);
28001 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28003 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28004 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28005 return tmp;
28009 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28010 rtx callarg2,
28011 rtx pop, bool sibcall)
28013 rtx vec[3];
28014 rtx use = NULL, call;
28015 unsigned int vec_len = 0;
28016 tree fndecl;
28018 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28020 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28021 if (fndecl
28022 && (lookup_attribute ("interrupt",
28023 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28024 error ("interrupt service routine can't be called directly");
28026 else
28027 fndecl = NULL_TREE;
28029 if (pop == const0_rtx)
28030 pop = NULL;
28031 gcc_assert (!TARGET_64BIT || !pop);
28033 if (TARGET_MACHO && !TARGET_64BIT)
28035 #if TARGET_MACHO
28036 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28037 fnaddr = machopic_indirect_call_target (fnaddr);
28038 #endif
28040 else
28042 /* Static functions and indirect calls don't need the pic register. Also,
28043 check if the PLT was explicitly avoided via -fno-plt or the "noplt" attribute,
28044 making it an indirect call. */
28045 rtx addr = XEXP (fnaddr, 0);
28046 if (flag_pic
28047 && GET_CODE (addr) == SYMBOL_REF
28048 && !SYMBOL_REF_LOCAL_P (addr))
28050 if (flag_plt
28051 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28052 || !lookup_attribute ("noplt",
28053 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28055 if (!TARGET_64BIT
28056 || (ix86_cmodel == CM_LARGE_PIC
28057 && DEFAULT_ABI != MS_ABI))
28059 use_reg (&use, gen_rtx_REG (Pmode,
28060 REAL_PIC_OFFSET_TABLE_REGNUM));
28061 if (ix86_use_pseudo_pic_reg ())
28062 emit_move_insn (gen_rtx_REG (Pmode,
28063 REAL_PIC_OFFSET_TABLE_REGNUM),
28064 pic_offset_table_rtx);
28067 else if (!TARGET_PECOFF && !TARGET_MACHO)
28069 if (TARGET_64BIT)
28071 fnaddr = gen_rtx_UNSPEC (Pmode,
28072 gen_rtvec (1, addr),
28073 UNSPEC_GOTPCREL);
28074 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28076 else
28078 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28079 UNSPEC_GOT);
28080 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28081 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28082 fnaddr);
28084 fnaddr = gen_const_mem (Pmode, fnaddr);
28085 /* Pmode may not be the same as word_mode for x32, which
28086 doesn't support indirect branch via 32-bit memory slot.
28087 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28088 indirect branch via x32 GOT slot is OK. */
28089 if (GET_MODE (fnaddr) != word_mode)
28090 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28091 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28096 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28097 parameters passed in vector registers. */
28098 if (TARGET_64BIT
28099 && (INTVAL (callarg2) > 0
28100 || (INTVAL (callarg2) == 0
28101 && (TARGET_SSE || !flag_skip_rax_setup))))
28103 rtx al = gen_rtx_REG (QImode, AX_REG);
28104 emit_move_insn (al, callarg2);
28105 use_reg (&use, al);
28108 if (ix86_cmodel == CM_LARGE_PIC
28109 && !TARGET_PECOFF
28110 && MEM_P (fnaddr)
28111 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28112 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28113 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28114 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28115 branch via x32 GOT slot is OK. */
28116 else if (!(TARGET_X32
28117 && MEM_P (fnaddr)
28118 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28119 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28120 && (sibcall
28121 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28122 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28124 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28125 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28128 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28130 if (retval)
28132 /* We should add bounds as destination register in case
28133 pointer with bounds may be returned. */
28134 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28136 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28137 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28138 if (GET_CODE (retval) == PARALLEL)
28140 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28141 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28142 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28143 retval = chkp_join_splitted_slot (retval, par);
28145 else
28147 retval = gen_rtx_PARALLEL (VOIDmode,
28148 gen_rtvec (3, retval, b0, b1));
28149 chkp_put_regs_to_expr_list (retval);
28153 call = gen_rtx_SET (retval, call);
28155 vec[vec_len++] = call;
28157 if (pop)
28159 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28160 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28161 vec[vec_len++] = pop;
28164 if (cfun->machine->no_caller_saved_registers
28165 && (!fndecl
28166 || (!TREE_THIS_VOLATILE (fndecl)
28167 && !lookup_attribute ("no_caller_saved_registers",
28168 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28170 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28171 bool is_64bit_ms_abi = (TARGET_64BIT
28172 && ix86_function_abi (fndecl) == MS_ABI);
28173 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28175 /* If there are no caller-saved registers, add all registers
28176 that are clobbered by the call which returns. */
28177 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28178 if (!fixed_regs[i]
28179 && (ix86_call_used_regs[i] == 1
28180 || (ix86_call_used_regs[i] & c_mask))
28181 && !STACK_REGNO_P (i)
28182 && !MMX_REGNO_P (i))
28183 clobber_reg (&use,
28184 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28186 else if (TARGET_64BIT_MS_ABI
28187 && (!callarg2 || INTVAL (callarg2) != -2))
28189 int const cregs_size
28190 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
28191 int i;
28193 for (i = 0; i < cregs_size; i++)
28195 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28196 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28198 clobber_reg (&use, gen_rtx_REG (mode, regno));
28202 if (vec_len > 1)
28203 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28204 call = emit_call_insn (call);
28205 if (use)
28206 CALL_INSN_FUNCTION_USAGE (call) = use;
28208 return call;
28211 /* Return true if the function being called was marked with attribute
28212 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28213 to handle the non-PIC case in the backend because there is no easy
28214 interface for the front-end to force non-PLT calls to use the GOT.
28215 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28216 to call the function marked "noplt" indirectly. */
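/* For example (illustrative): a call to

     extern void foo (void) __attribute__ ((noplt));

   compiled without -fPIC for a 64-bit ELF target is emitted as an
   indirect call through the GOT,

     call *foo@GOTPCREL(%rip)

   rather than a direct call that would be routed through the PLT for a
   preemptible symbol.  */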
28218 static bool
28219 ix86_nopic_noplt_attribute_p (rtx call_op)
28221 if (flag_pic || ix86_cmodel == CM_LARGE
28222 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28223 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28224 || SYMBOL_REF_LOCAL_P (call_op))
28225 return false;
28227 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28229 if (!flag_plt
28230 || (symbol_decl != NULL_TREE
28231 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28232 return true;
28234 return false;
28237 /* Output the assembly for a call instruction. */
28239 const char *
28240 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28242 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28243 bool seh_nop_p = false;
28244 const char *xasm;
28246 if (SIBLING_CALL_P (insn))
28248 if (direct_p)
28250 if (ix86_nopic_noplt_attribute_p (call_op))
28252 if (TARGET_64BIT)
28253 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28254 else
28255 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28257 else
28258 xasm = "%!jmp\t%P0";
28260 /* SEH epilogue detection requires the indirect branch case
28261 to include REX.W. */
28262 else if (TARGET_SEH)
28263 xasm = "%!rex.W jmp\t%A0";
28264 else
28265 xasm = "%!jmp\t%A0";
28267 output_asm_insn (xasm, &call_op);
28268 return "";
28271 /* SEH unwinding can require an extra nop to be emitted in several
28272 circumstances. Determine if we have one of those. */
28273 if (TARGET_SEH)
28275 rtx_insn *i;
28277 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28279 /* If we get to another real insn, we don't need the nop. */
28280 if (INSN_P (i))
28281 break;
28283 /* If we get to the epilogue note, prevent a catch region from
28284 being adjacent to the standard epilogue sequence. With non-call
28285 exceptions enabled, we'll have done this during epilogue emission. */
28286 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28287 && !flag_non_call_exceptions
28288 && !can_throw_internal (insn))
28290 seh_nop_p = true;
28291 break;
28295 /* If we didn't find a real insn following the call, prevent the
28296 unwinder from looking into the next function. */
28297 if (i == NULL)
28298 seh_nop_p = true;
28301 if (direct_p)
28303 if (ix86_nopic_noplt_attribute_p (call_op))
28305 if (TARGET_64BIT)
28306 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28307 else
28308 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28310 else
28311 xasm = "%!call\t%P0";
28313 else
28314 xasm = "%!call\t%A0";
28316 output_asm_insn (xasm, &call_op);
28318 if (seh_nop_p)
28319 return "nop";
28321 return "";
28324 /* Clear stack slot assignments remembered from previous functions.
28325 This is called from INIT_EXPANDERS once before RTL is emitted for each
28326 function. */
28328 static struct machine_function *
28329 ix86_init_machine_status (void)
28331 struct machine_function *f;
28333 f = ggc_cleared_alloc<machine_function> ();
28334 f->use_fast_prologue_epilogue_nregs = -1;
28335 f->call_abi = ix86_abi;
28337 return f;
28340 /* Return a MEM corresponding to a stack slot with mode MODE.
28341 Allocate a new slot if necessary.
28343 The RTL for a function can have several slots available: N is
28344 which slot to use. */
28347 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28349 struct stack_local_entry *s;
28351 gcc_assert (n < MAX_386_STACK_LOCALS);
28353 for (s = ix86_stack_locals; s; s = s->next)
28354 if (s->mode == mode && s->n == n)
28355 return validize_mem (copy_rtx (s->rtl));
28357 s = ggc_alloc<stack_local_entry> ();
28358 s->n = n;
28359 s->mode = mode;
28360 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28362 s->next = ix86_stack_locals;
28363 ix86_stack_locals = s;
28364 return validize_mem (copy_rtx (s->rtl));
28367 static void
28368 ix86_instantiate_decls (void)
28370 struct stack_local_entry *s;
28372 for (s = ix86_stack_locals; s; s = s->next)
28373 if (s->rtl != NULL_RTX)
28374 instantiate_decl_rtl (s->rtl);
28377 /* Return the number used for encoding REG, in the range 0..7. */
28379 static int
28380 reg_encoded_number (rtx reg)
28382 unsigned regno = REGNO (reg);
28383 switch (regno)
28385 case AX_REG:
28386 return 0;
28387 case CX_REG:
28388 return 1;
28389 case DX_REG:
28390 return 2;
28391 case BX_REG:
28392 return 3;
28393 case SP_REG:
28394 return 4;
28395 case BP_REG:
28396 return 5;
28397 case SI_REG:
28398 return 6;
28399 case DI_REG:
28400 return 7;
28401 default:
28402 break;
28404 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28405 return regno - FIRST_STACK_REG;
28406 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28407 return regno - FIRST_SSE_REG;
28408 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28409 return regno - FIRST_MMX_REG;
28410 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28411 return regno - FIRST_REX_SSE_REG;
28412 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28413 return regno - FIRST_REX_INT_REG;
28414 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28415 return regno - FIRST_MASK_REG;
28416 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28417 return regno - FIRST_BND_REG;
28418 return -1;
28421 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28422 in its encoding if it could be relevant for ROP mitigation, otherwise
28423 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28424 used for calculating it into them. */
28426 static int
28427 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28428 int *popno0 = 0, int *popno1 = 0)
28430 if (asm_noperands (PATTERN (insn)) >= 0)
28431 return -1;
28432 int has_modrm = get_attr_modrm (insn);
28433 if (!has_modrm)
28434 return -1;
28435 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28436 rtx op0, op1;
28437 switch (cls)
28439 case MODRM_CLASS_OP02:
28440 gcc_assert (noperands >= 3);
28441 if (popno0)
28443 *popno0 = 0;
28444 *popno1 = 2;
28446 op0 = operands[0];
28447 op1 = operands[2];
28448 break;
28449 case MODRM_CLASS_OP01:
28450 gcc_assert (noperands >= 2);
28451 if (popno0)
28453 *popno0 = 0;
28454 *popno1 = 1;
28456 op0 = operands[0];
28457 op1 = operands[1];
28458 break;
28459 default:
28460 return -1;
28462 if (REG_P (op0) && REG_P (op1))
28464 int enc0 = reg_encoded_number (op0);
28465 int enc1 = reg_encoded_number (op1);
28466 return 0xc0 + (enc1 << 3) + enc0;
28468 return -1;
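/* Editorial aside -- a minimal, standalone sketch (not part of i386.c) of the
   register-register ModRM encoding computed above: mod is 11b, the "reg" field
   holds the encoding of the second operand and "r/m" holds the encoding of the
   first, i.e. 0xc0 + (reg << 3) + rm.  The helper name is hypothetical.  */
static int
example_modrm_reg_reg (int rm_enc, int reg_enc)
{
  /* Both encodings are assumed to be already reduced to the 0..7 range,
     as reg_encoded_number does above.  */
  return 0xc0 | (reg_enc << 3) | rm_enc;
}
/* For example, with rm = 0 (eax) and reg = 1 (ecx) the result is 0xc8,
   which is the ModRM byte of "add %ecx, %eax" (opcode 0x01).  */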
28471 /* Check whether x86 address PARTS is a pc-relative address. */
28473 static bool
28474 rip_relative_addr_p (struct ix86_address *parts)
28476 rtx base, index, disp;
28478 base = parts->base;
28479 index = parts->index;
28480 disp = parts->disp;
28482 if (disp && !base && !index)
28484 if (TARGET_64BIT)
28486 rtx symbol = disp;
28488 if (GET_CODE (disp) == CONST)
28489 symbol = XEXP (disp, 0);
28490 if (GET_CODE (symbol) == PLUS
28491 && CONST_INT_P (XEXP (symbol, 1)))
28492 symbol = XEXP (symbol, 0);
28494 if (GET_CODE (symbol) == LABEL_REF
28495 || (GET_CODE (symbol) == SYMBOL_REF
28496 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28497 || (GET_CODE (symbol) == UNSPEC
28498 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28499 || XINT (symbol, 1) == UNSPEC_PCREL
28500 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28501 return true;
28504 return false;
28507 /* Calculate the length of the memory address in the instruction encoding.
28508 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28509 or other prefixes. We never generate addr32 prefix for LEA insn. */
28512 memory_address_length (rtx addr, bool lea)
28514 struct ix86_address parts;
28515 rtx base, index, disp;
28516 int len;
28517 int ok;
28519 if (GET_CODE (addr) == PRE_DEC
28520 || GET_CODE (addr) == POST_INC
28521 || GET_CODE (addr) == PRE_MODIFY
28522 || GET_CODE (addr) == POST_MODIFY)
28523 return 0;
28525 ok = ix86_decompose_address (addr, &parts);
28526 gcc_assert (ok);
28528 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28530 /* If this is not LEA instruction, add the length of addr32 prefix. */
28531 if (TARGET_64BIT && !lea
28532 && (SImode_address_operand (addr, VOIDmode)
28533 || (parts.base && GET_MODE (parts.base) == SImode)
28534 || (parts.index && GET_MODE (parts.index) == SImode)))
28535 len++;
28537 base = parts.base;
28538 index = parts.index;
28539 disp = parts.disp;
28541 if (base && SUBREG_P (base))
28542 base = SUBREG_REG (base);
28543 if (index && SUBREG_P (index))
28544 index = SUBREG_REG (index);
28546 gcc_assert (base == NULL_RTX || REG_P (base));
28547 gcc_assert (index == NULL_RTX || REG_P (index));
28549 /* Rule of thumb:
28550 - esp as the base always wants an index,
28551 - ebp as the base always wants a displacement,
28552 - r12 as the base always wants an index,
28553 - r13 as the base always wants a displacement. */
28555 /* Register Indirect. */
28556 if (base && !index && !disp)
28558 /* esp (for its index) and ebp (for its displacement) need
28559 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28560 code. */
28561 if (base == arg_pointer_rtx
28562 || base == frame_pointer_rtx
28563 || REGNO (base) == SP_REG
28564 || REGNO (base) == BP_REG
28565 || REGNO (base) == R12_REG
28566 || REGNO (base) == R13_REG)
28567 len++;
28570 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28571 is not disp32, but disp32(%rip), so for disp32
28572 SIB byte is needed, unless print_operand_address
28573 optimizes it into disp32(%rip) or (%rip) is implied
28574 by UNSPEC. */
28575 else if (disp && !base && !index)
28577 len += 4;
28578 if (TARGET_64BIT && !rip_relative_addr_p (&parts))
28579 len++;
28581 else
28583 /* Find the length of the displacement constant. */
28584 if (disp)
28586 if (base && satisfies_constraint_K (disp))
28587 len += 1;
28588 else
28589 len += 4;
28591 /* ebp always wants a displacement. Similarly r13. */
28592 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28593 len++;
28595 /* An index requires the two-byte modrm form.... */
28596 if (index
28597 /* ...like esp (or r12), which always wants an index. */
28598 || base == arg_pointer_rtx
28599 || base == frame_pointer_rtx
28600 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28601 len++;
28604 return len;
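/* Editorial aside -- an illustrative, standalone sketch (not part of i386.c)
   of the "rule of thumb" used above for a bare base register: (%esp)/(%r12)
   need a SIB byte and (%ebp)/(%r13) need a disp8, so each costs one byte on
   top of the ModRM byte; any other bare base register costs nothing extra.
   The helper name is hypothetical.  */
static int
example_bare_base_extra_bytes (int is_sp_or_r12, int is_bp_or_r13)
{
  if (is_sp_or_r12)
    return 1;	/* SIB byte.  */
  if (is_bp_or_r13)
    return 1;	/* Zero disp8.  */
  return 0;	/* Plain register-indirect form.  */
}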
28607 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28608 is set, expect that the insn has an 8-bit immediate alternative. */
28610 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28612 int len = 0;
28613 int i;
28614 extract_insn_cached (insn);
28615 for (i = recog_data.n_operands - 1; i >= 0; --i)
28616 if (CONSTANT_P (recog_data.operand[i]))
28618 enum attr_mode mode = get_attr_mode (insn);
28620 gcc_assert (!len);
28621 if (shortform && CONST_INT_P (recog_data.operand[i]))
28623 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28624 switch (mode)
28626 case MODE_QI:
28627 len = 1;
28628 continue;
28629 case MODE_HI:
28630 ival = trunc_int_for_mode (ival, HImode);
28631 break;
28632 case MODE_SI:
28633 ival = trunc_int_for_mode (ival, SImode);
28634 break;
28635 default:
28636 break;
28638 if (IN_RANGE (ival, -128, 127))
28640 len = 1;
28641 continue;
28644 switch (mode)
28646 case MODE_QI:
28647 len = 1;
28648 break;
28649 case MODE_HI:
28650 len = 2;
28651 break;
28652 case MODE_SI:
28653 len = 4;
28654 break;
28655 /* Immediates for DImode instructions are encoded
28656 as 32bit sign extended values. */
28657 case MODE_DI:
28658 len = 4;
28659 break;
28660 default:
28661 fatal_insn ("unknown insn mode", insn);
28664 return len;
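/* Editorial aside -- a standalone sketch (not part of i386.c) of the
   short-form rule above: when the insn has an 8-bit immediate alternative,
   any value in [-128, 127] costs one byte, everything else costs the full
   four bytes (DImode immediates are encoded as 32-bit sign-extended values).
   The helper name is hypothetical.  */
static int
example_short_form_imm_len (long ival)
{
  return (ival >= -128 && ival <= 127) ? 1 : 4;
}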
28667 /* Compute default value for "length_address" attribute. */
28669 ix86_attr_length_address_default (rtx_insn *insn)
28671 int i;
28673 if (get_attr_type (insn) == TYPE_LEA)
28675 rtx set = PATTERN (insn), addr;
28677 if (GET_CODE (set) == PARALLEL)
28678 set = XVECEXP (set, 0, 0);
28680 gcc_assert (GET_CODE (set) == SET);
28682 addr = SET_SRC (set);
28684 return memory_address_length (addr, true);
28687 extract_insn_cached (insn);
28688 for (i = recog_data.n_operands - 1; i >= 0; --i)
28690 rtx op = recog_data.operand[i];
28691 if (MEM_P (op))
28693 constrain_operands_cached (insn, reload_completed);
28694 if (which_alternative != -1)
28696 const char *constraints = recog_data.constraints[i];
28697 int alt = which_alternative;
28699 while (*constraints == '=' || *constraints == '+')
28700 constraints++;
28701 while (alt-- > 0)
28702 while (*constraints++ != ',')
28704 /* Skip ignored operands. */
28705 if (*constraints == 'X')
28706 continue;
28709 int len = memory_address_length (XEXP (op, 0), false);
28711 /* Account for segment prefix for non-default addr spaces. */
28712 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28713 len++;
28715 return len;
28718 return 0;
28721 /* Compute default value for "length_vex" attribute. It includes
28722 2 or 3 byte VEX prefix and 1 opcode byte. */
28725 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28726 bool has_vex_w)
28728 int i;
28730 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
28731 requires the 3-byte VEX prefix. */
28732 if (!has_0f_opcode || has_vex_w)
28733 return 3 + 1;
28735 /* We can always use 2 byte VEX prefix in 32bit. */
28736 if (!TARGET_64BIT)
28737 return 2 + 1;
28739 extract_insn_cached (insn);
28741 for (i = recog_data.n_operands - 1; i >= 0; --i)
28742 if (REG_P (recog_data.operand[i]))
28744 /* REX.W bit uses 3 byte VEX prefix. */
28745 if (GET_MODE (recog_data.operand[i]) == DImode
28746 && GENERAL_REG_P (recog_data.operand[i]))
28747 return 3 + 1;
28749 else
28751 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28752 if (MEM_P (recog_data.operand[i])
28753 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28754 return 3 + 1;
28757 return 2 + 1;
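/* Editorial aside -- a standalone sketch (not part of i386.c) of the
   VEX-prefix length rule above: the 2-byte VEX form exists only for the 0f
   opcode map and cannot carry VEX.W or the REX.X/REX.B extension bits, so any
   of those forces the 3-byte form.  The returned value also counts the opcode
   byte.  The helper name is hypothetical.  */
static int
example_vex_plus_opcode_len (int has_0f_opcode, int needs_w_x_or_b)
{
  if (!has_0f_opcode || needs_w_x_or_b)
    return 3 + 1;	/* 3-byte VEX prefix + opcode byte.  */
  return 2 + 1;		/* 2-byte VEX prefix + opcode byte.  */
}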
28760 /* Return the maximum number of instructions a cpu can issue. */
28762 static int
28763 ix86_issue_rate (void)
28765 switch (ix86_tune)
28767 case PROCESSOR_PENTIUM:
28768 case PROCESSOR_LAKEMONT:
28769 case PROCESSOR_BONNELL:
28770 case PROCESSOR_SILVERMONT:
28771 case PROCESSOR_KNL:
28772 case PROCESSOR_INTEL:
28773 case PROCESSOR_K6:
28774 case PROCESSOR_BTVER2:
28775 case PROCESSOR_PENTIUM4:
28776 case PROCESSOR_NOCONA:
28777 return 2;
28779 case PROCESSOR_PENTIUMPRO:
28780 case PROCESSOR_ATHLON:
28781 case PROCESSOR_K8:
28782 case PROCESSOR_AMDFAM10:
28783 case PROCESSOR_GENERIC:
28784 case PROCESSOR_BTVER1:
28785 return 3;
28787 case PROCESSOR_BDVER1:
28788 case PROCESSOR_BDVER2:
28789 case PROCESSOR_BDVER3:
28790 case PROCESSOR_BDVER4:
28791 case PROCESSOR_ZNVER1:
28792 case PROCESSOR_CORE2:
28793 case PROCESSOR_NEHALEM:
28794 case PROCESSOR_SANDYBRIDGE:
28795 case PROCESSOR_HASWELL:
28796 return 4;
28798 default:
28799 return 1;
28803 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
28804 by DEP_INSN and nothing else set by DEP_INSN. */
28806 static bool
28807 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
28809 rtx set, set2;
28811 /* Simplify the test for uninteresting insns. */
28812 if (insn_type != TYPE_SETCC
28813 && insn_type != TYPE_ICMOV
28814 && insn_type != TYPE_FCMOV
28815 && insn_type != TYPE_IBR)
28816 return false;
28818 if ((set = single_set (dep_insn)) != 0)
28820 set = SET_DEST (set);
28821 set2 = NULL_RTX;
28823 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
28824 && XVECLEN (PATTERN (dep_insn), 0) == 2
28825 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
28826 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
28828 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
28829 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
28831 else
28832 return false;
28834 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
28835 return false;
28837 /* This test is true if the dependent insn reads the flags but
28838 not any other potentially set register. */
28839 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
28840 return false;
28842 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
28843 return false;
28845 return true;
28848 /* Return true iff USE_INSN has a memory address with operands set by
28849 SET_INSN. */
28851 bool
28852 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
28854 int i;
28855 extract_insn_cached (use_insn);
28856 for (i = recog_data.n_operands - 1; i >= 0; --i)
28857 if (MEM_P (recog_data.operand[i]))
28859 rtx addr = XEXP (recog_data.operand[i], 0);
28860 return modified_in_p (addr, set_insn) != 0;
28862 return false;
28865 /* Helper function for exact_store_load_dependency.
28866 Return true if addr is found in insn. */
28867 static bool
28868 exact_dependency_1 (rtx addr, rtx insn)
28870 enum rtx_code code;
28871 const char *format_ptr;
28872 int i, j;
28874 code = GET_CODE (insn);
28875 switch (code)
28877 case MEM:
28878 if (rtx_equal_p (addr, insn))
28879 return true;
28880 break;
28881 case REG:
28882 CASE_CONST_ANY:
28883 case SYMBOL_REF:
28884 case CODE_LABEL:
28885 case PC:
28886 case CC0:
28887 case EXPR_LIST:
28888 return false;
28889 default:
28890 break;
28893 format_ptr = GET_RTX_FORMAT (code);
28894 for (i = 0; i < GET_RTX_LENGTH (code); i++)
28896 switch (*format_ptr++)
28898 case 'e':
28899 if (exact_dependency_1 (addr, XEXP (insn, i)))
28900 return true;
28901 break;
28902 case 'E':
28903 for (j = 0; j < XVECLEN (insn, i); j++)
28904 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
28905 return true;
28906 break;
28909 return false;
28912 /* Return true if there exists an exact dependency between STORE and LOAD,
28913 i.e. the same memory address is used in both. */
28914 static bool
28915 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
28917 rtx set1, set2;
28919 set1 = single_set (store);
28920 if (!set1)
28921 return false;
28922 if (!MEM_P (SET_DEST (set1)))
28923 return false;
28924 set2 = single_set (load);
28925 if (!set2)
28926 return false;
28927 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
28928 return true;
28929 return false;
28932 static int
28933 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
28934 unsigned int)
28936 enum attr_type insn_type, dep_insn_type;
28937 enum attr_memory memory;
28938 rtx set, set2;
28939 int dep_insn_code_number;
28941 /* Anti and output dependencies have zero cost on all CPUs. */
28942 if (dep_type != 0)
28943 return 0;
28945 dep_insn_code_number = recog_memoized (dep_insn);
28947 /* If we can't recognize the insns, we can't really do anything. */
28948 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
28949 return cost;
28951 insn_type = get_attr_type (insn);
28952 dep_insn_type = get_attr_type (dep_insn);
28954 switch (ix86_tune)
28956 case PROCESSOR_PENTIUM:
28957 case PROCESSOR_LAKEMONT:
28958 /* Address Generation Interlock adds a cycle of latency. */
28959 if (insn_type == TYPE_LEA)
28961 rtx addr = PATTERN (insn);
28963 if (GET_CODE (addr) == PARALLEL)
28964 addr = XVECEXP (addr, 0, 0);
28966 gcc_assert (GET_CODE (addr) == SET);
28968 addr = SET_SRC (addr);
28969 if (modified_in_p (addr, dep_insn))
28970 cost += 1;
28972 else if (ix86_agi_dependent (dep_insn, insn))
28973 cost += 1;
28975 /* ??? Compares pair with jump/setcc. */
28976 if (ix86_flags_dependent (insn, dep_insn, insn_type))
28977 cost = 0;
28979 /* Floating point stores require value to be ready one cycle earlier. */
28980 if (insn_type == TYPE_FMOV
28981 && get_attr_memory (insn) == MEMORY_STORE
28982 && !ix86_agi_dependent (dep_insn, insn))
28983 cost += 1;
28984 break;
28986 case PROCESSOR_PENTIUMPRO:
28987 /* INT->FP conversion is expensive. */
28988 if (get_attr_fp_int_src (dep_insn))
28989 cost += 5;
28991 /* There is one cycle extra latency between an FP op and a store. */
28992 if (insn_type == TYPE_FMOV
28993 && (set = single_set (dep_insn)) != NULL_RTX
28994 && (set2 = single_set (insn)) != NULL_RTX
28995 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
28996 && MEM_P (SET_DEST (set2)))
28997 cost += 1;
28999 memory = get_attr_memory (insn);
29001 /* Show the ability of the reorder buffer to hide the latency of a load
29002 by executing it in parallel with the previous instruction when the
29003 previous instruction is not needed to compute the address. */
29004 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29005 && !ix86_agi_dependent (dep_insn, insn))
29007 /* Claim moves to take one cycle, as the core can issue one load
29008 at a time and the next load can start a cycle later. */
29009 if (dep_insn_type == TYPE_IMOV
29010 || dep_insn_type == TYPE_FMOV)
29011 cost = 1;
29012 else if (cost > 1)
29013 cost--;
29015 break;
29017 case PROCESSOR_K6:
29018 /* The esp dependency is resolved before
29019 the instruction is really finished. */
29020 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29021 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29022 return 1;
29024 /* INT->FP conversion is expensive. */
29025 if (get_attr_fp_int_src (dep_insn))
29026 cost += 5;
29028 memory = get_attr_memory (insn);
29030 /* Show the ability of the reorder buffer to hide the latency of a load
29031 by executing it in parallel with the previous instruction when the
29032 previous instruction is not needed to compute the address. */
29033 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29034 && !ix86_agi_dependent (dep_insn, insn))
29036 /* Claim moves to take one cycle, as the core can issue one load
29037 at a time and the next load can start a cycle later. */
29038 if (dep_insn_type == TYPE_IMOV
29039 || dep_insn_type == TYPE_FMOV)
29040 cost = 1;
29041 else if (cost > 2)
29042 cost -= 2;
29043 else
29044 cost = 1;
29046 break;
29048 case PROCESSOR_AMDFAM10:
29049 case PROCESSOR_BDVER1:
29050 case PROCESSOR_BDVER2:
29051 case PROCESSOR_BDVER3:
29052 case PROCESSOR_BDVER4:
29053 case PROCESSOR_ZNVER1:
29054 case PROCESSOR_BTVER1:
29055 case PROCESSOR_BTVER2:
29056 case PROCESSOR_GENERIC:
29057 /* The stack engine allows push&pop instructions to execute in parallel. */
29058 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29059 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29060 return 0;
29061 /* FALLTHRU */
29063 case PROCESSOR_ATHLON:
29064 case PROCESSOR_K8:
29065 memory = get_attr_memory (insn);
29067 /* Show the ability of the reorder buffer to hide the latency of a load
29068 by executing it in parallel with the previous instruction when the
29069 previous instruction is not needed to compute the address. */
29070 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29071 && !ix86_agi_dependent (dep_insn, insn))
29073 enum attr_unit unit = get_attr_unit (insn);
29074 int loadcost = 3;
29076 /* Because of the difference between the length of integer and
29077 floating unit pipeline preparation stages, the memory operands
29078 for floating point are cheaper.
29080 ??? For Athlon the difference is most probably 2. */
29081 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
29082 loadcost = 3;
29083 else
29084 loadcost = TARGET_ATHLON ? 2 : 0;
29086 if (cost >= loadcost)
29087 cost -= loadcost;
29088 else
29089 cost = 0;
29091 break;
29093 case PROCESSOR_CORE2:
29094 case PROCESSOR_NEHALEM:
29095 case PROCESSOR_SANDYBRIDGE:
29096 case PROCESSOR_HASWELL:
29097 /* The stack engine allows push&pop instructions to execute in parallel. */
29098 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
29099 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
29100 return 0;
29102 memory = get_attr_memory (insn);
29104 /* Show the ability of the reorder buffer to hide the latency of a load
29105 by executing it in parallel with the previous instruction when the
29106 previous instruction is not needed to compute the address. */
29107 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29108 && !ix86_agi_dependent (dep_insn, insn))
29110 if (cost >= 4)
29111 cost -= 4;
29112 else
29113 cost = 0;
29115 break;
29117 case PROCESSOR_SILVERMONT:
29118 case PROCESSOR_KNL:
29119 case PROCESSOR_INTEL:
29120 if (!reload_completed)
29121 return cost;
29123 /* Increase cost of integer loads. */
29124 memory = get_attr_memory (dep_insn);
29125 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
29127 enum attr_unit unit = get_attr_unit (dep_insn);
29128 if (unit == UNIT_INTEGER && cost == 1)
29130 if (memory == MEMORY_LOAD)
29131 cost = 3;
29132 else
29134 /* Increase cost of ld/st for short int types only
29135 because of store forwarding issue. */
29136 rtx set = single_set (dep_insn);
29137 if (set && (GET_MODE (SET_DEST (set)) == QImode
29138 || GET_MODE (SET_DEST (set)) == HImode))
29140 /* Increase cost of store/load insn if exact
29141 dependence exists and it is load insn. */
29142 enum attr_memory insn_memory = get_attr_memory (insn);
29143 if (insn_memory == MEMORY_LOAD
29144 && exact_store_load_dependency (dep_insn, insn))
29145 cost = 3;
29151 default:
29152 break;
29155 return cost;
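/* Editorial aside (not part of i386.c): as a concrete reading of the
   Pentium/Lakemont case above, a sequence such as

       mov  4(%esp), %eax
       lea  8(%eax), %ebx

   hits the address-generation interlock, so the dependence cost is bumped by
   one cycle; on the AMD and Core-derived tunings above, a load feeding a
   later instruction instead has part of its latency hidden by the reorder
   buffer, so the cost is reduced.  */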
29158 /* How many alternative schedules to try. This should be as wide as the
29159 scheduling freedom in the DFA, but no wider. Making this value too
29160 large results in extra work for the scheduler.
29162 static int
29163 ia32_multipass_dfa_lookahead (void)
29165 switch (ix86_tune)
29167 case PROCESSOR_PENTIUM:
29168 case PROCESSOR_LAKEMONT:
29169 return 2;
29171 case PROCESSOR_PENTIUMPRO:
29172 case PROCESSOR_K6:
29173 return 1;
29175 case PROCESSOR_BDVER1:
29176 case PROCESSOR_BDVER2:
29177 case PROCESSOR_BDVER3:
29178 case PROCESSOR_BDVER4:
29179 /* We use lookahead value 4 for BD both before and after reload
29180 schedules. Plan is to have value 8 included for O3. */
29181 return 4;
29183 case PROCESSOR_CORE2:
29184 case PROCESSOR_NEHALEM:
29185 case PROCESSOR_SANDYBRIDGE:
29186 case PROCESSOR_HASWELL:
29187 case PROCESSOR_BONNELL:
29188 case PROCESSOR_SILVERMONT:
29189 case PROCESSOR_KNL:
29190 case PROCESSOR_INTEL:
29191 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
29192 the number of instructions that can be executed in one cycle, i.e.,
29193 issue_rate. I wonder why tuning for many CPUs does not do this. */
29194 if (reload_completed)
29195 return ix86_issue_rate ();
29196 /* Don't use lookahead for pre-reload schedule to save compile time. */
29197 return 0;
29199 default:
29200 return 0;
29204 /* Return true if target platform supports macro-fusion. */
29206 static bool
29207 ix86_macro_fusion_p ()
29209 return TARGET_FUSE_CMP_AND_BRANCH;
29212 /* Check whether the current microarchitecture supports macro fusion
29213 for insn pair "CONDGEN + CONDJMP". Refer to
29214 "Intel Architectures Optimization Reference Manual". */
29216 static bool
29217 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
29219 rtx src, dest;
29220 enum rtx_code ccode;
29221 rtx compare_set = NULL_RTX, test_if, cond;
29222 rtx alu_set = NULL_RTX, addr = NULL_RTX;
29224 if (!any_condjump_p (condjmp))
29225 return false;
29227 if (get_attr_type (condgen) != TYPE_TEST
29228 && get_attr_type (condgen) != TYPE_ICMP
29229 && get_attr_type (condgen) != TYPE_INCDEC
29230 && get_attr_type (condgen) != TYPE_ALU)
29231 return false;
29233 compare_set = single_set (condgen);
29234 if (compare_set == NULL_RTX
29235 && !TARGET_FUSE_ALU_AND_BRANCH)
29236 return false;
29238 if (compare_set == NULL_RTX)
29240 int i;
29241 rtx pat = PATTERN (condgen);
29242 for (i = 0; i < XVECLEN (pat, 0); i++)
29243 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
29245 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
29246 if (GET_CODE (set_src) == COMPARE)
29247 compare_set = XVECEXP (pat, 0, i);
29248 else
29249 alu_set = XVECEXP (pat, 0, i);
29252 if (compare_set == NULL_RTX)
29253 return false;
29254 src = SET_SRC (compare_set);
29255 if (GET_CODE (src) != COMPARE)
29256 return false;
29258 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
29259 supported. */
29260 if ((MEM_P (XEXP (src, 0))
29261 && CONST_INT_P (XEXP (src, 1)))
29262 || (MEM_P (XEXP (src, 1))
29263 && CONST_INT_P (XEXP (src, 0))))
29264 return false;
29266 /* No fusion for RIP-relative address. */
29267 if (MEM_P (XEXP (src, 0)))
29268 addr = XEXP (XEXP (src, 0), 0);
29269 else if (MEM_P (XEXP (src, 1)))
29270 addr = XEXP (XEXP (src, 1), 0);
29272 if (addr) {
29273 ix86_address parts;
29274 int ok = ix86_decompose_address (addr, &parts);
29275 gcc_assert (ok);
29277 if (rip_relative_addr_p (&parts))
29278 return false;
29281 test_if = SET_SRC (pc_set (condjmp));
29282 cond = XEXP (test_if, 0);
29283 ccode = GET_CODE (cond);
29284 /* Check whether the conditional jump uses the Sign or Overflow flags. */
29285 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
29286 && (ccode == GE
29287 || ccode == GT
29288 || ccode == LE
29289 || ccode == LT))
29290 return false;
29292 /* Return true for TYPE_TEST and TYPE_ICMP. */
29293 if (get_attr_type (condgen) == TYPE_TEST
29294 || get_attr_type (condgen) == TYPE_ICMP)
29295 return true;
29297 /* The following handles the macro-fusion case for alu + jmp. */
29298 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
29299 return false;
29301 /* No fusion for alu op with memory destination operand. */
29302 dest = SET_DEST (alu_set);
29303 if (MEM_P (dest))
29304 return false;
29306 /* Macro-fusion for inc/dec + unsigned conditional jump is not
29307 supported. */
29308 if (get_attr_type (condgen) == TYPE_INCDEC
29309 && (ccode == GEU
29310 || ccode == GTU
29311 || ccode == LEU
29312 || ccode == LTU))
29313 return false;
29315 return true;
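/* Editorial aside (not part of i386.c): by the checks above, a pair such as
   "cmp %rax, %rbx; je label" or "test %eax, %eax; jne label" is fusible,
   while a cmp/test of a memory operand against an immediate, a compare with a
   RIP-relative memory operand, an ALU op with a memory destination, and
   inc/dec followed by an unsigned condition (ja/jae/jb/jbe) are all
   rejected.  */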
29318 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
29319 execution. It is applied if
29320 (1) IMUL instruction is on the top of list;
29321 (2) There is exactly one producer of an independent IMUL instruction in
29322 the ready list.
29323 Return index of IMUL producer if it was found and -1 otherwise. */
29324 static int
29325 do_reorder_for_imul (rtx_insn **ready, int n_ready)
29327 rtx_insn *insn;
29328 rtx set, insn1, insn2;
29329 sd_iterator_def sd_it;
29330 dep_t dep;
29331 int index = -1;
29332 int i;
29334 if (!TARGET_BONNELL)
29335 return index;
29337 /* Check that IMUL instruction is on the top of ready list. */
29338 insn = ready[n_ready - 1];
29339 set = single_set (insn);
29340 if (!set)
29341 return index;
29342 if (!(GET_CODE (SET_SRC (set)) == MULT
29343 && GET_MODE (SET_SRC (set)) == SImode))
29344 return index;
29346 /* Search for producer of independent IMUL instruction. */
29347 for (i = n_ready - 2; i >= 0; i--)
29349 insn = ready[i];
29350 if (!NONDEBUG_INSN_P (insn))
29351 continue;
29352 /* Skip IMUL instruction. */
29353 insn2 = PATTERN (insn);
29354 if (GET_CODE (insn2) == PARALLEL)
29355 insn2 = XVECEXP (insn2, 0, 0);
29356 if (GET_CODE (insn2) == SET
29357 && GET_CODE (SET_SRC (insn2)) == MULT
29358 && GET_MODE (SET_SRC (insn2)) == SImode)
29359 continue;
29361 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
29363 rtx con;
29364 con = DEP_CON (dep);
29365 if (!NONDEBUG_INSN_P (con))
29366 continue;
29367 insn1 = PATTERN (con);
29368 if (GET_CODE (insn1) == PARALLEL)
29369 insn1 = XVECEXP (insn1, 0, 0);
29371 if (GET_CODE (insn1) == SET
29372 && GET_CODE (SET_SRC (insn1)) == MULT
29373 && GET_MODE (SET_SRC (insn1)) == SImode)
29375 sd_iterator_def sd_it1;
29376 dep_t dep1;
29377 /* Check if there is no other dependee for IMUL. */
29378 index = i;
29379 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
29381 rtx pro;
29382 pro = DEP_PRO (dep1);
29383 if (!NONDEBUG_INSN_P (pro))
29384 continue;
29385 if (pro != insn)
29386 index = -1;
29388 if (index >= 0)
29389 break;
29392 if (index >= 0)
29393 break;
29395 return index;
29398 /* Try to find the best candidate on the top of ready list if two insns
29399 have the same priority - candidate is best if its dependees were
29400 scheduled earlier. Applied for Silvermont only.
29401 Return true if top 2 insns must be interchanged. */
29402 static bool
29403 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
29405 rtx_insn *top = ready[n_ready - 1];
29406 rtx_insn *next = ready[n_ready - 2];
29407 rtx set;
29408 sd_iterator_def sd_it;
29409 dep_t dep;
29410 int clock1 = -1;
29411 int clock2 = -1;
29412 #define INSN_TICK(INSN) (HID (INSN)->tick)
29414 if (!TARGET_SILVERMONT && !TARGET_INTEL)
29415 return false;
29417 if (!NONDEBUG_INSN_P (top))
29418 return false;
29419 if (!NONJUMP_INSN_P (top))
29420 return false;
29421 if (!NONDEBUG_INSN_P (next))
29422 return false;
29423 if (!NONJUMP_INSN_P (next))
29424 return false;
29425 set = single_set (top);
29426 if (!set)
29427 return false;
29428 set = single_set (next);
29429 if (!set)
29430 return false;
29432 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
29434 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
29435 return false;
29436 /* Determine the winner more precisely. */
29437 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
29439 rtx pro;
29440 pro = DEP_PRO (dep);
29441 if (!NONDEBUG_INSN_P (pro))
29442 continue;
29443 if (INSN_TICK (pro) > clock1)
29444 clock1 = INSN_TICK (pro);
29446 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
29448 rtx pro;
29449 pro = DEP_PRO (dep);
29450 if (!NONDEBUG_INSN_P (pro))
29451 continue;
29452 if (INSN_TICK (pro) > clock2)
29453 clock2 = INSN_TICK (pro);
29456 if (clock1 == clock2)
29458 /* Determine winner - load must win. */
29459 enum attr_memory memory1, memory2;
29460 memory1 = get_attr_memory (top);
29461 memory2 = get_attr_memory (next);
29462 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
29463 return true;
29465 return (bool) (clock2 < clock1);
29467 return false;
29468 #undef INSN_TICK
29471 /* Perform possible reordering of the ready list for Atom/Silvermont only.
29472 Return issue rate. */
29473 static int
29474 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
29475 int *pn_ready, int clock_var)
29477 int issue_rate = -1;
29478 int n_ready = *pn_ready;
29479 int i;
29480 rtx_insn *insn;
29481 int index = -1;
29483 /* Set up issue rate. */
29484 issue_rate = ix86_issue_rate ();
29486 /* Do reordering for BONNELL/SILVERMONT only. */
29487 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
29488 return issue_rate;
29490 /* Nothing to do if ready list contains only 1 instruction. */
29491 if (n_ready <= 1)
29492 return issue_rate;
29494 /* Do reordering for the post-reload scheduler only. */
29495 if (!reload_completed)
29496 return issue_rate;
29498 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
29500 if (sched_verbose > 1)
29501 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
29502 INSN_UID (ready[index]));
29504 /* Put IMUL producer (ready[index]) at the top of ready list. */
29505 insn = ready[index];
29506 for (i = index; i < n_ready - 1; i++)
29507 ready[i] = ready[i + 1];
29508 ready[n_ready - 1] = insn;
29509 return issue_rate;
29512 /* Skip selective scheduling since HID is not populated in it. */
29513 if (clock_var != 0
29514 && !sel_sched_p ()
29515 && swap_top_of_ready_list (ready, n_ready))
29517 if (sched_verbose > 1)
29518 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
29519 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
29520 /* Swap 2 top elements of ready list. */
29521 insn = ready[n_ready - 1];
29522 ready[n_ready - 1] = ready[n_ready - 2];
29523 ready[n_ready - 2] = insn;
29525 return issue_rate;
29528 static bool
29529 ix86_class_likely_spilled_p (reg_class_t);
29531 /* Return true if the lhs of INSN is a HW function argument register; set
29532 IS_SPILLED to true if it is a likely-spilled HW register. */
29533 static bool
29534 insn_is_function_arg (rtx insn, bool* is_spilled)
29536 rtx dst;
29538 if (!NONDEBUG_INSN_P (insn))
29539 return false;
29540 /* Call instructions are not movable; ignore them. */
29541 if (CALL_P (insn))
29542 return false;
29543 insn = PATTERN (insn);
29544 if (GET_CODE (insn) == PARALLEL)
29545 insn = XVECEXP (insn, 0, 0);
29546 if (GET_CODE (insn) != SET)
29547 return false;
29548 dst = SET_DEST (insn);
29549 if (REG_P (dst) && HARD_REGISTER_P (dst)
29550 && ix86_function_arg_regno_p (REGNO (dst)))
29552 /* Is it likely spilled HW register? */
29553 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29554 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29555 *is_spilled = true;
29556 return true;
29558 return false;
29561 /* Add output dependencies for a chain of adjacent function arguments only if
29562 there is a move to a likely-spilled HW register. Return the first argument
29563 if at least one dependence was added or NULL otherwise. */
29564 static rtx_insn *
29565 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29567 rtx_insn *insn;
29568 rtx_insn *last = call;
29569 rtx_insn *first_arg = NULL;
29570 bool is_spilled = false;
29572 head = PREV_INSN (head);
29574 /* Find the argument-passing instruction nearest to the call. */
29575 while (true)
29577 last = PREV_INSN (last);
29578 if (last == head)
29579 return NULL;
29580 if (!NONDEBUG_INSN_P (last))
29581 continue;
29582 if (insn_is_function_arg (last, &is_spilled))
29583 break;
29584 return NULL;
29587 first_arg = last;
29588 while (true)
29590 insn = PREV_INSN (last);
29591 if (!INSN_P (insn))
29592 break;
29593 if (insn == head)
29594 break;
29595 if (!NONDEBUG_INSN_P (insn))
29597 last = insn;
29598 continue;
29600 if (insn_is_function_arg (insn, &is_spilled))
29602 /* Add an output dependence between two function arguments if the chain
29603 of output arguments contains likely-spilled HW registers. */
29604 if (is_spilled)
29605 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29606 first_arg = last = insn;
29608 else
29609 break;
29611 if (!is_spilled)
29612 return NULL;
29613 return first_arg;
29616 /* Add output or anti dependency from insn to first_arg to restrict its code
29617 motion. */
29618 static void
29619 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29621 rtx set;
29622 rtx tmp;
29624 /* Add anti dependencies for bounds stores. */
29625 if (INSN_P (insn)
29626 && GET_CODE (PATTERN (insn)) == PARALLEL
29627 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29628 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29630 add_dependence (first_arg, insn, REG_DEP_ANTI);
29631 return;
29634 set = single_set (insn);
29635 if (!set)
29636 return;
29637 tmp = SET_DEST (set);
29638 if (REG_P (tmp))
29640 /* Add output dependency to the first function argument. */
29641 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29642 return;
29644 /* Add anti dependency. */
29645 add_dependence (first_arg, insn, REG_DEP_ANTI);
29648 /* Avoid cross-block motion of a function argument by adding a dependency
29649 from the first non-jump instruction in bb. */
29650 static void
29651 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29653 rtx_insn *insn = BB_END (bb);
29655 while (insn)
29657 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29659 rtx set = single_set (insn);
29660 if (set)
29662 avoid_func_arg_motion (arg, insn);
29663 return;
29666 if (insn == BB_HEAD (bb))
29667 return;
29668 insn = PREV_INSN (insn);
29672 /* Hook for pre-reload schedule - avoid motion of function arguments
29673 passed in likely spilled HW registers. */
29674 static void
29675 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29677 rtx_insn *insn;
29678 rtx_insn *first_arg = NULL;
29679 if (reload_completed)
29680 return;
29681 while (head != tail && DEBUG_INSN_P (head))
29682 head = NEXT_INSN (head);
29683 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29684 if (INSN_P (insn) && CALL_P (insn))
29686 first_arg = add_parameter_dependencies (insn, head);
29687 if (first_arg)
29689 /* Add a dependee for the first argument to predecessors only if the
29690 region contains more than one block. */
29691 basic_block bb = BLOCK_FOR_INSN (insn);
29692 int rgn = CONTAINING_RGN (bb->index);
29693 int nr_blks = RGN_NR_BLOCKS (rgn);
29694 /* Skip trivial regions and region head blocks that can have
29695 predecessors outside of region. */
29696 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29698 edge e;
29699 edge_iterator ei;
29701 /* Regions are SCCs with the exception of selective
29702 scheduling with pipelining of outer blocks enabled.
29703 So also check that immediate predecessors of a non-head
29704 block are in the same region. */
29705 FOR_EACH_EDGE (e, ei, bb->preds)
29707 /* Avoid creating loop-carried dependencies through
29708 using topological ordering in the region. */
29709 if (rgn == CONTAINING_RGN (e->src->index)
29710 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29711 add_dependee_for_func_arg (first_arg, e->src);
29714 insn = first_arg;
29715 if (insn == head)
29716 break;
29719 else if (first_arg)
29720 avoid_func_arg_motion (first_arg, insn);
29723 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29724 HW registers to maximum, to schedule them as soon as possible. These are
29725 moves from function argument registers at the top of the function entry
29726 and moves from function return value registers after call. */
29727 static int
29728 ix86_adjust_priority (rtx_insn *insn, int priority)
29730 rtx set;
29732 if (reload_completed)
29733 return priority;
29735 if (!NONDEBUG_INSN_P (insn))
29736 return priority;
29738 set = single_set (insn);
29739 if (set)
29741 rtx tmp = SET_SRC (set);
29742 if (REG_P (tmp)
29743 && HARD_REGISTER_P (tmp)
29744 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29745 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29746 return current_sched_info->sched_max_insns_priority;
29749 return priority;
29752 /* Model decoder of Core 2/i7.
29753 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
29754 track the instruction fetch block boundaries and make sure that long
29755 (9+ bytes) instructions are assigned to D0. */
29757 /* Maximum length of an insn that can be handled by
29758 a secondary decoder unit. '8' for Core 2/i7. */
29759 static int core2i7_secondary_decoder_max_insn_size;
29761 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
29762 '16' for Core 2/i7. */
29763 static int core2i7_ifetch_block_size;
29765 /* Maximum number of instructions decoder can handle per cycle.
29766 '6' for Core 2/i7. */
29767 static int core2i7_ifetch_block_max_insns;
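/* Editorial aside (not part of i386.c): with the values set up in
   ix86_sched_init_global below (8, 16 and 6 respectively), an insn longer
   than 8 bytes can only go to the first decoder, and a fetch block is closed
   once it reaches 16 bytes or 6 instructions.  */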
29769 typedef struct ix86_first_cycle_multipass_data_ *
29770 ix86_first_cycle_multipass_data_t;
29771 typedef const struct ix86_first_cycle_multipass_data_ *
29772 const_ix86_first_cycle_multipass_data_t;
29774 /* A variable to store target state across calls to max_issue within
29775 one cycle. */
29776 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
29777 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
29779 /* Initialize DATA. */
29780 static void
29781 core2i7_first_cycle_multipass_init (void *_data)
29783 ix86_first_cycle_multipass_data_t data
29784 = (ix86_first_cycle_multipass_data_t) _data;
29786 data->ifetch_block_len = 0;
29787 data->ifetch_block_n_insns = 0;
29788 data->ready_try_change = NULL;
29789 data->ready_try_change_size = 0;
29792 /* Advancing the cycle; reset ifetch block counts. */
29793 static void
29794 core2i7_dfa_post_advance_cycle (void)
29796 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
29798 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29800 data->ifetch_block_len = 0;
29801 data->ifetch_block_n_insns = 0;
29804 static int min_insn_size (rtx_insn *);
29806 /* Filter out insns from ready_try that the core will not be able to issue
29807 on current cycle due to decoder. */
29808 static void
29809 core2i7_first_cycle_multipass_filter_ready_try
29810 (const_ix86_first_cycle_multipass_data_t data,
29811 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
29813 while (n_ready--)
29815 rtx_insn *insn;
29816 int insn_size;
29818 if (ready_try[n_ready])
29819 continue;
29821 insn = get_ready_element (n_ready);
29822 insn_size = min_insn_size (insn);
29824 if (/* If this is too long an insn for a secondary decoder ... */
29825 (!first_cycle_insn_p
29826 && insn_size > core2i7_secondary_decoder_max_insn_size)
29827 /* ... or it would not fit into the ifetch block ... */
29828 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
29829 /* ... or the decoder is full already ... */
29830 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
29831 /* ... mask the insn out. */
29833 ready_try[n_ready] = 1;
29835 if (data->ready_try_change)
29836 bitmap_set_bit (data->ready_try_change, n_ready);
29841 /* Prepare for a new round of multipass lookahead scheduling. */
29842 static void
29843 core2i7_first_cycle_multipass_begin (void *_data,
29844 signed char *ready_try, int n_ready,
29845 bool first_cycle_insn_p)
29847 ix86_first_cycle_multipass_data_t data
29848 = (ix86_first_cycle_multipass_data_t) _data;
29849 const_ix86_first_cycle_multipass_data_t prev_data
29850 = ix86_first_cycle_multipass_data;
29852 /* Restore the state from the end of the previous round. */
29853 data->ifetch_block_len = prev_data->ifetch_block_len;
29854 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
29856 /* Filter instructions that cannot be issued on current cycle due to
29857 decoder restrictions. */
29858 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29859 first_cycle_insn_p);
29862 /* INSN is being issued in current solution. Account for its impact on
29863 the decoder model. */
29864 static void
29865 core2i7_first_cycle_multipass_issue (void *_data,
29866 signed char *ready_try, int n_ready,
29867 rtx_insn *insn, const void *_prev_data)
29869 ix86_first_cycle_multipass_data_t data
29870 = (ix86_first_cycle_multipass_data_t) _data;
29871 const_ix86_first_cycle_multipass_data_t prev_data
29872 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
29874 int insn_size = min_insn_size (insn);
29876 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
29877 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
29878 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
29879 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
29881 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
29882 if (!data->ready_try_change)
29884 data->ready_try_change = sbitmap_alloc (n_ready);
29885 data->ready_try_change_size = n_ready;
29887 else if (data->ready_try_change_size < n_ready)
29889 data->ready_try_change = sbitmap_resize (data->ready_try_change,
29890 n_ready, 0);
29891 data->ready_try_change_size = n_ready;
29893 bitmap_clear (data->ready_try_change);
29895 /* Filter out insns from ready_try that the core will not be able to issue
29896 on current cycle due to decoder. */
29897 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
29898 false);
29901 /* Revert the effect on ready_try. */
29902 static void
29903 core2i7_first_cycle_multipass_backtrack (const void *_data,
29904 signed char *ready_try,
29905 int n_ready ATTRIBUTE_UNUSED)
29907 const_ix86_first_cycle_multipass_data_t data
29908 = (const_ix86_first_cycle_multipass_data_t) _data;
29909 unsigned int i = 0;
29910 sbitmap_iterator sbi;
29912 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
29913 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
29915 ready_try[i] = 0;
29919 /* Save the result of multipass lookahead scheduling for the next round. */
29920 static void
29921 core2i7_first_cycle_multipass_end (const void *_data)
29923 const_ix86_first_cycle_multipass_data_t data
29924 = (const_ix86_first_cycle_multipass_data_t) _data;
29925 ix86_first_cycle_multipass_data_t next_data
29926 = ix86_first_cycle_multipass_data;
29928 if (data != NULL)
29930 next_data->ifetch_block_len = data->ifetch_block_len;
29931 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
29935 /* Deallocate target data. */
29936 static void
29937 core2i7_first_cycle_multipass_fini (void *_data)
29939 ix86_first_cycle_multipass_data_t data
29940 = (ix86_first_cycle_multipass_data_t) _data;
29942 if (data->ready_try_change)
29944 sbitmap_free (data->ready_try_change);
29945 data->ready_try_change = NULL;
29946 data->ready_try_change_size = 0;
29950 /* Prepare for scheduling pass. */
29951 static void
29952 ix86_sched_init_global (FILE *, int, int)
29954 /* Install scheduling hooks for current CPU. Some of these hooks are used
29955 in time-critical parts of the scheduler, so we only set them up when
29956 they are actually used. */
29957 switch (ix86_tune)
29959 case PROCESSOR_CORE2:
29960 case PROCESSOR_NEHALEM:
29961 case PROCESSOR_SANDYBRIDGE:
29962 case PROCESSOR_HASWELL:
29963 /* Do not perform multipass scheduling for pre-reload schedule
29964 to save compile time. */
29965 if (reload_completed)
29967 targetm.sched.dfa_post_advance_cycle
29968 = core2i7_dfa_post_advance_cycle;
29969 targetm.sched.first_cycle_multipass_init
29970 = core2i7_first_cycle_multipass_init;
29971 targetm.sched.first_cycle_multipass_begin
29972 = core2i7_first_cycle_multipass_begin;
29973 targetm.sched.first_cycle_multipass_issue
29974 = core2i7_first_cycle_multipass_issue;
29975 targetm.sched.first_cycle_multipass_backtrack
29976 = core2i7_first_cycle_multipass_backtrack;
29977 targetm.sched.first_cycle_multipass_end
29978 = core2i7_first_cycle_multipass_end;
29979 targetm.sched.first_cycle_multipass_fini
29980 = core2i7_first_cycle_multipass_fini;
29982 /* Set decoder parameters. */
29983 core2i7_secondary_decoder_max_insn_size = 8;
29984 core2i7_ifetch_block_size = 16;
29985 core2i7_ifetch_block_max_insns = 6;
29986 break;
29988 /* Fall through. */
29989 default:
29990 targetm.sched.dfa_post_advance_cycle = NULL;
29991 targetm.sched.first_cycle_multipass_init = NULL;
29992 targetm.sched.first_cycle_multipass_begin = NULL;
29993 targetm.sched.first_cycle_multipass_issue = NULL;
29994 targetm.sched.first_cycle_multipass_backtrack = NULL;
29995 targetm.sched.first_cycle_multipass_end = NULL;
29996 targetm.sched.first_cycle_multipass_fini = NULL;
29997 break;
30002 /* Compute the alignment given to a constant that is being placed in memory.
30003 EXP is the constant and ALIGN is the alignment that the object would
30004 ordinarily have.
30005 The value of this function is used instead of that alignment to align
30006 the object. */
30009 ix86_constant_alignment (tree exp, int align)
30011 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
30012 || TREE_CODE (exp) == INTEGER_CST)
30014 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
30015 return 64;
30016 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
30017 return 128;
30019 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
30020 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
30021 return BITS_PER_WORD;
30023 return align;
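/* Editorial aside (not part of i386.c): per the function above, a DFmode
   constant is padded up to 64-bit alignment, a constant whose mode is
   128-bit aligned (per ALIGN_MODE_128) up to 128 bits, and, unless optimizing
   for size, a string constant of 31 or more characters up to the word size,
   presumably so that word-sized string operations stay aligned.  */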
30026 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
30027 the data type, and ALIGN is the alignment that the object would
30028 ordinarily have. */
30030 static int
30031 iamcu_alignment (tree type, int align)
30033 enum machine_mode mode;
30035 if (align < 32 || TYPE_USER_ALIGN (type))
30036 return align;
30038 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
30039 bytes. */
30040 mode = TYPE_MODE (strip_array_types (type));
30041 switch (GET_MODE_CLASS (mode))
30043 case MODE_INT:
30044 case MODE_COMPLEX_INT:
30045 case MODE_COMPLEX_FLOAT:
30046 case MODE_FLOAT:
30047 case MODE_DECIMAL_FLOAT:
30048 return 32;
30049 default:
30050 return align;
30054 /* Compute the alignment for a static variable.
30055 TYPE is the data type, and ALIGN is the alignment that
30056 the object would ordinarily have. The value of this function is used
30057 instead of that alignment to align the object. */
30060 ix86_data_alignment (tree type, int align, bool opt)
30062 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
30063 for symbols from other compilation units or symbols that don't need
30064 to bind locally. In order to preserve some ABI compatibility with
30065 those compilers, ensure we don't decrease alignment from what we
30066 used to assume. */
30068 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
30070 /* A data structure equal to or greater than the size of a cache line
30071 (64 bytes in the Pentium 4 and other recent Intel processors, including
30072 processors based on the Intel Core microarchitecture) should be aligned
30073 so that its base address is a multiple of the cache line size. */
30075 int max_align
30076 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
30078 if (max_align < BITS_PER_WORD)
30079 max_align = BITS_PER_WORD;
30081 switch (ix86_align_data_type)
30083 case ix86_align_data_type_abi: opt = false; break;
30084 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
30085 case ix86_align_data_type_cacheline: break;
30088 if (TARGET_IAMCU)
30089 align = iamcu_alignment (type, align);
30091 if (opt
30092 && AGGREGATE_TYPE_P (type)
30093 && TYPE_SIZE (type)
30094 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
30096 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
30097 && align < max_align_compat)
30098 align = max_align_compat;
30099 if (wi::geu_p (TYPE_SIZE (type), max_align)
30100 && align < max_align)
30101 align = max_align;
30104 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
30105 to a 16-byte boundary. */
30106 if (TARGET_64BIT)
30108 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
30109 && TYPE_SIZE (type)
30110 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30111 && wi::geu_p (TYPE_SIZE (type), 128)
30112 && align < 128)
30113 return 128;
30116 if (!opt)
30117 return align;
30119 if (TREE_CODE (type) == ARRAY_TYPE)
30121 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30122 return 64;
30123 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30124 return 128;
30126 else if (TREE_CODE (type) == COMPLEX_TYPE)
30129 if (TYPE_MODE (type) == DCmode && align < 64)
30130 return 64;
30131 if ((TYPE_MODE (type) == XCmode
30132 || TYPE_MODE (type) == TCmode) && align < 128)
30133 return 128;
30135 else if ((TREE_CODE (type) == RECORD_TYPE
30136 || TREE_CODE (type) == UNION_TYPE
30137 || TREE_CODE (type) == QUAL_UNION_TYPE)
30138 && TYPE_FIELDS (type))
30140 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30141 return 64;
30142 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30143 return 128;
30145 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30146 || TREE_CODE (type) == INTEGER_TYPE)
30148 if (TYPE_MODE (type) == DFmode && align < 64)
30149 return 64;
30150 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30151 return 128;
30154 return align;
30157 /* Compute the alignment for a local variable or a stack slot. EXP is
30158 the data type or decl itself, MODE is the widest mode available and
30159 ALIGN is the alignment that the object would ordinarily have. The
30160 value of this macro is used instead of that alignment to align the
30161 object. */
30163 unsigned int
30164 ix86_local_alignment (tree exp, machine_mode mode,
30165 unsigned int align)
30167 tree type, decl;
30169 if (exp && DECL_P (exp))
30171 type = TREE_TYPE (exp);
30172 decl = exp;
30174 else
30176 type = exp;
30177 decl = NULL;
30180 /* Don't do dynamic stack realignment for long long objects with
30181 -mpreferred-stack-boundary=2. */
30182 if (!TARGET_64BIT
30183 && align == 64
30184 && ix86_preferred_stack_boundary < 64
30185 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
30186 && (!type || !TYPE_USER_ALIGN (type))
30187 && (!decl || !DECL_USER_ALIGN (decl)))
30188 align = 32;
30190 /* If TYPE is NULL, we are allocating a stack slot for caller-save
30191 register in MODE. We will return the largest alignment of XF
30192 and DF. */
30193 if (!type)
30195 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
30196 align = GET_MODE_ALIGNMENT (DFmode);
30197 return align;
30200 /* Don't increase alignment for Intel MCU psABI. */
30201 if (TARGET_IAMCU)
30202 return align;
30204 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
30205 to a 16-byte boundary. The exact wording is:
30207 An array uses the same alignment as its elements, except that a local or
30208 global array variable of length at least 16 bytes or
30209 a C99 variable-length array variable always has alignment of at least 16 bytes.
30211 This was added to allow use of aligned SSE instructions on arrays. This
30212 rule is meant for static storage (where the compiler cannot do the analysis
30213 by itself). We follow it for automatic variables only when convenient.
30214 We fully control everything in the function being compiled, and functions
30215 from other units cannot rely on the alignment.
30217 Exclude the va_list type. It is the common case of a local array where
30218 we cannot benefit from the alignment.
30220 TODO: Probably one should optimize for size only when var is not escaping. */
30221 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30222 && TARGET_SSE)
30224 if (AGGREGATE_TYPE_P (type)
30225 && (va_list_type_node == NULL_TREE
30226 || (TYPE_MAIN_VARIANT (type)
30227 != TYPE_MAIN_VARIANT (va_list_type_node)))
30228 && TYPE_SIZE (type)
30229 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30230 && wi::geu_p (TYPE_SIZE (type), 16)
30231 && align < 128)
30232 return 128;
30234 if (TREE_CODE (type) == ARRAY_TYPE)
30236 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30237 return 64;
30238 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30239 return 128;
30241 else if (TREE_CODE (type) == COMPLEX_TYPE)
30243 if (TYPE_MODE (type) == DCmode && align < 64)
30244 return 64;
30245 if ((TYPE_MODE (type) == XCmode
30246 || TYPE_MODE (type) == TCmode) && align < 128)
30247 return 128;
30249 else if ((TREE_CODE (type) == RECORD_TYPE
30250 || TREE_CODE (type) == UNION_TYPE
30251 || TREE_CODE (type) == QUAL_UNION_TYPE)
30252 && TYPE_FIELDS (type))
30254 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30255 return 64;
30256 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30257 return 128;
30259 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30260 || TREE_CODE (type) == INTEGER_TYPE)
30263 if (TYPE_MODE (type) == DFmode && align < 64)
30264 return 64;
30265 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30266 return 128;
30268 return align;
30271 /* Compute the minimum required alignment for dynamic stack realignment
30272 purposes for a local variable, parameter or a stack slot. EXP is
30273 the data type or decl itself, MODE is its mode and ALIGN is the
30274 alignment that the object would ordinarily have. */
30276 unsigned int
30277 ix86_minimum_alignment (tree exp, machine_mode mode,
30278 unsigned int align)
30280 tree type, decl;
30282 if (exp && DECL_P (exp))
30284 type = TREE_TYPE (exp);
30285 decl = exp;
30287 else
30289 type = exp;
30290 decl = NULL;
30293 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30294 return align;
30296 /* Don't do dynamic stack realignment for long long objects with
30297 -mpreferred-stack-boundary=2. */
30298 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30299 && (!type || !TYPE_USER_ALIGN (type))
30300 && (!decl || !DECL_USER_ALIGN (decl)))
30302 gcc_checking_assert (!TARGET_STV);
30303 return 32;
30306 return align;
30309 /* Find a location for the static chain incoming to a nested function.
30310 This is a register, unless all free registers are used by arguments. */
30312 static rtx
30313 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30315 unsigned regno;
30317 /* While this function won't be called by the middle-end when a static
30318 chain isn't needed, it's also used throughout the backend so it's
30319 easiest to keep this check centralized. */
30320 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
30321 return NULL;
30323 if (TARGET_64BIT)
30325 /* We always use R10 in 64-bit mode. */
30326 regno = R10_REG;
30328 else
30330 const_tree fntype, fndecl;
30331 unsigned int ccvt;
30333 /* By default in 32-bit mode we use ECX to pass the static chain. */
30334 regno = CX_REG;
30336 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30338 fntype = TREE_TYPE (fndecl_or_type);
30339 fndecl = fndecl_or_type;
30341 else
30343 fntype = fndecl_or_type;
30344 fndecl = NULL;
30347 ccvt = ix86_get_callcvt (fntype);
30348 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30350 /* Fastcall functions use ecx/edx for arguments, which leaves
30351 us with EAX for the static chain.
30352 Thiscall functions use ecx for arguments, which also
30353 leaves us with EAX for the static chain. */
30354 regno = AX_REG;
30356 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30358 /* Thiscall functions use ecx for arguments, which leaves
30359 us with EAX and EDX for the static chain.
30360 For ABI compatibility we use EAX. */
30361 regno = AX_REG;
30363 else if (ix86_function_regparm (fntype, fndecl) == 3)
30365 /* For regparm 3, we have no free call-clobbered registers in
30366 which to store the static chain. In order to implement this,
30367 we have the trampoline push the static chain to the stack.
30368 However, we can't push a value below the return address when
30369 we call the nested function directly, so we have to use an
30370 alternate entry point. For this we use ESI, and have the
30371 alternate entry point push ESI, so that things appear the
30372 same once we're executing the nested function. */
30373 if (incoming_p)
30375 if (fndecl == current_function_decl)
30376 ix86_static_chain_on_stack = true;
30377 return gen_frame_mem (SImode,
30378 plus_constant (Pmode,
30379 arg_pointer_rtx, -8));
30381 regno = SI_REG;
30385 return gen_rtx_REG (Pmode, regno);
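/* To summarize the choices above: 64-bit code always uses %r10; 32-bit code
   uses %ecx by default and %eax for fastcall/thiscall functions; for
   regparm(3) functions the incoming chain lives in a stack slot below the
   return address, with %esi carrying it to the alternate entry point.  */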
30388 /* Emit RTL insns to initialize the variable parts of a trampoline.
30389 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30390 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30391 to be passed to the target function. */
30393 static void
30394 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30396 rtx mem, fnaddr;
30397 int opcode;
30398 int offset = 0;
30400 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30402 if (TARGET_64BIT)
30404 int size;
30406 /* Load the function address to r11. Try to load address using
30407 the shorter movl instead of movabs. We may want to support
30408 movq for kernel mode, but the kernel does not use trampolines at
30409 the moment. FNADDR is a 32-bit address and may not be in
30410 DImode when ptr_mode == SImode. Always use movl in this
30411 case. */
30412 if (ptr_mode == SImode
30413 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30415 fnaddr = copy_addr_to_reg (fnaddr);
30417 mem = adjust_address (m_tramp, HImode, offset);
30418 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30420 mem = adjust_address (m_tramp, SImode, offset + 2);
30421 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30422 offset += 6;
30424 else
30426 mem = adjust_address (m_tramp, HImode, offset);
30427 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30429 mem = adjust_address (m_tramp, DImode, offset + 2);
30430 emit_move_insn (mem, fnaddr);
30431 offset += 10;
30434 /* Load static chain using movabs to r10. Use the shorter movl
30435 instead of movabs when ptr_mode == SImode. */
30436 if (ptr_mode == SImode)
30438 opcode = 0xba41;
30439 size = 6;
30441 else
30443 opcode = 0xba49;
30444 size = 10;
30447 mem = adjust_address (m_tramp, HImode, offset);
30448 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30450 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30451 emit_move_insn (mem, chain_value);
30452 offset += size;
30454 /* Jump to r11; the last (unused) byte is a nop, only there to
30455 pad the write out to a single 32-bit store. */
30456 mem = adjust_address (m_tramp, SImode, offset);
30457 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30458 offset += 4;
30460 else
30462 rtx disp, chain;
30464 /* Depending on the static chain location, either load a register
30465 with a constant, or push the constant to the stack. All of the
30466 instructions are the same size. */
30467 chain = ix86_static_chain (fndecl, true);
30468 if (REG_P (chain))
30470 switch (REGNO (chain))
30472 case AX_REG:
30473 opcode = 0xb8; break;
30474 case CX_REG:
30475 opcode = 0xb9; break;
30476 default:
30477 gcc_unreachable ();
30480 else
30481 opcode = 0x68;
30483 mem = adjust_address (m_tramp, QImode, offset);
30484 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30486 mem = adjust_address (m_tramp, SImode, offset + 1);
30487 emit_move_insn (mem, chain_value);
30488 offset += 5;
30490 mem = adjust_address (m_tramp, QImode, offset);
30491 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30493 mem = adjust_address (m_tramp, SImode, offset + 1);
30495 /* Compute offset from the end of the jmp to the target function.
30496 In the case where the trampoline stores the static chain on
30497 the stack, we need to skip the first insn which pushes the
30498 (call-saved) register static chain; this push is 1 byte. */
30499 offset += 5;
30500 disp = expand_binop (SImode, sub_optab, fnaddr,
30501 plus_constant (Pmode, XEXP (m_tramp, 0),
30502 offset - (MEM_P (chain) ? 1 : 0)),
30503 NULL_RTX, 1, OPTAB_DIRECT);
30504 emit_move_insn (mem, disp);
30507 gcc_assert (offset <= TRAMPOLINE_SIZE);
30509 #ifdef HAVE_ENABLE_EXECUTE_STACK
30510 #ifdef CHECK_EXECUTE_STACK_ENABLED
30511 if (CHECK_EXECUTE_STACK_ENABLED)
30512 #endif
30513 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30514 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
30515 #endif
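/* Putting the pieces above together, the 64-bit trampoline is roughly

     49 bb <fnaddr, 8 bytes>    movabs $fnaddr, %r11   (41 bb + 4 bytes when
                                                        a 32-bit movl suffices)
     49 ba <chain, 8 bytes>     movabs $chain, %r10
     49 ff e3 90                rex.W jmp *%r11; nop

   and the 32-bit trampoline is roughly

     b9 <chain, 4 bytes>        movl $chain, %ecx      (b8 for %eax, 68 for
                                                        a push with regparm 3)
     e9 <disp, 4 bytes>         jmp <fnaddr>  */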
30518 /* The following file contains several enumerations and data structures
30519 built from the definitions in i386-builtin-types.def. */
30521 #include "i386-builtin-types.inc"
30523 /* Table for the ix86 builtin non-function types. */
30524 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30526 /* Retrieve an element from the above table, building some of
30527 the types lazily. */
30529 static tree
30530 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30532 unsigned int index;
30533 tree type, itype;
30535 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30537 type = ix86_builtin_type_tab[(int) tcode];
30538 if (type != NULL)
30539 return type;
30541 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30542 if (tcode <= IX86_BT_LAST_VECT)
30544 machine_mode mode;
30546 index = tcode - IX86_BT_LAST_PRIM - 1;
30547 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30548 mode = ix86_builtin_type_vect_mode[index];
30550 type = build_vector_type_for_mode (itype, mode);
30552 else
30554 int quals;
30556 index = tcode - IX86_BT_LAST_VECT - 1;
30557 if (tcode <= IX86_BT_LAST_PTR)
30558 quals = TYPE_UNQUALIFIED;
30559 else
30560 quals = TYPE_QUAL_CONST;
30562 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30563 if (quals != TYPE_UNQUALIFIED)
30564 itype = build_qualified_type (itype, quals);
30566 type = build_pointer_type (itype);
30569 ix86_builtin_type_tab[(int) tcode] = type;
30570 return type;
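/* For instance, a vector code resolves to
     build_vector_type_for_mode (<element type>, <vector mode>)
   and a const-pointer code (beyond IX86_BT_LAST_PTR) resolves to
     build_pointer_type (build_qualified_type (<base type>, TYPE_QUAL_CONST)),
   with the element/base types and modes coming from the tables generated
   out of i386-builtin-types.def.  */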
30573 /* Table for the ix86 builtin function types. */
30574 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30576 /* Retrieve an element from the above table, building some of
30577 the types lazily. */
30579 static tree
30580 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30582 tree type;
30584 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30586 type = ix86_builtin_func_type_tab[(int) tcode];
30587 if (type != NULL)
30588 return type;
30590 if (tcode <= IX86_BT_LAST_FUNC)
30592 unsigned start = ix86_builtin_func_start[(int) tcode];
30593 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30594 tree rtype, atype, args = void_list_node;
30595 unsigned i;
30597 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30598 for (i = after - 1; i > start; --i)
30600 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30601 args = tree_cons (NULL, atype, args);
30604 type = build_function_type (rtype, args);
30606 else
30608 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30609 enum ix86_builtin_func_type icode;
30611 icode = ix86_builtin_func_alias_base[index];
30612 type = ix86_get_builtin_func_type (icode);
30615 ix86_builtin_func_type_tab[(int) tcode] = type;
30616 return type;
30620 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30621 bdesc_* arrays below should come first, then builtins for each bdesc_*
30622 array in ascending order, so that we can use direct array accesses. */
30623 enum ix86_builtins
30625 IX86_BUILTIN_MASKMOVQ,
30626 IX86_BUILTIN_LDMXCSR,
30627 IX86_BUILTIN_STMXCSR,
30628 IX86_BUILTIN_MASKMOVDQU,
30629 IX86_BUILTIN_PSLLDQ128,
30630 IX86_BUILTIN_CLFLUSH,
30631 IX86_BUILTIN_MONITOR,
30632 IX86_BUILTIN_MWAIT,
30633 IX86_BUILTIN_CLZERO,
30634 IX86_BUILTIN_VEC_INIT_V2SI,
30635 IX86_BUILTIN_VEC_INIT_V4HI,
30636 IX86_BUILTIN_VEC_INIT_V8QI,
30637 IX86_BUILTIN_VEC_EXT_V2DF,
30638 IX86_BUILTIN_VEC_EXT_V2DI,
30639 IX86_BUILTIN_VEC_EXT_V4SF,
30640 IX86_BUILTIN_VEC_EXT_V4SI,
30641 IX86_BUILTIN_VEC_EXT_V8HI,
30642 IX86_BUILTIN_VEC_EXT_V2SI,
30643 IX86_BUILTIN_VEC_EXT_V4HI,
30644 IX86_BUILTIN_VEC_EXT_V16QI,
30645 IX86_BUILTIN_VEC_SET_V2DI,
30646 IX86_BUILTIN_VEC_SET_V4SF,
30647 IX86_BUILTIN_VEC_SET_V4SI,
30648 IX86_BUILTIN_VEC_SET_V8HI,
30649 IX86_BUILTIN_VEC_SET_V4HI,
30650 IX86_BUILTIN_VEC_SET_V16QI,
30651 IX86_BUILTIN_GATHERSIV2DF,
30652 IX86_BUILTIN_GATHERSIV4DF,
30653 IX86_BUILTIN_GATHERDIV2DF,
30654 IX86_BUILTIN_GATHERDIV4DF,
30655 IX86_BUILTIN_GATHERSIV4SF,
30656 IX86_BUILTIN_GATHERSIV8SF,
30657 IX86_BUILTIN_GATHERDIV4SF,
30658 IX86_BUILTIN_GATHERDIV8SF,
30659 IX86_BUILTIN_GATHERSIV2DI,
30660 IX86_BUILTIN_GATHERSIV4DI,
30661 IX86_BUILTIN_GATHERDIV2DI,
30662 IX86_BUILTIN_GATHERDIV4DI,
30663 IX86_BUILTIN_GATHERSIV4SI,
30664 IX86_BUILTIN_GATHERSIV8SI,
30665 IX86_BUILTIN_GATHERDIV4SI,
30666 IX86_BUILTIN_GATHERDIV8SI,
30667 IX86_BUILTIN_VFMSUBSD3_MASK3,
30668 IX86_BUILTIN_VFMSUBSS3_MASK3,
30669 IX86_BUILTIN_GATHER3SIV8SF,
30670 IX86_BUILTIN_GATHER3SIV4SF,
30671 IX86_BUILTIN_GATHER3SIV4DF,
30672 IX86_BUILTIN_GATHER3SIV2DF,
30673 IX86_BUILTIN_GATHER3DIV8SF,
30674 IX86_BUILTIN_GATHER3DIV4SF,
30675 IX86_BUILTIN_GATHER3DIV4DF,
30676 IX86_BUILTIN_GATHER3DIV2DF,
30677 IX86_BUILTIN_GATHER3SIV8SI,
30678 IX86_BUILTIN_GATHER3SIV4SI,
30679 IX86_BUILTIN_GATHER3SIV4DI,
30680 IX86_BUILTIN_GATHER3SIV2DI,
30681 IX86_BUILTIN_GATHER3DIV8SI,
30682 IX86_BUILTIN_GATHER3DIV4SI,
30683 IX86_BUILTIN_GATHER3DIV4DI,
30684 IX86_BUILTIN_GATHER3DIV2DI,
30685 IX86_BUILTIN_SCATTERSIV8SF,
30686 IX86_BUILTIN_SCATTERSIV4SF,
30687 IX86_BUILTIN_SCATTERSIV4DF,
30688 IX86_BUILTIN_SCATTERSIV2DF,
30689 IX86_BUILTIN_SCATTERDIV8SF,
30690 IX86_BUILTIN_SCATTERDIV4SF,
30691 IX86_BUILTIN_SCATTERDIV4DF,
30692 IX86_BUILTIN_SCATTERDIV2DF,
30693 IX86_BUILTIN_SCATTERSIV8SI,
30694 IX86_BUILTIN_SCATTERSIV4SI,
30695 IX86_BUILTIN_SCATTERSIV4DI,
30696 IX86_BUILTIN_SCATTERSIV2DI,
30697 IX86_BUILTIN_SCATTERDIV8SI,
30698 IX86_BUILTIN_SCATTERDIV4SI,
30699 IX86_BUILTIN_SCATTERDIV4DI,
30700 IX86_BUILTIN_SCATTERDIV2DI,
30701 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30702 where all operands are 32-byte or 64-byte wide respectively. */
30703 IX86_BUILTIN_GATHERALTSIV4DF,
30704 IX86_BUILTIN_GATHERALTDIV8SF,
30705 IX86_BUILTIN_GATHERALTSIV4DI,
30706 IX86_BUILTIN_GATHERALTDIV8SI,
30707 IX86_BUILTIN_GATHER3ALTDIV16SF,
30708 IX86_BUILTIN_GATHER3ALTDIV16SI,
30709 IX86_BUILTIN_GATHER3ALTSIV4DF,
30710 IX86_BUILTIN_GATHER3ALTDIV8SF,
30711 IX86_BUILTIN_GATHER3ALTSIV4DI,
30712 IX86_BUILTIN_GATHER3ALTDIV8SI,
30713 IX86_BUILTIN_GATHER3ALTSIV8DF,
30714 IX86_BUILTIN_GATHER3ALTSIV8DI,
30715 IX86_BUILTIN_GATHER3DIV16SF,
30716 IX86_BUILTIN_GATHER3DIV16SI,
30717 IX86_BUILTIN_GATHER3DIV8DF,
30718 IX86_BUILTIN_GATHER3DIV8DI,
30719 IX86_BUILTIN_GATHER3SIV16SF,
30720 IX86_BUILTIN_GATHER3SIV16SI,
30721 IX86_BUILTIN_GATHER3SIV8DF,
30722 IX86_BUILTIN_GATHER3SIV8DI,
30723 IX86_BUILTIN_SCATTERALTSIV8DF,
30724 IX86_BUILTIN_SCATTERALTDIV16SF,
30725 IX86_BUILTIN_SCATTERALTSIV8DI,
30726 IX86_BUILTIN_SCATTERALTDIV16SI,
30727 IX86_BUILTIN_SCATTERDIV16SF,
30728 IX86_BUILTIN_SCATTERDIV16SI,
30729 IX86_BUILTIN_SCATTERDIV8DF,
30730 IX86_BUILTIN_SCATTERDIV8DI,
30731 IX86_BUILTIN_SCATTERSIV16SF,
30732 IX86_BUILTIN_SCATTERSIV16SI,
30733 IX86_BUILTIN_SCATTERSIV8DF,
30734 IX86_BUILTIN_SCATTERSIV8DI,
30735 IX86_BUILTIN_GATHERPFQPD,
30736 IX86_BUILTIN_GATHERPFDPS,
30737 IX86_BUILTIN_GATHERPFDPD,
30738 IX86_BUILTIN_GATHERPFQPS,
30739 IX86_BUILTIN_SCATTERPFDPD,
30740 IX86_BUILTIN_SCATTERPFDPS,
30741 IX86_BUILTIN_SCATTERPFQPD,
30742 IX86_BUILTIN_SCATTERPFQPS,
30743 IX86_BUILTIN_CLWB,
30744 IX86_BUILTIN_CLFLUSHOPT,
30745 IX86_BUILTIN_INFQ,
30746 IX86_BUILTIN_HUGE_VALQ,
30747 IX86_BUILTIN_NANQ,
30748 IX86_BUILTIN_NANSQ,
30749 IX86_BUILTIN_XABORT,
30750 IX86_BUILTIN_ADDCARRYX32,
30751 IX86_BUILTIN_ADDCARRYX64,
30752 IX86_BUILTIN_SBB32,
30753 IX86_BUILTIN_SBB64,
30754 IX86_BUILTIN_RDRAND16_STEP,
30755 IX86_BUILTIN_RDRAND32_STEP,
30756 IX86_BUILTIN_RDRAND64_STEP,
30757 IX86_BUILTIN_RDSEED16_STEP,
30758 IX86_BUILTIN_RDSEED32_STEP,
30759 IX86_BUILTIN_RDSEED64_STEP,
30760 IX86_BUILTIN_MONITORX,
30761 IX86_BUILTIN_MWAITX,
30762 IX86_BUILTIN_CFSTRING,
30763 IX86_BUILTIN_CPU_INIT,
30764 IX86_BUILTIN_CPU_IS,
30765 IX86_BUILTIN_CPU_SUPPORTS,
30766 IX86_BUILTIN_READ_FLAGS,
30767 IX86_BUILTIN_WRITE_FLAGS,
30769 /* All the remaining builtins are tracked in bdesc_* arrays in
30770 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30771 this point. */
30772 #define BDESC(mask, icode, name, code, comparison, flag) \
30773 code,
30774 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30775 code, \
30776 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30777 #define BDESC_END(kind, next_kind)
30779 #include "i386-builtin.def"
30781 #undef BDESC
30782 #undef BDESC_FIRST
30783 #undef BDESC_END
30785 IX86_BUILTIN_MAX,
30787 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30789 /* Now just the aliases for bdesc_* start/end. */
30790 #define BDESC(mask, icode, name, code, comparison, flag)
30791 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30792 #define BDESC_END(kind, next_kind) \
30793 IX86_BUILTIN__BDESC_##kind##_LAST \
30794 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30796 #include "i386-builtin.def"
30798 #undef BDESC
30799 #undef BDESC_FIRST
30800 #undef BDESC_END
30802 /* Just to make sure there is no comma after the last enumerator. */
30803 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30806 /* Table for the ix86 builtin decls. */
30807 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30809 /* Table of all of the builtin functions that are possible with different ISAs
30810 but are waiting to be built until a function is declared to use that
30811 ISA. */
30812 struct builtin_isa {
30813 const char *name; /* function name */
30814 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30815 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30816 bool const_p; /* true if the declaration is constant */
30817 bool leaf_p; /* true if the declaration has leaf attribute */
30818 bool nothrow_p; /* true if the declaration has nothrow attribute */
30819 bool set_and_not_built_p;
30822 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30824 /* Bits that can still enable any inclusion of a builtin. */
30825 static HOST_WIDE_INT deferred_isa_values = 0;
30827 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30828 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30829 function decl in the ix86_builtins array. Returns the function decl, or
30830 NULL_TREE if the builtin was not added.
30832 If the front end has a special hook for builtin functions, delay adding
30833 builtin functions that aren't in the current ISA until the ISA is changed
30834 with function specific optimization. Doing so can save about 300K for the
30835 default compiler. When the builtin is expanded, check at that time whether
30836 it is valid.
30838 If the front end doesn't have a special hook, record all builtins, even if
30839 they aren't in the current ISA, in case the user uses
30840 function specific options for a different ISA, so that we don't get scope
30841 errors if a builtin is added in the middle of a function scope. */
30843 static inline tree
30844 def_builtin (HOST_WIDE_INT mask, const char *name,
30845 enum ix86_builtin_func_type tcode,
30846 enum ix86_builtins code)
30848 tree decl = NULL_TREE;
30850 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30852 ix86_builtins_isa[(int) code].isa = mask;
30854 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
30855 where any set bit means the built-in is enabled, this bit must be *and-ed*
30856 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
30857 means that *both* cpuid bits must be set for the built-in to be available.
30858 Handle this here. */
30859 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30860 mask &= ~OPTION_MASK_ISA_AVX512VL;
30862 mask &= ~OPTION_MASK_ISA_64BIT;
30863 if (mask == 0
30864 || (mask & ix86_isa_flags) != 0
30865 || (lang_hooks.builtin_function
30866 == lang_hooks.builtin_function_ext_scope))
30869 tree type = ix86_get_builtin_func_type (tcode);
30870 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30871 NULL, NULL_TREE);
30872 ix86_builtins[(int) code] = decl;
30873 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30875 else
30877 /* Only a MASK where set_and_not_built_p == true can potentially
30878 include a builtin. */
30879 deferred_isa_values |= mask;
30880 ix86_builtins[(int) code] = NULL_TREE;
30881 ix86_builtins_isa[(int) code].tcode = tcode;
30882 ix86_builtins_isa[(int) code].name = name;
30883 ix86_builtins_isa[(int) code].leaf_p = false;
30884 ix86_builtins_isa[(int) code].nothrow_p = false;
30885 ix86_builtins_isa[(int) code].const_p = false;
30886 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30890 return decl;
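/* A typical call, taken from ix86_init_mmx_sse_builtins below:

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
                  VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);

   With SSE2 enabled this declares the builtin immediately; otherwise the
   request is parked in ix86_builtins_isa (and deferred_isa_values) until
   ix86_add_new_builtins sees the ISA become available.  */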
30893 /* Like def_builtin, but also marks the function decl "const". */
30895 static inline tree
30896 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30897 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30899 tree decl = def_builtin (mask, name, tcode, code);
30900 if (decl)
30901 TREE_READONLY (decl) = 1;
30902 else
30903 ix86_builtins_isa[(int) code].const_p = true;
30905 return decl;
30908 /* Add any new builtin functions for a given ISA that may not have been
30909 declared. This saves a bit of space compared to adding all of the
30910 declarations to the tree, even if we didn't use them. */
30912 static void
30913 ix86_add_new_builtins (HOST_WIDE_INT isa)
30915 if ((isa & deferred_isa_values) == 0)
30916 return;
30918 /* Bits in the ISA value can be removed from the potential ISA values. */
30919 deferred_isa_values &= ~isa;
30921 int i;
30922 tree saved_current_target_pragma = current_target_pragma;
30923 current_target_pragma = NULL_TREE;
30925 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30927 if ((ix86_builtins_isa[i].isa & isa) != 0
30928 && ix86_builtins_isa[i].set_and_not_built_p)
30930 tree decl, type;
30932 /* Don't define the builtin again. */
30933 ix86_builtins_isa[i].set_and_not_built_p = false;
30935 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30936 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30937 type, i, BUILT_IN_MD, NULL,
30938 NULL_TREE);
30940 ix86_builtins[i] = decl;
30941 if (ix86_builtins_isa[i].const_p)
30942 TREE_READONLY (decl) = 1;
30943 if (ix86_builtins_isa[i].leaf_p)
30944 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30945 NULL_TREE);
30946 if (ix86_builtins_isa[i].nothrow_p)
30947 TREE_NOTHROW (decl) = 1;
30951 current_target_pragma = saved_current_target_pragma;
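/* For example, a function compiled with __attribute__((target ("avx2")))
   in a TU built without -mavx2 takes this path: the AVX2 builtins that
   def_builtin parked earlier are only now declared, at external scope, so
   they become callable inside that function.  */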
30954 /* Bits for builtin_description.flag. */
30956 /* Set when we don't support the comparison natively, and should
30957 swap_comparison in order to support it. */
30958 #define BUILTIN_DESC_SWAP_OPERANDS 1
30960 struct builtin_description
30962 const HOST_WIDE_INT mask;
30963 const enum insn_code icode;
30964 const char *const name;
30965 const enum ix86_builtins code;
30966 const enum rtx_code comparison;
30967 const int flag;
30970 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30971 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30972 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30973 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30974 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30975 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30976 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30977 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30978 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30979 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30980 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30981 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30982 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30983 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30984 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30985 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30986 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30987 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30988 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30989 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30990 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30991 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30992 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30993 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30994 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30995 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30996 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30997 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30998 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30999 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
31000 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
31001 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
31002 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
31003 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
31004 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
31005 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
31006 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
31007 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
31008 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
31009 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
31010 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
31011 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
31012 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
31013 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
31014 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
31015 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
31016 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
31017 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
31018 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
31019 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
31020 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
31021 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
31023 #define BDESC(mask, icode, name, code, comparison, flag) \
31024 { mask, icode, name, code, comparison, flag },
31025 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31026 static const struct builtin_description bdesc_##kind[] = \
31028 BDESC (mask, icode, name, code, comparison, flag)
31029 #define BDESC_END(kind, next_kind) \
31032 #include "i386-builtin.def"
31034 #undef BDESC
31035 #undef BDESC_FIRST
31036 #undef BDESC_END
31038 /* TM vector builtins. */
31040 /* Reuse the existing x86-specific `struct builtin_description' because
31041 we're lazy. Add casts to make them fit. */
31042 static const struct builtin_description bdesc_tm[] =
31044 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31045 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31046 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
31047 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31048 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31049 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31050 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
31052 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31053 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31054 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
31055 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31056 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31057 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31058 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
31060 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31061 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31062 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
31063 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31064 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31065 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31066 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
31068 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
31069 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31070 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31073 /* Initialize the transactional memory vector load/store builtins. */
31075 static void
31076 ix86_init_tm_builtins (void)
31078 enum ix86_builtin_func_type ftype;
31079 const struct builtin_description *d;
31080 size_t i;
31081 tree decl;
31082 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31083 tree attrs_log, attrs_type_log;
31085 if (!flag_tm)
31086 return;
31088 /* If there are no builtins defined, we must be compiling in a
31089 language without trans-mem support. */
31090 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31091 return;
31093 /* Use whatever attributes a normal TM load has. */
31094 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31095 attrs_load = DECL_ATTRIBUTES (decl);
31096 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31097 /* Use whatever attributes a normal TM store has. */
31098 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31099 attrs_store = DECL_ATTRIBUTES (decl);
31100 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31101 /* Use whatever attributes a normal TM log has. */
31102 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31103 attrs_log = DECL_ATTRIBUTES (decl);
31104 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31106 for (i = 0, d = bdesc_tm;
31107 i < ARRAY_SIZE (bdesc_tm);
31108 i++, d++)
31110 if ((d->mask & ix86_isa_flags) != 0
31111 || (lang_hooks.builtin_function
31112 == lang_hooks.builtin_function_ext_scope))
31114 tree type, attrs, attrs_type;
31115 enum built_in_function code = (enum built_in_function) d->code;
31117 ftype = (enum ix86_builtin_func_type) d->flag;
31118 type = ix86_get_builtin_func_type (ftype);
31120 if (BUILTIN_TM_LOAD_P (code))
31122 attrs = attrs_load;
31123 attrs_type = attrs_type_load;
31125 else if (BUILTIN_TM_STORE_P (code))
31127 attrs = attrs_store;
31128 attrs_type = attrs_type_store;
31130 else
31132 attrs = attrs_log;
31133 attrs_type = attrs_type_log;
31135 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31136 /* The builtin without the prefix for
31137 calling it directly. */
31138 d->name + strlen ("__builtin_"),
31139 attrs);
31140 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31141 set the TYPE_ATTRIBUTES. */
31142 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31144 set_builtin_decl (code, decl, false);
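/* Note that each entry is registered under its d->name (e.g.
   "__builtin__ITM_WM128"), while the second name passed to
   add_builtin_function -- d->name with the "__builtin_" prefix stripped,
   i.e. "_ITM_WM128" -- lets the runtime entry point be called directly.  */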
31149 /* Macros for verification of enum ix86_builtins order. */
31150 #define BDESC_VERIFY(x, y, z) \
31151 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31152 #define BDESC_VERIFYS(x, y, z) \
31153 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31155 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31156 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31157 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31158 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31159 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31160 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31161 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31162 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31163 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31164 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31165 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31166 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31167 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31168 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31169 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31170 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31171 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31172 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31174 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31175 in the current target ISA, to allow the user to compile particular modules
31176 with target specific options that differ from the command line
31177 options. */
31178 static void
31179 ix86_init_mmx_sse_builtins (void)
31181 const struct builtin_description * d;
31182 enum ix86_builtin_func_type ftype;
31183 size_t i;
31185 /* Add all special builtins with variable number of operands. */
31186 for (i = 0, d = bdesc_special_args;
31187 i < ARRAY_SIZE (bdesc_special_args);
31188 i++, d++)
31190 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31191 if (d->name == 0)
31192 continue;
31194 ftype = (enum ix86_builtin_func_type) d->flag;
31195 def_builtin (d->mask, d->name, ftype, d->code);
31197 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31198 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31199 ARRAY_SIZE (bdesc_special_args) - 1);
31201 /* Add all builtins with variable number of operands. */
31202 for (i = 0, d = bdesc_args;
31203 i < ARRAY_SIZE (bdesc_args);
31204 i++, d++)
31206 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31207 if (d->name == 0)
31208 continue;
31210 ftype = (enum ix86_builtin_func_type) d->flag;
31211 def_builtin_const (d->mask, d->name, ftype, d->code);
31213 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31214 IX86_BUILTIN__BDESC_ARGS_FIRST,
31215 ARRAY_SIZE (bdesc_args) - 1);
31217 /* Add all builtins with rounding. */
31218 for (i = 0, d = bdesc_round_args;
31219 i < ARRAY_SIZE (bdesc_round_args);
31220 i++, d++)
31222 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31223 if (d->name == 0)
31224 continue;
31226 ftype = (enum ix86_builtin_func_type) d->flag;
31227 def_builtin_const (d->mask, d->name, ftype, d->code);
31229 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31230 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31231 ARRAY_SIZE (bdesc_round_args) - 1);
31233 /* pcmpestr[im] insns. */
31234 for (i = 0, d = bdesc_pcmpestr;
31235 i < ARRAY_SIZE (bdesc_pcmpestr);
31236 i++, d++)
31238 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31239 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31240 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31241 else
31242 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31243 def_builtin_const (d->mask, d->name, ftype, d->code);
31245 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31246 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31247 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31249 /* pcmpistr[im] insns. */
31250 for (i = 0, d = bdesc_pcmpistr;
31251 i < ARRAY_SIZE (bdesc_pcmpistr);
31252 i++, d++)
31254 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31255 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31256 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31257 else
31258 ftype = INT_FTYPE_V16QI_V16QI_INT;
31259 def_builtin_const (d->mask, d->name, ftype, d->code);
31261 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31262 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31263 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31265 /* comi/ucomi insns. */
31266 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31268 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31269 if (d->mask == OPTION_MASK_ISA_SSE2)
31270 ftype = INT_FTYPE_V2DF_V2DF;
31271 else
31272 ftype = INT_FTYPE_V4SF_V4SF;
31273 def_builtin_const (d->mask, d->name, ftype, d->code);
31275 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31276 IX86_BUILTIN__BDESC_COMI_FIRST,
31277 ARRAY_SIZE (bdesc_comi) - 1);
31279 /* SSE */
31280 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31281 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31282 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31283 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31285 /* SSE or 3DNow!A */
31286 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31287 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31288 IX86_BUILTIN_MASKMOVQ);
31290 /* SSE2 */
31291 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31292 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31294 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31295 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31296 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31297 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31299 /* SSE3. */
31300 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31301 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31302 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31303 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31305 /* AES */
31306 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31307 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31308 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31309 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31310 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31311 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31312 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31313 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31314 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31315 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31316 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31317 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31319 /* PCLMUL */
31320 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31321 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31323 /* RDRND */
31324 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31325 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31326 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31327 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31328 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31329 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31330 IX86_BUILTIN_RDRAND64_STEP);
31332 /* AVX2 */
31333 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31334 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31335 IX86_BUILTIN_GATHERSIV2DF);
31337 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31338 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31339 IX86_BUILTIN_GATHERSIV4DF);
31341 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31342 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31343 IX86_BUILTIN_GATHERDIV2DF);
31345 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31346 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31347 IX86_BUILTIN_GATHERDIV4DF);
31349 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31350 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31351 IX86_BUILTIN_GATHERSIV4SF);
31353 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31354 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31355 IX86_BUILTIN_GATHERSIV8SF);
31357 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31358 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31359 IX86_BUILTIN_GATHERDIV4SF);
31361 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31362 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31363 IX86_BUILTIN_GATHERDIV8SF);
31365 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31366 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31367 IX86_BUILTIN_GATHERSIV2DI);
31369 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31370 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31371 IX86_BUILTIN_GATHERSIV4DI);
31373 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31374 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31375 IX86_BUILTIN_GATHERDIV2DI);
31377 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31378 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31379 IX86_BUILTIN_GATHERDIV4DI);
31381 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31382 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31383 IX86_BUILTIN_GATHERSIV4SI);
31385 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31386 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31387 IX86_BUILTIN_GATHERSIV8SI);
31389 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31390 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31391 IX86_BUILTIN_GATHERDIV4SI);
31393 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31394 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31395 IX86_BUILTIN_GATHERDIV8SI);
31397 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31398 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31399 IX86_BUILTIN_GATHERALTSIV4DF);
31401 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31402 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31403 IX86_BUILTIN_GATHERALTDIV8SF);
31405 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31406 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31407 IX86_BUILTIN_GATHERALTSIV4DI);
31409 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31410 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31411 IX86_BUILTIN_GATHERALTDIV8SI);
31413 /* AVX512F */
31414 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31415 V16SF_FTYPE_V16SF_PCFLOAT_V16SI_HI_INT,
31416 IX86_BUILTIN_GATHER3SIV16SF);
31418 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31419 V8DF_FTYPE_V8DF_PCDOUBLE_V8SI_QI_INT,
31420 IX86_BUILTIN_GATHER3SIV8DF);
31422 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31423 V8SF_FTYPE_V8SF_PCFLOAT_V8DI_QI_INT,
31424 IX86_BUILTIN_GATHER3DIV16SF);
31426 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31427 V8DF_FTYPE_V8DF_PCDOUBLE_V8DI_QI_INT,
31428 IX86_BUILTIN_GATHER3DIV8DF);
31430 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31431 V16SI_FTYPE_V16SI_PCINT_V16SI_HI_INT,
31432 IX86_BUILTIN_GATHER3SIV16SI);
31434 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31435 V8DI_FTYPE_V8DI_PCINT64_V8SI_QI_INT,
31436 IX86_BUILTIN_GATHER3SIV8DI);
31438 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31439 V8SI_FTYPE_V8SI_PCINT_V8DI_QI_INT,
31440 IX86_BUILTIN_GATHER3DIV16SI);
31442 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31443 V8DI_FTYPE_V8DI_PCINT64_V8DI_QI_INT,
31444 IX86_BUILTIN_GATHER3DIV8DI);
31446 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31447 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31448 IX86_BUILTIN_GATHER3ALTSIV8DF);
31450 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31451 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31452 IX86_BUILTIN_GATHER3ALTDIV16SF);
31454 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31455 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31456 IX86_BUILTIN_GATHER3ALTSIV8DI);
31458 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31459 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31460 IX86_BUILTIN_GATHER3ALTDIV16SI);
31462 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31463 VOID_FTYPE_PFLOAT_HI_V16SI_V16SF_INT,
31464 IX86_BUILTIN_SCATTERSIV16SF);
31466 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31467 VOID_FTYPE_PDOUBLE_QI_V8SI_V8DF_INT,
31468 IX86_BUILTIN_SCATTERSIV8DF);
31470 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31471 VOID_FTYPE_PFLOAT_QI_V8DI_V8SF_INT,
31472 IX86_BUILTIN_SCATTERDIV16SF);
31474 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31475 VOID_FTYPE_PDOUBLE_QI_V8DI_V8DF_INT,
31476 IX86_BUILTIN_SCATTERDIV8DF);
31478 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31479 VOID_FTYPE_PINT_HI_V16SI_V16SI_INT,
31480 IX86_BUILTIN_SCATTERSIV16SI);
31482 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31483 VOID_FTYPE_PLONGLONG_QI_V8SI_V8DI_INT,
31484 IX86_BUILTIN_SCATTERSIV8DI);
31486 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31487 VOID_FTYPE_PINT_QI_V8DI_V8SI_INT,
31488 IX86_BUILTIN_SCATTERDIV16SI);
31490 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31491 VOID_FTYPE_PLONGLONG_QI_V8DI_V8DI_INT,
31492 IX86_BUILTIN_SCATTERDIV8DI);
31494 /* AVX512VL */
31495 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31496 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_QI_INT,
31497 IX86_BUILTIN_GATHER3SIV2DF);
31499 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31500 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_QI_INT,
31501 IX86_BUILTIN_GATHER3SIV4DF);
31503 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31504 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_QI_INT,
31505 IX86_BUILTIN_GATHER3DIV2DF);
31507 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31508 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_QI_INT,
31509 IX86_BUILTIN_GATHER3DIV4DF);
31511 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31512 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_QI_INT,
31513 IX86_BUILTIN_GATHER3SIV4SF);
31515 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31516 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_QI_INT,
31517 IX86_BUILTIN_GATHER3SIV8SF);
31519 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31520 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_QI_INT,
31521 IX86_BUILTIN_GATHER3DIV4SF);
31523 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31524 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_QI_INT,
31525 IX86_BUILTIN_GATHER3DIV8SF);
31527 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31528 V2DI_FTYPE_V2DI_PCINT64_V4SI_QI_INT,
31529 IX86_BUILTIN_GATHER3SIV2DI);
31531 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31532 V4DI_FTYPE_V4DI_PCINT64_V4SI_QI_INT,
31533 IX86_BUILTIN_GATHER3SIV4DI);
31535 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31536 V2DI_FTYPE_V2DI_PCINT64_V2DI_QI_INT,
31537 IX86_BUILTIN_GATHER3DIV2DI);
31539 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31540 V4DI_FTYPE_V4DI_PCINT64_V4DI_QI_INT,
31541 IX86_BUILTIN_GATHER3DIV4DI);
31543 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31544 V4SI_FTYPE_V4SI_PCINT_V4SI_QI_INT,
31545 IX86_BUILTIN_GATHER3SIV4SI);
31547 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31548 V8SI_FTYPE_V8SI_PCINT_V8SI_QI_INT,
31549 IX86_BUILTIN_GATHER3SIV8SI);
31551 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31552 V4SI_FTYPE_V4SI_PCINT_V2DI_QI_INT,
31553 IX86_BUILTIN_GATHER3DIV4SI);
31555 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31556 V4SI_FTYPE_V4SI_PCINT_V4DI_QI_INT,
31557 IX86_BUILTIN_GATHER3DIV8SI);
31559 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31560 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31561 IX86_BUILTIN_GATHER3ALTSIV4DF);
31563 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31564 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31565 IX86_BUILTIN_GATHER3ALTDIV8SF);
31567 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31568 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31569 IX86_BUILTIN_GATHER3ALTSIV4DI);
31571 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31572 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31573 IX86_BUILTIN_GATHER3ALTDIV8SI);
31575 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31576 VOID_FTYPE_PFLOAT_QI_V8SI_V8SF_INT,
31577 IX86_BUILTIN_SCATTERSIV8SF);
31579 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31580 VOID_FTYPE_PFLOAT_QI_V4SI_V4SF_INT,
31581 IX86_BUILTIN_SCATTERSIV4SF);
31583 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31584 VOID_FTYPE_PDOUBLE_QI_V4SI_V4DF_INT,
31585 IX86_BUILTIN_SCATTERSIV4DF);
31587 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31588 VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT,
31589 IX86_BUILTIN_SCATTERSIV2DF);
31591 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31592 VOID_FTYPE_PFLOAT_QI_V4DI_V4SF_INT,
31593 IX86_BUILTIN_SCATTERDIV8SF);
31595 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31596 VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT,
31597 IX86_BUILTIN_SCATTERDIV4SF);
31599 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31600 VOID_FTYPE_PDOUBLE_QI_V4DI_V4DF_INT,
31601 IX86_BUILTIN_SCATTERDIV4DF);
31603 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31604 VOID_FTYPE_PDOUBLE_QI_V2DI_V2DF_INT,
31605 IX86_BUILTIN_SCATTERDIV2DF);
31607 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31608 VOID_FTYPE_PINT_QI_V8SI_V8SI_INT,
31609 IX86_BUILTIN_SCATTERSIV8SI);
31611 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31612 VOID_FTYPE_PINT_QI_V4SI_V4SI_INT,
31613 IX86_BUILTIN_SCATTERSIV4SI);
31615 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31616 VOID_FTYPE_PLONGLONG_QI_V4SI_V4DI_INT,
31617 IX86_BUILTIN_SCATTERSIV4DI);
31619 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31620 VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT,
31621 IX86_BUILTIN_SCATTERSIV2DI);
31623 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31624 VOID_FTYPE_PINT_QI_V4DI_V4SI_INT,
31625 IX86_BUILTIN_SCATTERDIV8SI);
31627 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31628 VOID_FTYPE_PINT_QI_V2DI_V4SI_INT,
31629 IX86_BUILTIN_SCATTERDIV4SI);
31631 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31632 VOID_FTYPE_PLONGLONG_QI_V4DI_V4DI_INT,
31633 IX86_BUILTIN_SCATTERDIV4DI);
31635 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31636 VOID_FTYPE_PLONGLONG_QI_V2DI_V2DI_INT,
31637 IX86_BUILTIN_SCATTERDIV2DI);
31638 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31639 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31640 IX86_BUILTIN_SCATTERALTSIV8DF);
31642 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31643 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31644 IX86_BUILTIN_SCATTERALTDIV16SF);
31646 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31647 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31648 IX86_BUILTIN_SCATTERALTSIV8DI);
31650 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31651 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31652 IX86_BUILTIN_SCATTERALTDIV16SI);
31654 /* AVX512PF */
31655 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31656 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31657 IX86_BUILTIN_GATHERPFDPD);
31658 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31659 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31660 IX86_BUILTIN_GATHERPFDPS);
31661 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31662 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31663 IX86_BUILTIN_GATHERPFQPD);
31664 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31665 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31666 IX86_BUILTIN_GATHERPFQPS);
31667 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31668 VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
31669 IX86_BUILTIN_SCATTERPFDPD);
31670 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31671 VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
31672 IX86_BUILTIN_SCATTERPFDPS);
31673 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31674 VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
31675 IX86_BUILTIN_SCATTERPFQPD);
31676 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31677 VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
31678 IX86_BUILTIN_SCATTERPFQPS);
31680 /* SHA */
31681 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31682 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31683 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31684 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31685 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31686 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31687 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31688 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31689 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31690 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31691 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31692 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31693 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31694 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31696 /* RTM. */
31697 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31698 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31700 /* MMX access to the vec_init patterns. */
31701 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31702 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31704 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31705 V4HI_FTYPE_HI_HI_HI_HI,
31706 IX86_BUILTIN_VEC_INIT_V4HI);
31708 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31709 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31710 IX86_BUILTIN_VEC_INIT_V8QI);
31712 /* Access to the vec_extract patterns. */
31713 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31714 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31715 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31716 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31717 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31718 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31719 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31720 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31721 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31722 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31724 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31725 "__builtin_ia32_vec_ext_v4hi",
31726 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31728 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31729 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31731 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31732 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31734 /* Access to the vec_set patterns. */
31735 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31736 "__builtin_ia32_vec_set_v2di",
31737 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31739 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31740 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31742 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31743 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31745 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31746 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31748 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
31749 "__builtin_ia32_vec_set_v4hi",
31750 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31752 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31753 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31755 /* RDSEED */
31756 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31757 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31758 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31759 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31760 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31761 "__builtin_ia32_rdseed_di_step",
31762 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31764 /* ADCX */
31765 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31766 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31767 def_builtin (OPTION_MASK_ISA_64BIT,
31768 "__builtin_ia32_addcarryx_u64",
31769 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31770 IX86_BUILTIN_ADDCARRYX64);
31772 /* SBB */
31773 def_builtin (0, "__builtin_ia32_sbb_u32",
31774 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31775 def_builtin (OPTION_MASK_ISA_64BIT,
31776 "__builtin_ia32_sbb_u64",
31777 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31778 IX86_BUILTIN_SBB64);
31780 /* Read/write FLAGS. */
31781 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u32",
31782 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31783 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31784 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31785 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u32",
31786 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31787 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31788 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31790 /* CLFLUSHOPT. */
31791 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31792 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31794 /* CLWB. */
31795 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31796 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31798 /* MONITORX and MWAITX. */
31799 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31800 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31801 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31802 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31804 /* CLZERO. */
31805 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31806 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31808 /* Add FMA4 multi-arg argument instructions */
31809 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31811 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31812 if (d->name == 0)
31813 continue;
31815 ftype = (enum ix86_builtin_func_type) d->flag;
31816 def_builtin_const (d->mask, d->name, ftype, d->code);
31818 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31819 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31820 ARRAY_SIZE (bdesc_multi_arg) - 1);
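/* A minimal user-level sketch of one of the builtins declared above
   (INT_FTYPE_PUNSIGNED, which <immintrin.h>'s _rdrand32_step wraps);
   "consume" is just a hypothetical user function:

     unsigned int value;
     if (__builtin_ia32_rdrand32_step (&value))
       consume (value);     // nonzero return: a random value was stored
*/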
31823 static void
31824 ix86_init_mpx_builtins ()
31826 const struct builtin_description * d;
31827 enum ix86_builtin_func_type ftype;
31828 tree decl;
31829 size_t i;
31831 for (i = 0, d = bdesc_mpx;
31832 i < ARRAY_SIZE (bdesc_mpx);
31833 i++, d++)
31835 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
31836 if (d->name == 0)
31837 continue;
31839 ftype = (enum ix86_builtin_func_type) d->flag;
31840 decl = def_builtin (d->mask, d->name, ftype, d->code);
31842 /* Without leaf and nothrow flags on MPX builtins,
31843 abnormal edges may follow their calls when setjmp
31844 is present in the function. Since we may have a lot
31845 of MPX builtin calls, this causes lots of useless
31846 edges and enormous PHI nodes. To avoid this we mark
31847 MPX builtins as leaf and nothrow. */
31848 if (decl)
31850 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31851 NULL_TREE);
31852 TREE_NOTHROW (decl) = 1;
31854 else
31856 ix86_builtins_isa[(int)d->code].leaf_p = true;
31857 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31860 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
31861 IX86_BUILTIN__BDESC_MPX_FIRST,
31862 ARRAY_SIZE (bdesc_mpx) - 1);
31864 for (i = 0, d = bdesc_mpx_const;
31865 i < ARRAY_SIZE (bdesc_mpx_const);
31866 i++, d++)
31868 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
31869 if (d->name == 0)
31870 continue;
31872 ftype = (enum ix86_builtin_func_type) d->flag;
31873 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
31875 if (decl)
31877 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31878 NULL_TREE);
31879 TREE_NOTHROW (decl) = 1;
31881 else
31883 ix86_builtins_isa[(int)d->code].leaf_p = true;
31884 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31887 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
31888 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31889 ARRAY_SIZE (bdesc_mpx_const) - 1);
31891 #undef BDESC_VERIFY
31892 #undef BDESC_VERIFYS
31894 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31895 to return a pointer to VERSION_DECL if the outcome of the expression
31896 formed by PREDICATE_CHAIN is true. This function will be called during
31897 version dispatch to decide which function version to execute. It returns
31898 the basic block at the end, to which more conditions can be added. */
31900 static basic_block
31901 add_condition_to_bb (tree function_decl, tree version_decl,
31902 tree predicate_chain, basic_block new_bb)
31904 gimple *return_stmt;
31905 tree convert_expr, result_var;
31906 gimple *convert_stmt;
31907 gimple *call_cond_stmt;
31908 gimple *if_else_stmt;
31910 basic_block bb1, bb2, bb3;
31911 edge e12, e23;
31913 tree cond_var, and_expr_var = NULL_TREE;
31914 gimple_seq gseq;
31916 tree predicate_decl, predicate_arg;
31918 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31920 gcc_assert (new_bb != NULL);
31921 gseq = bb_seq (new_bb);
31924 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31925 build_fold_addr_expr (version_decl));
31926 result_var = create_tmp_var (ptr_type_node);
31927 convert_stmt = gimple_build_assign (result_var, convert_expr);
31928 return_stmt = gimple_build_return (result_var);
31930 if (predicate_chain == NULL_TREE)
31932 gimple_seq_add_stmt (&gseq, convert_stmt);
31933 gimple_seq_add_stmt (&gseq, return_stmt);
31934 set_bb_seq (new_bb, gseq);
31935 gimple_set_bb (convert_stmt, new_bb);
31936 gimple_set_bb (return_stmt, new_bb);
31937 pop_cfun ();
31938 return new_bb;
31941 while (predicate_chain != NULL)
31943 cond_var = create_tmp_var (integer_type_node);
31944 predicate_decl = TREE_PURPOSE (predicate_chain);
31945 predicate_arg = TREE_VALUE (predicate_chain);
31946 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31947 gimple_call_set_lhs (call_cond_stmt, cond_var);
31949 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31950 gimple_set_bb (call_cond_stmt, new_bb);
31951 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31953 predicate_chain = TREE_CHAIN (predicate_chain);
31955 if (and_expr_var == NULL)
31956 and_expr_var = cond_var;
31957 else
31959 gimple *assign_stmt;
31960 /* Use MIN_EXPR to check if any integer is zero:
31961 and_expr_var = min_expr <cond_var, and_expr_var> */
31962 assign_stmt = gimple_build_assign (and_expr_var,
31963 build2 (MIN_EXPR, integer_type_node,
31964 cond_var, and_expr_var));
31966 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31967 gimple_set_bb (assign_stmt, new_bb);
31968 gimple_seq_add_stmt (&gseq, assign_stmt);
31972 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31973 integer_zero_node,
31974 NULL_TREE, NULL_TREE);
31975 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31976 gimple_set_bb (if_else_stmt, new_bb);
31977 gimple_seq_add_stmt (&gseq, if_else_stmt);
31979 gimple_seq_add_stmt (&gseq, convert_stmt);
31980 gimple_seq_add_stmt (&gseq, return_stmt);
31981 set_bb_seq (new_bb, gseq);
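/* Descriptive note: NEW_BB is split twice below.  BB1 ends with the
   conditional jump on AND_EXPR_VAR, BB2 holds the conversion and
   return of VERSION_DECL (reached on the true edge), and BB3 receives
   the false edge so further conditions can be chained onto it. */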
31983 bb1 = new_bb;
31984 e12 = split_block (bb1, if_else_stmt);
31985 bb2 = e12->dest;
31986 e12->flags &= ~EDGE_FALLTHRU;
31987 e12->flags |= EDGE_TRUE_VALUE;
31989 e23 = split_block (bb2, return_stmt);
31991 gimple_set_bb (convert_stmt, bb2);
31992 gimple_set_bb (return_stmt, bb2);
31994 bb3 = e23->dest;
31995 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31997 remove_edge (e23);
31998 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
32000 pop_cfun ();
32002 return bb3;
32005 /* This parses the attribute arguments to target in DECL and determines
32006 the right builtin to use to match the platform specification.
32007 It returns the priority value for this version decl. If PREDICATE_LIST
32008 is not NULL, it stores the list of cpu features that need to be checked
32009 before dispatching this function. */
32011 static unsigned int
32012 get_builtin_code_for_version (tree decl, tree *predicate_list)
32014 tree attrs;
32015 struct cl_target_option cur_target;
32016 tree target_node;
32017 struct cl_target_option *new_target;
32018 const char *arg_str = NULL;
32019 const char *attrs_str = NULL;
32020 char *tok_str = NULL;
32021 char *token;
32023 /* Priority of i386 features; a greater value means higher priority. This is
32024 used to decide the order in which function dispatch must happen. For
32025 instance, a version specialized for SSE4.2 should be checked for dispatch
32026 before a version for SSE3, as SSE4.2 implies SSE3. */
32027 enum feature_priority
32029 P_ZERO = 0,
32030 P_MMX,
32031 P_SSE,
32032 P_SSE2,
32033 P_SSE3,
32034 P_SSSE3,
32035 P_PROC_SSSE3,
32036 P_SSE4_A,
32037 P_PROC_SSE4_A,
32038 P_SSE4_1,
32039 P_SSE4_2,
32040 P_PROC_SSE4_2,
32041 P_POPCNT,
32042 P_AES,
32043 P_PCLMUL,
32044 P_AVX,
32045 P_PROC_AVX,
32046 P_BMI,
32047 P_PROC_BMI,
32048 P_FMA4,
32049 P_XOP,
32050 P_PROC_XOP,
32051 P_FMA,
32052 P_PROC_FMA,
32053 P_BMI2,
32054 P_AVX2,
32055 P_PROC_AVX2,
32056 P_AVX512F,
32057 P_PROC_AVX512F
32060 enum feature_priority priority = P_ZERO;
32062 /* These are the target attribute strings for which a dispatcher is
32063 available, from fold_builtin_cpu. */
32065 static struct _feature_list
32067 const char *const name;
32068 const enum feature_priority priority;
32070 const feature_list[] =
32072 {"mmx", P_MMX},
32073 {"sse", P_SSE},
32074 {"sse2", P_SSE2},
32075 {"sse3", P_SSE3},
32076 {"sse4a", P_SSE4_A},
32077 {"ssse3", P_SSSE3},
32078 {"sse4.1", P_SSE4_1},
32079 {"sse4.2", P_SSE4_2},
32080 {"popcnt", P_POPCNT},
32081 {"aes", P_AES},
32082 {"pclmul", P_PCLMUL},
32083 {"avx", P_AVX},
32084 {"bmi", P_BMI},
32085 {"fma4", P_FMA4},
32086 {"xop", P_XOP},
32087 {"fma", P_FMA},
32088 {"bmi2", P_BMI2},
32089 {"avx2", P_AVX2},
32090 {"avx512f", P_AVX512F}
32094 static unsigned int NUM_FEATURES
32095 = sizeof (feature_list) / sizeof (struct _feature_list);
32097 unsigned int i;
32099 tree predicate_chain = NULL_TREE;
32100 tree predicate_decl, predicate_arg;
32102 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32103 gcc_assert (attrs != NULL);
32105 attrs = TREE_VALUE (TREE_VALUE (attrs));
32107 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32108 attrs_str = TREE_STRING_POINTER (attrs);
32110 /* Return priority zero for default function. */
32111 if (strcmp (attrs_str, "default") == 0)
32112 return 0;
32114 /* Handle arch= if specified. For priority, set it to be 1 more than
32115 the best instruction set the processor can handle. For instance, if
32116 there is a version for atom and a version for ssse3 (the highest ISA
32117 priority for atom), the atom version must be checked for dispatch
32118 before the ssse3 version. */
32119 if (strstr (attrs_str, "arch=") != NULL)
32121 cl_target_option_save (&cur_target, &global_options);
32122 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32123 &global_options_set);
32125 gcc_assert (target_node);
32126 new_target = TREE_TARGET_OPTION (target_node);
32127 gcc_assert (new_target);
32129 if (new_target->arch_specified && new_target->arch > 0)
32131 switch (new_target->arch)
32133 case PROCESSOR_CORE2:
32134 arg_str = "core2";
32135 priority = P_PROC_SSSE3;
32136 break;
32137 case PROCESSOR_NEHALEM:
32138 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32139 arg_str = "westmere";
32140 else
32141 /* We translate "arch=corei7" and "arch=nehalem" to
32142 "corei7" so that they are mapped to the M_INTEL_COREI7
32143 cpu type, covering all the M_INTEL_COREI7_XXXs. */
32144 arg_str = "corei7";
32145 priority = P_PROC_SSE4_2;
32146 break;
32147 case PROCESSOR_SANDYBRIDGE:
32148 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32149 arg_str = "ivybridge";
32150 else
32151 arg_str = "sandybridge";
32152 priority = P_PROC_AVX;
32153 break;
32154 case PROCESSOR_HASWELL:
32155 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32156 arg_str = "skylake-avx512";
32157 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32158 arg_str = "skylake";
32159 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32160 arg_str = "broadwell";
32161 else
32162 arg_str = "haswell";
32163 priority = P_PROC_AVX2;
32164 break;
32165 case PROCESSOR_BONNELL:
32166 arg_str = "bonnell";
32167 priority = P_PROC_SSSE3;
32168 break;
32169 case PROCESSOR_KNL:
32170 arg_str = "knl";
32171 priority = P_PROC_AVX512F;
32172 break;
32173 case PROCESSOR_SILVERMONT:
32174 arg_str = "silvermont";
32175 priority = P_PROC_SSE4_2;
32176 break;
32177 case PROCESSOR_AMDFAM10:
32178 arg_str = "amdfam10h";
32179 priority = P_PROC_SSE4_A;
32180 break;
32181 case PROCESSOR_BTVER1:
32182 arg_str = "btver1";
32183 priority = P_PROC_SSE4_A;
32184 break;
32185 case PROCESSOR_BTVER2:
32186 arg_str = "btver2";
32187 priority = P_PROC_BMI;
32188 break;
32189 case PROCESSOR_BDVER1:
32190 arg_str = "bdver1";
32191 priority = P_PROC_XOP;
32192 break;
32193 case PROCESSOR_BDVER2:
32194 arg_str = "bdver2";
32195 priority = P_PROC_FMA;
32196 break;
32197 case PROCESSOR_BDVER3:
32198 arg_str = "bdver3";
32199 priority = P_PROC_FMA;
32200 break;
32201 case PROCESSOR_BDVER4:
32202 arg_str = "bdver4";
32203 priority = P_PROC_AVX2;
32204 break;
32205 case PROCESSOR_ZNVER1:
32206 arg_str = "znver1";
32207 priority = P_PROC_AVX2;
32208 break;
32212 cl_target_option_restore (&global_options, &cur_target);
32214 if (predicate_list && arg_str == NULL)
32216 error_at (DECL_SOURCE_LOCATION (decl),
32217 "No dispatcher found for the versioning attributes");
32218 return 0;
32221 if (predicate_list)
32223 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32224 /* For a C string literal the length includes the trailing NULL. */
32225 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32226 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32227 predicate_chain);
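/* The arch= specification handled above is checked at run time with
   __builtin_cpu_is ("<ARCH>"); the feature names processed below are
   checked with __builtin_cpu_supports. */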
32231 /* Process feature name. */
32232 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32233 strcpy (tok_str, attrs_str);
32234 token = strtok (tok_str, ",");
32235 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32237 while (token != NULL)
32239 /* Do not process "arch=" */
32240 if (strncmp (token, "arch=", 5) == 0)
32242 token = strtok (NULL, ",");
32243 continue;
32245 for (i = 0; i < NUM_FEATURES; ++i)
32247 if (strcmp (token, feature_list[i].name) == 0)
32249 if (predicate_list)
32251 predicate_arg = build_string_literal (
32252 strlen (feature_list[i].name) + 1,
32253 feature_list[i].name);
32254 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32255 predicate_chain);
32257 /* Find the maximum priority feature. */
32258 if (feature_list[i].priority > priority)
32259 priority = feature_list[i].priority;
32261 break;
32264 if (predicate_list && i == NUM_FEATURES)
32266 error_at (DECL_SOURCE_LOCATION (decl),
32267 "No dispatcher found for %s", token);
32268 return 0;
32270 token = strtok (NULL, ",");
32272 free (tok_str);
32274 if (predicate_list && predicate_chain == NULL_TREE)
32276 error_at (DECL_SOURCE_LOCATION (decl),
32277 "No dispatcher found for the versioning attributes : %s",
32278 attrs_str);
32279 return 0;
32281 else if (predicate_list)
32283 predicate_chain = nreverse (predicate_chain);
32284 *predicate_list = predicate_chain;
32287 return priority;
32290 /* This compares the priority of target features in function DECL1
32291 and DECL2. It returns positive value if DECL1 is higher priority,
32292 negative value if DECL2 is higher priority and 0 if they are the
32293 same. */
32295 static int
32296 ix86_compare_version_priority (tree decl1, tree decl2)
32298 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32299 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32301 return (int)priority1 - (int)priority2;
32304 /* V1 and V2 point to function versions with different priorities
32305 based on the target ISA. This function compares their priorities. */
32307 static int
32308 feature_compare (const void *v1, const void *v2)
32310 typedef struct _function_version_info
32312 tree version_decl;
32313 tree predicate_chain;
32314 unsigned int dispatch_priority;
32315 } function_version_info;
32317 const function_version_info c1 = *(const function_version_info *)v1;
32318 const function_version_info c2 = *(const function_version_info *)v2;
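/* Subtracting in this order gives a descending sort by dispatch
   priority when used with qsort, so higher-priority versions are
   checked first. */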
32319 return (c2.dispatch_priority - c1.dispatch_priority);
32322 /* This function generates the dispatch function for
32323 multi-versioned functions. DISPATCH_DECL is the function which will
32324 contain the dispatch logic. FNDECLS are the function choices for
32325 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32326 in DISPATCH_DECL in which the dispatch code is generated. */
32328 static int
32329 dispatch_function_versions (tree dispatch_decl,
32330 void *fndecls_p,
32331 basic_block *empty_bb)
32333 tree default_decl;
32334 gimple *ifunc_cpu_init_stmt;
32335 gimple_seq gseq;
32336 int ix;
32337 tree ele;
32338 vec<tree> *fndecls;
32339 unsigned int num_versions = 0;
32340 unsigned int actual_versions = 0;
32341 unsigned int i;
32343 struct _function_version_info
32345 tree version_decl;
32346 tree predicate_chain;
32347 unsigned int dispatch_priority;
32348 }*function_version_info;
32350 gcc_assert (dispatch_decl != NULL
32351 && fndecls_p != NULL
32352 && empty_bb != NULL);
32354 /* fndecls_p is actually a vector. */
32355 fndecls = static_cast<vec<tree> *> (fndecls_p);
32357 /* At least one more version other than the default. */
32358 num_versions = fndecls->length ();
32359 gcc_assert (num_versions >= 2);
32361 function_version_info = (struct _function_version_info *)
32362 XNEWVEC (struct _function_version_info, (num_versions - 1));
32364 /* The first version in the vector is the default decl. */
32365 default_decl = (*fndecls)[0];
32367 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32369 gseq = bb_seq (*empty_bb);
32370 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32371 constructors, so explicitly call __builtin_cpu_init here. */
32372 ifunc_cpu_init_stmt = gimple_build_call_vec (
32373 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32374 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32375 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32376 set_bb_seq (*empty_bb, gseq);
32378 pop_cfun ();
32381 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32383 tree version_decl = ele;
32384 tree predicate_chain = NULL_TREE;
32385 unsigned int priority;
32386 /* Get attribute string, parse it and find the right predicate decl.
32387 The predicate function could be a lengthy combination of many
32388 features, like arch-type and various isa-variants. */
32389 priority = get_builtin_code_for_version (version_decl,
32390 &predicate_chain);
32392 if (predicate_chain == NULL_TREE)
32393 continue;
32395 function_version_info [actual_versions].version_decl = version_decl;
32396 function_version_info [actual_versions].predicate_chain
32397 = predicate_chain;
32398 function_version_info [actual_versions].dispatch_priority = priority;
32399 actual_versions++;
32402 /* Sort the versions according to descending order of dispatch priority. The
32403 priority is based on the ISA. This is not a perfect solution. There
32404 could still be ambiguity. If more than one function version is suitable
32405 to execute, which one should be dispatched? In future, allow the user
32406 to specify a dispatch priority next to the version. */
32407 qsort (function_version_info, actual_versions,
32408 sizeof (struct _function_version_info), feature_compare);
32410 for (i = 0; i < actual_versions; ++i)
32411 *empty_bb = add_condition_to_bb (dispatch_decl,
32412 function_version_info[i].version_decl,
32413 function_version_info[i].predicate_chain,
32414 *empty_bb);
32416 /* Dispatch the default version at the end. */
32417 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32418 NULL, *empty_bb);
32420 free (function_version_info);
32421 return 0;
32424 /* Comparator function to be used in qsort routine to sort attribute
32425 specification strings to "target". */
32427 static int
32428 attr_strcmp (const void *v1, const void *v2)
32430 const char *c1 = *(char *const*)v1;
32431 const char *c2 = *(char *const*)v2;
32432 return strcmp (c1, c2);
32435 /* ARGLIST is the argument to target attribute. This function tokenizes
32436 the comma separated arguments, sorts them and returns a string which
32437 is a unique identifier for the comma separated arguments. It also
32438 replaces non-identifier characters "=,-" with "_". */
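/* For example, the attribute arguments "avx2,arch=core2" yield the
   sorted string "arch_core2_avx2". */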
32440 static char *
32441 sorted_attr_string (tree arglist)
32443 tree arg;
32444 size_t str_len_sum = 0;
32445 char **args = NULL;
32446 char *attr_str, *ret_str;
32447 char *attr = NULL;
32448 unsigned int argnum = 1;
32449 unsigned int i;
32451 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32453 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32454 size_t len = strlen (str);
32455 str_len_sum += len + 1;
32456 if (arg != arglist)
32457 argnum++;
32458 for (i = 0; i < strlen (str); i++)
32459 if (str[i] == ',')
32460 argnum++;
32463 attr_str = XNEWVEC (char, str_len_sum);
32464 str_len_sum = 0;
32465 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
32467 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
32468 size_t len = strlen (str);
32469 memcpy (attr_str + str_len_sum, str, len);
32470 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
32471 str_len_sum += len + 1;
32474 /* Replace "=,-" with "_". */
32475 for (i = 0; i < strlen (attr_str); i++)
32476 if (attr_str[i] == '=' || attr_str[i]== '-')
32477 attr_str[i] = '_';
32479 if (argnum == 1)
32480 return attr_str;
32482 args = XNEWVEC (char *, argnum);
32484 i = 0;
32485 attr = strtok (attr_str, ",");
32486 while (attr != NULL)
32488 args[i] = attr;
32489 i++;
32490 attr = strtok (NULL, ",");
32493 qsort (args, argnum, sizeof (char *), attr_strcmp);
32495 ret_str = XNEWVEC (char, str_len_sum);
32496 str_len_sum = 0;
32497 for (i = 0; i < argnum; i++)
32499 size_t len = strlen (args[i]);
32500 memcpy (ret_str + str_len_sum, args[i], len);
32501 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
32502 str_len_sum += len + 1;
32505 XDELETEVEC (args);
32506 XDELETEVEC (attr_str);
32507 return ret_str;
32510 /* This function changes the assembler name for functions that are
32511 versions. If DECL is a function version and has a "target"
32512 attribute, it appends the attribute string to its assembler name. */
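/* For instance, a version of a function whose assembler name is "foo"
   and which is declared with __attribute__ ((target ("avx2"))) is
   renamed "foo.avx2". */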
32514 static tree
32515 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32517 tree version_attr;
32518 const char *orig_name, *version_string;
32519 char *attr_str, *assembler_name;
32521 if (DECL_DECLARED_INLINE_P (decl)
32522 && lookup_attribute ("gnu_inline",
32523 DECL_ATTRIBUTES (decl)))
32524 error_at (DECL_SOURCE_LOCATION (decl),
32525 "Function versions cannot be marked as gnu_inline,"
32526 " bodies have to be generated");
32528 if (DECL_VIRTUAL_P (decl)
32529 || DECL_VINDEX (decl))
32530 sorry ("Virtual function multiversioning not supported");
32532 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32534 /* target attribute string cannot be NULL. */
32535 gcc_assert (version_attr != NULL_TREE);
32537 orig_name = IDENTIFIER_POINTER (id);
32538 version_string
32539 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32541 if (strcmp (version_string, "default") == 0)
32542 return id;
32544 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32545 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32547 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32549 /* Allow assembler name to be modified if already set. */
32550 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32551 SET_DECL_RTL (decl, NULL);
32553 tree ret = get_identifier (assembler_name);
32554 XDELETEVEC (attr_str);
32555 XDELETEVEC (assembler_name);
32556 return ret;
32559 /* This function returns true if FN1 and FN2 are versions of the same function,
32560 that is, the target strings of the function decls are different. This assumes
32561 that FN1 and FN2 have the same signature. */
32563 static bool
32564 ix86_function_versions (tree fn1, tree fn2)
32566 tree attr1, attr2;
32567 char *target1, *target2;
32568 bool result;
32570 if (TREE_CODE (fn1) != FUNCTION_DECL
32571 || TREE_CODE (fn2) != FUNCTION_DECL)
32572 return false;
32574 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
32575 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
32577 /* At least one function decl should have the target attribute specified. */
32578 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
32579 return false;
32581 /* Diagnose missing target attribute if one of the decls is already
32582 multi-versioned. */
32583 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
32585 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
32587 if (attr2 != NULL_TREE)
32589 std::swap (fn1, fn2);
32590 attr1 = attr2;
32592 error_at (DECL_SOURCE_LOCATION (fn2),
32593 "missing %<target%> attribute for multi-versioned %D",
32594 fn2);
32595 inform (DECL_SOURCE_LOCATION (fn1),
32596 "previous declaration of %D", fn1);
32597 /* Prevent diagnosing of the same error multiple times. */
32598 DECL_ATTRIBUTES (fn2)
32599 = tree_cons (get_identifier ("target"),
32600 copy_node (TREE_VALUE (attr1)),
32601 DECL_ATTRIBUTES (fn2));
32603 return false;
32606 target1 = sorted_attr_string (TREE_VALUE (attr1));
32607 target2 = sorted_attr_string (TREE_VALUE (attr2));
32609 /* The sorted target strings must be different for fn1 and fn2
32610 to be versions. */
32611 if (strcmp (target1, target2) == 0)
32612 result = false;
32613 else
32614 result = true;
32616 XDELETEVEC (target1);
32617 XDELETEVEC (target2);
32619 return result;
32622 static tree
32623 ix86_mangle_decl_assembler_name (tree decl, tree id)
32625 /* For function version, add the target suffix to the assembler name. */
32626 if (TREE_CODE (decl) == FUNCTION_DECL
32627 && DECL_FUNCTION_VERSIONED (decl))
32628 id = ix86_mangle_function_version_assembler_name (decl, id);
32629 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32630 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32631 #endif
32633 return id;
32636 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
32637 is true, append the full path name of the source file. */
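/* For instance, for a decl whose assembler name is "foo",
   make_name (decl, "ifunc", false) returns "foo.ifunc"; when
   MAKE_UNIQUE is true a file-scope unique name is inserted between
   the two. */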
32639 static char *
32640 make_name (tree decl, const char *suffix, bool make_unique)
32642 char *global_var_name;
32643 int name_len;
32644 const char *name;
32645 const char *unique_name = NULL;
32647 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
32649 /* Get a unique name that can be used globally without any chances
32650 of collision at link time. */
32651 if (make_unique)
32652 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
32654 name_len = strlen (name) + strlen (suffix) + 2;
32656 if (make_unique)
32657 name_len += strlen (unique_name) + 1;
32658 global_var_name = XNEWVEC (char, name_len);
32660 /* Use '.' to concatenate names as it is demangler friendly. */
32661 if (make_unique)
32662 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
32663 suffix);
32664 else
32665 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
32667 return global_var_name;
32670 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32672 /* Make a dispatcher declaration for the multi-versioned function DECL.
32673 Calls to DECL function will be replaced with calls to the dispatcher
32674 by the front-end. Return the decl created. */
32676 static tree
32677 make_dispatcher_decl (const tree decl)
32679 tree func_decl;
32680 char *func_name;
32681 tree fn_type, func_type;
32682 bool is_uniq = false;
32684 if (TREE_PUBLIC (decl) == 0)
32685 is_uniq = true;
32687 func_name = make_name (decl, "ifunc", is_uniq);
32689 fn_type = TREE_TYPE (decl);
32690 func_type = build_function_type (TREE_TYPE (fn_type),
32691 TYPE_ARG_TYPES (fn_type));
32693 func_decl = build_fn_decl (func_name, func_type);
32694 XDELETEVEC (func_name);
32695 TREE_USED (func_decl) = 1;
32696 DECL_CONTEXT (func_decl) = NULL_TREE;
32697 DECL_INITIAL (func_decl) = error_mark_node;
32698 DECL_ARTIFICIAL (func_decl) = 1;
32699 /* Mark this func as external, the resolver will flip it again if
32700 it gets generated. */
32701 DECL_EXTERNAL (func_decl) = 1;
32702 /* IFUNCs have to be externally visible. */
32703 TREE_PUBLIC (func_decl) = 1;
32705 return func_decl;
32708 #endif
32710 /* Returns true if DECL is multi-versioned and is the default function,
32711 that is, it is not tagged with a target-specific optimization. */
32713 static bool
32714 is_function_default_version (const tree decl)
32716 if (TREE_CODE (decl) != FUNCTION_DECL
32717 || !DECL_FUNCTION_VERSIONED (decl))
32718 return false;
32719 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32720 gcc_assert (attr);
32721 attr = TREE_VALUE (TREE_VALUE (attr));
32722 return (TREE_CODE (attr) == STRING_CST
32723 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
32726 /* Make a dispatcher declaration for the multi-versioned function DECL.
32727 Calls to DECL function will be replaced with calls to the dispatcher
32728 by the front-end. Returns the decl of the dispatcher function. */
32730 static tree
32731 ix86_get_function_versions_dispatcher (void *decl)
32733 tree fn = (tree) decl;
32734 struct cgraph_node *node = NULL;
32735 struct cgraph_node *default_node = NULL;
32736 struct cgraph_function_version_info *node_v = NULL;
32737 struct cgraph_function_version_info *first_v = NULL;
32739 tree dispatch_decl = NULL;
32741 struct cgraph_function_version_info *default_version_info = NULL;
32743 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32745 node = cgraph_node::get (fn);
32746 gcc_assert (node != NULL);
32748 node_v = node->function_version ();
32749 gcc_assert (node_v != NULL);
32751 if (node_v->dispatcher_resolver != NULL)
32752 return node_v->dispatcher_resolver;
32754 /* Find the default version and make it the first node. */
32755 first_v = node_v;
32756 /* Go to the beginning of the chain. */
32757 while (first_v->prev != NULL)
32758 first_v = first_v->prev;
32759 default_version_info = first_v;
32760 while (default_version_info != NULL)
32762 if (is_function_default_version
32763 (default_version_info->this_node->decl))
32764 break;
32765 default_version_info = default_version_info->next;
32768 /* If there is no default node, just return NULL. */
32769 if (default_version_info == NULL)
32770 return NULL;
32772 /* Make default info the first node. */
32773 if (first_v != default_version_info)
32775 default_version_info->prev->next = default_version_info->next;
32776 if (default_version_info->next)
32777 default_version_info->next->prev = default_version_info->prev;
32778 first_v->prev = default_version_info;
32779 default_version_info->next = first_v;
32780 default_version_info->prev = NULL;
32783 default_node = default_version_info->this_node;
32785 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32786 if (targetm.has_ifunc_p ())
32788 struct cgraph_function_version_info *it_v = NULL;
32789 struct cgraph_node *dispatcher_node = NULL;
32790 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32792 /* Right now, the dispatching is done via ifunc. */
32793 dispatch_decl = make_dispatcher_decl (default_node->decl);
32795 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32796 gcc_assert (dispatcher_node != NULL);
32797 dispatcher_node->dispatcher_function = 1;
32798 dispatcher_version_info
32799 = dispatcher_node->insert_new_function_version ();
32800 dispatcher_version_info->next = default_version_info;
32801 dispatcher_node->definition = 1;
32803 /* Set the dispatcher for all the versions. */
32804 it_v = default_version_info;
32805 while (it_v != NULL)
32807 it_v->dispatcher_resolver = dispatch_decl;
32808 it_v = it_v->next;
32811 else
32812 #endif
32814 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32815 "multiversioning needs ifunc which is not supported "
32816 "on this target");
32819 return dispatch_decl;
32822 /* Make the resolver function decl to dispatch the versions of
32823 a multi-versioned function, DEFAULT_DECL. Create an
32824 empty basic block in the resolver and store the pointer in
32825 EMPTY_BB. Return the decl of the resolver function. */
32827 static tree
32828 make_resolver_func (const tree default_decl,
32829 const tree dispatch_decl,
32830 basic_block *empty_bb)
32832 char *resolver_name;
32833 tree decl, type, decl_name, t;
32834 bool is_uniq = false;
32836 /* IFUNCs have to be globally visible. So, if the default_decl is
32837 not, then the name of the IFUNC should be made unique. */
32838 if (TREE_PUBLIC (default_decl) == 0)
32839 is_uniq = true;
32841 /* Append the filename to the resolver function if the versions are
32842 not externally visible. This is because the resolver function has
32843 to be externally visible for the loader to find it. So, appending
32844 the filename will prevent conflicts with a resolver function from
32845 another module which is based on the same version name. */
32846 resolver_name = make_name (default_decl, "resolver", is_uniq);
32848 /* The resolver function should return a (void *). */
32849 type = build_function_type_list (ptr_type_node, NULL_TREE);
32851 decl = build_fn_decl (resolver_name, type);
32852 decl_name = get_identifier (resolver_name);
32853 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32855 DECL_NAME (decl) = decl_name;
32856 TREE_USED (decl) = 1;
32857 DECL_ARTIFICIAL (decl) = 1;
32858 DECL_IGNORED_P (decl) = 0;
32859 /* IFUNC resolvers have to be externally visible. */
32860 TREE_PUBLIC (decl) = 1;
32861 DECL_UNINLINABLE (decl) = 1;
32863 /* Resolver is not external, body is generated. */
32864 DECL_EXTERNAL (decl) = 0;
32865 DECL_EXTERNAL (dispatch_decl) = 0;
32867 DECL_CONTEXT (decl) = NULL_TREE;
32868 DECL_INITIAL (decl) = make_node (BLOCK);
32869 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32871 if (DECL_COMDAT_GROUP (default_decl)
32872 || TREE_PUBLIC (default_decl))
32874 /* In this case, each translation unit with a call to this
32875 versioned function will put out a resolver. Ensure it
32876 is comdat to keep just one copy. */
32877 DECL_COMDAT (decl) = 1;
32878 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32880 /* Build result decl and add to function_decl. */
32881 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32882 DECL_ARTIFICIAL (t) = 1;
32883 DECL_IGNORED_P (t) = 1;
32884 DECL_RESULT (decl) = t;
32886 gimplify_function_tree (decl);
32887 push_cfun (DECL_STRUCT_FUNCTION (decl));
32888 *empty_bb = init_lowered_empty_function (decl, false, 0);
32890 cgraph_node::add_new_function (decl, true);
32891 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32893 pop_cfun ();
32895 gcc_assert (dispatch_decl != NULL);
32896 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
32897 DECL_ATTRIBUTES (dispatch_decl)
32898 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
32900 /* Create the alias for dispatch to resolver here. */
32901 /*cgraph_create_function_alias (dispatch_decl, decl);*/
32902 cgraph_node::create_same_body_alias (dispatch_decl, decl);
32903 XDELETEVEC (resolver_name);
32904 return decl;
32907 /* Generate the dispatching code body to dispatch multi-versioned function
32908 DECL. The target hook is called to process the "target" attributes and
32909 provide the code to dispatch the right function at run-time. NODE points
32910 to the dispatcher decl whose body will be created. */
32912 static tree
32913 ix86_generate_version_dispatcher_body (void *node_p)
32915 tree resolver_decl;
32916 basic_block empty_bb;
32917 tree default_ver_decl;
32918 struct cgraph_node *versn;
32919 struct cgraph_node *node;
32921 struct cgraph_function_version_info *node_version_info = NULL;
32922 struct cgraph_function_version_info *versn_info = NULL;
32924 node = (cgraph_node *)node_p;
32926 node_version_info = node->function_version ();
32927 gcc_assert (node->dispatcher_function
32928 && node_version_info != NULL);
32930 if (node_version_info->dispatcher_resolver)
32931 return node_version_info->dispatcher_resolver;
32933 /* The first version in the chain corresponds to the default version. */
32934 default_ver_decl = node_version_info->next->this_node->decl;
32936 /* node is going to be an alias, so remove the finalized bit. */
32937 node->definition = false;
32939 resolver_decl = make_resolver_func (default_ver_decl,
32940 node->decl, &empty_bb);
32942 node_version_info->dispatcher_resolver = resolver_decl;
32944 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32946 auto_vec<tree, 2> fn_ver_vec;
32948 for (versn_info = node_version_info->next; versn_info;
32949 versn_info = versn_info->next)
32951 versn = versn_info->this_node;
32952 /* Check for virtual functions here again, as by this time it should
32953 have been determined if this function needs a vtable index or
32954 not. This happens for methods in derived classes that override
32955 virtual methods in base classes but are not explicitly marked as
32956 virtual. */
32957 if (DECL_VINDEX (versn->decl))
32958 sorry ("Virtual function multiversioning not supported");
32960 fn_ver_vec.safe_push (versn->decl);
32963 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32964 cgraph_edge::rebuild_edges ();
32965 pop_cfun ();
32966 return resolver_decl;
32968 /* This builds the processor_model struct type defined in
32969 libgcc/config/i386/cpuinfo.c */
32971 static tree
32972 build_processor_model_struct (void)
32974 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32975 "__cpu_features"};
32976 tree field = NULL_TREE, field_chain = NULL_TREE;
32977 int i;
32978 tree type = make_node (RECORD_TYPE);
32980 /* The first 3 fields are unsigned int. */
32981 for (i = 0; i < 3; ++i)
32983 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32984 get_identifier (field_name[i]), unsigned_type_node);
32985 if (field_chain != NULL_TREE)
32986 DECL_CHAIN (field) = field_chain;
32987 field_chain = field;
32990 /* The last field is an array of unsigned integers of size one. */
32991 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32992 get_identifier (field_name[3]),
32993 build_array_type (unsigned_type_node,
32994 build_index_type (size_one_node)));
32995 if (field_chain != NULL_TREE)
32996 DECL_CHAIN (field) = field_chain;
32997 field_chain = field;
32999 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
33000 return type;
33003 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
33005 static tree
33006 make_var_decl (tree type, const char *name)
33008 tree new_decl;
33010 new_decl = build_decl (UNKNOWN_LOCATION,
33011 VAR_DECL,
33012 get_identifier(name),
33013 type);
33015 DECL_EXTERNAL (new_decl) = 1;
33016 TREE_STATIC (new_decl) = 1;
33017 TREE_PUBLIC (new_decl) = 1;
33018 DECL_INITIAL (new_decl) = 0;
33019 DECL_ARTIFICIAL (new_decl) = 0;
33020 DECL_PRESERVE_P (new_decl) = 1;
33022 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
33023 assemble_variable (new_decl, 0, 0, 0);
33025 return new_decl;
33028 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
33029 into an integer defined in libgcc/config/i386/cpuinfo.c */
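/* For example, __builtin_cpu_is ("intel") folds to
   (int) (__cpu_model.__cpu_vendor == M_INTEL), and
   __builtin_cpu_supports ("avx2") folds to
   (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2)). */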
33031 static tree
33032 fold_builtin_cpu (tree fndecl, tree *args)
33034 unsigned int i;
33035 enum ix86_builtins fn_code = (enum ix86_builtins)
33036 DECL_FUNCTION_CODE (fndecl);
33037 tree param_string_cst = NULL;
33039 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
33040 enum processor_features
33042 F_CMOV = 0,
33043 F_MMX,
33044 F_POPCNT,
33045 F_SSE,
33046 F_SSE2,
33047 F_SSE3,
33048 F_SSSE3,
33049 F_SSE4_1,
33050 F_SSE4_2,
33051 F_AVX,
33052 F_AVX2,
33053 F_SSE4_A,
33054 F_FMA4,
33055 F_XOP,
33056 F_FMA,
33057 F_AVX512F,
33058 F_BMI,
33059 F_BMI2,
33060 F_AES,
33061 F_PCLMUL,
33062 F_AVX512VL,
33063 F_AVX512BW,
33064 F_AVX512DQ,
33065 F_AVX512CD,
33066 F_AVX512ER,
33067 F_AVX512PF,
33068 F_AVX512VBMI,
33069 F_AVX512IFMA,
33070 F_MAX
33073 /* These are the values for vendor types and cpu types and subtypes
33074 in cpuinfo.c. Cpu types and subtypes should be subtracted by
33075 the corresponding start value. */
33076 enum processor_model
33078 M_INTEL = 1,
33079 M_AMD,
33080 M_CPU_TYPE_START,
33081 M_INTEL_BONNELL,
33082 M_INTEL_CORE2,
33083 M_INTEL_COREI7,
33084 M_AMDFAM10H,
33085 M_AMDFAM15H,
33086 M_INTEL_SILVERMONT,
33087 M_INTEL_KNL,
33088 M_AMD_BTVER1,
33089 M_AMD_BTVER2,
33090 M_CPU_SUBTYPE_START,
33091 M_INTEL_COREI7_NEHALEM,
33092 M_INTEL_COREI7_WESTMERE,
33093 M_INTEL_COREI7_SANDYBRIDGE,
33094 M_AMDFAM10H_BARCELONA,
33095 M_AMDFAM10H_SHANGHAI,
33096 M_AMDFAM10H_ISTANBUL,
33097 M_AMDFAM15H_BDVER1,
33098 M_AMDFAM15H_BDVER2,
33099 M_AMDFAM15H_BDVER3,
33100 M_AMDFAM15H_BDVER4,
33101 M_AMDFAM17H_ZNVER1,
33102 M_INTEL_COREI7_IVYBRIDGE,
33103 M_INTEL_COREI7_HASWELL,
33104 M_INTEL_COREI7_BROADWELL,
33105 M_INTEL_COREI7_SKYLAKE,
33106 M_INTEL_COREI7_SKYLAKE_AVX512
33109 static struct _arch_names_table
33111 const char *const name;
33112 const enum processor_model model;
33114 const arch_names_table[] =
33116 {"amd", M_AMD},
33117 {"intel", M_INTEL},
33118 {"atom", M_INTEL_BONNELL},
33119 {"slm", M_INTEL_SILVERMONT},
33120 {"core2", M_INTEL_CORE2},
33121 {"corei7", M_INTEL_COREI7},
33122 {"nehalem", M_INTEL_COREI7_NEHALEM},
33123 {"westmere", M_INTEL_COREI7_WESTMERE},
33124 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
33125 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
33126 {"haswell", M_INTEL_COREI7_HASWELL},
33127 {"broadwell", M_INTEL_COREI7_BROADWELL},
33128 {"skylake", M_INTEL_COREI7_SKYLAKE},
33129 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
33130 {"bonnell", M_INTEL_BONNELL},
33131 {"silvermont", M_INTEL_SILVERMONT},
33132 {"knl", M_INTEL_KNL},
33133 {"amdfam10h", M_AMDFAM10H},
33134 {"barcelona", M_AMDFAM10H_BARCELONA},
33135 {"shanghai", M_AMDFAM10H_SHANGHAI},
33136 {"istanbul", M_AMDFAM10H_ISTANBUL},
33137 {"btver1", M_AMD_BTVER1},
33138 {"amdfam15h", M_AMDFAM15H},
33139 {"bdver1", M_AMDFAM15H_BDVER1},
33140 {"bdver2", M_AMDFAM15H_BDVER2},
33141 {"bdver3", M_AMDFAM15H_BDVER3},
33142 {"bdver4", M_AMDFAM15H_BDVER4},
33143 {"btver2", M_AMD_BTVER2},
33144 {"znver1", M_AMDFAM17H_ZNVER1},
33147 static struct _isa_names_table
33149 const char *const name;
33150 const enum processor_features feature;
33152 const isa_names_table[] =
33154 {"cmov", F_CMOV},
33155 {"mmx", F_MMX},
33156 {"popcnt", F_POPCNT},
33157 {"sse", F_SSE},
33158 {"sse2", F_SSE2},
33159 {"sse3", F_SSE3},
33160 {"ssse3", F_SSSE3},
33161 {"sse4a", F_SSE4_A},
33162 {"sse4.1", F_SSE4_1},
33163 {"sse4.2", F_SSE4_2},
33164 {"avx", F_AVX},
33165 {"fma4", F_FMA4},
33166 {"xop", F_XOP},
33167 {"fma", F_FMA},
33168 {"avx2", F_AVX2},
33169 {"avx512f", F_AVX512F},
33170 {"bmi", F_BMI},
33171 {"bmi2", F_BMI2},
33172 {"aes", F_AES},
33173 {"pclmul", F_PCLMUL},
33174 {"avx512vl",F_AVX512VL},
33175 {"avx512bw",F_AVX512BW},
33176 {"avx512dq",F_AVX512DQ},
33177 {"avx512cd",F_AVX512CD},
33178 {"avx512er",F_AVX512ER},
33179 {"avx512pf",F_AVX512PF},
33180 {"avx512vbmi",F_AVX512VBMI},
33181 {"avx512ifma",F_AVX512IFMA},
33184 tree __processor_model_type = build_processor_model_struct ();
33185 tree __cpu_model_var = make_var_decl (__processor_model_type,
33186 "__cpu_model");
33189 varpool_node::add (__cpu_model_var);
33191 gcc_assert ((args != NULL) && (*args != NULL));
33193 param_string_cst = *args;
33194 while (param_string_cst
33195 && TREE_CODE (param_string_cst) != STRING_CST)
33197 /* *args must be an expr that can contain other EXPRs leading to a
33198 STRING_CST. */
33199 if (!EXPR_P (param_string_cst))
33201 error ("Parameter to builtin must be a string constant or literal");
33202 return integer_zero_node;
33204 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
33207 gcc_assert (param_string_cst);
33209 if (fn_code == IX86_BUILTIN_CPU_IS)
33211 tree ref;
33212 tree field;
33213 tree final;
33215 unsigned int field_val = 0;
33216 unsigned int NUM_ARCH_NAMES
33217 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33219 for (i = 0; i < NUM_ARCH_NAMES; i++)
33220 if (strcmp (arch_names_table[i].name,
33221 TREE_STRING_POINTER (param_string_cst)) == 0)
33222 break;
33224 if (i == NUM_ARCH_NAMES)
33226 error ("Parameter to builtin not valid: %s",
33227 TREE_STRING_POINTER (param_string_cst));
33228 return integer_zero_node;
33231 field = TYPE_FIELDS (__processor_model_type);
33232 field_val = arch_names_table[i].model;
33234 /* CPU types are stored in the next field. */
33235 if (field_val > M_CPU_TYPE_START
33236 && field_val < M_CPU_SUBTYPE_START)
33238 field = DECL_CHAIN (field);
33239 field_val -= M_CPU_TYPE_START;
33242 /* CPU subtypes are stored in the next field. */
33243 if (field_val > M_CPU_SUBTYPE_START)
33245 field = DECL_CHAIN (DECL_CHAIN (field));
33246 field_val -= M_CPU_SUBTYPE_START;
33249 /* Get the appropriate field in __cpu_model. */
33250 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33251 field, NULL_TREE);
33253 /* Check the value. */
33254 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33255 build_int_cstu (unsigned_type_node, field_val));
33256 return build1 (CONVERT_EXPR, integer_type_node, final);
33258 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33260 tree ref;
33261 tree array_elt;
33262 tree field;
33263 tree final;
33265 unsigned int field_val = 0;
33266 unsigned int NUM_ISA_NAMES
33267 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33269 for (i = 0; i < NUM_ISA_NAMES; i++)
33270 if (strcmp (isa_names_table[i].name,
33271 TREE_STRING_POINTER (param_string_cst)) == 0)
33272 break;
33274 if (i == NUM_ISA_NAMES)
33276 error ("Parameter to builtin not valid: %s",
33277 TREE_STRING_POINTER (param_string_cst));
33278 return integer_zero_node;
33281 field = TYPE_FIELDS (__processor_model_type);
33282 /* Get the last field, which is __cpu_features. */
33283 while (DECL_CHAIN (field))
33284 field = DECL_CHAIN (field);
33286 /* Get the appropriate field: __cpu_model.__cpu_features */
33287 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33288 field, NULL_TREE);
33290 /* Access the 0th element of __cpu_features array. */
33291 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33292 integer_zero_node, NULL_TREE, NULL_TREE);
33294 field_val = (1 << isa_names_table[i].feature);
33295 /* Return __cpu_model.__cpu_features[0] & field_val */
33296 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33297 build_int_cstu (unsigned_type_node, field_val));
33298 return build1 (CONVERT_EXPR, integer_type_node, final);
33300 gcc_unreachable ();
33303 static tree
33304 ix86_fold_builtin (tree fndecl, int n_args,
33305 tree *args, bool ignore ATTRIBUTE_UNUSED)
33307 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33309 enum ix86_builtins fn_code = (enum ix86_builtins)
33310 DECL_FUNCTION_CODE (fndecl);
33311 switch (fn_code)
33313 case IX86_BUILTIN_CPU_IS:
33314 case IX86_BUILTIN_CPU_SUPPORTS:
33315 gcc_assert (n_args == 1);
33316 return fold_builtin_cpu (fndecl, args);
33318 case IX86_BUILTIN_NANQ:
33319 case IX86_BUILTIN_NANSQ:
33321 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33322 const char *str = c_getstr (*args);
33323 int quiet = fn_code == IX86_BUILTIN_NANQ;
33324 REAL_VALUE_TYPE real;
33326 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33327 return build_real (type, real);
33328 return NULL_TREE;
33331 default:
33332 break;
33336 #ifdef SUBTARGET_FOLD_BUILTIN
33337 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33338 #endif
33340 return NULL_TREE;
33343 /* Make builtins to detect cpu type and features supported. NAME is
33344 the builtin name, CODE is the builtin code, and FTYPE is the function
33345 type of the builtin. */
33347 static void
33348 make_cpu_type_builtin (const char* name, int code,
33349 enum ix86_builtin_func_type ftype, bool is_const)
33351 tree decl;
33352 tree type;
33354 type = ix86_get_builtin_func_type (ftype);
33355 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33356 NULL, NULL_TREE);
33357 gcc_assert (decl != NULL_TREE);
33358 ix86_builtins[(int) code] = decl;
33359 TREE_READONLY (decl) = is_const;
33362 /* Make builtins to get CPU type and features supported. The created
33363 builtins are :
33365 __builtin_cpu_init (), to detect cpu type and features,
33366 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33367 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
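/* Typical use, e.g. in a user dispatch routine:
     if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx2"))
       ... use an AVX2 code path ...  */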
33370 static void
33371 ix86_init_platform_type_builtins (void)
33373 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33374 INT_FTYPE_VOID, false);
33375 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33376 INT_FTYPE_PCCHAR, true);
33377 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33378 INT_FTYPE_PCCHAR, true);
33381 /* Internal method for ix86_init_builtins. */
33383 static void
33384 ix86_init_builtins_va_builtins_abi (void)
33386 tree ms_va_ref, sysv_va_ref;
33387 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33388 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33389 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33390 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33392 if (!TARGET_64BIT)
33393 return;
33394 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33395 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33396 ms_va_ref = build_reference_type (ms_va_list_type_node);
33397 sysv_va_ref =
33398 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33400 fnvoid_va_end_ms =
33401 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33402 fnvoid_va_start_ms =
33403 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33404 fnvoid_va_end_sysv =
33405 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33406 fnvoid_va_start_sysv =
33407 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33408 NULL_TREE);
33409 fnvoid_va_copy_ms =
33410 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33411 NULL_TREE);
33412 fnvoid_va_copy_sysv =
33413 build_function_type_list (void_type_node, sysv_va_ref,
33414 sysv_va_ref, NULL_TREE);
33416 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33417 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33418 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33419 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33420 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33421 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33422 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33423 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33424 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33425 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33426 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33427 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33430 static void
33431 ix86_init_builtin_types (void)
33433 tree float80_type_node, const_string_type_node;
33435 /* The __float80 type. */
33436 float80_type_node = long_double_type_node;
33437 if (TYPE_MODE (float80_type_node) != XFmode)
33439 if (float64x_type_node != NULL_TREE
33440 && TYPE_MODE (float64x_type_node) == XFmode)
33441 float80_type_node = float64x_type_node;
33442 else
33444 /* The __float80 type. */
33445 float80_type_node = make_node (REAL_TYPE);
33447 TYPE_PRECISION (float80_type_node) = 80;
33448 layout_type (float80_type_node);
33451 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33453 /* The __float128 type. The node has already been created as
33454 _Float128, so we only need to register the __float128 name for
33455 it. */
33456 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33458 const_string_type_node
33459 = build_pointer_type (build_qualified_type
33460 (char_type_node, TYPE_QUAL_CONST));
33462 /* This macro is built by i386-builtin-types.awk. */
33463 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33466 static void
33467 ix86_init_builtins (void)
33469 tree ftype, decl;
33471 ix86_init_builtin_types ();
33473 /* Builtins to get CPU type and features. */
33474 ix86_init_platform_type_builtins ();
33476 /* TFmode support builtins. */
33477 def_builtin_const (0, "__builtin_infq",
33478 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33479 def_builtin_const (0, "__builtin_huge_valq",
33480 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33482 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33483 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33484 BUILT_IN_MD, "nanq", NULL_TREE);
33485 TREE_READONLY (decl) = 1;
33486 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33488 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33489 BUILT_IN_MD, "nansq", NULL_TREE);
33490 TREE_READONLY (decl) = 1;
33491 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33493 /* We will expand them to a normal call if SSE isn't available since
33494 they are used by libgcc. */
33495 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33496 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33497 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33498 TREE_READONLY (decl) = 1;
33499 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33501 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33502 decl = add_builtin_function ("__builtin_copysignq", ftype,
33503 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33504 "__copysigntf3", NULL_TREE);
33505 TREE_READONLY (decl) = 1;
33506 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33508 ix86_init_tm_builtins ();
33509 ix86_init_mmx_sse_builtins ();
33510 ix86_init_mpx_builtins ();
33512 if (TARGET_LP64)
33513 ix86_init_builtins_va_builtins_abi ();
33515 #ifdef SUBTARGET_INIT_BUILTINS
33516 SUBTARGET_INIT_BUILTINS;
33517 #endif
33520 /* Return the ix86 builtin for CODE. */
33522 static tree
33523 ix86_builtin_decl (unsigned code, bool)
33525 if (code >= IX86_BUILTIN_MAX)
33526 return error_mark_node;
33528 return ix86_builtins[code];
33531 /* Errors in the source file can cause expand_expr to return const0_rtx
33532 where we expect a vector. To avoid crashing, use one of the vector
33533 clear instructions. */
33534 static rtx
33535 safe_vector_operand (rtx x, machine_mode mode)
33537 if (x == const0_rtx)
33538 x = CONST0_RTX (mode);
33539 return x;
33542 /* Fixup modeless constants to fit required mode. */
33543 static rtx
33544 fixup_modeless_constant (rtx x, machine_mode mode)
33546 if (GET_MODE (x) == VOIDmode)
33547 x = convert_to_mode (mode, x, 1);
33548 return x;
33551 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33553 static rtx
33554 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33556 rtx pat;
33557 tree arg0 = CALL_EXPR_ARG (exp, 0);
33558 tree arg1 = CALL_EXPR_ARG (exp, 1);
33559 rtx op0 = expand_normal (arg0);
33560 rtx op1 = expand_normal (arg1);
33561 machine_mode tmode = insn_data[icode].operand[0].mode;
33562 machine_mode mode0 = insn_data[icode].operand[1].mode;
33563 machine_mode mode1 = insn_data[icode].operand[2].mode;
33565 if (VECTOR_MODE_P (mode0))
33566 op0 = safe_vector_operand (op0, mode0);
33567 if (VECTOR_MODE_P (mode1))
33568 op1 = safe_vector_operand (op1, mode1);
33570 if (optimize || !target
33571 || GET_MODE (target) != tmode
33572 || !insn_data[icode].operand[0].predicate (target, tmode))
33573 target = gen_reg_rtx (tmode);
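/* If the insn expects a TImode operand but the argument was expanded
   in SImode (e.g. a shift count), load it into the low element of a
   V4SImode register and reinterpret that register as TImode. */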
33575 if (GET_MODE (op1) == SImode && mode1 == TImode)
33577 rtx x = gen_reg_rtx (V4SImode);
33578 emit_insn (gen_sse2_loadd (x, op1));
33579 op1 = gen_lowpart (TImode, x);
33582 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33583 op0 = copy_to_mode_reg (mode0, op0);
33584 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33585 op1 = copy_to_mode_reg (mode1, op1);
33587 pat = GEN_FCN (icode) (target, op0, op1);
33588 if (! pat)
33589 return 0;
33591 emit_insn (pat);
33593 return target;
33596 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33598 static rtx
33599 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33600 enum ix86_builtin_func_type m_type,
33601 enum rtx_code sub_code)
33603 rtx pat;
33604 int i;
33605 int nargs;
33606 bool comparison_p = false;
33607 bool tf_p = false;
33608 bool last_arg_constant = false;
33609 int num_memory = 0;
33610 struct {
33611 rtx op;
33612 machine_mode mode;
33613 } args[4];
33615 machine_mode tmode = insn_data[icode].operand[0].mode;
33617 switch (m_type)
33619 case MULTI_ARG_4_DF2_DI_I:
33620 case MULTI_ARG_4_DF2_DI_I1:
33621 case MULTI_ARG_4_SF2_SI_I:
33622 case MULTI_ARG_4_SF2_SI_I1:
33623 nargs = 4;
33624 last_arg_constant = true;
33625 break;
33627 case MULTI_ARG_3_SF:
33628 case MULTI_ARG_3_DF:
33629 case MULTI_ARG_3_SF2:
33630 case MULTI_ARG_3_DF2:
33631 case MULTI_ARG_3_DI:
33632 case MULTI_ARG_3_SI:
33633 case MULTI_ARG_3_SI_DI:
33634 case MULTI_ARG_3_HI:
33635 case MULTI_ARG_3_HI_SI:
33636 case MULTI_ARG_3_QI:
33637 case MULTI_ARG_3_DI2:
33638 case MULTI_ARG_3_SI2:
33639 case MULTI_ARG_3_HI2:
33640 case MULTI_ARG_3_QI2:
33641 nargs = 3;
33642 break;
33644 case MULTI_ARG_2_SF:
33645 case MULTI_ARG_2_DF:
33646 case MULTI_ARG_2_DI:
33647 case MULTI_ARG_2_SI:
33648 case MULTI_ARG_2_HI:
33649 case MULTI_ARG_2_QI:
33650 nargs = 2;
33651 break;
33653 case MULTI_ARG_2_DI_IMM:
33654 case MULTI_ARG_2_SI_IMM:
33655 case MULTI_ARG_2_HI_IMM:
33656 case MULTI_ARG_2_QI_IMM:
33657 nargs = 2;
33658 last_arg_constant = true;
33659 break;
33661 case MULTI_ARG_1_SF:
33662 case MULTI_ARG_1_DF:
33663 case MULTI_ARG_1_SF2:
33664 case MULTI_ARG_1_DF2:
33665 case MULTI_ARG_1_DI:
33666 case MULTI_ARG_1_SI:
33667 case MULTI_ARG_1_HI:
33668 case MULTI_ARG_1_QI:
33669 case MULTI_ARG_1_SI_DI:
33670 case MULTI_ARG_1_HI_DI:
33671 case MULTI_ARG_1_HI_SI:
33672 case MULTI_ARG_1_QI_DI:
33673 case MULTI_ARG_1_QI_SI:
33674 case MULTI_ARG_1_QI_HI:
33675 nargs = 1;
33676 break;
33678 case MULTI_ARG_2_DI_CMP:
33679 case MULTI_ARG_2_SI_CMP:
33680 case MULTI_ARG_2_HI_CMP:
33681 case MULTI_ARG_2_QI_CMP:
33682 nargs = 2;
33683 comparison_p = true;
33684 break;
33686 case MULTI_ARG_2_SF_TF:
33687 case MULTI_ARG_2_DF_TF:
33688 case MULTI_ARG_2_DI_TF:
33689 case MULTI_ARG_2_SI_TF:
33690 case MULTI_ARG_2_HI_TF:
33691 case MULTI_ARG_2_QI_TF:
33692 nargs = 2;
33693 tf_p = true;
33694 break;
33696 default:
33697 gcc_unreachable ();
33700 if (optimize || !target
33701 || GET_MODE (target) != tmode
33702 || !insn_data[icode].operand[0].predicate (target, tmode))
33703 target = gen_reg_rtx (tmode);
33705 gcc_assert (nargs <= 4);
33707 for (i = 0; i < nargs; i++)
33709 tree arg = CALL_EXPR_ARG (exp, i);
33710 rtx op = expand_normal (arg);
33711 int adjust = (comparison_p) ? 1 : 0;
33712 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33714 if (last_arg_constant && i == nargs - 1)
33716 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33718 enum insn_code new_icode = icode;
33719 switch (icode)
33721 case CODE_FOR_xop_vpermil2v2df3:
33722 case CODE_FOR_xop_vpermil2v4sf3:
33723 case CODE_FOR_xop_vpermil2v4df3:
33724 case CODE_FOR_xop_vpermil2v8sf3:
33725 error ("the last argument must be a 2-bit immediate");
33726 return gen_reg_rtx (tmode);
33727 case CODE_FOR_xop_rotlv2di3:
33728 new_icode = CODE_FOR_rotlv2di3;
33729 goto xop_rotl;
33730 case CODE_FOR_xop_rotlv4si3:
33731 new_icode = CODE_FOR_rotlv4si3;
33732 goto xop_rotl;
33733 case CODE_FOR_xop_rotlv8hi3:
33734 new_icode = CODE_FOR_rotlv8hi3;
33735 goto xop_rotl;
33736 case CODE_FOR_xop_rotlv16qi3:
33737 new_icode = CODE_FOR_rotlv16qi3;
33738 xop_rotl:
33739 if (CONST_INT_P (op))
33741 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33742 op = GEN_INT (INTVAL (op) & mask);
33743 gcc_checking_assert
33744 (insn_data[icode].operand[i + 1].predicate (op, mode));
33746 else
33748 gcc_checking_assert
33749 (nargs == 2
33750 && insn_data[new_icode].operand[0].mode == tmode
33751 && insn_data[new_icode].operand[1].mode == tmode
33752 && insn_data[new_icode].operand[2].mode == mode
33753 && insn_data[new_icode].operand[0].predicate
33754 == insn_data[icode].operand[0].predicate
33755 && insn_data[new_icode].operand[1].predicate
33756 == insn_data[icode].operand[1].predicate);
33757 icode = new_icode;
33758 goto non_constant;
33760 break;
33761 default:
33762 gcc_unreachable ();
33766 else
33768 non_constant:
33769 if (VECTOR_MODE_P (mode))
33770 op = safe_vector_operand (op, mode);
33772 /* If we aren't optimizing, only allow one memory operand to be
33773 generated. */
33774 if (memory_operand (op, mode))
33775 num_memory++;
33777 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
33779 if (optimize
33780 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
33781 || num_memory > 1)
33782 op = force_reg (mode, op);
33785 args[i].op = op;
33786 args[i].mode = mode;
33789 switch (nargs)
33791 case 1:
33792 pat = GEN_FCN (icode) (target, args[0].op);
33793 break;
33795 case 2:
33796 if (tf_p)
33797 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
33798 GEN_INT ((int)sub_code));
33799 else if (! comparison_p)
33800 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
33801 else
33803 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
33804 args[0].op,
33805 args[1].op);
33807 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
33809 break;
33811 case 3:
33812 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
33813 break;
33815 case 4:
33816 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
33817 break;
33819 default:
33820 gcc_unreachable ();
33823 if (! pat)
33824 return 0;
33826 emit_insn (pat);
33827 return target;
33830 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
33831 insns with vec_merge. */
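/* A minimal illustration (intrinsic spelling assumed from <xmmintrin.h>,
   not restated from the descriptor tables): a scalar builtin routed
   through this helper, e.g. the one behind _mm_sqrt_ss, operates on
   element 0 only and merges the untouched upper elements back in, which
   is why op0 is reused as the vec_merge source operand below:

     __m128 x = _mm_set_ps (4.0f, 9.0f, 16.0f, 25.0f);
     __m128 y = _mm_sqrt_ss (x);

   Element 0 of Y becomes 5.0f; elements 1-3 are copied from X.  The exact
   set of builtins dispatched here is given by the *_FTYPE_*_VEC_MERGE
   entries in the descriptor tables.  */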
33833 static rtx
33834 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
33835 rtx target)
33837 rtx pat;
33838 tree arg0 = CALL_EXPR_ARG (exp, 0);
33839 rtx op1, op0 = expand_normal (arg0);
33840 machine_mode tmode = insn_data[icode].operand[0].mode;
33841 machine_mode mode0 = insn_data[icode].operand[1].mode;
33843 if (optimize || !target
33844 || GET_MODE (target) != tmode
33845 || !insn_data[icode].operand[0].predicate (target, tmode))
33846 target = gen_reg_rtx (tmode);
33848 if (VECTOR_MODE_P (mode0))
33849 op0 = safe_vector_operand (op0, mode0);
33851 if ((optimize && !register_operand (op0, mode0))
33852 || !insn_data[icode].operand[1].predicate (op0, mode0))
33853 op0 = copy_to_mode_reg (mode0, op0);
33855 op1 = op0;
33856 if (!insn_data[icode].operand[2].predicate (op1, mode0))
33857 op1 = copy_to_mode_reg (mode0, op1);
33859 pat = GEN_FCN (icode) (target, op0, op1);
33860 if (! pat)
33861 return 0;
33862 emit_insn (pat);
33863 return target;
33866 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
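/* A minimal illustration (intrinsic spelling assumed from <xmmintrin.h>):
   a packed comparison builtin, e.g. the one behind

     __m128 m = _mm_cmplt_ps (a, b);

   is expanded here by wrapping the rtx_code stored in d->comparison around
   the two operands.  SWAP handles predicates the hardware only provides in
   one direction; a "greater than" builtin, for example, can be emitted as
   the mirrored "less than" pattern with the operands exchanged.  */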
33868 static rtx
33869 ix86_expand_sse_compare (const struct builtin_description *d,
33870 tree exp, rtx target, bool swap)
33872 rtx pat;
33873 tree arg0 = CALL_EXPR_ARG (exp, 0);
33874 tree arg1 = CALL_EXPR_ARG (exp, 1);
33875 rtx op0 = expand_normal (arg0);
33876 rtx op1 = expand_normal (arg1);
33877 rtx op2;
33878 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33879 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33880 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33881 enum rtx_code comparison = d->comparison;
33883 if (VECTOR_MODE_P (mode0))
33884 op0 = safe_vector_operand (op0, mode0);
33885 if (VECTOR_MODE_P (mode1))
33886 op1 = safe_vector_operand (op1, mode1);
33888 /* Swap operands if we have a comparison that isn't available in
33889 hardware. */
33890 if (swap)
33891 std::swap (op0, op1);
33893 if (optimize || !target
33894 || GET_MODE (target) != tmode
33895 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33896 target = gen_reg_rtx (tmode);
33898 if ((optimize && !register_operand (op0, mode0))
33899 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33900 op0 = copy_to_mode_reg (mode0, op0);
33901 if ((optimize && !register_operand (op1, mode1))
33902 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33903 op1 = copy_to_mode_reg (mode1, op1);
33905 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33906 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33907 if (! pat)
33908 return 0;
33909 emit_insn (pat);
33910 return target;
33913 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
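/* A minimal illustration (intrinsic spelling assumed from <xmmintrin.h>):
   a scalar flag-setting comparison such as

     int eq = _mm_comieq_ss (a, b);

   ends up here.  The comi pattern only sets FLAGS_REG, so the 0/1 result
   is materialised below by zeroing an SImode register and then setting its
   low QImode part from the requested condition on the flags.  */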
33915 static rtx
33916 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33917 rtx target)
33919 rtx pat;
33920 tree arg0 = CALL_EXPR_ARG (exp, 0);
33921 tree arg1 = CALL_EXPR_ARG (exp, 1);
33922 rtx op0 = expand_normal (arg0);
33923 rtx op1 = expand_normal (arg1);
33924 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33925 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33926 enum rtx_code comparison = d->comparison;
33928 if (VECTOR_MODE_P (mode0))
33929 op0 = safe_vector_operand (op0, mode0);
33930 if (VECTOR_MODE_P (mode1))
33931 op1 = safe_vector_operand (op1, mode1);
33933 /* Swap operands if we have a comparison that isn't available in
33934 hardware. */
33935 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33936 std::swap (op0, op1);
33938 target = gen_reg_rtx (SImode);
33939 emit_move_insn (target, const0_rtx);
33940 target = gen_rtx_SUBREG (QImode, target, 0);
33942 if ((optimize && !register_operand (op0, mode0))
33943 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33944 op0 = copy_to_mode_reg (mode0, op0);
33945 if ((optimize && !register_operand (op1, mode1))
33946 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33947 op1 = copy_to_mode_reg (mode1, op1);
33949 pat = GEN_FCN (d->icode) (op0, op1);
33950 if (! pat)
33951 return 0;
33952 emit_insn (pat);
33953 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33954 gen_rtx_fmt_ee (comparison, QImode,
33955 SET_DEST (pat),
33956 const0_rtx)));
33958 return SUBREG_REG (target);
33961 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
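/* A brief note with an example (the builtin name here is an assumption,
   not restated from the descriptor tables): for the *_ROUND function types
   handled below, the rounding mode is not a user-visible argument.  A call
   like

     __v2df r = __builtin_ia32_floorpd (x);

   supplies only the vector; the rounding constant (e.g. ROUND_FLOOR) lives
   in the descriptor's COMPARISON field and is re-emitted here as an
   immediate operand via GEN_INT (d->comparison).  */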
33963 static rtx
33964 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33965 rtx target)
33967 rtx pat;
33968 tree arg0 = CALL_EXPR_ARG (exp, 0);
33969 rtx op1, op0 = expand_normal (arg0);
33970 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33971 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33973 if (optimize || target == 0
33974 || GET_MODE (target) != tmode
33975 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33976 target = gen_reg_rtx (tmode);
33978 if (VECTOR_MODE_P (mode0))
33979 op0 = safe_vector_operand (op0, mode0);
33981 if ((optimize && !register_operand (op0, mode0))
33982 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33983 op0 = copy_to_mode_reg (mode0, op0);
33985 op1 = GEN_INT (d->comparison);
33987 pat = GEN_FCN (d->icode) (target, op0, op1);
33988 if (! pat)
33989 return 0;
33990 emit_insn (pat);
33991 return target;
33994 static rtx
33995 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33996 tree exp, rtx target)
33998 rtx pat;
33999 tree arg0 = CALL_EXPR_ARG (exp, 0);
34000 tree arg1 = CALL_EXPR_ARG (exp, 1);
34001 rtx op0 = expand_normal (arg0);
34002 rtx op1 = expand_normal (arg1);
34003 rtx op2;
34004 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34005 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34006 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34008 if (optimize || target == 0
34009 || GET_MODE (target) != tmode
34010 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34011 target = gen_reg_rtx (tmode);
34013 op0 = safe_vector_operand (op0, mode0);
34014 op1 = safe_vector_operand (op1, mode1);
34016 if ((optimize && !register_operand (op0, mode0))
34017 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34018 op0 = copy_to_mode_reg (mode0, op0);
34019 if ((optimize && !register_operand (op1, mode1))
34020 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34021 op1 = copy_to_mode_reg (mode1, op1);
34023 op2 = GEN_INT (d->comparison);
34025 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34026 if (! pat)
34027 return 0;
34028 emit_insn (pat);
34029 return target;
34032 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
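/* A minimal illustration (intrinsic spelling assumed from <smmintrin.h>):
   a PTEST-style builtin such as

     int z = _mm_testz_si128 (a, b);

   has no vector destination at all; the pattern only sets FLAGS_REG, and
   the integer result is derived below from the condition in d->comparison,
   in the same way as for the comi expanders above.  */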
34034 static rtx
34035 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34036 rtx target)
34038 rtx pat;
34039 tree arg0 = CALL_EXPR_ARG (exp, 0);
34040 tree arg1 = CALL_EXPR_ARG (exp, 1);
34041 rtx op0 = expand_normal (arg0);
34042 rtx op1 = expand_normal (arg1);
34043 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34044 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34045 enum rtx_code comparison = d->comparison;
34047 if (VECTOR_MODE_P (mode0))
34048 op0 = safe_vector_operand (op0, mode0);
34049 if (VECTOR_MODE_P (mode1))
34050 op1 = safe_vector_operand (op1, mode1);
34052 target = gen_reg_rtx (SImode);
34053 emit_move_insn (target, const0_rtx);
34054 target = gen_rtx_SUBREG (QImode, target, 0);
34056 if ((optimize && !register_operand (op0, mode0))
34057 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34058 op0 = copy_to_mode_reg (mode0, op0);
34059 if ((optimize && !register_operand (op1, mode1))
34060 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34061 op1 = copy_to_mode_reg (mode1, op1);
34063 pat = GEN_FCN (d->icode) (op0, op1);
34064 if (! pat)
34065 return 0;
34066 emit_insn (pat);
34067 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34068 gen_rtx_fmt_ee (comparison, QImode,
34069 SET_DEST (pat),
34070 const0_rtx)));
34072 return SUBREG_REG (target);
34075 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
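/* A minimal illustration (intrinsic spelling and _SIDD_* flag assumed from
   the SSE4.2 headers): an explicit-length string comparison such as

     int idx = _mm_cmpestri (a, la, b, lb, _SIDD_CMP_EQUAL_EACH);

   reaches this expander with five arguments, the last of which must be a
   compile-time 8-bit immediate.  Depending on the builtin, the useful
   result is the index (IX86_BUILTIN_PCMPESTRI128), the mask register
   (IX86_BUILTIN_PCMPESTRM128), or a single flag bit, which the d->flag
   case below extracts from FLAGS_REG.  */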
34077 static rtx
34078 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34079 tree exp, rtx target)
34081 rtx pat;
34082 tree arg0 = CALL_EXPR_ARG (exp, 0);
34083 tree arg1 = CALL_EXPR_ARG (exp, 1);
34084 tree arg2 = CALL_EXPR_ARG (exp, 2);
34085 tree arg3 = CALL_EXPR_ARG (exp, 3);
34086 tree arg4 = CALL_EXPR_ARG (exp, 4);
34087 rtx scratch0, scratch1;
34088 rtx op0 = expand_normal (arg0);
34089 rtx op1 = expand_normal (arg1);
34090 rtx op2 = expand_normal (arg2);
34091 rtx op3 = expand_normal (arg3);
34092 rtx op4 = expand_normal (arg4);
34093 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34095 tmode0 = insn_data[d->icode].operand[0].mode;
34096 tmode1 = insn_data[d->icode].operand[1].mode;
34097 modev2 = insn_data[d->icode].operand[2].mode;
34098 modei3 = insn_data[d->icode].operand[3].mode;
34099 modev4 = insn_data[d->icode].operand[4].mode;
34100 modei5 = insn_data[d->icode].operand[5].mode;
34101 modeimm = insn_data[d->icode].operand[6].mode;
34103 if (VECTOR_MODE_P (modev2))
34104 op0 = safe_vector_operand (op0, modev2);
34105 if (VECTOR_MODE_P (modev4))
34106 op2 = safe_vector_operand (op2, modev4);
34108 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34109 op0 = copy_to_mode_reg (modev2, op0);
34110 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34111 op1 = copy_to_mode_reg (modei3, op1);
34112 if ((optimize && !register_operand (op2, modev4))
34113 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34114 op2 = copy_to_mode_reg (modev4, op2);
34115 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34116 op3 = copy_to_mode_reg (modei5, op3);
34118 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34120 error ("the fifth argument must be an 8-bit immediate");
34121 return const0_rtx;
34124 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34126 if (optimize || !target
34127 || GET_MODE (target) != tmode0
34128 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34129 target = gen_reg_rtx (tmode0);
34131 scratch1 = gen_reg_rtx (tmode1);
34133 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34135 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34137 if (optimize || !target
34138 || GET_MODE (target) != tmode1
34139 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34140 target = gen_reg_rtx (tmode1);
34142 scratch0 = gen_reg_rtx (tmode0);
34144 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34146 else
34148 gcc_assert (d->flag);
34150 scratch0 = gen_reg_rtx (tmode0);
34151 scratch1 = gen_reg_rtx (tmode1);
34153 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34156 if (! pat)
34157 return 0;
34159 emit_insn (pat);
34161 if (d->flag)
34163 target = gen_reg_rtx (SImode);
34164 emit_move_insn (target, const0_rtx);
34165 target = gen_rtx_SUBREG (QImode, target, 0);
34167 emit_insn
34168 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34169 gen_rtx_fmt_ee (EQ, QImode,
34170 gen_rtx_REG ((machine_mode) d->flag,
34171 FLAGS_REG),
34172 const0_rtx)));
34173 return SUBREG_REG (target);
34175 else
34176 return target;
34180 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
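/* A minimal illustration (intrinsic spelling assumed from the SSE4.2
   headers): the implicit-length variant, e.g.

     int idx = _mm_cmpistri (a, b, _SIDD_CMP_EQUAL_ORDERED);

   takes only three arguments because the string lengths are implied by a
   terminating zero element; otherwise the expansion below mirrors the
   pcmpestr case above.  */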
34182 static rtx
34183 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34184 tree exp, rtx target)
34186 rtx pat;
34187 tree arg0 = CALL_EXPR_ARG (exp, 0);
34188 tree arg1 = CALL_EXPR_ARG (exp, 1);
34189 tree arg2 = CALL_EXPR_ARG (exp, 2);
34190 rtx scratch0, scratch1;
34191 rtx op0 = expand_normal (arg0);
34192 rtx op1 = expand_normal (arg1);
34193 rtx op2 = expand_normal (arg2);
34194 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34196 tmode0 = insn_data[d->icode].operand[0].mode;
34197 tmode1 = insn_data[d->icode].operand[1].mode;
34198 modev2 = insn_data[d->icode].operand[2].mode;
34199 modev3 = insn_data[d->icode].operand[3].mode;
34200 modeimm = insn_data[d->icode].operand[4].mode;
34202 if (VECTOR_MODE_P (modev2))
34203 op0 = safe_vector_operand (op0, modev2);
34204 if (VECTOR_MODE_P (modev3))
34205 op1 = safe_vector_operand (op1, modev3);
34207 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34208 op0 = copy_to_mode_reg (modev2, op0);
34209 if ((optimize && !register_operand (op1, modev3))
34210 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34211 op1 = copy_to_mode_reg (modev3, op1);
34213 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34215 error ("the third argument must be an 8-bit immediate");
34216 return const0_rtx;
34219 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34221 if (optimize || !target
34222 || GET_MODE (target) != tmode0
34223 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34224 target = gen_reg_rtx (tmode0);
34226 scratch1 = gen_reg_rtx (tmode1);
34228 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34230 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34232 if (optimize || !target
34233 || GET_MODE (target) != tmode1
34234 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34235 target = gen_reg_rtx (tmode1);
34237 scratch0 = gen_reg_rtx (tmode0);
34239 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34241 else
34243 gcc_assert (d->flag);
34245 scratch0 = gen_reg_rtx (tmode0);
34246 scratch1 = gen_reg_rtx (tmode1);
34248 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34251 if (! pat)
34252 return 0;
34254 emit_insn (pat);
34256 if (d->flag)
34258 target = gen_reg_rtx (SImode);
34259 emit_move_insn (target, const0_rtx);
34260 target = gen_rtx_SUBREG (QImode, target, 0);
34262 emit_insn
34263 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34264 gen_rtx_fmt_ee (EQ, QImode,
34265 gen_rtx_REG ((machine_mode) d->flag,
34266 FLAGS_REG),
34267 const0_rtx)));
34268 return SUBREG_REG (target);
34270 else
34271 return target;
34274 /* Subroutine of ix86_expand_builtin to take care of insns with
34275 variable number of operands. */
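/* A brief orientation with examples (the function types named here appear
   in the switch below; the intrinsic spelling is an assumption): the
   builtin's function-type code decides how many operands are expanded and
   which of them are constrained.  A type such as V16SF_FTYPE_V16SF_V16SF_UHI
   expands three register operands, V4SF_FTYPE_V4SF_V4SF_INT expands three
   operands of which the last must be an immediate (nargs_constant == 1),
   and the *_COUNT types treat the last operand as a shift count that may
   live in a register, e.g.

     __m128i r = _mm_slli_epi32 (v, n);

   where a non-constant N is moved into a register rather than rejected.  */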
34277 static rtx
34278 ix86_expand_args_builtin (const struct builtin_description *d,
34279 tree exp, rtx target)
34281 rtx pat, real_target;
34282 unsigned int i, nargs;
34283 unsigned int nargs_constant = 0;
34284 unsigned int mask_pos = 0;
34285 int num_memory = 0;
34286 struct
34288 rtx op;
34289 machine_mode mode;
34290 } args[6];
34291 bool last_arg_count = false;
34292 enum insn_code icode = d->icode;
34293 const struct insn_data_d *insn_p = &insn_data[icode];
34294 machine_mode tmode = insn_p->operand[0].mode;
34295 machine_mode rmode = VOIDmode;
34296 bool swap = false;
34297 enum rtx_code comparison = d->comparison;
34299 switch ((enum ix86_builtin_func_type) d->flag)
34301 case V2DF_FTYPE_V2DF_ROUND:
34302 case V4DF_FTYPE_V4DF_ROUND:
34303 case V8DF_FTYPE_V8DF_ROUND:
34304 case V4SF_FTYPE_V4SF_ROUND:
34305 case V8SF_FTYPE_V8SF_ROUND:
34306 case V16SF_FTYPE_V16SF_ROUND:
34307 case V4SI_FTYPE_V4SF_ROUND:
34308 case V8SI_FTYPE_V8SF_ROUND:
34309 case V16SI_FTYPE_V16SF_ROUND:
34310 return ix86_expand_sse_round (d, exp, target);
34311 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34312 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34313 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34314 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34315 case INT_FTYPE_V8SF_V8SF_PTEST:
34316 case INT_FTYPE_V4DI_V4DI_PTEST:
34317 case INT_FTYPE_V4DF_V4DF_PTEST:
34318 case INT_FTYPE_V4SF_V4SF_PTEST:
34319 case INT_FTYPE_V2DI_V2DI_PTEST:
34320 case INT_FTYPE_V2DF_V2DF_PTEST:
34321 return ix86_expand_sse_ptest (d, exp, target);
34322 case FLOAT128_FTYPE_FLOAT128:
34323 case FLOAT_FTYPE_FLOAT:
34324 case INT_FTYPE_INT:
34325 case UINT64_FTYPE_INT:
34326 case UINT16_FTYPE_UINT16:
34327 case INT64_FTYPE_INT64:
34328 case INT64_FTYPE_V4SF:
34329 case INT64_FTYPE_V2DF:
34330 case INT_FTYPE_V16QI:
34331 case INT_FTYPE_V8QI:
34332 case INT_FTYPE_V8SF:
34333 case INT_FTYPE_V4DF:
34334 case INT_FTYPE_V4SF:
34335 case INT_FTYPE_V2DF:
34336 case INT_FTYPE_V32QI:
34337 case V16QI_FTYPE_V16QI:
34338 case V8SI_FTYPE_V8SF:
34339 case V8SI_FTYPE_V4SI:
34340 case V8HI_FTYPE_V8HI:
34341 case V8HI_FTYPE_V16QI:
34342 case V8QI_FTYPE_V8QI:
34343 case V8SF_FTYPE_V8SF:
34344 case V8SF_FTYPE_V8SI:
34345 case V8SF_FTYPE_V4SF:
34346 case V8SF_FTYPE_V8HI:
34347 case V4SI_FTYPE_V4SI:
34348 case V4SI_FTYPE_V16QI:
34349 case V4SI_FTYPE_V4SF:
34350 case V4SI_FTYPE_V8SI:
34351 case V4SI_FTYPE_V8HI:
34352 case V4SI_FTYPE_V4DF:
34353 case V4SI_FTYPE_V2DF:
34354 case V4HI_FTYPE_V4HI:
34355 case V4DF_FTYPE_V4DF:
34356 case V4DF_FTYPE_V4SI:
34357 case V4DF_FTYPE_V4SF:
34358 case V4DF_FTYPE_V2DF:
34359 case V4SF_FTYPE_V4SF:
34360 case V4SF_FTYPE_V4SI:
34361 case V4SF_FTYPE_V8SF:
34362 case V4SF_FTYPE_V4DF:
34363 case V4SF_FTYPE_V8HI:
34364 case V4SF_FTYPE_V2DF:
34365 case V2DI_FTYPE_V2DI:
34366 case V2DI_FTYPE_V16QI:
34367 case V2DI_FTYPE_V8HI:
34368 case V2DI_FTYPE_V4SI:
34369 case V2DF_FTYPE_V2DF:
34370 case V2DF_FTYPE_V4SI:
34371 case V2DF_FTYPE_V4DF:
34372 case V2DF_FTYPE_V4SF:
34373 case V2DF_FTYPE_V2SI:
34374 case V2SI_FTYPE_V2SI:
34375 case V2SI_FTYPE_V4SF:
34376 case V2SI_FTYPE_V2SF:
34377 case V2SI_FTYPE_V2DF:
34378 case V2SF_FTYPE_V2SF:
34379 case V2SF_FTYPE_V2SI:
34380 case V32QI_FTYPE_V32QI:
34381 case V32QI_FTYPE_V16QI:
34382 case V16HI_FTYPE_V16HI:
34383 case V16HI_FTYPE_V8HI:
34384 case V8SI_FTYPE_V8SI:
34385 case V16HI_FTYPE_V16QI:
34386 case V8SI_FTYPE_V16QI:
34387 case V4DI_FTYPE_V16QI:
34388 case V8SI_FTYPE_V8HI:
34389 case V4DI_FTYPE_V8HI:
34390 case V4DI_FTYPE_V4SI:
34391 case V4DI_FTYPE_V2DI:
34392 case UHI_FTYPE_UHI:
34393 case UHI_FTYPE_V16QI:
34394 case USI_FTYPE_V32QI:
34395 case UDI_FTYPE_V64QI:
34396 case V16QI_FTYPE_UHI:
34397 case V32QI_FTYPE_USI:
34398 case V64QI_FTYPE_UDI:
34399 case V8HI_FTYPE_UQI:
34400 case V16HI_FTYPE_UHI:
34401 case V32HI_FTYPE_USI:
34402 case V4SI_FTYPE_UQI:
34403 case V8SI_FTYPE_UQI:
34404 case V4SI_FTYPE_UHI:
34405 case V8SI_FTYPE_UHI:
34406 case UQI_FTYPE_V8HI:
34407 case UHI_FTYPE_V16HI:
34408 case USI_FTYPE_V32HI:
34409 case UQI_FTYPE_V4SI:
34410 case UQI_FTYPE_V8SI:
34411 case UHI_FTYPE_V16SI:
34412 case UQI_FTYPE_V2DI:
34413 case UQI_FTYPE_V4DI:
34414 case UQI_FTYPE_V8DI:
34415 case V16SI_FTYPE_UHI:
34416 case V2DI_FTYPE_UQI:
34417 case V4DI_FTYPE_UQI:
34418 case V16SI_FTYPE_INT:
34419 case V16SF_FTYPE_V8SF:
34420 case V16SI_FTYPE_V8SI:
34421 case V16SF_FTYPE_V4SF:
34422 case V16SI_FTYPE_V4SI:
34423 case V16SI_FTYPE_V16SF:
34424 case V16SF_FTYPE_V16SF:
34425 case V8DI_FTYPE_UQI:
34426 case V8DF_FTYPE_V4DF:
34427 case V8DF_FTYPE_V2DF:
34428 case V8DF_FTYPE_V8DF:
34429 nargs = 1;
34430 break;
34431 case V4SF_FTYPE_V4SF_VEC_MERGE:
34432 case V2DF_FTYPE_V2DF_VEC_MERGE:
34433 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34434 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34435 case V16QI_FTYPE_V16QI_V16QI:
34436 case V16QI_FTYPE_V8HI_V8HI:
34437 case V16SF_FTYPE_V16SF_V16SF:
34438 case V8QI_FTYPE_V8QI_V8QI:
34439 case V8QI_FTYPE_V4HI_V4HI:
34440 case V8HI_FTYPE_V8HI_V8HI:
34441 case V8HI_FTYPE_V16QI_V16QI:
34442 case V8HI_FTYPE_V4SI_V4SI:
34443 case V8SF_FTYPE_V8SF_V8SF:
34444 case V8SF_FTYPE_V8SF_V8SI:
34445 case V8DF_FTYPE_V8DF_V8DF:
34446 case V4SI_FTYPE_V4SI_V4SI:
34447 case V4SI_FTYPE_V8HI_V8HI:
34448 case V4SI_FTYPE_V2DF_V2DF:
34449 case V4HI_FTYPE_V4HI_V4HI:
34450 case V4HI_FTYPE_V8QI_V8QI:
34451 case V4HI_FTYPE_V2SI_V2SI:
34452 case V4DF_FTYPE_V4DF_V4DF:
34453 case V4DF_FTYPE_V4DF_V4DI:
34454 case V4SF_FTYPE_V4SF_V4SF:
34455 case V4SF_FTYPE_V4SF_V4SI:
34456 case V4SF_FTYPE_V4SF_V2SI:
34457 case V4SF_FTYPE_V4SF_V2DF:
34458 case V4SF_FTYPE_V4SF_UINT:
34459 case V4SF_FTYPE_V4SF_DI:
34460 case V4SF_FTYPE_V4SF_SI:
34461 case V2DI_FTYPE_V2DI_V2DI:
34462 case V2DI_FTYPE_V16QI_V16QI:
34463 case V2DI_FTYPE_V4SI_V4SI:
34464 case V2DI_FTYPE_V2DI_V16QI:
34465 case V2SI_FTYPE_V2SI_V2SI:
34466 case V2SI_FTYPE_V4HI_V4HI:
34467 case V2SI_FTYPE_V2SF_V2SF:
34468 case V2DF_FTYPE_V2DF_V2DF:
34469 case V2DF_FTYPE_V2DF_V4SF:
34470 case V2DF_FTYPE_V2DF_V2DI:
34471 case V2DF_FTYPE_V2DF_DI:
34472 case V2DF_FTYPE_V2DF_SI:
34473 case V2DF_FTYPE_V2DF_UINT:
34474 case V2SF_FTYPE_V2SF_V2SF:
34475 case V1DI_FTYPE_V1DI_V1DI:
34476 case V1DI_FTYPE_V8QI_V8QI:
34477 case V1DI_FTYPE_V2SI_V2SI:
34478 case V32QI_FTYPE_V16HI_V16HI:
34479 case V16HI_FTYPE_V8SI_V8SI:
34480 case V32QI_FTYPE_V32QI_V32QI:
34481 case V16HI_FTYPE_V32QI_V32QI:
34482 case V16HI_FTYPE_V16HI_V16HI:
34483 case V8SI_FTYPE_V4DF_V4DF:
34484 case V8SI_FTYPE_V8SI_V8SI:
34485 case V8SI_FTYPE_V16HI_V16HI:
34486 case V4DI_FTYPE_V4DI_V4DI:
34487 case V4DI_FTYPE_V8SI_V8SI:
34488 case V8DI_FTYPE_V64QI_V64QI:
34489 if (comparison == UNKNOWN)
34490 return ix86_expand_binop_builtin (icode, exp, target);
34491 nargs = 2;
34492 break;
34493 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34494 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34495 gcc_assert (comparison != UNKNOWN);
34496 nargs = 2;
34497 swap = true;
34498 break;
34499 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34500 case V16HI_FTYPE_V16HI_SI_COUNT:
34501 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34502 case V8SI_FTYPE_V8SI_SI_COUNT:
34503 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34504 case V4DI_FTYPE_V4DI_INT_COUNT:
34505 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34506 case V8HI_FTYPE_V8HI_SI_COUNT:
34507 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34508 case V4SI_FTYPE_V4SI_SI_COUNT:
34509 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34510 case V4HI_FTYPE_V4HI_SI_COUNT:
34511 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34512 case V2DI_FTYPE_V2DI_SI_COUNT:
34513 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34514 case V2SI_FTYPE_V2SI_SI_COUNT:
34515 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34516 case V1DI_FTYPE_V1DI_SI_COUNT:
34517 nargs = 2;
34518 last_arg_count = true;
34519 break;
34520 case UINT64_FTYPE_UINT64_UINT64:
34521 case UINT_FTYPE_UINT_UINT:
34522 case UINT_FTYPE_UINT_USHORT:
34523 case UINT_FTYPE_UINT_UCHAR:
34524 case UINT16_FTYPE_UINT16_INT:
34525 case UINT8_FTYPE_UINT8_INT:
34526 case UHI_FTYPE_UHI_UHI:
34527 case USI_FTYPE_USI_USI:
34528 case UDI_FTYPE_UDI_UDI:
34529 case V16SI_FTYPE_V8DF_V8DF:
34530 nargs = 2;
34531 break;
34532 case V2DI_FTYPE_V2DI_INT_CONVERT:
34533 nargs = 2;
34534 rmode = V1TImode;
34535 nargs_constant = 1;
34536 break;
34537 case V4DI_FTYPE_V4DI_INT_CONVERT:
34538 nargs = 2;
34539 rmode = V2TImode;
34540 nargs_constant = 1;
34541 break;
34542 case V8DI_FTYPE_V8DI_INT_CONVERT:
34543 nargs = 2;
34544 rmode = V4TImode;
34545 nargs_constant = 1;
34546 break;
34547 case V8HI_FTYPE_V8HI_INT:
34548 case V8HI_FTYPE_V8SF_INT:
34549 case V16HI_FTYPE_V16SF_INT:
34550 case V8HI_FTYPE_V4SF_INT:
34551 case V8SF_FTYPE_V8SF_INT:
34552 case V4SF_FTYPE_V16SF_INT:
34553 case V16SF_FTYPE_V16SF_INT:
34554 case V4SI_FTYPE_V4SI_INT:
34555 case V4SI_FTYPE_V8SI_INT:
34556 case V4HI_FTYPE_V4HI_INT:
34557 case V4DF_FTYPE_V4DF_INT:
34558 case V4DF_FTYPE_V8DF_INT:
34559 case V4SF_FTYPE_V4SF_INT:
34560 case V4SF_FTYPE_V8SF_INT:
34561 case V2DI_FTYPE_V2DI_INT:
34562 case V2DF_FTYPE_V2DF_INT:
34563 case V2DF_FTYPE_V4DF_INT:
34564 case V16HI_FTYPE_V16HI_INT:
34565 case V8SI_FTYPE_V8SI_INT:
34566 case V16SI_FTYPE_V16SI_INT:
34567 case V4SI_FTYPE_V16SI_INT:
34568 case V4DI_FTYPE_V4DI_INT:
34569 case V2DI_FTYPE_V4DI_INT:
34570 case V4DI_FTYPE_V8DI_INT:
34571 case QI_FTYPE_V4SF_INT:
34572 case QI_FTYPE_V2DF_INT:
34573 nargs = 2;
34574 nargs_constant = 1;
34575 break;
34576 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34577 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34578 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34579 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34580 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34581 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34582 case UHI_FTYPE_V16SI_V16SI_UHI:
34583 case UQI_FTYPE_V8DI_V8DI_UQI:
34584 case V16HI_FTYPE_V16SI_V16HI_UHI:
34585 case V16QI_FTYPE_V16SI_V16QI_UHI:
34586 case V16QI_FTYPE_V8DI_V16QI_UQI:
34587 case V16SF_FTYPE_V16SF_V16SF_UHI:
34588 case V16SF_FTYPE_V4SF_V16SF_UHI:
34589 case V16SI_FTYPE_SI_V16SI_UHI:
34590 case V16SI_FTYPE_V16HI_V16SI_UHI:
34591 case V16SI_FTYPE_V16QI_V16SI_UHI:
34592 case V8SF_FTYPE_V4SF_V8SF_UQI:
34593 case V4DF_FTYPE_V2DF_V4DF_UQI:
34594 case V8SI_FTYPE_V4SI_V8SI_UQI:
34595 case V8SI_FTYPE_SI_V8SI_UQI:
34596 case V4SI_FTYPE_V4SI_V4SI_UQI:
34597 case V4SI_FTYPE_SI_V4SI_UQI:
34598 case V4DI_FTYPE_V2DI_V4DI_UQI:
34599 case V4DI_FTYPE_DI_V4DI_UQI:
34600 case V2DI_FTYPE_V2DI_V2DI_UQI:
34601 case V2DI_FTYPE_DI_V2DI_UQI:
34602 case V64QI_FTYPE_V64QI_V64QI_UDI:
34603 case V64QI_FTYPE_V16QI_V64QI_UDI:
34604 case V64QI_FTYPE_QI_V64QI_UDI:
34605 case V32QI_FTYPE_V32QI_V32QI_USI:
34606 case V32QI_FTYPE_V16QI_V32QI_USI:
34607 case V32QI_FTYPE_QI_V32QI_USI:
34608 case V16QI_FTYPE_V16QI_V16QI_UHI:
34609 case V16QI_FTYPE_QI_V16QI_UHI:
34610 case V32HI_FTYPE_V8HI_V32HI_USI:
34611 case V32HI_FTYPE_HI_V32HI_USI:
34612 case V16HI_FTYPE_V8HI_V16HI_UHI:
34613 case V16HI_FTYPE_HI_V16HI_UHI:
34614 case V8HI_FTYPE_V8HI_V8HI_UQI:
34615 case V8HI_FTYPE_HI_V8HI_UQI:
34616 case V8SF_FTYPE_V8HI_V8SF_UQI:
34617 case V4SF_FTYPE_V8HI_V4SF_UQI:
34618 case V8SI_FTYPE_V8SF_V8SI_UQI:
34619 case V4SI_FTYPE_V4SF_V4SI_UQI:
34620 case V4DI_FTYPE_V4SF_V4DI_UQI:
34621 case V2DI_FTYPE_V4SF_V2DI_UQI:
34622 case V4SF_FTYPE_V4DI_V4SF_UQI:
34623 case V4SF_FTYPE_V2DI_V4SF_UQI:
34624 case V4DF_FTYPE_V4DI_V4DF_UQI:
34625 case V2DF_FTYPE_V2DI_V2DF_UQI:
34626 case V16QI_FTYPE_V8HI_V16QI_UQI:
34627 case V16QI_FTYPE_V16HI_V16QI_UHI:
34628 case V16QI_FTYPE_V4SI_V16QI_UQI:
34629 case V16QI_FTYPE_V8SI_V16QI_UQI:
34630 case V8HI_FTYPE_V4SI_V8HI_UQI:
34631 case V8HI_FTYPE_V8SI_V8HI_UQI:
34632 case V16QI_FTYPE_V2DI_V16QI_UQI:
34633 case V16QI_FTYPE_V4DI_V16QI_UQI:
34634 case V8HI_FTYPE_V2DI_V8HI_UQI:
34635 case V8HI_FTYPE_V4DI_V8HI_UQI:
34636 case V4SI_FTYPE_V2DI_V4SI_UQI:
34637 case V4SI_FTYPE_V4DI_V4SI_UQI:
34638 case V32QI_FTYPE_V32HI_V32QI_USI:
34639 case UHI_FTYPE_V16QI_V16QI_UHI:
34640 case USI_FTYPE_V32QI_V32QI_USI:
34641 case UDI_FTYPE_V64QI_V64QI_UDI:
34642 case UQI_FTYPE_V8HI_V8HI_UQI:
34643 case UHI_FTYPE_V16HI_V16HI_UHI:
34644 case USI_FTYPE_V32HI_V32HI_USI:
34645 case UQI_FTYPE_V4SI_V4SI_UQI:
34646 case UQI_FTYPE_V8SI_V8SI_UQI:
34647 case UQI_FTYPE_V2DI_V2DI_UQI:
34648 case UQI_FTYPE_V4DI_V4DI_UQI:
34649 case V4SF_FTYPE_V2DF_V4SF_UQI:
34650 case V4SF_FTYPE_V4DF_V4SF_UQI:
34651 case V16SI_FTYPE_V16SI_V16SI_UHI:
34652 case V16SI_FTYPE_V4SI_V16SI_UHI:
34653 case V2DI_FTYPE_V4SI_V2DI_UQI:
34654 case V2DI_FTYPE_V8HI_V2DI_UQI:
34655 case V2DI_FTYPE_V16QI_V2DI_UQI:
34656 case V4DI_FTYPE_V4DI_V4DI_UQI:
34657 case V4DI_FTYPE_V4SI_V4DI_UQI:
34658 case V4DI_FTYPE_V8HI_V4DI_UQI:
34659 case V4DI_FTYPE_V16QI_V4DI_UQI:
34660 case V4DI_FTYPE_V4DF_V4DI_UQI:
34661 case V2DI_FTYPE_V2DF_V2DI_UQI:
34662 case V4SI_FTYPE_V4DF_V4SI_UQI:
34663 case V4SI_FTYPE_V2DF_V4SI_UQI:
34664 case V4SI_FTYPE_V8HI_V4SI_UQI:
34665 case V4SI_FTYPE_V16QI_V4SI_UQI:
34666 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34667 case V8DF_FTYPE_V2DF_V8DF_UQI:
34668 case V8DF_FTYPE_V4DF_V8DF_UQI:
34669 case V8DF_FTYPE_V8DF_V8DF_UQI:
34670 case V8SF_FTYPE_V8SF_V8SF_UQI:
34671 case V8SF_FTYPE_V8SI_V8SF_UQI:
34672 case V4DF_FTYPE_V4DF_V4DF_UQI:
34673 case V4SF_FTYPE_V4SF_V4SF_UQI:
34674 case V2DF_FTYPE_V2DF_V2DF_UQI:
34675 case V2DF_FTYPE_V4SF_V2DF_UQI:
34676 case V2DF_FTYPE_V4SI_V2DF_UQI:
34677 case V4SF_FTYPE_V4SI_V4SF_UQI:
34678 case V4DF_FTYPE_V4SF_V4DF_UQI:
34679 case V4DF_FTYPE_V4SI_V4DF_UQI:
34680 case V8SI_FTYPE_V8SI_V8SI_UQI:
34681 case V8SI_FTYPE_V8HI_V8SI_UQI:
34682 case V8SI_FTYPE_V16QI_V8SI_UQI:
34683 case V8DF_FTYPE_V8SI_V8DF_UQI:
34684 case V8DI_FTYPE_DI_V8DI_UQI:
34685 case V16SF_FTYPE_V8SF_V16SF_UHI:
34686 case V16SI_FTYPE_V8SI_V16SI_UHI:
34687 case V16HI_FTYPE_V16HI_V16HI_UHI:
34688 case V8HI_FTYPE_V16QI_V8HI_UQI:
34689 case V16HI_FTYPE_V16QI_V16HI_UHI:
34690 case V32HI_FTYPE_V32HI_V32HI_USI:
34691 case V32HI_FTYPE_V32QI_V32HI_USI:
34692 case V8DI_FTYPE_V16QI_V8DI_UQI:
34693 case V8DI_FTYPE_V2DI_V8DI_UQI:
34694 case V8DI_FTYPE_V4DI_V8DI_UQI:
34695 case V8DI_FTYPE_V8DI_V8DI_UQI:
34696 case V8DI_FTYPE_V8HI_V8DI_UQI:
34697 case V8DI_FTYPE_V8SI_V8DI_UQI:
34698 case V8HI_FTYPE_V8DI_V8HI_UQI:
34699 case V8SI_FTYPE_V8DI_V8SI_UQI:
34700 case V4SI_FTYPE_V4SI_V4SI_V4SI:
34701 nargs = 3;
34702 break;
34703 case V32QI_FTYPE_V32QI_V32QI_INT:
34704 case V16HI_FTYPE_V16HI_V16HI_INT:
34705 case V16QI_FTYPE_V16QI_V16QI_INT:
34706 case V4DI_FTYPE_V4DI_V4DI_INT:
34707 case V8HI_FTYPE_V8HI_V8HI_INT:
34708 case V8SI_FTYPE_V8SI_V8SI_INT:
34709 case V8SI_FTYPE_V8SI_V4SI_INT:
34710 case V8SF_FTYPE_V8SF_V8SF_INT:
34711 case V8SF_FTYPE_V8SF_V4SF_INT:
34712 case V4SI_FTYPE_V4SI_V4SI_INT:
34713 case V4DF_FTYPE_V4DF_V4DF_INT:
34714 case V16SF_FTYPE_V16SF_V16SF_INT:
34715 case V16SF_FTYPE_V16SF_V4SF_INT:
34716 case V16SI_FTYPE_V16SI_V4SI_INT:
34717 case V4DF_FTYPE_V4DF_V2DF_INT:
34718 case V4SF_FTYPE_V4SF_V4SF_INT:
34719 case V2DI_FTYPE_V2DI_V2DI_INT:
34720 case V4DI_FTYPE_V4DI_V2DI_INT:
34721 case V2DF_FTYPE_V2DF_V2DF_INT:
34722 case UQI_FTYPE_V8DI_V8UDI_INT:
34723 case UQI_FTYPE_V8DF_V8DF_INT:
34724 case UQI_FTYPE_V2DF_V2DF_INT:
34725 case UQI_FTYPE_V4SF_V4SF_INT:
34726 case UHI_FTYPE_V16SI_V16SI_INT:
34727 case UHI_FTYPE_V16SF_V16SF_INT:
34728 nargs = 3;
34729 nargs_constant = 1;
34730 break;
34731 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
34732 nargs = 3;
34733 rmode = V4DImode;
34734 nargs_constant = 1;
34735 break;
34736 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
34737 nargs = 3;
34738 rmode = V2DImode;
34739 nargs_constant = 1;
34740 break;
34741 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
34742 nargs = 3;
34743 rmode = DImode;
34744 nargs_constant = 1;
34745 break;
34746 case V2DI_FTYPE_V2DI_UINT_UINT:
34747 nargs = 3;
34748 nargs_constant = 2;
34749 break;
34750 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
34751 nargs = 3;
34752 rmode = V8DImode;
34753 nargs_constant = 1;
34754 break;
34755 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
34756 nargs = 5;
34757 rmode = V8DImode;
34758 mask_pos = 2;
34759 nargs_constant = 1;
34760 break;
34761 case QI_FTYPE_V8DF_INT_UQI:
34762 case QI_FTYPE_V4DF_INT_UQI:
34763 case QI_FTYPE_V2DF_INT_UQI:
34764 case HI_FTYPE_V16SF_INT_UHI:
34765 case QI_FTYPE_V8SF_INT_UQI:
34766 case QI_FTYPE_V4SF_INT_UQI:
34767 nargs = 3;
34768 mask_pos = 1;
34769 nargs_constant = 1;
34770 break;
34771 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
34772 nargs = 5;
34773 rmode = V4DImode;
34774 mask_pos = 2;
34775 nargs_constant = 1;
34776 break;
34777 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
34778 nargs = 5;
34779 rmode = V2DImode;
34780 mask_pos = 2;
34781 nargs_constant = 1;
34782 break;
34783 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
34784 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
34785 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
34786 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
34787 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
34788 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
34789 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
34790 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
34791 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
34792 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
34793 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
34794 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
34795 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
34796 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
34797 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
34798 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
34799 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
34800 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
34801 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
34802 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
34803 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
34804 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
34805 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
34806 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
34807 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
34808 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
34809 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
34810 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
34811 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
34812 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
34813 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
34814 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
34815 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
34816 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
34817 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
34818 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
34819 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
34820 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
34821 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
34822 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
34823 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
34824 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
34825 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
34826 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34827 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34828 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34829 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34830 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34831 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34832 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34833 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34834 nargs = 4;
34835 break;
34836 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34837 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34838 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34839 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34840 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34841 nargs = 4;
34842 nargs_constant = 1;
34843 break;
34844 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34845 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34846 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34847 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34848 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34849 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34850 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34851 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34852 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34853 case USI_FTYPE_V32QI_V32QI_INT_USI:
34854 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34855 case USI_FTYPE_V32HI_V32HI_INT_USI:
34856 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34857 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34858 nargs = 4;
34859 mask_pos = 1;
34860 nargs_constant = 1;
34861 break;
34862 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34863 nargs = 4;
34864 nargs_constant = 2;
34865 break;
34866 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34867 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34868 nargs = 4;
34869 break;
34870 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34871 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34872 mask_pos = 1;
34873 nargs = 4;
34874 nargs_constant = 1;
34875 break;
34876 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34877 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34878 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34879 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34880 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34881 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34882 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34883 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34884 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34885 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34886 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34887 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34888 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34889 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34890 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34891 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34892 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34893 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34894 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34895 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34896 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34897 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34898 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34899 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34900 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34901 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34902 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34903 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34904 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34905 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34906 nargs = 4;
34907 mask_pos = 2;
34908 nargs_constant = 1;
34909 break;
34910 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34911 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34912 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34913 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34914 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34915 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34916 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34917 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34918 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34919 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34920 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34921 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34922 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34923 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34924 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34925 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34926 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34927 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34928 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34929 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34930 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34931 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34932 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34933 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34934 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34935 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34936 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34937 nargs = 5;
34938 mask_pos = 2;
34939 nargs_constant = 1;
34940 break;
34941 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34942 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34943 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34944 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34945 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34946 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34947 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34948 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34949 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34950 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34951 nargs = 5;
34952 mask_pos = 1;
34953 nargs_constant = 1;
34954 break;
34956 default:
34957 gcc_unreachable ();
34960 gcc_assert (nargs <= ARRAY_SIZE (args));
34962 if (comparison != UNKNOWN)
34964 gcc_assert (nargs == 2);
34965 return ix86_expand_sse_compare (d, exp, target, swap);
34968 if (rmode == VOIDmode || rmode == tmode)
34970 if (optimize
34971 || target == 0
34972 || GET_MODE (target) != tmode
34973 || !insn_p->operand[0].predicate (target, tmode))
34974 target = gen_reg_rtx (tmode);
34975 real_target = target;
34977 else
34979 real_target = gen_reg_rtx (tmode);
34980 target = lowpart_subreg (rmode, real_target, tmode);
34983 for (i = 0; i < nargs; i++)
34985 tree arg = CALL_EXPR_ARG (exp, i);
34986 rtx op = expand_normal (arg);
34987 machine_mode mode = insn_p->operand[i + 1].mode;
34988 bool match = insn_p->operand[i + 1].predicate (op, mode);
34990 if (last_arg_count && (i + 1) == nargs)
34992 /* SIMD shift insns take either an 8-bit immediate or a
34993 register as the count, but the builtin functions take an int as
34994 the count. If the count doesn't match, we put it in a register. */
34995 if (!match)
34997 op = lowpart_subreg (SImode, op, GET_MODE (op));
34998 if (!insn_p->operand[i + 1].predicate (op, mode))
34999 op = copy_to_reg (op);
35002 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35003 (!mask_pos && (nargs - i) <= nargs_constant))
35005 if (!match)
35006 switch (icode)
35008 case CODE_FOR_avx_vinsertf128v4di:
35009 case CODE_FOR_avx_vextractf128v4di:
35010 error ("the last argument must be a 1-bit immediate");
35011 return const0_rtx;
35013 case CODE_FOR_avx512f_cmpv8di3_mask:
35014 case CODE_FOR_avx512f_cmpv16si3_mask:
35015 case CODE_FOR_avx512f_ucmpv8di3_mask:
35016 case CODE_FOR_avx512f_ucmpv16si3_mask:
35017 case CODE_FOR_avx512vl_cmpv4di3_mask:
35018 case CODE_FOR_avx512vl_cmpv8si3_mask:
35019 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35020 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35021 case CODE_FOR_avx512vl_cmpv2di3_mask:
35022 case CODE_FOR_avx512vl_cmpv4si3_mask:
35023 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35024 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35025 error ("the last argument must be a 3-bit immediate");
35026 return const0_rtx;
35028 case CODE_FOR_sse4_1_roundsd:
35029 case CODE_FOR_sse4_1_roundss:
35031 case CODE_FOR_sse4_1_roundpd:
35032 case CODE_FOR_sse4_1_roundps:
35033 case CODE_FOR_avx_roundpd256:
35034 case CODE_FOR_avx_roundps256:
35036 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35037 case CODE_FOR_sse4_1_roundps_sfix:
35038 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35039 case CODE_FOR_avx_roundps_sfix256:
35041 case CODE_FOR_sse4_1_blendps:
35042 case CODE_FOR_avx_blendpd256:
35043 case CODE_FOR_avx_vpermilv4df:
35044 case CODE_FOR_avx_vpermilv4df_mask:
35045 case CODE_FOR_avx512f_getmantv8df_mask:
35046 case CODE_FOR_avx512f_getmantv16sf_mask:
35047 case CODE_FOR_avx512vl_getmantv8sf_mask:
35048 case CODE_FOR_avx512vl_getmantv4df_mask:
35049 case CODE_FOR_avx512vl_getmantv4sf_mask:
35050 case CODE_FOR_avx512vl_getmantv2df_mask:
35051 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35052 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35053 case CODE_FOR_avx512dq_rangepv4df_mask:
35054 case CODE_FOR_avx512dq_rangepv8sf_mask:
35055 case CODE_FOR_avx512dq_rangepv2df_mask:
35056 case CODE_FOR_avx512dq_rangepv4sf_mask:
35057 case CODE_FOR_avx_shufpd256_mask:
35058 error ("the last argument must be a 4-bit immediate");
35059 return const0_rtx;
35061 case CODE_FOR_sha1rnds4:
35062 case CODE_FOR_sse4_1_blendpd:
35063 case CODE_FOR_avx_vpermilv2df:
35064 case CODE_FOR_avx_vpermilv2df_mask:
35065 case CODE_FOR_xop_vpermil2v2df3:
35066 case CODE_FOR_xop_vpermil2v4sf3:
35067 case CODE_FOR_xop_vpermil2v4df3:
35068 case CODE_FOR_xop_vpermil2v8sf3:
35069 case CODE_FOR_avx512f_vinsertf32x4_mask:
35070 case CODE_FOR_avx512f_vinserti32x4_mask:
35071 case CODE_FOR_avx512f_vextractf32x4_mask:
35072 case CODE_FOR_avx512f_vextracti32x4_mask:
35073 case CODE_FOR_sse2_shufpd:
35074 case CODE_FOR_sse2_shufpd_mask:
35075 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35076 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35077 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35078 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35079 error ("the last argument must be a 2-bit immediate");
35080 return const0_rtx;
35082 case CODE_FOR_avx_vextractf128v4df:
35083 case CODE_FOR_avx_vextractf128v8sf:
35084 case CODE_FOR_avx_vextractf128v8si:
35085 case CODE_FOR_avx_vinsertf128v4df:
35086 case CODE_FOR_avx_vinsertf128v8sf:
35087 case CODE_FOR_avx_vinsertf128v8si:
35088 case CODE_FOR_avx512f_vinsertf64x4_mask:
35089 case CODE_FOR_avx512f_vinserti64x4_mask:
35090 case CODE_FOR_avx512f_vextractf64x4_mask:
35091 case CODE_FOR_avx512f_vextracti64x4_mask:
35092 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35093 case CODE_FOR_avx512dq_vinserti32x8_mask:
35094 case CODE_FOR_avx512vl_vinsertv4df:
35095 case CODE_FOR_avx512vl_vinsertv4di:
35096 case CODE_FOR_avx512vl_vinsertv8sf:
35097 case CODE_FOR_avx512vl_vinsertv8si:
35098 error ("the last argument must be a 1-bit immediate");
35099 return const0_rtx;
35101 case CODE_FOR_avx_vmcmpv2df3:
35102 case CODE_FOR_avx_vmcmpv4sf3:
35103 case CODE_FOR_avx_cmpv2df3:
35104 case CODE_FOR_avx_cmpv4sf3:
35105 case CODE_FOR_avx_cmpv4df3:
35106 case CODE_FOR_avx_cmpv8sf3:
35107 case CODE_FOR_avx512f_cmpv8df3_mask:
35108 case CODE_FOR_avx512f_cmpv16sf3_mask:
35109 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35110 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35111 error ("the last argument must be a 5-bit immediate");
35112 return const0_rtx;
35114 default:
35115 switch (nargs_constant)
35117 case 2:
35118 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35119 (!mask_pos && (nargs - i) == nargs_constant))
35121 error ("the next-to-last argument must be an 8-bit immediate");
35122 break;
35124 /* FALLTHRU */
35125 case 1:
35126 error ("the last argument must be an 8-bit immediate");
35127 break;
35128 default:
35129 gcc_unreachable ();
35131 return const0_rtx;
35134 else
35136 if (VECTOR_MODE_P (mode))
35137 op = safe_vector_operand (op, mode);
35139 /* If we aren't optimizing, only allow one memory operand to
35140 be generated. */
35141 if (memory_operand (op, mode))
35142 num_memory++;
35144 op = fixup_modeless_constant (op, mode);
35146 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35148 if (optimize || !match || num_memory > 1)
35149 op = copy_to_mode_reg (mode, op);
35151 else
35153 op = copy_to_reg (op);
35154 op = lowpart_subreg (mode, op, GET_MODE (op));
35158 args[i].op = op;
35159 args[i].mode = mode;
35162 switch (nargs)
35164 case 1:
35165 pat = GEN_FCN (icode) (real_target, args[0].op);
35166 break;
35167 case 2:
35168 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35169 break;
35170 case 3:
35171 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35172 args[2].op);
35173 break;
35174 case 4:
35175 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35176 args[2].op, args[3].op);
35177 break;
35178 case 5:
35179 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35180 args[2].op, args[3].op, args[4].op);
35181 break;
35182 case 6:
35183 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35184 args[2].op, args[3].op, args[4].op,
35185 args[5].op);
35186 break;
35187 default:
35188 gcc_unreachable ();
35191 if (! pat)
35192 return 0;
35194 emit_insn (pat);
35195 return target;
35198 /* Transform a pattern of the following layout:
35199 (parallel [
35200 (set (A B))
35201 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)])
35203 into:
35204 (set (A B))
35206 Or:
35207 (parallel [ A B
35208 ...
35209 (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
35210 ...
35211 ])
35212 into:
35213 (parallel [ A B ... ]) */
35215 static rtx
35216 ix86_erase_embedded_rounding (rtx pat)
35218 if (GET_CODE (pat) == INSN)
35219 pat = PATTERN (pat);
35221 gcc_assert (GET_CODE (pat) == PARALLEL);
35223 if (XVECLEN (pat, 0) == 2)
35225 rtx p0 = XVECEXP (pat, 0, 0);
35226 rtx p1 = XVECEXP (pat, 0, 1);
35228 gcc_assert (GET_CODE (p0) == SET
35229 && GET_CODE (p1) == UNSPEC
35230 && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
35232 return p0;
35234 else
35236 rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
35237 int i = 0;
35238 int j = 0;
35240 for (; i < XVECLEN (pat, 0); ++i)
35242 rtx elem = XVECEXP (pat, 0, i);
35243 if (GET_CODE (elem) != UNSPEC
35244 || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
35245 res [j++] = elem;
35248 /* No more than 1 occurrence was removed. */
35249 gcc_assert (j >= XVECLEN (pat, 0) - 1);
35251 return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
35255 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35256 with rounding. */
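/* A minimal illustration (intrinsic spelling assumed from the AVX-512
   headers): a call such as

     int ge = _mm_comi_round_ss (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);

   carries the comparison predicate and the SAE control as separate integer
   arguments.  The predicate is translated below through comi_comparisons[]
   (possibly switching to the unordered ucomi form), and a NO_ROUND rounding
   operand makes the embedded-rounding unspec redundant, so it is stripped
   again with ix86_erase_embedded_rounding.  */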
35257 static rtx
35258 ix86_expand_sse_comi_round (const struct builtin_description *d,
35259 tree exp, rtx target)
35261 rtx pat, set_dst;
35262 tree arg0 = CALL_EXPR_ARG (exp, 0);
35263 tree arg1 = CALL_EXPR_ARG (exp, 1);
35264 tree arg2 = CALL_EXPR_ARG (exp, 2);
35265 tree arg3 = CALL_EXPR_ARG (exp, 3);
35266 rtx op0 = expand_normal (arg0);
35267 rtx op1 = expand_normal (arg1);
35268 rtx op2 = expand_normal (arg2);
35269 rtx op3 = expand_normal (arg3);
35270 enum insn_code icode = d->icode;
35271 const struct insn_data_d *insn_p = &insn_data[icode];
35272 machine_mode mode0 = insn_p->operand[0].mode;
35273 machine_mode mode1 = insn_p->operand[1].mode;
35274 enum rtx_code comparison = UNEQ;
35275 bool need_ucomi = false;
35277 /* See avxintrin.h for values. */
35278 enum rtx_code comi_comparisons[32] =
35280 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35281 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35282 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35284 bool need_ucomi_values[32] =
35286 true, false, false, true, true, false, false, true,
35287 true, false, false, true, true, false, false, true,
35288 false, true, true, false, false, true, true, false,
35289 false, true, true, false, false, true, true, false
35292 if (!CONST_INT_P (op2))
35294 error ("the third argument must be a comparison constant");
35295 return const0_rtx;
35297 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35299 error ("incorrect comparison mode");
35300 return const0_rtx;
35303 if (!insn_p->operand[2].predicate (op3, SImode))
35305 error ("incorrect rounding operand");
35306 return const0_rtx;
35309 comparison = comi_comparisons[INTVAL (op2)];
35310 need_ucomi = need_ucomi_values[INTVAL (op2)];
35312 if (VECTOR_MODE_P (mode0))
35313 op0 = safe_vector_operand (op0, mode0);
35314 if (VECTOR_MODE_P (mode1))
35315 op1 = safe_vector_operand (op1, mode1);
35317 target = gen_reg_rtx (SImode);
35318 emit_move_insn (target, const0_rtx);
35319 target = gen_rtx_SUBREG (QImode, target, 0);
35321 if ((optimize && !register_operand (op0, mode0))
35322 || !insn_p->operand[0].predicate (op0, mode0))
35323 op0 = copy_to_mode_reg (mode0, op0);
35324 if ((optimize && !register_operand (op1, mode1))
35325 || !insn_p->operand[1].predicate (op1, mode1))
35326 op1 = copy_to_mode_reg (mode1, op1);
35328 if (need_ucomi)
35329 icode = icode == CODE_FOR_sse_comi_round
35330 ? CODE_FOR_sse_ucomi_round
35331 : CODE_FOR_sse2_ucomi_round;
35333 pat = GEN_FCN (icode) (op0, op1, op3);
35334 if (! pat)
35335 return 0;
35337 /* The rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35338 if (INTVAL (op3) == NO_ROUND)
35340 pat = ix86_erase_embedded_rounding (pat);
35341 if (! pat)
35342 return 0;
35344 set_dst = SET_DEST (pat);
35346 else
35348 gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
35349 set_dst = SET_DEST (XVECEXP (pat, 0, 0));
35352 emit_insn (pat);
35353 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35354 gen_rtx_fmt_ee (comparison, QImode,
35355 set_dst,
35356 const0_rtx)));
35358 return SUBREG_REG (target);
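/* Expand builtins that take an explicit embedded-rounding / SAE operand.
   A sketch of the intent, with an intrinsic spelling that is an assumption
   rather than something defined in this file:

     __m512d r = _mm512_add_round_pd (a, b,
                                      _MM_FROUND_TO_NEAREST_INT
                                      | _MM_FROUND_NO_EXC);

   passes the rounding mode as its final operand; when that operand is
   NO_ROUND, the embedded-rounding unspec is redundant and is removed by
   ix86_erase_embedded_rounding before the insn is emitted.  */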
35361 static rtx
35362 ix86_expand_round_builtin (const struct builtin_description *d,
35363 tree exp, rtx target)
35365 rtx pat;
35366 unsigned int i, nargs;
35367 struct
35369 rtx op;
35370 machine_mode mode;
35371 } args[6];
35372 enum insn_code icode = d->icode;
35373 const struct insn_data_d *insn_p = &insn_data[icode];
35374 machine_mode tmode = insn_p->operand[0].mode;
35375 unsigned int nargs_constant = 0;
35376 unsigned int redundant_embed_rnd = 0;
35378 switch ((enum ix86_builtin_func_type) d->flag)
35380 case UINT64_FTYPE_V2DF_INT:
35381 case UINT64_FTYPE_V4SF_INT:
35382 case UINT_FTYPE_V2DF_INT:
35383 case UINT_FTYPE_V4SF_INT:
35384 case INT64_FTYPE_V2DF_INT:
35385 case INT64_FTYPE_V4SF_INT:
35386 case INT_FTYPE_V2DF_INT:
35387 case INT_FTYPE_V4SF_INT:
35388 nargs = 2;
35389 break;
35390 case V4SF_FTYPE_V4SF_UINT_INT:
35391 case V4SF_FTYPE_V4SF_UINT64_INT:
35392 case V2DF_FTYPE_V2DF_UINT64_INT:
35393 case V4SF_FTYPE_V4SF_INT_INT:
35394 case V4SF_FTYPE_V4SF_INT64_INT:
35395 case V2DF_FTYPE_V2DF_INT64_INT:
35396 case V4SF_FTYPE_V4SF_V4SF_INT:
35397 case V2DF_FTYPE_V2DF_V2DF_INT:
35398 case V4SF_FTYPE_V4SF_V2DF_INT:
35399 case V2DF_FTYPE_V2DF_V4SF_INT:
35400 nargs = 3;
35401 break;
35402 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35403 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35404 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35405 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35406 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35407 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35408 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35409 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35410 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35411 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35412 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35413 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35414 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35415 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35416 nargs = 4;
35417 break;
35418 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35419 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35420 nargs_constant = 2;
35421 nargs = 4;
35422 break;
35423 case INT_FTYPE_V4SF_V4SF_INT_INT:
35424 case INT_FTYPE_V2DF_V2DF_INT_INT:
35425 return ix86_expand_sse_comi_round (d, exp, target);
35426 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35427 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35428 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35429 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35430 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35431 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35432 nargs = 5;
35433 break;
35434 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35435 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35436 nargs_constant = 4;
35437 nargs = 5;
35438 break;
35439 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35440 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35441 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35442 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35443 nargs_constant = 3;
35444 nargs = 5;
35445 break;
35446 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35447 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35448 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35449 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35450 nargs = 6;
35451 nargs_constant = 4;
35452 break;
35453 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35454 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35455 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35456 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35457 nargs = 6;
35458 nargs_constant = 3;
35459 break;
35460 default:
35461 gcc_unreachable ();
35463 gcc_assert (nargs <= ARRAY_SIZE (args));
35465 if (optimize
35466 || target == 0
35467 || GET_MODE (target) != tmode
35468 || !insn_p->operand[0].predicate (target, tmode))
35469 target = gen_reg_rtx (tmode);
35471 for (i = 0; i < nargs; i++)
35473 tree arg = CALL_EXPR_ARG (exp, i);
35474 rtx op = expand_normal (arg);
35475 machine_mode mode = insn_p->operand[i + 1].mode;
35476 bool match = insn_p->operand[i + 1].predicate (op, mode);
35478 if (i == nargs - nargs_constant)
35480 if (!match)
35482 switch (icode)
35484 case CODE_FOR_avx512f_getmantv8df_mask_round:
35485 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35486 case CODE_FOR_avx512f_vgetmantv2df_round:
35487 case CODE_FOR_avx512f_vgetmantv4sf_round:
35488 error ("the immediate argument must be a 4-bit immediate");
35489 return const0_rtx;
35490 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35491 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35492 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35493 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35494 error ("the immediate argument must be a 5-bit immediate");
35495 return const0_rtx;
35496 default:
35497 error ("the immediate argument must be an 8-bit immediate");
35498 return const0_rtx;
35502 else if (i == nargs - 1)
35504 if (!insn_p->operand[nargs].predicate (op, SImode))
35506 error ("incorrect rounding operand");
35507 return const0_rtx;
35511 /* If there is no rounding, use the normal version of the pattern. */
35511 if (INTVAL (op) == NO_ROUND)
35512 redundant_embed_rnd = 1;
35514 else
35516 if (VECTOR_MODE_P (mode))
35517 op = safe_vector_operand (op, mode);
35519 op = fixup_modeless_constant (op, mode);
35521 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35523 if (optimize || !match)
35524 op = copy_to_mode_reg (mode, op);
35526 else
35528 op = copy_to_reg (op);
35529 op = lowpart_subreg (mode, op, GET_MODE (op));
35533 args[i].op = op;
35534 args[i].mode = mode;
35537 switch (nargs)
35539 case 1:
35540 pat = GEN_FCN (icode) (target, args[0].op);
35541 break;
35542 case 2:
35543 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35544 break;
35545 case 3:
35546 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35547 args[2].op);
35548 break;
35549 case 4:
35550 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35551 args[2].op, args[3].op);
35552 break;
35553 case 5:
35554 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35555 args[2].op, args[3].op, args[4].op);
35556 break;
35557 case 6:
35558 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35559 args[2].op, args[3].op, args[4].op,
35560 args[5].op);
35561 break;
35562 default:
35563 gcc_unreachable ();
35566 if (!pat)
35567 return 0;
35569 if (redundant_embed_rnd)
35570 pat = ix86_erase_embedded_rounding (pat);
35572 emit_insn (pat);
35573 return target;
35576 /* Subroutine of ix86_expand_builtin to take care of special insns
35577 with variable number of operands. */
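/* A minimal illustration (intrinsic spelling assumed from <emmintrin.h>):
   the "special" builtins are those with memory operands or other side
   effects, classified below as loads or stores.  A non-temporal store such
   as

     _mm_stream_si128 ((__m128i *) p, v);

   takes the store path with the memory operand reserved for the target and
   is flagged as requiring properly aligned memory, while the movntdqa-style
   streaming loads take the load path and are marked aligned in the same
   way.  */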
35579 static rtx
35580 ix86_expand_special_args_builtin (const struct builtin_description *d,
35581 tree exp, rtx target)
35583 tree arg;
35584 rtx pat, op;
35585 unsigned int i, nargs, arg_adjust, memory;
35586 bool aligned_mem = false;
35587 struct
35589 rtx op;
35590 machine_mode mode;
35591 } args[3];
35592 enum insn_code icode = d->icode;
35593 bool last_arg_constant = false;
35594 const struct insn_data_d *insn_p = &insn_data[icode];
35595 machine_mode tmode = insn_p->operand[0].mode;
35596 enum { load, store } klass;
35598 switch ((enum ix86_builtin_func_type) d->flag)
35600 case VOID_FTYPE_VOID:
35601 emit_insn (GEN_FCN (icode) (target));
35602 return 0;
35603 case VOID_FTYPE_UINT64:
35604 case VOID_FTYPE_UNSIGNED:
35605 nargs = 0;
35606 klass = store;
35607 memory = 0;
35608 break;
35610 case INT_FTYPE_VOID:
35611 case USHORT_FTYPE_VOID:
35612 case UINT64_FTYPE_VOID:
35613 case UNSIGNED_FTYPE_VOID:
35614 nargs = 0;
35615 klass = load;
35616 memory = 0;
35617 break;
35618 case UINT64_FTYPE_PUNSIGNED:
35619 case V2DI_FTYPE_PV2DI:
35620 case V4DI_FTYPE_PV4DI:
35621 case V32QI_FTYPE_PCCHAR:
35622 case V16QI_FTYPE_PCCHAR:
35623 case V8SF_FTYPE_PCV4SF:
35624 case V8SF_FTYPE_PCFLOAT:
35625 case V4SF_FTYPE_PCFLOAT:
35626 case V4DF_FTYPE_PCV2DF:
35627 case V4DF_FTYPE_PCDOUBLE:
35628 case V2DF_FTYPE_PCDOUBLE:
35629 case VOID_FTYPE_PVOID:
35630 case V8DI_FTYPE_PV8DI:
35631 nargs = 1;
35632 klass = load;
35633 memory = 0;
35634 switch (icode)
35636 case CODE_FOR_sse4_1_movntdqa:
35637 case CODE_FOR_avx2_movntdqa:
35638 case CODE_FOR_avx512f_movntdqa:
35639 aligned_mem = true;
35640 break;
35641 default:
35642 break;
35644 break;
35645 case VOID_FTYPE_PV2SF_V4SF:
35646 case VOID_FTYPE_PV8DI_V8DI:
35647 case VOID_FTYPE_PV4DI_V4DI:
35648 case VOID_FTYPE_PV2DI_V2DI:
35649 case VOID_FTYPE_PCHAR_V32QI:
35650 case VOID_FTYPE_PCHAR_V16QI:
35651 case VOID_FTYPE_PFLOAT_V16SF:
35652 case VOID_FTYPE_PFLOAT_V8SF:
35653 case VOID_FTYPE_PFLOAT_V4SF:
35654 case VOID_FTYPE_PDOUBLE_V8DF:
35655 case VOID_FTYPE_PDOUBLE_V4DF:
35656 case VOID_FTYPE_PDOUBLE_V2DF:
35657 case VOID_FTYPE_PLONGLONG_LONGLONG:
35658 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35659 case VOID_FTYPE_PINT_INT:
35660 nargs = 1;
35661 klass = store;
35662 /* Reserve memory operand for target. */
35663 memory = ARRAY_SIZE (args);
35664 switch (icode)
35666 /* These builtins and instructions require the memory
35667 to be properly aligned. */
35668 case CODE_FOR_avx_movntv4di:
35669 case CODE_FOR_sse2_movntv2di:
35670 case CODE_FOR_avx_movntv8sf:
35671 case CODE_FOR_sse_movntv4sf:
35672 case CODE_FOR_sse4a_vmmovntv4sf:
35673 case CODE_FOR_avx_movntv4df:
35674 case CODE_FOR_sse2_movntv2df:
35675 case CODE_FOR_sse4a_vmmovntv2df:
35676 case CODE_FOR_sse2_movntidi:
35677 case CODE_FOR_sse_movntq:
35678 case CODE_FOR_sse2_movntisi:
35679 case CODE_FOR_avx512f_movntv16sf:
35680 case CODE_FOR_avx512f_movntv8df:
35681 case CODE_FOR_avx512f_movntv8di:
35682 aligned_mem = true;
35683 break;
35684 default:
35685 break;
35687 break;
35688 case V4SF_FTYPE_V4SF_PCV2SF:
35689 case V2DF_FTYPE_V2DF_PCDOUBLE:
35690 nargs = 2;
35691 klass = load;
35692 memory = 1;
35693 break;
35694 case V8SF_FTYPE_PCV8SF_V8SI:
35695 case V4DF_FTYPE_PCV4DF_V4DI:
35696 case V4SF_FTYPE_PCV4SF_V4SI:
35697 case V2DF_FTYPE_PCV2DF_V2DI:
35698 case V8SI_FTYPE_PCV8SI_V8SI:
35699 case V4DI_FTYPE_PCV4DI_V4DI:
35700 case V4SI_FTYPE_PCV4SI_V4SI:
35701 case V2DI_FTYPE_PCV2DI_V2DI:
35702 nargs = 2;
35703 klass = load;
35704 memory = 0;
35705 break;
35706 case VOID_FTYPE_PV8DF_V8DF_UQI:
35707 case VOID_FTYPE_PV4DF_V4DF_UQI:
35708 case VOID_FTYPE_PV2DF_V2DF_UQI:
35709 case VOID_FTYPE_PV16SF_V16SF_UHI:
35710 case VOID_FTYPE_PV8SF_V8SF_UQI:
35711 case VOID_FTYPE_PV4SF_V4SF_UQI:
35712 case VOID_FTYPE_PV8DI_V8DI_UQI:
35713 case VOID_FTYPE_PV4DI_V4DI_UQI:
35714 case VOID_FTYPE_PV2DI_V2DI_UQI:
35715 case VOID_FTYPE_PV16SI_V16SI_UHI:
35716 case VOID_FTYPE_PV8SI_V8SI_UQI:
35717 case VOID_FTYPE_PV4SI_V4SI_UQI:
35718 switch (icode)
35720 /* These builtins and instructions require the memory
35721 to be properly aligned. */
35722 case CODE_FOR_avx512f_storev16sf_mask:
35723 case CODE_FOR_avx512f_storev16si_mask:
35724 case CODE_FOR_avx512f_storev8df_mask:
35725 case CODE_FOR_avx512f_storev8di_mask:
35726 case CODE_FOR_avx512vl_storev8sf_mask:
35727 case CODE_FOR_avx512vl_storev8si_mask:
35728 case CODE_FOR_avx512vl_storev4df_mask:
35729 case CODE_FOR_avx512vl_storev4di_mask:
35730 case CODE_FOR_avx512vl_storev4sf_mask:
35731 case CODE_FOR_avx512vl_storev4si_mask:
35732 case CODE_FOR_avx512vl_storev2df_mask:
35733 case CODE_FOR_avx512vl_storev2di_mask:
35734 aligned_mem = true;
35735 break;
35736 default:
35737 break;
35739 /* FALLTHRU */
35740 case VOID_FTYPE_PV8SF_V8SI_V8SF:
35741 case VOID_FTYPE_PV4DF_V4DI_V4DF:
35742 case VOID_FTYPE_PV4SF_V4SI_V4SF:
35743 case VOID_FTYPE_PV2DF_V2DI_V2DF:
35744 case VOID_FTYPE_PV8SI_V8SI_V8SI:
35745 case VOID_FTYPE_PV4DI_V4DI_V4DI:
35746 case VOID_FTYPE_PV4SI_V4SI_V4SI:
35747 case VOID_FTYPE_PV2DI_V2DI_V2DI:
35748 case VOID_FTYPE_PV8SI_V8DI_UQI:
35749 case VOID_FTYPE_PV8HI_V8DI_UQI:
35750 case VOID_FTYPE_PV16HI_V16SI_UHI:
35751 case VOID_FTYPE_PV16QI_V8DI_UQI:
35752 case VOID_FTYPE_PV16QI_V16SI_UHI:
35753 case VOID_FTYPE_PV4SI_V4DI_UQI:
35754 case VOID_FTYPE_PV4SI_V2DI_UQI:
35755 case VOID_FTYPE_PV8HI_V4DI_UQI:
35756 case VOID_FTYPE_PV8HI_V2DI_UQI:
35757 case VOID_FTYPE_PV8HI_V8SI_UQI:
35758 case VOID_FTYPE_PV8HI_V4SI_UQI:
35759 case VOID_FTYPE_PV16QI_V4DI_UQI:
35760 case VOID_FTYPE_PV16QI_V2DI_UQI:
35761 case VOID_FTYPE_PV16QI_V8SI_UQI:
35762 case VOID_FTYPE_PV16QI_V4SI_UQI:
35763 case VOID_FTYPE_PCHAR_V64QI_UDI:
35764 case VOID_FTYPE_PCHAR_V32QI_USI:
35765 case VOID_FTYPE_PCHAR_V16QI_UHI:
35766 case VOID_FTYPE_PSHORT_V32HI_USI:
35767 case VOID_FTYPE_PSHORT_V16HI_UHI:
35768 case VOID_FTYPE_PSHORT_V8HI_UQI:
35769 case VOID_FTYPE_PINT_V16SI_UHI:
35770 case VOID_FTYPE_PINT_V8SI_UQI:
35771 case VOID_FTYPE_PINT_V4SI_UQI:
35772 case VOID_FTYPE_PINT64_V8DI_UQI:
35773 case VOID_FTYPE_PINT64_V4DI_UQI:
35774 case VOID_FTYPE_PINT64_V2DI_UQI:
35775 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
35776 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
35777 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
35778 case VOID_FTYPE_PFLOAT_V16SF_UHI:
35779 case VOID_FTYPE_PFLOAT_V8SF_UQI:
35780 case VOID_FTYPE_PFLOAT_V4SF_UQI:
35781 nargs = 2;
35782 klass = store;
35783 /* Reserve memory operand for target. */
35784 memory = ARRAY_SIZE (args);
35785 break;
35786 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
35787 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
35788 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
35789 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
35790 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
35791 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
35792 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
35793 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
35794 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
35795 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
35796 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
35797 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
35798 switch (icode)
35800 /* These builtins and instructions require the memory
35801 to be properly aligned. */
35802 case CODE_FOR_avx512f_loadv16sf_mask:
35803 case CODE_FOR_avx512f_loadv16si_mask:
35804 case CODE_FOR_avx512f_loadv8df_mask:
35805 case CODE_FOR_avx512f_loadv8di_mask:
35806 case CODE_FOR_avx512vl_loadv8sf_mask:
35807 case CODE_FOR_avx512vl_loadv8si_mask:
35808 case CODE_FOR_avx512vl_loadv4df_mask:
35809 case CODE_FOR_avx512vl_loadv4di_mask:
35810 case CODE_FOR_avx512vl_loadv4sf_mask:
35811 case CODE_FOR_avx512vl_loadv4si_mask:
35812 case CODE_FOR_avx512vl_loadv2df_mask:
35813 case CODE_FOR_avx512vl_loadv2di_mask:
35814 case CODE_FOR_avx512bw_loadv64qi_mask:
35815 case CODE_FOR_avx512vl_loadv32qi_mask:
35816 case CODE_FOR_avx512vl_loadv16qi_mask:
35817 case CODE_FOR_avx512bw_loadv32hi_mask:
35818 case CODE_FOR_avx512vl_loadv16hi_mask:
35819 case CODE_FOR_avx512vl_loadv8hi_mask:
35820 aligned_mem = true;
35821 break;
35822 default:
35823 break;
35825 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35826 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35827 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35828 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35829 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35830 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35831 case V16SI_FTYPE_PCINT_V16SI_UHI:
35832 case V8SI_FTYPE_PCINT_V8SI_UQI:
35833 case V4SI_FTYPE_PCINT_V4SI_UQI:
35834 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35835 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35836 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35837 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35838 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35839 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35840 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35841 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35842 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35843 nargs = 3;
35844 klass = load;
35845 memory = 0;
35846 break;
35847 case VOID_FTYPE_UINT_UINT_UINT:
35848 case VOID_FTYPE_UINT64_UINT_UINT:
35849 case UCHAR_FTYPE_UINT_UINT_UINT:
35850 case UCHAR_FTYPE_UINT64_UINT_UINT:
35851 nargs = 3;
35852 klass = load;
35853 memory = ARRAY_SIZE (args);
35854 last_arg_constant = true;
35855 break;
35856 default:
35857 gcc_unreachable ();
35860 gcc_assert (nargs <= ARRAY_SIZE (args));
35862 if (klass == store)
35864 arg = CALL_EXPR_ARG (exp, 0);
35865 op = expand_normal (arg);
35866 gcc_assert (target == 0);
35867 if (memory)
35869 op = ix86_zero_extend_to_Pmode (op);
35870 target = gen_rtx_MEM (tmode, op);
35871 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35872 on it. Try to improve it using get_pointer_alignment,
35873 and if the special builtin is one that requires strict
35874 mode alignment, also from its GET_MODE_ALIGNMENT.
35875 Failure to do so could lead to ix86_legitimate_combined_insn
35876 rejecting all changes to such insns. */
35877 unsigned int align = get_pointer_alignment (arg);
35878 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35879 align = GET_MODE_ALIGNMENT (tmode);
35880 if (MEM_ALIGN (target) < align)
35881 set_mem_align (target, align);
35883 else
35884 target = force_reg (tmode, op);
35885 arg_adjust = 1;
35887 else
35889 arg_adjust = 0;
35890 if (optimize
35891 || target == 0
35892 || !register_operand (target, tmode)
35893 || GET_MODE (target) != tmode)
35894 target = gen_reg_rtx (tmode);
35897 for (i = 0; i < nargs; i++)
35899 machine_mode mode = insn_p->operand[i + 1].mode;
35900 bool match;
35902 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35903 op = expand_normal (arg);
35904 match = insn_p->operand[i + 1].predicate (op, mode);
35906 if (last_arg_constant && (i + 1) == nargs)
35908 if (!match)
35910 if (icode == CODE_FOR_lwp_lwpvalsi3
35911 || icode == CODE_FOR_lwp_lwpinssi3
35912 || icode == CODE_FOR_lwp_lwpvaldi3
35913 || icode == CODE_FOR_lwp_lwpinsdi3)
35914 error ("the last argument must be a 32-bit immediate");
35915 else
35916 error ("the last argument must be an 8-bit immediate");
35917 return const0_rtx;
35920 else
35922 if (i == memory)
35924 /* This must be the memory operand. */
35925 op = ix86_zero_extend_to_Pmode (op);
35926 op = gen_rtx_MEM (mode, op);
35927 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35928 on it. Try to improve it using get_pointer_alignment,
35929 and if the special builtin is one that requires strict
35930 mode alignment, also from its GET_MODE_ALIGNMENT.
35931 Failure to do so could lead to ix86_legitimate_combined_insn
35932 rejecting all changes to such insns. */
35933 unsigned int align = get_pointer_alignment (arg);
35934 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35935 align = GET_MODE_ALIGNMENT (mode);
35936 if (MEM_ALIGN (op) < align)
35937 set_mem_align (op, align);
35939 else
35941 /* This must be a register operand. */
35942 if (VECTOR_MODE_P (mode))
35943 op = safe_vector_operand (op, mode);
35945 op = fixup_modeless_constant (op, mode);
35947 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35948 op = copy_to_mode_reg (mode, op);
35949 else
35951 op = copy_to_reg (op);
35952 op = lowpart_subreg (mode, op, GET_MODE (op));
35957 args[i].op = op;
35958 args[i].mode = mode;
35961 switch (nargs)
35963 case 0:
35964 pat = GEN_FCN (icode) (target);
35965 break;
35966 case 1:
35967 pat = GEN_FCN (icode) (target, args[0].op);
35968 break;
35969 case 2:
35970 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35971 break;
35972 case 3:
35973 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35974 break;
35975 default:
35976 gcc_unreachable ();
35979 if (! pat)
35980 return 0;
35981 emit_insn (pat);
35982 return klass == store ? 0 : target;
35985 /* Return the integer constant in ARG. Constrain it to be in the range
35986 of the subparts of VEC_TYPE; issue an error if not. */
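/* For example, for a V4SF vector type the valid selectors are 0..3; an
   out-of-range selector is diagnosed and 0 is returned instead.  */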
35988 static int
35989 get_element_number (tree vec_type, tree arg)
35991 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35993 if (!tree_fits_uhwi_p (arg)
35994 || (elt = tree_to_uhwi (arg), elt > max))
35996 error ("selector must be an integer constant in the range 0..%wi", max);
35997 return 0;
36000 return elt;
36003 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36004 ix86_expand_vector_init. We DO have language-level syntax for this, in
36005 the form of (type){ init-list }. Except that since we can't place emms
36006 instructions from inside the compiler, we can't allow the use of MMX
36007 registers unless the user explicitly asks for it. So we do *not* define
36008 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36009 we have builtins invoked by mmintrin.h that give us license to emit
36010 these sorts of instructions. */
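/* A hedged illustration: a call such as __builtin_ia32_vec_init_v2si (a, b)
   (the exact builtin name is assumed here) is expanded by collecting the
   lowparts of its arguments into a PARALLEL and handing that to
   ix86_expand_vector_init.  */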
36012 static rtx
36013 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36015 machine_mode tmode = TYPE_MODE (type);
36016 machine_mode inner_mode = GET_MODE_INNER (tmode);
36017 int i, n_elt = GET_MODE_NUNITS (tmode);
36018 rtvec v = rtvec_alloc (n_elt);
36020 gcc_assert (VECTOR_MODE_P (tmode));
36021 gcc_assert (call_expr_nargs (exp) == n_elt);
36023 for (i = 0; i < n_elt; ++i)
36025 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36026 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36029 if (!target || !register_operand (target, tmode))
36030 target = gen_reg_rtx (tmode);
36032 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36033 return target;
36036 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36037 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36038 had a language-level syntax for referencing vector elements. */
36040 static rtx
36041 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36043 machine_mode tmode, mode0;
36044 tree arg0, arg1;
36045 int elt;
36046 rtx op0;
36048 arg0 = CALL_EXPR_ARG (exp, 0);
36049 arg1 = CALL_EXPR_ARG (exp, 1);
36051 op0 = expand_normal (arg0);
36052 elt = get_element_number (TREE_TYPE (arg0), arg1);
36054 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36055 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36056 gcc_assert (VECTOR_MODE_P (mode0));
36058 op0 = force_reg (mode0, op0);
36060 if (optimize || !target || !register_operand (target, tmode))
36061 target = gen_reg_rtx (tmode);
36063 ix86_expand_vector_extract (true, target, op0, elt);
36065 return target;
36068 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36069 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36070 a language-level syntax for referencing vector elements. */
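/* In short: the source vector is copied into a fresh register, the selected
   element is stored into the copy with ix86_expand_vector_set, and the copy
   is returned, so the original operand is never modified in place.  */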
36072 static rtx
36073 ix86_expand_vec_set_builtin (tree exp)
36075 machine_mode tmode, mode1;
36076 tree arg0, arg1, arg2;
36077 int elt;
36078 rtx op0, op1, target;
36080 arg0 = CALL_EXPR_ARG (exp, 0);
36081 arg1 = CALL_EXPR_ARG (exp, 1);
36082 arg2 = CALL_EXPR_ARG (exp, 2);
36084 tmode = TYPE_MODE (TREE_TYPE (arg0));
36085 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36086 gcc_assert (VECTOR_MODE_P (tmode));
36088 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36089 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36090 elt = get_element_number (TREE_TYPE (arg0), arg2);
36092 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36093 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36095 op0 = force_reg (tmode, op0);
36096 op1 = force_reg (mode1, op1);
36098 /* OP0 is the source of these builtin functions and shouldn't be
36099 modified. Create a copy, use it, and return it as the target. */
36100 target = gen_reg_rtx (tmode);
36101 emit_move_insn (target, op0);
36102 ix86_expand_vector_set (true, target, op1, elt);
36104 return target;
36107 /* Emit conditional move of SRC to DST with condition
36108 OP1 CODE OP2. */
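/* Two strategies are used below: with TARGET_CMOVE a single conditional move
   is emitted; otherwise the condition is reversed and a branch skips a plain
   move, giving the same effect without cmov.  */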
36109 static void
36110 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36112 rtx t;
36114 if (TARGET_CMOVE)
36116 t = ix86_expand_compare (code, op1, op2);
36117 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36118 src, dst)));
36120 else
36122 rtx_code_label *nomove = gen_label_rtx ();
36123 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36124 const0_rtx, GET_MODE (op1), 1, nomove);
36125 emit_move_insn (dst, src);
36126 emit_label (nomove);
36130 /* Choose the maximum of DST and SRC and put it in DST. */
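/* A hedged reading: this emits DST = (DST < SRC, unsigned) ? SRC : DST,
   i.e. the unsigned maximum, which is how the MPX bound halves (with UB
   stored in complemented form) are combined below.  */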
36131 static void
36132 ix86_emit_move_max (rtx dst, rtx src)
36134 ix86_emit_cmove (dst, src, LTU, dst, src);
36137 /* Expand an expression EXP that calls a built-in function,
36138 with result going to TARGET if that's convenient
36139 (and in mode MODE if that's convenient).
36140 SUBTARGET may be used as the target for computing one of EXP's operands.
36141 IGNORE is nonzero if the value is to be ignored. */
36143 static rtx
36144 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36145 machine_mode mode, int ignore)
36147 size_t i;
36148 enum insn_code icode;
36149 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36150 tree arg0, arg1, arg2, arg3, arg4;
36151 rtx op0, op1, op2, op3, op4, pat, insn;
36152 machine_mode mode0, mode1, mode2, mode3, mode4;
36153 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36155 /* For CPU builtins that can be folded, fold first and expand the fold. */
36156 switch (fcode)
36158 case IX86_BUILTIN_CPU_INIT:
36160 /* Make it call __cpu_indicator_init in libgcc. */
36161 tree call_expr, fndecl, type;
36162 type = build_function_type_list (integer_type_node, NULL_TREE);
36163 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36164 call_expr = build_call_expr (fndecl, 0);
36165 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36167 case IX86_BUILTIN_CPU_IS:
36168 case IX86_BUILTIN_CPU_SUPPORTS:
36170 tree arg0 = CALL_EXPR_ARG (exp, 0);
36171 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36172 gcc_assert (fold_expr != NULL_TREE);
36173 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36177 /* Determine whether the builtin function is available under the current ISA.
36178 Originally the builtin was not created if it wasn't applicable to the
36179 current ISA based on the command-line switches. With function-specific
36180 options, we need to check in the context of the function making the call
36181 whether it is supported. */
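/* For example (a hypothetical scenario): using an AVX-512 builtin in a
   function compiled without the corresponding -mavx512* option reaches the
   error below, which names the missing ISA option when ix86_target_string
   can reconstruct it.  */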
36182 if (ix86_builtins_isa[fcode].isa
36183 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
36185 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, 0,
36186 NULL, NULL, (enum fpmath_unit) 0,
36187 false);
36188 if (!opts)
36189 error ("%qE needs unknown isa option", fndecl);
36190 else
36192 gcc_assert (opts != NULL);
36193 error ("%qE needs isa option %s", fndecl, opts);
36194 free (opts);
36196 return expand_call (exp, target, ignore);
36199 switch (fcode)
36201 case IX86_BUILTIN_BNDMK:
36202 if (!target
36203 || GET_MODE (target) != BNDmode
36204 || !register_operand (target, BNDmode))
36205 target = gen_reg_rtx (BNDmode);
36207 arg0 = CALL_EXPR_ARG (exp, 0);
36208 arg1 = CALL_EXPR_ARG (exp, 1);
36210 op0 = expand_normal (arg0);
36211 op1 = expand_normal (arg1);
36213 if (!register_operand (op0, Pmode))
36214 op0 = ix86_zero_extend_to_Pmode (op0);
36215 if (!register_operand (op1, Pmode))
36216 op1 = ix86_zero_extend_to_Pmode (op1);
36218 /* Builtin arg1 is the size of the block, but instruction operand op1
36219 should be (size - 1). */
36220 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36221 NULL_RTX, 1, OPTAB_DIRECT);
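/* A sketch: for a 16-byte object op1 becomes 15, so the created bounds
   cover lb .. lb + 15.  */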
36223 emit_insn (BNDmode == BND64mode
36224 ? gen_bnd64_mk (target, op0, op1)
36225 : gen_bnd32_mk (target, op0, op1));
36226 return target;
36228 case IX86_BUILTIN_BNDSTX:
36229 arg0 = CALL_EXPR_ARG (exp, 0);
36230 arg1 = CALL_EXPR_ARG (exp, 1);
36231 arg2 = CALL_EXPR_ARG (exp, 2);
36233 op0 = expand_normal (arg0);
36234 op1 = expand_normal (arg1);
36235 op2 = expand_normal (arg2);
36237 if (!register_operand (op0, Pmode))
36238 op0 = ix86_zero_extend_to_Pmode (op0);
36239 if (!register_operand (op1, BNDmode))
36240 op1 = copy_to_mode_reg (BNDmode, op1);
36241 if (!register_operand (op2, Pmode))
36242 op2 = ix86_zero_extend_to_Pmode (op2);
36244 emit_insn (BNDmode == BND64mode
36245 ? gen_bnd64_stx (op2, op0, op1)
36246 : gen_bnd32_stx (op2, op0, op1));
36247 return 0;
36249 case IX86_BUILTIN_BNDLDX:
36250 if (!target
36251 || GET_MODE (target) != BNDmode
36252 || !register_operand (target, BNDmode))
36253 target = gen_reg_rtx (BNDmode);
36255 arg0 = CALL_EXPR_ARG (exp, 0);
36256 arg1 = CALL_EXPR_ARG (exp, 1);
36258 op0 = expand_normal (arg0);
36259 op1 = expand_normal (arg1);
36261 if (!register_operand (op0, Pmode))
36262 op0 = ix86_zero_extend_to_Pmode (op0);
36263 if (!register_operand (op1, Pmode))
36264 op1 = ix86_zero_extend_to_Pmode (op1);
36266 emit_insn (BNDmode == BND64mode
36267 ? gen_bnd64_ldx (target, op0, op1)
36268 : gen_bnd32_ldx (target, op0, op1));
36269 return target;
36271 case IX86_BUILTIN_BNDCL:
36272 arg0 = CALL_EXPR_ARG (exp, 0);
36273 arg1 = CALL_EXPR_ARG (exp, 1);
36275 op0 = expand_normal (arg0);
36276 op1 = expand_normal (arg1);
36278 if (!register_operand (op0, Pmode))
36279 op0 = ix86_zero_extend_to_Pmode (op0);
36280 if (!register_operand (op1, BNDmode))
36281 op1 = copy_to_mode_reg (BNDmode, op1);
36283 emit_insn (BNDmode == BND64mode
36284 ? gen_bnd64_cl (op1, op0)
36285 : gen_bnd32_cl (op1, op0));
36286 return 0;
36288 case IX86_BUILTIN_BNDCU:
36289 arg0 = CALL_EXPR_ARG (exp, 0);
36290 arg1 = CALL_EXPR_ARG (exp, 1);
36292 op0 = expand_normal (arg0);
36293 op1 = expand_normal (arg1);
36295 if (!register_operand (op0, Pmode))
36296 op0 = ix86_zero_extend_to_Pmode (op0);
36297 if (!register_operand (op1, BNDmode))
36298 op1 = copy_to_mode_reg (BNDmode, op1);
36300 emit_insn (BNDmode == BND64mode
36301 ? gen_bnd64_cu (op1, op0)
36302 : gen_bnd32_cu (op1, op0));
36303 return 0;
36305 case IX86_BUILTIN_BNDRET:
36306 arg0 = CALL_EXPR_ARG (exp, 0);
36307 gcc_assert (TREE_CODE (arg0) == SSA_NAME);
36308 target = chkp_get_rtl_bounds (arg0);
36310 /* If no bounds were specified for the returned value,
36311 then use INIT bounds. This usually happens when
36312 some built-in function is expanded. */
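/* Presumably the INIT bounds created here are the maximal bounds [0, ~0]:
   a zero lower bound together with a -1 "size minus one" operand, i.e.
   bounds that never trigger a bound violation.  */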
36313 if (!target)
36315 rtx t1 = gen_reg_rtx (Pmode);
36316 rtx t2 = gen_reg_rtx (Pmode);
36317 target = gen_reg_rtx (BNDmode);
36318 emit_move_insn (t1, const0_rtx);
36319 emit_move_insn (t2, constm1_rtx);
36320 emit_insn (BNDmode == BND64mode
36321 ? gen_bnd64_mk (target, t1, t2)
36322 : gen_bnd32_mk (target, t1, t2));
36325 gcc_assert (target && REG_P (target));
36326 return target;
36328 case IX86_BUILTIN_BNDNARROW:
36330 rtx m1, m1h1, m1h2, lb, ub, t1;
36332 /* Return value and lb. */
36333 arg0 = CALL_EXPR_ARG (exp, 0);
36334 /* Bounds. */
36335 arg1 = CALL_EXPR_ARG (exp, 1);
36336 /* Size. */
36337 arg2 = CALL_EXPR_ARG (exp, 2);
36339 lb = expand_normal (arg0);
36340 op1 = expand_normal (arg1);
36341 op2 = expand_normal (arg2);
36343 /* The size was passed, but we need to use (size - 1), as for bndmk. */
36344 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36345 NULL_RTX, 1, OPTAB_DIRECT);
36347 /* Add LB to the size and invert to get UB. */
36348 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36349 op2, 1, OPTAB_DIRECT);
36350 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36352 if (!register_operand (lb, Pmode))
36353 lb = ix86_zero_extend_to_Pmode (lb);
36354 if (!register_operand (ub, Pmode))
36355 ub = ix86_zero_extend_to_Pmode (ub);
36357 /* We need to move bounds to memory before any computations. */
36358 if (MEM_P (op1))
36359 m1 = op1;
36360 else
36362 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36363 emit_move_insn (m1, op1);
36366 /* Generate the mem expressions used to access LB and UB. */
36367 m1h1 = adjust_address (m1, Pmode, 0);
36368 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36370 t1 = gen_reg_rtx (Pmode);
36372 /* Compute LB. */
36373 emit_move_insn (t1, m1h1);
36374 ix86_emit_move_max (t1, lb);
36375 emit_move_insn (m1h1, t1);
36377 /* Compute UB.  UB is stored in 1's complement form, so the smaller
36378 actual UB corresponds to the larger stored value; hence max is used here too. */
36379 emit_move_insn (t1, m1h2);
36380 ix86_emit_move_max (t1, ub);
36381 emit_move_insn (m1h2, t1);
36383 op2 = gen_reg_rtx (BNDmode);
36384 emit_move_insn (op2, m1);
36386 return chkp_join_splitted_slot (lb, op2);
36389 case IX86_BUILTIN_BNDINT:
36391 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36393 if (!target
36394 || GET_MODE (target) != BNDmode
36395 || !register_operand (target, BNDmode))
36396 target = gen_reg_rtx (BNDmode);
36398 arg0 = CALL_EXPR_ARG (exp, 0);
36399 arg1 = CALL_EXPR_ARG (exp, 1);
36401 op0 = expand_normal (arg0);
36402 op1 = expand_normal (arg1);
36404 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36405 rh1 = adjust_address (res, Pmode, 0);
36406 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36408 /* Put the first bounds into temporaries. */
36409 lb1 = gen_reg_rtx (Pmode);
36410 ub1 = gen_reg_rtx (Pmode);
36411 if (MEM_P (op0))
36413 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36414 emit_move_insn (ub1, adjust_address (op0, Pmode,
36415 GET_MODE_SIZE (Pmode)));
36417 else
36419 emit_move_insn (res, op0);
36420 emit_move_insn (lb1, rh1);
36421 emit_move_insn (ub1, rh2);
36424 /* Put the second bounds into temporaries. */
36425 lb2 = gen_reg_rtx (Pmode);
36426 ub2 = gen_reg_rtx (Pmode);
36427 if (MEM_P (op1))
36429 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36430 emit_move_insn (ub2, adjust_address (op1, Pmode,
36431 GET_MODE_SIZE (Pmode)));
36433 else
36435 emit_move_insn (res, op1);
36436 emit_move_insn (lb2, rh1);
36437 emit_move_insn (ub2, rh2);
36440 /* Compute LB. */
36441 ix86_emit_move_max (lb1, lb2);
36442 emit_move_insn (rh1, lb1);
36444 /* Compute UB.  UB is stored in 1's complement form, so the smaller
36445 actual UB corresponds to the larger stored value; hence max is used here too. */
36446 ix86_emit_move_max (ub1, ub2);
36447 emit_move_insn (rh2, ub1);
36449 emit_move_insn (target, res);
36451 return target;
36454 case IX86_BUILTIN_SIZEOF:
36456 tree name;
36457 rtx symbol;
36459 if (!target
36460 || GET_MODE (target) != Pmode
36461 || !register_operand (target, Pmode))
36462 target = gen_reg_rtx (Pmode);
36464 arg0 = CALL_EXPR_ARG (exp, 0);
36465 gcc_assert (TREE_CODE (arg0) == VAR_DECL);
36467 name = DECL_ASSEMBLER_NAME (arg0);
36468 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36470 emit_insn (Pmode == SImode
36471 ? gen_move_size_reloc_si (target, symbol)
36472 : gen_move_size_reloc_di (target, symbol));
36474 return target;
36477 case IX86_BUILTIN_BNDLOWER:
36479 rtx mem, hmem;
36481 if (!target
36482 || GET_MODE (target) != Pmode
36483 || !register_operand (target, Pmode))
36484 target = gen_reg_rtx (Pmode);
36486 arg0 = CALL_EXPR_ARG (exp, 0);
36487 op0 = expand_normal (arg0);
36489 /* We need to move bounds to memory first. */
36490 if (MEM_P (op0))
36491 mem = op0;
36492 else
36494 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36495 emit_move_insn (mem, op0);
36498 /* Generate mem expression to access LB and load it. */
36499 hmem = adjust_address (mem, Pmode, 0);
36500 emit_move_insn (target, hmem);
36502 return target;
36505 case IX86_BUILTIN_BNDUPPER:
36507 rtx mem, hmem, res;
36509 if (!target
36510 || GET_MODE (target) != Pmode
36511 || !register_operand (target, Pmode))
36512 target = gen_reg_rtx (Pmode);
36514 arg0 = CALL_EXPR_ARG (exp, 0);
36515 op0 = expand_normal (arg0);
36517 /* We need to move bounds to memory first. */
36518 if (MEM_P (op0))
36519 mem = op0;
36520 else
36522 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36523 emit_move_insn (mem, op0);
36526 /* Generate mem expression to access UB. */
36527 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36529 /* We need to invert all bits of UB. */
36530 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36532 if (res != target)
36533 emit_move_insn (target, res);
36535 return target;
36538 case IX86_BUILTIN_MASKMOVQ:
36539 case IX86_BUILTIN_MASKMOVDQU:
36540 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36541 ? CODE_FOR_mmx_maskmovq
36542 : CODE_FOR_sse2_maskmovdqu);
36543 /* Note the arg order is different from the operand order. */
36544 arg1 = CALL_EXPR_ARG (exp, 0);
36545 arg2 = CALL_EXPR_ARG (exp, 1);
36546 arg0 = CALL_EXPR_ARG (exp, 2);
36547 op0 = expand_normal (arg0);
36548 op1 = expand_normal (arg1);
36549 op2 = expand_normal (arg2);
36550 mode0 = insn_data[icode].operand[0].mode;
36551 mode1 = insn_data[icode].operand[1].mode;
36552 mode2 = insn_data[icode].operand[2].mode;
36554 op0 = ix86_zero_extend_to_Pmode (op0);
36555 op0 = gen_rtx_MEM (mode1, op0);
36557 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36558 op0 = copy_to_mode_reg (mode0, op0);
36559 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36560 op1 = copy_to_mode_reg (mode1, op1);
36561 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36562 op2 = copy_to_mode_reg (mode2, op2);
36563 pat = GEN_FCN (icode) (op0, op1, op2);
36564 if (! pat)
36565 return 0;
36566 emit_insn (pat);
36567 return 0;
36569 case IX86_BUILTIN_LDMXCSR:
36570 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36571 target = assign_386_stack_local (SImode, SLOT_TEMP);
36572 emit_move_insn (target, op0);
36573 emit_insn (gen_sse_ldmxcsr (target));
36574 return 0;
36576 case IX86_BUILTIN_STMXCSR:
36577 target = assign_386_stack_local (SImode, SLOT_TEMP);
36578 emit_insn (gen_sse_stmxcsr (target));
36579 return copy_to_mode_reg (SImode, target);
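/* LDMXCSR and STMXCSR only take memory operands, so the value is bounced
   through a stack temporary (SLOT_TEMP) in both directions.  */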
36581 case IX86_BUILTIN_CLFLUSH:
36582 arg0 = CALL_EXPR_ARG (exp, 0);
36583 op0 = expand_normal (arg0);
36584 icode = CODE_FOR_sse2_clflush;
36585 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36586 op0 = ix86_zero_extend_to_Pmode (op0);
36588 emit_insn (gen_sse2_clflush (op0));
36589 return 0;
36591 case IX86_BUILTIN_CLWB:
36592 arg0 = CALL_EXPR_ARG (exp, 0);
36593 op0 = expand_normal (arg0);
36594 icode = CODE_FOR_clwb;
36595 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36596 op0 = ix86_zero_extend_to_Pmode (op0);
36598 emit_insn (gen_clwb (op0));
36599 return 0;
36601 case IX86_BUILTIN_CLFLUSHOPT:
36602 arg0 = CALL_EXPR_ARG (exp, 0);
36603 op0 = expand_normal (arg0);
36604 icode = CODE_FOR_clflushopt;
36605 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36606 op0 = ix86_zero_extend_to_Pmode (op0);
36608 emit_insn (gen_clflushopt (op0));
36609 return 0;
36611 case IX86_BUILTIN_MONITOR:
36612 case IX86_BUILTIN_MONITORX:
36613 arg0 = CALL_EXPR_ARG (exp, 0);
36614 arg1 = CALL_EXPR_ARG (exp, 1);
36615 arg2 = CALL_EXPR_ARG (exp, 2);
36616 op0 = expand_normal (arg0);
36617 op1 = expand_normal (arg1);
36618 op2 = expand_normal (arg2);
36619 if (!REG_P (op0))
36620 op0 = ix86_zero_extend_to_Pmode (op0);
36621 if (!REG_P (op1))
36622 op1 = copy_to_mode_reg (SImode, op1);
36623 if (!REG_P (op2))
36624 op2 = copy_to_mode_reg (SImode, op2);
36626 emit_insn (fcode == IX86_BUILTIN_MONITOR
36627 ? ix86_gen_monitor (op0, op1, op2)
36628 : ix86_gen_monitorx (op0, op1, op2));
36629 return 0;
36631 case IX86_BUILTIN_MWAIT:
36632 arg0 = CALL_EXPR_ARG (exp, 0);
36633 arg1 = CALL_EXPR_ARG (exp, 1);
36634 op0 = expand_normal (arg0);
36635 op1 = expand_normal (arg1);
36636 if (!REG_P (op0))
36637 op0 = copy_to_mode_reg (SImode, op0);
36638 if (!REG_P (op1))
36639 op1 = copy_to_mode_reg (SImode, op1);
36640 emit_insn (gen_sse3_mwait (op0, op1));
36641 return 0;
36643 case IX86_BUILTIN_MWAITX:
36644 arg0 = CALL_EXPR_ARG (exp, 0);
36645 arg1 = CALL_EXPR_ARG (exp, 1);
36646 arg2 = CALL_EXPR_ARG (exp, 2);
36647 op0 = expand_normal (arg0);
36648 op1 = expand_normal (arg1);
36649 op2 = expand_normal (arg2);
36650 if (!REG_P (op0))
36651 op0 = copy_to_mode_reg (SImode, op0);
36652 if (!REG_P (op1))
36653 op1 = copy_to_mode_reg (SImode, op1);
36654 if (!REG_P (op2))
36655 op2 = copy_to_mode_reg (SImode, op2);
36656 emit_insn (gen_mwaitx (op0, op1, op2));
36657 return 0;
36659 case IX86_BUILTIN_CLZERO:
36660 arg0 = CALL_EXPR_ARG (exp, 0);
36661 op0 = expand_normal (arg0);
36662 if (!REG_P (op0))
36663 op0 = ix86_zero_extend_to_Pmode (op0);
36664 emit_insn (ix86_gen_clzero (op0));
36665 return 0;
36667 case IX86_BUILTIN_VEC_INIT_V2SI:
36668 case IX86_BUILTIN_VEC_INIT_V4HI:
36669 case IX86_BUILTIN_VEC_INIT_V8QI:
36670 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
36672 case IX86_BUILTIN_VEC_EXT_V2DF:
36673 case IX86_BUILTIN_VEC_EXT_V2DI:
36674 case IX86_BUILTIN_VEC_EXT_V4SF:
36675 case IX86_BUILTIN_VEC_EXT_V4SI:
36676 case IX86_BUILTIN_VEC_EXT_V8HI:
36677 case IX86_BUILTIN_VEC_EXT_V2SI:
36678 case IX86_BUILTIN_VEC_EXT_V4HI:
36679 case IX86_BUILTIN_VEC_EXT_V16QI:
36680 return ix86_expand_vec_ext_builtin (exp, target);
36682 case IX86_BUILTIN_VEC_SET_V2DI:
36683 case IX86_BUILTIN_VEC_SET_V4SF:
36684 case IX86_BUILTIN_VEC_SET_V4SI:
36685 case IX86_BUILTIN_VEC_SET_V8HI:
36686 case IX86_BUILTIN_VEC_SET_V4HI:
36687 case IX86_BUILTIN_VEC_SET_V16QI:
36688 return ix86_expand_vec_set_builtin (exp);
36690 case IX86_BUILTIN_INFQ:
36691 case IX86_BUILTIN_HUGE_VALQ:
36693 REAL_VALUE_TYPE inf;
36694 rtx tmp;
36696 real_inf (&inf);
36697 tmp = const_double_from_real_value (inf, mode);
36699 tmp = validize_mem (force_const_mem (mode, tmp));
36701 if (target == 0)
36702 target = gen_reg_rtx (mode);
36704 emit_move_insn (target, tmp);
36705 return target;
36708 case IX86_BUILTIN_NANQ:
36709 case IX86_BUILTIN_NANSQ:
36710 return expand_call (exp, target, ignore);
36712 case IX86_BUILTIN_RDPMC:
36713 case IX86_BUILTIN_RDTSC:
36714 case IX86_BUILTIN_RDTSCP:
36716 op0 = gen_reg_rtx (DImode);
36717 op1 = gen_reg_rtx (DImode);
36719 if (fcode == IX86_BUILTIN_RDPMC)
36721 arg0 = CALL_EXPR_ARG (exp, 0);
36722 op2 = expand_normal (arg0);
36723 if (!register_operand (op2, SImode))
36724 op2 = copy_to_mode_reg (SImode, op2);
36726 insn = (TARGET_64BIT
36727 ? gen_rdpmc_rex64 (op0, op1, op2)
36728 : gen_rdpmc (op0, op2));
36729 emit_insn (insn);
36731 else if (fcode == IX86_BUILTIN_RDTSC)
36733 insn = (TARGET_64BIT
36734 ? gen_rdtsc_rex64 (op0, op1)
36735 : gen_rdtsc (op0));
36736 emit_insn (insn);
36738 else
36740 op2 = gen_reg_rtx (SImode);
36742 insn = (TARGET_64BIT
36743 ? gen_rdtscp_rex64 (op0, op1, op2)
36744 : gen_rdtscp (op0, op2));
36745 emit_insn (insn);
36747 arg0 = CALL_EXPR_ARG (exp, 0);
36748 op4 = expand_normal (arg0);
36749 if (!address_operand (op4, VOIDmode))
36751 op4 = convert_memory_address (Pmode, op4);
36752 op4 = copy_addr_to_reg (op4);
36754 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
36757 if (target == 0)
36759 /* mode is VOIDmode if __builtin_rd* has been called
36760 without an lhs. */
36761 if (mode == VOIDmode)
36762 return target;
36763 target = gen_reg_rtx (mode);
36766 if (TARGET_64BIT)
36768 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
36769 op1, 1, OPTAB_DIRECT);
36770 op0 = expand_simple_binop (DImode, IOR, op0, op1,
36771 op0, 1, OPTAB_DIRECT);
36774 emit_move_insn (target, op0);
36775 return target;
36777 case IX86_BUILTIN_FXSAVE:
36778 case IX86_BUILTIN_FXRSTOR:
36779 case IX86_BUILTIN_FXSAVE64:
36780 case IX86_BUILTIN_FXRSTOR64:
36781 case IX86_BUILTIN_FNSTENV:
36782 case IX86_BUILTIN_FLDENV:
36783 mode0 = BLKmode;
36784 switch (fcode)
36786 case IX86_BUILTIN_FXSAVE:
36787 icode = CODE_FOR_fxsave;
36788 break;
36789 case IX86_BUILTIN_FXRSTOR:
36790 icode = CODE_FOR_fxrstor;
36791 break;
36792 case IX86_BUILTIN_FXSAVE64:
36793 icode = CODE_FOR_fxsave64;
36794 break;
36795 case IX86_BUILTIN_FXRSTOR64:
36796 icode = CODE_FOR_fxrstor64;
36797 break;
36798 case IX86_BUILTIN_FNSTENV:
36799 icode = CODE_FOR_fnstenv;
36800 break;
36801 case IX86_BUILTIN_FLDENV:
36802 icode = CODE_FOR_fldenv;
36803 break;
36804 default:
36805 gcc_unreachable ();
36808 arg0 = CALL_EXPR_ARG (exp, 0);
36809 op0 = expand_normal (arg0);
36811 if (!address_operand (op0, VOIDmode))
36813 op0 = convert_memory_address (Pmode, op0);
36814 op0 = copy_addr_to_reg (op0);
36816 op0 = gen_rtx_MEM (mode0, op0);
36818 pat = GEN_FCN (icode) (op0);
36819 if (pat)
36820 emit_insn (pat);
36821 return 0;
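/* In short: each of these state save/restore builtins takes a single
   pointer; it is legitimized into an address register and wrapped in a
   BLKmode MEM before the chosen pattern is emitted.  */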
36823 case IX86_BUILTIN_XSAVE:
36824 case IX86_BUILTIN_XRSTOR:
36825 case IX86_BUILTIN_XSAVE64:
36826 case IX86_BUILTIN_XRSTOR64:
36827 case IX86_BUILTIN_XSAVEOPT:
36828 case IX86_BUILTIN_XSAVEOPT64:
36829 case IX86_BUILTIN_XSAVES:
36830 case IX86_BUILTIN_XRSTORS:
36831 case IX86_BUILTIN_XSAVES64:
36832 case IX86_BUILTIN_XRSTORS64:
36833 case IX86_BUILTIN_XSAVEC:
36834 case IX86_BUILTIN_XSAVEC64:
36835 arg0 = CALL_EXPR_ARG (exp, 0);
36836 arg1 = CALL_EXPR_ARG (exp, 1);
36837 op0 = expand_normal (arg0);
36838 op1 = expand_normal (arg1);
36840 if (!address_operand (op0, VOIDmode))
36842 op0 = convert_memory_address (Pmode, op0);
36843 op0 = copy_addr_to_reg (op0);
36845 op0 = gen_rtx_MEM (BLKmode, op0);
36847 op1 = force_reg (DImode, op1);
36849 if (TARGET_64BIT)
36851 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36852 NULL, 1, OPTAB_DIRECT);
36853 switch (fcode)
36855 case IX86_BUILTIN_XSAVE:
36856 icode = CODE_FOR_xsave_rex64;
36857 break;
36858 case IX86_BUILTIN_XRSTOR:
36859 icode = CODE_FOR_xrstor_rex64;
36860 break;
36861 case IX86_BUILTIN_XSAVE64:
36862 icode = CODE_FOR_xsave64;
36863 break;
36864 case IX86_BUILTIN_XRSTOR64:
36865 icode = CODE_FOR_xrstor64;
36866 break;
36867 case IX86_BUILTIN_XSAVEOPT:
36868 icode = CODE_FOR_xsaveopt_rex64;
36869 break;
36870 case IX86_BUILTIN_XSAVEOPT64:
36871 icode = CODE_FOR_xsaveopt64;
36872 break;
36873 case IX86_BUILTIN_XSAVES:
36874 icode = CODE_FOR_xsaves_rex64;
36875 break;
36876 case IX86_BUILTIN_XRSTORS:
36877 icode = CODE_FOR_xrstors_rex64;
36878 break;
36879 case IX86_BUILTIN_XSAVES64:
36880 icode = CODE_FOR_xsaves64;
36881 break;
36882 case IX86_BUILTIN_XRSTORS64:
36883 icode = CODE_FOR_xrstors64;
36884 break;
36885 case IX86_BUILTIN_XSAVEC:
36886 icode = CODE_FOR_xsavec_rex64;
36887 break;
36888 case IX86_BUILTIN_XSAVEC64:
36889 icode = CODE_FOR_xsavec64;
36890 break;
36891 default:
36892 gcc_unreachable ();
36895 op2 = gen_lowpart (SImode, op2);
36896 op1 = gen_lowpart (SImode, op1);
36897 pat = GEN_FCN (icode) (op0, op1, op2);
36899 else
36901 switch (fcode)
36903 case IX86_BUILTIN_XSAVE:
36904 icode = CODE_FOR_xsave;
36905 break;
36906 case IX86_BUILTIN_XRSTOR:
36907 icode = CODE_FOR_xrstor;
36908 break;
36909 case IX86_BUILTIN_XSAVEOPT:
36910 icode = CODE_FOR_xsaveopt;
36911 break;
36912 case IX86_BUILTIN_XSAVES:
36913 icode = CODE_FOR_xsaves;
36914 break;
36915 case IX86_BUILTIN_XRSTORS:
36916 icode = CODE_FOR_xrstors;
36917 break;
36918 case IX86_BUILTIN_XSAVEC:
36919 icode = CODE_FOR_xsavec;
36920 break;
36921 default:
36922 gcc_unreachable ();
36924 pat = GEN_FCN (icode) (op0, op1);
36927 if (pat)
36928 emit_insn (pat);
36929 return 0;
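/* A hedged summary: on 64-bit targets the DImode feature mask is split into
   low/high SImode halves for the *_rex64 patterns; on 32-bit targets the
   mask is passed to the pattern as a single DImode operand.  */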
36931 case IX86_BUILTIN_LLWPCB:
36932 arg0 = CALL_EXPR_ARG (exp, 0);
36933 op0 = expand_normal (arg0);
36934 icode = CODE_FOR_lwp_llwpcb;
36935 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36936 op0 = ix86_zero_extend_to_Pmode (op0);
36937 emit_insn (gen_lwp_llwpcb (op0));
36938 return 0;
36940 case IX86_BUILTIN_SLWPCB:
36941 icode = CODE_FOR_lwp_slwpcb;
36942 if (!target
36943 || !insn_data[icode].operand[0].predicate (target, Pmode))
36944 target = gen_reg_rtx (Pmode);
36945 emit_insn (gen_lwp_slwpcb (target));
36946 return target;
36948 case IX86_BUILTIN_BEXTRI32:
36949 case IX86_BUILTIN_BEXTRI64:
36950 arg0 = CALL_EXPR_ARG (exp, 0);
36951 arg1 = CALL_EXPR_ARG (exp, 1);
36952 op0 = expand_normal (arg0);
36953 op1 = expand_normal (arg1);
36954 icode = (fcode == IX86_BUILTIN_BEXTRI32
36955 ? CODE_FOR_tbm_bextri_si
36956 : CODE_FOR_tbm_bextri_di);
36957 if (!CONST_INT_P (op1))
36959 error ("last argument must be an immediate");
36960 return const0_rtx;
36962 else
36964 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36965 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36966 op1 = GEN_INT (length);
36967 op2 = GEN_INT (lsb_index);
36968 pat = GEN_FCN (icode) (target, op0, op1, op2);
36969 if (pat)
36970 emit_insn (pat);
36971 return target;
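/* A worked example of the decoding above: an immediate of 0x0404 yields
   length == 4 and lsb_index == 4, i.e. extract four bits starting at bit 4.  */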
36974 case IX86_BUILTIN_RDRAND16_STEP:
36975 icode = CODE_FOR_rdrandhi_1;
36976 mode0 = HImode;
36977 goto rdrand_step;
36979 case IX86_BUILTIN_RDRAND32_STEP:
36980 icode = CODE_FOR_rdrandsi_1;
36981 mode0 = SImode;
36982 goto rdrand_step;
36984 case IX86_BUILTIN_RDRAND64_STEP:
36985 icode = CODE_FOR_rdranddi_1;
36986 mode0 = DImode;
36988 rdrand_step:
36989 op0 = gen_reg_rtx (mode0);
36990 emit_insn (GEN_FCN (icode) (op0));
36992 arg0 = CALL_EXPR_ARG (exp, 0);
36993 op1 = expand_normal (arg0);
36994 if (!address_operand (op1, VOIDmode))
36996 op1 = convert_memory_address (Pmode, op1);
36997 op1 = copy_addr_to_reg (op1);
36999 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37001 op1 = gen_reg_rtx (SImode);
37002 emit_move_insn (op1, CONST1_RTX (SImode));
37004 /* Emit SImode conditional move. */
37005 if (mode0 == HImode)
37007 op2 = gen_reg_rtx (SImode);
37008 emit_insn (gen_zero_extendhisi2 (op2, op0));
37010 else if (mode0 == SImode)
37011 op2 = op0;
37012 else
37013 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37015 if (target == 0
37016 || !register_operand (target, SImode))
37017 target = gen_reg_rtx (SImode);
37019 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37020 const0_rtx);
37021 emit_insn (gen_rtx_SET (target,
37022 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37023 return target;
37025 case IX86_BUILTIN_RDSEED16_STEP:
37026 icode = CODE_FOR_rdseedhi_1;
37027 mode0 = HImode;
37028 goto rdseed_step;
37030 case IX86_BUILTIN_RDSEED32_STEP:
37031 icode = CODE_FOR_rdseedsi_1;
37032 mode0 = SImode;
37033 goto rdseed_step;
37035 case IX86_BUILTIN_RDSEED64_STEP:
37036 icode = CODE_FOR_rdseeddi_1;
37037 mode0 = DImode;
37039 rdseed_step:
37040 op0 = gen_reg_rtx (mode0);
37041 emit_insn (GEN_FCN (icode) (op0));
37043 arg0 = CALL_EXPR_ARG (exp, 0);
37044 op1 = expand_normal (arg0);
37045 if (!address_operand (op1, VOIDmode))
37047 op1 = convert_memory_address (Pmode, op1);
37048 op1 = copy_addr_to_reg (op1);
37050 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37052 op2 = gen_reg_rtx (QImode);
37054 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37055 const0_rtx);
37056 emit_insn (gen_rtx_SET (op2, pat));
37058 if (target == 0
37059 || !register_operand (target, SImode))
37060 target = gen_reg_rtx (SImode);
37062 emit_insn (gen_zero_extendqisi2 (target, op2));
37063 return target;
37065 case IX86_BUILTIN_SBB32:
37066 icode = CODE_FOR_subborrowsi;
37067 mode0 = SImode;
37068 goto handlecarry;
37070 case IX86_BUILTIN_SBB64:
37071 icode = CODE_FOR_subborrowdi;
37072 mode0 = DImode;
37073 goto handlecarry;
37075 case IX86_BUILTIN_ADDCARRYX32:
37076 icode = CODE_FOR_addcarrysi;
37077 mode0 = SImode;
37078 goto handlecarry;
37080 case IX86_BUILTIN_ADDCARRYX64:
37081 icode = CODE_FOR_addcarrydi;
37082 mode0 = DImode;
37084 handlecarry:
37085 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37086 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37087 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37088 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37090 op1 = expand_normal (arg0);
37091 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37093 op2 = expand_normal (arg1);
37094 if (!register_operand (op2, mode0))
37095 op2 = copy_to_mode_reg (mode0, op2);
37097 op3 = expand_normal (arg2);
37098 if (!register_operand (op3, mode0))
37099 op3 = copy_to_mode_reg (mode0, op3);
37101 op4 = expand_normal (arg3);
37102 if (!address_operand (op4, VOIDmode))
37104 op4 = convert_memory_address (Pmode, op4);
37105 op4 = copy_addr_to_reg (op4);
37108 /* Generate CF from input operand. */
37109 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37111 /* Generate instruction that consumes CF. */
37112 op0 = gen_reg_rtx (mode0);
37114 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37115 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
37116 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
37118 /* Return current CF value. */
37119 if (target == 0)
37120 target = gen_reg_rtx (QImode);
37122 PUT_MODE (pat, QImode);
37123 emit_insn (gen_rtx_SET (target, pat));
37125 /* Store the result. */
37126 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37128 return target;
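/* A sketch of the flow above: the incoming carry byte is converted to CF
   (set iff the byte is nonzero) by addqi3_cconly_overflow, the
   add/subtract-with-carry pattern consumes that CF while storing the wide
   result to *sum_out, and the resulting CF, expressed as an LTU test of the
   flags register, is returned as the builtin's value.  */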
37130 case IX86_BUILTIN_READ_FLAGS:
37131 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37133 if (optimize
37134 || target == NULL_RTX
37135 || !nonimmediate_operand (target, word_mode)
37136 || GET_MODE (target) != word_mode)
37137 target = gen_reg_rtx (word_mode);
37139 emit_insn (gen_pop (target));
37140 return target;
37142 case IX86_BUILTIN_WRITE_FLAGS:
37144 arg0 = CALL_EXPR_ARG (exp, 0);
37145 op0 = expand_normal (arg0);
37146 if (!general_no_elim_operand (op0, word_mode))
37147 op0 = copy_to_mode_reg (word_mode, op0);
37149 emit_insn (gen_push (op0));
37150 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37151 return 0;
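/* Both flag builtins go through the stack: READ_FLAGS is a push of the
   flags register followed by a pop into the result, and WRITE_FLAGS pushes
   the argument and pops it into the flags register.  */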
37153 case IX86_BUILTIN_KORTESTC16:
37154 icode = CODE_FOR_kortestchi;
37155 mode0 = HImode;
37156 mode1 = CCCmode;
37157 goto kortest;
37159 case IX86_BUILTIN_KORTESTZ16:
37160 icode = CODE_FOR_kortestzhi;
37161 mode0 = HImode;
37162 mode1 = CCZmode;
37164 kortest:
37165 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37166 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37167 op0 = expand_normal (arg0);
37168 op1 = expand_normal (arg1);
37170 op0 = copy_to_reg (op0);
37171 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37172 op1 = copy_to_reg (op1);
37173 op1 = lowpart_subreg (mode0, op1, GET_MODE (op1));
37175 target = gen_reg_rtx (QImode);
37176 emit_insn (gen_rtx_SET (target, const0_rtx));
37178 /* Emit kortest. */
37179 emit_insn (GEN_FCN (icode) (op0, op1));
37180 /* And use setcc to return result from flags. */
37181 ix86_expand_setcc (target, EQ,
37182 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
37183 return target;
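/* kortest ORs the two mask operands and sets ZF (result all zeroes) and CF
   (result all ones); the C variant therefore tests CCCmode, the Z variant
   tests CCZmode, and setcc materializes the flag as the builtin's result.  */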
37185 case IX86_BUILTIN_GATHERSIV2DF:
37186 icode = CODE_FOR_avx2_gathersiv2df;
37187 goto gather_gen;
37188 case IX86_BUILTIN_GATHERSIV4DF:
37189 icode = CODE_FOR_avx2_gathersiv4df;
37190 goto gather_gen;
37191 case IX86_BUILTIN_GATHERDIV2DF:
37192 icode = CODE_FOR_avx2_gatherdiv2df;
37193 goto gather_gen;
37194 case IX86_BUILTIN_GATHERDIV4DF:
37195 icode = CODE_FOR_avx2_gatherdiv4df;
37196 goto gather_gen;
37197 case IX86_BUILTIN_GATHERSIV4SF:
37198 icode = CODE_FOR_avx2_gathersiv4sf;
37199 goto gather_gen;
37200 case IX86_BUILTIN_GATHERSIV8SF:
37201 icode = CODE_FOR_avx2_gathersiv8sf;
37202 goto gather_gen;
37203 case IX86_BUILTIN_GATHERDIV4SF:
37204 icode = CODE_FOR_avx2_gatherdiv4sf;
37205 goto gather_gen;
37206 case IX86_BUILTIN_GATHERDIV8SF:
37207 icode = CODE_FOR_avx2_gatherdiv8sf;
37208 goto gather_gen;
37209 case IX86_BUILTIN_GATHERSIV2DI:
37210 icode = CODE_FOR_avx2_gathersiv2di;
37211 goto gather_gen;
37212 case IX86_BUILTIN_GATHERSIV4DI:
37213 icode = CODE_FOR_avx2_gathersiv4di;
37214 goto gather_gen;
37215 case IX86_BUILTIN_GATHERDIV2DI:
37216 icode = CODE_FOR_avx2_gatherdiv2di;
37217 goto gather_gen;
37218 case IX86_BUILTIN_GATHERDIV4DI:
37219 icode = CODE_FOR_avx2_gatherdiv4di;
37220 goto gather_gen;
37221 case IX86_BUILTIN_GATHERSIV4SI:
37222 icode = CODE_FOR_avx2_gathersiv4si;
37223 goto gather_gen;
37224 case IX86_BUILTIN_GATHERSIV8SI:
37225 icode = CODE_FOR_avx2_gathersiv8si;
37226 goto gather_gen;
37227 case IX86_BUILTIN_GATHERDIV4SI:
37228 icode = CODE_FOR_avx2_gatherdiv4si;
37229 goto gather_gen;
37230 case IX86_BUILTIN_GATHERDIV8SI:
37231 icode = CODE_FOR_avx2_gatherdiv8si;
37232 goto gather_gen;
37233 case IX86_BUILTIN_GATHERALTSIV4DF:
37234 icode = CODE_FOR_avx2_gathersiv4df;
37235 goto gather_gen;
37236 case IX86_BUILTIN_GATHERALTDIV8SF:
37237 icode = CODE_FOR_avx2_gatherdiv8sf;
37238 goto gather_gen;
37239 case IX86_BUILTIN_GATHERALTSIV4DI:
37240 icode = CODE_FOR_avx2_gathersiv4di;
37241 goto gather_gen;
37242 case IX86_BUILTIN_GATHERALTDIV8SI:
37243 icode = CODE_FOR_avx2_gatherdiv8si;
37244 goto gather_gen;
37245 case IX86_BUILTIN_GATHER3SIV16SF:
37246 icode = CODE_FOR_avx512f_gathersiv16sf;
37247 goto gather_gen;
37248 case IX86_BUILTIN_GATHER3SIV8DF:
37249 icode = CODE_FOR_avx512f_gathersiv8df;
37250 goto gather_gen;
37251 case IX86_BUILTIN_GATHER3DIV16SF:
37252 icode = CODE_FOR_avx512f_gatherdiv16sf;
37253 goto gather_gen;
37254 case IX86_BUILTIN_GATHER3DIV8DF:
37255 icode = CODE_FOR_avx512f_gatherdiv8df;
37256 goto gather_gen;
37257 case IX86_BUILTIN_GATHER3SIV16SI:
37258 icode = CODE_FOR_avx512f_gathersiv16si;
37259 goto gather_gen;
37260 case IX86_BUILTIN_GATHER3SIV8DI:
37261 icode = CODE_FOR_avx512f_gathersiv8di;
37262 goto gather_gen;
37263 case IX86_BUILTIN_GATHER3DIV16SI:
37264 icode = CODE_FOR_avx512f_gatherdiv16si;
37265 goto gather_gen;
37266 case IX86_BUILTIN_GATHER3DIV8DI:
37267 icode = CODE_FOR_avx512f_gatherdiv8di;
37268 goto gather_gen;
37269 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37270 icode = CODE_FOR_avx512f_gathersiv8df;
37271 goto gather_gen;
37272 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37273 icode = CODE_FOR_avx512f_gatherdiv16sf;
37274 goto gather_gen;
37275 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37276 icode = CODE_FOR_avx512f_gathersiv8di;
37277 goto gather_gen;
37278 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37279 icode = CODE_FOR_avx512f_gatherdiv16si;
37280 goto gather_gen;
37281 case IX86_BUILTIN_GATHER3SIV2DF:
37282 icode = CODE_FOR_avx512vl_gathersiv2df;
37283 goto gather_gen;
37284 case IX86_BUILTIN_GATHER3SIV4DF:
37285 icode = CODE_FOR_avx512vl_gathersiv4df;
37286 goto gather_gen;
37287 case IX86_BUILTIN_GATHER3DIV2DF:
37288 icode = CODE_FOR_avx512vl_gatherdiv2df;
37289 goto gather_gen;
37290 case IX86_BUILTIN_GATHER3DIV4DF:
37291 icode = CODE_FOR_avx512vl_gatherdiv4df;
37292 goto gather_gen;
37293 case IX86_BUILTIN_GATHER3SIV4SF:
37294 icode = CODE_FOR_avx512vl_gathersiv4sf;
37295 goto gather_gen;
37296 case IX86_BUILTIN_GATHER3SIV8SF:
37297 icode = CODE_FOR_avx512vl_gathersiv8sf;
37298 goto gather_gen;
37299 case IX86_BUILTIN_GATHER3DIV4SF:
37300 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37301 goto gather_gen;
37302 case IX86_BUILTIN_GATHER3DIV8SF:
37303 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37304 goto gather_gen;
37305 case IX86_BUILTIN_GATHER3SIV2DI:
37306 icode = CODE_FOR_avx512vl_gathersiv2di;
37307 goto gather_gen;
37308 case IX86_BUILTIN_GATHER3SIV4DI:
37309 icode = CODE_FOR_avx512vl_gathersiv4di;
37310 goto gather_gen;
37311 case IX86_BUILTIN_GATHER3DIV2DI:
37312 icode = CODE_FOR_avx512vl_gatherdiv2di;
37313 goto gather_gen;
37314 case IX86_BUILTIN_GATHER3DIV4DI:
37315 icode = CODE_FOR_avx512vl_gatherdiv4di;
37316 goto gather_gen;
37317 case IX86_BUILTIN_GATHER3SIV4SI:
37318 icode = CODE_FOR_avx512vl_gathersiv4si;
37319 goto gather_gen;
37320 case IX86_BUILTIN_GATHER3SIV8SI:
37321 icode = CODE_FOR_avx512vl_gathersiv8si;
37322 goto gather_gen;
37323 case IX86_BUILTIN_GATHER3DIV4SI:
37324 icode = CODE_FOR_avx512vl_gatherdiv4si;
37325 goto gather_gen;
37326 case IX86_BUILTIN_GATHER3DIV8SI:
37327 icode = CODE_FOR_avx512vl_gatherdiv8si;
37328 goto gather_gen;
37329 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37330 icode = CODE_FOR_avx512vl_gathersiv4df;
37331 goto gather_gen;
37332 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37333 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37334 goto gather_gen;
37335 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37336 icode = CODE_FOR_avx512vl_gathersiv4di;
37337 goto gather_gen;
37338 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37339 icode = CODE_FOR_avx512vl_gatherdiv8si;
37340 goto gather_gen;
37341 case IX86_BUILTIN_SCATTERSIV16SF:
37342 icode = CODE_FOR_avx512f_scattersiv16sf;
37343 goto scatter_gen;
37344 case IX86_BUILTIN_SCATTERSIV8DF:
37345 icode = CODE_FOR_avx512f_scattersiv8df;
37346 goto scatter_gen;
37347 case IX86_BUILTIN_SCATTERDIV16SF:
37348 icode = CODE_FOR_avx512f_scatterdiv16sf;
37349 goto scatter_gen;
37350 case IX86_BUILTIN_SCATTERDIV8DF:
37351 icode = CODE_FOR_avx512f_scatterdiv8df;
37352 goto scatter_gen;
37353 case IX86_BUILTIN_SCATTERSIV16SI:
37354 icode = CODE_FOR_avx512f_scattersiv16si;
37355 goto scatter_gen;
37356 case IX86_BUILTIN_SCATTERSIV8DI:
37357 icode = CODE_FOR_avx512f_scattersiv8di;
37358 goto scatter_gen;
37359 case IX86_BUILTIN_SCATTERDIV16SI:
37360 icode = CODE_FOR_avx512f_scatterdiv16si;
37361 goto scatter_gen;
37362 case IX86_BUILTIN_SCATTERDIV8DI:
37363 icode = CODE_FOR_avx512f_scatterdiv8di;
37364 goto scatter_gen;
37365 case IX86_BUILTIN_SCATTERSIV8SF:
37366 icode = CODE_FOR_avx512vl_scattersiv8sf;
37367 goto scatter_gen;
37368 case IX86_BUILTIN_SCATTERSIV4SF:
37369 icode = CODE_FOR_avx512vl_scattersiv4sf;
37370 goto scatter_gen;
37371 case IX86_BUILTIN_SCATTERSIV4DF:
37372 icode = CODE_FOR_avx512vl_scattersiv4df;
37373 goto scatter_gen;
37374 case IX86_BUILTIN_SCATTERSIV2DF:
37375 icode = CODE_FOR_avx512vl_scattersiv2df;
37376 goto scatter_gen;
37377 case IX86_BUILTIN_SCATTERDIV8SF:
37378 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37379 goto scatter_gen;
37380 case IX86_BUILTIN_SCATTERDIV4SF:
37381 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37382 goto scatter_gen;
37383 case IX86_BUILTIN_SCATTERDIV4DF:
37384 icode = CODE_FOR_avx512vl_scatterdiv4df;
37385 goto scatter_gen;
37386 case IX86_BUILTIN_SCATTERDIV2DF:
37387 icode = CODE_FOR_avx512vl_scatterdiv2df;
37388 goto scatter_gen;
37389 case IX86_BUILTIN_SCATTERSIV8SI:
37390 icode = CODE_FOR_avx512vl_scattersiv8si;
37391 goto scatter_gen;
37392 case IX86_BUILTIN_SCATTERSIV4SI:
37393 icode = CODE_FOR_avx512vl_scattersiv4si;
37394 goto scatter_gen;
37395 case IX86_BUILTIN_SCATTERSIV4DI:
37396 icode = CODE_FOR_avx512vl_scattersiv4di;
37397 goto scatter_gen;
37398 case IX86_BUILTIN_SCATTERSIV2DI:
37399 icode = CODE_FOR_avx512vl_scattersiv2di;
37400 goto scatter_gen;
37401 case IX86_BUILTIN_SCATTERDIV8SI:
37402 icode = CODE_FOR_avx512vl_scatterdiv8si;
37403 goto scatter_gen;
37404 case IX86_BUILTIN_SCATTERDIV4SI:
37405 icode = CODE_FOR_avx512vl_scatterdiv4si;
37406 goto scatter_gen;
37407 case IX86_BUILTIN_SCATTERDIV4DI:
37408 icode = CODE_FOR_avx512vl_scatterdiv4di;
37409 goto scatter_gen;
37410 case IX86_BUILTIN_SCATTERDIV2DI:
37411 icode = CODE_FOR_avx512vl_scatterdiv2di;
37412 goto scatter_gen;
37413 case IX86_BUILTIN_GATHERPFDPD:
37414 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37415 goto vec_prefetch_gen;
37416 case IX86_BUILTIN_SCATTERALTSIV8DF:
37417 icode = CODE_FOR_avx512f_scattersiv8df;
37418 goto scatter_gen;
37419 case IX86_BUILTIN_SCATTERALTDIV16SF:
37420 icode = CODE_FOR_avx512f_scatterdiv16sf;
37421 goto scatter_gen;
37422 case IX86_BUILTIN_SCATTERALTSIV8DI:
37423 icode = CODE_FOR_avx512f_scattersiv8di;
37424 goto scatter_gen;
37425 case IX86_BUILTIN_SCATTERALTDIV16SI:
37426 icode = CODE_FOR_avx512f_scatterdiv16si;
37427 goto scatter_gen;
37428 case IX86_BUILTIN_GATHERPFDPS:
37429 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37430 goto vec_prefetch_gen;
37431 case IX86_BUILTIN_GATHERPFQPD:
37432 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37433 goto vec_prefetch_gen;
37434 case IX86_BUILTIN_GATHERPFQPS:
37435 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37436 goto vec_prefetch_gen;
37437 case IX86_BUILTIN_SCATTERPFDPD:
37438 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37439 goto vec_prefetch_gen;
37440 case IX86_BUILTIN_SCATTERPFDPS:
37441 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37442 goto vec_prefetch_gen;
37443 case IX86_BUILTIN_SCATTERPFQPD:
37444 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37445 goto vec_prefetch_gen;
37446 case IX86_BUILTIN_SCATTERPFQPS:
37447 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37448 goto vec_prefetch_gen;
37450 gather_gen:
37451 rtx half;
37452 rtx (*gen) (rtx, rtx);
37454 arg0 = CALL_EXPR_ARG (exp, 0);
37455 arg1 = CALL_EXPR_ARG (exp, 1);
37456 arg2 = CALL_EXPR_ARG (exp, 2);
37457 arg3 = CALL_EXPR_ARG (exp, 3);
37458 arg4 = CALL_EXPR_ARG (exp, 4);
37459 op0 = expand_normal (arg0);
37460 op1 = expand_normal (arg1);
37461 op2 = expand_normal (arg2);
37462 op3 = expand_normal (arg3);
37463 op4 = expand_normal (arg4);
37464 /* Note the arg order is different from the operand order. */
37465 mode0 = insn_data[icode].operand[1].mode;
37466 mode2 = insn_data[icode].operand[3].mode;
37467 mode3 = insn_data[icode].operand[4].mode;
37468 mode4 = insn_data[icode].operand[5].mode;
37470 if (target == NULL_RTX
37471 || GET_MODE (target) != insn_data[icode].operand[0].mode
37472 || !insn_data[icode].operand[0].predicate (target,
37473 GET_MODE (target)))
37474 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37475 else
37476 subtarget = target;
37478 switch (fcode)
37480 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37481 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37482 half = gen_reg_rtx (V8SImode);
37483 if (!nonimmediate_operand (op2, V16SImode))
37484 op2 = copy_to_mode_reg (V16SImode, op2);
37485 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37486 op2 = half;
37487 break;
37488 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37489 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37490 case IX86_BUILTIN_GATHERALTSIV4DF:
37491 case IX86_BUILTIN_GATHERALTSIV4DI:
37492 half = gen_reg_rtx (V4SImode);
37493 if (!nonimmediate_operand (op2, V8SImode))
37494 op2 = copy_to_mode_reg (V8SImode, op2);
37495 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37496 op2 = half;
37497 break;
37498 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37499 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37500 half = gen_reg_rtx (mode0);
37501 if (mode0 == V8SFmode)
37502 gen = gen_vec_extract_lo_v16sf;
37503 else
37504 gen = gen_vec_extract_lo_v16si;
37505 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37506 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37507 emit_insn (gen (half, op0));
37508 op0 = half;
37509 if (GET_MODE (op3) != VOIDmode)
37511 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37512 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37513 emit_insn (gen (half, op3));
37514 op3 = half;
37516 break;
37517 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37518 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37519 case IX86_BUILTIN_GATHERALTDIV8SF:
37520 case IX86_BUILTIN_GATHERALTDIV8SI:
37521 half = gen_reg_rtx (mode0);
37522 if (mode0 == V4SFmode)
37523 gen = gen_vec_extract_lo_v8sf;
37524 else
37525 gen = gen_vec_extract_lo_v8si;
37526 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37527 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37528 emit_insn (gen (half, op0));
37529 op0 = half;
37530 if (GET_MODE (op3) != VOIDmode)
37532 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37533 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37534 emit_insn (gen (half, op3));
37535 op3 = half;
37537 break;
37538 default:
37539 break;
37542 /* Force the memory operand to use only a base register here. But we
37543 don't want to do this for the memory operands of other builtin
37544 functions. */
37545 op1 = ix86_zero_extend_to_Pmode (op1);
37547 if (!insn_data[icode].operand[1].predicate (op0, mode0))
37548 op0 = copy_to_mode_reg (mode0, op0);
37549 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
37550 op1 = copy_to_mode_reg (Pmode, op1);
37551 if (!insn_data[icode].operand[3].predicate (op2, mode2))
37552 op2 = copy_to_mode_reg (mode2, op2);
37554 op3 = fixup_modeless_constant (op3, mode3);
37556 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
37558 if (!insn_data[icode].operand[4].predicate (op3, mode3))
37559 op3 = copy_to_mode_reg (mode3, op3);
37561 else
37563 op3 = copy_to_reg (op3);
37564 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
37566 if (!insn_data[icode].operand[5].predicate (op4, mode4))
37568 error ("the last argument must be scale 1, 2, 4, 8");
37569 return const0_rtx;
37572 /* Optimize. If mask is known to have all high bits set,
37573 replace op0 with pc_rtx to signal that the instruction
37574 overwrites the whole destination and doesn't use its
37575 previous contents. */
37576 if (optimize)
37578 if (TREE_CODE (arg3) == INTEGER_CST)
37580 if (integer_all_onesp (arg3))
37581 op0 = pc_rtx;
37583 else if (TREE_CODE (arg3) == VECTOR_CST)
37585 unsigned int negative = 0;
37586 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
37588 tree cst = VECTOR_CST_ELT (arg3, i);
37589 if (TREE_CODE (cst) == INTEGER_CST
37590 && tree_int_cst_sign_bit (cst))
37591 negative++;
37592 else if (TREE_CODE (cst) == REAL_CST
37593 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
37594 negative++;
37596 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
37597 op0 = pc_rtx;
37599 else if (TREE_CODE (arg3) == SSA_NAME
37600 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
37602 /* Recognize also when mask is like:
37603 __v2df src = _mm_setzero_pd ();
37604 __v2df mask = _mm_cmpeq_pd (src, src);
37606 __v8sf src = _mm256_setzero_ps ();
37607 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
37608 as that is a cheaper way to load all ones into
37609 a register than having to load a constant from
37610 memory. */
37611 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
37612 if (is_gimple_call (def_stmt))
37614 tree fndecl = gimple_call_fndecl (def_stmt);
37615 if (fndecl
37616 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
37617 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
37619 case IX86_BUILTIN_CMPPD:
37620 case IX86_BUILTIN_CMPPS:
37621 case IX86_BUILTIN_CMPPD256:
37622 case IX86_BUILTIN_CMPPS256:
37623 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
37624 break;
37625 /* FALLTHRU */
37626 case IX86_BUILTIN_CMPEQPD:
37627 case IX86_BUILTIN_CMPEQPS:
37628 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
37629 && initializer_zerop (gimple_call_arg (def_stmt,
37630 1)))
37631 op0 = pc_rtx;
37632 break;
37633 default:
37634 break;
37640 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37641 if (! pat)
37642 return const0_rtx;
37643 emit_insn (pat);
37645 switch (fcode)
37647 case IX86_BUILTIN_GATHER3DIV16SF:
37648 if (target == NULL_RTX)
37649 target = gen_reg_rtx (V8SFmode);
37650 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37651 break;
37652 case IX86_BUILTIN_GATHER3DIV16SI:
37653 if (target == NULL_RTX)
37654 target = gen_reg_rtx (V8SImode);
37655 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37656 break;
37657 case IX86_BUILTIN_GATHER3DIV8SF:
37658 case IX86_BUILTIN_GATHERDIV8SF:
37659 if (target == NULL_RTX)
37660 target = gen_reg_rtx (V4SFmode);
37661 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37662 break;
37663 case IX86_BUILTIN_GATHER3DIV8SI:
37664 case IX86_BUILTIN_GATHERDIV8SI:
37665 if (target == NULL_RTX)
37666 target = gen_reg_rtx (V4SImode);
37667 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37668 break;
37669 default:
37670 target = subtarget;
37671 break;
37673 return target;
37675 scatter_gen:
37676 arg0 = CALL_EXPR_ARG (exp, 0);
37677 arg1 = CALL_EXPR_ARG (exp, 1);
37678 arg2 = CALL_EXPR_ARG (exp, 2);
37679 arg3 = CALL_EXPR_ARG (exp, 3);
37680 arg4 = CALL_EXPR_ARG (exp, 4);
37681 op0 = expand_normal (arg0);
37682 op1 = expand_normal (arg1);
37683 op2 = expand_normal (arg2);
37684 op3 = expand_normal (arg3);
37685 op4 = expand_normal (arg4);
37686 mode1 = insn_data[icode].operand[1].mode;
37687 mode2 = insn_data[icode].operand[2].mode;
37688 mode3 = insn_data[icode].operand[3].mode;
37689 mode4 = insn_data[icode].operand[4].mode;
37691 /* Scatter instruction stores operand op3 to memory with
37692 indices from op2 and scale from op4 under writemask op1.
37693 If index operand op2 has more elements than source operand
37694 op3, one needs to use only its low half; and vice versa. */
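/* For example (illustrative): IX86_BUILTIN_SCATTERALTSIV8DF stores only
   eight DFmode elements, yet its index argument arrives as a V16SI vector,
   so the code below keeps just the low V8SI half of op2.  Conversely,
   IX86_BUILTIN_SCATTERALTDIV16SF receives a V16SF source but the
   DI-indexed pattern stores only eight elements, so op3 is reduced to its
   low V8SF half.  */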
37695 switch (fcode)
37697 case IX86_BUILTIN_SCATTERALTSIV8DF:
37698 case IX86_BUILTIN_SCATTERALTSIV8DI:
37699 half = gen_reg_rtx (V8SImode);
37700 if (!nonimmediate_operand (op2, V16SImode))
37701 op2 = copy_to_mode_reg (V16SImode, op2);
37702 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37703 op2 = half;
37704 break;
37705 case IX86_BUILTIN_SCATTERALTDIV16SF:
37706 case IX86_BUILTIN_SCATTERALTDIV16SI:
37707 half = gen_reg_rtx (mode3);
37708 if (mode3 == V8SFmode)
37709 gen = gen_vec_extract_lo_v16sf;
37710 else
37711 gen = gen_vec_extract_lo_v16si;
37712 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37713 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37714 emit_insn (gen (half, op3));
37715 op3 = half;
37716 break;
37717 default:
37718 break;
37721 /* Force the memory operand to use only a base register here. But we
37722 don't want to do this for the memory operands of other builtin
37723 functions. */
37724 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37726 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37727 op0 = copy_to_mode_reg (Pmode, op0);
37729 op1 = fixup_modeless_constant (op1, mode1);
37731 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37733 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37734 op1 = copy_to_mode_reg (mode1, op1);
37736 else
37738 op1 = copy_to_reg (op1);
37739 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37742 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37743 op2 = copy_to_mode_reg (mode2, op2);
37745 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37746 op3 = copy_to_mode_reg (mode3, op3);
37748 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37750 error ("the last argument must be scale 1, 2, 4, 8");
37751 return const0_rtx;
37754 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37755 if (! pat)
37756 return const0_rtx;
37758 emit_insn (pat);
37759 return 0;
37761 vec_prefetch_gen:
37762 arg0 = CALL_EXPR_ARG (exp, 0);
37763 arg1 = CALL_EXPR_ARG (exp, 1);
37764 arg2 = CALL_EXPR_ARG (exp, 2);
37765 arg3 = CALL_EXPR_ARG (exp, 3);
37766 arg4 = CALL_EXPR_ARG (exp, 4);
37767 op0 = expand_normal (arg0);
37768 op1 = expand_normal (arg1);
37769 op2 = expand_normal (arg2);
37770 op3 = expand_normal (arg3);
37771 op4 = expand_normal (arg4);
37772 mode0 = insn_data[icode].operand[0].mode;
37773 mode1 = insn_data[icode].operand[1].mode;
37774 mode3 = insn_data[icode].operand[3].mode;
37775 mode4 = insn_data[icode].operand[4].mode;
37777 op0 = fixup_modeless_constant (op0, mode0);
37779 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37781 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37782 op0 = copy_to_mode_reg (mode0, op0);
37784 else
37786 op0 = copy_to_reg (op0);
37787 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37790 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37791 op1 = copy_to_mode_reg (mode1, op1);
37793 /* Force the memory operand to use only a base register here. But we
37794 don't want to do this for the memory operands of other builtin
37795 functions. */
37796 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37798 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37799 op2 = copy_to_mode_reg (Pmode, op2);
37801 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37803 error ("the fourth argument must be scale 1, 2, 4, 8");
37804 return const0_rtx;
37807 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37809 error ("incorrect hint operand");
37810 return const0_rtx;
37813 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37814 if (! pat)
37815 return const0_rtx;
37817 emit_insn (pat);
37819 return 0;
37821 case IX86_BUILTIN_XABORT:
37822 icode = CODE_FOR_xabort;
37823 arg0 = CALL_EXPR_ARG (exp, 0);
37824 op0 = expand_normal (arg0);
37825 mode0 = insn_data[icode].operand[0].mode;
37826 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37828 error ("the xabort's argument must be an 8-bit immediate");
37829 return const0_rtx;
37831 emit_insn (gen_xabort (op0));
37832 return 0;
37834 default:
37835 break;
37838 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37839 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37841 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37842 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37843 target);
37846 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37847 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37849 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37850 switch (fcode)
37852 case IX86_BUILTIN_FABSQ:
37853 case IX86_BUILTIN_COPYSIGNQ:
37854 if (!TARGET_SSE)
37855 /* Emit a normal call if SSE isn't available. */
37856 return expand_call (exp, target, ignore);
37857 /* FALLTHRU */
37858 default:
37859 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37863 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37864 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37866 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37867 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37870 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37871 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37873 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37874 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37877 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37878 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37880 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37881 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37884 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37885 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37887 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37888 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37891 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37892 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37894 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37895 const struct builtin_description *d = bdesc_multi_arg + i;
37896 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37897 (enum ix86_builtin_func_type)
37898 d->flag, d->comparison);
37901 gcc_unreachable ();
37904 /* This returns the target-specific builtin with code CODE if
37905 current_function_decl has visibility on this builtin, which is checked
37906 using isa flags. Returns NULL_TREE otherwise. */
37908 static tree ix86_get_builtin (enum ix86_builtins code)
37910 struct cl_target_option *opts;
37911 tree target_tree = NULL_TREE;
37913 /* Determine the isa flags of current_function_decl. */
37915 if (current_function_decl)
37916 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37918 if (target_tree == NULL)
37919 target_tree = target_option_default_node;
37921 opts = TREE_TARGET_OPTION (target_tree);
37923 if (ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37924 return ix86_builtin_decl (code, true);
37925 else
37926 return NULL_TREE;
37929 /* Return the function decl of the target-specific builtin
37930 corresponding to the MPX builtin passed in FCODE. */
37931 static tree
37932 ix86_builtin_mpx_function (unsigned fcode)
37934 switch (fcode)
37936 case BUILT_IN_CHKP_BNDMK:
37937 return ix86_builtins[IX86_BUILTIN_BNDMK];
37939 case BUILT_IN_CHKP_BNDSTX:
37940 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37942 case BUILT_IN_CHKP_BNDLDX:
37943 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37945 case BUILT_IN_CHKP_BNDCL:
37946 return ix86_builtins[IX86_BUILTIN_BNDCL];
37948 case BUILT_IN_CHKP_BNDCU:
37949 return ix86_builtins[IX86_BUILTIN_BNDCU];
37951 case BUILT_IN_CHKP_BNDRET:
37952 return ix86_builtins[IX86_BUILTIN_BNDRET];
37954 case BUILT_IN_CHKP_INTERSECT:
37955 return ix86_builtins[IX86_BUILTIN_BNDINT];
37957 case BUILT_IN_CHKP_NARROW:
37958 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37960 case BUILT_IN_CHKP_SIZEOF:
37961 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37963 case BUILT_IN_CHKP_EXTRACT_LOWER:
37964 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37966 case BUILT_IN_CHKP_EXTRACT_UPPER:
37967 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37969 default:
37970 return NULL_TREE;
37973 gcc_unreachable ();
37976 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37978 Return an address to be used to load/store bounds for pointer
37979 passed in SLOT.
37981 SLOT_NO is an integer constant holding number of a target
37982 dependent special slot to be used in case SLOT is not a memory.
37984 SPECIAL_BASE is a pointer to be used as a base of fake address
37985 to access special slots in Bounds Table. SPECIAL_BASE[-1],
37986 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37988 static rtx
37989 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37991 rtx addr = NULL;
37993 /* A NULL slot means we pass bounds for a pointer not passed to the
37994 function at all. A register slot means we pass the pointer in a
37995 register. In both these cases bounds are passed via Bounds
37996 Table. Since we do not have actual pointer stored in memory,
37997 we have to use fake addresses to access Bounds Table. We
37998 start with (special_base - sizeof (void*)) and decrease this
37999 address by pointer size to get addresses for other slots. */
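/* Worked example (assuming 64-bit Pmode, i.e. GET_MODE_SIZE (Pmode) == 8):
   slot_no 0 yields special_base - 8, slot_no 1 yields special_base - 16,
   and so on, matching the SPECIAL_BASE[-1], SPECIAL_BASE[-2], ... scheme
   described above.  */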
38000 if (!slot || REG_P (slot))
38002 gcc_assert (CONST_INT_P (slot_no));
38003 addr = plus_constant (Pmode, special_base,
38004 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38006 /* If pointer is passed in a memory then its address is used to
38007 access Bounds Table. */
38008 else if (MEM_P (slot))
38010 addr = XEXP (slot, 0);
38011 if (!register_operand (addr, Pmode))
38012 addr = copy_addr_to_reg (addr);
38014 else
38015 gcc_unreachable ();
38017 return addr;
38020 /* Expand pass uses this hook to load bounds for function parameter
38021 PTR passed in SLOT in case its bounds are not passed in a register.
38023 If SLOT is a memory, then bounds are loaded as for regular pointer
38024 loaded from memory. PTR may be NULL in case SLOT is a memory.
38025 In such case value of PTR (if required) may be loaded from SLOT.
38027 If SLOT is NULL or a register then SLOT_NO is an integer constant
38028 holding number of the target dependent special slot which should be
38029 used to obtain bounds.
38031 Return loaded bounds. */
38033 static rtx
38034 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38036 rtx reg = gen_reg_rtx (BNDmode);
38037 rtx addr;
38039 /* Get address to be used to access Bounds Table. Special slots start
38040 at the location of return address of the current function. */
38041 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38043 /* Load pointer value from a memory if we don't have it. */
38044 if (!ptr)
38046 gcc_assert (MEM_P (slot));
38047 ptr = copy_addr_to_reg (slot);
38050 if (!register_operand (ptr, Pmode))
38051 ptr = ix86_zero_extend_to_Pmode (ptr);
38053 emit_insn (BNDmode == BND64mode
38054 ? gen_bnd64_ldx (reg, addr, ptr)
38055 : gen_bnd32_ldx (reg, addr, ptr));
38057 return reg;
38060 /* Expand pass uses this hook to store BOUNDS for call argument PTR
38061 passed in SLOT in case BOUNDS are not passed in a register.
38063 If SLOT is a memory, then BOUNDS are stored as for regular pointer
38064 stored in memory. PTR may be NULL in case SLOT is a memory.
38065 In such case value of PTR (if required) may be loaded from SLOT.
38067 If SLOT is NULL or a register then SLOT_NO is an integer constant
38068 holding number of the target dependent special slot which should be
38069 used to store BOUNDS. */
38071 static void
38072 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38074 rtx addr;
38076 /* Get address to be used to access Bounds Table. Special slots start
38077 at the location of return address of a called function. */
38078 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38080 /* Load pointer value from a memory if we don't have it. */
38081 if (!ptr)
38083 gcc_assert (MEM_P (slot));
38084 ptr = copy_addr_to_reg (slot);
38087 if (!register_operand (ptr, Pmode))
38088 ptr = ix86_zero_extend_to_Pmode (ptr);
38090 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38091 if (!register_operand (bounds, BNDmode))
38092 bounds = copy_to_mode_reg (BNDmode, bounds);
38094 emit_insn (BNDmode == BND64mode
38095 ? gen_bnd64_stx (addr, ptr, bounds)
38096 : gen_bnd32_stx (addr, ptr, bounds));
38099 /* Load and return bounds returned by function in SLOT. */
38101 static rtx
38102 ix86_load_returned_bounds (rtx slot)
38104 rtx res;
38106 gcc_assert (REG_P (slot));
38107 res = gen_reg_rtx (BNDmode);
38108 emit_move_insn (res, slot);
38110 return res;
38113 /* Store BOUNDS returned by function into SLOT. */
38115 static void
38116 ix86_store_returned_bounds (rtx slot, rtx bounds)
38118 gcc_assert (REG_P (slot));
38119 emit_move_insn (slot, bounds);
38122 /* Returns a function decl for a vectorized version of the combined function
38123 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38124 if it is not available. */
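/* For example (one illustrative mapping from the switch below): a loop
   computing floor () over V4DF vectors resolves CASE_CFN_FLOOR with
   out_mode == DFmode and out_n == 4 to IX86_BUILTIN_FLOORPD256, provided
   !flag_trapping_math and TARGET_ROUND hold.  */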
38126 static tree
38127 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38128 tree type_in)
38130 machine_mode in_mode, out_mode;
38131 int in_n, out_n;
38133 if (TREE_CODE (type_out) != VECTOR_TYPE
38134 || TREE_CODE (type_in) != VECTOR_TYPE)
38135 return NULL_TREE;
38137 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38138 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38139 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38140 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38142 switch (fn)
38144 CASE_CFN_EXP2:
38145 if (out_mode == SFmode && in_mode == SFmode)
38147 if (out_n == 16 && in_n == 16)
38148 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38150 break;
38152 CASE_CFN_IFLOOR:
38153 CASE_CFN_LFLOOR:
38154 CASE_CFN_LLFLOOR:
38155 /* The round insn does not trap on denormals. */
38156 if (flag_trapping_math || !TARGET_ROUND)
38157 break;
38159 if (out_mode == SImode && in_mode == DFmode)
38161 if (out_n == 4 && in_n == 2)
38162 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38163 else if (out_n == 8 && in_n == 4)
38164 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38165 else if (out_n == 16 && in_n == 8)
38166 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38168 if (out_mode == SImode && in_mode == SFmode)
38170 if (out_n == 4 && in_n == 4)
38171 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38172 else if (out_n == 8 && in_n == 8)
38173 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38174 else if (out_n == 16 && in_n == 16)
38175 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38177 break;
38179 CASE_CFN_ICEIL:
38180 CASE_CFN_LCEIL:
38181 CASE_CFN_LLCEIL:
38182 /* The round insn does not trap on denormals. */
38183 if (flag_trapping_math || !TARGET_ROUND)
38184 break;
38186 if (out_mode == SImode && in_mode == DFmode)
38188 if (out_n == 4 && in_n == 2)
38189 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38190 else if (out_n == 8 && in_n == 4)
38191 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38192 else if (out_n == 16 && in_n == 8)
38193 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38195 if (out_mode == SImode && in_mode == SFmode)
38197 if (out_n == 4 && in_n == 4)
38198 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38199 else if (out_n == 8 && in_n == 8)
38200 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38201 else if (out_n == 16 && in_n == 16)
38202 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38204 break;
38206 CASE_CFN_IRINT:
38207 CASE_CFN_LRINT:
38208 CASE_CFN_LLRINT:
38209 if (out_mode == SImode && in_mode == DFmode)
38211 if (out_n == 4 && in_n == 2)
38212 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38213 else if (out_n == 8 && in_n == 4)
38214 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38215 else if (out_n == 16 && in_n == 8)
38216 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38218 if (out_mode == SImode && in_mode == SFmode)
38220 if (out_n == 4 && in_n == 4)
38221 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38222 else if (out_n == 8 && in_n == 8)
38223 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38224 else if (out_n == 16 && in_n == 16)
38225 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38227 break;
38229 CASE_CFN_IROUND:
38230 CASE_CFN_LROUND:
38231 CASE_CFN_LLROUND:
38232 /* The round insn does not trap on denormals. */
38233 if (flag_trapping_math || !TARGET_ROUND)
38234 break;
38236 if (out_mode == SImode && in_mode == DFmode)
38238 if (out_n == 4 && in_n == 2)
38239 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38240 else if (out_n == 8 && in_n == 4)
38241 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38242 else if (out_n == 16 && in_n == 8)
38243 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38245 if (out_mode == SImode && in_mode == SFmode)
38247 if (out_n == 4 && in_n == 4)
38248 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38249 else if (out_n == 8 && in_n == 8)
38250 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38251 else if (out_n == 16 && in_n == 16)
38252 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38254 break;
38256 CASE_CFN_FLOOR:
38257 /* The round insn does not trap on denormals. */
38258 if (flag_trapping_math || !TARGET_ROUND)
38259 break;
38261 if (out_mode == DFmode && in_mode == DFmode)
38263 if (out_n == 2 && in_n == 2)
38264 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38265 else if (out_n == 4 && in_n == 4)
38266 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38267 else if (out_n == 8 && in_n == 8)
38268 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38270 if (out_mode == SFmode && in_mode == SFmode)
38272 if (out_n == 4 && in_n == 4)
38273 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38274 else if (out_n == 8 && in_n == 8)
38275 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38276 else if (out_n == 16 && in_n == 16)
38277 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38279 break;
38281 CASE_CFN_CEIL:
38282 /* The round insn does not trap on denormals. */
38283 if (flag_trapping_math || !TARGET_ROUND)
38284 break;
38286 if (out_mode == DFmode && in_mode == DFmode)
38288 if (out_n == 2 && in_n == 2)
38289 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38290 else if (out_n == 4 && in_n == 4)
38291 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38292 else if (out_n == 8 && in_n == 8)
38293 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38295 if (out_mode == SFmode && in_mode == SFmode)
38297 if (out_n == 4 && in_n == 4)
38298 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38299 else if (out_n == 8 && in_n == 8)
38300 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38301 else if (out_n == 16 && in_n == 16)
38302 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38304 break;
38306 CASE_CFN_TRUNC:
38307 /* The round insn does not trap on denormals. */
38308 if (flag_trapping_math || !TARGET_ROUND)
38309 break;
38311 if (out_mode == DFmode && in_mode == DFmode)
38313 if (out_n == 2 && in_n == 2)
38314 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
38315 else if (out_n == 4 && in_n == 4)
38316 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
38317 else if (out_n == 8 && in_n == 8)
38318 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
38320 if (out_mode == SFmode && in_mode == SFmode)
38322 if (out_n == 4 && in_n == 4)
38323 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
38324 else if (out_n == 8 && in_n == 8)
38325 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
38326 else if (out_n == 16 && in_n == 16)
38327 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38329 break;
38331 CASE_CFN_RINT:
38332 /* The round insn does not trap on denormals. */
38333 if (flag_trapping_math || !TARGET_ROUND)
38334 break;
38336 if (out_mode == DFmode && in_mode == DFmode)
38338 if (out_n == 2 && in_n == 2)
38339 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38340 else if (out_n == 4 && in_n == 4)
38341 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38343 if (out_mode == SFmode && in_mode == SFmode)
38345 if (out_n == 4 && in_n == 4)
38346 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38347 else if (out_n == 8 && in_n == 8)
38348 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38350 break;
38352 CASE_CFN_FMA:
38353 if (out_mode == DFmode && in_mode == DFmode)
38355 if (out_n == 2 && in_n == 2)
38356 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38357 if (out_n == 4 && in_n == 4)
38358 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38360 if (out_mode == SFmode && in_mode == SFmode)
38362 if (out_n == 4 && in_n == 4)
38363 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38364 if (out_n == 8 && in_n == 8)
38365 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38367 break;
38369 default:
38370 break;
38373 /* Dispatch to a handler for a vectorization library. */
38374 if (ix86_veclib_handler)
38375 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38377 return NULL_TREE;
38380 /* Handler for an SVML-style interface to
38381 a library with vectorized intrinsics. */
38383 static tree
38384 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38386 char name[20];
38387 tree fntype, new_fndecl, args;
38388 unsigned arity;
38389 const char *bname;
38390 machine_mode el_mode, in_mode;
38391 int n, in_n;
38393 /* The SVML is suitable for unsafe math only. */
38394 if (!flag_unsafe_math_optimizations)
38395 return NULL_TREE;
38397 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38398 n = TYPE_VECTOR_SUBPARTS (type_out);
38399 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38400 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38401 if (el_mode != in_mode
38402 || n != in_n)
38403 return NULL_TREE;
38405 switch (fn)
38407 CASE_CFN_EXP:
38408 CASE_CFN_LOG:
38409 CASE_CFN_LOG10:
38410 CASE_CFN_POW:
38411 CASE_CFN_TANH:
38412 CASE_CFN_TAN:
38413 CASE_CFN_ATAN:
38414 CASE_CFN_ATAN2:
38415 CASE_CFN_ATANH:
38416 CASE_CFN_CBRT:
38417 CASE_CFN_SINH:
38418 CASE_CFN_SIN:
38419 CASE_CFN_ASINH:
38420 CASE_CFN_ASIN:
38421 CASE_CFN_COSH:
38422 CASE_CFN_COS:
38423 CASE_CFN_ACOSH:
38424 CASE_CFN_ACOS:
38425 if ((el_mode != DFmode || n != 2)
38426 && (el_mode != SFmode || n != 4))
38427 return NULL_TREE;
38428 break;
38430 default:
38431 return NULL_TREE;
38434 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38435 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38437 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38438 strcpy (name, "vmlsLn4");
38439 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38440 strcpy (name, "vmldLn2");
38441 else if (n == 4)
38443 sprintf (name, "vmls%s", bname+10);
38444 name[strlen (name)-1] = '4';
38446 else
38447 sprintf (name, "vmld%s2", bname+10);
38449 /* Convert to uppercase. */
38450 name[4] &= ~0x20;
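/* Example of the resulting name (derived from the mangling above): for
   __builtin_sinf with n == 4 the suffix after "__builtin_" is "sinf",
   giving "vmlssinf"; the trailing letter becomes '4' and name[4] is
   uppercased, yielding "vmlsSin4".  The 2-wide double variant of
   __builtin_sin yields "vmldSin2".  */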
38452 arity = 0;
38453 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38454 arity++;
38456 if (arity == 1)
38457 fntype = build_function_type_list (type_out, type_in, NULL);
38458 else
38459 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38461 /* Build a function declaration for the vectorized function. */
38462 new_fndecl = build_decl (BUILTINS_LOCATION,
38463 FUNCTION_DECL, get_identifier (name), fntype);
38464 TREE_PUBLIC (new_fndecl) = 1;
38465 DECL_EXTERNAL (new_fndecl) = 1;
38466 DECL_IS_NOVOPS (new_fndecl) = 1;
38467 TREE_READONLY (new_fndecl) = 1;
38469 return new_fndecl;
38472 /* Handler for an ACML-style interface to
38473 a library with vectorized intrinsics. */
38475 static tree
38476 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38478 char name[20] = "__vr.._";
38479 tree fntype, new_fndecl, args;
38480 unsigned arity;
38481 const char *bname;
38482 machine_mode el_mode, in_mode;
38483 int n, in_n;
38485 /* The ACML is 64-bit only and suitable for unsafe math only, as
38486 it does not correctly support parts of IEEE (such as denormals)
38487 with the required precision. */
38488 if (!TARGET_64BIT
38489 || !flag_unsafe_math_optimizations)
38490 return NULL_TREE;
38492 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38493 n = TYPE_VECTOR_SUBPARTS (type_out);
38494 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38495 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38496 if (el_mode != in_mode
38497 || n != in_n)
38498 return NULL_TREE;
38500 switch (fn)
38502 CASE_CFN_SIN:
38503 CASE_CFN_COS:
38504 CASE_CFN_EXP:
38505 CASE_CFN_LOG:
38506 CASE_CFN_LOG2:
38507 CASE_CFN_LOG10:
38508 if (el_mode == DFmode && n == 2)
38510 name[4] = 'd';
38511 name[5] = '2';
38513 else if (el_mode == SFmode && n == 4)
38515 name[4] = 's';
38516 name[5] = '4';
38518 else
38519 return NULL_TREE;
38520 break;
38522 default:
38523 return NULL_TREE;
38526 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38527 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38528 sprintf (name + 7, "%s", bname+10);
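/* Example of the resulting name (derived from the template above): for
   __builtin_sin with el_mode == DFmode and n == 2 the template becomes
   "__vrd2_" and the suffix "sin" is appended, giving "__vrd2_sin";
   __builtin_cosf with n == 4 yields "__vrs4_cosf".  */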
38530 arity = 0;
38531 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38532 arity++;
38534 if (arity == 1)
38535 fntype = build_function_type_list (type_out, type_in, NULL);
38536 else
38537 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38539 /* Build a function declaration for the vectorized function. */
38540 new_fndecl = build_decl (BUILTINS_LOCATION,
38541 FUNCTION_DECL, get_identifier (name), fntype);
38542 TREE_PUBLIC (new_fndecl) = 1;
38543 DECL_EXTERNAL (new_fndecl) = 1;
38544 DECL_IS_NOVOPS (new_fndecl) = 1;
38545 TREE_READONLY (new_fndecl) = 1;
38547 return new_fndecl;
38550 /* Returns a decl of a function that implements gather load with
38551 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
38552 Return NULL_TREE if it is not available. */
38554 static tree
38555 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38556 const_tree index_type, int scale)
38558 bool si;
38559 enum ix86_builtins code;
38561 if (! TARGET_AVX2)
38562 return NULL_TREE;
38564 if ((TREE_CODE (index_type) != INTEGER_TYPE
38565 && !POINTER_TYPE_P (index_type))
38566 || (TYPE_MODE (index_type) != SImode
38567 && TYPE_MODE (index_type) != DImode))
38568 return NULL_TREE;
38570 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38571 return NULL_TREE;
38573 /* v*gather* insn sign extends index to pointer mode. */
38574 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38575 && TYPE_UNSIGNED (index_type))
38576 return NULL_TREE;
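/* E.g. an unsigned 32-bit index type on a 64-bit target is rejected by
   the check above: the instruction would sign-extend the index, so values
   with the high bit set would address the wrong location.  */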
38578 if (scale <= 0
38579 || scale > 8
38580 || (scale & (scale - 1)) != 0)
38581 return NULL_TREE;
38583 si = TYPE_MODE (index_type) == SImode;
38584 switch (TYPE_MODE (mem_vectype))
38586 case V2DFmode:
38587 if (TARGET_AVX512VL)
38588 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38589 else
38590 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38591 break;
38592 case V4DFmode:
38593 if (TARGET_AVX512VL)
38594 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38595 else
38596 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38597 break;
38598 case V2DImode:
38599 if (TARGET_AVX512VL)
38600 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38601 else
38602 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38603 break;
38604 case V4DImode:
38605 if (TARGET_AVX512VL)
38606 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38607 else
38608 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38609 break;
38610 case V4SFmode:
38611 if (TARGET_AVX512VL)
38612 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38613 else
38614 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38615 break;
38616 case V8SFmode:
38617 if (TARGET_AVX512VL)
38618 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38619 else
38620 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38621 break;
38622 case V4SImode:
38623 if (TARGET_AVX512VL)
38624 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38625 else
38626 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38627 break;
38628 case V8SImode:
38629 if (TARGET_AVX512VL)
38630 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38631 else
38632 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38633 break;
38634 case V8DFmode:
38635 if (TARGET_AVX512F)
38636 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38637 else
38638 return NULL_TREE;
38639 break;
38640 case V8DImode:
38641 if (TARGET_AVX512F)
38642 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38643 else
38644 return NULL_TREE;
38645 break;
38646 case V16SFmode:
38647 if (TARGET_AVX512F)
38648 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38649 else
38650 return NULL_TREE;
38651 break;
38652 case V16SImode:
38653 if (TARGET_AVX512F)
38654 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38655 else
38656 return NULL_TREE;
38657 break;
38658 default:
38659 return NULL_TREE;
38662 return ix86_get_builtin (code);
38665 /* Returns a decl of a function that implements scatter store with
38666 register type VECTYPE and index type INDEX_TYPE and SCALE.
38667 Return NULL_TREE if it is not available. */
38669 static tree
38670 ix86_vectorize_builtin_scatter (const_tree vectype,
38671 const_tree index_type, int scale)
38673 bool si;
38674 enum ix86_builtins code;
38676 if (!TARGET_AVX512F)
38677 return NULL_TREE;
38679 if ((TREE_CODE (index_type) != INTEGER_TYPE
38680 && !POINTER_TYPE_P (index_type))
38681 || (TYPE_MODE (index_type) != SImode
38682 && TYPE_MODE (index_type) != DImode))
38683 return NULL_TREE;
38685 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38686 return NULL_TREE;
38688 /* v*scatter* insn sign extends index to pointer mode. */
38689 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38690 && TYPE_UNSIGNED (index_type))
38691 return NULL_TREE;
38693 /* Scale can be 1, 2, 4 or 8. */
38694 if (scale <= 0
38695 || scale > 8
38696 || (scale & (scale - 1)) != 0)
38697 return NULL_TREE;
38699 si = TYPE_MODE (index_type) == SImode;
38700 switch (TYPE_MODE (vectype))
38702 case V8DFmode:
38703 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38704 break;
38705 case V8DImode:
38706 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38707 break;
38708 case V16SFmode:
38709 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38710 break;
38711 case V16SImode:
38712 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38713 break;
38714 default:
38715 return NULL_TREE;
38718 return ix86_builtins[code];
38721 /* Return true if it is safe to use the rsqrt optabs to optimize
38722 1.0/sqrt. */
38724 static bool
38725 use_rsqrt_p ()
38727 return (TARGET_SSE_MATH
38728 && flag_finite_math_only
38729 && !flag_trapping_math
38730 && flag_unsafe_math_optimizations);
38733 /* Returns a code for a target-specific builtin that implements
38734 reciprocal of the function, or NULL_TREE if not available. */
38736 static tree
38737 ix86_builtin_reciprocal (tree fndecl)
38739 switch (DECL_FUNCTION_CODE (fndecl))
38741 /* Vectorized version of sqrt to rsqrt conversion. */
38742 case IX86_BUILTIN_SQRTPS_NR:
38743 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38745 case IX86_BUILTIN_SQRTPS_NR256:
38746 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38748 default:
38749 return NULL_TREE;
38753 /* Helper for avx_vpermilps256_operand et al. This is also used by
38754 the expansion functions to turn the parallel back into a mask.
38755 The return value is 0 for no match and the imm8+1 for a match. */
38758 avx_vpermilp_parallel (rtx par, machine_mode mode)
38760 unsigned i, nelt = GET_MODE_NUNITS (mode);
38761 unsigned mask = 0;
38762 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38764 if (XVECLEN (par, 0) != (int) nelt)
38765 return 0;
38767 /* Validate that all of the elements are constants, and not totally
38768 out of range. Copy the data into an integral array to make the
38769 subsequent checks easier. */
38770 for (i = 0; i < nelt; ++i)
38772 rtx er = XVECEXP (par, 0, i);
38773 unsigned HOST_WIDE_INT ei;
38775 if (!CONST_INT_P (er))
38776 return 0;
38777 ei = INTVAL (er);
38778 if (ei >= nelt)
38779 return 0;
38780 ipar[i] = ei;
38783 switch (mode)
38785 case V8DFmode:
38786 /* In the 512-bit DFmode case, we can only move elements within
38787 a 128-bit lane. First fill the second part of the mask,
38788 then fallthru. */
38789 for (i = 4; i < 6; ++i)
38791 if (ipar[i] < 4 || ipar[i] >= 6)
38792 return 0;
38793 mask |= (ipar[i] - 4) << i;
38795 for (i = 6; i < 8; ++i)
38797 if (ipar[i] < 6)
38798 return 0;
38799 mask |= (ipar[i] - 6) << i;
38801 /* FALLTHRU */
38803 case V4DFmode:
38804 /* In the 256-bit DFmode case, we can only move elements within
38805 a 128-bit lane. */
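/* Worked example (illustrative): for V4DF the parallel (1 0 3 2) swaps
   the elements within each 128-bit lane; the loops below build mask = 0x5
   and the function returns 0x5 + 1 = 6.  */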
38806 for (i = 0; i < 2; ++i)
38808 if (ipar[i] >= 2)
38809 return 0;
38810 mask |= ipar[i] << i;
38812 for (i = 2; i < 4; ++i)
38814 if (ipar[i] < 2)
38815 return 0;
38816 mask |= (ipar[i] - 2) << i;
38818 break;
38820 case V16SFmode:
38821 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38822 must mirror the permutation in the lower 256 bits. */
38823 for (i = 0; i < 8; ++i)
38824 if (ipar[i] + 8 != ipar[i + 8])
38825 return 0;
38826 /* FALLTHRU */
38828 case V8SFmode:
38829 /* In the 256-bit SFmode case, we have full freedom of
38830 movement within the low 128-bit lane, but the high 128-bit
38831 lane must mirror the exact same pattern. */
38832 for (i = 0; i < 4; ++i)
38833 if (ipar[i] + 4 != ipar[i + 4])
38834 return 0;
38835 nelt = 4;
38836 /* FALLTHRU */
38838 case V2DFmode:
38839 case V4SFmode:
38840 /* In the 128-bit case, we've full freedom in the placement of
38841 the elements from the source operand. */
38842 for (i = 0; i < nelt; ++i)
38843 mask |= ipar[i] << (i * (nelt / 2));
38844 break;
38846 default:
38847 gcc_unreachable ();
38850 /* Make sure success has a non-zero value by adding one. */
38851 return mask + 1;
38854 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38855 the expansion functions to turn the parallel back into a mask.
38856 The return value is 0 for no match and the imm8+1 for a match. */
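/* Worked example (illustrative): for V4DF a parallel of (2 3 4 5) selects
   the high half of the first operand followed by the low half of the
   second; the code below computes mask = 0x21 and returns 0x21 + 1.  */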
38859 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38861 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38862 unsigned mask = 0;
38863 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38865 if (XVECLEN (par, 0) != (int) nelt)
38866 return 0;
38868 /* Validate that all of the elements are constants, and not totally
38869 out of range. Copy the data into an integral array to make the
38870 subsequent checks easier. */
38871 for (i = 0; i < nelt; ++i)
38873 rtx er = XVECEXP (par, 0, i);
38874 unsigned HOST_WIDE_INT ei;
38876 if (!CONST_INT_P (er))
38877 return 0;
38878 ei = INTVAL (er);
38879 if (ei >= 2 * nelt)
38880 return 0;
38881 ipar[i] = ei;
38884 /* Validate that each half of the permute selects consecutive elements. */
38885 for (i = 0; i < nelt2 - 1; ++i)
38886 if (ipar[i] + 1 != ipar[i + 1])
38887 return 0;
38888 for (i = nelt2; i < nelt - 1; ++i)
38889 if (ipar[i] + 1 != ipar[i + 1])
38890 return 0;
38892 /* Reconstruct the mask. */
38893 for (i = 0; i < 2; ++i)
38895 unsigned e = ipar[i * nelt2];
38896 if (e % nelt2)
38897 return 0;
38898 e /= nelt2;
38899 mask |= e << (i * 4);
38902 /* Make sure success has a non-zero value by adding one. */
38903 return mask + 1;
38906 /* Return a register priority for hard reg REGNO. */
38907 static int
38908 ix86_register_priority (int hard_regno)
38910 /* ebp and r13 as the base always want a displacement, and r12 as the
38911 base always wants an index. So discourage their usage in an
38912 address. */
38913 if (hard_regno == R12_REG || hard_regno == R13_REG)
38914 return 0;
38915 if (hard_regno == BP_REG)
38916 return 1;
38917 /* New x86-64 int registers result in bigger code size. Discourage
38918 them. */
38919 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38920 return 2;
38921 /* New x86-64 SSE registers result in bigger code size. Discourage
38922 them. */
38923 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38924 return 2;
38925 /* Usage of AX register results in smaller code. Prefer it. */
38926 if (hard_regno == AX_REG)
38927 return 4;
38928 return 3;
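/* Illustrative consequence of the priorities above: when choosing a
   general register the allocator prefers eax (4) over e.g. esi (3), which
   is preferred over r8 (2) and ebp (1), while r12 and r13 (0) come last.  */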
38931 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38933 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38934 QImode must go into class Q_REGS.
38935 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38936 movdf to do mem-to-mem moves through integer regs. */
38938 static reg_class_t
38939 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38941 machine_mode mode = GET_MODE (x);
38943 /* We're only allowed to return a subclass of CLASS. Many of the
38944 following checks fail for NO_REGS, so eliminate that early. */
38945 if (regclass == NO_REGS)
38946 return NO_REGS;
38948 /* All classes can load zeros. */
38949 if (x == CONST0_RTX (mode))
38950 return regclass;
38952 /* Force constants into memory if we are loading a (nonzero) constant into
38953 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38954 instructions to load from a constant. */
38955 if (CONSTANT_P (x)
38956 && (MAYBE_MMX_CLASS_P (regclass)
38957 || MAYBE_SSE_CLASS_P (regclass)
38958 || MAYBE_MASK_CLASS_P (regclass)))
38959 return NO_REGS;
38961 /* Floating-point constants need more complex checks. */
38962 if (CONST_DOUBLE_P (x))
38964 /* General regs can load everything. */
38965 if (INTEGER_CLASS_P (regclass))
38966 return regclass;
38968 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38969 zero above. We only want to wind up preferring 80387 registers if
38970 we plan on doing computation with them. */
38971 if (IS_STACK_MODE (mode)
38972 && standard_80387_constant_p (x) > 0)
38974 /* Limit class to FP regs. */
38975 if (FLOAT_CLASS_P (regclass))
38976 return FLOAT_REGS;
38977 else if (regclass == FP_TOP_SSE_REGS)
38978 return FP_TOP_REG;
38979 else if (regclass == FP_SECOND_SSE_REGS)
38980 return FP_SECOND_REG;
38983 return NO_REGS;
38986 /* Prefer SSE regs only, if we can use them for math. */
38987 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38988 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38990 /* Generally when we see PLUS here, it's the function invariant
38991 (plus soft-fp const_int). Which can only be computed into general
38992 regs. */
38993 if (GET_CODE (x) == PLUS)
38994 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38996 /* QImode constants are easy to load, but non-constant QImode data
38997 must go into Q_REGS. */
38998 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39000 if (Q_CLASS_P (regclass))
39001 return regclass;
39002 else if (reg_class_subset_p (Q_REGS, regclass))
39003 return Q_REGS;
39004 else
39005 return NO_REGS;
39008 return regclass;
39011 /* Discourage putting floating-point values in SSE registers unless
39012 SSE math is being used, and likewise for the 387 registers. */
39013 static reg_class_t
39014 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39016 machine_mode mode = GET_MODE (x);
39018 /* Restrict the output reload class to the register bank that we are doing
39019 math on. If we would like not to return a subset of CLASS, reject this
39020 alternative: if reload cannot do this, it will still use its choice. */
39021 mode = GET_MODE (x);
39022 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39023 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39025 if (IS_STACK_MODE (mode))
39027 if (regclass == FP_TOP_SSE_REGS)
39028 return FP_TOP_REG;
39029 else if (regclass == FP_SECOND_SSE_REGS)
39030 return FP_SECOND_REG;
39031 else
39032 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39035 return regclass;
39038 static reg_class_t
39039 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39040 machine_mode mode, secondary_reload_info *sri)
39042 /* Double-word spills from general registers to non-offsettable memory
39043 references (zero-extended addresses) require special handling. */
39044 if (TARGET_64BIT
39045 && MEM_P (x)
39046 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39047 && INTEGER_CLASS_P (rclass)
39048 && !offsettable_memref_p (x))
39050 sri->icode = (in_p
39051 ? CODE_FOR_reload_noff_load
39052 : CODE_FOR_reload_noff_store);
39053 /* Add the cost of moving address to a temporary. */
39054 sri->extra_cost = 1;
39056 return NO_REGS;
39059 /* QImode spills from non-QI registers require
39060 intermediate register on 32bit targets. */
39061 if (mode == QImode
39062 && (MAYBE_MASK_CLASS_P (rclass)
39063 || (!TARGET_64BIT && !in_p
39064 && INTEGER_CLASS_P (rclass)
39065 && MAYBE_NON_Q_CLASS_P (rclass))))
39067 int regno;
39069 if (REG_P (x))
39070 regno = REGNO (x);
39071 else
39072 regno = -1;
39074 if (regno >= FIRST_PSEUDO_REGISTER || SUBREG_P (x))
39075 regno = true_regnum (x);
39077 /* Return Q_REGS if the operand is in memory. */
39078 if (regno == -1)
39079 return Q_REGS;
39082 /* This condition handles the corner case where an expression involving
39083 pointers gets vectorized. We're trying to use the address of a
39084 stack slot as a vector initializer.
39086 (set (reg:V2DI 74 [ vect_cst_.2 ])
39087 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39089 Eventually frame gets turned into sp+offset like this:
39091 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39092 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39093 (const_int 392 [0x188]))))
39095 That later gets turned into:
39097 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39098 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39099 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39101 We'll have the following reload recorded:
39103 Reload 0: reload_in (DI) =
39104 (plus:DI (reg/f:DI 7 sp)
39105 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39106 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39107 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39108 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39109 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39110 reload_reg_rtx: (reg:V2DI 22 xmm1)
39112 Which isn't going to work since SSE instructions can't handle scalar
39113 additions. Returning GENERAL_REGS forces the addition into integer
39114 register and reload can handle subsequent reloads without problems. */
39116 if (in_p && GET_CODE (x) == PLUS
39117 && SSE_CLASS_P (rclass)
39118 && SCALAR_INT_MODE_P (mode))
39119 return GENERAL_REGS;
39121 return NO_REGS;
39124 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39126 static bool
39127 ix86_class_likely_spilled_p (reg_class_t rclass)
39129 switch (rclass)
39131 case AREG:
39132 case DREG:
39133 case CREG:
39134 case BREG:
39135 case AD_REGS:
39136 case SIREG:
39137 case DIREG:
39138 case SSE_FIRST_REG:
39139 case FP_TOP_REG:
39140 case FP_SECOND_REG:
39141 case BND_REGS:
39142 return true;
39144 default:
39145 break;
39148 return false;
39151 /* If we are copying between general and FP registers, we need a memory
39152 location. The same is true for SSE and MMX registers.
39154 To optimize register_move_cost performance, allow inline variant.
39156 The macro can't work reliably when one of the CLASSES is a class containing
39157 registers from multiple units (SSE, MMX, integer). We avoid this by never
39158 combining those units in single alternative in the machine description.
39159 Ensure that this constraint holds to avoid unexpected surprises.
39161 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
39162 enforce these sanity checks. */
39164 static inline bool
39165 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39166 machine_mode mode, int strict)
39168 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39169 return false;
39170 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39171 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39172 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39173 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39174 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39175 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
39177 gcc_assert (!strict || lra_in_progress);
39178 return true;
39181 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39182 return true;
39184 /* Between mask and general, we have moves no larger than word size. */
39185 if ((MAYBE_MASK_CLASS_P (class1) != MAYBE_MASK_CLASS_P (class2))
39186 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39187 return true;
39189 /* ??? This is a lie. We do have moves between mmx/general, and for
39190 mmx/sse2. But by saying we need secondary memory we discourage the
39191 register allocator from using the mmx registers unless needed. */
39192 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39193 return true;
39195 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39197 /* SSE1 doesn't have any direct moves from other classes. */
39198 if (!TARGET_SSE2)
39199 return true;
39201 /* If the target says that inter-unit moves are more expensive
39202 than moving through memory, then don't generate them. */
39203 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39204 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39205 return true;
39207 /* Between SSE and general, we have moves no larger than word size. */
39208 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39209 return true;
39212 return false;
39215 bool
39216 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
39217 machine_mode mode, int strict)
39219 return inline_secondary_memory_needed (class1, class2, mode, strict);
39222 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39224 On the 80386, this is the size of MODE in words,
39225 except in the FP regs, where a single reg is always enough. */
39227 static unsigned char
39228 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39230 if (MAYBE_INTEGER_CLASS_P (rclass))
39232 if (mode == XFmode)
39233 return (TARGET_64BIT ? 2 : 3);
39234 else if (mode == XCmode)
39235 return (TARGET_64BIT ? 4 : 6);
39236 else
39237 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39239 else
39241 if (COMPLEX_MODE_P (mode))
39242 return 2;
39243 else
39244 return 1;
39248 /* Return true if the registers in CLASS cannot represent the change from
39249 modes FROM to TO. */
39251 bool
39252 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
39253 enum reg_class regclass)
39255 if (from == to)
39256 return false;
39258 /* x87 registers can't do subreg at all, as all values are reformatted
39259 to extended precision. */
39260 if (MAYBE_FLOAT_CLASS_P (regclass))
39261 return true;
39263 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39265 /* Vector registers do not support QI or HImode loads. If we don't
39266 disallow a change to these modes, reload will assume it's ok to
39267 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39268 the vec_dupv4hi pattern. */
39269 if (GET_MODE_SIZE (from) < 4)
39270 return true;
39273 return false;
39276 /* Return the cost of moving data of mode M between a
39277 register and memory. A value of 2 is the default; this cost is
39278 relative to those in `REGISTER_MOVE_COST'.
39280 This function is used extensively by register_move_cost that is used to
39281 build tables at startup. Make it inline in this case.
39282 When IN is 2, return maximum of in and out move cost.
39284 If moving between registers and memory is more expensive than
39285 between two registers, you should define this macro to express the
39286 relative cost.
39288 Also model the increased cost of moving QImode registers in
39289 non-Q_REGS classes.
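   Illustrative IN values, matching the uses below: in == 0 asks for the
   store cost, in == 1 for the load cost, and in == 2 for the maximum of
   the two (e.g. ix86_register_move_cost passes 2 when pricing a secondary
   memory round trip).  */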
39291 static inline int
39292 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39293 int in)
39295 int cost;
39296 if (FLOAT_CLASS_P (regclass))
39298 int index;
39299 switch (mode)
39301 case SFmode:
39302 index = 0;
39303 break;
39304 case DFmode:
39305 index = 1;
39306 break;
39307 case XFmode:
39308 index = 2;
39309 break;
39310 default:
39311 return 100;
39313 if (in == 2)
39314 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39315 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39317 if (SSE_CLASS_P (regclass))
39319 int index;
39320 switch (GET_MODE_SIZE (mode))
39322 case 4:
39323 index = 0;
39324 break;
39325 case 8:
39326 index = 1;
39327 break;
39328 case 16:
39329 index = 2;
39330 break;
39331 default:
39332 return 100;
39334 if (in == 2)
39335 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39336 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39338 if (MMX_CLASS_P (regclass))
39340 int index;
39341 switch (GET_MODE_SIZE (mode))
39343 case 4:
39344 index = 0;
39345 break;
39346 case 8:
39347 index = 1;
39348 break;
39349 default:
39350 return 100;
39352 if (in == 2)
39353 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39354 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39356 switch (GET_MODE_SIZE (mode))
39358 case 1:
39359 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39361 if (!in)
39362 return ix86_cost->int_store[0];
39363 if (TARGET_PARTIAL_REG_DEPENDENCY
39364 && optimize_function_for_speed_p (cfun))
39365 cost = ix86_cost->movzbl_load;
39366 else
39367 cost = ix86_cost->int_load[0];
39368 if (in == 2)
39369 return MAX (cost, ix86_cost->int_store[0]);
39370 return cost;
39372 else
39374 if (in == 2)
39375 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39376 if (in)
39377 return ix86_cost->movzbl_load;
39378 else
39379 return ix86_cost->int_store[0] + 4;
39381 break;
39382 case 2:
39383 if (in == 2)
39384 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39385 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39386 default:
39387 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39388 if (mode == TFmode)
39389 mode = XFmode;
39390 if (in == 2)
39391 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39392 else if (in)
39393 cost = ix86_cost->int_load[2];
39394 else
39395 cost = ix86_cost->int_store[2];
39396 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
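/* Out-of-line wrapper around inline_memory_move_cost, presumably installed
   as the memory move cost target hook; the bool IN is mapped to the
   helper's 0/1 convention (the "2 == max of load and store" case is only
   used internally above).  */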
39400 static int
39401 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39402 bool in)
39404 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39408 /* Return the cost of moving data from a register in class CLASS1 to
39409 one in class CLASS2.
39411 It is not required that the cost always equal 2 when FROM is the same as TO;
39412 on some machines it is expensive to move between registers if they are not
39413 general registers. */
39415 static int
39416 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39417 reg_class_t class2_i)
39419 enum reg_class class1 = (enum reg_class) class1_i;
39420 enum reg_class class2 = (enum reg_class) class2_i;
39422 /* In case we require secondary memory, compute the cost of the store
39423 followed by the load. To avoid bad register allocation choices, we need
39424 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39426 if (inline_secondary_memory_needed (class1, class2, mode, 0))
39428 int cost = 1;
39430 cost += inline_memory_move_cost (mode, class1, 2);
39431 cost += inline_memory_move_cost (mode, class2, 2);
39433 /* In case of copying from a general purpose register we may emit multiple
39434 stores followed by a single load, causing a memory size mismatch stall.
39435 Count this as an arbitrarily high cost of 20. */
39436 if (targetm.class_max_nregs (class1, mode)
39437 > targetm.class_max_nregs (class2, mode))
39438 cost += 20;
39440 /* In the case of FP/MMX moves, the registers actually overlap, and we
39441 have to switch modes in order to treat them differently. */
39442 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39443 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39444 cost += 20;
39446 return cost;
39449 /* Moves between SSE/MMX and integer unit are expensive. */
39450 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39451 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39453 /* ??? By keeping the returned value relatively high, we limit the number
39454 of moves between integer and MMX/SSE registers for all targets.
39455 Additionally, the high value prevents a problem with ix86_modes_tieable_p (),
39456 where integer modes in MMX/SSE registers are not tieable
39457 because of missing QImode and HImode moves to, from or between
39458 MMX/SSE registers. */
39459 return MAX (8, ix86_cost->mmxsse_to_integer);
39461 if (MAYBE_FLOAT_CLASS_P (class1))
39462 return ix86_cost->fp_move;
39463 if (MAYBE_SSE_CLASS_P (class1))
39464 return ix86_cost->sse_move;
39465 if (MAYBE_MMX_CLASS_P (class1))
39466 return ix86_cost->mmx_move;
39467 return 2;
39470 /* Return TRUE if hard register REGNO can hold a value of machine-mode
39471 MODE. */
39473 bool
39474 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
39476 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
39477 if (CC_REGNO_P (regno))
39478 return GET_MODE_CLASS (mode) == MODE_CC;
39479 if (GET_MODE_CLASS (mode) == MODE_CC
39480 || GET_MODE_CLASS (mode) == MODE_RANDOM
39481 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39482 return false;
39483 if (STACK_REGNO_P (regno))
39484 return VALID_FP_MODE_P (mode);
39485 if (MASK_REGNO_P (regno))
39486 return (VALID_MASK_REG_MODE (mode)
39487 || (TARGET_AVX512BW
39488 && VALID_MASK_AVX512BW_MODE (mode)));
39489 if (BND_REGNO_P (regno))
39490 return VALID_BND_REG_MODE (mode);
39491 if (SSE_REGNO_P (regno))
39493 /* We implement the move patterns for all vector modes into and
39494 out of SSE registers, even when no operation instructions
39495 are available. */
39497 /* For AVX-512 we allow, regardless of regno:
39498 - XImode
39499 - any 512-bit wide vector mode
39500 - any scalar mode. */
39501 if (TARGET_AVX512F
39502 && (mode == XImode
39503 || VALID_AVX512F_REG_MODE (mode)
39504 || VALID_AVX512F_SCALAR_MODE (mode)))
39505 return true;
39507 /* TODO check for QI/HI scalars. */
39508 /* AVX-512VL allows SSE registers 16+ for 128/256-bit modes. */
39509 if (TARGET_AVX512VL
39510 && (mode == OImode
39511 || mode == TImode
39512 || VALID_AVX256_REG_MODE (mode)
39513 || VALID_AVX512VL_128_REG_MODE (mode)))
39514 return true;
39516 /* xmm16-xmm31 are only available for AVX-512. */
39517 if (EXT_REX_SSE_REGNO_P (regno))
39518 return false;
39520 /* OImode and AVX modes are available only when AVX is enabled. */
39521 return ((TARGET_AVX
39522 && VALID_AVX256_REG_OR_OI_MODE (mode))
39523 || VALID_SSE_REG_MODE (mode)
39524 || VALID_SSE2_REG_MODE (mode)
39525 || VALID_MMX_REG_MODE (mode)
39526 || VALID_MMX_REG_MODE_3DNOW (mode));
39528 if (MMX_REGNO_P (regno))
39530 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39531 so if the register is available at all, then we can move data of
39532 the given mode into or out of it. */
39533 return (VALID_MMX_REG_MODE (mode)
39534 || VALID_MMX_REG_MODE_3DNOW (mode));
39537 if (mode == QImode)
39539 /* Take care with QImode values - they can live in non-QI regs,
39540 but then they do cause partial register stalls. */
39541 if (ANY_QI_REGNO_P (regno))
39542 return true;
39543 if (!TARGET_PARTIAL_REG_STALL)
39544 return true;
39545 /* LRA checks if the hard register is OK for the given mode.
39546 QImode values can live in non-QI regs, so we allow all
39547 registers here. */
39548 if (lra_in_progress)
39549 return true;
39550 return !can_create_pseudo_p ();
39552 /* We handle both integers and floats in the general purpose registers. */
39553 else if (VALID_INT_MODE_P (mode))
39554 return true;
39555 else if (VALID_FP_MODE_P (mode))
39556 return true;
39557 else if (VALID_DFP_MODE_P (mode))
39558 return true;
39559 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39560 on to use that value in smaller contexts, this can easily force a
39561 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39562 supporting DImode, allow it. */
39563 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39564 return true;
39566 return false;
39569 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39570 tieable integer mode. */
39572 static bool
39573 ix86_tieable_integer_mode_p (machine_mode mode)
39575 switch (mode)
39577 case HImode:
39578 case SImode:
39579 return true;
39581 case QImode:
39582 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39584 case DImode:
39585 return TARGET_64BIT;
39587 default:
39588 return false;
39592 /* Return true if MODE1 is accessible in a register that can hold MODE2
39593 without copying. That is, all register classes that can hold MODE2
39594 can also hold MODE1. */
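/* For example, SImode and HImode are tieable on all targets, and V4SImode
   and V2DImode are tieable whenever 16-byte modes fit in SSE registers,
   while SImode and SFmode are not tieable.  */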
39596 bool
39597 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39599 if (mode1 == mode2)
39600 return true;
39602 if (ix86_tieable_integer_mode_p (mode1)
39603 && ix86_tieable_integer_mode_p (mode2))
39604 return true;
39606 /* MODE2 being XFmode implies fp stack or general regs, which means we
39607 can tie any smaller floating point modes to it. Note that we do not
39608 tie this with TFmode. */
39609 if (mode2 == XFmode)
39610 return mode1 == SFmode || mode1 == DFmode;
39612 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39613 that we can tie it with SFmode. */
39614 if (mode2 == DFmode)
39615 return mode1 == SFmode;
39617 /* If MODE2 is only appropriate for an SSE register, then tie with
39618 any other mode acceptable to SSE registers. */
39619 if (GET_MODE_SIZE (mode2) == 32
39620 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39621 return (GET_MODE_SIZE (mode1) == 32
39622 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39623 if (GET_MODE_SIZE (mode2) == 16
39624 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39625 return (GET_MODE_SIZE (mode1) == 16
39626 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39628 /* If MODE2 is appropriate for an MMX register, then tie
39629 with any other mode acceptable to MMX registers. */
39630 if (GET_MODE_SIZE (mode2) == 8
39631 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39632 return (GET_MODE_SIZE (mode1) == 8
39633 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39635 return false;
39638 /* Return the cost of moving between two registers of mode MODE. */
39640 static int
39641 ix86_set_reg_reg_cost (machine_mode mode)
39643 unsigned int units = UNITS_PER_WORD;
39645 switch (GET_MODE_CLASS (mode))
39647 default:
39648 break;
39650 case MODE_CC:
39651 units = GET_MODE_SIZE (CCmode);
39652 break;
39654 case MODE_FLOAT:
39655 if ((TARGET_SSE && mode == TFmode)
39656 || (TARGET_80387 && mode == XFmode)
39657 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39658 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39659 units = GET_MODE_SIZE (mode);
39660 break;
39662 case MODE_COMPLEX_FLOAT:
39663 if ((TARGET_SSE && mode == TCmode)
39664 || (TARGET_80387 && mode == XCmode)
39665 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39666 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39667 units = GET_MODE_SIZE (mode);
39668 break;
39670 case MODE_VECTOR_INT:
39671 case MODE_VECTOR_FLOAT:
39672 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39673 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39674 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39675 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39676 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39677 units = GET_MODE_SIZE (mode);
39680 /* Return the cost of moving between two registers of mode MODE,
39681 assuming that the move will be in pieces of at most UNITS bytes. */
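/* E.g. a DImode move on a 32-bit target is done as two word-sized pieces,
   giving COSTS_N_INSNS (2).  */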
39682 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
39685 /* Compute a (partial) cost for rtx X. Return true if the complete
39686 cost has been computed, and false if subexpressions should be
39687 scanned. In either case, *TOTAL contains the cost result. */
39689 static bool
39690 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39691 int *total, bool speed)
39693 rtx mask;
39694 enum rtx_code code = GET_CODE (x);
39695 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39696 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39698 switch (code)
39700 case SET:
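/* A plain register-to-register move (or clearing a register with zero)
   is costed purely by the mode being copied; other SETs are costed by
   scanning their subexpressions.  */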
39701 if (register_operand (SET_DEST (x), VOIDmode)
39702 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39704 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39705 return true;
39707 return false;
39709 case CONST_INT:
39710 case CONST:
39711 case LABEL_REF:
39712 case SYMBOL_REF:
39713 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39714 *total = 3;
39715 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39716 *total = 2;
39717 else if (flag_pic && SYMBOLIC_CONST (x)
39718 && !(TARGET_64BIT
39719 && (GET_CODE (x) == LABEL_REF
39720 || (GET_CODE (x) == SYMBOL_REF
39721 && SYMBOL_REF_LOCAL_P (x))))
39722 /* Use 0 cost for CONST to improve its propagation. */
39723 && (TARGET_64BIT || GET_CODE (x) != CONST))
39724 *total = 1;
39725 else
39726 *total = 0;
39727 return true;
39729 case CONST_DOUBLE:
39730 if (IS_STACK_MODE (mode))
39731 switch (standard_80387_constant_p (x))
39733 case -1:
39734 case 0:
39735 break;
39736 case 1: /* 0.0 */
39737 *total = 1;
39738 return true;
39739 default: /* Other constants */
39740 *total = 2;
39741 return true;
39743 /* FALLTHRU */
39745 case CONST_VECTOR:
39746 switch (standard_sse_constant_p (x, mode))
39748 case 0:
39749 break;
39750 case 1: /* 0: xor eliminates false dependency */
39751 *total = 0;
39752 return true;
39753 default: /* -1: cmp contains false dependency */
39754 *total = 1;
39755 return true;
39757 /* FALLTHRU */
39759 case CONST_WIDE_INT:
39760 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39761 it'll probably end up. Add a penalty for size. */
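/* For example, a TImode constant costs COSTS_N_INSNS (1) + 2, plus one
   more when generating 32-bit PIC code.  */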
39762 *total = (COSTS_N_INSNS (1)
39763 + (!TARGET_64BIT && flag_pic)
39764 + (GET_MODE_SIZE (mode) <= 4
39765 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39766 return true;
39768 case ZERO_EXTEND:
39769 /* Zero extension is often completely free on x86_64, so make
39770 it as cheap as possible. */
39771 if (TARGET_64BIT && mode == DImode
39772 && GET_MODE (XEXP (x, 0)) == SImode)
39773 *total = 1;
39774 else if (TARGET_ZERO_EXTEND_WITH_AND)
39775 *total = cost->add;
39776 else
39777 *total = cost->movzx;
39778 return false;
39780 case SIGN_EXTEND:
39781 *total = cost->movsx;
39782 return false;
39784 case ASHIFT:
39785 if (SCALAR_INT_MODE_P (mode)
39786 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39787 && CONST_INT_P (XEXP (x, 1)))
39789 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39790 if (value == 1)
39792 *total = cost->add;
39793 return false;
39795 if ((value == 2 || value == 3)
39796 && cost->lea <= cost->shift_const)
39798 *total = cost->lea;
39799 return false;
39802 /* FALLTHRU */
39804 case ROTATE:
39805 case ASHIFTRT:
39806 case LSHIFTRT:
39807 case ROTATERT:
39808 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39810 /* ??? Should be SSE vector operation cost. */
39811 /* At least for published AMD latencies, this really is the same
39812 as the latency for a simple fpu operation like fabs. */
39813 /* V*QImode is emulated with 1-11 insns. */
39814 if (mode == V16QImode || mode == V32QImode)
39816 int count = 11;
39817 if (TARGET_XOP && mode == V16QImode)
39819 /* For XOP we use vpshab, which requires a broadcast of the
39820 value to the variable shift insn. For constants this
39821 means a V16Q const in mem; even when we can perform the
39822 shift with one insn, set the cost to prefer paddb. */
39823 if (CONSTANT_P (XEXP (x, 1)))
39825 *total = (cost->fabs
39826 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
39827 + (speed ? 2 : COSTS_N_BYTES (16)));
39828 return true;
39830 count = 3;
39832 else if (TARGET_SSSE3)
39833 count = 7;
39834 *total = cost->fabs * count;
39836 else
39837 *total = cost->fabs;
39839 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39841 if (CONST_INT_P (XEXP (x, 1)))
39843 if (INTVAL (XEXP (x, 1)) > 32)
39844 *total = cost->shift_const + COSTS_N_INSNS (2);
39845 else
39846 *total = cost->shift_const * 2;
39848 else
39850 if (GET_CODE (XEXP (x, 1)) == AND)
39851 *total = cost->shift_var * 2;
39852 else
39853 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
39856 else
39858 if (CONST_INT_P (XEXP (x, 1)))
39859 *total = cost->shift_const;
39860 else if (SUBREG_P (XEXP (x, 1))
39861 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
39863 /* Return the cost after shift-and truncation. */
39864 *total = cost->shift_var;
39865 return true;
39867 else
39868 *total = cost->shift_var;
39870 return false;
39872 case FMA:
39874 rtx sub;
39876 gcc_assert (FLOAT_MODE_P (mode));
39877 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39879 /* ??? SSE scalar/vector cost should be used here. */
39880 /* ??? Bald assumption that fma has the same cost as fmul. */
39881 *total = cost->fmul;
39882 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39884 /* A negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39885 sub = XEXP (x, 0);
39886 if (GET_CODE (sub) == NEG)
39887 sub = XEXP (sub, 0);
39888 *total += rtx_cost (sub, mode, FMA, 0, speed);
39890 sub = XEXP (x, 2);
39891 if (GET_CODE (sub) == NEG)
39892 sub = XEXP (sub, 0);
39893 *total += rtx_cost (sub, mode, FMA, 2, speed);
39894 return true;
39897 case MULT:
39898 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39900 /* ??? SSE scalar cost should be used here. */
39901 *total = cost->fmul;
39902 return false;
39904 else if (X87_FLOAT_MODE_P (mode))
39906 *total = cost->fmul;
39907 return false;
39909 else if (FLOAT_MODE_P (mode))
39911 /* ??? SSE vector cost should be used here. */
39912 *total = cost->fmul;
39913 return false;
39915 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39917 /* V*QImode is emulated with 7-13 insns. */
39918 if (mode == V16QImode || mode == V32QImode)
39920 int extra = 11;
39921 if (TARGET_XOP && mode == V16QImode)
39922 extra = 5;
39923 else if (TARGET_SSSE3)
39924 extra = 6;
39925 *total = cost->fmul * 2 + cost->fabs * extra;
39927 /* V*DImode is emulated with 5-8 insns. */
39928 else if (mode == V2DImode || mode == V4DImode)
39930 if (TARGET_XOP && mode == V2DImode)
39931 *total = cost->fmul * 2 + cost->fabs * 3;
39932 else
39933 *total = cost->fmul * 3 + cost->fabs * 5;
39935 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39936 insns, including two PMULUDQ. */
39937 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39938 *total = cost->fmul * 2 + cost->fabs * 5;
39939 else
39940 *total = cost->fmul;
39941 return false;
39943 else
39945 rtx op0 = XEXP (x, 0);
39946 rtx op1 = XEXP (x, 1);
39947 int nbits;
39948 if (CONST_INT_P (XEXP (x, 1)))
39950 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
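/* Count the set bits in the constant multiplier (Kernighan's trick);
   each set bit contributes one mult_bit step to the cost below.  */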
39951 for (nbits = 0; value != 0; value &= value - 1)
39952 nbits++;
39954 else
39955 /* This is arbitrary. */
39956 nbits = 7;
39958 /* Compute costs correctly for widening multiplication. */
39959 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39960 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39961 == GET_MODE_SIZE (mode))
39963 int is_mulwiden = 0;
39964 machine_mode inner_mode = GET_MODE (op0);
39966 if (GET_CODE (op0) == GET_CODE (op1))
39967 is_mulwiden = 1, op1 = XEXP (op1, 0);
39968 else if (CONST_INT_P (op1))
39970 if (GET_CODE (op0) == SIGN_EXTEND)
39971 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39972 == INTVAL (op1);
39973 else
39974 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39977 if (is_mulwiden)
39978 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39981 *total = (cost->mult_init[MODE_INDEX (mode)]
39982 + nbits * cost->mult_bit
39983 + rtx_cost (op0, mode, outer_code, opno, speed)
39984 + rtx_cost (op1, mode, outer_code, opno, speed));
39986 return true;
39989 case DIV:
39990 case UDIV:
39991 case MOD:
39992 case UMOD:
39993 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39994 /* ??? SSE cost should be used here. */
39995 *total = cost->fdiv;
39996 else if (X87_FLOAT_MODE_P (mode))
39997 *total = cost->fdiv;
39998 else if (FLOAT_MODE_P (mode))
39999 /* ??? SSE vector cost should be used here. */
40000 *total = cost->fdiv;
40001 else
40002 *total = cost->divide[MODE_INDEX (mode)];
40003 return false;
40005 case PLUS:
40006 if (GET_MODE_CLASS (mode) == MODE_INT
40007 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40009 if (GET_CODE (XEXP (x, 0)) == PLUS
40010 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40011 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40012 && CONSTANT_P (XEXP (x, 1)))
40014 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40015 if (val == 2 || val == 4 || val == 8)
40017 *total = cost->lea;
40018 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40019 outer_code, opno, speed);
40020 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40021 outer_code, opno, speed);
40022 *total += rtx_cost (XEXP (x, 1), mode,
40023 outer_code, opno, speed);
40024 return true;
40027 else if (GET_CODE (XEXP (x, 0)) == MULT
40028 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40030 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40031 if (val == 2 || val == 4 || val == 8)
40033 *total = cost->lea;
40034 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40035 outer_code, opno, speed);
40036 *total += rtx_cost (XEXP (x, 1), mode,
40037 outer_code, opno, speed);
40038 return true;
40041 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40043 *total = cost->lea;
40044 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40045 outer_code, opno, speed);
40046 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40047 outer_code, opno, speed);
40048 *total += rtx_cost (XEXP (x, 1), mode,
40049 outer_code, opno, speed);
40050 return true;
40053 /* FALLTHRU */
40055 case MINUS:
40056 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40058 /* ??? SSE cost should be used here. */
40059 *total = cost->fadd;
40060 return false;
40062 else if (X87_FLOAT_MODE_P (mode))
40064 *total = cost->fadd;
40065 return false;
40067 else if (FLOAT_MODE_P (mode))
40069 /* ??? SSE vector cost should be used here. */
40070 *total = cost->fadd;
40071 return false;
40073 /* FALLTHRU */
40075 case AND:
40076 case IOR:
40077 case XOR:
40078 if (GET_MODE_CLASS (mode) == MODE_INT
40079 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40081 *total = (cost->add * 2
40082 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40083 << (GET_MODE (XEXP (x, 0)) != DImode))
40084 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40085 << (GET_MODE (XEXP (x, 1)) != DImode)));
40086 return true;
40088 /* FALLTHRU */
40090 case NEG:
40091 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40093 /* ??? SSE cost should be used here. */
40094 *total = cost->fchs;
40095 return false;
40097 else if (X87_FLOAT_MODE_P (mode))
40099 *total = cost->fchs;
40100 return false;
40102 else if (FLOAT_MODE_P (mode))
40104 /* ??? SSE vector cost should be used here. */
40105 *total = cost->fchs;
40106 return false;
40108 /* FALLTHRU */
40110 case NOT:
40111 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40113 /* ??? Should be SSE vector operation cost. */
40114 /* At least for published AMD latencies, this really is the same
40115 as the latency for a simple fpu operation like fabs. */
40116 *total = cost->fabs;
40118 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40119 *total = cost->add * 2;
40120 else
40121 *total = cost->add;
40122 return false;
40124 case COMPARE:
40125 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40126 && XEXP (XEXP (x, 0), 1) == const1_rtx
40127 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40128 && XEXP (x, 1) == const0_rtx)
40130 /* This kind of construct is implemented using test[bwl].
40131 Treat it as if we had an AND. */
40132 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40133 *total = (cost->add
40134 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40135 opno, speed)
40136 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40137 return true;
40140 /* The embedded comparison operand is completely free. */
40141 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40142 && XEXP (x, 1) == const0_rtx)
40143 *total = 0;
40145 return false;
40147 case FLOAT_EXTEND:
40148 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40149 *total = 0;
40150 return false;
40152 case ABS:
40153 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40154 /* ??? SSE cost should be used here. */
40155 *total = cost->fabs;
40156 else if (X87_FLOAT_MODE_P (mode))
40157 *total = cost->fabs;
40158 else if (FLOAT_MODE_P (mode))
40159 /* ??? SSE vector cost should be used here. */
40160 *total = cost->fabs;
40161 return false;
40163 case SQRT:
40164 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40165 /* ??? SSE cost should be used here. */
40166 *total = cost->fsqrt;
40167 else if (X87_FLOAT_MODE_P (mode))
40168 *total = cost->fsqrt;
40169 else if (FLOAT_MODE_P (mode))
40170 /* ??? SSE vector cost should be used here. */
40171 *total = cost->fsqrt;
40172 return false;
40174 case UNSPEC:
40175 if (XINT (x, 1) == UNSPEC_TP)
40176 *total = 0;
40177 return false;
40179 case VEC_SELECT:
40180 case VEC_CONCAT:
40181 case VEC_DUPLICATE:
40182 /* ??? Assume all of these vector manipulation patterns are
40183 recognizable, in which case they all have pretty much the
40184 same cost. */
40185 *total = cost->fabs;
40186 return true;
40187 case VEC_MERGE:
40188 mask = XEXP (x, 2);
40189 /* This is a masked instruction; assume the same cost
40190 as the non-masked variant. */
40191 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40192 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40193 else
40194 *total = cost->fabs;
40195 return true;
40197 default:
40198 return false;
40202 #if TARGET_MACHO
40204 static int current_machopic_label_num;
40206 /* Given a symbol name and its associated stub, write out the
40207 definition of the stub. */
40209 void
40210 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40212 unsigned int length;
40213 char *binder_name, *symbol_name, lazy_ptr_name[32];
40214 int label = ++current_machopic_label_num;
40216 /* For 64-bit we shouldn't get here. */
40217 gcc_assert (!TARGET_64BIT);
40219 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40220 symb = targetm.strip_name_encoding (symb);
40222 length = strlen (stub);
40223 binder_name = XALLOCAVEC (char, length + 32);
40224 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40226 length = strlen (symb);
40227 symbol_name = XALLOCAVEC (char, length + 32);
40228 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40230 sprintf (lazy_ptr_name, "L%d$lz", label);
40232 if (MACHOPIC_ATT_STUB)
40233 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40234 else if (MACHOPIC_PURE)
40235 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40236 else
40237 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40239 fprintf (file, "%s:\n", stub);
40240 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40242 if (MACHOPIC_ATT_STUB)
40244 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40246 else if (MACHOPIC_PURE)
40248 /* PIC stub. */
40249 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40250 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40251 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40252 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40253 label, lazy_ptr_name, label);
40254 fprintf (file, "\tjmp\t*%%ecx\n");
40256 else
40257 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40259 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40260 it needs no stub-binding-helper. */
40261 if (MACHOPIC_ATT_STUB)
40262 return;
40264 fprintf (file, "%s:\n", binder_name);
40266 if (MACHOPIC_PURE)
40268 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40269 fprintf (file, "\tpushl\t%%ecx\n");
40271 else
40272 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40274 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40276 /* N.B. Keep the correspondence of these
40277 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40278 old-pic/new-pic/non-pic stubs; altering this will break
40279 compatibility with existing dylibs. */
40280 if (MACHOPIC_PURE)
40282 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40283 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40285 else
40286 /* 16-byte -mdynamic-no-pic stub. */
40287 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40289 fprintf (file, "%s:\n", lazy_ptr_name);
40290 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40291 fprintf (file, ASM_LONG "%s\n", binder_name);
40293 #endif /* TARGET_MACHO */
40295 /* Order the registers for register allocator. */
40297 void
40298 x86_order_regs_for_local_alloc (void)
40300 int pos = 0;
40301 int i;
40303 /* First allocate the local general purpose registers. */
40304 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40305 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40306 reg_alloc_order [pos++] = i;
40308 /* Global general purpose registers. */
40309 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40310 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40311 reg_alloc_order [pos++] = i;
40313 /* x87 registers come first in case we are doing FP math
40314 using them. */
40315 if (!TARGET_SSE_MATH)
40316 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40317 reg_alloc_order [pos++] = i;
40319 /* SSE registers. */
40320 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40321 reg_alloc_order [pos++] = i;
40322 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40323 reg_alloc_order [pos++] = i;
40325 /* Extended REX SSE registers. */
40326 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40327 reg_alloc_order [pos++] = i;
41329 /* Mask registers. */
40330 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40331 reg_alloc_order [pos++] = i;
40333 /* MPX bound registers. */
40334 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40335 reg_alloc_order [pos++] = i;
40337 /* x87 registers. */
40338 if (TARGET_SSE_MATH)
40339 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40340 reg_alloc_order [pos++] = i;
40342 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40343 reg_alloc_order [pos++] = i;
40345 /* Initialize the rest of the array, as we do not allocate some registers
40346 at all. */
40347 while (pos < FIRST_PSEUDO_REGISTER)
40348 reg_alloc_order [pos++] = 0;
40351 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40352 in struct attribute_spec handler. */
40353 static tree
40354 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
40355 tree args,
40356 int,
40357 bool *no_add_attrs)
40359 if (TREE_CODE (*node) != FUNCTION_TYPE
40360 && TREE_CODE (*node) != METHOD_TYPE
40361 && TREE_CODE (*node) != FIELD_DECL
40362 && TREE_CODE (*node) != TYPE_DECL)
40364 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40365 name);
40366 *no_add_attrs = true;
40367 return NULL_TREE;
40369 if (TARGET_64BIT)
40371 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40372 name);
40373 *no_add_attrs = true;
40374 return NULL_TREE;
40376 if (is_attribute_p ("callee_pop_aggregate_return", name))
40378 tree cst;
40380 cst = TREE_VALUE (args);
40381 if (TREE_CODE (cst) != INTEGER_CST)
40383 warning (OPT_Wattributes,
40384 "%qE attribute requires an integer constant argument",
40385 name);
40386 *no_add_attrs = true;
40388 else if (compare_tree_int (cst, 0) != 0
40389 && compare_tree_int (cst, 1) != 0)
40391 warning (OPT_Wattributes,
40392 "argument to %qE attribute is neither zero, nor one",
40393 name);
40394 *no_add_attrs = true;
40397 return NULL_TREE;
40400 return NULL_TREE;
40403 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
40404 struct attribute_spec.handler. */
40405 static tree
40406 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40407 bool *no_add_attrs)
40409 if (TREE_CODE (*node) != FUNCTION_TYPE
40410 && TREE_CODE (*node) != METHOD_TYPE
40411 && TREE_CODE (*node) != FIELD_DECL
40412 && TREE_CODE (*node) != TYPE_DECL)
40414 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40415 name);
40416 *no_add_attrs = true;
40417 return NULL_TREE;
40420 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
40421 if (is_attribute_p ("ms_abi", name))
40423 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40425 error ("ms_abi and sysv_abi attributes are not compatible");
40428 return NULL_TREE;
40430 else if (is_attribute_p ("sysv_abi", name))
40432 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40434 error ("ms_abi and sysv_abi attributes are not compatible");
40437 return NULL_TREE;
40440 return NULL_TREE;
40443 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40444 struct attribute_spec.handler. */
40445 static tree
40446 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40447 bool *no_add_attrs)
40449 tree *type = NULL;
40450 if (DECL_P (*node))
40452 if (TREE_CODE (*node) == TYPE_DECL)
40453 type = &TREE_TYPE (*node);
40455 else
40456 type = node;
40458 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40460 warning (OPT_Wattributes, "%qE attribute ignored",
40461 name);
40462 *no_add_attrs = true;
40465 else if ((is_attribute_p ("ms_struct", name)
40466 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40467 || ((is_attribute_p ("gcc_struct", name)
40468 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40470 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40471 name);
40472 *no_add_attrs = true;
40475 return NULL_TREE;
40478 static tree
40479 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40480 bool *no_add_attrs)
40482 if (TREE_CODE (*node) != FUNCTION_DECL)
40484 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40485 name);
40486 *no_add_attrs = true;
40488 return NULL_TREE;
40491 static tree
40492 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40493 int, bool *)
40495 return NULL_TREE;
40498 static tree
40499 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40501 /* DECL_RESULT and DECL_ARGUMENTS do not exist yet at this point,
40502 but the function type contains argument and return type data. */
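/* As a sketch of what the checks below accept, e.g.
   void handler (void *frame)  or, with an error code,
   void handler (void *frame, unsigned long error_code)  on 64-bit targets;
   "handler" here is just an illustrative name.  */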
40503 tree func_type = *node;
40504 tree return_type = TREE_TYPE (func_type);
40506 int nargs = 0;
40507 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40508 while (current_arg_type
40509 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40511 if (nargs == 0)
40513 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40514 error ("interrupt service routine should have a pointer "
40515 "as the first argument");
40517 else if (nargs == 1)
40519 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40520 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40521 error ("interrupt service routine should have unsigned %s"
40522 "int as the second argument",
40523 TARGET_64BIT
40524 ? (TARGET_X32 ? "long long " : "long ")
40525 : "");
40527 nargs++;
40528 current_arg_type = TREE_CHAIN (current_arg_type);
40530 if (!nargs || nargs > 2)
40531 error ("interrupt service routine can only have a pointer argument "
40532 "and an optional integer argument");
40533 if (! VOID_TYPE_P (return_type))
40534 error ("interrupt service routine cannot have a non-void return value");
40536 return NULL_TREE;
40539 static bool
40540 ix86_ms_bitfield_layout_p (const_tree record_type)
40542 return ((TARGET_MS_BITFIELD_LAYOUT
40543 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40544 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40547 /* Returns an expression indicating where the this parameter is
40548 located on entry to the FUNCTION. */
40550 static rtx
40551 x86_this_parameter (tree function)
40553 tree type = TREE_TYPE (function);
40554 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40555 int nregs;
40557 if (TARGET_64BIT)
40559 const int *parm_regs;
40561 if (ix86_function_type_abi (type) == MS_ABI)
40562 parm_regs = x86_64_ms_abi_int_parameter_registers;
40563 else
40564 parm_regs = x86_64_int_parameter_registers;
40565 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40568 nregs = ix86_function_regparm (type, function);
40570 if (nregs > 0 && !stdarg_p (type))
40572 int regno;
40573 unsigned int ccvt = ix86_get_callcvt (type);
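/* Select where the this pointer lives for the 32-bit register-passing
   conventions: fastcall uses %ecx (%edx for aggregate returns), thiscall
   uses %ecx but takes this from the stack for aggregate returns, and
   plain regparm uses %eax (%edx for aggregate returns, or the stack when
   only one register is available).  */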
40575 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40576 regno = aggr ? DX_REG : CX_REG;
40577 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40579 regno = CX_REG;
40580 if (aggr)
40581 return gen_rtx_MEM (SImode,
40582 plus_constant (Pmode, stack_pointer_rtx, 4));
40584 else
40586 regno = AX_REG;
40587 if (aggr)
40589 regno = DX_REG;
40590 if (nregs == 1)
40591 return gen_rtx_MEM (SImode,
40592 plus_constant (Pmode,
40593 stack_pointer_rtx, 4));
40596 return gen_rtx_REG (SImode, regno);
40599 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40600 aggr ? 8 : 4));
40603 /* Determine whether x86_output_mi_thunk can succeed. */
40605 static bool
40606 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40607 const_tree function)
40609 /* 64-bit can handle anything. */
40610 if (TARGET_64BIT)
40611 return true;
40613 /* For 32-bit, everything's fine if we have one free register. */
40614 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40615 return true;
40617 /* Need a free register for vcall_offset. */
40618 if (vcall_offset)
40619 return false;
40621 /* Need a free register for GOT references. */
40622 if (flag_pic && !targetm.binds_local_p (function))
40623 return false;
40625 /* Otherwise ok. */
40626 return true;
40629 /* Output the assembler code for a thunk function. THUNK_DECL is the
40630 declaration for the thunk function itself, FUNCTION is the decl for
40631 the target function. DELTA is an immediate constant offset to be
40632 added to THIS. If VCALL_OFFSET is nonzero, the word at
40633 *(*this + vcall_offset) should be added to THIS. */
40635 static void
40636 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40637 HOST_WIDE_INT vcall_offset, tree function)
40639 rtx this_param = x86_this_parameter (function);
40640 rtx this_reg, tmp, fnaddr;
40641 unsigned int tmp_regno;
40642 rtx_insn *insn;
40644 if (TARGET_64BIT)
40645 tmp_regno = R10_REG;
40646 else
40648 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40649 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40650 tmp_regno = AX_REG;
40651 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40652 tmp_regno = DX_REG;
40653 else
40654 tmp_regno = CX_REG;
40657 emit_note (NOTE_INSN_PROLOGUE_END);
40659 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40660 pull it in now and let DELTA benefit. */
40661 if (REG_P (this_param))
40662 this_reg = this_param;
40663 else if (vcall_offset)
40665 /* Put the this parameter into %eax. */
40666 this_reg = gen_rtx_REG (Pmode, AX_REG);
40667 emit_move_insn (this_reg, this_param);
40669 else
40670 this_reg = NULL_RTX;
40672 /* Adjust the this parameter by a fixed constant. */
40673 if (delta)
40675 rtx delta_rtx = GEN_INT (delta);
40676 rtx delta_dst = this_reg ? this_reg : this_param;
40678 if (TARGET_64BIT)
40680 if (!x86_64_general_operand (delta_rtx, Pmode))
40682 tmp = gen_rtx_REG (Pmode, tmp_regno);
40683 emit_move_insn (tmp, delta_rtx);
40684 delta_rtx = tmp;
40688 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40691 /* Adjust the this parameter by a value stored in the vtable. */
40692 if (vcall_offset)
40694 rtx vcall_addr, vcall_mem, this_mem;
40696 tmp = gen_rtx_REG (Pmode, tmp_regno);
40698 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40699 if (Pmode != ptr_mode)
40700 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40701 emit_move_insn (tmp, this_mem);
40703 /* Adjust the this parameter. */
40704 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40705 if (TARGET_64BIT
40706 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40708 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40709 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40710 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40713 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40714 if (Pmode != ptr_mode)
40715 emit_insn (gen_addsi_1_zext (this_reg,
40716 gen_rtx_REG (ptr_mode,
40717 REGNO (this_reg)),
40718 vcall_mem));
40719 else
40720 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40723 /* If necessary, drop THIS back to its stack slot. */
40724 if (this_reg && this_reg != this_param)
40725 emit_move_insn (this_param, this_reg);
40727 fnaddr = XEXP (DECL_RTL (function), 0);
40728 if (TARGET_64BIT)
40730 if (!flag_pic || targetm.binds_local_p (function)
40731 || TARGET_PECOFF)
40733 else
40735 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40736 tmp = gen_rtx_CONST (Pmode, tmp);
40737 fnaddr = gen_const_mem (Pmode, tmp);
40740 else
40742 if (!flag_pic || targetm.binds_local_p (function))
40744 #if TARGET_MACHO
40745 else if (TARGET_MACHO)
40747 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40748 fnaddr = XEXP (fnaddr, 0);
40750 #endif /* TARGET_MACHO */
40751 else
40753 tmp = gen_rtx_REG (Pmode, CX_REG);
40754 output_set_got (tmp, NULL_RTX);
40756 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40757 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40758 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40759 fnaddr = gen_const_mem (Pmode, fnaddr);
40763 /* Our sibling call patterns do not allow memories, because we have no
40764 predicate that can distinguish between frame and non-frame memory.
40765 For our purposes here, we can get away with (ab)using a jump pattern,
40766 because we're going to do no optimization. */
40767 if (MEM_P (fnaddr))
40769 if (sibcall_insn_operand (fnaddr, word_mode))
40771 fnaddr = XEXP (DECL_RTL (function), 0);
40772 tmp = gen_rtx_MEM (QImode, fnaddr);
40773 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40774 tmp = emit_call_insn (tmp);
40775 SIBLING_CALL_P (tmp) = 1;
40777 else
40778 emit_jump_insn (gen_indirect_jump (fnaddr));
40780 else
40782 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40784 // CM_LARGE_PIC always uses a pseudo PIC register, which is
40785 // uninitialized. Since FUNCTION is local and calling it
40786 // doesn't go through the PLT, we use scratch register %r11 as
40787 // the PIC register and initialize it here.
40788 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40789 ix86_init_large_pic_reg (tmp_regno);
40790 fnaddr = legitimize_pic_address (fnaddr,
40791 gen_rtx_REG (Pmode, tmp_regno));
40794 if (!sibcall_insn_operand (fnaddr, word_mode))
40796 tmp = gen_rtx_REG (word_mode, tmp_regno);
40797 if (GET_MODE (fnaddr) != word_mode)
40798 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40799 emit_move_insn (tmp, fnaddr);
40800 fnaddr = tmp;
40803 tmp = gen_rtx_MEM (QImode, fnaddr);
40804 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40805 tmp = emit_call_insn (tmp);
40806 SIBLING_CALL_P (tmp) = 1;
40808 emit_barrier ();
40810 /* Emit just enough of rest_of_compilation to get the insns emitted.
40811 Note that use_thunk calls assemble_start_function et al. */
40812 insn = get_insns ();
40813 shorten_branches (insn);
40814 final_start_function (insn, file, 1);
40815 final (insn, file, 1);
40816 final_end_function ();
40819 static void
40820 x86_file_start (void)
40822 default_file_start ();
40823 if (TARGET_16BIT)
40824 fputs ("\t.code16gcc\n", asm_out_file);
40825 #if TARGET_MACHO
40826 darwin_file_start ();
40827 #endif
40828 if (X86_FILE_START_VERSION_DIRECTIVE)
40829 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40830 if (X86_FILE_START_FLTUSED)
40831 fputs ("\t.global\t__fltused\n", asm_out_file);
40832 if (ix86_asm_dialect == ASM_INTEL)
40833 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
40837 x86_field_alignment (tree field, int computed)
40839 machine_mode mode;
40840 tree type = TREE_TYPE (field);
40842 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40843 return computed;
40844 if (TARGET_IAMCU)
40845 return iamcu_alignment (type, computed);
40846 mode = TYPE_MODE (strip_array_types (type));
40847 if (mode == DFmode || mode == DCmode
40848 || GET_MODE_CLASS (mode) == MODE_INT
40849 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40850 return MIN (32, computed);
40851 return computed;
40854 /* Print call to TARGET to FILE. */
40856 static void
40857 x86_print_call_or_nop (FILE *file, const char *target)
40859 if (flag_nop_mcount)
40860 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
40861 else
40862 fprintf (file, "1:\tcall\t%s\n", target);
40865 /* Output assembler code to FILE to increment profiler label # LABELNO
40866 for profiling a function entry. */
40867 void
40868 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40870 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40871 : MCOUNT_NAME);
40872 if (TARGET_64BIT)
40874 #ifndef NO_PROFILE_COUNTERS
40875 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40876 #endif
40878 if (!TARGET_PECOFF && flag_pic)
40879 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40880 else
40881 x86_print_call_or_nop (file, mcount_name);
40883 else if (flag_pic)
40885 #ifndef NO_PROFILE_COUNTERS
40886 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40887 LPREFIX, labelno);
40888 #endif
40889 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40891 else
40893 #ifndef NO_PROFILE_COUNTERS
40894 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40895 LPREFIX, labelno);
40896 #endif
40897 x86_print_call_or_nop (file, mcount_name);
40900 if (flag_record_mcount)
40902 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40903 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40904 fprintf (file, "\t.previous\n");
40908 /* We don't have exact information about the insn sizes, but we may assume
40909 quite safely that we are informed about all 1 byte insns and memory
40910 address sizes. This is enough to eliminate unnecessary padding in
40911 99% of cases. */
40913 static int
40914 min_insn_size (rtx_insn *insn)
40916 int l = 0, len;
40918 if (!INSN_P (insn) || !active_insn_p (insn))
40919 return 0;
40921 /* Discard alignments we've emitted and jump instructions. */
40922 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40923 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40924 return 0;
40926 /* Important case - calls are always 5 bytes.
40927 It is common to have many calls in a row. */
40928 if (CALL_P (insn)
40929 && symbolic_reference_mentioned_p (PATTERN (insn))
40930 && !SIBLING_CALL_P (insn))
40931 return 5;
40932 len = get_attr_length (insn);
40933 if (len <= 1)
40934 return 1;
40936 /* For normal instructions we rely on get_attr_length being exact,
40937 with a few exceptions. */
40938 if (!JUMP_P (insn))
40940 enum attr_type type = get_attr_type (insn);
40942 switch (type)
40944 case TYPE_MULTI:
40945 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40946 || asm_noperands (PATTERN (insn)) >= 0)
40947 return 0;
40948 break;
40949 case TYPE_OTHER:
40950 case TYPE_FCMP:
40951 break;
40952 default:
40953 /* Otherwise trust get_attr_length. */
40954 return len;
40957 l = get_attr_length_address (insn);
40958 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40959 l = 4;
40961 if (l)
40962 return 1+l;
40963 else
40964 return 2;
40967 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40969 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
40970 16-byte window. */
40972 static void
40973 ix86_avoid_jump_mispredicts (void)
40975 rtx_insn *insn, *start = get_insns ();
40976 int nbytes = 0, njumps = 0;
40977 bool isjump = false;
40979 /* Look for all minimal intervals of instructions containing 4 jumps.
40980 The intervals are bounded by START and INSN. NBYTES is the total
40981 size of the instructions in the interval, including INSN and not including
40982 START. When NBYTES is smaller than 16, it is possible that the ends of
40983 START and INSN land in the same 16-byte page.
40985 The smallest page offset at which INSN can start occurs when START
40986 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
40987 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
40989 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
40990 have to, control transfer to its label(s) can be performed through other
40991 means, and we also estimate the minimum length of all asm stmts as 0. */
40992 for (insn = start; insn; insn = NEXT_INSN (insn))
40994 int min_size;
40996 if (LABEL_P (insn))
40998 int align = label_to_alignment (insn);
40999 int max_skip = label_to_max_skip (insn);
41001 if (max_skip > 15)
41002 max_skip = 15;
41003 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41004 already in the current 16 byte page, because otherwise
41005 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41006 bytes to reach 16 byte boundary. */
41007 if (align <= 0
41008 || (align <= 3 && max_skip != (1 << align) - 1))
41009 max_skip = 0;
41010 if (dump_file)
41011 fprintf (dump_file, "Label %i with max_skip %i\n",
41012 INSN_UID (insn), max_skip);
41013 if (max_skip)
41015 while (nbytes + max_skip >= 16)
41017 start = NEXT_INSN (start);
41018 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41019 || CALL_P (start))
41020 njumps--, isjump = true;
41021 else
41022 isjump = false;
41023 nbytes -= min_insn_size (start);
41026 continue;
41029 min_size = min_insn_size (insn);
41030 nbytes += min_size;
41031 if (dump_file)
41032 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41033 INSN_UID (insn), min_size);
41034 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41035 || CALL_P (insn))
41036 njumps++;
41037 else
41038 continue;
41040 while (njumps > 3)
41042 start = NEXT_INSN (start);
41043 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41044 || CALL_P (start))
41045 njumps--, isjump = true;
41046 else
41047 isjump = false;
41048 nbytes -= min_insn_size (start);
41050 gcc_assert (njumps >= 0);
41051 if (dump_file)
41052 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41053 INSN_UID (start), INSN_UID (insn), nbytes);
41055 if (njumps == 3 && isjump && nbytes < 16)
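/* Three jumps in the interval plus the jump immediately preceding it
   could all land in one 16-byte page; pad so that INSN starts in the
   next page.  */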
41057 int padsize = 15 - nbytes + min_insn_size (insn);
41059 if (dump_file)
41060 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41061 INSN_UID (insn), padsize);
41062 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41066 #endif
41068 /* AMD Athlon works faster
41069 when RET is not the destination of a conditional jump or directly preceded
41070 by another jump instruction. We avoid the penalty by inserting a NOP just
41071 before the RET instruction in such cases. */
41072 static void
41073 ix86_pad_returns (void)
41075 edge e;
41076 edge_iterator ei;
41078 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41080 basic_block bb = e->src;
41081 rtx_insn *ret = BB_END (bb);
41082 rtx_insn *prev;
41083 bool replace = false;
41085 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41086 || optimize_bb_for_size_p (bb))
41087 continue;
41088 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41089 if (active_insn_p (prev) || LABEL_P (prev))
41090 break;
41091 if (prev && LABEL_P (prev))
41093 edge e;
41094 edge_iterator ei;
41096 FOR_EACH_EDGE (e, ei, bb->preds)
41097 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41098 && !(e->flags & EDGE_FALLTHRU))
41100 replace = true;
41101 break;
41104 if (!replace)
41106 prev = prev_active_insn (ret);
41107 if (prev
41108 && ((JUMP_P (prev) && any_condjump_p (prev))
41109 || CALL_P (prev)))
41110 replace = true;
41111 /* Empty functions get branch mispredict even when
41112 the jump destination is not visible to us. */
41113 if (!prev && !optimize_function_for_size_p (cfun))
41114 replace = true;
41116 if (replace)
41118 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41119 delete_insn (ret);
41124 /* Count the minimum number of instructions in BB. Return 4 if the
41125 number of instructions >= 4. */
41127 static int
41128 ix86_count_insn_bb (basic_block bb)
41130 rtx_insn *insn;
41131 int insn_count = 0;
41133 /* Count number of instructions in this block. Return 4 if the number
41134 of instructions >= 4. */
41135 FOR_BB_INSNS (bb, insn)
41137 /* This can only happen in exit blocks. */
41138 if (JUMP_P (insn)
41139 && ANY_RETURN_P (PATTERN (insn)))
41140 break;
41142 if (NONDEBUG_INSN_P (insn)
41143 && GET_CODE (PATTERN (insn)) != USE
41144 && GET_CODE (PATTERN (insn)) != CLOBBER)
41146 insn_count++;
41147 if (insn_count >= 4)
41148 return insn_count;
41152 return insn_count;
41156 /* Count the minimum number of instructions in code path in BB.
41157 Return 4 if the number of instructions >= 4. */
41159 static int
41160 ix86_count_insn (basic_block bb)
41162 edge e;
41163 edge_iterator ei;
41164 int min_prev_count;
41166 /* Only bother counting instructions along paths with no
41167 more than 2 basic blocks between entry and exit. Given
41168 that BB has an edge to exit, determine if a predecessor
41169 of BB has an edge from entry. If so, compute the number
41170 of instructions in the predecessor block. If there
41171 happen to be multiple such blocks, compute the minimum. */
41172 min_prev_count = 4;
41173 FOR_EACH_EDGE (e, ei, bb->preds)
41175 edge prev_e;
41176 edge_iterator prev_ei;
41178 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41180 min_prev_count = 0;
41181 break;
41183 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41185 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41187 int count = ix86_count_insn_bb (e->src);
41188 if (count < min_prev_count)
41189 min_prev_count = count;
41190 break;
41195 if (min_prev_count < 4)
41196 min_prev_count += ix86_count_insn_bb (bb);
41198 return min_prev_count;
41201 /* Pad short functions to 4 instructions. */
41203 static void
41204 ix86_pad_short_function (void)
41206 edge e;
41207 edge_iterator ei;
41209 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41211 rtx_insn *ret = BB_END (e->src);
41212 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41214 int insn_count = ix86_count_insn (e->src);
41216 /* Pad short function. */
41217 if (insn_count < 4)
41219 rtx_insn *insn = ret;
41221 /* Find epilogue. */
41222 while (insn
41223 && (!NOTE_P (insn)
41224 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41225 insn = PREV_INSN (insn);
41227 if (!insn)
41228 insn = ret;
41230 /* Two NOPs count as one instruction. */
41231 insn_count = 2 * (4 - insn_count);
41232 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41238 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41239 the epilogue, the Windows system unwinder will apply epilogue logic and
41240 produce incorrect offsets. This can be avoided by adding a nop between
41241 the last insn that can throw and the first insn of the epilogue. */
41243 static void
41244 ix86_seh_fixup_eh_fallthru (void)
41246 edge e;
41247 edge_iterator ei;
41249 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41251 rtx_insn *insn, *next;
41253 /* Find the beginning of the epilogue. */
41254 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41255 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41256 break;
41257 if (insn == NULL)
41258 continue;
41260 /* We only care about preceding insns that can throw. */
41261 insn = prev_active_insn (insn);
41262 if (insn == NULL || !can_throw_internal (insn))
41263 continue;
41265 /* Do not separate calls from their debug information. */
41266 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41267 if (NOTE_P (next)
41268 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41269 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41270 insn = next;
41271 else
41272 break;
41274 emit_insn_after (gen_nops (const1_rtx), insn);
41278 /* Given a register number BASE, the lowest of a group of registers, update
41279 regsets IN and OUT with the registers that should be avoided in input
41280 and output operands respectively when trying to avoid generating a modr/m
41281 byte for -fmitigate-rop. */
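/* For example, with BASE == FIRST_SSE_REG this marks %xmm0/%xmm1 as risky
   output operands and %xmm2/%xmm3 as risky input operands.  */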
41283 static void
41284 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41286 SET_HARD_REG_BIT (out, base);
41287 SET_HARD_REG_BIT (out, base + 1);
41288 SET_HARD_REG_BIT (in, base + 2);
41289 SET_HARD_REG_BIT (in, base + 3);
41292 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
41293 that certain encodings of modr/m bytes do not occur. */
41294 static void
41295 ix86_mitigate_rop (void)
41297 HARD_REG_SET input_risky;
41298 HARD_REG_SET output_risky;
41299 HARD_REG_SET inout_risky;
41301 CLEAR_HARD_REG_SET (output_risky);
41302 CLEAR_HARD_REG_SET (input_risky);
41303 SET_HARD_REG_BIT (output_risky, AX_REG);
41304 SET_HARD_REG_BIT (output_risky, CX_REG);
41305 SET_HARD_REG_BIT (input_risky, BX_REG);
41306 SET_HARD_REG_BIT (input_risky, DX_REG);
41307 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41308 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41309 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41310 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41311 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41312 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41313 COPY_HARD_REG_SET (inout_risky, input_risky);
41314 IOR_HARD_REG_SET (inout_risky, output_risky);
41316 df_note_add_problem ();
41317 /* Fix up what stack-regs did. */
41318 df_insn_rescan_all ();
41319 df_analyze ();
41321 regrename_init (true);
41322 regrename_analyze (NULL);
41324 auto_vec<du_head_p> cands;
41326 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41328 if (!NONDEBUG_INSN_P (insn))
41329 continue;
41331 if (GET_CODE (PATTERN (insn)) == USE
41332 || GET_CODE (PATTERN (insn)) == CLOBBER)
41333 continue;
41335 extract_insn (insn);
41337 int opno0, opno1;
41338 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41339 recog_data.n_operands, &opno0,
41340 &opno1);
41342 if (!ix86_rop_should_change_byte_p (modrm))
41343 continue;
41345 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41347 /* This happens when regrename has to fail a block. */
41348 if (!info->op_info)
41349 continue;
41351 if (info->op_info[opno0].n_chains != 0)
41353 gcc_assert (info->op_info[opno0].n_chains == 1);
41354 du_head_p op0c;
41355 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41356 if (op0c->target_data_1 + op0c->target_data_2 == 0
41357 && !op0c->cannot_rename)
41358 cands.safe_push (op0c);
41360 op0c->target_data_1++;
41362 if (info->op_info[opno1].n_chains != 0)
41364 gcc_assert (info->op_info[opno1].n_chains == 1);
41365 du_head_p op1c;
41366 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41367 if (op1c->target_data_1 + op1c->target_data_2 == 0
41368 && !op1c->cannot_rename)
41369 cands.safe_push (op1c);
41371 op1c->target_data_2++;
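/* Second pass: try to rename each candidate chain to a register whose
   encoding does not produce the problematic byte, excluding the
   output-risky or input-risky registers depending on how the chain was
   used above. */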
41375 int i;
41376 du_head_p head;
41377 FOR_EACH_VEC_ELT (cands, i, head)
41379 int old_reg, best_reg;
41380 HARD_REG_SET unavailable;
41382 CLEAR_HARD_REG_SET (unavailable);
41383 if (head->target_data_1)
41384 IOR_HARD_REG_SET (unavailable, output_risky);
41385 if (head->target_data_2)
41386 IOR_HARD_REG_SET (unavailable, input_risky);
41388 int n_uses;
41389 reg_class superclass = regrename_find_superclass (head, &n_uses,
41390 &unavailable);
41391 old_reg = head->regno;
41392 best_reg = find_rename_reg (head, superclass, &unavailable,
41393 old_reg, false);
41394 bool ok = regrename_do_replace (head, best_reg);
41395 gcc_assert (ok);
41396 if (dump_file)
41397 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41398 reg_names[best_reg], reg_class_names[superclass]);
41402 regrename_finish ();
41404 df_analyze ();
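/* Regrename cannot fix everything (some chains are unrenamable and some
   blocks are skipped), so make a final backwards scan over each block with
   up-to-date liveness and patch up the remaining insns by copying the
   offending operand into a freshly chosen call-clobbered register just
   before the insn. */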
41406 basic_block bb;
41407 regset_head live;
41409 INIT_REG_SET (&live);
41411 FOR_EACH_BB_FN (bb, cfun)
41413 rtx_insn *insn;
41415 COPY_REG_SET (&live, DF_LR_OUT (bb));
41416 df_simulate_initialize_backwards (bb, &live);
41418 FOR_BB_INSNS_REVERSE (bb, insn)
41420 if (!NONDEBUG_INSN_P (insn))
41421 continue;
41423 df_simulate_one_insn_backwards (bb, insn, &live);
41425 if (GET_CODE (PATTERN (insn)) == USE
41426 || GET_CODE (PATTERN (insn)) == CLOBBER)
41427 continue;
41429 extract_insn (insn);
41430 constrain_operands_cached (insn, reload_completed);
41431 int opno0, opno1;
41432 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41433 recog_data.n_operands, &opno0,
41434 &opno1);
41435 if (modrm < 0
41436 || !ix86_rop_should_change_byte_p (modrm)
41437 || opno0 == opno1)
41438 continue;
41440 rtx oldreg = recog_data.operand[opno1];
41441 preprocess_constraints (insn);
41442 const operand_alternative *alt = which_op_alt ();
41444 int i;
41445 for (i = 0; i < recog_data.n_operands; i++)
41446 if (i != opno1
41447 && alt[i].earlyclobber
41448 && reg_overlap_mentioned_p (recog_data.operand[i],
41449 oldreg))
41450 break;
41452 if (i < recog_data.n_operands)
41453 continue;
41455 if (dump_file)
41456 fprintf (dump_file,
41457 "attempting to fix modrm byte in insn %d:"
41458 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41459 reg_class_names[alt[opno1].cl]);
41461 HARD_REG_SET unavailable;
41462 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41463 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41464 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41465 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41466 IOR_HARD_REG_SET (unavailable, output_risky);
41467 IOR_COMPL_HARD_REG_SET (unavailable,
41468 reg_class_contents[alt[opno1].cl]);
41470 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41471 if (!TEST_HARD_REG_BIT (unavailable, i))
41472 break;
41473 if (i == FIRST_PSEUDO_REGISTER)
41475 if (dump_file)
41476 fprintf (dump_file, ", none available\n");
41477 continue;
41479 if (dump_file)
41480 fprintf (dump_file, " -> %d\n", i);
41481 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41482 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41483 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41488 /* Implement machine-specific optimizations. We implement padding of returns
41489 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
41490 static void
41491 ix86_reorg (void)
41493 /* We are freeing block_for_insn in the toplev to keep compatibility
41494 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41495 compute_bb_for_insn ();
41497 if (flag_mitigate_rop)
41498 ix86_mitigate_rop ();
41500 if (TARGET_SEH && current_function_has_exception_handlers ())
41501 ix86_seh_fixup_eh_fallthru ();
41503 if (optimize && optimize_function_for_speed_p (cfun))
41505 if (TARGET_PAD_SHORT_FUNCTION)
41506 ix86_pad_short_function ();
41507 else if (TARGET_PAD_RETURNS)
41508 ix86_pad_returns ();
41509 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41510 if (TARGET_FOUR_JUMP_LIMIT)
41511 ix86_avoid_jump_mispredicts ();
41512 #endif
41516 /* Return nonzero when a QImode register that must be represented via a REX
41517 prefix is used. */
41518 bool
41519 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41521 int i;
41522 extract_insn_cached (insn);
41523 for (i = 0; i < recog_data.n_operands; i++)
41524 if (GENERAL_REG_P (recog_data.operand[i])
41525 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41526 return true;
41527 return false;
41530 /* Return true when INSN mentions a register that must be encoded using a REX
41531 prefix. */
41532 bool
41533 x86_extended_reg_mentioned_p (rtx insn)
41535 subrtx_iterator::array_type array;
41536 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41538 const_rtx x = *iter;
41539 if (REG_P (x)
41540 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41541 return true;
41543 return false;
41546 /* If profitable, negate (without causing overflow) integer constant
41547 of mode MODE at location LOC. Return true in this case. */
41548 bool
41549 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41551 HOST_WIDE_INT val;
41553 if (!CONST_INT_P (*loc))
41554 return false;
41556 switch (mode)
41558 case DImode:
41559 /* DImode x86_64 constants must fit in 32 bits. */
41560 gcc_assert (x86_64_immediate_operand (*loc, mode));
41562 mode = SImode;
41563 break;
41565 case SImode:
41566 case HImode:
41567 case QImode:
41568 break;
41570 default:
41571 gcc_unreachable ();
41574 /* Avoid overflows. */
41575 if (mode_signbit_p (mode, *loc))
41576 return false;
41578 val = INTVAL (*loc);
41580 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
41581 Exceptions: -128 encodes smaller than 128, so swap the sign and the op. */
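/* For example, `addl $-4, %eax' becomes `subl $4, %eax', and since +128 does
   not fit in a sign-extended 8-bit immediate while -128 does,
   `addl $128, %eax' becomes `subl $-128, %eax'.  The caller is expected to
   swap the operation when we return true. */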
41582 if ((val < 0 && val != -128)
41583 || val == 128)
41585 *loc = GEN_INT (-val);
41586 return true;
41589 return false;
41592 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41593 optabs would emit if we didn't have TFmode patterns. */
41595 void
41596 x86_emit_floatuns (rtx operands[2])
41598 rtx_code_label *neglab, *donelab;
41599 rtx i0, i1, f0, in, out;
41600 machine_mode mode, inmode;
41602 inmode = GET_MODE (operands[1]);
41603 gcc_assert (inmode == SImode || inmode == DImode);
41605 out = operands[0];
41606 in = force_reg (inmode, operands[1]);
41607 mode = GET_MODE (out);
41608 neglab = gen_label_rtx ();
41609 donelab = gen_label_rtx ();
41610 f0 = gen_reg_rtx (mode);
41612 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41614 expand_float (out, in, 0);
41616 emit_jump_insn (gen_jump (donelab));
41617 emit_barrier ();
41619 emit_label (neglab);
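/* The input has its sign bit set, so it cannot be converted as a signed
   value directly.  Halve it while keeping the low bit sticky
   (i0 = (in >> 1) | (in & 1)), convert that to floating point, then double
   the result; preserving the low bit makes the doubled result round the
   same way a direct unsigned conversion would. */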
41621 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41622 1, OPTAB_DIRECT);
41623 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41624 1, OPTAB_DIRECT);
41625 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41627 expand_float (f0, i0, 0);
41629 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41631 emit_label (donelab);
41634 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41635 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41636 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41637 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41639 /* Get a vector mode of the same size as the original but with elements
41640 twice as wide. This is only guaranteed to apply to integral vectors. */
41642 static inline machine_mode
41643 get_mode_wider_vector (machine_mode o)
41645 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41646 machine_mode n = GET_MODE_WIDER_MODE (o);
41647 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41648 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41649 return n;
41652 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41653 fill target with val via vec_duplicate. */
41655 static bool
41656 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41658 bool ok;
41659 rtx_insn *insn;
41660 rtx dup;
41662 /* First attempt to recognize VAL as-is. */
41663 dup = gen_rtx_VEC_DUPLICATE (mode, val);
41664 insn = emit_insn (gen_rtx_SET (target, dup));
41665 if (recog_memoized (insn) < 0)
41667 rtx_insn *seq;
41668 /* If that fails, force VAL into a register. */
41670 start_sequence ();
41671 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
41672 seq = get_insns ();
41673 end_sequence ();
41674 if (seq)
41675 emit_insn_before (seq, insn);
41677 ok = recog_memoized (insn) >= 0;
41678 gcc_assert (ok);
41680 return true;
41683 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41684 with all elements equal to VAR. Return true if successful. */
41686 static bool
41687 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41688 rtx target, rtx val)
41690 bool ok;
41692 switch (mode)
41694 case V2SImode:
41695 case V2SFmode:
41696 if (!mmx_ok)
41697 return false;
41698 /* FALLTHRU */
41700 case V4DFmode:
41701 case V4DImode:
41702 case V8SFmode:
41703 case V8SImode:
41704 case V2DFmode:
41705 case V2DImode:
41706 case V4SFmode:
41707 case V4SImode:
41708 case V16SImode:
41709 case V8DImode:
41710 case V16SFmode:
41711 case V8DFmode:
41712 return ix86_vector_duplicate_value (mode, target, val);
41714 case V4HImode:
41715 if (!mmx_ok)
41716 return false;
41717 if (TARGET_SSE || TARGET_3DNOW_A)
41719 rtx x;
41721 val = gen_lowpart (SImode, val);
41722 x = gen_rtx_TRUNCATE (HImode, val);
41723 x = gen_rtx_VEC_DUPLICATE (mode, x);
41724 emit_insn (gen_rtx_SET (target, x));
41725 return true;
41727 goto widen;
41729 case V8QImode:
41730 if (!mmx_ok)
41731 return false;
41732 goto widen;
41734 case V8HImode:
41735 if (TARGET_AVX2)
41736 return ix86_vector_duplicate_value (mode, target, val);
41738 if (TARGET_SSE2)
41740 struct expand_vec_perm_d dperm;
41741 rtx tmp1, tmp2;
41743 permute:
41744 memset (&dperm, 0, sizeof (dperm));
41745 dperm.target = target;
41746 dperm.vmode = mode;
41747 dperm.nelt = GET_MODE_NUNITS (mode);
41748 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41749 dperm.one_operand_p = true;
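/* dperm.perm was zeroed by the memset above, so this describes a
   one-operand permutation that selects element 0 for every result element,
   i.e. a broadcast of the low element. */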
41751 /* Extend to SImode using a paradoxical SUBREG. */
41752 tmp1 = gen_reg_rtx (SImode);
41753 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41755 /* Insert the SImode value as low element of a V4SImode vector. */
41756 tmp2 = gen_reg_rtx (V4SImode);
41757 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41758 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41760 ok = (expand_vec_perm_1 (&dperm)
41761 || expand_vec_perm_broadcast_1 (&dperm));
41762 gcc_assert (ok);
41763 return ok;
41765 goto widen;
41767 case V16QImode:
41768 if (TARGET_AVX2)
41769 return ix86_vector_duplicate_value (mode, target, val);
41771 if (TARGET_SSE2)
41772 goto permute;
41773 goto widen;
41775 widen:
41776 /* Replicate the value once into the next wider mode and recurse. */
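/* E.g. a V8QImode broadcast of X becomes a V4HImode broadcast of
   (X << 8) | X. */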
41778 machine_mode smode, wsmode, wvmode;
41779 rtx x;
41781 smode = GET_MODE_INNER (mode);
41782 wvmode = get_mode_wider_vector (mode);
41783 wsmode = GET_MODE_INNER (wvmode);
41785 val = convert_modes (wsmode, smode, val, true);
41786 x = expand_simple_binop (wsmode, ASHIFT, val,
41787 GEN_INT (GET_MODE_BITSIZE (smode)),
41788 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41789 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41791 x = gen_reg_rtx (wvmode);
41792 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41793 gcc_assert (ok);
41794 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41795 return ok;
41798 case V16HImode:
41799 case V32QImode:
41800 if (TARGET_AVX2)
41801 return ix86_vector_duplicate_value (mode, target, val);
41802 else
41804 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41805 rtx x = gen_reg_rtx (hvmode);
41807 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41808 gcc_assert (ok);
41810 x = gen_rtx_VEC_CONCAT (mode, x, x);
41811 emit_insn (gen_rtx_SET (target, x));
41813 return true;
41815 case V64QImode:
41816 case V32HImode:
41817 if (TARGET_AVX512BW)
41818 return ix86_vector_duplicate_value (mode, target, val);
41819 else
41821 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41822 rtx x = gen_reg_rtx (hvmode);
41824 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41825 gcc_assert (ok);
41827 x = gen_rtx_VEC_CONCAT (mode, x, x);
41828 emit_insn (gen_rtx_SET (target, x));
41830 return true;
41832 default:
41833 return false;
41837 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41838 whose ONE_VAR element is VAR, and other elements are zero. Return true
41839 if successful. */
41841 static bool
41842 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41843 rtx target, rtx var, int one_var)
41845 machine_mode vsimode;
41846 rtx new_target;
41847 rtx x, tmp;
41848 bool use_vector_set = false;
41850 switch (mode)
41852 case V2DImode:
41853 /* For SSE4.1, we normally use vector set. But if the second
41854 element is zero and inter-unit moves are OK, we use movq
41855 instead. */
41856 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41857 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41858 && one_var == 0));
41859 break;
41860 case V16QImode:
41861 case V4SImode:
41862 case V4SFmode:
41863 use_vector_set = TARGET_SSE4_1;
41864 break;
41865 case V8HImode:
41866 use_vector_set = TARGET_SSE2;
41867 break;
41868 case V4HImode:
41869 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41870 break;
41871 case V32QImode:
41872 case V16HImode:
41873 case V8SImode:
41874 case V8SFmode:
41875 case V4DFmode:
41876 use_vector_set = TARGET_AVX;
41877 break;
41878 case V4DImode:
41879 /* Use ix86_expand_vector_set in 64bit mode only. */
41880 use_vector_set = TARGET_AVX && TARGET_64BIT;
41881 break;
41882 default:
41883 break;
41886 if (use_vector_set)
41888 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41889 var = force_reg (GET_MODE_INNER (mode), var);
41890 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41891 return true;
41894 switch (mode)
41896 case V2SFmode:
41897 case V2SImode:
41898 if (!mmx_ok)
41899 return false;
41900 /* FALLTHRU */
41902 case V2DFmode:
41903 case V2DImode:
41904 if (one_var != 0)
41905 return false;
41906 var = force_reg (GET_MODE_INNER (mode), var);
41907 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41908 emit_insn (gen_rtx_SET (target, x));
41909 return true;
41911 case V4SFmode:
41912 case V4SImode:
41913 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41914 new_target = gen_reg_rtx (mode);
41915 else
41916 new_target = target;
41917 var = force_reg (GET_MODE_INNER (mode), var);
41918 x = gen_rtx_VEC_DUPLICATE (mode, var);
41919 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41920 emit_insn (gen_rtx_SET (new_target, x));
41921 if (one_var != 0)
41923 /* We need to shuffle the value to the correct position, so
41924 create a new pseudo to store the intermediate result. */
41926 /* With SSE2, we can use the integer shuffle insns. */
41927 if (mode != V4SFmode && TARGET_SSE2)
41929 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41930 const1_rtx,
41931 GEN_INT (one_var == 1 ? 0 : 1),
41932 GEN_INT (one_var == 2 ? 0 : 1),
41933 GEN_INT (one_var == 3 ? 0 : 1)));
41934 if (target != new_target)
41935 emit_move_insn (target, new_target);
41936 return true;
41939 /* Otherwise convert the intermediate result to V4SFmode and
41940 use the SSE1 shuffle instructions. */
41941 if (mode != V4SFmode)
41943 tmp = gen_reg_rtx (V4SFmode);
41944 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41946 else
41947 tmp = new_target;
41949 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41950 const1_rtx,
41951 GEN_INT (one_var == 1 ? 0 : 1),
41952 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41953 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41955 if (mode != V4SFmode)
41956 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41957 else if (tmp != target)
41958 emit_move_insn (target, tmp);
41960 else if (target != new_target)
41961 emit_move_insn (target, new_target);
41962 return true;
41964 case V8HImode:
41965 case V16QImode:
41966 vsimode = V4SImode;
41967 goto widen;
41968 case V4HImode:
41969 case V8QImode:
41970 if (!mmx_ok)
41971 return false;
41972 vsimode = V2SImode;
41973 goto widen;
41974 widen:
41975 if (one_var != 0)
41976 return false;
41978 /* Zero extend the variable element to SImode and recurse. */
41979 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41981 x = gen_reg_rtx (vsimode);
41982 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41983 var, one_var))
41984 gcc_unreachable ();
41986 emit_move_insn (target, gen_lowpart (mode, x));
41987 return true;
41989 default:
41990 return false;
41994 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41995 consisting of the values in VALS. It is known that all elements
41996 except ONE_VAR are constants. Return true if successful. */
41998 static bool
41999 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42000 rtx target, rtx vals, int one_var)
42002 rtx var = XVECEXP (vals, 0, one_var);
42003 machine_mode wmode;
42004 rtx const_vec, x;
42006 const_vec = copy_rtx (vals);
42007 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42008 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42010 switch (mode)
42012 case V2DFmode:
42013 case V2DImode:
42014 case V2SFmode:
42015 case V2SImode:
42016 /* For the two element vectors, it's just as easy to use
42017 the general case. */
42018 return false;
42020 case V4DImode:
42021 /* Use ix86_expand_vector_set in 64bit mode only. */
42022 if (!TARGET_64BIT)
42023 return false;
42024 /* FALLTHRU */
42025 case V4DFmode:
42026 case V8SFmode:
42027 case V8SImode:
42028 case V16HImode:
42029 case V32QImode:
42030 case V4SFmode:
42031 case V4SImode:
42032 case V8HImode:
42033 case V4HImode:
42034 break;
42036 case V16QImode:
42037 if (TARGET_SSE4_1)
42038 break;
42039 wmode = V8HImode;
42040 goto widen;
42041 case V8QImode:
42042 wmode = V4HImode;
42043 goto widen;
42044 widen:
42045 /* There's no way to set one QImode entry easily. Combine
42046 the variable value with its adjacent constant value, and
42047 promote to an HImode set. */
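/* E.g. for a V8QImode vector with ONE_VAR == 2, the variable byte is
   combined with the constant byte at index 3 into one HImode value, which
   is then inserted as element 1 of the V4HImode view of the constant
   vector. */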
42048 x = XVECEXP (vals, 0, one_var ^ 1);
42049 if (one_var & 1)
42051 var = convert_modes (HImode, QImode, var, true);
42052 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42053 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42054 x = GEN_INT (INTVAL (x) & 0xff);
42056 else
42058 var = convert_modes (HImode, QImode, var, true);
42059 x = gen_int_mode (INTVAL (x) << 8, HImode);
42061 if (x != const0_rtx)
42062 var = expand_simple_binop (HImode, IOR, var, x, var,
42063 1, OPTAB_LIB_WIDEN);
42065 x = gen_reg_rtx (wmode);
42066 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42067 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42069 emit_move_insn (target, gen_lowpart (mode, x));
42070 return true;
42072 default:
42073 return false;
42076 emit_move_insn (target, const_vec);
42077 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42078 return true;
42081 /* A subroutine of ix86_expand_vector_init_general. Use vector
42082 concatenate to handle the most general case: all values variable,
42083 and none identical. */
42085 static void
42086 ix86_expand_vector_init_concat (machine_mode mode,
42087 rtx target, rtx *ops, int n)
42089 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42090 rtx first[16], second[8], third[4];
42091 rtvec v;
42092 int i, j;
42094 switch (n)
42096 case 2:
42097 switch (mode)
42099 case V16SImode:
42100 cmode = V8SImode;
42101 break;
42102 case V16SFmode:
42103 cmode = V8SFmode;
42104 break;
42105 case V8DImode:
42106 cmode = V4DImode;
42107 break;
42108 case V8DFmode:
42109 cmode = V4DFmode;
42110 break;
42111 case V8SImode:
42112 cmode = V4SImode;
42113 break;
42114 case V8SFmode:
42115 cmode = V4SFmode;
42116 break;
42117 case V4DImode:
42118 cmode = V2DImode;
42119 break;
42120 case V4DFmode:
42121 cmode = V2DFmode;
42122 break;
42123 case V4SImode:
42124 cmode = V2SImode;
42125 break;
42126 case V4SFmode:
42127 cmode = V2SFmode;
42128 break;
42129 case V2DImode:
42130 cmode = DImode;
42131 break;
42132 case V2SImode:
42133 cmode = SImode;
42134 break;
42135 case V2DFmode:
42136 cmode = DFmode;
42137 break;
42138 case V2SFmode:
42139 cmode = SFmode;
42140 break;
42141 default:
42142 gcc_unreachable ();
42145 if (!register_operand (ops[1], cmode))
42146 ops[1] = force_reg (cmode, ops[1]);
42147 if (!register_operand (ops[0], cmode))
42148 ops[0] = force_reg (cmode, ops[0]);
42149 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42150 ops[1])));
42151 break;
42153 case 4:
42154 switch (mode)
42156 case V4DImode:
42157 cmode = V2DImode;
42158 break;
42159 case V4DFmode:
42160 cmode = V2DFmode;
42161 break;
42162 case V4SImode:
42163 cmode = V2SImode;
42164 break;
42165 case V4SFmode:
42166 cmode = V2SFmode;
42167 break;
42168 default:
42169 gcc_unreachable ();
42171 goto half;
42173 case 8:
42174 switch (mode)
42176 case V8DImode:
42177 cmode = V2DImode;
42178 hmode = V4DImode;
42179 break;
42180 case V8DFmode:
42181 cmode = V2DFmode;
42182 hmode = V4DFmode;
42183 break;
42184 case V8SImode:
42185 cmode = V2SImode;
42186 hmode = V4SImode;
42187 break;
42188 case V8SFmode:
42189 cmode = V2SFmode;
42190 hmode = V4SFmode;
42191 break;
42192 default:
42193 gcc_unreachable ();
42195 goto half;
42197 case 16:
42198 switch (mode)
42200 case V16SImode:
42201 cmode = V2SImode;
42202 hmode = V4SImode;
42203 gmode = V8SImode;
42204 break;
42205 case V16SFmode:
42206 cmode = V2SFmode;
42207 hmode = V4SFmode;
42208 gmode = V8SFmode;
42209 break;
42210 default:
42211 gcc_unreachable ();
42213 goto half;
42215 half:
42216 /* FIXME: We process inputs backward to help RA. PR 36222. */
42217 i = n - 1;
42218 j = (n >> 1) - 1;
42219 for (; i > 0; i -= 2, j--)
42221 first[j] = gen_reg_rtx (cmode);
42222 v = gen_rtvec (2, ops[i - 1], ops[i]);
42223 ix86_expand_vector_init (false, first[j],
42224 gen_rtx_PARALLEL (cmode, v));
42227 n >>= 1;
42228 if (n > 4)
42230 gcc_assert (hmode != VOIDmode);
42231 gcc_assert (gmode != VOIDmode);
42232 for (i = j = 0; i < n; i += 2, j++)
42234 second[j] = gen_reg_rtx (hmode);
42235 ix86_expand_vector_init_concat (hmode, second [j],
42236 &first [i], 2);
42238 n >>= 1;
42239 for (i = j = 0; i < n; i += 2, j++)
42241 third[j] = gen_reg_rtx (gmode);
42242 ix86_expand_vector_init_concat (gmode, third[j],
42243 &second[i], 2);
42245 n >>= 1;
42246 ix86_expand_vector_init_concat (mode, target, third, n);
42248 else if (n > 2)
42250 gcc_assert (hmode != VOIDmode);
42251 for (i = j = 0; i < n; i += 2, j++)
42253 second[j] = gen_reg_rtx (hmode);
42254 ix86_expand_vector_init_concat (hmode, second [j],
42255 &first [i], 2);
42257 n >>= 1;
42258 ix86_expand_vector_init_concat (mode, target, second, n);
42260 else
42261 ix86_expand_vector_init_concat (mode, target, first, n);
42262 break;
42264 default:
42265 gcc_unreachable ();
42269 /* A subroutine of ix86_expand_vector_init_general. Use vector
42270 interleave to handle the most general case: all values variable,
42271 and none identical. */
42273 static void
42274 ix86_expand_vector_init_interleave (machine_mode mode,
42275 rtx target, rtx *ops, int n)
42277 machine_mode first_imode, second_imode, third_imode, inner_mode;
42278 int i, j;
42279 rtx op0, op1;
42280 rtx (*gen_load_even) (rtx, rtx, rtx);
42281 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42282 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42284 switch (mode)
42286 case V8HImode:
42287 gen_load_even = gen_vec_setv8hi;
42288 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42289 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42290 inner_mode = HImode;
42291 first_imode = V4SImode;
42292 second_imode = V2DImode;
42293 third_imode = VOIDmode;
42294 break;
42295 case V16QImode:
42296 gen_load_even = gen_vec_setv16qi;
42297 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42298 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42299 inner_mode = QImode;
42300 first_imode = V8HImode;
42301 second_imode = V4SImode;
42302 third_imode = V2DImode;
42303 break;
42304 default:
42305 gcc_unreachable ();
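/* Pair up the scalar inputs: load element 2*i into the low element of a
   fresh vector, insert element 2*i + 1 as its second element, then merge
   the partial vectors by repeatedly interleaving their low parts in
   progressively wider integer modes until the full vector is formed. */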
42308 for (i = 0; i < n; i++)
42310 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42311 op0 = gen_reg_rtx (SImode);
42312 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42314 /* Insert the SImode value as the low element of a V4SImode vector. */
42315 op1 = gen_reg_rtx (V4SImode);
42316 op0 = gen_rtx_VEC_MERGE (V4SImode,
42317 gen_rtx_VEC_DUPLICATE (V4SImode,
42318 op0),
42319 CONST0_RTX (V4SImode),
42320 const1_rtx);
42321 emit_insn (gen_rtx_SET (op1, op0));
42323 /* Cast the V4SImode vector back to a vector in the original mode. */
42324 op0 = gen_reg_rtx (mode);
42325 emit_move_insn (op0, gen_lowpart (mode, op1));
42327 /* Load even elements into the second position. */
42328 emit_insn (gen_load_even (op0,
42329 force_reg (inner_mode,
42330 ops [i + i + 1]),
42331 const1_rtx));
42333 /* Cast vector to FIRST_IMODE vector. */
42334 ops[i] = gen_reg_rtx (first_imode);
42335 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42338 /* Interleave low FIRST_IMODE vectors. */
42339 for (i = j = 0; i < n; i += 2, j++)
42341 op0 = gen_reg_rtx (first_imode);
42342 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42344 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42345 ops[j] = gen_reg_rtx (second_imode);
42346 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42349 /* Interleave low SECOND_IMODE vectors. */
42350 switch (second_imode)
42352 case V4SImode:
42353 for (i = j = 0; i < n / 2; i += 2, j++)
42355 op0 = gen_reg_rtx (second_imode);
42356 emit_insn (gen_interleave_second_low (op0, ops[i],
42357 ops[i + 1]));
42359 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42360 vector. */
42361 ops[j] = gen_reg_rtx (third_imode);
42362 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42364 second_imode = V2DImode;
42365 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42366 /* FALLTHRU */
42368 case V2DImode:
42369 op0 = gen_reg_rtx (second_imode);
42370 emit_insn (gen_interleave_second_low (op0, ops[0],
42371 ops[1]));
42373 /* Cast the SECOND_IMODE vector back to a vector in the original
42374 mode. */
42375 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42376 break;
42378 default:
42379 gcc_unreachable ();
42383 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42384 all values variable, and none identical. */
42386 static void
42387 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42388 rtx target, rtx vals)
42390 rtx ops[64], op0, op1, op2, op3, op4, op5;
42391 machine_mode half_mode = VOIDmode;
42392 machine_mode quarter_mode = VOIDmode;
42393 int n, i;
42395 switch (mode)
42397 case V2SFmode:
42398 case V2SImode:
42399 if (!mmx_ok && !TARGET_SSE)
42400 break;
42401 /* FALLTHRU */
42403 case V16SImode:
42404 case V16SFmode:
42405 case V8DFmode:
42406 case V8DImode:
42407 case V8SFmode:
42408 case V8SImode:
42409 case V4DFmode:
42410 case V4DImode:
42411 case V4SFmode:
42412 case V4SImode:
42413 case V2DFmode:
42414 case V2DImode:
42415 n = GET_MODE_NUNITS (mode);
42416 for (i = 0; i < n; i++)
42417 ops[i] = XVECEXP (vals, 0, i);
42418 ix86_expand_vector_init_concat (mode, target, ops, n);
42419 return;
42421 case V32QImode:
42422 half_mode = V16QImode;
42423 goto half;
42425 case V16HImode:
42426 half_mode = V8HImode;
42427 goto half;
42429 half:
42430 n = GET_MODE_NUNITS (mode);
42431 for (i = 0; i < n; i++)
42432 ops[i] = XVECEXP (vals, 0, i);
42433 op0 = gen_reg_rtx (half_mode);
42434 op1 = gen_reg_rtx (half_mode);
42435 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42436 n >> 2);
42437 ix86_expand_vector_init_interleave (half_mode, op1,
42438 &ops [n >> 1], n >> 2);
42439 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42440 return;
42442 case V64QImode:
42443 quarter_mode = V16QImode;
42444 half_mode = V32QImode;
42445 goto quarter;
42447 case V32HImode:
42448 quarter_mode = V8HImode;
42449 half_mode = V16HImode;
42450 goto quarter;
42452 quarter:
42453 n = GET_MODE_NUNITS (mode);
42454 for (i = 0; i < n; i++)
42455 ops[i] = XVECEXP (vals, 0, i);
42456 op0 = gen_reg_rtx (quarter_mode);
42457 op1 = gen_reg_rtx (quarter_mode);
42458 op2 = gen_reg_rtx (quarter_mode);
42459 op3 = gen_reg_rtx (quarter_mode);
42460 op4 = gen_reg_rtx (half_mode);
42461 op5 = gen_reg_rtx (half_mode);
42462 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42463 n >> 3);
42464 ix86_expand_vector_init_interleave (quarter_mode, op1,
42465 &ops [n >> 2], n >> 3);
42466 ix86_expand_vector_init_interleave (quarter_mode, op2,
42467 &ops [n >> 1], n >> 3);
42468 ix86_expand_vector_init_interleave (quarter_mode, op3,
42469 &ops [(n >> 1) | (n >> 2)], n >> 3);
42470 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42471 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42472 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42473 return;
42475 case V16QImode:
42476 if (!TARGET_SSE4_1)
42477 break;
42478 /* FALLTHRU */
42480 case V8HImode:
42481 if (!TARGET_SSE2)
42482 break;
42484 /* Don't use ix86_expand_vector_init_interleave if we can't
42485 move from GPR to SSE register directly. */
42486 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42487 break;
42489 n = GET_MODE_NUNITS (mode);
42490 for (i = 0; i < n; i++)
42491 ops[i] = XVECEXP (vals, 0, i);
42492 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42493 return;
42495 case V4HImode:
42496 case V8QImode:
42497 break;
42499 default:
42500 gcc_unreachable ();
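/* Fallback when no suitable element-insert instruction is available: pack
   the scalar elements into word_mode integers with shifts and IORs, then
   assemble the vector from one, two or four such words. */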
42504 int i, j, n_elts, n_words, n_elt_per_word;
42505 machine_mode inner_mode;
42506 rtx words[4], shift;
42508 inner_mode = GET_MODE_INNER (mode);
42509 n_elts = GET_MODE_NUNITS (mode);
42510 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42511 n_elt_per_word = n_elts / n_words;
42512 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42514 for (i = 0; i < n_words; ++i)
42516 rtx word = NULL_RTX;
42518 for (j = 0; j < n_elt_per_word; ++j)
42520 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42521 elt = convert_modes (word_mode, inner_mode, elt, true);
42523 if (j == 0)
42524 word = elt;
42525 else
42527 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42528 word, 1, OPTAB_LIB_WIDEN);
42529 word = expand_simple_binop (word_mode, IOR, word, elt,
42530 word, 1, OPTAB_LIB_WIDEN);
42534 words[i] = word;
42537 if (n_words == 1)
42538 emit_move_insn (target, gen_lowpart (mode, words[0]));
42539 else if (n_words == 2)
42541 rtx tmp = gen_reg_rtx (mode);
42542 emit_clobber (tmp);
42543 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42544 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42545 emit_move_insn (target, tmp);
42547 else if (n_words == 4)
42549 rtx tmp = gen_reg_rtx (V4SImode);
42550 gcc_assert (word_mode == SImode);
42551 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42552 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42553 emit_move_insn (target, gen_lowpart (mode, tmp));
42555 else
42556 gcc_unreachable ();
42560 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42561 instructions unless MMX_OK is true. */
42563 void
42564 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42566 machine_mode mode = GET_MODE (target);
42567 machine_mode inner_mode = GET_MODE_INNER (mode);
42568 int n_elts = GET_MODE_NUNITS (mode);
42569 int n_var = 0, one_var = -1;
42570 bool all_same = true, all_const_zero = true;
42571 int i;
42572 rtx x;
42574 for (i = 0; i < n_elts; ++i)
42576 x = XVECEXP (vals, 0, i);
42577 if (!(CONST_SCALAR_INT_P (x)
42578 || CONST_DOUBLE_P (x)
42579 || CONST_FIXED_P (x)))
42580 n_var++, one_var = i;
42581 else if (x != CONST0_RTX (inner_mode))
42582 all_const_zero = false;
42583 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42584 all_same = false;
42587 /* Constants are best loaded from the constant pool. */
42588 if (n_var == 0)
42590 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42591 return;
42594 /* If all values are identical, broadcast the value. */
42595 if (all_same
42596 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42597 XVECEXP (vals, 0, 0)))
42598 return;
42600 /* Values where only one field is non-constant are best loaded from
42601 the pool and overwritten via move later. */
42602 if (n_var == 1)
42604 if (all_const_zero
42605 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42606 XVECEXP (vals, 0, one_var),
42607 one_var))
42608 return;
42610 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42611 return;
42614 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42617 void
42618 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42620 machine_mode mode = GET_MODE (target);
42621 machine_mode inner_mode = GET_MODE_INNER (mode);
42622 machine_mode half_mode;
42623 bool use_vec_merge = false;
42624 rtx tmp;
42625 static rtx (*gen_extract[6][2]) (rtx, rtx)
42627 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42628 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42629 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42630 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42631 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42632 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42634 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42636 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42637 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42638 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42639 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42640 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42641 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42643 int i, j, n;
42644 machine_mode mmode = VOIDmode;
42645 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42647 switch (mode)
42649 case V2SFmode:
42650 case V2SImode:
42651 if (mmx_ok)
42653 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42654 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42655 if (elt == 0)
42656 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42657 else
42658 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42659 emit_insn (gen_rtx_SET (target, tmp));
42660 return;
42662 break;
42664 case V2DImode:
42665 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42666 if (use_vec_merge)
42667 break;
42669 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42670 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42671 if (elt == 0)
42672 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42673 else
42674 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42675 emit_insn (gen_rtx_SET (target, tmp));
42676 return;
42678 case V2DFmode:
42680 rtx op0, op1;
42682 /* For the two element vectors, we implement a VEC_CONCAT with
42683 the extraction of the other element. */
42685 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42686 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42688 if (elt == 0)
42689 op0 = val, op1 = tmp;
42690 else
42691 op0 = tmp, op1 = val;
42693 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42694 emit_insn (gen_rtx_SET (target, tmp));
42696 return;
42698 case V4SFmode:
42699 use_vec_merge = TARGET_SSE4_1;
42700 if (use_vec_merge)
42701 break;
42703 switch (elt)
42705 case 0:
42706 use_vec_merge = true;
42707 break;
42709 case 1:
42710 /* tmp = target = A B C D */
42711 tmp = copy_to_reg (target);
42712 /* target = A A B B */
42713 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42714 /* target = X A B B */
42715 ix86_expand_vector_set (false, target, val, 0);
42716 /* target = A X C D */
42717 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42718 const1_rtx, const0_rtx,
42719 GEN_INT (2+4), GEN_INT (3+4)));
42720 return;
42722 case 2:
42723 /* tmp = target = A B C D */
42724 tmp = copy_to_reg (target);
42725 /* tmp = X B C D */
42726 ix86_expand_vector_set (false, tmp, val, 0);
42727 /* target = A B X D */
42728 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42729 const0_rtx, const1_rtx,
42730 GEN_INT (0+4), GEN_INT (3+4)));
42731 return;
42733 case 3:
42734 /* tmp = target = A B C D */
42735 tmp = copy_to_reg (target);
42736 /* tmp = X B C D */
42737 ix86_expand_vector_set (false, tmp, val, 0);
42738 /* target = A B X D */
42739 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42740 const0_rtx, const1_rtx,
42741 GEN_INT (2+4), GEN_INT (0+4)));
42742 return;
42744 default:
42745 gcc_unreachable ();
42747 break;
42749 case V4SImode:
42750 use_vec_merge = TARGET_SSE4_1;
42751 if (use_vec_merge)
42752 break;
42754 /* Element 0 handled by vec_merge below. */
42755 if (elt == 0)
42757 use_vec_merge = true;
42758 break;
42761 if (TARGET_SSE2)
42763 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42764 store into element 0, then shuffle them back. */
42766 rtx order[4];
42768 order[0] = GEN_INT (elt);
42769 order[1] = const1_rtx;
42770 order[2] = const2_rtx;
42771 order[3] = GEN_INT (3);
42772 order[elt] = const0_rtx;
42774 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42775 order[1], order[2], order[3]));
42777 ix86_expand_vector_set (false, target, val, 0);
42779 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42780 order[1], order[2], order[3]));
42782 else
42784 /* For SSE1, we have to reuse the V4SF code. */
42785 rtx t = gen_reg_rtx (V4SFmode);
42786 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42787 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42788 emit_move_insn (target, gen_lowpart (mode, t));
42790 return;
42792 case V8HImode:
42793 use_vec_merge = TARGET_SSE2;
42794 break;
42795 case V4HImode:
42796 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42797 break;
42799 case V16QImode:
42800 use_vec_merge = TARGET_SSE4_1;
42801 break;
42803 case V8QImode:
42804 break;
42806 case V32QImode:
42807 half_mode = V16QImode;
42808 j = 0;
42809 n = 16;
42810 goto half;
42812 case V16HImode:
42813 half_mode = V8HImode;
42814 j = 1;
42815 n = 8;
42816 goto half;
42818 case V8SImode:
42819 half_mode = V4SImode;
42820 j = 2;
42821 n = 4;
42822 goto half;
42824 case V4DImode:
42825 half_mode = V2DImode;
42826 j = 3;
42827 n = 2;
42828 goto half;
42830 case V8SFmode:
42831 half_mode = V4SFmode;
42832 j = 4;
42833 n = 4;
42834 goto half;
42836 case V4DFmode:
42837 half_mode = V2DFmode;
42838 j = 5;
42839 n = 2;
42840 goto half;
42842 half:
42843 /* Compute offset. */
42844 i = elt / n;
42845 elt %= n;
42847 gcc_assert (i <= 1);
42849 /* Extract the half. */
42850 tmp = gen_reg_rtx (half_mode);
42851 emit_insn (gen_extract[j][i] (tmp, target));
42853 /* Put val in tmp at elt. */
42854 ix86_expand_vector_set (false, tmp, val, elt);
42856 /* Put it back. */
42857 emit_insn (gen_insert[j][i] (target, target, tmp));
42858 return;
42860 case V8DFmode:
42861 if (TARGET_AVX512F)
42863 mmode = QImode;
42864 gen_blendm = gen_avx512f_blendmv8df;
42866 break;
42868 case V8DImode:
42869 if (TARGET_AVX512F)
42871 mmode = QImode;
42872 gen_blendm = gen_avx512f_blendmv8di;
42874 break;
42876 case V16SFmode:
42877 if (TARGET_AVX512F)
42879 mmode = HImode;
42880 gen_blendm = gen_avx512f_blendmv16sf;
42882 break;
42884 case V16SImode:
42885 if (TARGET_AVX512F)
42887 mmode = HImode;
42888 gen_blendm = gen_avx512f_blendmv16si;
42890 break;
42892 case V32HImode:
42893 if (TARGET_AVX512F && TARGET_AVX512BW)
42895 mmode = SImode;
42896 gen_blendm = gen_avx512bw_blendmv32hi;
42898 break;
42900 case V64QImode:
42901 if (TARGET_AVX512F && TARGET_AVX512BW)
42903 mmode = DImode;
42904 gen_blendm = gen_avx512bw_blendmv64qi;
42906 break;
42908 default:
42909 break;
42912 if (mmode != VOIDmode)
42914 tmp = gen_reg_rtx (mode);
42915 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42916 /* The avx512*_blendm<mode> expanders have a different operand order
42917 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42918 elements where the mask is set and the second input operand otherwise;
42919 in {sse,avx}*_*blend* the first input operand is used for elements
42920 where the mask is clear and the second input operand otherwise. */
42921 emit_insn (gen_blendm (target, target, tmp,
42922 force_reg (mmode,
42923 gen_int_mode (1 << elt, mmode))));
42925 else if (use_vec_merge)
42927 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42928 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42929 emit_insn (gen_rtx_SET (target, tmp));
42931 else
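/* As a last resort go through memory: spill the vector to a stack
   temporary, store the scalar into the element's slot, and reload the
   whole vector. */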
42933 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42935 emit_move_insn (mem, target);
42937 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42938 emit_move_insn (tmp, val);
42940 emit_move_insn (target, mem);
42944 void
42945 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42947 machine_mode mode = GET_MODE (vec);
42948 machine_mode inner_mode = GET_MODE_INNER (mode);
42949 bool use_vec_extr = false;
42950 rtx tmp;
42952 switch (mode)
42954 case V2SImode:
42955 case V2SFmode:
42956 if (!mmx_ok)
42957 break;
42958 /* FALLTHRU */
42960 case V2DFmode:
42961 case V2DImode:
42962 use_vec_extr = true;
42963 break;
42965 case V4SFmode:
42966 use_vec_extr = TARGET_SSE4_1;
42967 if (use_vec_extr)
42968 break;
42970 switch (elt)
42972 case 0:
42973 tmp = vec;
42974 break;
42976 case 1:
42977 case 3:
42978 tmp = gen_reg_rtx (mode);
42979 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42980 GEN_INT (elt), GEN_INT (elt),
42981 GEN_INT (elt+4), GEN_INT (elt+4)));
42982 break;
42984 case 2:
42985 tmp = gen_reg_rtx (mode);
42986 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42987 break;
42989 default:
42990 gcc_unreachable ();
42992 vec = tmp;
42993 use_vec_extr = true;
42994 elt = 0;
42995 break;
42997 case V4SImode:
42998 use_vec_extr = TARGET_SSE4_1;
42999 if (use_vec_extr)
43000 break;
43002 if (TARGET_SSE2)
43004 switch (elt)
43006 case 0:
43007 tmp = vec;
43008 break;
43010 case 1:
43011 case 3:
43012 tmp = gen_reg_rtx (mode);
43013 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43014 GEN_INT (elt), GEN_INT (elt),
43015 GEN_INT (elt), GEN_INT (elt)));
43016 break;
43018 case 2:
43019 tmp = gen_reg_rtx (mode);
43020 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43021 break;
43023 default:
43024 gcc_unreachable ();
43026 vec = tmp;
43027 use_vec_extr = true;
43028 elt = 0;
43030 else
43032 /* For SSE1, we have to reuse the V4SF code. */
43033 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43034 gen_lowpart (V4SFmode, vec), elt);
43035 return;
43037 break;
43039 case V8HImode:
43040 use_vec_extr = TARGET_SSE2;
43041 break;
43042 case V4HImode:
43043 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43044 break;
43046 case V16QImode:
43047 use_vec_extr = TARGET_SSE4_1;
43048 break;
43050 case V8SFmode:
43051 if (TARGET_AVX)
43053 tmp = gen_reg_rtx (V4SFmode);
43054 if (elt < 4)
43055 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43056 else
43057 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43058 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43059 return;
43061 break;
43063 case V4DFmode:
43064 if (TARGET_AVX)
43066 tmp = gen_reg_rtx (V2DFmode);
43067 if (elt < 2)
43068 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43069 else
43070 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43071 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43072 return;
43074 break;
43076 case V32QImode:
43077 if (TARGET_AVX)
43079 tmp = gen_reg_rtx (V16QImode);
43080 if (elt < 16)
43081 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43082 else
43083 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43084 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43085 return;
43087 break;
43089 case V16HImode:
43090 if (TARGET_AVX)
43092 tmp = gen_reg_rtx (V8HImode);
43093 if (elt < 8)
43094 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43095 else
43096 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43097 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43098 return;
43100 break;
43102 case V8SImode:
43103 if (TARGET_AVX)
43105 tmp = gen_reg_rtx (V4SImode);
43106 if (elt < 4)
43107 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43108 else
43109 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43110 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43111 return;
43113 break;
43115 case V4DImode:
43116 if (TARGET_AVX)
43118 tmp = gen_reg_rtx (V2DImode);
43119 if (elt < 2)
43120 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43121 else
43122 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43123 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43124 return;
43126 break;
43128 case V32HImode:
43129 if (TARGET_AVX512BW)
43131 tmp = gen_reg_rtx (V16HImode);
43132 if (elt < 16)
43133 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43134 else
43135 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43136 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43137 return;
43139 break;
43141 case V64QImode:
43142 if (TARGET_AVX512BW)
43144 tmp = gen_reg_rtx (V32QImode);
43145 if (elt < 32)
43146 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43147 else
43148 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43149 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43150 return;
43152 break;
43154 case V16SFmode:
43155 tmp = gen_reg_rtx (V8SFmode);
43156 if (elt < 8)
43157 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43158 else
43159 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43160 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43161 return;
43163 case V8DFmode:
43164 tmp = gen_reg_rtx (V4DFmode);
43165 if (elt < 4)
43166 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43167 else
43168 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43169 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43170 return;
43172 case V16SImode:
43173 tmp = gen_reg_rtx (V8SImode);
43174 if (elt < 8)
43175 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43176 else
43177 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43178 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43179 return;
43181 case V8DImode:
43182 tmp = gen_reg_rtx (V4DImode);
43183 if (elt < 4)
43184 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43185 else
43186 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43187 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43188 return;
43190 case V8QImode:
43191 /* ??? Could extract the appropriate HImode element and shift. */
43192 default:
43193 break;
43196 if (use_vec_extr)
43198 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43199 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43201 /* Let the rtl optimizers know about the zero extension performed. */
43202 if (inner_mode == QImode || inner_mode == HImode)
43204 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43205 target = gen_lowpart (SImode, target);
43208 emit_insn (gen_rtx_SET (target, tmp));
43210 else
43212 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43214 emit_move_insn (mem, vec);
43216 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43217 emit_move_insn (target, tmp);
43221 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43222 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43223 The upper bits of DEST are undefined, though they shouldn't cause
43224 exceptions (some bits from src or all zeros are ok). */
43226 static void
43227 emit_reduc_half (rtx dest, rtx src, int i)
43229 rtx tem, d = dest;
43230 switch (GET_MODE (src))
43232 case V4SFmode:
43233 if (i == 128)
43234 tem = gen_sse_movhlps (dest, src, src);
43235 else
43236 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43237 GEN_INT (1 + 4), GEN_INT (1 + 4));
43238 break;
43239 case V2DFmode:
43240 tem = gen_vec_interleave_highv2df (dest, src, src);
43241 break;
43242 case V16QImode:
43243 case V8HImode:
43244 case V4SImode:
43245 case V2DImode:
43246 d = gen_reg_rtx (V1TImode);
43247 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43248 GEN_INT (i / 2));
43249 break;
43250 case V8SFmode:
43251 if (i == 256)
43252 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43253 else
43254 tem = gen_avx_shufps256 (dest, src, src,
43255 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43256 break;
43257 case V4DFmode:
43258 if (i == 256)
43259 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43260 else
43261 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43262 break;
43263 case V32QImode:
43264 case V16HImode:
43265 case V8SImode:
43266 case V4DImode:
43267 if (i == 256)
43269 if (GET_MODE (dest) != V4DImode)
43270 d = gen_reg_rtx (V4DImode);
43271 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43272 gen_lowpart (V4DImode, src),
43273 const1_rtx);
43275 else
43277 d = gen_reg_rtx (V2TImode);
43278 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43279 GEN_INT (i / 2));
43281 break;
43282 case V64QImode:
43283 case V32HImode:
43284 case V16SImode:
43285 case V16SFmode:
43286 case V8DImode:
43287 case V8DFmode:
43288 if (i > 128)
43289 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43290 gen_lowpart (V16SImode, src),
43291 gen_lowpart (V16SImode, src),
43292 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43293 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43294 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43295 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43296 GEN_INT (0xC), GEN_INT (0xD),
43297 GEN_INT (0xE), GEN_INT (0xF),
43298 GEN_INT (0x10), GEN_INT (0x11),
43299 GEN_INT (0x12), GEN_INT (0x13),
43300 GEN_INT (0x14), GEN_INT (0x15),
43301 GEN_INT (0x16), GEN_INT (0x17));
43302 else
43303 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43304 gen_lowpart (V16SImode, src),
43305 GEN_INT (i == 128 ? 0x2 : 0x1),
43306 GEN_INT (0x3),
43307 GEN_INT (0x3),
43308 GEN_INT (0x3),
43309 GEN_INT (i == 128 ? 0x6 : 0x5),
43310 GEN_INT (0x7),
43311 GEN_INT (0x7),
43312 GEN_INT (0x7),
43313 GEN_INT (i == 128 ? 0xA : 0x9),
43314 GEN_INT (0xB),
43315 GEN_INT (0xB),
43316 GEN_INT (0xB),
43317 GEN_INT (i == 128 ? 0xE : 0xD),
43318 GEN_INT (0xF),
43319 GEN_INT (0xF),
43320 GEN_INT (0xF));
43321 break;
43322 default:
43323 gcc_unreachable ();
43325 emit_insn (tem);
43326 if (d != dest)
43327 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43330 /* Expand a vector reduction. FN is the binary pattern to reduce;
43331 DEST is the destination; IN is the input vector. */
43333 void
43334 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43336 rtx half, dst, vec = in;
43337 machine_mode mode = GET_MODE (in);
43338 int i;
43340 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43341 if (TARGET_SSE4_1
43342 && mode == V8HImode
43343 && fn == gen_uminv8hi3)
43345 emit_insn (gen_sse4_1_phminposuw (dest, in));
43346 return;
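/* Otherwise reduce pairwise: each emit_reduc_half moves the upper half of
   the remaining live elements into the lower half, and FN combines the two
   halves, so after log2(nelts) steps element 0 of DEST holds the reduction
   result. */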
43349 for (i = GET_MODE_BITSIZE (mode);
43350 i > GET_MODE_UNIT_BITSIZE (mode);
43351 i >>= 1)
43353 half = gen_reg_rtx (mode);
43354 emit_reduc_half (half, vec, i);
43355 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43356 dst = dest;
43357 else
43358 dst = gen_reg_rtx (mode);
43359 emit_insn (fn (dst, half, vec));
43360 vec = dst;
43364 /* Target hook for scalar_mode_supported_p. */
43365 static bool
43366 ix86_scalar_mode_supported_p (machine_mode mode)
43368 if (DECIMAL_FLOAT_MODE_P (mode))
43369 return default_decimal_float_supported_p ();
43370 else if (mode == TFmode)
43371 return true;
43372 else
43373 return default_scalar_mode_supported_p (mode);
43376 /* Implements target hook vector_mode_supported_p. */
43377 static bool
43378 ix86_vector_mode_supported_p (machine_mode mode)
43380 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43381 return true;
43382 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43383 return true;
43384 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43385 return true;
43386 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43387 return true;
43388 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43389 return true;
43390 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43391 return true;
43392 return false;
43395 /* Target hook for c_mode_for_suffix. */
43396 static machine_mode
43397 ix86_c_mode_for_suffix (char suffix)
43399 if (suffix == 'q')
43400 return TFmode;
43401 if (suffix == 'w')
43402 return XFmode;
43404 return VOIDmode;
43407 /* Worker function for TARGET_MD_ASM_ADJUST.
43409 We implement asm flag outputs, and maintain source compatibility
43410 with the old cc0-based compiler. */
43412 static rtx_insn *
43413 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43414 vec<const char *> &constraints,
43415 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43417 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43418 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43420 bool saw_asm_flag = false;
43422 start_sequence ();
43423 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43425 const char *con = constraints[i];
43426 if (strncmp (con, "=@cc", 4) != 0)
43427 continue;
43428 con += 4;
43429 if (strchr (con, ',') != NULL)
43431 error ("alternatives not allowed in asm flag output");
43432 continue;
43435 bool invert = false;
43436 if (con[0] == 'n')
43437 invert = true, con++;
43439 machine_mode mode = CCmode;
43440 rtx_code code = UNKNOWN;
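/* Map the flag-output suffix (e.g. "c", "z", "be", possibly preceded by
   the 'n' handled above) onto the CC mode in which that condition can be
   read from the flags register and the rtx comparison code to use. */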
43442 switch (con[0])
43444 case 'a':
43445 if (con[1] == 0)
43446 mode = CCAmode, code = EQ;
43447 else if (con[1] == 'e' && con[2] == 0)
43448 mode = CCCmode, code = NE;
43449 break;
43450 case 'b':
43451 if (con[1] == 0)
43452 mode = CCCmode, code = EQ;
43453 else if (con[1] == 'e' && con[2] == 0)
43454 mode = CCAmode, code = NE;
43455 break;
43456 case 'c':
43457 if (con[1] == 0)
43458 mode = CCCmode, code = EQ;
43459 break;
43460 case 'e':
43461 if (con[1] == 0)
43462 mode = CCZmode, code = EQ;
43463 break;
43464 case 'g':
43465 if (con[1] == 0)
43466 mode = CCGCmode, code = GT;
43467 else if (con[1] == 'e' && con[2] == 0)
43468 mode = CCGCmode, code = GE;
43469 break;
43470 case 'l':
43471 if (con[1] == 0)
43472 mode = CCGCmode, code = LT;
43473 else if (con[1] == 'e' && con[2] == 0)
43474 mode = CCGCmode, code = LE;
43475 break;
43476 case 'o':
43477 if (con[1] == 0)
43478 mode = CCOmode, code = EQ;
43479 break;
43480 case 'p':
43481 if (con[1] == 0)
43482 mode = CCPmode, code = EQ;
43483 break;
43484 case 's':
43485 if (con[1] == 0)
43486 mode = CCSmode, code = EQ;
43487 break;
43488 case 'z':
43489 if (con[1] == 0)
43490 mode = CCZmode, code = EQ;
43491 break;
43493 if (code == UNKNOWN)
43495 error ("unknown asm flag output %qs", constraints[i]);
43496 continue;
43498 if (invert)
43499 code = reverse_condition (code);
43501 rtx dest = outputs[i];
43502 if (!saw_asm_flag)
43504 /* This is the first asm flag output. Here we put the flags
43505 register in as the real output and adjust the condition to
43506 allow it. */
43507 constraints[i] = "=Bf";
43508 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43509 saw_asm_flag = true;
43511 else
43513 /* We don't need the flags register as output twice. */
43514 constraints[i] = "=X";
43515 outputs[i] = gen_rtx_SCRATCH (SImode);
43518 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43519 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43521 machine_mode dest_mode = GET_MODE (dest);
43522 if (!SCALAR_INT_MODE_P (dest_mode))
43524 error ("invalid type for asm flag output");
43525 continue;
43528 if (dest_mode == DImode && !TARGET_64BIT)
43529 dest_mode = SImode;
43531 if (dest_mode != QImode)
43533 rtx destqi = gen_reg_rtx (QImode);
43534 emit_insn (gen_rtx_SET (destqi, x));
43536 if (TARGET_ZERO_EXTEND_WITH_AND
43537 && optimize_function_for_speed_p (cfun))
43539 x = force_reg (dest_mode, const0_rtx);
43541 emit_insn (gen_movstrictqi
43542 (gen_lowpart (QImode, x), destqi));
43544 else
43545 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43548 if (dest_mode != GET_MODE (dest))
43550 rtx tmp = gen_reg_rtx (SImode);
43552 emit_insn (gen_rtx_SET (tmp, x));
43553 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43555 else
43556 emit_insn (gen_rtx_SET (dest, x));
43558 rtx_insn *seq = get_insns ();
43559 end_sequence ();
43561 if (saw_asm_flag)
43562 return seq;
43563 else
43565 /* If we had no asm flag outputs, clobber the flags. */
43566 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43567 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43568 return NULL;
43572 /* Implements target hook targetm.asm.encode_section_info. */
43574 static void ATTRIBUTE_UNUSED
43575 ix86_encode_section_info (tree decl, rtx rtl, int first)
43577 default_encode_section_info (decl, rtl, first);
43579 if (ix86_in_large_data_p (decl))
43580 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43583 /* Worker function for REVERSE_CONDITION. */
43585 enum rtx_code
43586 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43588 return (mode != CCFPmode && mode != CCFPUmode
43589 ? reverse_condition (code)
43590 : reverse_condition_maybe_unordered (code));
43593 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43594 to OPERANDS[0]. */
43596 const char *
43597 output_387_reg_move (rtx insn, rtx *operands)
43599 if (REG_P (operands[0]))
43601 if (REG_P (operands[1])
43602 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43604 if (REGNO (operands[0]) == FIRST_STACK_REG)
43605 return output_387_ffreep (operands, 0);
43606 return "fstp\t%y0";
43608 if (STACK_TOP_P (operands[0]))
43609 return "fld%Z1\t%y1";
43610 return "fst\t%y0";
43612 else if (MEM_P (operands[0]))
43614 gcc_assert (REG_P (operands[1]));
43615 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43616 return "fstp%Z0\t%y0";
43617 else
43619 /* There is no non-popping store to memory for XFmode.
43620 So if we need one, follow the store with a load. */
43621 if (GET_MODE (operands[0]) == XFmode)
43622 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43623 else
43624 return "fst%Z0\t%y0";
43627 else
43628 gcc_unreachable();
43631 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43632 FP status register is set. */
43634 void
43635 ix86_emit_fp_unordered_jump (rtx label)
43637 rtx reg = gen_reg_rtx (HImode);
43638 rtx temp;
43640 emit_insn (gen_x86_fnstsw_1 (reg));
43642 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43644 emit_insn (gen_x86_sahf_1 (reg));
43646 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43647 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43649 else
43651 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
43653 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43654 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43657 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43658 gen_rtx_LABEL_REF (VOIDmode, label),
43659 pc_rtx);
43660 temp = gen_rtx_SET (pc_rtx, temp);
43662 emit_jump_insn (temp);
43663 predict_jump (REG_BR_PROB_BASE * 10 / 100);
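/* Editor's note: fnstsw stores the x87 status word, in which C2 is bit 10,
   i.e. bit 2 (mask 0x04) of its high byte (%ah after fnstsw %ax) -- hence
   the test against 0x04 in the non-SAHF path above.  In the SAHF path %ah
   is loaded into EFLAGS, where C2 lands in PF, so the branch can simply
   use the "unordered" (jp) condition on the flags register.  */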
43666 /* Output code to perform a log1p XFmode calculation. */
43668 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43670 rtx_code_label *label1 = gen_label_rtx ();
43671 rtx_code_label *label2 = gen_label_rtx ();
43673 rtx tmp = gen_reg_rtx (XFmode);
43674 rtx tmp2 = gen_reg_rtx (XFmode);
43675 rtx test;
43677 emit_insn (gen_absxf2 (tmp, op1));
43678 test = gen_rtx_GE (VOIDmode, tmp,
43679 const_double_from_real_value (
43680 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43681 XFmode));
43682 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43684 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43685 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43686 emit_jump (label2);
43688 emit_label (label1);
43689 emit_move_insn (tmp, CONST1_RTX (XFmode));
43690 emit_insn (gen_addxf3 (tmp, op1, tmp));
43691 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43692 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43694 emit_label (label2);
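/* Editor's note: the threshold tested above is 1 - sqrt(2)/2
   ~= 0.29289321881345247..., the documented input bound of fyl2xp1.
   Below it the sequence computes

       log1p (x) = ln (2) * log2 (1 + x)        via fldln2 + fyl2xp1

   keeping full precision for tiny x; above it, 1 + x is formed explicitly
   and fyl2x is used instead, which is safe because no significant
   cancellation occurs once |x| exceeds that bound.  */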
43697 /* Emit code for round calculation. */
43698 void ix86_emit_i387_round (rtx op0, rtx op1)
43700 machine_mode inmode = GET_MODE (op1);
43701 machine_mode outmode = GET_MODE (op0);
43702 rtx e1, e2, res, tmp, tmp1, half;
43703 rtx scratch = gen_reg_rtx (HImode);
43704 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43705 rtx_code_label *jump_label = gen_label_rtx ();
43706 rtx insn;
43707 rtx (*gen_abs) (rtx, rtx);
43708 rtx (*gen_neg) (rtx, rtx);
43710 switch (inmode)
43712 case SFmode:
43713 gen_abs = gen_abssf2;
43714 break;
43715 case DFmode:
43716 gen_abs = gen_absdf2;
43717 break;
43718 case XFmode:
43719 gen_abs = gen_absxf2;
43720 break;
43721 default:
43722 gcc_unreachable ();
43725 switch (outmode)
43727 case SFmode:
43728 gen_neg = gen_negsf2;
43729 break;
43730 case DFmode:
43731 gen_neg = gen_negdf2;
43732 break;
43733 case XFmode:
43734 gen_neg = gen_negxf2;
43735 break;
43736 case HImode:
43737 gen_neg = gen_neghi2;
43738 break;
43739 case SImode:
43740 gen_neg = gen_negsi2;
43741 break;
43742 case DImode:
43743 gen_neg = gen_negdi2;
43744 break;
43745 default:
43746 gcc_unreachable ();
43749 e1 = gen_reg_rtx (inmode);
43750 e2 = gen_reg_rtx (inmode);
43751 res = gen_reg_rtx (outmode);
43753 half = const_double_from_real_value (dconsthalf, inmode);
43755 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
43757 /* scratch = fxam(op1) */
43758 emit_insn (gen_rtx_SET (scratch,
43759 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43760 UNSPEC_FXAM)));
43761 /* e1 = fabs(op1) */
43762 emit_insn (gen_abs (e1, op1));
43764 /* e2 = e1 + 0.5 */
43765 half = force_reg (inmode, half);
43766 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43768 /* res = floor(e2) */
43769 if (inmode != XFmode)
43771 tmp1 = gen_reg_rtx (XFmode);
43773 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43775 else
43776 tmp1 = e2;
43778 switch (outmode)
43780 case SFmode:
43781 case DFmode:
43783 rtx tmp0 = gen_reg_rtx (XFmode);
43785 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43787 emit_insn (gen_rtx_SET (res,
43788 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43789 UNSPEC_TRUNC_NOOP)));
43791 break;
43792 case XFmode:
43793 emit_insn (gen_frndintxf2_floor (res, tmp1));
43794 break;
43795 case HImode:
43796 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43797 break;
43798 case SImode:
43799 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43800 break;
43801 case DImode:
43802 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43803 break;
43804 default:
43805 gcc_unreachable ();
43808 /* flags = signbit(a) */
43809 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
43811 /* if (flags) then res = -res */
43812 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43813 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43814 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43815 pc_rtx);
43816 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43817 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43818 JUMP_LABEL (insn) = jump_label;
43820 emit_insn (gen_neg (res, res));
43822 emit_label (jump_label);
43823 LABEL_NUSES (jump_label) = 1;
43825 emit_move_insn (op0, res);
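/* Editor's note: fxam reports the sign of st(0) in condition code C1,
   which is bit 9 of the status word, i.e. bit 1 (mask 0x02) of the byte
   tested above.  When that bit is clear the operand is non-negative and
   the jump skips the negation; otherwise the fall-through negates RES,
   restoring the sign stripped by the earlier fabs.  */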
43828 /* Output code to perform a Newton-Raphson approximation of a single precision
43829 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43831 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43833 rtx x0, x1, e0, e1;
43835 x0 = gen_reg_rtx (mode);
43836 e0 = gen_reg_rtx (mode);
43837 e1 = gen_reg_rtx (mode);
43838 x1 = gen_reg_rtx (mode);
43840 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
43842 b = force_reg (mode, b);
43844 /* x0 = rcp(b) estimate */
43845 if (mode == V16SFmode || mode == V8DFmode)
43847 if (TARGET_AVX512ER)
43849 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43850 UNSPEC_RCP28)));
43851 /* res = a * x0 */
43852 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43853 return;
43855 else
43856 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43857 UNSPEC_RCP14)));
43859 else
43860 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43861 UNSPEC_RCP)));
43863 /* e0 = x0 * b */
43864 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43866 /* e0 = x0 * e0 */
43867 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43869 /* e1 = x0 + x0 */
43870 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43872 /* x1 = e1 - e0 */
43873 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43875 /* res = a * x1 */
43876 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
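/* Editor's sketch of the math: with x0 ~= 1/b from the rcp estimate, one
   Newton-Raphson step for f(x) = 1/x - b is

       x1 = x0 * (2 - b * x0) = (x0 + x0) - b * x0 * x0

   which is exactly the e1 - e0 computed above; the step roughly squares
   the relative error, so the ~2^-12 hardware estimate becomes accurate
   enough for float.  E.g. b = 3, x0 = 0.333 gives
   x1 = 0.666 - 3 * 0.110889 = 0.333333.  */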
43879 /* Output code to perform a Newton-Raphson approximation of a
43880 single precision floating point [reciprocal] square root. */
43882 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43884 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43885 REAL_VALUE_TYPE r;
43886 int unspec;
43888 x0 = gen_reg_rtx (mode);
43889 e0 = gen_reg_rtx (mode);
43890 e1 = gen_reg_rtx (mode);
43891 e2 = gen_reg_rtx (mode);
43892 e3 = gen_reg_rtx (mode);
43894 if (TARGET_AVX512ER && mode == V16SFmode)
43896 if (recip)
43897 /* res = rsqrt28(a) estimate */
43898 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43899 UNSPEC_RSQRT28)));
43900 else
43902 /* x0 = rsqrt28(a) estimate */
43903 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43904 UNSPEC_RSQRT28)));
43905 /* res = rcp28(x0) estimate */
43906 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43907 UNSPEC_RCP28)));
43909 return;
43912 real_from_integer (&r, VOIDmode, -3, SIGNED);
43913 mthree = const_double_from_real_value (r, SFmode);
43915 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43916 mhalf = const_double_from_real_value (r, SFmode);
43917 unspec = UNSPEC_RSQRT;
43919 if (VECTOR_MODE_P (mode))
43921 mthree = ix86_build_const_vector (mode, true, mthree);
43922 mhalf = ix86_build_const_vector (mode, true, mhalf);
43923 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43924 if (GET_MODE_SIZE (mode) == 64)
43925 unspec = UNSPEC_RSQRT14;
43928 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43929 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
43931 a = force_reg (mode, a);
43933 /* x0 = rsqrt(a) estimate */
43934 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43935 unspec)));
43937 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
43938 if (!recip)
43940 rtx zero = force_reg (mode, CONST0_RTX(mode));
43941 rtx mask;
43943 /* Handle masked compare. */
43944 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43946 mask = gen_reg_rtx (HImode);
43947 /* Imm value 0x4 corresponds to not-equal comparison. */
43948 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43949 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43951 else
43953 mask = gen_reg_rtx (mode);
43954 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43955 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43959 /* e0 = x0 * a */
43960 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43961 /* e1 = e0 * x0 */
43962 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43964 /* e2 = e1 - 3. */
43965 mthree = force_reg (mode, mthree);
43966 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43968 mhalf = force_reg (mode, mhalf);
43969 if (recip)
43970 /* e3 = -.5 * x0 */
43971 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43972 else
43973 /* e3 = -.5 * e0 */
43974 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43975 /* ret = e2 * e3 */
43976 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
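/* Editor's sketch of the math: with x0 ~= 1/sqrt(a) from the rsqrt
   estimate, the Newton-Raphson step for f(x) = 1/x^2 - a is

       x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3)

   which is the e0..e3 sequence above (with e0 = a * x0 substituted for x0
   in the !recip case, turning the result into a / sqrt(a) = sqrt(a)).
   The compare against 0.0 masks x0 to zero when a == 0, so sqrt(0.0) does
   not become 0 * inf = NaN.  */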
43979 #ifdef TARGET_SOLARIS
43980 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43982 static void
43983 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43984 tree decl)
43986 /* With Binutils 2.15, the "@unwind" marker must be specified on
43987 every occurrence of the ".eh_frame" section, not just the first
43988 one. */
43989 if (TARGET_64BIT
43990 && strcmp (name, ".eh_frame") == 0)
43992 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43993 flags & SECTION_WRITE ? "aw" : "a");
43994 return;
43997 #ifndef USE_GAS
43998 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44000 solaris_elf_asm_comdat_section (name, flags, decl);
44001 return;
44003 #endif
44005 default_elf_asm_named_section (name, flags, decl);
44007 #endif /* TARGET_SOLARIS */
44009 /* Return the mangling of TYPE if it is an extended fundamental type. */
44011 static const char *
44012 ix86_mangle_type (const_tree type)
44014 type = TYPE_MAIN_VARIANT (type);
44016 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44017 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44018 return NULL;
44020 switch (TYPE_MODE (type))
44022 case TFmode:
44023 /* __float128 is "g". */
44024 return "g";
44025 case XFmode:
44026 /* "long double" or __float80 is "e". */
44027 return "e";
44028 default:
44029 return NULL;
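/* Editor's example: under the Itanium C++ ABI these codes make
   `void f (__float128)' mangle as _Z1fg and `void f (long double)'
   (XFmode) as _Z1fe; other types fall back to the generic mangling.  */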
44033 /* For 32-bit code we can save PIC register setup by using
44034 __stack_chk_fail_local hidden function instead of calling
44035 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
44036 register, so it is better to call __stack_chk_fail directly. */
44038 static tree ATTRIBUTE_UNUSED
44039 ix86_stack_protect_fail (void)
44041 return TARGET_64BIT
44042 ? default_external_stack_protect_fail ()
44043 : default_hidden_stack_protect_fail ();
44046 /* Select a format to encode pointers in exception handling data. CODE
44047 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44048 true if the symbol may be affected by dynamic relocations.
44050 ??? All x86 object file formats are capable of representing this.
44051 After all, the relocation needed is the same as for the call insn.
44052 Whether or not a particular assembler allows us to enter such, I
44053 guess we'll have to see. */
44054 int
44055 asm_preferred_eh_data_format (int code, int global)
44057 if (flag_pic)
44059 int type = DW_EH_PE_sdata8;
44060 if (!TARGET_64BIT
44061 || ix86_cmodel == CM_SMALL_PIC
44062 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44063 type = DW_EH_PE_sdata4;
44064 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44066 if (ix86_cmodel == CM_SMALL
44067 || (ix86_cmodel == CM_MEDIUM && code))
44068 return DW_EH_PE_udata4;
44069 return DW_EH_PE_absptr;
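/* Editor's example: 64-bit -fPIC code with the default small code model
   gets a pc-relative sdata4 encoding (DW_EH_PE_pcrel | DW_EH_PE_sdata4),
   plus DW_EH_PE_indirect when the symbol may be dynamically relocated;
   non-PIC small-model code (and code labels under the medium model) get
   udata4, and everything else a plain absolute pointer.  */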
44072 /* Expand copysign from SIGN to the positive value ABS_VALUE
44073 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44074 the sign-bit. */
44075 static void
44076 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44078 machine_mode mode = GET_MODE (sign);
44079 rtx sgn = gen_reg_rtx (mode);
44080 if (mask == NULL_RTX)
44082 machine_mode vmode;
44084 if (mode == SFmode)
44085 vmode = V4SFmode;
44086 else if (mode == DFmode)
44087 vmode = V2DFmode;
44088 else
44089 vmode = mode;
44091 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44092 if (!VECTOR_MODE_P (mode))
44094 /* We need to generate a scalar mode mask in this case. */
44095 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44096 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44097 mask = gen_reg_rtx (mode);
44098 emit_insn (gen_rtx_SET (mask, tmp));
44101 else
44102 mask = gen_rtx_NOT (mode, mask);
44103 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44104 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
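/* Editor's sketch (scalar view of the DFmode bit pattern):

     result = abs_value | (sign & 0x8000000000000000ull);

   When MASK comes from ix86_expand_sse_fabs it holds the complement
   0x7fff...ffff, which is why it is inverted before the AND above.  */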
44107 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44108 mask for masking out the sign-bit is stored in *SMASK, if that is
44109 non-null. */
44110 static rtx
44111 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44113 machine_mode vmode, mode = GET_MODE (op0);
44114 rtx xa, mask;
44116 xa = gen_reg_rtx (mode);
44117 if (mode == SFmode)
44118 vmode = V4SFmode;
44119 else if (mode == DFmode)
44120 vmode = V2DFmode;
44121 else
44122 vmode = mode;
44123 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44124 if (!VECTOR_MODE_P (mode))
44126 /* We need to generate a scalar mode mask in this case. */
44127 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44128 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44129 mask = gen_reg_rtx (mode);
44130 emit_insn (gen_rtx_SET (mask, tmp));
44132 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44134 if (smask)
44135 *smask = mask;
44137 return xa;
44140 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44141 swapping the operands if SWAP_OPERANDS is true. The expanded
44142 code is a forward jump to a newly created label in case the
44143 comparison is true. The generated label rtx is returned. */
44144 static rtx_code_label *
44145 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44146 bool swap_operands)
44148 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
44149 rtx_code_label *label;
44150 rtx tmp;
44152 if (swap_operands)
44153 std::swap (op0, op1);
44155 label = gen_label_rtx ();
44156 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
44157 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
44158 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
44159 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44160 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44161 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44162 JUMP_LABEL (tmp) = label;
44164 return label;
44167 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44168 using comparison code CODE. Operands are swapped for the comparison if
44169 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44170 static rtx
44171 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44172 bool swap_operands)
44174 rtx (*insn)(rtx, rtx, rtx, rtx);
44175 machine_mode mode = GET_MODE (op0);
44176 rtx mask = gen_reg_rtx (mode);
44178 if (swap_operands)
44179 std::swap (op0, op1);
44181 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44183 emit_insn (insn (mask, op0, op1,
44184 gen_rtx_fmt_ee (code, mode, op0, op1)));
44185 return mask;
44188 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44189 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44190 static rtx
44191 ix86_gen_TWO52 (machine_mode mode)
44193 REAL_VALUE_TYPE TWO52r;
44194 rtx TWO52;
44196 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44197 TWO52 = const_double_from_real_value (TWO52r, mode);
44198 TWO52 = force_reg (mode, TWO52);
44200 return TWO52;
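/* Editor's note: adding and then subtracting 2**52 (2**23 for SFmode)
   pushes the value through a representation with no fractional bits, so
   the pair of operations rounds to an integer in the current rounding
   mode, e.g. in round-to-nearest double arithmetic

       3.7 + 4503599627370496.0 = 4503599627370500.0
           - 4503599627370496.0 = 4.0

   The callers guard with isless (xa, TWO52) because any magnitude of
   2**52 or more is already integral.  */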
44203 /* Expand SSE sequence for computing lround from OP1 storing
44204 into OP0. */
44205 void
44206 ix86_expand_lround (rtx op0, rtx op1)
44208 /* C code for the stuff we're doing below:
44209 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44210 return (long)tmp;
44212 machine_mode mode = GET_MODE (op1);
44213 const struct real_format *fmt;
44214 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44215 rtx adj;
44217 /* load nextafter (0.5, 0.0) */
44218 fmt = REAL_MODE_FORMAT (mode);
44219 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44220 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44222 /* adj = copysign (0.5, op1) */
44223 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44224 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44226 /* adj = op1 + adj */
44227 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44229 /* op0 = (imode)adj */
44230 expand_fix (op0, adj, 0);
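/* Editor's note on the constant: using nextafter (0.5, 0.0), i.e.
   0.5 - 2**-54 for DFmode, avoids a double-rounding error.  For the
   largest double below 0.5, x = 0.49999999999999994:

       x + 0.5  rounds up to 1.0, so lround would wrongly return 1;
       x + 0.49999999999999994 = 0.9999999999999999, which converts to 0.

   Exact halfway cases such as 0.5 or 2.5 still round up to the next
   integer when the constant is added, matching lround's round-half-away
   behaviour.  */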
44233 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
44234 DO_FLOOR) from OPERAND1, storing the result into OPERAND0. */
44235 void
44236 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44238 /* C code for the stuff we're doing below (for do_floor):
44239 xi = (long)op1;
44240 xi -= (double)xi > op1 ? 1 : 0;
44241 return xi;
44243 machine_mode fmode = GET_MODE (op1);
44244 machine_mode imode = GET_MODE (op0);
44245 rtx ireg, freg, tmp;
44246 rtx_code_label *label;
44248 /* reg = (long)op1 */
44249 ireg = gen_reg_rtx (imode);
44250 expand_fix (ireg, op1, 0);
44252 /* freg = (double)reg */
44253 freg = gen_reg_rtx (fmode);
44254 expand_float (freg, ireg, 0);
44256 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44257 label = ix86_expand_sse_compare_and_jump (UNLE,
44258 freg, op1, !do_floor);
44259 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44260 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44261 emit_move_insn (ireg, tmp);
44263 emit_label (label);
44264 LABEL_NUSES (label) = 1;
44266 emit_move_insn (op0, ireg);
44269 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
44270 result in OPERAND0. */
44271 void
44272 ix86_expand_rint (rtx operand0, rtx operand1)
44274 /* C code for the stuff we're doing below:
44275 xa = fabs (operand1);
44276 if (!isless (xa, 2**52))
44277 return operand1;
44278 xa = xa + 2**52 - 2**52;
44279 return copysign (xa, operand1);
44281 machine_mode mode = GET_MODE (operand0);
44282 rtx res, xa, TWO52, mask;
44283 rtx_code_label *label;
44285 res = gen_reg_rtx (mode);
44286 emit_move_insn (res, operand1);
44288 /* xa = abs (operand1) */
44289 xa = ix86_expand_sse_fabs (res, &mask);
44291 /* if (!isless (xa, TWO52)) goto label; */
44292 TWO52 = ix86_gen_TWO52 (mode);
44293 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44295 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44296 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44298 ix86_sse_copysign_to_positive (res, xa, res, mask);
44300 emit_label (label);
44301 LABEL_NUSES (label) = 1;
44303 emit_move_insn (operand0, res);
44306 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44307 into OPERAND0. */
44308 void
44309 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44311 /* C code for the stuff we expand below.
44312 double xa = fabs (x), x2;
44313 if (!isless (xa, TWO52))
44314 return x;
44315 xa = xa + TWO52 - TWO52;
44316 x2 = copysign (xa, x);
44317 Compensate. Floor:
44318 if (x2 > x)
44319 x2 -= 1;
44320 Compensate. Ceil:
44321 if (x2 < x)
44322 x2 -= -1;
44323 return x2;
44325 machine_mode mode = GET_MODE (operand0);
44326 rtx xa, TWO52, tmp, one, res, mask;
44327 rtx_code_label *label;
44329 TWO52 = ix86_gen_TWO52 (mode);
44331 /* Temporary for holding the result, initialized to the input
44332 operand to ease control flow. */
44333 res = gen_reg_rtx (mode);
44334 emit_move_insn (res, operand1);
44336 /* xa = abs (operand1) */
44337 xa = ix86_expand_sse_fabs (res, &mask);
44339 /* if (!isless (xa, TWO52)) goto label; */
44340 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44342 /* xa = xa + TWO52 - TWO52; */
44343 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44344 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44346 /* xa = copysign (xa, operand1) */
44347 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44349 /* generate 1.0 or -1.0 */
44350 one = force_reg (mode,
44351 const_double_from_real_value (do_floor
44352 ? dconst1 : dconstm1, mode));
44354 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44355 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44356 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44357 /* We always need to subtract here to preserve signed zero. */
44358 tmp = expand_simple_binop (mode, MINUS,
44359 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44360 emit_move_insn (res, tmp);
44362 emit_label (label);
44363 LABEL_NUSES (label) = 1;
44365 emit_move_insn (operand0, res);
44368 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44369 into OPERAND0. */
44370 void
44371 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44373 /* C code for the stuff we expand below.
44374 double xa = fabs (x), x2;
44375 if (!isless (xa, TWO52))
44376 return x;
44377 x2 = (double)(long)x;
44378 Compensate. Floor:
44379 if (x2 > x)
44380 x2 -= 1;
44381 Compensate. Ceil:
44382 if (x2 < x)
44383 x2 += 1;
44384 if (HONOR_SIGNED_ZEROS (mode))
44385 return copysign (x2, x);
44386 return x2;
44388 machine_mode mode = GET_MODE (operand0);
44389 rtx xa, xi, TWO52, tmp, one, res, mask;
44390 rtx_code_label *label;
44392 TWO52 = ix86_gen_TWO52 (mode);
44394 /* Temporary for holding the result, initialized to the input
44395 operand to ease control flow. */
44396 res = gen_reg_rtx (mode);
44397 emit_move_insn (res, operand1);
44399 /* xa = abs (operand1) */
44400 xa = ix86_expand_sse_fabs (res, &mask);
44402 /* if (!isless (xa, TWO52)) goto label; */
44403 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44405 /* xa = (double)(long)x */
44406 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44407 expand_fix (xi, res, 0);
44408 expand_float (xa, xi, 0);
44410 /* generate 1.0 */
44411 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44413 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44414 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44415 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44416 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44417 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44418 emit_move_insn (res, tmp);
44420 if (HONOR_SIGNED_ZEROS (mode))
44421 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44423 emit_label (label);
44424 LABEL_NUSES (label) = 1;
44426 emit_move_insn (operand0, res);
44429 /* Expand SSE sequence for computing round from OPERAND1 storing
44430 into OPERAND0. Sequence that works without relying on DImode truncation
44431 via cvttsd2siq, which is only available on 64-bit targets. */
44432 void
44433 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44435 /* C code for the stuff we expand below.
44436 double xa = fabs (x), xa2, x2;
44437 if (!isless (xa, TWO52))
44438 return x;
44439 Using the absolute value and copying back sign makes
44440 -0.0 -> -0.0 correct.
44441 xa2 = xa + TWO52 - TWO52;
44442 Compensate.
44443 dxa = xa2 - xa;
44444 if (dxa <= -0.5)
44445 xa2 += 1;
44446 else if (dxa > 0.5)
44447 xa2 -= 1;
44448 x2 = copysign (xa2, x);
44449 return x2;
44451 machine_mode mode = GET_MODE (operand0);
44452 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44453 rtx_code_label *label;
44455 TWO52 = ix86_gen_TWO52 (mode);
44457 /* Temporary for holding the result, initialized to the input
44458 operand to ease control flow. */
44459 res = gen_reg_rtx (mode);
44460 emit_move_insn (res, operand1);
44462 /* xa = abs (operand1) */
44463 xa = ix86_expand_sse_fabs (res, &mask);
44465 /* if (!isless (xa, TWO52)) goto label; */
44466 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44468 /* xa2 = xa + TWO52 - TWO52; */
44469 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44470 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44472 /* dxa = xa2 - xa; */
44473 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44475 /* generate 0.5, 1.0 and -0.5 */
44476 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44477 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44478 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44479 0, OPTAB_DIRECT);
44481 /* Compensate. */
44482 tmp = gen_reg_rtx (mode);
44483 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44484 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44485 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44486 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44487 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44488 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44489 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44490 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44492 /* res = copysign (xa2, operand1) */
44493 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44495 emit_label (label);
44496 LABEL_NUSES (label) = 1;
44498 emit_move_insn (operand0, res);
44501 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44502 into OPERAND0. */
44503 void
44504 ix86_expand_trunc (rtx operand0, rtx operand1)
44506 /* C code for SSE variant we expand below.
44507 double xa = fabs (x), x2;
44508 if (!isless (xa, TWO52))
44509 return x;
44510 x2 = (double)(long)x;
44511 if (HONOR_SIGNED_ZEROS (mode))
44512 return copysign (x2, x);
44513 return x2;
44515 machine_mode mode = GET_MODE (operand0);
44516 rtx xa, xi, TWO52, res, mask;
44517 rtx_code_label *label;
44519 TWO52 = ix86_gen_TWO52 (mode);
44521 /* Temporary for holding the result, initialized to the input
44522 operand to ease control flow. */
44523 res = gen_reg_rtx (mode);
44524 emit_move_insn (res, operand1);
44526 /* xa = abs (operand1) */
44527 xa = ix86_expand_sse_fabs (res, &mask);
44529 /* if (!isless (xa, TWO52)) goto label; */
44530 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44532 /* x = (double)(long)x */
44533 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44534 expand_fix (xi, res, 0);
44535 expand_float (res, xi, 0);
44537 if (HONOR_SIGNED_ZEROS (mode))
44538 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44540 emit_label (label);
44541 LABEL_NUSES (label) = 1;
44543 emit_move_insn (operand0, res);
44546 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44547 into OPERAND0, without relying on DImode truncation (for 32-bit targets). */
44548 void
44549 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44551 machine_mode mode = GET_MODE (operand0);
44552 rtx xa, mask, TWO52, one, res, smask, tmp;
44553 rtx_code_label *label;
44555 /* C code for SSE variant we expand below.
44556 double xa = fabs (x), x2;
44557 if (!isless (xa, TWO52))
44558 return x;
44559 xa2 = xa + TWO52 - TWO52;
44560 Compensate:
44561 if (xa2 > xa)
44562 xa2 -= 1.0;
44563 x2 = copysign (xa2, x);
44564 return x2;
44567 TWO52 = ix86_gen_TWO52 (mode);
44569 /* Temporary for holding the result, initialized to the input
44570 operand to ease control flow. */
44571 res = gen_reg_rtx (mode);
44572 emit_move_insn (res, operand1);
44574 /* xa = abs (operand1) */
44575 xa = ix86_expand_sse_fabs (res, &smask);
44577 /* if (!isless (xa, TWO52)) goto label; */
44578 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44580 /* res = xa + TWO52 - TWO52; */
44581 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44582 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44583 emit_move_insn (res, tmp);
44585 /* generate 1.0 */
44586 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44588 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44589 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44590 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44591 tmp = expand_simple_binop (mode, MINUS,
44592 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44593 emit_move_insn (res, tmp);
44595 /* res = copysign (res, operand1) */
44596 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44598 emit_label (label);
44599 LABEL_NUSES (label) = 1;
44601 emit_move_insn (operand0, res);
44604 /* Expand SSE sequence for computing round from OPERAND1 storing
44605 into OPERAND0. */
44606 void
44607 ix86_expand_round (rtx operand0, rtx operand1)
44609 /* C code for the stuff we're doing below:
44610 double xa = fabs (x);
44611 if (!isless (xa, TWO52))
44612 return x;
44613 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44614 return copysign (xa, x);
44616 machine_mode mode = GET_MODE (operand0);
44617 rtx res, TWO52, xa, xi, half, mask;
44618 rtx_code_label *label;
44619 const struct real_format *fmt;
44620 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44622 /* Temporary for holding the result, initialized to the input
44623 operand to ease control flow. */
44624 res = gen_reg_rtx (mode);
44625 emit_move_insn (res, operand1);
44627 TWO52 = ix86_gen_TWO52 (mode);
44628 xa = ix86_expand_sse_fabs (res, &mask);
44629 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44631 /* load nextafter (0.5, 0.0) */
44632 fmt = REAL_MODE_FORMAT (mode);
44633 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44634 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44636 /* xa = xa + 0.5 */
44637 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44638 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44640 /* xa = (double)(int64_t)xa */
44641 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44642 expand_fix (xi, xa, 0);
44643 expand_float (xa, xi, 0);
44645 /* res = copysign (xa, operand1) */
44646 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44648 emit_label (label);
44649 LABEL_NUSES (label) = 1;
44651 emit_move_insn (operand0, res);
44654 /* Expand SSE sequence for computing round
44655 from OP1 storing into OP0 using sse4 round insn. */
44656 void
44657 ix86_expand_round_sse4 (rtx op0, rtx op1)
44659 machine_mode mode = GET_MODE (op0);
44660 rtx e1, e2, res, half;
44661 const struct real_format *fmt;
44662 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44663 rtx (*gen_copysign) (rtx, rtx, rtx);
44664 rtx (*gen_round) (rtx, rtx, rtx);
44666 switch (mode)
44668 case SFmode:
44669 gen_copysign = gen_copysignsf3;
44670 gen_round = gen_sse4_1_roundsf2;
44671 break;
44672 case DFmode:
44673 gen_copysign = gen_copysigndf3;
44674 gen_round = gen_sse4_1_rounddf2;
44675 break;
44676 default:
44677 gcc_unreachable ();
44680 /* round (a) = trunc (a + copysign (0.5, a)) */
44682 /* load nextafter (0.5, 0.0) */
44683 fmt = REAL_MODE_FORMAT (mode);
44684 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44685 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44686 half = const_double_from_real_value (pred_half, mode);
44688 /* e1 = copysign (0.5, op1) */
44689 e1 = gen_reg_rtx (mode);
44690 emit_insn (gen_copysign (e1, half, op1));
44692 /* e2 = op1 + e1 */
44693 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44695 /* res = trunc (e2) */
44696 res = gen_reg_rtx (mode);
44697 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44699 emit_move_insn (op0, res);
44703 /* Table of valid machine attributes. */
44704 static const struct attribute_spec ix86_attribute_table[] =
44706 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44707 affects_type_identity } */
44708 /* Stdcall attribute says callee is responsible for popping arguments
44709 if they are not variable. */
44710 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44711 true },
44712 /* Fastcall attribute says callee is responsible for popping arguments
44713 if they are not variable. */
44714 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44715 true },
44716 /* Thiscall attribute says callee is responsible for popping arguments
44717 if they are not variable. */
44718 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44719 true },
44720 /* Cdecl attribute says the callee is a normal C declaration */
44721 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44722 true },
44723 /* Regparm attribute specifies how many integer arguments are to be
44724 passed in registers. */
44725 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44726 true },
44727 /* Sseregparm attribute says we are using x86_64 calling conventions
44728 for FP arguments. */
44729 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44730 true },
44731 /* The transactional memory builtins are implicitly regparm or fastcall
44732 depending on the ABI. Override the generic do-nothing attribute that
44733 these builtins were declared with. */
44734 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44735 true },
44736 /* force_align_arg_pointer says this function realigns the stack at entry. */
44737 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44738 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
44739 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44740 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
44741 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
44742 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44743 false },
44744 #endif
44745 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44746 false },
44747 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44748 false },
44749 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44750 SUBTARGET_ATTRIBUTE_TABLE,
44751 #endif
44752 /* ms_abi and sysv_abi calling convention function attributes. */
44753 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44754 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44755 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
44756 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
44757 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
44758 false },
44759 { "callee_pop_aggregate_return", 1, 1, false, true, true,
44760 ix86_handle_callee_pop_aggregate_return, true },
44761 { "interrupt", 0, 0, false, true, true,
44762 ix86_handle_interrupt_attribute, false },
44763 { "no_caller_saved_registers", 0, 0, false, true, true,
44764 ix86_handle_no_caller_saved_registers_attribute, false },
44766 /* End element. */
44767 { NULL, 0, 0, false, false, false, NULL, false }
44770 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44771 static int
44772 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44773 tree vectype, int)
44775 switch (type_of_cost)
44777 case scalar_stmt:
44778 return ix86_cost->scalar_stmt_cost;
44780 case scalar_load:
44781 return ix86_cost->scalar_load_cost;
44783 case scalar_store:
44784 return ix86_cost->scalar_store_cost;
44786 case vector_stmt:
44787 return ix86_cost->vec_stmt_cost;
44789 case vector_load:
44790 return ix86_cost->vec_align_load_cost;
44792 case vector_store:
44793 return ix86_cost->vec_store_cost;
44795 case vec_to_scalar:
44796 return ix86_cost->vec_to_scalar_cost;
44798 case scalar_to_vec:
44799 return ix86_cost->scalar_to_vec_cost;
44801 case unaligned_load:
44802 case unaligned_store:
44803 return ix86_cost->vec_unalign_load_cost;
44805 case cond_branch_taken:
44806 return ix86_cost->cond_taken_branch_cost;
44808 case cond_branch_not_taken:
44809 return ix86_cost->cond_not_taken_branch_cost;
44811 case vec_perm:
44812 case vec_promote_demote:
44813 return ix86_cost->vec_stmt_cost;
44815 case vec_construct:
44816 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
44818 default:
44819 gcc_unreachable ();
44823 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44824 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44825 insn every time. */
44827 static GTY(()) rtx_insn *vselect_insn;
44829 /* Initialize vselect_insn. */
44831 static void
44832 init_vselect_insn (void)
44834 unsigned i;
44835 rtx x;
44837 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44838 for (i = 0; i < MAX_VECT_LEN; ++i)
44839 XVECEXP (x, 0, i) = const0_rtx;
44840 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44841 const0_rtx), x);
44842 x = gen_rtx_SET (const0_rtx, x);
44843 start_sequence ();
44844 vselect_insn = emit_insn (x);
44845 end_sequence ();
44848 /* Construct (set target (vec_select op0 (parallel perm))) and
44849 return true if that's a valid instruction in the active ISA. */
44851 static bool
44852 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44853 unsigned nelt, bool testing_p)
44855 unsigned int i;
44856 rtx x, save_vconcat;
44857 int icode;
44859 if (vselect_insn == NULL_RTX)
44860 init_vselect_insn ();
44862 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44863 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44864 for (i = 0; i < nelt; ++i)
44865 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44866 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44867 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44868 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44869 SET_DEST (PATTERN (vselect_insn)) = target;
44870 icode = recog_memoized (vselect_insn);
44872 if (icode >= 0 && !testing_p)
44873 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44875 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44876 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44877 INSN_CODE (vselect_insn) = -1;
44879 return icode >= 0;
44882 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44884 static bool
44885 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44886 const unsigned char *perm, unsigned nelt,
44887 bool testing_p)
44889 machine_mode v2mode;
44890 rtx x;
44891 bool ok;
44893 if (vselect_insn == NULL_RTX)
44894 init_vselect_insn ();
44896 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
44897 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44898 PUT_MODE (x, v2mode);
44899 XEXP (x, 0) = op0;
44900 XEXP (x, 1) = op1;
44901 ok = expand_vselect (target, x, perm, nelt, testing_p);
44902 XEXP (x, 0) = const0_rtx;
44903 XEXP (x, 1) = const0_rtx;
44904 return ok;
44907 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44908 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44910 static bool
44911 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44913 machine_mode mmode, vmode = d->vmode;
44914 unsigned i, mask, nelt = d->nelt;
44915 rtx target, op0, op1, maskop, x;
44916 rtx rperm[32], vperm;
44918 if (d->one_operand_p)
44919 return false;
44920 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44921 && (TARGET_AVX512BW
44922 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44924 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44926 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
44928 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
44930 else
44931 return false;
44933 /* This is a blend, not a permute. Elements must stay in their
44934 respective lanes. */
44935 for (i = 0; i < nelt; ++i)
44937 unsigned e = d->perm[i];
44938 if (!(e == i || e == i + nelt))
44939 return false;
44942 if (d->testing_p)
44943 return true;
44945 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
44946 decision should be extracted elsewhere, so that we only try that
44947 sequence once all budget==3 options have been tried. */
44948 target = d->target;
44949 op0 = d->op0;
44950 op1 = d->op1;
44951 mask = 0;
44953 switch (vmode)
44955 case V8DFmode:
44956 case V16SFmode:
44957 case V4DFmode:
44958 case V8SFmode:
44959 case V2DFmode:
44960 case V4SFmode:
44961 case V8HImode:
44962 case V8SImode:
44963 case V32HImode:
44964 case V64QImode:
44965 case V16SImode:
44966 case V8DImode:
44967 for (i = 0; i < nelt; ++i)
44968 mask |= (d->perm[i] >= nelt) << i;
44969 break;
44971 case V2DImode:
44972 for (i = 0; i < 2; ++i)
44973 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
44974 vmode = V8HImode;
44975 goto do_subreg;
44977 case V4SImode:
44978 for (i = 0; i < 4; ++i)
44979 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
44980 vmode = V8HImode;
44981 goto do_subreg;
44983 case V16QImode:
44984 /* See if bytes move in pairs so we can use pblendw with
44985 an immediate argument, rather than pblendvb with a vector
44986 argument. */
44987 for (i = 0; i < 16; i += 2)
44988 if (d->perm[i] + 1 != d->perm[i + 1])
44990 use_pblendvb:
44991 for (i = 0; i < nelt; ++i)
44992 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
44994 finish_pblendvb:
44995 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
44996 vperm = force_reg (vmode, vperm);
44998 if (GET_MODE_SIZE (vmode) == 16)
44999 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45000 else
45001 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45002 if (target != d->target)
45003 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45004 return true;
45007 for (i = 0; i < 8; ++i)
45008 mask |= (d->perm[i * 2] >= 16) << i;
45009 vmode = V8HImode;
45010 /* FALLTHRU */
45012 do_subreg:
45013 target = gen_reg_rtx (vmode);
45014 op0 = gen_lowpart (vmode, op0);
45015 op1 = gen_lowpart (vmode, op1);
45016 break;
45018 case V32QImode:
45019 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45020 for (i = 0; i < 32; i += 2)
45021 if (d->perm[i] + 1 != d->perm[i + 1])
45022 goto use_pblendvb;
45023 /* See if bytes move in quadruplets. If yes, vpblendd
45024 with immediate can be used. */
45025 for (i = 0; i < 32; i += 4)
45026 if (d->perm[i] + 2 != d->perm[i + 2])
45027 break;
45028 if (i < 32)
45030 /* See if bytes move the same in both lanes. If yes,
45031 vpblendw with immediate can be used. */
45032 for (i = 0; i < 16; i += 2)
45033 if (d->perm[i] + 16 != d->perm[i + 16])
45034 goto use_pblendvb;
45036 /* Use vpblendw. */
45037 for (i = 0; i < 16; ++i)
45038 mask |= (d->perm[i * 2] >= 32) << i;
45039 vmode = V16HImode;
45040 goto do_subreg;
45043 /* Use vpblendd. */
45044 for (i = 0; i < 8; ++i)
45045 mask |= (d->perm[i * 4] >= 32) << i;
45046 vmode = V8SImode;
45047 goto do_subreg;
45049 case V16HImode:
45050 /* See if words move in pairs. If yes, vpblendd can be used. */
45051 for (i = 0; i < 16; i += 2)
45052 if (d->perm[i] + 1 != d->perm[i + 1])
45053 break;
45054 if (i < 16)
45056 /* See if words move the same in both lanes. If not,
45057 vpblendvb must be used. */
45058 for (i = 0; i < 8; i++)
45059 if (d->perm[i] + 8 != d->perm[i + 8])
45061 /* Use vpblendvb. */
45062 for (i = 0; i < 32; ++i)
45063 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45065 vmode = V32QImode;
45066 nelt = 32;
45067 target = gen_reg_rtx (vmode);
45068 op0 = gen_lowpart (vmode, op0);
45069 op1 = gen_lowpart (vmode, op1);
45070 goto finish_pblendvb;
45073 /* Use vpblendw. */
45074 for (i = 0; i < 16; ++i)
45075 mask |= (d->perm[i] >= 16) << i;
45076 break;
45079 /* Use vpblendd. */
45080 for (i = 0; i < 8; ++i)
45081 mask |= (d->perm[i * 2] >= 16) << i;
45082 vmode = V8SImode;
45083 goto do_subreg;
45085 case V4DImode:
45086 /* Use vpblendd. */
45087 for (i = 0; i < 4; ++i)
45088 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45089 vmode = V8SImode;
45090 goto do_subreg;
45092 default:
45093 gcc_unreachable ();
45096 switch (vmode)
45098 case V8DFmode:
45099 case V8DImode:
45100 mmode = QImode;
45101 break;
45102 case V16SFmode:
45103 case V16SImode:
45104 mmode = HImode;
45105 break;
45106 case V32HImode:
45107 mmode = SImode;
45108 break;
45109 case V64QImode:
45110 mmode = DImode;
45111 break;
45112 default:
45113 mmode = VOIDmode;
45116 if (mmode != VOIDmode)
45117 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45118 else
45119 maskop = GEN_INT (mask);
45121 /* This matches five different patterns with the different modes. */
45122 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45123 x = gen_rtx_SET (target, x);
45124 emit_insn (x);
45125 if (target != d->target)
45126 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45128 return true;
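/* Editor's worked example: a V4SImode blend with perm = { 0, 5, 2, 7 }
   takes elements 1 and 3 from op1, so the V4SImode case above builds
   mask = (3 << 2) | (3 << 6) = 0xcc, retypes the operands as V8HImode,
   and lets a single pblendw with that immediate perform the blend.  */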
45131 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45132 in terms of the variable form of vpermilps.
45134 Note that we will have already failed the immediate input vpermilps,
45135 which requires that the high and low part shuffle be identical; the
45136 variable form doesn't require that. */
45138 static bool
45139 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45141 rtx rperm[8], vperm;
45142 unsigned i;
45144 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45145 return false;
45147 /* We can only permute within the 128-bit lane. */
45148 for (i = 0; i < 8; ++i)
45150 unsigned e = d->perm[i];
45151 if (i < 4 ? e >= 4 : e < 4)
45152 return false;
45155 if (d->testing_p)
45156 return true;
45158 for (i = 0; i < 8; ++i)
45160 unsigned e = d->perm[i];
45162 /* Within each 128-bit lane, the elements of op0 are numbered
45163 from 0 and the elements of op1 are numbered from 4. */
45164 if (e >= 8 + 4)
45165 e -= 8;
45166 else if (e >= 4)
45167 e -= 4;
45169 rperm[i] = GEN_INT (e);
45172 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45173 vperm = force_reg (V8SImode, vperm);
45174 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45176 return true;
45179 /* Return true if permutation D can be performed as VMODE permutation
45180 instead. */
45182 static bool
45183 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45185 unsigned int i, j, chunk;
45187 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45188 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45189 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45190 return false;
45192 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45193 return true;
45195 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45196 for (i = 0; i < d->nelt; i += chunk)
45197 if (d->perm[i] & (chunk - 1))
45198 return false;
45199 else
45200 for (j = 1; j < chunk; ++j)
45201 if (d->perm[i] + j != d->perm[i + j])
45202 return false;
45204 return true;
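/* Editor's example: a V16QImode permutation such as
   { 2, 3, 0, 1, 6, 7, 4, 5, ... } moves bytes in aligned pairs, so the
   chunk test above also accepts it as the V8HImode permutation
   { 1, 0, 3, 2, ... }; any permutation that splits such a pair is
   rejected for the wider mode.  */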
45207 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45208 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45210 static bool
45211 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45213 unsigned i, nelt, eltsz, mask;
45214 unsigned char perm[64];
45215 machine_mode vmode = V16QImode;
45216 rtx rperm[64], vperm, target, op0, op1;
45218 nelt = d->nelt;
45220 if (!d->one_operand_p)
45222 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45224 if (TARGET_AVX2
45225 && valid_perm_using_mode_p (V2TImode, d))
45227 if (d->testing_p)
45228 return true;
45230 /* Use vperm2i128 insn. The pattern uses
45231 V4DImode instead of V2TImode. */
45232 target = d->target;
45233 if (d->vmode != V4DImode)
45234 target = gen_reg_rtx (V4DImode);
45235 op0 = gen_lowpart (V4DImode, d->op0);
45236 op1 = gen_lowpart (V4DImode, d->op1);
45237 rperm[0]
45238 = GEN_INT ((d->perm[0] / (nelt / 2))
45239 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45240 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45241 if (target != d->target)
45242 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45243 return true;
45245 return false;
45248 else
45250 if (GET_MODE_SIZE (d->vmode) == 16)
45252 if (!TARGET_SSSE3)
45253 return false;
45255 else if (GET_MODE_SIZE (d->vmode) == 32)
45257 if (!TARGET_AVX2)
45258 return false;
45260 /* V4DImode should be already handled through
45261 expand_vselect by vpermq instruction. */
45262 gcc_assert (d->vmode != V4DImode);
45264 vmode = V32QImode;
45265 if (d->vmode == V8SImode
45266 || d->vmode == V16HImode
45267 || d->vmode == V32QImode)
45269 /* First see if vpermq can be used for
45270 V8SImode/V16HImode/V32QImode. */
45271 if (valid_perm_using_mode_p (V4DImode, d))
45273 for (i = 0; i < 4; i++)
45274 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45275 if (d->testing_p)
45276 return true;
45277 target = gen_reg_rtx (V4DImode);
45278 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45279 perm, 4, false))
45281 emit_move_insn (d->target,
45282 gen_lowpart (d->vmode, target));
45283 return true;
45285 return false;
45288 /* Next see if vpermd can be used. */
45289 if (valid_perm_using_mode_p (V8SImode, d))
45290 vmode = V8SImode;
45292 /* Or if vpermps can be used. */
45293 else if (d->vmode == V8SFmode)
45294 vmode = V8SImode;
45296 if (vmode == V32QImode)
45298 /* vpshufb only works within 128-bit lanes; it is not
45299 possible to shuffle bytes between the lanes. */
45300 for (i = 0; i < nelt; ++i)
45301 if ((d->perm[i] ^ i) & (nelt / 2))
45302 return false;
45305 else if (GET_MODE_SIZE (d->vmode) == 64)
45307 if (!TARGET_AVX512BW)
45308 return false;
45310 /* If vpermq didn't work, vpshufb won't work either. */
45311 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45312 return false;
45314 vmode = V64QImode;
45315 if (d->vmode == V16SImode
45316 || d->vmode == V32HImode
45317 || d->vmode == V64QImode)
45319 /* First see if vpermq can be used for
45320 V16SImode/V32HImode/V64QImode. */
45321 if (valid_perm_using_mode_p (V8DImode, d))
45323 for (i = 0; i < 8; i++)
45324 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45325 if (d->testing_p)
45326 return true;
45327 target = gen_reg_rtx (V8DImode);
45328 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45329 perm, 8, false))
45331 emit_move_insn (d->target,
45332 gen_lowpart (d->vmode, target));
45333 return true;
45335 return false;
45338 /* Next see if vpermd can be used. */
45339 if (valid_perm_using_mode_p (V16SImode, d))
45340 vmode = V16SImode;
45342 /* Or if vpermps can be used. */
45343 else if (d->vmode == V16SFmode)
45344 vmode = V16SImode;
45345 if (vmode == V64QImode)
45347 /* vpshufb only works within 128-bit lanes; it is not
45348 possible to shuffle bytes between the lanes. */
45349 for (i = 0; i < nelt; ++i)
45350 if ((d->perm[i] ^ i) & (nelt / 4))
45351 return false;
45354 else
45355 return false;
45358 if (d->testing_p)
45359 return true;
45361 if (vmode == V8SImode)
45362 for (i = 0; i < 8; ++i)
45363 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45364 else if (vmode == V16SImode)
45365 for (i = 0; i < 16; ++i)
45366 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45367 else
45369 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45370 if (!d->one_operand_p)
45371 mask = 2 * nelt - 1;
45372 else if (vmode == V16QImode)
45373 mask = nelt - 1;
45374 else if (vmode == V64QImode)
45375 mask = nelt / 4 - 1;
45376 else
45377 mask = nelt / 2 - 1;
45379 for (i = 0; i < nelt; ++i)
45381 unsigned j, e = d->perm[i] & mask;
45382 for (j = 0; j < eltsz; ++j)
45383 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45387 vperm = gen_rtx_CONST_VECTOR (vmode,
45388 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45389 vperm = force_reg (vmode, vperm);
45391 target = d->target;
45392 if (d->vmode != vmode)
45393 target = gen_reg_rtx (vmode);
45394 op0 = gen_lowpart (vmode, d->op0);
45395 if (d->one_operand_p)
45397 if (vmode == V16QImode)
45398 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45399 else if (vmode == V32QImode)
45400 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45401 else if (vmode == V64QImode)
45402 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45403 else if (vmode == V8SFmode)
45404 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45405 else if (vmode == V8SImode)
45406 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45407 else if (vmode == V16SFmode)
45408 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45409 else if (vmode == V16SImode)
45410 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45411 else
45412 gcc_unreachable ();
45414 else
45416 op1 = gen_lowpart (vmode, d->op1);
45417 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45419 if (target != d->target)
45420 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45422 return true;
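/* Editor's worked example: reversing one V16QImode operand,
   perm = { 15, 14, ..., 1, 0 }, takes the one-operand 16-byte path above
   and emits a pshufb whose constant control vector is { 15, 14, ..., 0 },
   each control byte selecting source byte perm[i] & 15.  The two-operand
   128-bit case instead needs XOP's vpperm, whose selectors use their low
   five bits to index across both inputs.  */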
45425 /* For V*[QHS]Imode permutations, check whether the same permutation
45426 can instead be performed in a 2x, 4x or 8x wider inner mode. */
45428 static bool
45429 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45430 struct expand_vec_perm_d *nd)
45432 int i;
45433 enum machine_mode mode = VOIDmode;
45435 switch (d->vmode)
45437 case V16QImode: mode = V8HImode; break;
45438 case V32QImode: mode = V16HImode; break;
45439 case V64QImode: mode = V32HImode; break;
45440 case V8HImode: mode = V4SImode; break;
45441 case V16HImode: mode = V8SImode; break;
45442 case V32HImode: mode = V16SImode; break;
45443 case V4SImode: mode = V2DImode; break;
45444 case V8SImode: mode = V4DImode; break;
45445 case V16SImode: mode = V8DImode; break;
45446 default: return false;
45448 for (i = 0; i < d->nelt; i += 2)
45449 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45450 return false;
45451 nd->vmode = mode;
45452 nd->nelt = d->nelt / 2;
45453 for (i = 0; i < nd->nelt; i++)
45454 nd->perm[i] = d->perm[2 * i] / 2;
45455 if (GET_MODE_INNER (mode) != DImode)
45456 canonicalize_vector_int_perm (nd, nd);
45457 if (nd != d)
45459 nd->one_operand_p = d->one_operand_p;
45460 nd->testing_p = d->testing_p;
45461 if (d->op0 == d->op1)
45462 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45463 else
45465 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45466 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45468 if (d->testing_p)
45469 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45470 else
45471 nd->target = gen_reg_rtx (nd->vmode);
45473 return true;
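/* Editor's example: the V16QImode permutation { 8, 9, ..., 15, 0, 1, ..., 7 }
   (a swap of the two 64-bit halves) pairs up at every level, so the
   recursion above rewrites it as V8HImode { 4, 5, 6, 7, 0, 1, 2, 3 }, then
   V4SImode { 2, 3, 0, 1 }, and finally V2DImode { 1, 0 }, which is the
   form the caller retries.  */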
45476 /* Try to expand one-operand permutation with constant mask. */
45478 static bool
45479 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45481 machine_mode mode = GET_MODE (d->op0);
45482 machine_mode maskmode = mode;
45483 rtx (*gen) (rtx, rtx, rtx) = NULL;
45484 rtx target, op0, mask;
45485 rtx vec[64];
45487 if (!rtx_equal_p (d->op0, d->op1))
45488 return false;
45490 if (!TARGET_AVX512F)
45491 return false;
45493 switch (mode)
45495 case V16SImode:
45496 gen = gen_avx512f_permvarv16si;
45497 break;
45498 case V16SFmode:
45499 gen = gen_avx512f_permvarv16sf;
45500 maskmode = V16SImode;
45501 break;
45502 case V8DImode:
45503 gen = gen_avx512f_permvarv8di;
45504 break;
45505 case V8DFmode:
45506 gen = gen_avx512f_permvarv8df;
45507 maskmode = V8DImode;
45508 break;
45509 default:
45510 return false;
45513 target = d->target;
45514 op0 = d->op0;
45515 for (int i = 0; i < d->nelt; ++i)
45516 vec[i] = GEN_INT (d->perm[i]);
45517 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45518 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45519 return true;
45522 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45523 in a single instruction. */
45525 static bool
45526 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45528 unsigned i, nelt = d->nelt;
45529 struct expand_vec_perm_d nd;
45531 /* Check plain VEC_SELECT first, because AVX has instructions that could
45532 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45533 input where SEL+CONCAT may not. */
45534 if (d->one_operand_p)
45536 int mask = nelt - 1;
45537 bool identity_perm = true;
45538 bool broadcast_perm = true;
45540 for (i = 0; i < nelt; i++)
45542 nd.perm[i] = d->perm[i] & mask;
45543 if (nd.perm[i] != i)
45544 identity_perm = false;
45545 if (nd.perm[i])
45546 broadcast_perm = false;
45549 if (identity_perm)
45551 if (!d->testing_p)
45552 emit_move_insn (d->target, d->op0);
45553 return true;
45555 else if (broadcast_perm && TARGET_AVX2)
45557 /* Use vpbroadcast{b,w,d}. */
45558 rtx (*gen) (rtx, rtx) = NULL;
45559 switch (d->vmode)
45561 case V64QImode:
45562 if (TARGET_AVX512BW)
45563 gen = gen_avx512bw_vec_dupv64qi_1;
45564 break;
45565 case V32QImode:
45566 gen = gen_avx2_pbroadcastv32qi_1;
45567 break;
45568 case V32HImode:
45569 if (TARGET_AVX512BW)
45570 gen = gen_avx512bw_vec_dupv32hi_1;
45571 break;
45572 case V16HImode:
45573 gen = gen_avx2_pbroadcastv16hi_1;
45574 break;
45575 case V16SImode:
45576 if (TARGET_AVX512F)
45577 gen = gen_avx512f_vec_dupv16si_1;
45578 break;
45579 case V8SImode:
45580 gen = gen_avx2_pbroadcastv8si_1;
45581 break;
45582 case V16QImode:
45583 gen = gen_avx2_pbroadcastv16qi;
45584 break;
45585 case V8HImode:
45586 gen = gen_avx2_pbroadcastv8hi;
45587 break;
45588 case V16SFmode:
45589 if (TARGET_AVX512F)
45590 gen = gen_avx512f_vec_dupv16sf_1;
45591 break;
45592 case V8SFmode:
45593 gen = gen_avx2_vec_dupv8sf_1;
45594 break;
45595 case V8DFmode:
45596 if (TARGET_AVX512F)
45597 gen = gen_avx512f_vec_dupv8df_1;
45598 break;
45599 case V8DImode:
45600 if (TARGET_AVX512F)
45601 gen = gen_avx512f_vec_dupv8di_1;
45602 break;
45603 /* For other modes prefer other shuffles this function creates. */
45604 default: break;
45606 if (gen != NULL)
45608 if (!d->testing_p)
45609 emit_insn (gen (d->target, d->op0));
45610 return true;
45614 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45615 return true;
45617 /* There are plenty of patterns in sse.md that are written for
45618 SEL+CONCAT and are not replicated for a single op. Perhaps
45619 that should be changed, to avoid the nastiness here. */
45621 /* Recognize interleave style patterns, which means incrementing
45622 every other permutation operand. */
45623 for (i = 0; i < nelt; i += 2)
45625 nd.perm[i] = d->perm[i] & mask;
45626 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45628 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45629 d->testing_p))
45630 return true;
45632 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45633 if (nelt >= 4)
45635 for (i = 0; i < nelt; i += 4)
45637 nd.perm[i + 0] = d->perm[i + 0] & mask;
45638 nd.perm[i + 1] = d->perm[i + 1] & mask;
45639 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45640 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45643 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45644 d->testing_p))
45645 return true;
45649 /* Finally, try the fully general two operand permute. */
45650 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45651 d->testing_p))
45652 return true;
45654 /* Recognize interleave style patterns with reversed operands. */
45655 if (!d->one_operand_p)
45657 for (i = 0; i < nelt; ++i)
45659 unsigned e = d->perm[i];
45660 if (e >= nelt)
45661 e -= nelt;
45662 else
45663 e += nelt;
45664 nd.perm[i] = e;
45667 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45668 d->testing_p))
45669 return true;
45672 /* Try the SSE4.1 blend variable merge instructions. */
45673 if (expand_vec_perm_blend (d))
45674 return true;
45676 /* Try one of the AVX vpermil variable permutations. */
45677 if (expand_vec_perm_vpermil (d))
45678 return true;
45680 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45681 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45682 if (expand_vec_perm_pshufb (d))
45683 return true;
45685 /* Try the AVX2 vpalignr instruction. */
45686 if (expand_vec_perm_palignr (d, true))
45687 return true;
45689 /* Try the AVX512F vperm{s,d} instructions. */
45690 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45691 return true;
45693 /* Try the AVX512F vpermi2 instructions. */
45694 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45695 return true;
45697 /* See if we can get the same permutation in a different vector integer
45698 mode. */
45699 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45701 if (!d->testing_p)
45702 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45703 return true;
45705 return false;
45708 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45709 in terms of a pair of pshuflw + pshufhw instructions. */
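/* For instance, the V8HImode permutation {3,1,2,0, 7,5,6,4} keeps the low
   four and the high four words within their own 64-bit halves, so it splits
   into a pshuflw with {3,1,2,0, 4,5,6,7} followed by a pshufhw with
   {0,1,2,3, 7,5,6,4}, exactly as emitted below.  */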
45711 static bool
45712 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45714 unsigned char perm2[MAX_VECT_LEN];
45715 unsigned i;
45716 bool ok;
45718 if (d->vmode != V8HImode || !d->one_operand_p)
45719 return false;
45721 /* The two permutations only operate in 64-bit lanes. */
45722 for (i = 0; i < 4; ++i)
45723 if (d->perm[i] >= 4)
45724 return false;
45725 for (i = 4; i < 8; ++i)
45726 if (d->perm[i] < 4)
45727 return false;
45729 if (d->testing_p)
45730 return true;
45732 /* Emit the pshuflw. */
45733 memcpy (perm2, d->perm, 4);
45734 for (i = 4; i < 8; ++i)
45735 perm2[i] = i;
45736 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45737 gcc_assert (ok);
45739 /* Emit the pshufhw. */
45740 memcpy (perm2 + 4, d->perm + 4, 4);
45741 for (i = 0; i < 4; ++i)
45742 perm2[i] = i;
45743 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45744 gcc_assert (ok);
45746 return true;
45749 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45750 the permutation using the SSSE3 palignr instruction. This succeeds
45751 when all of the elements in PERM fit within one vector and we merely
45752 need to shift them down so that a single vector permutation has a
45753 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
45754 the vpalignr instruction itself can perform the requested permutation. */
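/* Illustrative example: for a two-operand V16QImode permutation
   {2,3,...,17} (indices 16..31 denote the second operand), min == 2 and
   max == 17, so a single palignr that shifts the concatenated operands
   down by 2 bytes already produces the result (the in_order case below);
   otherwise the shifted vector is handed to a one-operand permutation
   such as pshufb.  */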
45756 static bool
45757 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45759 unsigned i, nelt = d->nelt;
45760 unsigned min, max, minswap, maxswap;
45761 bool in_order, ok, swap = false;
45762 rtx shift, target;
45763 struct expand_vec_perm_d dcopy;
45765 /* Even with AVX, palignr only operates on 128-bit vectors;
45766 in AVX2 palignr operates on both 128-bit lanes. */
45767 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45768 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45769 return false;
45771 min = 2 * nelt;
45772 max = 0;
45773 minswap = 2 * nelt;
45774 maxswap = 0;
45775 for (i = 0; i < nelt; ++i)
45777 unsigned e = d->perm[i];
45778 unsigned eswap = d->perm[i] ^ nelt;
45779 if (GET_MODE_SIZE (d->vmode) == 32)
45781 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45782 eswap = e ^ (nelt / 2);
45784 if (e < min)
45785 min = e;
45786 if (e > max)
45787 max = e;
45788 if (eswap < minswap)
45789 minswap = eswap;
45790 if (eswap > maxswap)
45791 maxswap = eswap;
45793 if (min == 0
45794 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45796 if (d->one_operand_p
45797 || minswap == 0
45798 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45799 ? nelt / 2 : nelt))
45800 return false;
45801 swap = true;
45802 min = minswap;
45803 max = maxswap;
45806 /* Given that we have SSSE3, we know we'll be able to implement the
45807 single operand permutation after the palignr with pshufb for
45808 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45809 first. */
45810 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45811 return true;
45813 dcopy = *d;
45814 if (swap)
45816 dcopy.op0 = d->op1;
45817 dcopy.op1 = d->op0;
45818 for (i = 0; i < nelt; ++i)
45819 dcopy.perm[i] ^= nelt;
45822 in_order = true;
45823 for (i = 0; i < nelt; ++i)
45825 unsigned e = dcopy.perm[i];
45826 if (GET_MODE_SIZE (d->vmode) == 32
45827 && e >= nelt
45828 && (e & (nelt / 2 - 1)) < min)
45829 e = e - min - (nelt / 2);
45830 else
45831 e = e - min;
45832 if (e != i)
45833 in_order = false;
45834 dcopy.perm[i] = e;
45836 dcopy.one_operand_p = true;
45838 if (single_insn_only_p && !in_order)
45839 return false;
45841 /* For AVX2, test whether we can permute the result in one instruction. */
45842 if (d->testing_p)
45844 if (in_order)
45845 return true;
45846 dcopy.op1 = dcopy.op0;
45847 return expand_vec_perm_1 (&dcopy);
45850 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45851 if (GET_MODE_SIZE (d->vmode) == 16)
45853 target = gen_reg_rtx (TImode);
45854 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45855 gen_lowpart (TImode, dcopy.op0), shift));
45857 else
45859 target = gen_reg_rtx (V2TImode);
45860 emit_insn (gen_avx2_palignrv2ti (target,
45861 gen_lowpart (V2TImode, dcopy.op1),
45862 gen_lowpart (V2TImode, dcopy.op0),
45863 shift));
45866 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45868 /* Test for the degenerate case where the alignment by itself
45869 produces the desired permutation. */
45870 if (in_order)
45872 emit_move_insn (d->target, dcopy.op0);
45873 return true;
45876 ok = expand_vec_perm_1 (&dcopy);
45877 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45879 return ok;
45882 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45883 the permutation using the SSE4_1 pblendv instruction. Potentially
45884 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
45886 static bool
45887 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45889 unsigned i, which, nelt = d->nelt;
45890 struct expand_vec_perm_d dcopy, dcopy1;
45891 machine_mode vmode = d->vmode;
45892 bool ok;
45894 /* Use the same checks as in expand_vec_perm_blend. */
45895 if (d->one_operand_p)
45896 return false;
45897 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45899 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45901 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45903 else
45904 return false;
45906 /* Figure out which permutation elements do not stay in their
45907 respective lanes. */
45908 for (i = 0, which = 0; i < nelt; ++i)
45910 unsigned e = d->perm[i];
45911 if (e != i)
45912 which |= (e < nelt ? 1 : 2);
45914 /* We can pblend the part where elements do not stay in their
45915 respective lanes only when these elements all come from the same
45916 operand (one half of the index space).
45917 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
45918 lanes but both 8 and 9 >= 8;
45919 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
45920 respective lanes and 8 >= 8 but 2 is not. */
45921 if (which != 1 && which != 2)
45922 return false;
45923 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45924 return true;
45926 /* First we apply one operand permutation to the part where
45927 elements stay not in their respective lanes. */
45928 dcopy = *d;
45929 if (which == 2)
45930 dcopy.op0 = dcopy.op1 = d->op1;
45931 else
45932 dcopy.op0 = dcopy.op1 = d->op0;
45933 if (!d->testing_p)
45934 dcopy.target = gen_reg_rtx (vmode);
45935 dcopy.one_operand_p = true;
45937 for (i = 0; i < nelt; ++i)
45938 dcopy.perm[i] = d->perm[i] & (nelt - 1);
45940 ok = expand_vec_perm_1 (&dcopy);
45941 if (GET_MODE_SIZE (vmode) != 16 && !ok)
45942 return false;
45943 else
45944 gcc_assert (ok);
45945 if (d->testing_p)
45946 return true;
45948 /* Next we put permuted elements into their positions. */
45949 dcopy1 = *d;
45950 if (which == 2)
45951 dcopy1.op1 = dcopy.target;
45952 else
45953 dcopy1.op0 = dcopy.target;
45955 for (i = 0; i < nelt; ++i)
45956 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
45958 ok = expand_vec_perm_blend (&dcopy1);
45959 gcc_assert (ok);
45961 return true;
45964 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
45966 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45967 a two vector permutation into a single vector permutation by using
45968 an interleave operation to merge the vectors. */
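/* For instance, the two-operand V8HImode permutation {0,1,8,9,2,3,10,11}
   draws only on the low halves of both inputs, so a punpcklwd first
   produces {0,8,1,9,2,10,3,11} and the remaining fix-up is the one-operand
   shuffle {0,2,1,3,4,6,5,7}; this path succeeds only if expand_vec_perm_1
   can emit that fix-up as a single instruction (e.g. with pshufb).  */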
45970 static bool
45971 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
45973 struct expand_vec_perm_d dremap, dfinal;
45974 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
45975 unsigned HOST_WIDE_INT contents;
45976 unsigned char remap[2 * MAX_VECT_LEN];
45977 rtx_insn *seq;
45978 bool ok, same_halves = false;
45980 if (GET_MODE_SIZE (d->vmode) == 16)
45982 if (d->one_operand_p)
45983 return false;
45985 else if (GET_MODE_SIZE (d->vmode) == 32)
45987 if (!TARGET_AVX)
45988 return false;
45989 /* For 32-byte modes allow even d->one_operand_p.
45990 The lack of cross-lane shuffling in some instructions
45991 might prevent a single insn shuffle. */
45992 dfinal = *d;
45993 dfinal.testing_p = true;
45994 /* If expand_vec_perm_interleave3 can expand this into
45995 a 3 insn sequence, give up and let it be expanded as
45996 3 insn sequence. While that is one insn longer,
45997 it doesn't need a memory operand and in the common
45998 case that both interleave low and high permutations
45999 with the same operands are adjacent needs 4 insns
46000 for both after CSE. */
46001 if (expand_vec_perm_interleave3 (&dfinal))
46002 return false;
46004 else
46005 return false;
46007 /* Examine from whence the elements come. */
46008 contents = 0;
46009 for (i = 0; i < nelt; ++i)
46010 contents |= HOST_WIDE_INT_1U << d->perm[i];
46012 memset (remap, 0xff, sizeof (remap));
46013 dremap = *d;
46015 if (GET_MODE_SIZE (d->vmode) == 16)
46017 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46019 /* Split the two input vectors into 4 halves. */
46020 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46021 h2 = h1 << nelt2;
46022 h3 = h2 << nelt2;
46023 h4 = h3 << nelt2;
46025 /* If the elements are from the low halves, use interleave low;
46026 similarly, use interleave high. If the elements are from mis-matched
46027 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
46028 if ((contents & (h1 | h3)) == contents)
46030 /* punpckl* */
46031 for (i = 0; i < nelt2; ++i)
46033 remap[i] = i * 2;
46034 remap[i + nelt] = i * 2 + 1;
46035 dremap.perm[i * 2] = i;
46036 dremap.perm[i * 2 + 1] = i + nelt;
46038 if (!TARGET_SSE2 && d->vmode == V4SImode)
46039 dremap.vmode = V4SFmode;
46041 else if ((contents & (h2 | h4)) == contents)
46043 /* punpckh* */
46044 for (i = 0; i < nelt2; ++i)
46046 remap[i + nelt2] = i * 2;
46047 remap[i + nelt + nelt2] = i * 2 + 1;
46048 dremap.perm[i * 2] = i + nelt2;
46049 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46051 if (!TARGET_SSE2 && d->vmode == V4SImode)
46052 dremap.vmode = V4SFmode;
46054 else if ((contents & (h1 | h4)) == contents)
46056 /* shufps */
46057 for (i = 0; i < nelt2; ++i)
46059 remap[i] = i;
46060 remap[i + nelt + nelt2] = i + nelt2;
46061 dremap.perm[i] = i;
46062 dremap.perm[i + nelt2] = i + nelt + nelt2;
46064 if (nelt != 4)
46066 /* shufpd */
46067 dremap.vmode = V2DImode;
46068 dremap.nelt = 2;
46069 dremap.perm[0] = 0;
46070 dremap.perm[1] = 3;
46073 else if ((contents & (h2 | h3)) == contents)
46075 /* shufps */
46076 for (i = 0; i < nelt2; ++i)
46078 remap[i + nelt2] = i;
46079 remap[i + nelt] = i + nelt2;
46080 dremap.perm[i] = i + nelt2;
46081 dremap.perm[i + nelt2] = i + nelt;
46083 if (nelt != 4)
46085 /* shufpd */
46086 dremap.vmode = V2DImode;
46087 dremap.nelt = 2;
46088 dremap.perm[0] = 1;
46089 dremap.perm[1] = 2;
46092 else
46093 return false;
46095 else
46097 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46098 unsigned HOST_WIDE_INT q[8];
46099 unsigned int nonzero_halves[4];
46101 /* Split the two input vectors into 8 quarters. */
46102 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46103 for (i = 1; i < 8; ++i)
46104 q[i] = q[0] << (nelt4 * i);
46105 for (i = 0; i < 4; ++i)
46106 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46108 nonzero_halves[nzcnt] = i;
46109 ++nzcnt;
46112 if (nzcnt == 1)
46114 gcc_assert (d->one_operand_p);
46115 nonzero_halves[1] = nonzero_halves[0];
46116 same_halves = true;
46118 else if (d->one_operand_p)
46120 gcc_assert (nonzero_halves[0] == 0);
46121 gcc_assert (nonzero_halves[1] == 1);
46124 if (nzcnt <= 2)
46126 if (d->perm[0] / nelt2 == nonzero_halves[1])
46128 /* Attempt to increase the likelihood that dfinal
46129 shuffle will be intra-lane. */
46130 std::swap (nonzero_halves[0], nonzero_halves[1]);
46133 /* vperm2f128 or vperm2i128. */
46134 for (i = 0; i < nelt2; ++i)
46136 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46137 remap[i + nonzero_halves[0] * nelt2] = i;
46138 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46139 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46142 if (d->vmode != V8SFmode
46143 && d->vmode != V4DFmode
46144 && d->vmode != V8SImode)
46146 dremap.vmode = V8SImode;
46147 dremap.nelt = 8;
46148 for (i = 0; i < 4; ++i)
46150 dremap.perm[i] = i + nonzero_halves[0] * 4;
46151 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46155 else if (d->one_operand_p)
46156 return false;
46157 else if (TARGET_AVX2
46158 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46160 /* vpunpckl* */
46161 for (i = 0; i < nelt4; ++i)
46163 remap[i] = i * 2;
46164 remap[i + nelt] = i * 2 + 1;
46165 remap[i + nelt2] = i * 2 + nelt2;
46166 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46167 dremap.perm[i * 2] = i;
46168 dremap.perm[i * 2 + 1] = i + nelt;
46169 dremap.perm[i * 2 + nelt2] = i + nelt2;
46170 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46173 else if (TARGET_AVX2
46174 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46176 /* vpunpckh* */
46177 for (i = 0; i < nelt4; ++i)
46179 remap[i + nelt4] = i * 2;
46180 remap[i + nelt + nelt4] = i * 2 + 1;
46181 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46182 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46183 dremap.perm[i * 2] = i + nelt4;
46184 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46185 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46186 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46189 else
46190 return false;
46193 /* Use the remapping array set up above to move the elements from their
46194 swizzled locations into their final destinations. */
46195 dfinal = *d;
46196 for (i = 0; i < nelt; ++i)
46198 unsigned e = remap[d->perm[i]];
46199 gcc_assert (e < nelt);
46200 /* If same_halves is true, both halves of the remapped vector are the
46201 same. Avoid cross-lane accesses if possible. */
46202 if (same_halves && i >= nelt2)
46204 gcc_assert (e < nelt2);
46205 dfinal.perm[i] = e + nelt2;
46207 else
46208 dfinal.perm[i] = e;
46210 if (!d->testing_p)
46212 dremap.target = gen_reg_rtx (dremap.vmode);
46213 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46215 dfinal.op1 = dfinal.op0;
46216 dfinal.one_operand_p = true;
46218 /* Test if the final remap can be done with a single insn. For V4SFmode or
46219 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46220 start_sequence ();
46221 ok = expand_vec_perm_1 (&dfinal);
46222 seq = get_insns ();
46223 end_sequence ();
46225 if (!ok)
46226 return false;
46228 if (d->testing_p)
46229 return true;
46231 if (dremap.vmode != dfinal.vmode)
46233 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46234 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46237 ok = expand_vec_perm_1 (&dremap);
46238 gcc_assert (ok);
46240 emit_insn (seq);
46241 return true;
46244 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46245 a single vector cross-lane permutation into vpermq followed
46246 by any of the single insn permutations. */
46248 static bool
46249 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46251 struct expand_vec_perm_d dremap, dfinal;
46252 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46253 unsigned contents[2];
46254 bool ok;
46256 if (!(TARGET_AVX2
46257 && (d->vmode == V32QImode || d->vmode == V16HImode)
46258 && d->one_operand_p))
46259 return false;
46261 contents[0] = 0;
46262 contents[1] = 0;
46263 for (i = 0; i < nelt2; ++i)
46265 contents[0] |= 1u << (d->perm[i] / nelt4);
46266 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46269 for (i = 0; i < 2; ++i)
46271 unsigned int cnt = 0;
46272 for (j = 0; j < 4; ++j)
46273 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46274 return false;
46277 if (d->testing_p)
46278 return true;
46280 dremap = *d;
46281 dremap.vmode = V4DImode;
46282 dremap.nelt = 4;
46283 dremap.target = gen_reg_rtx (V4DImode);
46284 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46285 dremap.op1 = dremap.op0;
46286 dremap.one_operand_p = true;
46287 for (i = 0; i < 2; ++i)
46289 unsigned int cnt = 0;
46290 for (j = 0; j < 4; ++j)
46291 if ((contents[i] & (1u << j)) != 0)
46292 dremap.perm[2 * i + cnt++] = j;
46293 for (; cnt < 2; ++cnt)
46294 dremap.perm[2 * i + cnt] = 0;
46297 dfinal = *d;
46298 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46299 dfinal.op1 = dfinal.op0;
46300 dfinal.one_operand_p = true;
46301 for (i = 0, j = 0; i < nelt; ++i)
46303 if (i == nelt2)
46304 j = 2;
46305 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46306 if ((d->perm[i] / nelt4) == dremap.perm[j])
46308 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46309 dfinal.perm[i] |= nelt4;
46310 else
46311 gcc_unreachable ();
46314 ok = expand_vec_perm_1 (&dremap);
46315 gcc_assert (ok);
46317 ok = expand_vec_perm_1 (&dfinal);
46318 gcc_assert (ok);
46320 return true;
46323 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46324 a vector permutation using two instructions, vperm2f128 resp.
46325 vperm2i128 followed by any single in-lane permutation. */
46327 static bool
46328 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46330 struct expand_vec_perm_d dfirst, dsecond;
46331 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46332 bool ok;
46334 if (!TARGET_AVX
46335 || GET_MODE_SIZE (d->vmode) != 32
46336 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46337 return false;
46339 dsecond = *d;
46340 dsecond.one_operand_p = false;
46341 dsecond.testing_p = true;
46343 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46344 immediate. For perm < 16 the second permutation uses
46345 d->op0 as first operand, for perm >= 16 it uses d->op1
46346 as first operand. The second operand is the result of
46347 vperm2[fi]128. */
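/* Assuming the usual vperm2[fi]128 immediate encoding (bits 1:0 select the
   source of the result's low lane, bits 5:4 of its high lane), e.g.
   perm == 7 gives ((7 << 2) | 7) & 0x33 == 0x13: the vperm2[fi]128 result
   takes its low lane from the second operand's high lane and its high lane
   from the first operand's high lane ("67" and "23" in the 0123/4567
   notation of the comments below), and because perm < 16 the second
   shuffle then combines d->op0 with that result.  */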
46348 for (perm = 0; perm < 32; perm++)
46350 /* Ignore permutations which do not move anything cross-lane. */
46351 if (perm < 16)
46353 /* The second shuffle for e.g. V4DFmode has
46354 0123 and ABCD operands.
46355 Ignore AB23, as 23 is already in the second lane
46356 of the first operand. */
46357 if ((perm & 0xc) == (1 << 2)) continue;
46358 /* And 01CD, as 01 is in the first lane of the first
46359 operand. */
46360 if ((perm & 3) == 0) continue;
46361 /* And 4567, as then the vperm2[fi]128 doesn't change
46362 anything on the original 4567 second operand. */
46363 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46365 else
46367 /* The second shuffle for e.g. V4DFmode has
46368 4567 and ABCD operands.
46369 Ignore AB67, as 67 is already in the second lane
46370 of the first operand. */
46371 if ((perm & 0xc) == (3 << 2)) continue;
46372 /* And 45CD, as 45 is in the first lane of the first
46373 operand. */
46374 if ((perm & 3) == 2) continue;
46375 /* And 0123, as then the vperm2[fi]128 doesn't change
46376 anything on the original 0123 first operand. */
46377 if ((perm & 0xf) == (1 << 2)) continue;
46380 for (i = 0; i < nelt; i++)
46382 j = d->perm[i] / nelt2;
46383 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46384 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46385 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46386 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46387 else
46388 break;
46391 if (i == nelt)
46393 start_sequence ();
46394 ok = expand_vec_perm_1 (&dsecond);
46395 end_sequence ();
46397 else
46398 ok = false;
46400 if (ok)
46402 if (d->testing_p)
46403 return true;
46405 /* Found a usable second shuffle. dfirst will be
46406 vperm2f128 on d->op0 and d->op1. */
46407 dsecond.testing_p = false;
46408 dfirst = *d;
46409 dfirst.target = gen_reg_rtx (d->vmode);
46410 for (i = 0; i < nelt; i++)
46411 dfirst.perm[i] = (i & (nelt2 - 1))
46412 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46414 canonicalize_perm (&dfirst);
46415 ok = expand_vec_perm_1 (&dfirst);
46416 gcc_assert (ok);
46418 /* And dsecond is some single insn shuffle, taking
46419 d->op0 and result of vperm2f128 (if perm < 16) or
46420 d->op1 and result of vperm2f128 (otherwise). */
46421 if (perm >= 16)
46422 dsecond.op0 = dsecond.op1;
46423 dsecond.op1 = dfirst.target;
46425 ok = expand_vec_perm_1 (&dsecond);
46426 gcc_assert (ok);
46428 return true;
46431 /* For one operand, the only useful vperm2f128 permutation is 0x01
46432 aka lanes swap. */
46433 if (d->one_operand_p)
46434 return false;
46437 return false;
46440 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46441 a two vector permutation using 2 intra-lane interleave insns
46442 and cross-lane shuffle for 32-byte vectors. */
46444 static bool
46445 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46447 unsigned i, nelt;
46448 rtx (*gen) (rtx, rtx, rtx);
46450 if (d->one_operand_p)
46451 return false;
46452 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46454 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46456 else
46457 return false;
46459 nelt = d->nelt;
46460 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46461 return false;
46462 for (i = 0; i < nelt; i += 2)
46463 if (d->perm[i] != d->perm[0] + i / 2
46464 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46465 return false;
46467 if (d->testing_p)
46468 return true;
46470 switch (d->vmode)
46472 case V32QImode:
46473 if (d->perm[0])
46474 gen = gen_vec_interleave_highv32qi;
46475 else
46476 gen = gen_vec_interleave_lowv32qi;
46477 break;
46478 case V16HImode:
46479 if (d->perm[0])
46480 gen = gen_vec_interleave_highv16hi;
46481 else
46482 gen = gen_vec_interleave_lowv16hi;
46483 break;
46484 case V8SImode:
46485 if (d->perm[0])
46486 gen = gen_vec_interleave_highv8si;
46487 else
46488 gen = gen_vec_interleave_lowv8si;
46489 break;
46490 case V4DImode:
46491 if (d->perm[0])
46492 gen = gen_vec_interleave_highv4di;
46493 else
46494 gen = gen_vec_interleave_lowv4di;
46495 break;
46496 case V8SFmode:
46497 if (d->perm[0])
46498 gen = gen_vec_interleave_highv8sf;
46499 else
46500 gen = gen_vec_interleave_lowv8sf;
46501 break;
46502 case V4DFmode:
46503 if (d->perm[0])
46504 gen = gen_vec_interleave_highv4df;
46505 else
46506 gen = gen_vec_interleave_lowv4df;
46507 break;
46508 default:
46509 gcc_unreachable ();
46512 emit_insn (gen (d->target, d->op0, d->op1));
46513 return true;
46516 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46517 a single vector permutation using a single intra-lane vector
46518 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46519 the non-swapped and swapped vectors together. */
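/* Worked example (illustrative only): the one-operand V4DFmode permutation
   {1,2,3,0} becomes an in-lane swap {1,0,3,2} (dfirst), a lane swap of that
   result (dsecond), and a vblendpd with mask 0xa that takes elements 1 and
   3 from the lane-swapped copy.  */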
46521 static bool
46522 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46524 struct expand_vec_perm_d dfirst, dsecond;
46525 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46526 rtx_insn *seq;
46527 bool ok;
46528 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46530 if (!TARGET_AVX
46531 || TARGET_AVX2
46532 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46533 || !d->one_operand_p)
46534 return false;
46536 dfirst = *d;
46537 for (i = 0; i < nelt; i++)
46538 dfirst.perm[i] = 0xff;
46539 for (i = 0, msk = 0; i < nelt; i++)
46541 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46542 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46543 return false;
46544 dfirst.perm[j] = d->perm[i];
46545 if (j != i)
46546 msk |= (1 << i);
46548 for (i = 0; i < nelt; i++)
46549 if (dfirst.perm[i] == 0xff)
46550 dfirst.perm[i] = i;
46552 if (!d->testing_p)
46553 dfirst.target = gen_reg_rtx (dfirst.vmode);
46555 start_sequence ();
46556 ok = expand_vec_perm_1 (&dfirst);
46557 seq = get_insns ();
46558 end_sequence ();
46560 if (!ok)
46561 return false;
46563 if (d->testing_p)
46564 return true;
46566 emit_insn (seq);
46568 dsecond = *d;
46569 dsecond.op0 = dfirst.target;
46570 dsecond.op1 = dfirst.target;
46571 dsecond.one_operand_p = true;
46572 dsecond.target = gen_reg_rtx (dsecond.vmode);
46573 for (i = 0; i < nelt; i++)
46574 dsecond.perm[i] = i ^ nelt2;
46576 ok = expand_vec_perm_1 (&dsecond);
46577 gcc_assert (ok);
46579 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46580 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46581 return true;
46584 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46585 permutation using two vperm2f128, followed by a vshufpd insn blending
46586 the two vectors together. */
46588 static bool
46589 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46591 struct expand_vec_perm_d dfirst, dsecond, dthird;
46592 bool ok;
46594 if (!TARGET_AVX || (d->vmode != V4DFmode))
46595 return false;
46597 if (d->testing_p)
46598 return true;
46600 dfirst = *d;
46601 dsecond = *d;
46602 dthird = *d;
46604 dfirst.perm[0] = (d->perm[0] & ~1);
46605 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46606 dfirst.perm[2] = (d->perm[2] & ~1);
46607 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46608 dsecond.perm[0] = (d->perm[1] & ~1);
46609 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46610 dsecond.perm[2] = (d->perm[3] & ~1);
46611 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46612 dthird.perm[0] = (d->perm[0] % 2);
46613 dthird.perm[1] = (d->perm[1] % 2) + 4;
46614 dthird.perm[2] = (d->perm[2] % 2) + 2;
46615 dthird.perm[3] = (d->perm[3] % 2) + 6;
46617 dfirst.target = gen_reg_rtx (dfirst.vmode);
46618 dsecond.target = gen_reg_rtx (dsecond.vmode);
46619 dthird.op0 = dfirst.target;
46620 dthird.op1 = dsecond.target;
46621 dthird.one_operand_p = false;
46623 canonicalize_perm (&dfirst);
46624 canonicalize_perm (&dsecond);
46626 ok = expand_vec_perm_1 (&dfirst)
46627 && expand_vec_perm_1 (&dsecond)
46628 && expand_vec_perm_1 (&dthird);
46630 gcc_assert (ok);
46632 return true;
46635 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46636 permutation with two pshufb insns and an ior. We should have already
46637 failed all two instruction sequences. */
46639 static bool
46640 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46642 rtx rperm[2][16], vperm, l, h, op, m128;
46643 unsigned int i, nelt, eltsz;
46645 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46646 return false;
46647 gcc_assert (!d->one_operand_p);
46649 if (d->testing_p)
46650 return true;
46652 nelt = d->nelt;
46653 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46655 /* Generate two permutation masks. If the required element is within
46656 the given vector it is shuffled into the proper lane. If the required
46657 element is in the other vector, force a zero into the lane by setting
46658 bit 7 in the permutation mask. */
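  /* Illustrative example: an extract-even V8HImode permutation
     {0,2,4,6,8,10,12,14} yields the two byte masks
       op0: { 0,1, 4,5, 8,9, 12,13, -128 x 8 }
       op1: { -128 x 8, 0,1, 4,5, 8,9, 12,13 }
     so each pshufb places one operand's words in its half of the result
     and zeroes the rest; the ior below merges the two.  */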
46659 m128 = GEN_INT (-128);
46660 for (i = 0; i < nelt; ++i)
46662 unsigned j, e = d->perm[i];
46663 unsigned which = (e >= nelt);
46664 if (e >= nelt)
46665 e -= nelt;
46667 for (j = 0; j < eltsz; ++j)
46669 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46670 rperm[1-which][i*eltsz + j] = m128;
46674 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46675 vperm = force_reg (V16QImode, vperm);
46677 l = gen_reg_rtx (V16QImode);
46678 op = gen_lowpart (V16QImode, d->op0);
46679 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46681 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46682 vperm = force_reg (V16QImode, vperm);
46684 h = gen_reg_rtx (V16QImode);
46685 op = gen_lowpart (V16QImode, d->op1);
46686 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46688 op = d->target;
46689 if (d->vmode != V16QImode)
46690 op = gen_reg_rtx (V16QImode);
46691 emit_insn (gen_iorv16qi3 (op, l, h));
46692 if (op != d->target)
46693 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46695 return true;
46698 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46699 with two vpshufb insns, vpermq and vpor. We should have already failed
46700 all two or three instruction sequences. */
46702 static bool
46703 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46705 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46706 unsigned int i, nelt, eltsz;
46708 if (!TARGET_AVX2
46709 || !d->one_operand_p
46710 || (d->vmode != V32QImode && d->vmode != V16HImode))
46711 return false;
46713 if (d->testing_p)
46714 return true;
46716 nelt = d->nelt;
46717 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46719 /* Generate two permutation masks. If the required element is within
46720 the same lane, it is shuffled in. If the required element is from the
46721 other lane, force a zero by setting bit 7 in the permutation mask.
46722 The other mask has a non-negative element wherever an element is
46723 requested from the other lane; that element is also moved to the
46724 other lane, so that the result of vpshufb can have its two V2TImode
46725 halves swapped. */
46726 m128 = GEN_INT (-128);
46727 for (i = 0; i < nelt; ++i)
46729 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46730 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46732 for (j = 0; j < eltsz; ++j)
46734 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46735 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46739 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46740 vperm = force_reg (V32QImode, vperm);
46742 h = gen_reg_rtx (V32QImode);
46743 op = gen_lowpart (V32QImode, d->op0);
46744 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46746 /* Swap the 128-bit lanes of h into hp. */
46747 hp = gen_reg_rtx (V4DImode);
46748 op = gen_lowpart (V4DImode, h);
46749 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46750 const1_rtx));
46752 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46753 vperm = force_reg (V32QImode, vperm);
46755 l = gen_reg_rtx (V32QImode);
46756 op = gen_lowpart (V32QImode, d->op0);
46757 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46759 op = d->target;
46760 if (d->vmode != V32QImode)
46761 op = gen_reg_rtx (V32QImode);
46762 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46763 if (op != d->target)
46764 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46766 return true;
46769 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46770 and extract-odd permutations of two V32QImode or V16HImode operands
46771 with two vpshufb insns, vpor and vpermq. We should have already
46772 failed all two or three instruction sequences. */
46774 static bool
46775 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46777 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46778 unsigned int i, nelt, eltsz;
46780 if (!TARGET_AVX2
46781 || d->one_operand_p
46782 || (d->vmode != V32QImode && d->vmode != V16HImode))
46783 return false;
46785 for (i = 0; i < d->nelt; ++i)
46786 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46787 return false;
46789 if (d->testing_p)
46790 return true;
46792 nelt = d->nelt;
46793 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46795 /* Generate two permutation masks. In the first permutation mask
46796 the first quarter will contain indexes for the first half
46797 of the op0, the second quarter will contain bit 7 set, third quarter
46798 will contain indexes for the second half of the op0 and the
46799 last quarter bit 7 set. In the second permutation mask
46800 the first quarter will contain bit 7 set, the second quarter
46801 indexes for the first half of the op1, the third quarter bit 7 set
46802 and last quarter indexes for the second half of the op1.
46803 I.e. the first mask e.g. for V32QImode extract even will be:
46804 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46805 (all values masked with 0xf except for -128) and second mask
46806 for extract even will be
46807 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46808 m128 = GEN_INT (-128);
46809 for (i = 0; i < nelt; ++i)
46811 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46812 unsigned which = d->perm[i] >= nelt;
46813 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46815 for (j = 0; j < eltsz; ++j)
46817 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46818 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46822 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46823 vperm = force_reg (V32QImode, vperm);
46825 l = gen_reg_rtx (V32QImode);
46826 op = gen_lowpart (V32QImode, d->op0);
46827 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46829 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46830 vperm = force_reg (V32QImode, vperm);
46832 h = gen_reg_rtx (V32QImode);
46833 op = gen_lowpart (V32QImode, d->op1);
46834 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46836 ior = gen_reg_rtx (V32QImode);
46837 emit_insn (gen_iorv32qi3 (ior, l, h));
46839 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46840 op = gen_reg_rtx (V4DImode);
46841 ior = gen_lowpart (V4DImode, ior);
46842 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46843 const1_rtx, GEN_INT (3)));
46844 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46846 return true;
46849 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46850 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46851 with two "and" and "pack" or two "shift" and "pack" insns. We should
46852 have already failed all two instruction sequences. */
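/* For example, extract-even of two V16QImode operands masks each operand
   with 0x00ff per 16-bit word (the "and" step) and then packuswb joins the
   two results, while extract-odd shifts each word right by 8 (the "shift"
   step) before the pack.  */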
46854 static bool
46855 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46857 rtx op, dop0, dop1, t, rperm[16];
46858 unsigned i, odd, c, s, nelt = d->nelt;
46859 bool end_perm = false;
46860 machine_mode half_mode;
46861 rtx (*gen_and) (rtx, rtx, rtx);
46862 rtx (*gen_pack) (rtx, rtx, rtx);
46863 rtx (*gen_shift) (rtx, rtx, rtx);
46865 if (d->one_operand_p)
46866 return false;
46868 switch (d->vmode)
46870 case V8HImode:
46871 /* Required for "pack". */
46872 if (!TARGET_SSE4_1)
46873 return false;
46874 c = 0xffff;
46875 s = 16;
46876 half_mode = V4SImode;
46877 gen_and = gen_andv4si3;
46878 gen_pack = gen_sse4_1_packusdw;
46879 gen_shift = gen_lshrv4si3;
46880 break;
46881 case V16QImode:
46882 /* No check as all instructions are SSE2. */
46883 c = 0xff;
46884 s = 8;
46885 half_mode = V8HImode;
46886 gen_and = gen_andv8hi3;
46887 gen_pack = gen_sse2_packuswb;
46888 gen_shift = gen_lshrv8hi3;
46889 break;
46890 case V16HImode:
46891 if (!TARGET_AVX2)
46892 return false;
46893 c = 0xffff;
46894 s = 16;
46895 half_mode = V8SImode;
46896 gen_and = gen_andv8si3;
46897 gen_pack = gen_avx2_packusdw;
46898 gen_shift = gen_lshrv8si3;
46899 end_perm = true;
46900 break;
46901 case V32QImode:
46902 if (!TARGET_AVX2)
46903 return false;
46904 c = 0xff;
46905 s = 8;
46906 half_mode = V16HImode;
46907 gen_and = gen_andv16hi3;
46908 gen_pack = gen_avx2_packuswb;
46909 gen_shift = gen_lshrv16hi3;
46910 end_perm = true;
46911 break;
46912 default:
46913 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46914 general shuffles. */
46915 return false;
46918 /* Check that permutation is even or odd. */
46919 odd = d->perm[0];
46920 if (odd > 1)
46921 return false;
46923 for (i = 1; i < nelt; ++i)
46924 if (d->perm[i] != 2 * i + odd)
46925 return false;
46927 if (d->testing_p)
46928 return true;
46930 dop0 = gen_reg_rtx (half_mode);
46931 dop1 = gen_reg_rtx (half_mode);
46932 if (odd == 0)
46934 for (i = 0; i < nelt / 2; i++)
46935 rperm[i] = GEN_INT (c);
46936 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
46937 t = force_reg (half_mode, t);
46938 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
46939 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
46941 else
46943 emit_insn (gen_shift (dop0,
46944 gen_lowpart (half_mode, d->op0),
46945 GEN_INT (s)));
46946 emit_insn (gen_shift (dop1,
46947 gen_lowpart (half_mode, d->op1),
46948 GEN_INT (s)));
46950 /* In the AVX2 256-bit case we need to permute the pack result. */
46951 if (TARGET_AVX2 && end_perm)
46953 op = gen_reg_rtx (d->vmode);
46954 t = gen_reg_rtx (V4DImode);
46955 emit_insn (gen_pack (op, dop0, dop1));
46956 emit_insn (gen_avx2_permv4di_1 (t,
46957 gen_lowpart (V4DImode, op),
46958 const0_rtx,
46959 const2_rtx,
46960 const1_rtx,
46961 GEN_INT (3)));
46962 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
46964 else
46965 emit_insn (gen_pack (d->target, dop0, dop1));
46967 return true;
46970 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46971 and extract-odd permutations of two V64QI operands
46972 with two "shifts", two "truncs" and one "concat" insn for "odd"
46973 and two "truncs" and one "concat" insn for "even".
46974 We should have already failed all two instruction sequences. */
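/* For instance, extract-even of two V64QImode operands truncates each
   operand, viewed as V32HImode, down to V32QImode (vpmovwb) and
   concatenates the two results; extract-odd first shifts each word right
   by 8 so that the truncation picks up the odd bytes instead.  */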
46976 static bool
46977 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
46979 rtx t1, t2, t3, t4;
46980 unsigned i, odd, nelt = d->nelt;
46982 if (!TARGET_AVX512BW
46983 || d->one_operand_p
46984 || d->vmode != V64QImode)
46985 return false;
46987 /* Check that permutation is even or odd. */
46988 odd = d->perm[0];
46989 if (odd > 1)
46990 return false;
46992 for (i = 1; i < nelt; ++i)
46993 if (d->perm[i] != 2 * i + odd)
46994 return false;
46996 if (d->testing_p)
46997 return true;
47000 if (odd)
47002 t1 = gen_reg_rtx (V32HImode);
47003 t2 = gen_reg_rtx (V32HImode);
47004 emit_insn (gen_lshrv32hi3 (t1,
47005 gen_lowpart (V32HImode, d->op0),
47006 GEN_INT (8)));
47007 emit_insn (gen_lshrv32hi3 (t2,
47008 gen_lowpart (V32HImode, d->op1),
47009 GEN_INT (8)));
47011 else
47013 t1 = gen_lowpart (V32HImode, d->op0);
47014 t2 = gen_lowpart (V32HImode, d->op1);
47017 t3 = gen_reg_rtx (V32QImode);
47018 t4 = gen_reg_rtx (V32QImode);
47019 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47020 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47021 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47023 return true;
47026 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47027 and extract-odd permutations. */
47029 static bool
47030 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47032 rtx t1, t2, t3, t4, t5;
47034 switch (d->vmode)
47036 case V4DFmode:
47037 if (d->testing_p)
47038 break;
47039 t1 = gen_reg_rtx (V4DFmode);
47040 t2 = gen_reg_rtx (V4DFmode);
47042 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47043 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47044 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47046 /* Now an unpck[lh]pd will produce the result required. */
47047 if (odd)
47048 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47049 else
47050 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47051 emit_insn (t3);
47052 break;
47054 case V8SFmode:
47056 int mask = odd ? 0xdd : 0x88;
47058 if (d->testing_p)
47059 break;
47060 t1 = gen_reg_rtx (V8SFmode);
47061 t2 = gen_reg_rtx (V8SFmode);
47062 t3 = gen_reg_rtx (V8SFmode);
47064 /* Shuffle within the 128-bit lanes to produce:
47065 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47066 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47067 GEN_INT (mask)));
47069 /* Shuffle the lanes around to produce:
47070 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47071 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47072 GEN_INT (0x3)));
47074 /* Shuffle within the 128-bit lanes to produce:
47075 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47076 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47078 /* Shuffle within the 128-bit lanes to produce:
47079 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47080 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47082 /* Shuffle the lanes around to produce:
47083 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47084 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47085 GEN_INT (0x20)));
47087 break;
47089 case V2DFmode:
47090 case V4SFmode:
47091 case V2DImode:
47092 case V4SImode:
47093 /* These are always directly implementable by expand_vec_perm_1. */
47094 gcc_unreachable ();
47096 case V8HImode:
47097 if (TARGET_SSE4_1)
47098 return expand_vec_perm_even_odd_pack (d);
47099 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47100 return expand_vec_perm_pshufb2 (d);
47101 else
47103 if (d->testing_p)
47104 break;
47105 /* We need 2*log2(N)-1 operations to achieve odd/even
47106 with interleave. */
47107 t1 = gen_reg_rtx (V8HImode);
47108 t2 = gen_reg_rtx (V8HImode);
47109 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47110 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47111 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47112 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47113 if (odd)
47114 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47115 else
47116 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47117 emit_insn (t3);
47119 break;
47121 case V16QImode:
47122 return expand_vec_perm_even_odd_pack (d);
47124 case V16HImode:
47125 case V32QImode:
47126 return expand_vec_perm_even_odd_pack (d);
47128 case V64QImode:
47129 return expand_vec_perm_even_odd_trunc (d);
47131 case V4DImode:
47132 if (!TARGET_AVX2)
47134 struct expand_vec_perm_d d_copy = *d;
47135 d_copy.vmode = V4DFmode;
47136 if (d->testing_p)
47137 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47138 else
47139 d_copy.target = gen_reg_rtx (V4DFmode);
47140 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47141 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47142 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47144 if (!d->testing_p)
47145 emit_move_insn (d->target,
47146 gen_lowpart (V4DImode, d_copy.target));
47147 return true;
47149 return false;
47152 if (d->testing_p)
47153 break;
47155 t1 = gen_reg_rtx (V4DImode);
47156 t2 = gen_reg_rtx (V4DImode);
47158 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47159 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47160 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47162 /* Now an vpunpck[lh]qdq will produce the result required. */
47163 if (odd)
47164 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47165 else
47166 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47167 emit_insn (t3);
47168 break;
47170 case V8SImode:
47171 if (!TARGET_AVX2)
47173 struct expand_vec_perm_d d_copy = *d;
47174 d_copy.vmode = V8SFmode;
47175 if (d->testing_p)
47176 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47177 else
47178 d_copy.target = gen_reg_rtx (V8SFmode);
47179 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47180 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47181 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47183 if (!d->testing_p)
47184 emit_move_insn (d->target,
47185 gen_lowpart (V8SImode, d_copy.target));
47186 return true;
47188 return false;
47191 if (d->testing_p)
47192 break;
47194 t1 = gen_reg_rtx (V8SImode);
47195 t2 = gen_reg_rtx (V8SImode);
47196 t3 = gen_reg_rtx (V4DImode);
47197 t4 = gen_reg_rtx (V4DImode);
47198 t5 = gen_reg_rtx (V4DImode);
47200 /* Shuffle the lanes around into
47201 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47202 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47203 gen_lowpart (V4DImode, d->op1),
47204 GEN_INT (0x20)));
47205 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47206 gen_lowpart (V4DImode, d->op1),
47207 GEN_INT (0x31)));
47209 /* Swap the 2nd and 3rd position in each lane into
47210 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47211 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47212 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47213 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47214 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47216 /* Now an vpunpck[lh]qdq will produce
47217 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47218 if (odd)
47219 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47220 gen_lowpart (V4DImode, t2));
47221 else
47222 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47223 gen_lowpart (V4DImode, t2));
47224 emit_insn (t3);
47225 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47226 break;
47228 default:
47229 gcc_unreachable ();
47232 return true;
47235 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47236 extract-even and extract-odd permutations. */
47238 static bool
47239 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47241 unsigned i, odd, nelt = d->nelt;
47243 odd = d->perm[0];
47244 if (odd != 0 && odd != 1)
47245 return false;
47247 for (i = 1; i < nelt; ++i)
47248 if (d->perm[i] != 2 * i + odd)
47249 return false;
47251 return expand_vec_perm_even_odd_1 (d, odd);
47254 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47255 permutations. We assume that expand_vec_perm_1 has already failed. */
47257 static bool
47258 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47260 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47261 machine_mode vmode = d->vmode;
47262 unsigned char perm2[4];
47263 rtx op0 = d->op0, dest;
47264 bool ok;
47266 switch (vmode)
47268 case V4DFmode:
47269 case V8SFmode:
47270 /* These are special-cased in sse.md so that we can optionally
47271 use the vbroadcast instruction. They expand to two insns
47272 if the input happens to be in a register. */
47273 gcc_unreachable ();
47275 case V2DFmode:
47276 case V2DImode:
47277 case V4SFmode:
47278 case V4SImode:
47279 /* These are always implementable using standard shuffle patterns. */
47280 gcc_unreachable ();
47282 case V8HImode:
47283 case V16QImode:
47284 /* These can be implemented via interleave. We save one insn by
47285 stopping once we have promoted to V4SImode and then use pshufd. */
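      /* Worked example (illustrative only): broadcasting byte 5 of a
	 V16QImode vector first interleaves low (punpcklbw) so the byte is
	 duplicated into word 5, then interleaves high (punpckhwd) so that
	 pair lands in dword 1, and finally a pshufd with {1,1,1,1}
	 replicates that dword across the vector.  */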
47286 if (d->testing_p)
47287 return true;
47290 rtx dest;
47291 rtx (*gen) (rtx, rtx, rtx)
47292 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47293 : gen_vec_interleave_lowv8hi;
47295 if (elt >= nelt2)
47297 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47298 : gen_vec_interleave_highv8hi;
47299 elt -= nelt2;
47301 nelt2 /= 2;
47303 dest = gen_reg_rtx (vmode);
47304 emit_insn (gen (dest, op0, op0));
47305 vmode = get_mode_wider_vector (vmode);
47306 op0 = gen_lowpart (vmode, dest);
47308 while (vmode != V4SImode);
47310 memset (perm2, elt, 4);
47311 dest = gen_reg_rtx (V4SImode);
47312 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47313 gcc_assert (ok);
47314 if (!d->testing_p)
47315 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47316 return true;
47318 case V64QImode:
47319 case V32QImode:
47320 case V16HImode:
47321 case V8SImode:
47322 case V4DImode:
47323 /* For AVX2 broadcasts of the first element vpbroadcast* or
47324 vpermq should be used by expand_vec_perm_1. */
47325 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47326 return false;
47328 default:
47329 gcc_unreachable ();
47333 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47334 broadcast permutations. */
47336 static bool
47337 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47339 unsigned i, elt, nelt = d->nelt;
47341 if (!d->one_operand_p)
47342 return false;
47344 elt = d->perm[0];
47345 for (i = 1; i < nelt; ++i)
47346 if (d->perm[i] != elt)
47347 return false;
47349 return expand_vec_perm_broadcast_1 (d);
47352 /* Implement arbitrary permutations of two V64QImode operands
47353 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
47354 static bool
47355 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
47357 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47358 return false;
47360 if (d->testing_p)
47361 return true;
47363 struct expand_vec_perm_d ds[2];
47364 rtx rperm[128], vperm, target0, target1;
47365 unsigned int i, nelt;
47366 machine_mode vmode;
47368 nelt = d->nelt;
47369 vmode = V64QImode;
47371 for (i = 0; i < 2; i++)
47373 ds[i] = *d;
47374 ds[i].vmode = V32HImode;
47375 ds[i].nelt = 32;
47376 ds[i].target = gen_reg_rtx (V32HImode);
47377 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47378 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47381 /* Prepare permutations such that the first one takes care of
47382 putting the even bytes into the right positions or one position
47383 higher (ds[0]) and the second one takes care of
47384 putting the odd bytes into the right positions or one position
47385 lower (ds[1]). */
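  /* Illustrative example: to place source byte 37 at destination byte 10,
     ds[0].perm[5] selects word 18 (the word holding bytes 36/37) in the
     V32HImode permutation, and rperm[10] = (10 & 14) + (37 & 1) = 11 then
     makes vpshufb pick the high byte of that word; the odd destination
     bytes of this mask are -1, so they are zeroed and supplied by the
     second vpshufb before the final vpor.  */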
47387 for (i = 0; i < nelt; i++)
47389 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47390 if (i & 1)
47392 rperm[i] = constm1_rtx;
47393 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47395 else
47397 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47398 rperm[i + 64] = constm1_rtx;
47402 bool ok = expand_vec_perm_1 (&ds[0]);
47403 gcc_assert (ok);
47404 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47406 ok = expand_vec_perm_1 (&ds[1]);
47407 gcc_assert (ok);
47408 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47410 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47411 vperm = force_reg (vmode, vperm);
47412 target0 = gen_reg_rtx (V64QImode);
47413 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47415 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47416 vperm = force_reg (vmode, vperm);
47417 target1 = gen_reg_rtx (V64QImode);
47418 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47420 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47421 return true;
47424 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
47425 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47426 all the shorter instruction sequences. */
47428 static bool
47429 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47431 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47432 unsigned int i, nelt, eltsz;
47433 bool used[4];
47435 if (!TARGET_AVX2
47436 || d->one_operand_p
47437 || (d->vmode != V32QImode && d->vmode != V16HImode))
47438 return false;
47440 if (d->testing_p)
47441 return true;
47443 nelt = d->nelt;
47444 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47446 /* Generate 4 permutation masks. If the required element is within
47447 the same lane, it is shuffled in. If the required element is from the
47448 other lane, force a zero by setting bit 7 in the permutation mask.
47449 The other mask has a non-negative element wherever an element is
47450 requested from the other lane; that element is also moved to the
47451 other lane, so that the result of vpshufb can have its two V2TImode
47452 halves swapped. */
47453 m128 = GEN_INT (-128);
47454 for (i = 0; i < 32; ++i)
47456 rperm[0][i] = m128;
47457 rperm[1][i] = m128;
47458 rperm[2][i] = m128;
47459 rperm[3][i] = m128;
47461 used[0] = false;
47462 used[1] = false;
47463 used[2] = false;
47464 used[3] = false;
47465 for (i = 0; i < nelt; ++i)
47467 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47468 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47469 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47471 for (j = 0; j < eltsz; ++j)
47472 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47473 used[which] = true;
47476 for (i = 0; i < 2; ++i)
47478 if (!used[2 * i + 1])
47480 h[i] = NULL_RTX;
47481 continue;
47483 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47484 gen_rtvec_v (32, rperm[2 * i + 1]));
47485 vperm = force_reg (V32QImode, vperm);
47486 h[i] = gen_reg_rtx (V32QImode);
47487 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47488 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47491 /* Swap the 128-bit lanes of h[X]. */
47492 for (i = 0; i < 2; ++i)
47494 if (h[i] == NULL_RTX)
47495 continue;
47496 op = gen_reg_rtx (V4DImode);
47497 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47498 const2_rtx, GEN_INT (3), const0_rtx,
47499 const1_rtx));
47500 h[i] = gen_lowpart (V32QImode, op);
47503 for (i = 0; i < 2; ++i)
47505 if (!used[2 * i])
47507 l[i] = NULL_RTX;
47508 continue;
47510 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47511 vperm = force_reg (V32QImode, vperm);
47512 l[i] = gen_reg_rtx (V32QImode);
47513 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47514 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47517 for (i = 0; i < 2; ++i)
47519 if (h[i] && l[i])
47521 op = gen_reg_rtx (V32QImode);
47522 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47523 l[i] = op;
47525 else if (h[i])
47526 l[i] = h[i];
47529 gcc_assert (l[0] && l[1]);
47530 op = d->target;
47531 if (d->vmode != V32QImode)
47532 op = gen_reg_rtx (V32QImode);
47533 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47534 if (op != d->target)
47535 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47536 return true;
47539 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47540 With all of the interface bits taken care of, perform the expansion
47541 in D and return true on success. */
47543 static bool
47544 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47546 /* Try a single instruction expansion. */
47547 if (expand_vec_perm_1 (d))
47548 return true;
47550 /* Try sequences of two instructions. */
47552 if (expand_vec_perm_pshuflw_pshufhw (d))
47553 return true;
47555 if (expand_vec_perm_palignr (d, false))
47556 return true;
47558 if (expand_vec_perm_interleave2 (d))
47559 return true;
47561 if (expand_vec_perm_broadcast (d))
47562 return true;
47564 if (expand_vec_perm_vpermq_perm_1 (d))
47565 return true;
47567 if (expand_vec_perm_vperm2f128 (d))
47568 return true;
47570 if (expand_vec_perm_pblendv (d))
47571 return true;
47573 /* Try sequences of three instructions. */
47575 if (expand_vec_perm_even_odd_pack (d))
47576 return true;
47578 if (expand_vec_perm_2vperm2f128_vshuf (d))
47579 return true;
47581 if (expand_vec_perm_pshufb2 (d))
47582 return true;
47584 if (expand_vec_perm_interleave3 (d))
47585 return true;
47587 if (expand_vec_perm_vperm2f128_vblend (d))
47588 return true;
47590 /* Try sequences of four instructions. */
47592 if (expand_vec_perm_even_odd_trunc (d))
47593 return true;
47594 if (expand_vec_perm_vpshufb2_vpermq (d))
47595 return true;
47597 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47598 return true;
47600 if (expand_vec_perm_vpermi2_vpshub2 (d))
47601 return true;
47603 /* ??? Look for narrow permutations whose element orderings would
47604 allow the promotion to a wider mode. */
47606 /* ??? Look for sequences of interleave or a wider permute that place
47607 the data into the correct lanes for a half-vector shuffle like
47608 pshuf[lh]w or vpermilps. */
47610 /* ??? Look for sequences of interleave that produce the desired results.
47611 The combinatorics of punpck[lh] get pretty ugly... */
47613 if (expand_vec_perm_even_odd (d))
47614 return true;
47616 /* Even longer sequences. */
47617 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47618 return true;
47620 /* See if we can get the same permutation in a different vector integer
47621 mode. */
47622 struct expand_vec_perm_d nd;
47623 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47625 if (!d->testing_p)
47626 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47627 return true;
47630 return false;
47633 /* If a permutation only uses one operand, make it clear. Returns true
47634 if the permutation references both operands. */
47636 static bool
47637 canonicalize_perm (struct expand_vec_perm_d *d)
47639 int i, which, nelt = d->nelt;
47641 for (i = which = 0; i < nelt; ++i)
47642 which |= (d->perm[i] < nelt ? 1 : 2);
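/* WHICH is now a bitmask: bit 0 is set if some element selects from the
   first operand (index < nelt), bit 1 if some element selects from the
   second (index >= nelt).  */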
47644 d->one_operand_p = true;
47645 switch (which)
47647 default:
47648 gcc_unreachable();
47650 case 3:
47651 if (!rtx_equal_p (d->op0, d->op1))
47653 d->one_operand_p = false;
47654 break;
47656 /* The elements of PERM do not suggest that only the first operand
47657 is used, but both operands are identical. Allow easier matching
47658 of the permutation by folding the permutation into the single
47659 input vector. */
47660 /* FALLTHRU */
47662 case 2:
47663 for (i = 0; i < nelt; ++i)
47664 d->perm[i] &= nelt - 1;
47665 d->op0 = d->op1;
47666 break;
47668 case 1:
47669 d->op1 = d->op0;
47670 break;
47673 return (which == 3);
47676 bool
47677 ix86_expand_vec_perm_const (rtx operands[4])
47679 struct expand_vec_perm_d d;
47680 unsigned char perm[MAX_VECT_LEN];
47681 int i, nelt;
47682 bool two_args;
47683 rtx sel;
47685 d.target = operands[0];
47686 d.op0 = operands[1];
47687 d.op1 = operands[2];
47688 sel = operands[3];
47690 d.vmode = GET_MODE (d.target);
47691 gcc_assert (VECTOR_MODE_P (d.vmode));
47692 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47693 d.testing_p = false;
47695 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47696 gcc_assert (XVECLEN (sel, 0) == nelt);
47697 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47699 for (i = 0; i < nelt; ++i)
47701 rtx e = XVECEXP (sel, 0, i);
47702 int ei = INTVAL (e) & (2 * nelt - 1);
47703 d.perm[i] = ei;
47704 perm[i] = ei;
47707 two_args = canonicalize_perm (&d);
47709 if (ix86_expand_vec_perm_const_1 (&d))
47710 return true;
47712 /* If the selector says both arguments are needed, but the operands are the
47713 same, the above tried to expand with one_operand_p and flattened selector.
47714 If that didn't work, retry without one_operand_p; we succeeded with that
47715 during testing. */
47716 if (two_args && d.one_operand_p)
47718 d.one_operand_p = false;
47719 memcpy (d.perm, perm, sizeof (perm));
47720 return ix86_expand_vec_perm_const_1 (&d);
47723 return false;
47726 /* Implement targetm.vectorize.vec_perm_const_ok. */
47728 static bool
47729 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
47730 const unsigned char *sel)
47732 struct expand_vec_perm_d d;
47733 unsigned int i, nelt, which;
47734 bool ret;
47736 d.vmode = vmode;
47737 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47738 d.testing_p = true;
47740 /* Given sufficient ISA support we can just return true here
47741 for selected vector modes. */
47742 switch (d.vmode)
47744 case V16SFmode:
47745 case V16SImode:
47746 case V8DImode:
47747 case V8DFmode:
47748 if (TARGET_AVX512F)
47749 /* All implementable with a single vpermi2 insn. */
47750 return true;
47751 break;
47752 case V32HImode:
47753 if (TARGET_AVX512BW)
47754 /* All implementable with a single vpermi2 insn. */
47755 return true;
47756 break;
47757 case V64QImode:
47758 if (TARGET_AVX512BW)
47759 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
47760 return true;
47761 break;
47762 case V8SImode:
47763 case V8SFmode:
47764 case V4DFmode:
47765 case V4DImode:
47766 if (TARGET_AVX512VL)
47767 /* All implementable with a single vpermi2 insn. */
47768 return true;
47769 break;
47770 case V16HImode:
47771 if (TARGET_AVX2)
47772 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47773 return true;
47774 break;
47775 case V32QImode:
47776 if (TARGET_AVX2)
47777 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47778 return true;
47779 break;
47780 case V4SImode:
47781 case V4SFmode:
47782 case V8HImode:
47783 case V16QImode:
47784 /* All implementable with a single vpperm insn. */
47785 if (TARGET_XOP)
47786 return true;
47787 /* All implementable with 2 pshufb + 1 ior. */
47788 if (TARGET_SSSE3)
47789 return true;
47790 break;
47791 case V2DImode:
47792 case V2DFmode:
47793 /* All implementable with shufpd or unpck[lh]pd. */
47794 return true;
47795 default:
47796 return false;
47799 /* Extract the values from the vector CST into the permutation
47800 array in D. */
47801 memcpy (d.perm, sel, nelt);
47802 for (i = which = 0; i < nelt; ++i)
47804 unsigned char e = d.perm[i];
47805 gcc_assert (e < 2 * nelt);
47806 which |= (e < nelt ? 1 : 2);
47809 /* For all elements from the second vector, fold the elements to the first. */
47810 if (which == 2)
47811 for (i = 0; i < nelt; ++i)
47812 d.perm[i] -= nelt;
47814 /* Check whether the mask can be applied to the vector type. */
47815 d.one_operand_p = (which != 3);
47817 /* Implementable with shufps or pshufd. */
47818 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47819 return true;
47821 /* Otherwise we have to go through the motions and see if we can
47822 figure out how to generate the requested permutation. */
47823 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47824 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47825 if (!d.one_operand_p)
47826 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47828 start_sequence ();
47829 ret = ix86_expand_vec_perm_const_1 (&d);
47830 end_sequence ();
47832 return ret;
47835 void
47836 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47838 struct expand_vec_perm_d d;
47839 unsigned i, nelt;
47841 d.target = targ;
47842 d.op0 = op0;
47843 d.op1 = op1;
47844 d.vmode = GET_MODE (targ);
47845 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47846 d.one_operand_p = false;
47847 d.testing_p = false;
47849 for (i = 0; i < nelt; ++i)
47850 d.perm[i] = i * 2 + odd;
47852 /* We'll either be able to implement the permutation directly... */
47853 if (expand_vec_perm_1 (&d))
47854 return;
47856 /* ... or we use the special-case patterns. */
47857 expand_vec_perm_even_odd_1 (&d, odd);
47860 static void
47861 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47863 struct expand_vec_perm_d d;
47864 unsigned i, nelt, base;
47865 bool ok;
47867 d.target = targ;
47868 d.op0 = op0;
47869 d.op1 = op1;
47870 d.vmode = GET_MODE (targ);
47871 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47872 d.one_operand_p = false;
47873 d.testing_p = false;
47875 base = high_p ? nelt / 2 : 0;
47876 for (i = 0; i < nelt / 2; ++i)
47878 d.perm[i * 2] = i + base;
47879 d.perm[i * 2 + 1] = i + base + nelt;
47882 /* Note that for AVX this isn't one instruction. */
47883 ok = ix86_expand_vec_perm_const_1 (&d);
47884 gcc_assert (ok);
47888 /* Expand a vector operation CODE for a V*QImode in terms of the
47889 same operation on V*HImode. */
47891 void
47892 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47894 machine_mode qimode = GET_MODE (dest);
47895 machine_mode himode;
47896 rtx (*gen_il) (rtx, rtx, rtx);
47897 rtx (*gen_ih) (rtx, rtx, rtx);
47898 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47899 struct expand_vec_perm_d d;
47900 bool ok, full_interleave;
47901 bool uns_p = false;
47902 int i;
47904 switch (qimode)
47906 case V16QImode:
47907 himode = V8HImode;
47908 gen_il = gen_vec_interleave_lowv16qi;
47909 gen_ih = gen_vec_interleave_highv16qi;
47910 break;
47911 case V32QImode:
47912 himode = V16HImode;
47913 gen_il = gen_avx2_interleave_lowv32qi;
47914 gen_ih = gen_avx2_interleave_highv32qi;
47915 break;
47916 case V64QImode:
47917 himode = V32HImode;
47918 gen_il = gen_avx512bw_interleave_lowv64qi;
47919 gen_ih = gen_avx512bw_interleave_highv64qi;
47920 break;
47921 default:
47922 gcc_unreachable ();
47925 op2_l = op2_h = op2;
47926 switch (code)
47928 case MULT:
47929 /* Unpack data such that we've got a source byte in each low byte of
47930 each word. We don't care what goes into the high byte of each word.
47931 Rather than trying to get zero in there, most convenient is to let
47932 it be a copy of the low byte. */
47933 op2_l = gen_reg_rtx (qimode);
47934 op2_h = gen_reg_rtx (qimode);
47935 emit_insn (gen_il (op2_l, op2, op2));
47936 emit_insn (gen_ih (op2_h, op2, op2));
47937 /* FALLTHRU */
47939 op1_l = gen_reg_rtx (qimode);
47940 op1_h = gen_reg_rtx (qimode);
47941 emit_insn (gen_il (op1_l, op1, op1));
47942 emit_insn (gen_ih (op1_h, op1, op1));
47943 full_interleave = qimode == V16QImode;
47944 break;
47946 case ASHIFT:
47947 case LSHIFTRT:
47948 uns_p = true;
47949 /* FALLTHRU */
47950 case ASHIFTRT:
47951 op1_l = gen_reg_rtx (himode);
47952 op1_h = gen_reg_rtx (himode);
47953 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
47954 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
47955 full_interleave = true;
47956 break;
47957 default:
47958 gcc_unreachable ();
47961 /* Perform the operation. */
47962 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
47963 1, OPTAB_DIRECT);
47964 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
47965 1, OPTAB_DIRECT);
47966 gcc_assert (res_l && res_h);
47968 /* Merge the data back into the right place. */
47969 d.target = dest;
47970 d.op0 = gen_lowpart (qimode, res_l);
47971 d.op1 = gen_lowpart (qimode, res_h);
47972 d.vmode = qimode;
47973 d.nelt = GET_MODE_NUNITS (qimode);
47974 d.one_operand_p = false;
47975 d.testing_p = false;
47977 if (full_interleave)
47979 /* For SSE2, we used a full interleave, so the desired
47980 results are in the even elements. */
47981 for (i = 0; i < d.nelt; ++i)
47982 d.perm[i] = i * 2;
47984 else
47986 /* For AVX, the interleave used above was not cross-lane. So the
47987 extraction is evens but with the second and third quarters swapped.
47988 Happily, that is even one insn shorter than plain even extraction.
47989 For AVX512BW we have 4 lanes. We extract evens from within a lane,
47990 always first from the first and then from the second source operand;
47991 the index bits above the low 4 bits remain the same.
47992 Thus, for d.nelt == 32 we want permutation
47993 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
47994 and for d.nelt == 64 we want permutation
47995 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
47996 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
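/* As a quick check of the formula below: for d.nelt == 32, i == 9 gives
   ((18) & 14) + 32 + 0 == 34 and i == 17 gives ((34) & 14) + 0 + 16 == 18,
   matching the permutation listed above.  */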
47997 for (i = 0; i < d.nelt; ++i)
47998 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48001 ok = ix86_expand_vec_perm_const_1 (&d);
48002 gcc_assert (ok);
48004 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48005 gen_rtx_fmt_ee (code, qimode, op1, op2));
48008 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48009 if op is CONST_VECTOR with all odd elements equal to their
48010 preceding element. */
48012 static bool
48013 const_vector_equal_evenodd_p (rtx op)
48015 machine_mode mode = GET_MODE (op);
48016 int i, nunits = GET_MODE_NUNITS (mode);
48017 if (GET_CODE (op) != CONST_VECTOR
48018 || nunits != CONST_VECTOR_NUNITS (op))
48019 return false;
48020 for (i = 0; i < nunits; i += 2)
48021 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48022 return false;
48023 return true;
48026 void
48027 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48028 bool uns_p, bool odd_p)
48030 machine_mode mode = GET_MODE (op1);
48031 machine_mode wmode = GET_MODE (dest);
48032 rtx x;
48033 rtx orig_op1 = op1, orig_op2 = op2;
48035 if (!nonimmediate_operand (op1, mode))
48036 op1 = force_reg (mode, op1);
48037 if (!nonimmediate_operand (op2, mode))
48038 op2 = force_reg (mode, op2);
48040 /* We only play even/odd games with vectors of SImode. */
48041 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48043 /* If we're looking for the odd results, shift those members down to
48044 the even slots. For some CPUs this is faster than a PSHUFD. */
48045 if (odd_p)
48047 /* For XOP use vpmacsdqh, but only for smult, as it is only
48048 signed. */
48049 if (TARGET_XOP && mode == V4SImode && !uns_p)
48051 x = force_reg (wmode, CONST0_RTX (wmode));
48052 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48053 return;
48056 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48057 if (!const_vector_equal_evenodd_p (orig_op1))
48058 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48059 x, NULL, 1, OPTAB_DIRECT);
48060 if (!const_vector_equal_evenodd_p (orig_op2))
48061 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48062 x, NULL, 1, OPTAB_DIRECT);
48063 op1 = gen_lowpart (mode, op1);
48064 op2 = gen_lowpart (mode, op2);
48067 if (mode == V16SImode)
48069 if (uns_p)
48070 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48071 else
48072 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48074 else if (mode == V8SImode)
48076 if (uns_p)
48077 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48078 else
48079 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48081 else if (uns_p)
48082 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48083 else if (TARGET_SSE4_1)
48084 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48085 else
48087 rtx s1, s2, t0, t1, t2;
48089 /* The easiest way to implement this without PMULDQ is to go through
48090 the motions as if we were performing a full 64-bit multiply, except
48091 that we need to do less shuffling of the elements. */
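/* Writing each sign-extended operand as lo + (hi << 32), where hi is the
   all-ones or all-zeros sign mask computed below, the low 64 bits of the
   product are lo(A)*lo(B) + ((hi(A)*lo(B) + hi(B)*lo(A)) << 32); the
   hi(A)*hi(B) term only affects bits 64 and above and is dropped.  */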
48093 /* Compute the sign-extension, aka highparts, of the two operands. */
48094 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48095 op1, pc_rtx, pc_rtx);
48096 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48097 op2, pc_rtx, pc_rtx);
48099 /* Multiply LO(A) * HI(B), and vice-versa. */
48100 t1 = gen_reg_rtx (wmode);
48101 t2 = gen_reg_rtx (wmode);
48102 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48103 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48105 /* Multiply LO(A) * LO(B). */
48106 t0 = gen_reg_rtx (wmode);
48107 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48109 /* Combine and shift the highparts into place. */
48110 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48111 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48112 1, OPTAB_DIRECT);
48114 /* Combine high and low parts. */
48115 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48116 return;
48118 emit_insn (x);
48121 void
48122 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48123 bool uns_p, bool high_p)
48125 machine_mode wmode = GET_MODE (dest);
48126 machine_mode mode = GET_MODE (op1);
48127 rtx t1, t2, t3, t4, mask;
48129 switch (mode)
48131 case V4SImode:
48132 t1 = gen_reg_rtx (mode);
48133 t2 = gen_reg_rtx (mode);
48134 if (TARGET_XOP && !uns_p)
48136 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48137 shuffle the elements once so that all elements are in the right
48138 place for immediate use: { A C B D }. */
48139 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48140 const1_rtx, GEN_INT (3)));
48141 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48142 const1_rtx, GEN_INT (3)));
48144 else
48146 /* Put the elements into place for the multiply. */
48147 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48148 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48149 high_p = false;
48151 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48152 break;
48154 case V8SImode:
48155 /* Shuffle the elements between the lanes. After this we
48156 have { A B E F | C D G H } for each operand. */
48157 t1 = gen_reg_rtx (V4DImode);
48158 t2 = gen_reg_rtx (V4DImode);
48159 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48160 const0_rtx, const2_rtx,
48161 const1_rtx, GEN_INT (3)));
48162 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48163 const0_rtx, const2_rtx,
48164 const1_rtx, GEN_INT (3)));
48166 /* Shuffle the elements within the lanes. After this we
48167 have { A A B B | C C D D } or { E E F F | G G H H }. */
48168 t3 = gen_reg_rtx (V8SImode);
48169 t4 = gen_reg_rtx (V8SImode);
48170 mask = GEN_INT (high_p
48171 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48172 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48173 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48174 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48176 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48177 break;
48179 case V8HImode:
48180 case V16HImode:
48181 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48182 uns_p, OPTAB_DIRECT);
48183 t2 = expand_binop (mode,
48184 uns_p ? umul_highpart_optab : smul_highpart_optab,
48185 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48186 gcc_assert (t1 && t2);
48188 t3 = gen_reg_rtx (mode);
48189 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48190 emit_move_insn (dest, gen_lowpart (wmode, t3));
48191 break;
48193 case V16QImode:
48194 case V32QImode:
48195 case V32HImode:
48196 case V16SImode:
48197 case V64QImode:
48198 t1 = gen_reg_rtx (wmode);
48199 t2 = gen_reg_rtx (wmode);
48200 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48201 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48203 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48204 break;
48206 default:
48207 gcc_unreachable ();
48211 void
48212 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48214 rtx res_1, res_2, res_3, res_4;
48216 res_1 = gen_reg_rtx (V4SImode);
48217 res_2 = gen_reg_rtx (V4SImode);
48218 res_3 = gen_reg_rtx (V2DImode);
48219 res_4 = gen_reg_rtx (V2DImode);
48220 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48221 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48223 /* Move the results in element 2 down to element 1; we don't care
48224 what goes in elements 2 and 3. Then we can merge the parts
48225 back together with an interleave.
48227 Note that two other sequences were tried:
48228 (1) Use interleaves at the start instead of psrldq, which allows
48229 us to use a single shufps to merge things back at the end.
48230 (2) Use shufps here to combine the two vectors, then pshufd to
48231 put the elements in the correct order.
48232 In both cases the cost of the reformatting stall was too high
48233 and the overall sequence slower. */
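/* At this point res_3 holds the V2DImode products of the even elements
   { A0*B0, A2*B2 } and res_4 those of the odd elements { A1*B1, A3*B3 }.
   The two pshufds below move the low half of each product into elements
   0 and 1 (the remaining two are don't-care), and the interleave then
   produces { lo(A0*B0), lo(A1*B1), lo(A2*B2), lo(A3*B3) }.  */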
48235 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48236 const0_rtx, const2_rtx,
48237 const0_rtx, const0_rtx));
48238 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48239 const0_rtx, const2_rtx,
48240 const0_rtx, const0_rtx));
48241 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48243 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48246 void
48247 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48249 machine_mode mode = GET_MODE (op0);
48250 rtx t1, t2, t3, t4, t5, t6;
48252 if (TARGET_AVX512DQ && mode == V8DImode)
48253 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48254 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48255 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48256 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48257 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48258 else if (TARGET_XOP && mode == V2DImode)
48260 /* op1: A,B,C,D, op2: E,F,G,H */
48261 op1 = gen_lowpart (V4SImode, op1);
48262 op2 = gen_lowpart (V4SImode, op2);
48264 t1 = gen_reg_rtx (V4SImode);
48265 t2 = gen_reg_rtx (V4SImode);
48266 t3 = gen_reg_rtx (V2DImode);
48267 t4 = gen_reg_rtx (V2DImode);
48269 /* t1: B,A,D,C */
48270 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48271 GEN_INT (1),
48272 GEN_INT (0),
48273 GEN_INT (3),
48274 GEN_INT (2)));
48276 /* t2: (B*E),(A*F),(D*G),(C*H) */
48277 emit_insn (gen_mulv4si3 (t2, t1, op2));
48279 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48280 emit_insn (gen_xop_phadddq (t3, t2));
48282 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48283 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48285 /* Multiply lower parts and add all */
48286 t5 = gen_reg_rtx (V2DImode);
48287 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48288 gen_lowpart (V4SImode, op1),
48289 gen_lowpart (V4SImode, op2)));
48290 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48293 else
48295 machine_mode nmode;
48296 rtx (*umul) (rtx, rtx, rtx);
48298 if (mode == V2DImode)
48300 umul = gen_vec_widen_umult_even_v4si;
48301 nmode = V4SImode;
48303 else if (mode == V4DImode)
48305 umul = gen_vec_widen_umult_even_v8si;
48306 nmode = V8SImode;
48308 else if (mode == V8DImode)
48310 umul = gen_vec_widen_umult_even_v16si;
48311 nmode = V16SImode;
48313 else
48314 gcc_unreachable ();
48317 /* Multiply low parts. */
48318 t1 = gen_reg_rtx (mode);
48319 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48321 /* Shift input vectors right 32 bits so we can multiply high parts. */
48322 t6 = GEN_INT (32);
48323 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48324 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48326 /* Multiply high parts by low parts. */
48327 t4 = gen_reg_rtx (mode);
48328 t5 = gen_reg_rtx (mode);
48329 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48330 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48332 /* Combine and shift the highparts back. */
48333 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48334 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48336 /* Combine high and low parts. */
48337 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48340 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48341 gen_rtx_MULT (mode, op1, op2));
48344 /* Return 1 if control transfer instruction INSN
48345 should be encoded with the bnd prefix.
48346 If INSN is NULL then return 1 when control
48347 transfer instructions should be prefixed with
48348 bnd by default for the current function. */
48350 bool
48351 ix86_bnd_prefixed_insn_p (rtx insn)
48353 /* For call insns check special flag. */
48354 if (insn && CALL_P (insn))
48356 rtx call = get_call_rtx_from (insn);
48357 if (call)
48358 return CALL_EXPR_WITH_BOUNDS_P (call);
48361 /* All other insns are prefixed only if function is instrumented. */
48362 return chkp_function_instrumented_p (current_function_decl);
48365 /* Calculate integer abs() using only SSE2 instructions. */
48367 void
48368 ix86_expand_sse2_abs (rtx target, rtx input)
48370 machine_mode mode = GET_MODE (target);
48371 rtx tmp0, tmp1, x;
48373 switch (mode)
48375 /* For 32-bit signed integer X, the best way to calculate the absolute
48376 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
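/* For example, X == -5: X >> 31 == -1, -1 ^ -5 == 4, and 4 - (-1) == 5.
   For non-negative X the shift yields 0 and X is returned unchanged.  */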
48377 case V4SImode:
48378 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48379 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48380 NULL, 0, OPTAB_DIRECT);
48381 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48382 NULL, 0, OPTAB_DIRECT);
48383 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48384 target, 0, OPTAB_DIRECT);
48385 break;
48387 /* For 16-bit signed integer X, the best way to calculate the absolute
48388 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48389 case V8HImode:
48390 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48392 x = expand_simple_binop (mode, SMAX, tmp0, input,
48393 target, 0, OPTAB_DIRECT);
48394 break;
48396 /* For 8-bit signed integer X, the best way to calculate the absolute
48397 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48398 as SSE2 provides the PMINUB insn. */
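/* For example, X == -3: as unsigned chars the two operands are 253 and 3,
   so the minimum 3 equals |X|. For non-negative X, -X viewed as unsigned
   is at least 128 (or 0 when X is 0), so X itself is picked.  */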
48399 case V16QImode:
48400 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48402 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48403 target, 0, OPTAB_DIRECT);
48404 break;
48406 default:
48407 gcc_unreachable ();
48410 if (x != target)
48411 emit_move_insn (target, x);
48414 /* Expand an extract from a vector register through pextr insn.
48415 Return true if successful. */
48417 bool
48418 ix86_expand_pextr (rtx *operands)
48420 rtx dst = operands[0];
48421 rtx src = operands[1];
48423 unsigned int size = INTVAL (operands[2]);
48424 unsigned int pos = INTVAL (operands[3]);
48426 if (SUBREG_P (dst))
48428 /* Reject non-lowpart subregs. */
48429 if (SUBREG_BYTE (dst) > 0)
48430 return false;
48431 dst = SUBREG_REG (dst);
48434 if (SUBREG_P (src))
48436 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48437 src = SUBREG_REG (src);
48440 switch (GET_MODE (src))
48442 case V16QImode:
48443 case V8HImode:
48444 case V4SImode:
48445 case V2DImode:
48446 case V1TImode:
48447 case TImode:
48449 machine_mode srcmode, dstmode;
48450 rtx d, pat;
48452 dstmode = mode_for_size (size, MODE_INT, 0);
48454 switch (dstmode)
48456 case QImode:
48457 if (!TARGET_SSE4_1)
48458 return false;
48459 srcmode = V16QImode;
48460 break;
48462 case HImode:
48463 if (!TARGET_SSE2)
48464 return false;
48465 srcmode = V8HImode;
48466 break;
48468 case SImode:
48469 if (!TARGET_SSE4_1)
48470 return false;
48471 srcmode = V4SImode;
48472 break;
48474 case DImode:
48475 gcc_assert (TARGET_64BIT);
48476 if (!TARGET_SSE4_1)
48477 return false;
48478 srcmode = V2DImode;
48479 break;
48481 default:
48482 return false;
48485 /* Reject extractions from misaligned positions. */
48486 if (pos & (size-1))
48487 return false;
48489 if (GET_MODE (dst) == dstmode)
48490 d = dst;
48491 else
48492 d = gen_reg_rtx (dstmode);
48494 /* Construct insn pattern. */
48495 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48496 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48498 /* Let the rtl optimizers know about the zero extension performed. */
48499 if (dstmode == QImode || dstmode == HImode)
48501 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48502 d = gen_lowpart (SImode, d);
48505 emit_insn (gen_rtx_SET (d, pat));
48507 if (d != dst)
48508 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48509 return true;
48512 default:
48513 return false;
48517 /* Expand an insert into a vector register through pinsr insn.
48518 Return true if successful. */
48520 bool
48521 ix86_expand_pinsr (rtx *operands)
48523 rtx dst = operands[0];
48524 rtx src = operands[3];
48526 unsigned int size = INTVAL (operands[1]);
48527 unsigned int pos = INTVAL (operands[2]);
48529 if (SUBREG_P (dst))
48531 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48532 dst = SUBREG_REG (dst);
48535 switch (GET_MODE (dst))
48537 case V16QImode:
48538 case V8HImode:
48539 case V4SImode:
48540 case V2DImode:
48541 case V1TImode:
48542 case TImode:
48544 machine_mode srcmode, dstmode;
48545 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48546 rtx d;
48548 srcmode = mode_for_size (size, MODE_INT, 0);
48550 switch (srcmode)
48552 case QImode:
48553 if (!TARGET_SSE4_1)
48554 return false;
48555 dstmode = V16QImode;
48556 pinsr = gen_sse4_1_pinsrb;
48557 break;
48559 case HImode:
48560 if (!TARGET_SSE2)
48561 return false;
48562 dstmode = V8HImode;
48563 pinsr = gen_sse2_pinsrw;
48564 break;
48566 case SImode:
48567 if (!TARGET_SSE4_1)
48568 return false;
48569 dstmode = V4SImode;
48570 pinsr = gen_sse4_1_pinsrd;
48571 break;
48573 case DImode:
48574 gcc_assert (TARGET_64BIT);
48575 if (!TARGET_SSE4_1)
48576 return false;
48577 dstmode = V2DImode;
48578 pinsr = gen_sse4_1_pinsrq;
48579 break;
48581 default:
48582 return false;
48585 /* Reject insertions to misaligned positions. */
48586 if (pos & (size-1))
48587 return false;
48589 if (SUBREG_P (src))
48591 unsigned int srcpos = SUBREG_BYTE (src);
48593 if (srcpos > 0)
48595 rtx extr_ops[4];
48597 extr_ops[0] = gen_reg_rtx (srcmode);
48598 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48599 extr_ops[2] = GEN_INT (size);
48600 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48602 if (!ix86_expand_pextr (extr_ops))
48603 return false;
48605 src = extr_ops[0];
48607 else
48608 src = gen_lowpart (srcmode, SUBREG_REG (src));
48611 if (GET_MODE (dst) == dstmode)
48612 d = dst;
48613 else
48614 d = gen_reg_rtx (dstmode);
48616 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48617 gen_lowpart (srcmode, src),
48618 GEN_INT (1 << (pos / size))));
48619 if (d != dst)
48620 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48621 return true;
48624 default:
48625 return false;
48629 /* This function returns the calling-ABI-specific va_list type node.
48630 It returns the FNDECL-specific va_list type. */
48632 static tree
48633 ix86_fn_abi_va_list (tree fndecl)
48635 if (!TARGET_64BIT)
48636 return va_list_type_node;
48637 gcc_assert (fndecl != NULL_TREE);
48639 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48640 return ms_va_list_type_node;
48641 else
48642 return sysv_va_list_type_node;
48645 /* Returns the canonical va_list type specified by TYPE. If there
48646 is no valid TYPE provided, it returns NULL_TREE. */
48648 static tree
48649 ix86_canonical_va_list_type (tree type)
48651 if (TARGET_64BIT)
48653 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48654 return ms_va_list_type_node;
48656 if ((TREE_CODE (type) == ARRAY_TYPE
48657 && integer_zerop (array_type_nelts (type)))
48658 || POINTER_TYPE_P (type))
48660 tree elem_type = TREE_TYPE (type);
48661 if (TREE_CODE (elem_type) == RECORD_TYPE
48662 && lookup_attribute ("sysv_abi va_list",
48663 TYPE_ATTRIBUTES (elem_type)))
48664 return sysv_va_list_type_node;
48667 return NULL_TREE;
48670 return std_canonical_va_list_type (type);
48673 /* Iterate through the target-specific builtin types for va_list.
48674 IDX denotes the iterator, *PTREE is set to the result type of
48675 the va_list builtin, and *PNAME to its internal name.
48676 Returns zero if there is no element for this index, otherwise
48677 IDX should be increased upon the next call.
48678 Note, do not iterate a base builtin's name like __builtin_va_list.
48679 Used from c_common_nodes_and_builtins. */
48681 static int
48682 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48684 if (TARGET_64BIT)
48686 switch (idx)
48688 default:
48689 break;
48691 case 0:
48692 *ptree = ms_va_list_type_node;
48693 *pname = "__builtin_ms_va_list";
48694 return 1;
48696 case 1:
48697 *ptree = sysv_va_list_type_node;
48698 *pname = "__builtin_sysv_va_list";
48699 return 1;
48703 return 0;
48706 #undef TARGET_SCHED_DISPATCH
48707 #define TARGET_SCHED_DISPATCH has_dispatch
48708 #undef TARGET_SCHED_DISPATCH_DO
48709 #define TARGET_SCHED_DISPATCH_DO do_dispatch
48710 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48711 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48712 #undef TARGET_SCHED_REORDER
48713 #define TARGET_SCHED_REORDER ix86_sched_reorder
48714 #undef TARGET_SCHED_ADJUST_PRIORITY
48715 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48716 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48717 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48718 ix86_dependencies_evaluation_hook
48720 /* The size of the dispatch window is the total number of bytes of
48721 object code allowed in a window. */
48722 #define DISPATCH_WINDOW_SIZE 16
48724 /* Number of dispatch windows considered for scheduling. */
48725 #define MAX_DISPATCH_WINDOWS 3
48727 /* Maximum number of instructions in a window. */
48728 #define MAX_INSN 4
48730 /* Maximum number of immediate operands in a window. */
48731 #define MAX_IMM 4
48733 /* Maximum number of immediate bits allowed in a window. */
48734 #define MAX_IMM_SIZE 128
48736 /* Maximum number of 32 bit immediates allowed in a window. */
48737 #define MAX_IMM_32 4
48739 /* Maximum number of 64 bit immediates allowed in a window. */
48740 #define MAX_IMM_64 2
48742 /* Maximum total of loads or prefetches allowed in a window. */
48743 #define MAX_LOAD 2
48745 /* Maximum total of stores allowed in a window. */
48746 #define MAX_STORE 1
48748 #undef BIG
48749 #define BIG 100
48752 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
48753 enum dispatch_group {
48754 disp_no_group = 0,
48755 disp_load,
48756 disp_store,
48757 disp_load_store,
48758 disp_prefetch,
48759 disp_imm,
48760 disp_imm_32,
48761 disp_imm_64,
48762 disp_branch,
48763 disp_cmp,
48764 disp_jcc,
48765 disp_last
48768 /* Number of allowable groups in a dispatch window. It is an array
48769 indexed by dispatch_group enum. 100 is used as a big number,
48770 because the number of these kinds of operations does not have any
48771 effect in the dispatch window, but we need them for other reasons in
48772 the table. */
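/* In enum order the entries below are: no_group 0, load 2 (MAX_LOAD),
   store 1 (MAX_STORE), load_store 1, prefetch 2, imm 4 (MAX_IMM),
   imm_32 4 (MAX_IMM_32), imm_64 2 (MAX_IMM_64), branch 1, cmp BIG,
   jcc BIG.  */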
48773 static unsigned int num_allowable_groups[disp_last] = {
48774 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
48777 char group_name[disp_last + 1][16] = {
48778 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
48779 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
48780 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
48783 /* Instruction path. */
48784 enum insn_path {
48785 no_path = 0,
48786 path_single, /* Single micro op. */
48787 path_double, /* Double micro op. */
48788 path_multi, /* Instructions with more than 2 micro ops. */
48789 last_path
48792 /* sched_insn_info describes one instruction scheduled into a dispatch
48793 window: the insn itself, its dispatch group and decode path, and its
48794 byte and immediate sizes.
48796 Windows are allocated for each basic block and are linked
48797 together. */
48798 typedef struct sched_insn_info_s {
48799 rtx insn;
48800 enum dispatch_group group;
48801 enum insn_path path;
48802 int byte_len;
48803 int imm_bytes;
48804 } sched_insn_info;
48806 /* Linked list of dispatch windows. This is a two way list of
48807 dispatch windows of a basic block. It contains information about
48808 the number of uops in the window and the total number of
48809 instructions and of bytes in the object code for this dispatch
48810 window. */
48811 typedef struct dispatch_windows_s {
48812 int num_insn; /* Number of insn in the window. */
48813 int num_uops; /* Number of uops in the window. */
48814 int window_size; /* Number of bytes in the window. */
48815 int window_num; /* Window number, either 0 or 1. */
48816 int num_imm; /* Number of immediates in the window. */
48817 int num_imm_32; /* Number of 32 bit immediates in the window. */
48818 int num_imm_64; /* Number of 64 bit immediates in the window. */
48819 int imm_size; /* Total size in bytes of immediates in the window. */
48820 int num_loads; /* Total memory loads in the window. */
48821 int num_stores; /* Total memory stores in the window. */
48822 int violation; /* Violation exists in window. */
48823 sched_insn_info *window; /* Pointer to the window. */
48824 struct dispatch_windows_s *next;
48825 struct dispatch_windows_s *prev;
48826 } dispatch_windows;
48828 /* Immediate values used in an insn. */
48829 typedef struct imm_info_s
48831 int imm;
48832 int imm32;
48833 int imm64;
48834 } imm_info;
48836 static dispatch_windows *dispatch_window_list;
48837 static dispatch_windows *dispatch_window_list1;
48839 /* Get dispatch group of insn. */
48841 static enum dispatch_group
48842 get_mem_group (rtx_insn *insn)
48844 enum attr_memory memory;
48846 if (INSN_CODE (insn) < 0)
48847 return disp_no_group;
48848 memory = get_attr_memory (insn);
48849 if (memory == MEMORY_STORE)
48850 return disp_store;
48852 if (memory == MEMORY_LOAD)
48853 return disp_load;
48855 if (memory == MEMORY_BOTH)
48856 return disp_load_store;
48858 return disp_no_group;
48861 /* Return true if insn is a compare instruction. */
48863 static bool
48864 is_cmp (rtx_insn *insn)
48866 enum attr_type type;
48868 type = get_attr_type (insn);
48869 return (type == TYPE_TEST
48870 || type == TYPE_ICMP
48871 || type == TYPE_FCMP
48872 || GET_CODE (PATTERN (insn)) == COMPARE);
48875 /* Return true if a dispatch violation was encountered. */
48877 static bool
48878 dispatch_violation (void)
48880 if (dispatch_window_list->next)
48881 return dispatch_window_list->next->violation;
48882 return dispatch_window_list->violation;
48885 /* Return true if insn is a branch instruction. */
48887 static bool
48888 is_branch (rtx_insn *insn)
48890 return (CALL_P (insn) || JUMP_P (insn));
48893 /* Return true if insn is a prefetch instruction. */
48895 static bool
48896 is_prefetch (rtx_insn *insn)
48898 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
48901 /* This function initializes a dispatch window and the list container holding a
48902 pointer to the window. */
48904 static void
48905 init_window (int window_num)
48907 int i;
48908 dispatch_windows *new_list;
48910 if (window_num == 0)
48911 new_list = dispatch_window_list;
48912 else
48913 new_list = dispatch_window_list1;
48915 new_list->num_insn = 0;
48916 new_list->num_uops = 0;
48917 new_list->window_size = 0;
48918 new_list->next = NULL;
48919 new_list->prev = NULL;
48920 new_list->window_num = window_num;
48921 new_list->num_imm = 0;
48922 new_list->num_imm_32 = 0;
48923 new_list->num_imm_64 = 0;
48924 new_list->imm_size = 0;
48925 new_list->num_loads = 0;
48926 new_list->num_stores = 0;
48927 new_list->violation = false;
48929 for (i = 0; i < MAX_INSN; i++)
48931 new_list->window[i].insn = NULL;
48932 new_list->window[i].group = disp_no_group;
48933 new_list->window[i].path = no_path;
48934 new_list->window[i].byte_len = 0;
48935 new_list->window[i].imm_bytes = 0;
48937 return;
48940 /* This function allocates and initializes a dispatch window and the
48941 list container holding a pointer to the window. */
48943 static dispatch_windows *
48944 allocate_window (void)
48946 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
48947 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
48949 return new_list;
48952 /* This routine initializes the dispatch scheduling information. It
48953 initiates building dispatch scheduler tables and constructs the
48954 first dispatch window. */
48956 static void
48957 init_dispatch_sched (void)
48959 /* Allocate a dispatch list and a window. */
48960 dispatch_window_list = allocate_window ();
48961 dispatch_window_list1 = allocate_window ();
48962 init_window (0);
48963 init_window (1);
48966 /* This function returns true if a branch is detected. End of a basic block
48967 does not have to be a branch, but here we assume only branches end a
48968 window. */
48970 static bool
48971 is_end_basic_block (enum dispatch_group group)
48973 return group == disp_branch;
48976 /* This function is called when the end of a window processing is reached. */
48978 static void
48979 process_end_window (void)
48981 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
48982 if (dispatch_window_list->next)
48984 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
48985 gcc_assert (dispatch_window_list->window_size
48986 + dispatch_window_list1->window_size <= 48);
48987 init_window (1);
48989 init_window (0);
48992 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
48993 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
48994 for 48 bytes of instructions. Note that these windows are not dispatch
48995 windows whose sizes are DISPATCH_WINDOW_SIZE. */
48997 static dispatch_windows *
48998 allocate_next_window (int window_num)
49000 if (window_num == 0)
49002 if (dispatch_window_list->next)
49003 init_window (1);
49004 init_window (0);
49005 return dispatch_window_list;
49008 dispatch_window_list->next = dispatch_window_list1;
49009 dispatch_window_list1->prev = dispatch_window_list;
49011 return dispatch_window_list1;
49014 /* Compute number of immediate operands of an instruction. */
49016 static void
49017 find_constant (rtx in_rtx, imm_info *imm_values)
49019 if (INSN_P (in_rtx))
49020 in_rtx = PATTERN (in_rtx);
49021 subrtx_iterator::array_type array;
49022 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
49023 if (const_rtx x = *iter)
49024 switch (GET_CODE (x))
49026 case CONST:
49027 case SYMBOL_REF:
49028 case CONST_INT:
49029 (imm_values->imm)++;
49030 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
49031 (imm_values->imm32)++;
49032 else
49033 (imm_values->imm64)++;
49034 break;
49036 case CONST_DOUBLE:
49037 case CONST_WIDE_INT:
49038 (imm_values->imm)++;
49039 (imm_values->imm64)++;
49040 break;
49042 case CODE_LABEL:
49043 if (LABEL_KIND (x) == LABEL_NORMAL)
49045 (imm_values->imm)++;
49046 (imm_values->imm32)++;
49048 break;
49050 default:
49051 break;
49055 /* Return the total size in bytes of the immediate operands of an instruction
49056 along with the number of corresponding immediate operands. It initializes
49057 its parameters to zero before calling FIND_CONSTANT.
49058 INSN is the input instruction. IMM is the total of immediates.
49059 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
49060 bit immediates. */
49062 static int
49063 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
49065 imm_info imm_values = {0, 0, 0};
49067 find_constant (insn, &imm_values);
49068 *imm = imm_values.imm;
49069 *imm32 = imm_values.imm32;
49070 *imm64 = imm_values.imm64;
49071 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
49074 /* This function indicates whether an instruction has any immediate
49075 operands. */
49077 static bool
49078 has_immediate (rtx_insn *insn)
49080 int num_imm_operand;
49081 int num_imm32_operand;
49082 int num_imm64_operand;
49084 if (insn)
49085 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49086 &num_imm64_operand);
49087 return false;
49090 /* Return the decode path (single, double or multi micro-op) for INSN. */
49092 static enum insn_path
49093 get_insn_path (rtx_insn *insn)
49095 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
49097 if ((int)path == 0)
49098 return path_single;
49100 if ((int)path == 1)
49101 return path_double;
49103 return path_multi;
49106 /* Return insn dispatch group. */
49108 static enum dispatch_group
49109 get_insn_group (rtx_insn *insn)
49111 enum dispatch_group group = get_mem_group (insn);
49112 if (group)
49113 return group;
49115 if (is_branch (insn))
49116 return disp_branch;
49118 if (is_cmp (insn))
49119 return disp_cmp;
49121 if (has_immediate (insn))
49122 return disp_imm;
49124 if (is_prefetch (insn))
49125 return disp_prefetch;
49127 return disp_no_group;
49130 /* Count number of GROUP restricted instructions in a dispatch
49131 window WINDOW_LIST. */
49133 static int
49134 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
49136 enum dispatch_group group = get_insn_group (insn);
49137 int imm_size;
49138 int num_imm_operand;
49139 int num_imm32_operand;
49140 int num_imm64_operand;
49142 if (group == disp_no_group)
49143 return 0;
49145 if (group == disp_imm)
49147 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49148 &num_imm64_operand);
49149 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
49150 || num_imm_operand + window_list->num_imm > MAX_IMM
49151 || (num_imm32_operand > 0
49152 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
49153 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
49154 || (num_imm64_operand > 0
49155 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
49156 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
49157 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
49158 && num_imm64_operand > 0
49159 && ((window_list->num_imm_64 > 0
49160 && window_list->num_insn >= 2)
49161 || window_list->num_insn >= 3)))
49162 return BIG;
49164 return 1;
49167 if ((group == disp_load_store
49168 && (window_list->num_loads >= MAX_LOAD
49169 || window_list->num_stores >= MAX_STORE))
49170 || ((group == disp_load
49171 || group == disp_prefetch)
49172 && window_list->num_loads >= MAX_LOAD)
49173 || (group == disp_store
49174 && window_list->num_stores >= MAX_STORE))
49175 return BIG;
49177 return 1;
49180 /* This function returns true if insn satisfies dispatch rules on the
49181 last window scheduled. */
49183 static bool
49184 fits_dispatch_window (rtx_insn *insn)
49186 dispatch_windows *window_list = dispatch_window_list;
49187 dispatch_windows *window_list_next = dispatch_window_list->next;
49188 unsigned int num_restrict;
49189 enum dispatch_group group = get_insn_group (insn);
49190 enum insn_path path = get_insn_path (insn);
49191 int sum;
49193 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
49194 instructions should be given the lowest priority in the
49195 scheduling process in the Haifa scheduler to make sure they will be
49196 scheduled in the same dispatch window as the reference to them. */
49197 if (group == disp_jcc || group == disp_cmp)
49198 return false;
49200 /* Check nonrestricted. */
49201 if (group == disp_no_group || group == disp_branch)
49202 return true;
49204 /* Get last dispatch window. */
49205 if (window_list_next)
49206 window_list = window_list_next;
49208 if (window_list->window_num == 1)
49210 sum = window_list->prev->window_size + window_list->window_size;
49212 if (sum == 32
49213 || (min_insn_size (insn) + sum) >= 48)
49214 /* Window 1 is full. Go for next window. */
49215 return true;
49218 num_restrict = count_num_restricted (insn, window_list);
49220 if (num_restrict > num_allowable_groups[group])
49221 return false;
49223 /* See if it fits in the first window. */
49224 if (window_list->window_num == 0)
49226 /* The first window should have only single- and double-path
49227 uops. */
49228 if (path == path_double
49229 && (window_list->num_uops + 2) > MAX_INSN)
49230 return false;
49231 else if (path != path_single)
49232 return false;
49234 return true;
49237 /* Add an instruction INSN with NUM_UOPS micro-operations to the
49238 dispatch window WINDOW_LIST. */
49240 static void
49241 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
49243 int byte_len = min_insn_size (insn);
49244 int num_insn = window_list->num_insn;
49245 int imm_size;
49246 sched_insn_info *window = window_list->window;
49247 enum dispatch_group group = get_insn_group (insn);
49248 enum insn_path path = get_insn_path (insn);
49249 int num_imm_operand;
49250 int num_imm32_operand;
49251 int num_imm64_operand;
49253 if (!window_list->violation && group != disp_cmp
49254 && !fits_dispatch_window (insn))
49255 window_list->violation = true;
49257 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49258 &num_imm64_operand);
49260 /* Initialize window with new instruction. */
49261 window[num_insn].insn = insn;
49262 window[num_insn].byte_len = byte_len;
49263 window[num_insn].group = group;
49264 window[num_insn].path = path;
49265 window[num_insn].imm_bytes = imm_size;
49267 window_list->window_size += byte_len;
49268 window_list->num_insn = num_insn + 1;
49269 window_list->num_uops = window_list->num_uops + num_uops;
49270 window_list->imm_size += imm_size;
49271 window_list->num_imm += num_imm_operand;
49272 window_list->num_imm_32 += num_imm32_operand;
49273 window_list->num_imm_64 += num_imm64_operand;
49275 if (group == disp_store)
49276 window_list->num_stores += 1;
49277 else if (group == disp_load
49278 || group == disp_prefetch)
49279 window_list->num_loads += 1;
49280 else if (group == disp_load_store)
49282 window_list->num_stores += 1;
49283 window_list->num_loads += 1;
49287 /* Adds a scheduled instruction, INSN, to the current dispatch window.
49288 If the total bytes of instructions or the number of instructions in
49289 the window exceeds the allowed maximum, it allocates a new window. */
49291 static void
49292 add_to_dispatch_window (rtx_insn *insn)
49294 int byte_len;
49295 dispatch_windows *window_list;
49296 dispatch_windows *next_list;
49297 dispatch_windows *window0_list;
49298 enum insn_path path;
49299 enum dispatch_group insn_group;
49300 bool insn_fits;
49301 int num_insn;
49302 int num_uops;
49303 int window_num;
49304 int insn_num_uops;
49305 int sum;
49307 if (INSN_CODE (insn) < 0)
49308 return;
49310 byte_len = min_insn_size (insn);
49311 window_list = dispatch_window_list;
49312 next_list = window_list->next;
49313 path = get_insn_path (insn);
49314 insn_group = get_insn_group (insn);
49316 /* Get the last dispatch window. */
49317 if (next_list)
49318 window_list = dispatch_window_list->next;
49320 if (path == path_single)
49321 insn_num_uops = 1;
49322 else if (path == path_double)
49323 insn_num_uops = 2;
49324 else
49325 insn_num_uops = (int) path;
49327 /* If current window is full, get a new window.
49328 Window number zero is full if MAX_INSN uops are scheduled in it.
49329 Window number one is full if window zero's bytes plus window
49330 one's bytes equal 32, or if adding the bytes of the new instruction
49331 brings the total to 48 or more, or if it already has MAX_INSN
49332 instructions in it. */
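/* For example, if window zero holds 20 bytes and window one holds 12,
   the sum is 32 and both windows are flushed; with a sum of 30, an
   18-byte instruction would bring the total to 48 and do the same.  */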
49333 num_insn = window_list->num_insn;
49334 num_uops = window_list->num_uops;
49335 window_num = window_list->window_num;
49336 insn_fits = fits_dispatch_window (insn);
49338 if (num_insn >= MAX_INSN
49339 || num_uops + insn_num_uops > MAX_INSN
49340 || !(insn_fits))
49342 window_num = ~window_num & 1;
49343 window_list = allocate_next_window (window_num);
49346 if (window_num == 0)
49348 add_insn_window (insn, window_list, insn_num_uops);
49349 if (window_list->num_insn >= MAX_INSN
49350 && insn_group == disp_branch)
49352 process_end_window ();
49353 return;
49356 else if (window_num == 1)
49358 window0_list = window_list->prev;
49359 sum = window0_list->window_size + window_list->window_size;
49360 if (sum == 32
49361 || (byte_len + sum) >= 48)
49363 process_end_window ();
49364 window_list = dispatch_window_list;
49367 add_insn_window (insn, window_list, insn_num_uops);
49369 else
49370 gcc_unreachable ();
49372 if (is_end_basic_block (insn_group))
49374 /* End of basic block is reached; do end-basic-block processing. */
49375 process_end_window ();
49376 return;
49380 /* Print the dispatch window, WINDOW_NUM, to FILE. */
49382 DEBUG_FUNCTION static void
49383 debug_dispatch_window_file (FILE *file, int window_num)
49385 dispatch_windows *list;
49386 int i;
49388 if (window_num == 0)
49389 list = dispatch_window_list;
49390 else
49391 list = dispatch_window_list1;
49393 fprintf (file, "Window #%d:\n", list->window_num);
49394 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
49395 list->num_insn, list->num_uops, list->window_size);
49396 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49397 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
49399 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
49400 list->num_stores);
49401 fprintf (file, " insn info:\n");
49403 for (i = 0; i < MAX_INSN; i++)
49405 if (!list->window[i].insn)
49406 break;
49407 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
49408 i, group_name[list->window[i].group],
49409 i, (void *)list->window[i].insn,
49410 i, list->window[i].path,
49411 i, list->window[i].byte_len,
49412 i, list->window[i].imm_bytes);
49416 /* Print to stdout a dispatch window. */
49418 DEBUG_FUNCTION void
49419 debug_dispatch_window (int window_num)
49421 debug_dispatch_window_file (stdout, window_num);
49424 /* Print INSN dispatch information to FILE. */
49426 DEBUG_FUNCTION static void
49427 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
49429 int byte_len;
49430 enum insn_path path;
49431 enum dispatch_group group;
49432 int imm_size;
49433 int num_imm_operand;
49434 int num_imm32_operand;
49435 int num_imm64_operand;
49437 if (INSN_CODE (insn) < 0)
49438 return;
49440 byte_len = min_insn_size (insn);
49441 path = get_insn_path (insn);
49442 group = get_insn_group (insn);
49443 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
49444 &num_imm64_operand);
49446 fprintf (file, " insn info:\n");
49447 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
49448 group_name[group], path, byte_len);
49449 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
49450 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
49453 /* Print to STDOUT the status of the ready list with respect to
49454 dispatch windows. */
49456 DEBUG_FUNCTION void
49457 debug_ready_dispatch (void)
49459 int i;
49460 int no_ready = number_in_ready ();
49462 fprintf (stdout, "Number of ready: %d\n", no_ready);
49464 for (i = 0; i < no_ready; i++)
49465 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
49468 /* This routine is the driver of the dispatch scheduler. */
49470 static void
49471 do_dispatch (rtx_insn *insn, int mode)
49473 if (mode == DISPATCH_INIT)
49474 init_dispatch_sched ();
49475 else if (mode == ADD_TO_DISPATCH_WINDOW)
49476 add_to_dispatch_window (insn);
49479 /* Return TRUE if Dispatch Scheduling is supported. */
49481 static bool
49482 has_dispatch (rtx_insn *insn, int action)
49484 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
49485 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
49486 switch (action)
49488 default:
49489 return false;
49491 case IS_DISPATCH_ON:
49492 return true;
49494 case IS_CMP:
49495 return is_cmp (insn);
49497 case DISPATCH_VIOLATION:
49498 return dispatch_violation ();
49500 case FITS_DISPATCH_WINDOW:
49501 return fits_dispatch_window (insn);
49504 return false;
49507 /* Implementation of reassociation_width target hook used by
49508 reassoc phase to identify parallelism level in reassociated
49509 tree. The statement's tree_code is passed in OPC. The arguments'
49510 type is passed in MODE.
49512 Currently parallel reassociation is enabled for Atom
49513 processors only and we set reassociation width to be 2
49514 because Atom may issue up to 2 instructions per cycle.
49516 Return value should be fixed if parallel reassociation is
49517 enabled for other processors. */
49519 static int
49520 ix86_reassociation_width (unsigned int, machine_mode mode)
49522 /* Vector part. */
49523 if (VECTOR_MODE_P (mode))
49525 if (TARGET_VECTOR_PARALLEL_EXECUTION)
49526 return 2;
49527 else
49528 return 1;
49531 /* Scalar part. */
49532 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
49533 return 2;
49534 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
49535 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
49536 else
49537 return 1;
49540 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
49541 place emms and femms instructions. */
49543 static machine_mode
49544 ix86_preferred_simd_mode (machine_mode mode)
49546 if (!TARGET_SSE)
49547 return word_mode;
49549 switch (mode)
49551 case QImode:
49552 return TARGET_AVX512BW ? V64QImode :
49553 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
49554 case HImode:
49555 return TARGET_AVX512BW ? V32HImode :
49556 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
49557 case SImode:
49558 return TARGET_AVX512F ? V16SImode :
49559 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
49560 case DImode:
49561 return TARGET_AVX512F ? V8DImode :
49562 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
49564 case SFmode:
49565 if (TARGET_AVX512F)
49566 return V16SFmode;
49567 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49568 return V8SFmode;
49569 else
49570 return V4SFmode;
49572 case DFmode:
49573 if (TARGET_AVX512F)
49574 return V8DFmode;
49575 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49576 return V4DFmode;
49577 else if (TARGET_SSE2)
49578 return V2DFmode;
49579 /* FALLTHRU */
49581 default:
49582 return word_mode;
49586 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
49587 vectors. If AVX512F is enabled then try vectorizing with 512bit,
49588 256bit and 128bit vectors. */
49590 static unsigned int
49591 ix86_autovectorize_vector_sizes (void)
49593 return TARGET_AVX512F ? 64 | 32 | 16 :
49594 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
49597 /* Implementation of targetm.vectorize.get_mask_mode. */
49599 static machine_mode
49600 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
49602 unsigned elem_size = vector_size / nunits;
49604 /* Scalar mask case. */
49605 if ((TARGET_AVX512F && vector_size == 64)
49606 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
49608 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
49609 return smallest_mode_for_size (nunits, MODE_INT);
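/* Otherwise fall back to a vector mask: a vector with the same number
of elements as the data vector, whose element mode is the smallest
integer mode wide enough for the data element size.  */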
49612 machine_mode elem_mode
49613 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
49615 gcc_assert (elem_size * nunits == vector_size);
49617 return mode_for_vector (elem_mode, nunits);
49622 /* Return class of registers which could be used for pseudo of MODE
49623 and of class RCLASS for spilling instead of memory. Return NO_REGS
49624 if it is not possible or not profitable. */
49626 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
49628 static reg_class_t
49629 ix86_spill_class (reg_class_t rclass, machine_mode mode)
49631 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
49632 && TARGET_SSE2
49633 && TARGET_INTER_UNIT_MOVES_TO_VEC
49634 && TARGET_INTER_UNIT_MOVES_FROM_VEC
49635 && (mode == SImode || (TARGET_64BIT && mode == DImode))
49636 && INTEGER_CLASS_P (rclass))
49637 return ALL_SSE_REGS;
49638 return NO_REGS;
49641 /* Implement targetm.vectorize.init_cost. */
49643 static void *
49644 ix86_init_cost (struct loop *)
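/* The cost data is a triple of accumulators, indexed by vect_prologue,
vect_body and vect_epilogue.  */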
49646 unsigned *cost = XNEWVEC (unsigned, 3);
49647 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49648 return cost;
49651 /* Implement targetm.vectorize.add_stmt_cost. */
49653 static unsigned
49654 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49655 struct _stmt_vec_info *stmt_info, int misalign,
49656 enum vect_cost_model_location where)
49658 unsigned *cost = (unsigned *) data;
49659 unsigned retval = 0;
49661 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49662 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49664 /* Penalize DFmode vector operations for Bonnell. */
49665 if (TARGET_BONNELL && kind == vector_stmt
49666 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49667 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49669 /* Statements in an inner loop relative to the loop being
49670 vectorized are weighted more heavily. The value here is
49671 arbitrary and could potentially be improved with analysis. */
49672 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49673 count *= 50; /* FIXME. */
49675 retval = (unsigned) (count * stmt_cost);
49677 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
49678 for Silvermont, as it has an out-of-order integer pipeline and can
49679 execute 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49680 if (TARGET_SILVERMONT || TARGET_INTEL)
49681 if (stmt_info && stmt_info->stmt)
49683 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49684 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49685 retval = (retval * 17) / 10;
49688 cost[where] += retval;
49690 return retval;
49693 /* Implement targetm.vectorize.finish_cost. */
49695 static void
49696 ix86_finish_cost (void *data, unsigned *prologue_cost,
49697 unsigned *body_cost, unsigned *epilogue_cost)
49699 unsigned *cost = (unsigned *) data;
49700 *prologue_cost = cost[vect_prologue];
49701 *body_cost = cost[vect_body];
49702 *epilogue_cost = cost[vect_epilogue];
49705 /* Implement targetm.vectorize.destroy_cost_data. */
49707 static void
49708 ix86_destroy_cost_data (void *data)
49710 free (data);
49713 /* Validate target specific memory model bits in VAL. */
49715 static unsigned HOST_WIDE_INT
49716 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49718 enum memmodel model = memmodel_from_int (val);
49719 bool strong;
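/* IX86_HLE_ACQUIRE and IX86_HLE_RELEASE are target-specific bits outside
MEMMODEL_MASK; reject any other extra bits, and reject both HLE bits
being set at once.  */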
49721 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49722 |MEMMODEL_MASK)
49723 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49725 warning (OPT_Winvalid_memory_model,
49726 "Unknown architecture specific memory model");
49727 return MEMMODEL_SEQ_CST;
49729 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49730 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49732 warning (OPT_Winvalid_memory_model,
49733 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49734 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49736 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49738 warning (OPT_Winvalid_memory_model,
49739 "HLE_RELEASE not used with RELEASE or stronger memory model");
49740 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49742 return val;
49745 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49746 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49747 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49748 or number of vecsize_mangle variants that should be emitted. */
49750 static int
49751 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49752 struct cgraph_simd_clone *clonei,
49753 tree base_type, int num)
49755 int ret = 1;
49757 if (clonei->simdlen
49758 && (clonei->simdlen < 2
49759 || clonei->simdlen > 1024
49760 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49762 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49763 "unsupported simdlen %d", clonei->simdlen);
49764 return 0;
49767 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49768 if (TREE_CODE (ret_type) != VOID_TYPE)
49769 switch (TYPE_MODE (ret_type))
49771 case QImode:
49772 case HImode:
49773 case SImode:
49774 case DImode:
49775 case SFmode:
49776 case DFmode:
49777 /* case SCmode: */
49778 /* case DCmode: */
49779 break;
49780 default:
49781 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49782 "unsupported return type %qT for simd\n", ret_type);
49783 return 0;
49786 tree t;
49787 int i;
49789 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49790 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49791 switch (TYPE_MODE (TREE_TYPE (t)))
49793 case QImode:
49794 case HImode:
49795 case SImode:
49796 case DImode:
49797 case SFmode:
49798 case DFmode:
49799 /* case SCmode: */
49800 /* case DCmode: */
49801 break;
49802 default:
49803 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49804 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49805 return 0;
49808 if (clonei->cilk_elemental)
49810 /* Parse the processor clause here. If not present, default to 'b'. */
49811 clonei->vecsize_mangle = 'b';
49813 else if (!TREE_PUBLIC (node->decl))
49815 /* If the function isn't exported, we can pick up just one ISA
49816 for the clones. */
49817 if (TARGET_AVX512F)
49818 clonei->vecsize_mangle = 'e';
49819 else if (TARGET_AVX2)
49820 clonei->vecsize_mangle = 'd';
49821 else if (TARGET_AVX)
49822 clonei->vecsize_mangle = 'c';
49823 else
49824 clonei->vecsize_mangle = 'b';
49825 ret = 1;
49827 else
49829 clonei->vecsize_mangle = "bcde"[num];
49830 ret = 4;
49832 clonei->mask_mode = VOIDmode;
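/* Vector sizes for the ISA mangling letters: 'b' is SSE2, 'c' is AVX,
'd' is AVX2 and 'e' is AVX-512F (see ix86_simd_clone_adjust below).  */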
49833 switch (clonei->vecsize_mangle)
49835 case 'b':
49836 clonei->vecsize_int = 128;
49837 clonei->vecsize_float = 128;
49838 break;
49839 case 'c':
49840 clonei->vecsize_int = 128;
49841 clonei->vecsize_float = 256;
49842 break;
49843 case 'd':
49844 clonei->vecsize_int = 256;
49845 clonei->vecsize_float = 256;
49846 break;
49847 case 'e':
49848 clonei->vecsize_int = 512;
49849 clonei->vecsize_float = 512;
49850 if (TYPE_MODE (base_type) == QImode)
49851 clonei->mask_mode = DImode;
49852 else
49853 clonei->mask_mode = SImode;
49854 break;
49856 if (clonei->simdlen == 0)
49858 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49859 clonei->simdlen = clonei->vecsize_int;
49860 else
49861 clonei->simdlen = clonei->vecsize_float;
49862 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49864 else if (clonei->simdlen > 16)
49866 /* For compatibility with ICC, use the same upper bounds
49867 for simdlen. In particular, for CTYPE below, use the return type,
49868 unless the function returns void, in which case use the characteristic
49869 type. If it is possible for the given SIMDLEN to pass a CTYPE value
49870 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49871 for 64-bit code), accept that SIMDLEN; otherwise warn and don't
49872 emit the corresponding clone. */
49873 tree ctype = ret_type;
49874 if (TREE_CODE (ret_type) == VOID_TYPE)
49875 ctype = base_type;
49876 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49877 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49878 cnt /= clonei->vecsize_int;
49879 else
49880 cnt /= clonei->vecsize_float;
49881 if (cnt > (TARGET_64BIT ? 16 : 8))
49883 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49884 "unsupported simdlen %d", clonei->simdlen);
49885 return 0;
49888 return ret;
49891 /* Add target attribute to SIMD clone NODE if needed. */
49893 static void
49894 ix86_simd_clone_adjust (struct cgraph_node *node)
49896 const char *str = NULL;
49897 gcc_assert (node->decl == cfun->decl);
49898 switch (node->simdclone->vecsize_mangle)
49900 case 'b':
49901 if (!TARGET_SSE2)
49902 str = "sse2";
49903 break;
49904 case 'c':
49905 if (!TARGET_AVX)
49906 str = "avx";
49907 break;
49908 case 'd':
49909 if (!TARGET_AVX2)
49910 str = "avx2";
49911 break;
49912 case 'e':
49913 if (!TARGET_AVX512F)
49914 str = "avx512f";
49915 break;
49916 default:
49917 gcc_unreachable ();
49919 if (str == NULL)
49920 return;
49921 push_cfun (NULL);
49922 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49923 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49924 gcc_assert (ok);
49925 pop_cfun ();
49926 ix86_reset_previous_fndecl ();
49927 ix86_set_current_function (node->decl);
49930 /* If SIMD clone NODE can't be used in a vectorized loop
49931 in the current function, return -1, otherwise return the badness of using it
49932 (0 if it is most desirable from the vecsize_mangle point of view, 1
49933 slightly less desirable, etc.). */
49935 static int
49936 ix86_simd_clone_usable (struct cgraph_node *node)
49938 switch (node->simdclone->vecsize_mangle)
49940 case 'b':
49941 if (!TARGET_SSE2)
49942 return -1;
49943 if (!TARGET_AVX)
49944 return 0;
49945 return TARGET_AVX2 ? 2 : 1;
49946 case 'c':
49947 if (!TARGET_AVX)
49948 return -1;
49949 return TARGET_AVX2 ? 1 : 0;
49950 case 'd':
49951 if (!TARGET_AVX2)
49952 return -1;
49953 return 0;
49954 case 'e':
49955 if (!TARGET_AVX512F)
49956 return -1;
49957 return 0;
49958 default:
49959 gcc_unreachable ();
49963 /* This function adjusts the unroll factor based on
49964 the hardware capabilities. For example, bdver3 has
49965 a loop buffer which makes unrolling of smaller
49966 loops less important. This function decides the
49967 unroll factor using the number of memory references
49968 in the loop body (with 32 as the limit) as a heuristic. */
49970 static unsigned
49971 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49973 basic_block *bbs;
49974 rtx_insn *insn;
49975 unsigned i;
49976 unsigned mem_count = 0;
49978 if (!TARGET_ADJUST_UNROLL)
49979 return nunroll;
49981 /* Count the number of memory references within the loop body.
49982 This value determines the unrolling factor for bdver3 and bdver4
49983 architectures. */
49984 subrtx_iterator::array_type array;
49985 bbs = get_loop_body (loop);
49986 for (i = 0; i < loop->num_nodes; i++)
49987 FOR_BB_INSNS (bbs[i], insn)
49988 if (NONDEBUG_INSN_P (insn))
49989 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49990 if (const_rtx x = *iter)
49991 if (MEM_P (x))
49993 machine_mode mode = GET_MODE (x);
49994 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
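/* Memory references wider than 4 words count double in the heuristic.  */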
49995 if (n_words > 4)
49996 mem_count += 2;
49997 else
49998 mem_count += 1;
50000 free (bbs);
50002 if (mem_count && mem_count <= 32)
50003 return 32 / mem_count;
50005 return nunroll;
50009 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50011 static bool
50012 ix86_float_exceptions_rounding_supported_p (void)
50014 /* For x87 floating point with standard excess precision handling,
50015 there is no adddf3 pattern (since x87 floating point only has
50016 XFmode operations) so the default hook implementation gets this
50017 wrong. */
50018 return TARGET_80387 || TARGET_SSE_MATH;
50021 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50023 static void
50024 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50026 if (!TARGET_80387 && !TARGET_SSE_MATH)
50027 return;
50028 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50029 if (TARGET_80387)
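/* For the x87 FPU: the environment saved by fnstenv is 28 bytes, i.e. an
array of seven 32-bit words.  fnclex clears any pending exceptions for
the hold sequence, and fnstsw reads the status word containing the
exception flags for the update sequence.  */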
50031 tree fenv_index_type = build_index_type (size_int (6));
50032 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50033 tree fenv_var = create_tmp_var_raw (fenv_type);
50034 TREE_ADDRESSABLE (fenv_var) = 1;
50035 tree fenv_ptr = build_pointer_type (fenv_type);
50036 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50037 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50038 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50039 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50040 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50041 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50042 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50043 tree hold_fnclex = build_call_expr (fnclex, 0);
50044 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50045 NULL_TREE, NULL_TREE);
50046 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50047 hold_fnclex);
50048 *clear = build_call_expr (fnclex, 0);
50049 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50050 tree fnstsw_call = build_call_expr (fnstsw, 0);
50051 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50052 sw_var, fnstsw_call);
50053 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50054 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50055 exceptions_var, exceptions_x87);
50056 *update = build2 (COMPOUND_EXPR, integer_type_node,
50057 sw_mod, update_mod);
50058 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50059 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50061 if (TARGET_SSE_MATH)
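/* For SSE math: save MXCSR with stmxcsr, then load a copy with all
exception mask bits set (0x1f80) and the exception status flags
(low six bits) cleared for the hold sequence.  */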
50063 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50064 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50065 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50066 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50067 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50068 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50069 mxcsr_orig_var, stmxcsr_hold_call);
50070 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50071 mxcsr_orig_var,
50072 build_int_cst (unsigned_type_node, 0x1f80));
50073 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50074 build_int_cst (unsigned_type_node, 0xffffffc0));
50075 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50076 mxcsr_mod_var, hold_mod_val);
50077 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50078 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50079 hold_assign_orig, hold_assign_mod);
50080 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50081 ldmxcsr_hold_call);
50082 if (*hold)
50083 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50084 else
50085 *hold = hold_all;
50086 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50087 if (*clear)
50088 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50089 ldmxcsr_clear_call);
50090 else
50091 *clear = ldmxcsr_clear_call;
50092 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50093 tree exceptions_sse = fold_convert (integer_type_node,
50094 stxmcsr_update_call);
50095 if (*update)
50097 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50098 exceptions_var, exceptions_sse);
50099 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50100 exceptions_var, exceptions_mod);
50101 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50102 exceptions_assign);
50104 else
50105 *update = build2 (MODIFY_EXPR, integer_type_node,
50106 exceptions_var, exceptions_sse);
50107 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50108 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50109 ldmxcsr_update_call);
50111 tree atomic_feraiseexcept
50112 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50113 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50114 1, exceptions_var);
50115 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50116 atomic_feraiseexcept_call);
50119 /* Return mode to be used for bounds or VOIDmode
50120 if bounds are not supported. */
50122 static enum machine_mode
50123 ix86_mpx_bound_mode ()
50125 /* Do not support pointer checker if MPX
50126 is not enabled. */
50127 if (!TARGET_MPX)
50129 if (flag_check_pointer_bounds)
50130 warning (0, "Pointer Checker requires MPX support on this target."
50131 " Use -mmpx options to enable MPX.");
50132 return VOIDmode;
50135 return BNDmode;
50138 /* Return constant used to statically initialize constant bounds.
50140 This function is used to create special bound values. For now
50141 only INIT bounds and NONE bounds are expected. More special
50142 values may be added later. */
50144 static tree
50145 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50147 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50148 : build_zero_cst (pointer_sized_int_node);
50149 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50150 : build_minus_one_cst (pointer_sized_int_node);
50152 /* This function is supposed to be used to create INIT and
50153 NONE bounds only. */
50154 gcc_assert ((lb == 0 && ub == -1)
50155 || (lb == -1 && ub == 0));
50157 return build_complex (NULL, low, high);
50160 /* Generate a list of statements STMTS to initialize pointer bounds
50161 variable VAR with bounds LB and UB. Return the number of generated
50162 statements. */
50164 static int
50165 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50167 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50168 tree lhs, modify, var_p;
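/* Bounds are stored as two pointer-sized words: the lower bound followed
by the one's complement of the upper bound, as in the MPX bound
representation.  */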
50170 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50171 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50173 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50174 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50175 append_to_statement_list (modify, stmts);
50177 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50178 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50179 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50180 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50181 append_to_statement_list (modify, stmts);
50183 return 2;
50186 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50187 /* For i386, a common symbol is local only for non-PIE binaries. For
50188 x86-64, a common symbol is local only for non-PIE binaries or if the
50189 linker supports copy relocs in PIE binaries. */
50191 static bool
50192 ix86_binds_local_p (const_tree exp)
50194 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50195 (!flag_pic
50196 || (TARGET_64BIT
50197 && HAVE_LD_PIE_COPYRELOC != 0)));
50199 #endif
50201 /* If MEM is in the form of [base+offset], extract the two parts
50202 of the address into BASE and OFFSET, otherwise return false. */
50204 static bool
50205 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50207 rtx addr;
50209 gcc_assert (MEM_P (mem));
50211 addr = XEXP (mem, 0);
50213 if (GET_CODE (addr) == CONST)
50214 addr = XEXP (addr, 0);
50216 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50218 *base = addr;
50219 *offset = const0_rtx;
50220 return true;
50223 if (GET_CODE (addr) == PLUS
50224 && (REG_P (XEXP (addr, 0))
50225 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50226 && CONST_INT_P (XEXP (addr, 1)))
50228 *base = XEXP (addr, 0);
50229 *offset = XEXP (addr, 1);
50230 return true;
50233 return false;
50236 /* Given OPERANDS of consecutive load/store, check if we can merge
50237 them into move multiple. LOAD is true if they are load instructions.
50238 MODE is the mode of memory operands. */
50240 bool
50241 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50242 enum machine_mode mode)
50244 HOST_WIDE_INT offval_1, offval_2, msize;
50245 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50247 if (load)
50249 mem_1 = operands[1];
50250 mem_2 = operands[3];
50251 reg_1 = operands[0];
50252 reg_2 = operands[2];
50254 else
50256 mem_1 = operands[0];
50257 mem_2 = operands[2];
50258 reg_1 = operands[1];
50259 reg_2 = operands[3];
50262 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50264 if (REGNO (reg_1) != REGNO (reg_2))
50265 return false;
50267 /* Check if the addresses are in the form of [base+offset]. */
50268 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50269 return false;
50270 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
50271 return false;
50273 /* Check if the bases are the same. */
50274 if (!rtx_equal_p (base_1, base_2))
50275 return false;
50277 offval_1 = INTVAL (offset_1);
50278 offval_2 = INTVAL (offset_2);
50279 msize = GET_MODE_SIZE (mode);
50280 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
50281 if (offval_1 + msize != offval_2)
50282 return false;
50284 return true;
50287 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
50289 static bool
50290 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
50291 optimization_type opt_type)
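/* The first group of optabs expands to relatively long x87/SSE
sequences, so they are only used when optimizing for speed.  rint,
floor, ceil and btrunc are always supported when the SSE4.1 ROUND
instructions are available (TARGET_ROUND), and otherwise only for
speed.  */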
50293 switch (op)
50295 case asin_optab:
50296 case acos_optab:
50297 case log1p_optab:
50298 case exp_optab:
50299 case exp10_optab:
50300 case exp2_optab:
50301 case expm1_optab:
50302 case ldexp_optab:
50303 case scalb_optab:
50304 case round_optab:
50305 return opt_type == OPTIMIZE_FOR_SPEED;
50307 case rint_optab:
50308 if (SSE_FLOAT_MODE_P (mode1)
50309 && TARGET_SSE_MATH
50310 && !flag_trapping_math
50311 && !TARGET_ROUND)
50312 return opt_type == OPTIMIZE_FOR_SPEED;
50313 return true;
50315 case floor_optab:
50316 case ceil_optab:
50317 case btrunc_optab:
50318 if (SSE_FLOAT_MODE_P (mode1)
50319 && TARGET_SSE_MATH
50320 && !flag_trapping_math
50321 && TARGET_ROUND)
50322 return true;
50323 return opt_type == OPTIMIZE_FOR_SPEED;
50325 case rsqrt_optab:
50326 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
50328 default:
50329 return true;
50333 /* Address space support.
50335 This is not "far pointers" in the 16-bit sense, but an easy way
50336 to use %fs and %gs segment prefixes. Therefore:
50338 (a) All address spaces have the same modes,
50339 (b) All address spaces have the same address forms,
50340 (c) While %fs and %gs are technically subsets of the generic
50341 address space, they are probably not subsets of each other.
50342 (d) Since we have no access to the segment base register values
50343 without resorting to a system call, we cannot convert a
50344 non-default address space to a default address space.
50345 Therefore we do not claim %fs or %gs are subsets of generic.
50347 Therefore we can (mostly) use the default hooks. */
50349 /* All use of segmentation is assumed to make address 0 valid. */
50351 static bool
50352 ix86_addr_space_zero_address_valid (addr_space_t as)
50354 return as != ADDR_SPACE_GENERIC;
50356 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50357 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
50359 /* Initialize the GCC target structure. */
50360 #undef TARGET_RETURN_IN_MEMORY
50361 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50363 #undef TARGET_LEGITIMIZE_ADDRESS
50364 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50366 #undef TARGET_ATTRIBUTE_TABLE
50367 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50368 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50369 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50370 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50371 # undef TARGET_MERGE_DECL_ATTRIBUTES
50372 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50373 #endif
50375 #undef TARGET_COMP_TYPE_ATTRIBUTES
50376 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50378 #undef TARGET_INIT_BUILTINS
50379 #define TARGET_INIT_BUILTINS ix86_init_builtins
50380 #undef TARGET_BUILTIN_DECL
50381 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50382 #undef TARGET_EXPAND_BUILTIN
50383 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50385 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50386 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50387 ix86_builtin_vectorized_function
50389 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50390 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50392 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50393 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50395 #undef TARGET_BUILTIN_RECIPROCAL
50396 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50398 #undef TARGET_ASM_FUNCTION_EPILOGUE
50399 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50401 #undef TARGET_ENCODE_SECTION_INFO
50402 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50403 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50404 #else
50405 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50406 #endif
50408 #undef TARGET_ASM_OPEN_PAREN
50409 #define TARGET_ASM_OPEN_PAREN ""
50410 #undef TARGET_ASM_CLOSE_PAREN
50411 #define TARGET_ASM_CLOSE_PAREN ""
50413 #undef TARGET_ASM_BYTE_OP
50414 #define TARGET_ASM_BYTE_OP ASM_BYTE
50416 #undef TARGET_ASM_ALIGNED_HI_OP
50417 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50418 #undef TARGET_ASM_ALIGNED_SI_OP
50419 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50420 #ifdef ASM_QUAD
50421 #undef TARGET_ASM_ALIGNED_DI_OP
50422 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50423 #endif
50425 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50426 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50428 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50429 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50431 #undef TARGET_ASM_UNALIGNED_HI_OP
50432 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50433 #undef TARGET_ASM_UNALIGNED_SI_OP
50434 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50435 #undef TARGET_ASM_UNALIGNED_DI_OP
50436 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50438 #undef TARGET_PRINT_OPERAND
50439 #define TARGET_PRINT_OPERAND ix86_print_operand
50440 #undef TARGET_PRINT_OPERAND_ADDRESS
50441 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50442 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50443 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50444 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50445 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50447 #undef TARGET_SCHED_INIT_GLOBAL
50448 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50449 #undef TARGET_SCHED_ADJUST_COST
50450 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50451 #undef TARGET_SCHED_ISSUE_RATE
50452 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50453 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50454 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50455 ia32_multipass_dfa_lookahead
50456 #undef TARGET_SCHED_MACRO_FUSION_P
50457 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50458 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50459 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50461 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50462 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50464 #undef TARGET_MEMMODEL_CHECK
50465 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50467 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50468 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50470 #ifdef HAVE_AS_TLS
50471 #undef TARGET_HAVE_TLS
50472 #define TARGET_HAVE_TLS true
50473 #endif
50474 #undef TARGET_CANNOT_FORCE_CONST_MEM
50475 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50476 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50477 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50479 #undef TARGET_DELEGITIMIZE_ADDRESS
50480 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50482 #undef TARGET_MS_BITFIELD_LAYOUT_P
50483 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50485 #if TARGET_MACHO
50486 #undef TARGET_BINDS_LOCAL_P
50487 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50488 #else
50489 #undef TARGET_BINDS_LOCAL_P
50490 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50491 #endif
50492 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50493 #undef TARGET_BINDS_LOCAL_P
50494 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50495 #endif
50497 #undef TARGET_ASM_OUTPUT_MI_THUNK
50498 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50499 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50500 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50502 #undef TARGET_ASM_FILE_START
50503 #define TARGET_ASM_FILE_START x86_file_start
50505 #undef TARGET_OPTION_OVERRIDE
50506 #define TARGET_OPTION_OVERRIDE ix86_option_override
50508 #undef TARGET_REGISTER_MOVE_COST
50509 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50510 #undef TARGET_MEMORY_MOVE_COST
50511 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50512 #undef TARGET_RTX_COSTS
50513 #define TARGET_RTX_COSTS ix86_rtx_costs
50514 #undef TARGET_ADDRESS_COST
50515 #define TARGET_ADDRESS_COST ix86_address_cost
50517 #undef TARGET_FIXED_CONDITION_CODE_REGS
50518 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50519 #undef TARGET_CC_MODES_COMPATIBLE
50520 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50522 #undef TARGET_MACHINE_DEPENDENT_REORG
50523 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50525 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50526 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50528 #undef TARGET_BUILD_BUILTIN_VA_LIST
50529 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50531 #undef TARGET_FOLD_BUILTIN
50532 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50534 #undef TARGET_COMPARE_VERSION_PRIORITY
50535 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50537 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50538 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50539 ix86_generate_version_dispatcher_body
50541 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50542 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50543 ix86_get_function_versions_dispatcher
50545 #undef TARGET_ENUM_VA_LIST_P
50546 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50548 #undef TARGET_FN_ABI_VA_LIST
50549 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50551 #undef TARGET_CANONICAL_VA_LIST_TYPE
50552 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50554 #undef TARGET_EXPAND_BUILTIN_VA_START
50555 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50557 #undef TARGET_MD_ASM_ADJUST
50558 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50560 #undef TARGET_PROMOTE_PROTOTYPES
50561 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50562 #undef TARGET_SETUP_INCOMING_VARARGS
50563 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50564 #undef TARGET_MUST_PASS_IN_STACK
50565 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50566 #undef TARGET_FUNCTION_ARG_ADVANCE
50567 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50568 #undef TARGET_FUNCTION_ARG
50569 #define TARGET_FUNCTION_ARG ix86_function_arg
50570 #undef TARGET_INIT_PIC_REG
50571 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50572 #undef TARGET_USE_PSEUDO_PIC_REG
50573 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50574 #undef TARGET_FUNCTION_ARG_BOUNDARY
50575 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50576 #undef TARGET_PASS_BY_REFERENCE
50577 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50578 #undef TARGET_INTERNAL_ARG_POINTER
50579 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50580 #undef TARGET_UPDATE_STACK_BOUNDARY
50581 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50582 #undef TARGET_GET_DRAP_RTX
50583 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50584 #undef TARGET_STRICT_ARGUMENT_NAMING
50585 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50586 #undef TARGET_STATIC_CHAIN
50587 #define TARGET_STATIC_CHAIN ix86_static_chain
50588 #undef TARGET_TRAMPOLINE_INIT
50589 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50590 #undef TARGET_RETURN_POPS_ARGS
50591 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50593 #undef TARGET_LEGITIMATE_COMBINED_INSN
50594 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50596 #undef TARGET_ASAN_SHADOW_OFFSET
50597 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50599 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50600 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50602 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50603 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50605 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50606 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50608 #undef TARGET_C_MODE_FOR_SUFFIX
50609 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50611 #ifdef HAVE_AS_TLS
50612 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50613 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50614 #endif
50616 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50617 #undef TARGET_INSERT_ATTRIBUTES
50618 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50619 #endif
50621 #undef TARGET_MANGLE_TYPE
50622 #define TARGET_MANGLE_TYPE ix86_mangle_type
50624 #if !TARGET_MACHO
50625 #undef TARGET_STACK_PROTECT_FAIL
50626 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50627 #endif
50629 #undef TARGET_FUNCTION_VALUE
50630 #define TARGET_FUNCTION_VALUE ix86_function_value
50632 #undef TARGET_FUNCTION_VALUE_REGNO_P
50633 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50635 #undef TARGET_PROMOTE_FUNCTION_MODE
50636 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50638 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50639 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50641 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50642 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50644 #undef TARGET_INSTANTIATE_DECLS
50645 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50647 #undef TARGET_SECONDARY_RELOAD
50648 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50650 #undef TARGET_CLASS_MAX_NREGS
50651 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50653 #undef TARGET_PREFERRED_RELOAD_CLASS
50654 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50655 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50656 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50657 #undef TARGET_CLASS_LIKELY_SPILLED_P
50658 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50660 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50661 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50662 ix86_builtin_vectorization_cost
50663 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50664 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50665 ix86_vectorize_vec_perm_const_ok
50666 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50667 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50668 ix86_preferred_simd_mode
50669 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50670 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50671 ix86_autovectorize_vector_sizes
50672 #undef TARGET_VECTORIZE_GET_MASK_MODE
50673 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50674 #undef TARGET_VECTORIZE_INIT_COST
50675 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50676 #undef TARGET_VECTORIZE_ADD_STMT_COST
50677 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50678 #undef TARGET_VECTORIZE_FINISH_COST
50679 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50680 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50681 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50683 #undef TARGET_SET_CURRENT_FUNCTION
50684 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50686 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50687 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50689 #undef TARGET_OPTION_SAVE
50690 #define TARGET_OPTION_SAVE ix86_function_specific_save
50692 #undef TARGET_OPTION_RESTORE
50693 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50695 #undef TARGET_OPTION_POST_STREAM_IN
50696 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50698 #undef TARGET_OPTION_PRINT
50699 #define TARGET_OPTION_PRINT ix86_function_specific_print
50701 #undef TARGET_OPTION_FUNCTION_VERSIONS
50702 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
50704 #undef TARGET_CAN_INLINE_P
50705 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50707 #undef TARGET_LEGITIMATE_ADDRESS_P
50708 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50710 #undef TARGET_REGISTER_PRIORITY
50711 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50713 #undef TARGET_REGISTER_USAGE_LEVELING_P
50714 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50716 #undef TARGET_LEGITIMATE_CONSTANT_P
50717 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50719 #undef TARGET_FRAME_POINTER_REQUIRED
50720 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50722 #undef TARGET_CAN_ELIMINATE
50723 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50725 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50726 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50728 #undef TARGET_ASM_CODE_END
50729 #define TARGET_ASM_CODE_END ix86_code_end
50731 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50732 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50734 #if TARGET_MACHO
50735 #undef TARGET_INIT_LIBFUNCS
50736 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
50737 #endif
50739 #undef TARGET_LOOP_UNROLL_ADJUST
50740 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50742 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50743 #undef TARGET_SPILL_CLASS
50744 #define TARGET_SPILL_CLASS ix86_spill_class
50746 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50747 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50748 ix86_simd_clone_compute_vecsize_and_simdlen
50750 #undef TARGET_SIMD_CLONE_ADJUST
50751 #define TARGET_SIMD_CLONE_ADJUST \
50752 ix86_simd_clone_adjust
50754 #undef TARGET_SIMD_CLONE_USABLE
50755 #define TARGET_SIMD_CLONE_USABLE \
50756 ix86_simd_clone_usable
50758 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50759 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50760 ix86_float_exceptions_rounding_supported_p
50762 #undef TARGET_MODE_EMIT
50763 #define TARGET_MODE_EMIT ix86_emit_mode_set
50765 #undef TARGET_MODE_NEEDED
50766 #define TARGET_MODE_NEEDED ix86_mode_needed
50768 #undef TARGET_MODE_AFTER
50769 #define TARGET_MODE_AFTER ix86_mode_after
50771 #undef TARGET_MODE_ENTRY
50772 #define TARGET_MODE_ENTRY ix86_mode_entry
50774 #undef TARGET_MODE_EXIT
50775 #define TARGET_MODE_EXIT ix86_mode_exit
50777 #undef TARGET_MODE_PRIORITY
50778 #define TARGET_MODE_PRIORITY ix86_mode_priority
50780 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50781 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50783 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50784 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50786 #undef TARGET_STORE_BOUNDS_FOR_ARG
50787 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50789 #undef TARGET_LOAD_RETURNED_BOUNDS
50790 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50792 #undef TARGET_STORE_RETURNED_BOUNDS
50793 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50795 #undef TARGET_CHKP_BOUND_MODE
50796 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50798 #undef TARGET_BUILTIN_CHKP_FUNCTION
50799 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50801 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50802 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50804 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50805 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50807 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50808 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50810 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50811 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50813 #undef TARGET_OFFLOAD_OPTIONS
50814 #define TARGET_OFFLOAD_OPTIONS \
50815 ix86_offload_options
50817 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50818 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50820 #undef TARGET_OPTAB_SUPPORTED_P
50821 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50823 #undef TARGET_HARD_REGNO_SCRATCH_OK
50824 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50826 struct gcc_target targetm = TARGET_INITIALIZER;
50828 #include "gt-i386.h"